From 926259c411c1022812ffb7fe88ca61f0180bd778 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 09:51:09 +0800
Subject: [PATCH 0001/1262] TST: test case for string

---
 tensorflow/python/kernel_tests/scatter_nd_ops_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 9f57949515..83d69c651a 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -364,6 +364,16 @@ class ScatterNdTest(test.TestCase):
     del input_  # input_ is not used in scatter_nd
     return array_ops.scatter_nd(indices, updates, shape)
 
+  def testString(self):
+    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["four", "three", "one", "seven"], dtype=dtypes.string)
+    expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertTrue(np.array_equal(result, expected))
+
   def testRank3ValidShape(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
-- 
GitLab


From 005840c6e2d2a4c25ecd293162a38a79dedf1a4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 10:06:44 +0800
Subject: [PATCH 0002/1262] ENH: supports string for cpu

---
 tensorflow/core/kernels/scatter_nd_op.cc         | 1 +
 tensorflow/core/kernels/scatter_nd_op_cpu_impl.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 3a95dd1773..0caa7bd317 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -241,6 +241,7 @@ class ScatterNdUpdateOp : public OpKernel {
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_ADD_SUB_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_UPDATE_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU);
+TF_CALL_string(REGISTER_SCATTER_ND_CPU);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index cffc326174..155d354d85 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -160,6 +160,7 @@ struct ScatterNdFunctor<CPUDevice, T, Index, OP, IXDIM> {
   REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::SUB);
 
 TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE);
+REGISTER_SCATTER_ND_INDEX(string, scatter_nd_op::UpdateOp::ADD);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH)
 
 #undef REGISTER_SCATTER_ND_MATH
-- 
GitLab


From d887d2bcfc819034b17e812a9a60460e2d61e447 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 12:14:40 +0800
Subject: [PATCH 0003/1262] TST: ignore NonAliasingAdd

---
 tensorflow/python/kernel_tests/scatter_nd_ops_test.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 83d69c651a..03b2f892c6 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -594,6 +594,10 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest):
         shape, dtype=updates.dtype))
     return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
 
+  def testString(self):
+    # Not supported yet.
+    pass
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 4b697e0d9472215c706bdb36bb72986cdce78edd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 14 Dec 2017 13:51:34 +0800
Subject: [PATCH 0004/1262] DOC: modify document

---
 tensorflow/core/ops/array_ops.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 5a31f433ce..933ebe6b63 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -5332,12 +5332,13 @@ REGISTER_OP("ScatterNd")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape)
     .Doc(R"doc(
-Scatter `updates` into a new (initially zero) tensor according to `indices`.
+Scatter `updates` into a new (initially zero for numeric, empty for string)
+tensor according to `indices`.
 
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
+Creates a new tensor by applying sparse `updates` to individual values or
+slices within a zero (or empty string) tensor of the given `shape`
+according to indices. This operator is the inverse of the @{tf.gather_nd}
+operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
-- 
GitLab


From 597403e03680d69b72dbfa669f7bbdc77ce21ec9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Wed, 20 Dec 2017 16:34:48 +0800
Subject: [PATCH 0005/1262] CLN: conform docstring

---
 tensorflow/core/ops/array_ops.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 933ebe6b63..89b6eb7162 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -5332,13 +5332,12 @@ REGISTER_OP("ScatterNd")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape)
     .Doc(R"doc(
-Scatter `updates` into a new (initially zero for numeric, empty for string)
-tensor according to `indices`.
+Scatter `updates` into a new empty tensor according to `indices`.
 
 Creates a new tensor by applying sparse `updates` to individual values or
-slices within a zero (or empty string) tensor of the given `shape`
-according to indices. This operator is the inverse of the @{tf.gather_nd}
-operator which extracts values or slices from a given tensor.
+slices within a tensor (initially zero for numeric, empty for string) of
+the given `shape` according to indices. This operator is the inverse of the
+@{tf.gather_nd} operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
-- 
GitLab


From e2a0db74cfa4ed73692ec5d0af944660bb4b688c Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Tue, 6 Feb 2018 17:52:07 -0800
Subject: [PATCH 0006/1262] Python3 support of docs generation

---
 tensorflow/docs_src/community/documentation.md | 18 +++---------------
 tensorflow/tools/docs/BUILD                    |  2 +-
 tensorflow/tools/docs/build_docs_test.py       |  4 ----
 tensorflow/tools/docs/generate_lib.py          |  2 --
 tensorflow/tools/docs/generate_lib_test.py     |  3 ---
 tensorflow/tools/docs/parser.py                |  4 ++--
 tensorflow/tools/docs/parser_test.py           |  4 ----
 tensorflow/tools/docs/pretty_docs.py           | 12 ++++++------
 tensorflow/workspace.bzl                       | 11 -----------
 9 files changed, 12 insertions(+), 48 deletions(-)

diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 003e0a25ec..8d55148e48 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -148,19 +148,7 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
 
-1. If pip isn't installed on your machine, install it now by issuing the
-following command:
-
-        $ sudo easy_install pip
-
-2. Use pip to install codegen, mock, and pandas by issuing the following
-   command (Note: If you are using
-   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
-   dependencies, you may not want to use sudo for these installations):
-
-        $ sudo pip install codegen mock pandas
-
-3. If bazel is not installed on your machine, install it now. If you are on
+1. If bazel is not installed on your machine, install it now. If you are on
    Linux, install bazel by issuing the following command:
 
         $ sudo apt-get install bazel  # Linux
@@ -168,10 +156,10 @@ following command:
     If you are on Mac OS, find bazel installation instructions on
     [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
 
-4. Change directory to the top-level `tensorflow` directory of the TensorFlow
+2. Change directory to the top-level `tensorflow` directory of the TensorFlow
    source code.
 
-5. Run the `configure` script and answer its prompts appropriately for your
+3. Run the `configure` script and answer its prompts appropriately for your
    system.
 
         $ ./configure
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 8f10bc9e0c..cafa1f7eb3 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -37,7 +37,7 @@ py_library(
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["@com_github_andreif_codegen"],
+    deps = ["@astor_archive//:astor"],
 )
 
 py_test(
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index ae293f6576..2e8f634e7c 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -39,10 +39,6 @@ class Flags(object):
 class BuildDocsTest(googletest.TestCase):
 
   def testBuildDocs(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     doc_generator = generate_lib.DocGenerator()
 
     doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 003f972070..635408d87f 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -455,8 +455,6 @@ class DocGenerator(object):
   """Main entry point for generating docs."""
 
   def __init__(self):
-    if sys.version_info >= (3, 0):
-      sys.exit('Doc generation is not supported from python3.')
     self.argument_parser = argparse.ArgumentParser()
     self._py_modules = None
     self._private_map = _get_default_private_map()
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index 1ceaf31f1c..ea6d28a02b 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -52,9 +52,6 @@ class DummyVisitor(object):
 class GenerateTest(googletest.TestCase):
 
   def test_write(self):
-    if sys.version_info >= (3, 0):
-      self.skipTest('Warning: Doc generation is not supported from python3.')
-
     module = sys.modules[__name__]
 
     index = {
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 3db164c2b5..1798378d55 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -26,7 +26,7 @@ import os
 import re
 import sys
 
-import codegen
+import astor
 import six
 
 from google.protobuf.message import Message as ProtoMessage
@@ -705,7 +705,7 @@ def _generate_signature(func, reverse_index):
       if id(default) in reverse_index:
         default_text = reverse_index[id(default)]
       elif ast_default is not None:
-        default_text = codegen.to_source(ast_default)
+        default_text = astor.to_source(ast_default)
         if default_text != repr(default):
           # This may be an internal name. If so, handle the ones we know about.
           # TODO(wicke): This should be replaced with a lookup in the index.
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 8a0e9af521..7d2bf9177a 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -523,10 +523,6 @@ class TestParseFunctionDetails(googletest.TestCase):
 class TestGenerateSignature(googletest.TestCase):
 
   def test_known_object(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     known_object = object()
     reverse_index = {id(known_object): 'location.of.object.in.api'}
 
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index 543b5fa6fe..55ab5bdd49 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -101,7 +101,7 @@ def _build_class_page(page_info):
 
     link_template = '[`{short_name}`]({url})'
     parts.append(', '.join(
-        link_template.format(**base.__dict__) for base in page_info.bases))
+        link_template.format(**base._asdict()) for base in page_info.bases))
 
   parts.append('\n\n')
 
@@ -159,7 +159,7 @@ def _build_class_page(page_info):
       h3 = ('<h3 id="{short_name}">'
             '<code>{short_name}</code>'
             '</h3>\n\n')
-      parts.append(h3.format(**method_info.__dict__))
+      parts.append(h3.format(**method_info._asdict()))
 
       if method_info.signature is not None:
         parts.append(_build_signature(method_info, use_full_name=False))
@@ -217,7 +217,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}`]({url}) module'
 
     for item in page_info.modules:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -229,7 +229,7 @@ def _build_module_page(page_info):
     template = '[`class {short_name}`]({url})'
 
     for item in page_info.classes:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -241,7 +241,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}(...)`]({url})'
 
     for item in page_info.functions:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -254,7 +254,7 @@ def _build_module_page(page_info):
     parts.append('## Other Members\n\n')
 
     for item in page_info.other_members:
-      parts.append('`{short_name}`\n\n'.format(**item.__dict__))
+      parts.append('`{short_name}`\n\n'.format(**item._asdict()))
 
   return ''.join(parts)
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index eca744a920..4a2274eb1a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -328,17 +328,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:backports_weakref.BUILD")),
   )
 
-  tf_http_archive(
-      name = "com_github_andreif_codegen",
-      urls = [
-          "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
-          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
-      ],
-      sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
-      strip_prefix = "codegen-1.0",
-      build_file = str(Label("//third_party:codegen.BUILD")),
-  )
-
   filegroup_external(
       name = "org_python_license",
       licenses = ["notice"],  # Python 2.0
-- 
GitLab


From 4f5d9a88f84e2261808bc986ece951e6e1d10725 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Tue, 6 Feb 2018 17:55:15 -0800
Subject: [PATCH 0007/1262] remove unused codegen.BUILD

---
 third_party/codegen.BUILD | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 third_party/codegen.BUILD

diff --git a/third_party/codegen.BUILD b/third_party/codegen.BUILD
deleted file mode 100644
index df436c8163..0000000000
--- a/third_party/codegen.BUILD
+++ /dev/null
@@ -1,16 +0,0 @@
-# -*- mode: python; -*-
-#
-# Description:
-#   Extension to ast that allow ast -> python code generation.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # New BSD
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "com_github_andreif_codegen",
-    srcs = glob(["codegen.py"]),
-    srcs_version = "PY2AND3",
-)
-- 
GitLab


From 736e8c4ccb16718d11cf7c8e1fac843bf6e388a7 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 14 Feb 2018 18:26:20 +0900
Subject: [PATCH 0008/1262] fix typo

---
 tensorflow/core/lib/io/record_writer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/io/record_writer.cc b/tensorflow/core/lib/io/record_writer.cc
index 3657243c5d..ebc5648269 100644
--- a/tensorflow/core/lib/io/record_writer.cc
+++ b/tensorflow/core/lib/io/record_writer.cc
@@ -49,7 +49,7 @@ RecordWriterOptions RecordWriterOptions::CreateRecordWriterOptions(
 #endif  // IS_SLIM_BUILD
   } else if (compression_type != compression::kNone) {
     LOG(ERROR) << "Unsupported compression_type:" << compression_type
-               << ". No comprression will be used.";
+               << ". No compression will be used.";
   }
   return options;
 }
-- 
GitLab


From 617fa4e5fa634270c36a2a8762e6ce96bd38f2f8 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 14 Feb 2018 18:35:31 +0900
Subject: [PATCH 0009/1262] fix typo

---
 tensorflow/contrib/makefile/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index b0228c5435..995230dfa8 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -155,7 +155,7 @@ CC_PREFIX=ccache tensorflow/contrib/makefile/build_all_android.sh -s tensorflow/
 (add -T on subsequent builds to skip protobuf downloading/building)
 
 
-#### Testing the the CUDA-enabled benchmark via adb:
+#### Testing the CUDA-enabled benchmark via adb:
 Build binaries first as above, then run:
 
 ```bash
-- 
GitLab


From 15f3b920ad7eb7fcca3afee14d16049db2046d4b Mon Sep 17 00:00:00 2001
From: Nathan Luehr <nluehr@nvidia.com>
Date: Wed, 14 Feb 2018 16:27:23 -0800
Subject: [PATCH 0010/1262] Fix __shared__ types with non-empty constructor

std::complex<T> has a non-empty constructor (zero assignment) that is not
compatible with CUDA __shared__ memory. This fixes current reliance on
undefined behavior. (and removes an unnecessary run-time initialization).
---
 .../core/kernels/reduction_gpu_kernels.cu.h   | 37 +++++++++++++++++--
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 15ae4c1fc5..95a3e222b5 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -244,6 +244,33 @@ __global__ void RowReduceKernel(
   if (row < num_rows && lane == 0) out[row] = sum;
 }
 
+template <typename T1>
+struct storage_type {
+  T1 val;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator T1() { return val; }
+  __host__ __device__ storage_type<T1>& operator=(const T1& in) {
+    val = in;
+    return *this;
+  }
+};
+
+template <typename T2>
+struct storage_type<std::complex<T2>> {
+  T2 real;
+  T2 imag;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator std::complex<T2>() {
+    return std::complex<T2>(real, imag);
+  }
+  __host__ __device__ storage_type<std::complex<T2>>& operator=(
+      const std::complex<T2>& in) {
+    real = in.real();
+    imag = in.imag();
+    return *this;
+  }
+};
+
 // Works only if there are <= 16 columns
 // each warps sums over multiple rows at once
 template <typename T, typename outT, typename Op>
@@ -268,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += rows_per_warp * gridDim.y * blockDim.y;
   for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) {
@@ -294,7 +321,8 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
     if (blockDim.y > 1) {
       for (int row = 1; row < blockDim.y; ++row) {
-        s = op(s, partial_sums[threadIdx.x * 33 + row]);
+        value_type t = partial_sums[threadIdx.x * 33 + row];
+        s = op(s, t);
       }
     }
 
@@ -316,7 +344,7 @@ __global__ void ColumnReduceKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += gridDim.y * blockDim.y;
 
@@ -347,7 +375,8 @@ __global__ void ColumnReduceKernel(
         min(blockDim.y, num_rows - blockIdx.y * blockDim.y);
 
     for (int row = 1; row < numRowsThisBlock; ++row) {
-      s = op(s, partial_sums[threadIdx.x * 33 + row]);
+      value_type t = partial_sums[threadIdx.x * 33 + row];
+      s = op(s, t);
     }
 
     out[col * gridDim.y + blockIdx.y] = s;
-- 
GitLab


From b81aaac898d93e17b4a280bb02547d2a60d490cb Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 15 Feb 2018 08:28:12 +0000
Subject: [PATCH 0011/1262] Fix warnings in
 tf.contrib.bayesflow.monte_carlo.expectation

This fix fixes several warnings in tf.contrib.bayesflow.monte_carlo.expectation
by switching to keepdims for tf.reduce_mean.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 985177e897..5263e87ae6 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -328,7 +328,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
     if not callable(f):
       raise ValueError('`f` must be a callable function.')
     if use_reparametrization:
-      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
     else:
       if not callable(log_prob):
         raise ValueError('`log_prob` must be a callable function.')
@@ -348,7 +348,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
       # "Is there a floating point value of x, for which x-x == 0 is false?"
       # http://stackoverflow.com/q/2686644
       fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
-      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)
 
 
 def _sample_mean(values):
-- 
GitLab


From 9c272adf248228408448db6219b238145f5a02ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 16 Feb 2018 10:38:50 +0800
Subject: [PATCH 0012/1262] DOC: move doc to api def file

---
 .../core/api_def/base_api/api_def_ScatterNd.pbtxt      | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4cb8c064fc..4e95895f54 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,12 +25,12 @@ A new tensor with the given shape and updates applied according
 to the indices.
 END
   }
-  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  summary: "Scatter `updates` into a new empty tensor according to `indices`."
   description: <<END
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
+Creates a new tensor by applying sparse `updates` to individual values or
+slices within a tensor (initially zero for numeric, empty for string) of
+the given `shape` according to indices.  This operator is the inverse of the
+@{tf.gather_nd} operator which extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
-- 
GitLab


From 672ec270f96144bca5e1d75d002421c1e9b49921 Mon Sep 17 00:00:00 2001
From: Hovhannes Harutyunyan <hovhannes.harutyunyan@picsart.com>
Date: Mon, 19 Feb 2018 12:56:40 +0400
Subject: [PATCH 0013/1262] Add broadcasting functionality fro Div and Sub ops.

---
 tensorflow/contrib/lite/kernels/div.cc        | 117 ++++++--
 tensorflow/contrib/lite/kernels/div_test.cc   | 174 ++++++++++++
 .../internal/optimized/optimized_ops.h        | 268 +++++++++++++++++-
 .../internal/reference/reference_ops.h        | 257 +++++++++++++++++
 tensorflow/contrib/lite/kernels/sub.cc        | 135 +++++++--
 tensorflow/contrib/lite/kernels/sub_test.cc   | 213 ++++++++++++++
 .../testing/generated_examples_zip_test.cc    |  15 +-
 7 files changed, 1122 insertions(+), 57 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/div_test.cc
 create mode 100644 tensorflow/contrib/lite/kernels/sub_test.cc

diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index 44bd0dc85d..c77a0de9b7 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,35 +61,85 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
-void EvalDivFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteDivParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDivParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_DIV(type)                                        \
-  type::Div(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_DIV(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    if (data->requires_broadcast) {
+      TF_LITE_DIV(reference_ops, BroadcastDiv);
+    } else {
+      TF_LITE_DIV(reference_ops, Div);
+    }
+  } else {
+    if (data->requires_broadcast) {
+      TF_LITE_DIV(optimized_ops, BroadcastDiv);
+    } else {
+      TF_LITE_DIV(optimized_ops, Div);
+    }
+  }
+#undef TF_LITE_DIV
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDivParams* params, const OpData* data,
+                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   TfLiteTensor* output) {
+  auto input1_offset = -input1->params.zero_point;
+  auto input2_offset = -input2->params.zero_point;
+  auto output_offset = output->params.zero_point;
+
+  int32_t output_multiplier;
+  int output_shift;
+
+  double real_multiplier =
+      input1->params.scale * input2->params.scale / output->params.scale;
+  QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier,
+                                   &output_shift);
+
+  int32 output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(params->activation, output,
+                                &output_activation_min, &output_activation_max);
+
+#define TF_LITE_DIV(type, opname)                                      \
+  type::opname(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
+               input1_offset, GetTensorData<uint8_t>(input2),          \
+               GetTensorDims(input2), input2_offset, output_offset,    \
+               output_multiplier, output_shift, output_activation_min, \
+               output_activation_max, GetTensorData<uint8_t>(output),  \
+               GetTensorDims(output));
+  // The quantized version of Div doesn't support activations, so we
+  // always use BroadcastDiv.
   if (kernel_type == kReference) {
-    TF_LITE_DIV(reference_ops);
+    TF_LITE_DIV(reference_ops, BroadcastDiv);
   } else {
-    TF_LITE_DIV(optimized_ops);
+    TF_LITE_DIV(optimized_ops, BroadcastDiv);
   }
 #undef TF_LITE_DIV
 }
@@ -81,15 +147,20 @@ void EvalDivFloat(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalDivFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
+  } else if (output->type == kTfLiteUInt8) {
+    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
+                               output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Div only supports FLOAT32 and quantized UINT8 now.");
     return kTfLiteError;
   }
 
@@ -99,19 +170,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace div
 
 TfLiteRegistration* Register_DIV_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_DIV_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_DIV_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+  static TfLiteRegistration r = {div::Init, div::Free, div::Prepare,
                                  div::Eval<div::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/div_test.cc b/tensorflow/contrib/lite/kernels/div_test.cc
new file mode 100644
index 0000000000..78918a0d79
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/div_test.cc
@@ -0,0 +1,174 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseDivOpModel : public SingleOpModel {
+ public:
+  BaseDivOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_DIV, BuiltinOptions_DivOptions,
+                 CreateDivOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatDivOpModel : public BaseDivOpModel {
+ public:
+  using BaseDivOpModel::BaseDivOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+// For quantized Div, the error shouldn't exceed (2*step + step^2).
+// The param min=-1.0 & max=1.0 is used in the following tests.
+// The tolerance value is ~0.0157.
+const float kQuantizedStep = 2.0 / 255.0;
+const float kQuantizedTolerance =
+    2.0 * kQuantizedStep + kQuantizedStep * kQuantizedStep;
+
+class QuantizedDivOpModel : public BaseDivOpModel {
+ public:
+  using BaseDivOpModel::BaseDivOpModel;
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+TEST(FloatDivOpTest, NoActivation) {
+  FloatDivOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.5, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.4, 1.0, 0.8, 1.6})));
+}
+
+TEST(FloatDivOpTest, ActivationRELU_N1_TO_1) {
+  FloatDivOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-1.0, 1.0, 0.8, 1.0})));
+}
+
+TEST(FloatDivOpTest, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.3, 0.8, 1.1, -2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.6, 0.5, -1.1, -0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-20.0, 1.0, 0.5, 1.6, -1.0, 20.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatDivOpTest, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, 0.07, 0.08, 0.11, -0.123});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.0, 2.0, 0.7, 0.8, 1.1, -1.23})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedDivOpTest, NoActivation) {
+  QuantizedDivOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                        {TensorType_UINT8, {}, -1.0, 1.0},
+                        ActivationFunctionType_NONE);
+  m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.6, 0.2, 0.9, -0.7});
+  m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.8, 0.4, 0.9, -0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.75, 0.5, 1.0, 0.875},
+                                              kQuantizedTolerance)));
+}
+
+// for quantized Div, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(QuantizedDivOpTest, WithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedDivOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.2, 0.2, 0.07, 0.08, 0.11, -0.123});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-2.0, 2.0, 0.7, 0.8, 1.1, -1.23}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index dec58fea4f..d12a3eca1d 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1928,6 +1928,126 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastDiv(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 unclamped_result =
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  input1_val / input2_val, output_multiplier, output_shift);
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, unclamped_result));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastDiv(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
 // TODO(aselle): This is not actually optimized yet.
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
@@ -1955,6 +2075,152 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
     }
   }
 }
+
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sum = scaled_input1_val - scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sum, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastSub(left_shift, input1_data, input1_dims, input1_offset,
+               input1_multiplier, input1_shift, input2_data, input2_dims,
+               input2_offset, input2_multiplier, input2_shift, output_offset,
+               output_multiplier, output_shift, output_activation_min,
+               output_activation_max, output_data, output_dims);
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
@@ -2866,7 +3132,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 5f4d5be323..c7b7687622 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1208,6 +1208,122 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastDiv(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 unclamped_result =
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  input1_val / input2_val, output_multiplier, output_shift);
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, unclamped_result));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastDiv(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -1235,6 +1351,147 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sum = scaled_input1_val - scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sum, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastSub(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastSub(left_shift, input1_data, input1_dims, input1_offset,
+               input1_multiplier, input1_shift, input2_data, input2_dims,
+               input2_offset, input2_multiplier, input2_shift, output_offset,
+               output_multiplier, output_shift, output_activation_min,
+               output_activation_max, output_data, output_dims);
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index ddaf498d5b..410585a293 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -26,7 +26,7 @@ namespace ops {
 namespace builtin {
 namespace sub {
 
-// This file has three implementation of Div.
+// This file has three implementation of Sub.
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,51 +61,122 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalSubFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteSubParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+                  TfLiteSubParams* params, const OpData* data,
+                  TfLiteTensor* input1, TfLiteTensor* input2,
+                  TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_Sub(type)                                        \
-  type::Sub(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_SUB(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    if (data->requires_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastSub);
+    } else {
+      TF_LITE_SUB(reference_ops, Sub);
+    }
+  } else {
+    if (data->requires_broadcast) {
+      TF_LITE_SUB(optimized_ops, BroadcastSub);
+    } else {
+      TF_LITE_SUB(optimized_ops, Sub);
+    }
+  }
+#undef TF_LITE_SUB
+}
+
+template <KernelType kernel_type>
+void EvalSubQuantized(TfLiteContext* context, TfLiteNode* node,
+                      TfLiteSubParams* params, const OpData* data,
+                      TfLiteTensor* input1, TfLiteTensor* input2,
+                      TfLiteTensor* output) {
+  auto input1_offset = -input1->params.zero_point;
+  auto input2_offset = -input2->params.zero_point;
+  auto output_offset = output->params.zero_point;
+  const int left_shift = 20;
+  const double twice_max_input_scale =
+      2 * std::max(input1->params.scale, input2->params.scale);
+  const double real_input1_multiplier =
+      input1->params.scale / twice_max_input_scale;
+  const double real_input2_multiplier =
+      input2->params.scale / twice_max_input_scale;
+  const double real_output_multiplier =
+      twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+
+  int32 input1_multiplier;
+  int input1_shift;
+  QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier,
+                                   &input1_shift);
+  int32 input2_multiplier;
+  int input2_shift;
+  QuantizeMultiplierSmallerThanOne(real_input2_multiplier, &input2_multiplier,
+                                   &input2_shift);
+  int32 output_multiplier;
+  int output_shift;
+  QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier,
+                                   &output_shift);
+
+  int32 output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(params->activation, output,
+                                &output_activation_min, &output_activation_max);
+
+#define TF_LITE_SUB(type, opname)                                            \
+  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
+               GetTensorDims(input1), input1_offset, input1_multiplier,      \
+               input1_shift, GetTensorData<uint8_t>(input2),                 \
+               GetTensorDims(input2), input2_offset, input2_multiplier,      \
+               input2_shift, output_offset, output_multiplier, output_shift, \
+               output_activation_min, output_activation_max,                 \
+               GetTensorData<uint8_t>(output), GetTensorDims(output));
+  // The quantized version of Sub doesn't support activations, so we
+  // always use BroadcastSub.
   if (kernel_type == kReference) {
-    TF_LITE_Sub(reference_ops);
+    TF_LITE_SUB(reference_ops, BroadcastSub);
   } else {
-    TF_LITE_Sub(optimized_ops);
+    TF_LITE_SUB(optimized_ops, BroadcastSub);
   }
-#undef TF_LITE_Sub
+#undef TF_LITE_SUB
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalSubFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalSubFloat<kernel_type>(context, node, params, data, input1, input2,
+                              output);
+  } else if (output->type == kTfLiteUInt8) {
+    EvalSubQuantized<kernel_type>(context, node, params, data, input1, input2,
+                                  output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Inputs and outputs not all float|unit8 types.");
     return kTfLiteError;
   }
 
@@ -99,19 +186,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace sub
 
 TfLiteRegistration* Register_SUB_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_SUB_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_SUB_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                  sub::Eval<sub::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/sub_test.cc b/tensorflow/contrib/lite/kernels/sub_test.cc
new file mode 100644
index 0000000000..b2c6d05f62
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sub_test.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSubOpModel : public SingleOpModel {
+ public:
+  BaseSubOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_Sub, BuiltinOptions_SubOptions,
+                 CreateSubOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatSubOpModel : public BaseSubOpModel {
+ public:
+  using BaseSubOpModel::BaseSubOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedSubOpModel : public BaseSubOpModel {
+ public:
+  using BaseSubOpModel::BaseSubOpModel;
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// for quantized Sub, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(FloatSubOpModel, NoActivation) {
+  FloatSubOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-2.1, 0.0, 1.4, -0.3}));
+}
+
+TEST(FloatSubOpModel, ActivationRELU_N1_TO_1) {
+  FloatSubOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.0, 0.0, 1.0, -0.3}));
+}
+
+TEST(FloatSubOpModel, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5, -1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.8, -1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(),
+                ElementsAreArray({-2.1, 0.0, 1.4, -0.3, 0.0, 1.9}))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatSubOpModel, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 1.7, 0.5, -1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.5});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-2.5, -0.3, 1.2, 0.0, -1.6, 1.5})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {
+      {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
+  std::vector<std::initializer_list<float>> inputs2 = {
+      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.2}, {0.6, 0.4, -0.18, 0.5}};
+  std::vector<std::initializer_list<float>> results = {
+      {-0.5, -0.2, 0.0, 0.3}, {-0.8, -0.2, -0.1, 0.9}, {-0.61, -0.2, 0.88, -0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                                       {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                                       {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::initializer_list<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                                       {-1.0, -0.2, 1.0, 0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear({-2.1, -0.1, 0.4, 0.3, 0.0, 1.9},
+                                                kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear({-2.7, -0.5, 0.0, 0.1, 0.4, 1.3},
+                                                kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 49766cedac..1e177d5f6e 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -47,9 +47,6 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // Key is a substring of the test name and value is a bug number.
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
-    // Sub and Div don't support broadcasting.
-    {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
 
     // Add only supports float32. (and "constant" tests use Add)
     {R"(^\/adda.*int32)", "68808744"},
@@ -235,22 +232,23 @@ TEST_P(OpsTest, RunStuff) {
 
 INSTANTIATE_TESTS(add)
 INSTANTIATE_TESTS(avg_pool)
-INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
 INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
 INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
+INSTANTIATE_TESTS(div)
 INSTANTIATE_TESTS(exp)
 INSTANTIATE_TESTS(fully_connected)
 INSTANTIATE_TESTS(fused_batch_norm)
 INSTANTIATE_TESTS(gather)
 INSTANTIATE_TESTS(global_batch_norm)
-INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(l2_pool)
+INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(local_response_norm)
 INSTANTIATE_TESTS(max_pool)
+INSTANTIATE_TESTS(mean)
 INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
@@ -260,14 +258,13 @@ INSTANTIATE_TESTS(reshape)
 INSTANTIATE_TESTS(resize_bilinear)
 INSTANTIATE_TESTS(sigmoid)
 INSTANTIATE_TESTS(softmax)
+INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(space_to_depth)
-INSTANTIATE_TESTS(sub)
 INSTANTIATE_TESTS(split)
-INSTANTIATE_TESTS(div)
-INSTANTIATE_TESTS(transpose)
-INSTANTIATE_TESTS(mean)
 INSTANTIATE_TESTS(squeeze)
 INSTANTIATE_TESTS(strided_slice)
+INSTANTIATE_TESTS(sub)
+INSTANTIATE_TESTS(transpose)
 
 }  // namespace testing
 }  // namespace tflite
-- 
GitLab


From 779d457008ab7ea2c11f4d73370099a1e56c0652 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Sun, 25 Feb 2018 21:39:52 +0900
Subject: [PATCH 0014/1262] fix typo

---
 .../python/kernel_tests/linalg/linear_operator_diag_test.py     | 2 +-
 tensorflow/python/ops/linalg/linear_operator_diag.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 343d158498..8cb9f9e621 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -129,7 +129,7 @@ class LinearOperatorDiagTest(
     with self.test_session() as sess:
       x = random_ops.random_normal(shape=(2, 2, 3, 4))
 
-      # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
+      # This LinearOperatorDiag will be broadcast to (2, 2, 3, 3) during solve
       # and matmul with 'x' as the argument.
       diag = random_ops.random_uniform(shape=(2, 1, 3))
       operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index b3ec3d5b7c..e180e83026 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -67,7 +67,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   operator = LinearOperatorDiag(diag)
 
   # Create a shape [2, 1, 4, 2] vector.  Note that this shape is compatible
-  # since the batch dimensions, [2, 1], are brodcast to
+  # since the batch dimensions, [2, 1], are broadcast to
   # operator.batch_shape = [2, 3].
   y = tf.random_normal(shape=[2, 1, 4, 2])
   x = operator.solve(y)
-- 
GitLab


From b569035378ef4a8595c64e5f398d74244cac376e Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Sun, 25 Feb 2018 21:44:12 +0900
Subject: [PATCH 0015/1262] fix typo

---
 tensorflow/contrib/slim/python/slim/data/parallel_reader.py | 2 +-
 tensorflow/python/ops/distributions/special_math.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index ad5e985487..b3343aef47 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -221,7 +221,7 @@ def parallel_read(data_sources,
         the data will be cycled through indefinitely.
     num_readers: a integer, number of Readers to create.
     reader_kwargs: an optional dict, of kwargs for the reader.
-    shuffle: boolean, wether should shuffle the files and the records by using
+    shuffle: boolean, whether should shuffle the files and the records by using
       RandomShuffleQueue as common_queue.
     dtypes:  A list of types.  The length of dtypes must equal the number
         of elements in each record. If it is None it will default to
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index bed4cbb2c1..1d605c5dfc 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -213,7 +213,7 @@ def _ndtri(p):
 
   # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
   # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
-  # arrays based on wether p < exp(-32).
+  # arrays based on whether p < exp(-32).
   z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
   first_term = z - math_ops.log(z) / z
   second_term_small_p = (_create_polynomial(1. / z, p2)
-- 
GitLab


From f1f70ef5c268d6ce41bdab4867ed0f2e19d6f924 Mon Sep 17 00:00:00 2001
From: Hovhannes Harutyunyan <hovhannes.harutyunyan@picsart.com>
Date: Mon, 26 Feb 2018 10:52:11 +0400
Subject: [PATCH 0016/1262] Remove code that was written for compatibility with
 old checked-in code. Update code to have 80 characters per line.

---
 tensorflow/contrib/lite/kernels/div_test.cc   |  3 +-
 .../internal/optimized/optimized_ops.h        | 41 -------------------
 .../internal/reference/reference_ops.h        | 41 -------------------
 tensorflow/contrib/lite/kernels/sub_test.cc   | 18 +++++---
 4 files changed, 15 insertions(+), 88 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/div_test.cc b/tensorflow/contrib/lite/kernels/div_test.cc
index 78918a0d79..e67e0ec034 100644
--- a/tensorflow/contrib/lite/kernels/div_test.cc
+++ b/tensorflow/contrib/lite/kernels/div_test.cc
@@ -154,7 +154,8 @@ TEST(QuantizedDivOpTest, WithBroadcast) {
                           {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
                           {TensorType_UINT8, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.2, 0.2, 0.07, 0.08, 0.11, -0.123});
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.2,  0.2,   0.07,
+                                                0.08, 0.11, -0.123});
     m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
     m.Invoke();
     EXPECT_THAT(m.GetDequantizedOutput(),
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d12a3eca1d..b19f46beaa 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1973,19 +1973,6 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastDiv(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
 inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
@@ -2033,21 +2020,6 @@ inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastDiv(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
-}
-
 // TODO(aselle): This is not actually optimized yet.
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
@@ -2121,19 +2093,6 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
 inline void BroadcastSub(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index c7b7687622..847075e207 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1249,19 +1249,6 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastDiv(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
 inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
@@ -1309,21 +1296,6 @@ inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastDiv(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
-}
-
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -1392,19 +1364,6 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
 inline void BroadcastSub(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
diff --git a/tensorflow/contrib/lite/kernels/sub_test.cc b/tensorflow/contrib/lite/kernels/sub_test.cc
index b2c6d05f62..1fd0ee2a0e 100644
--- a/tensorflow/contrib/lite/kernels/sub_test.cc
+++ b/tensorflow/contrib/lite/kernels/sub_test.cc
@@ -125,11 +125,17 @@ TEST(FloatSubOpModel, WithBroadcast) {
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::initializer_list<float>> inputs1 = {
-      {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
+                                {0.1, 0.2, 0.3, 0.4},
+                                {-0.2, 0.2, 0.4, 0.7},
+                                {-0.01, 0.2, 0.7, 0.3}};
   std::vector<std::initializer_list<float>> inputs2 = {
-      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.2}, {0.6, 0.4, -0.18, 0.5}};
+                                {0.6, 0.4, 0.3, 0.1},
+                                {0.6, 0.4, 0.5, -0.2},
+                                {0.6, 0.4, -0.18, 0.5}};
   std::vector<std::initializer_list<float>> results = {
-      {-0.5, -0.2, 0.0, 0.3}, {-0.8, -0.2, -0.1, 0.9}, {-0.61, -0.2, 0.88, -0.2}};
+                              {-0.5, -0.2, 0.0, 0.3},
+                              {-0.8, -0.2, -0.1, 0.9},
+                              {-0.61, -0.2, 0.88, -0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -179,7 +185,8 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
     m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
     EXPECT_THAT(m.GetDequantizedOutput(),
-                ElementsAreArray(ArrayFloatNear({-2.1, -0.1, 0.4, 0.3, 0.0, 1.9},
+                ElementsAreArray(ArrayFloatNear({-2.1, -0.1, 0.4,
+                                                  0.3,  0.0, 1.9},
                                                 kQuantizedTolerance)))
         << "With shape number " << i;
   }
@@ -198,7 +205,8 @@ TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
     m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
     m.Invoke();
     EXPECT_THAT(m.GetDequantizedOutput(),
-                ElementsAreArray(ArrayFloatNear({-2.7, -0.5, 0.0, 0.1, 0.4, 1.3},
+                ElementsAreArray(ArrayFloatNear({-2.7, -0.5, 0.0,
+                                                  0.1,  0.4, 1.3},
                                                 kQuantizedTolerance)))
         << "With shape number " << i;
   }
-- 
GitLab


From 62a05fe71ba5157e7abeb291f4b8b6ac7abf97fb Mon Sep 17 00:00:00 2001
From: DavidNorman <davidn@graphcore.ai>
Date: Tue, 27 Feb 2018 11:51:05 +0000
Subject: [PATCH 0017/1262] Ensure that the backend_deps is a non-frozen object

---
 tensorflow/compiler/xla/tests/build_defs.bzl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 610302ac12..eac2eb286c 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -137,7 +137,8 @@ def xla_test(name,
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
       this_backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
-      backend_deps = plugins[backend]["deps"]
+      backend_deps = []
+      backend_deps += plugins[backend]["deps"]
       this_backend_copts += plugins[backend]["copts"]
       this_backend_tags += plugins[backend]["tags"]
       this_backend_args += plugins[backend]["args"]
-- 
GitLab


From 2e98952221bfe83fadc3054e66b2ff3c23c44a24 Mon Sep 17 00:00:00 2001
From: DavidNorman <davidn@graphcore.ai>
Date: Tue, 27 Feb 2018 13:52:13 +0000
Subject: [PATCH 0018/1262] Allow the large R1 slice tests to be disabled

---
 tensorflow/compiler/xla/tests/slice_test.cc | 35 +++++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index fe36df160d..50cd56d2d4 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -211,6 +211,9 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+// A version of SliceR1Test used to label and disable 'large' tests
+class SliceR1LargeTest : public SliceR1Test {};
+
 string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
   const R1Spec& spec = data.param;
   return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
@@ -230,6 +233,18 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
+XLA_TEST_P(SliceR1LargeTest, DoIt_F32) { Run<float>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_F64) { Run<double>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U32) { Run<uint32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S32) { Run<int32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U64) { Run<uint64>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
+
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
@@ -237,12 +252,6 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestInstantiation,
     SliceR1Test,
     ::testing::Values(
-// TODO(b/69425338): This uses too much memory on GPU.
-#ifndef XLA_TEST_BACKEND_GPU
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
-#endif
         R1Spec{10, 0, 0, 1},
         R1Spec{10, 7, 7, 1},
         R1Spec{10, 0, 5, 1},
@@ -278,6 +287,20 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestDataToString
 );
 
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestBigSlicesInstantiation,
+    SliceR1LargeTest,
+    ::testing::Values(
+          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
+    ),
+    SliceR1TestDataToString
+);
+#endif
+
 INSTANTIATE_TEST_CASE_P(
     SliceStridedR1TestInstantiation,
     SliceR1Test,
-- 
GitLab


From 0489bf25930ea0dc4b7d8ffc792b0390bfbc06bc Mon Sep 17 00:00:00 2001
From: Jingwen <jin@users.noreply.github.com>
Date: Tue, 27 Feb 2018 18:30:09 -0500
Subject: [PATCH 0019/1262] Include cstring in logging.cc for use of strrchr()

---
 tensorflow/core/platform/default/logging.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 2b874da198..c6e5777c26 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <android/log.h>
 #include <iostream>
 #include <sstream>
+#include <cstring>
 #endif
 
 #include <stdlib.h>
-- 
GitLab


From ef4e8ad826c8946f8ff3e0f7e1b3bb3bec61010c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Wed, 21 Feb 2018 15:06:04 +0800
Subject: [PATCH 0020/1262] CLN: extract ApplyAdamBaseOp

---
 tensorflow/core/kernels/training_ops.cc       | 146 +++++++++++++++---
 tensorflow/core/kernels/training_ops.h        |  13 ++
 .../core/kernels/training_ops_gpu.cu.cc       |  30 ++++
 tensorflow/core/ops/training_ops.cc           |  37 +++++
 4 files changed, 202 insertions(+), 24 deletions(-)

diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 233aa03c32..7d383d980a 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -328,6 +328,45 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
+template <typename Device, typename T>
+struct ApplyAdaMaxNonCuda {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
+    if (use_nesterov) {
+      LOG(WARNING) << "AdaMax doesn't support use_nesterov yet, ignore it.";
+    }
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    // v == u
+    v.device(d) = (beta2() * v).cwiseMax(grad.abs());
+    // var == θ
+    var.device(d) -= (lr * m) / ((T(1) - beta1_power()) * v);
+  }
+};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct ApplyAdaMaxSYCL {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  T beta1_power, T beta2_power, T lr, T beta1, T beta2,
+                  T epsilon, typename TTypes<T>::ConstFlat grad) {
+    m.device(d) += (grad - m) * (T(1) - beta1);
+    v.device(d) = (beta2 * v).cwiseMax(grad.abs());
+    var.device(d) -= (lr * m) / ((T(1) - beta1_power) * v);
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
+template <typename T>
+struct ApplyAdaMax<CPUDevice, T> : ApplyAdaMaxNonCuda<CPUDevice, T> {};
+
 template <typename T>
 struct ApplyRMSProp<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -2477,10 +2516,12 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-template <typename Device, typename T>
-class ApplyAdamOp : public OpKernel {
+template <typename Device, typename T,
+          template <typename Device2, typename T2>
+          class Functor>
+class ApplyAdamBaseOp : public OpKernel {
  public:
-  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
@@ -2553,11 +2594,11 @@ class ApplyAdamOp : public OpKernel {
                                 grad.shape().DebugString()));
 
     const Device& device = ctx->template eigen_device<Device>();
-    functor::ApplyAdam<Device, T>()(
-        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
-        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
-        grad.flat<T>(), use_nesterov_);
+    auto functor = Functor<Device, T>();
+    functor(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+            beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+            beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+            grad.flat<T>(), use_nesterov_);
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -2568,10 +2609,11 @@ class ApplyAdamOp : public OpKernel {
 };
 
 #ifdef TENSORFLOW_USE_SYCL
-template <typename T>
-class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
+template <typename T,
+          template <typename T2> class Functor>
+class ApplyAdamBaseOp<SYCLDevice, T, Functor> : public OpKernel {
  public:
-  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
@@ -2672,9 +2714,10 @@ class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
                                 var.shape().DebugString(), " ",
                                 grad.shape().DebugString()));
 
-    functor::ApplyAdamSYCL<T>()(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-                                beta1_power, beta2_power, lr, beta1, beta2,
-                                epsilon, grad.flat<T>());
+    auto functor = Functor<T>();
+    functor(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+            beta1_power, beta2_power, lr, beta1, beta2,
+            epsilon, grad.flat<T>());
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -2684,28 +2727,28 @@ class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
 };
 #endif  // TENSORFLOW_USE_SYCL
 
-#define REGISTER_KERNELS(D, T)                                     \
+#define REGISTER_KERNELS(D, T, F)                                  \
   REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
-      ApplyAdamOp<D##Device, T>);                                  \
+      ApplyAdamBaseOp<D##Device, T, F>);                           \
   REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdam")                \
                               .HostMemory("var")                   \
                               .HostMemory("m")                     \
                               .HostMemory("v")                     \
                               .Device(DEVICE_##D)                  \
                               .TypeConstraint<T>("T"),             \
-                          ApplyAdamOp<D##Device, T>);
-#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
-
+                          ApplyAdamBaseOp<D##Device, T, F>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdam);
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
-
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdamSYCL);
 TF_CALL_float(REGISTER_SYCL_KERNELS);
 TF_CALL_double(REGISTER_SYCL_KERNELS);
+#undef REGISTER_SYCL_KERNELS
 #endif
 
 #if GOOGLE_CUDA
@@ -2730,11 +2773,66 @@ DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
-REGISTER_KERNELS(GPU, Eigen::half);
-REGISTER_KERNELS(GPU, float);
-REGISTER_KERNELS(GPU, double);
+#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdam);
+REGISTER_GPU_KERNELS(Eigen::half);
+REGISTER_GPU_KERNELS(float);
+REGISTER_GPU_KERNELS(double);
+#undef REGISTER_GPU_KERNELS
 #endif
+#undef REGISTER_KERNELS
+
+#define REGISTER_KERNELS(D, T, F)                                    \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAdamBaseOp<D##Device, T, F>);                             \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax")                \
+                              .HostMemory("var")                     \
+                              .HostMemory("m")                       \
+                              .HostMemory("v")                       \
+                              .Device(DEVICE_##D)                    \
+                              .TypeConstraint<T>("T"),               \
+                          ApplyAdamBaseOp<D##Device, T, F>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdaMax);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdaMaxSYCL);
+TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
+#undef REGISTER_SYCL_KERNELS
+#endif
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdaMax<GPUDevice, T>::operator()(                 \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar beta2_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad, bool use_nesterov); \
+  extern template struct ApplyAdaMax<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdaMax);
+REGISTER_GPU_KERNELS(Eigen::half);
+REGISTER_GPU_KERNELS(float);
+REGISTER_GPU_KERNELS(double);
+#undef REGISTER_GPU_KERNELS
+#endif
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 7ee956053a..46a5290210 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -139,6 +139,19 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyAdaMax {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov);
+};
+
 template <typename Device, typename T>
 struct ApplyRMSProp {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 0376a3b2c6..1776c108ab 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -142,6 +142,32 @@ struct ApplyAdam<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAdaMax<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        (beta2.reshape(single).broadcast(bcast) * v).cwiseMax(grad.abs());
+    var.device(d) -=
+        (lr * m) / ((beta1_power.constant(one) -
+                    beta1_power).reshape(single).broadcast(bcast) * v);
+  }
+};
+
 template <typename T>
 struct ApplyRMSProp<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -278,6 +304,10 @@ template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;
 
+template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdaMax<GPUDevice, float>;
+template struct functor::ApplyAdaMax<GPUDevice, double>;
+
 template struct functor::ApplyRMSProp<GPUDevice, Eigen::half>;
 template struct functor::ApplyRMSProp<GPUDevice, float>;
 template struct functor::ApplyRMSProp<GPUDevice, double>;
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 6ce9595fb6..6f107db3ea 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -737,6 +737,43 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+REGISTER_OP("ApplyAdaMax")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("v: Ref(T)")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamShapeFn(c, false /* sparse */);
+    });
+
+REGISTER_OP("ResourceApplyAdaMax")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamShapeFn(c, false /* sparse */);
+    });
+
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
-- 
GitLab


From 4d31dac8111b963ed427969c71c6957c929d3e5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Wed, 21 Feb 2018 20:29:46 +0800
Subject: [PATCH 0021/1262] ENH: add AdaMaxOptimizer in python side

---
 tensorflow/contrib/opt/BUILD                  |  20 +++
 tensorflow/contrib/opt/__init__.py            |   2 +
 .../contrib/opt/python/training/adamax.py     |  72 ++++++++++
 .../opt/python/training/adamax_test.py        | 124 ++++++++++++++++++
 tensorflow/core/kernels/training_ops.cc       |   2 +-
 5 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/opt/python/training/adamax.py
 create mode 100644 tensorflow/contrib/opt/python/training/adamax_test.py

diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 86ceda71b7..a86d150f7a 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/adamax.py",
         "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
         "python/training/elastic_average_optimizer.py",
@@ -48,6 +49,25 @@ py_library(
     ],
 )
 
+py_test(
+    name = "adamax_test",
+    srcs = ["python/training/adamax_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",  # b/73507407
+        "notsan",  # b/31055119
+    ],
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 6c1bb1adc0..4c13c8e247 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.adamax import *
 from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
@@ -36,6 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
+    'AdaMaxOptimizer',
     'PowerSignOptimizer',
     'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
new file mode 100644
index 0000000000..4e0c541d3a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""AdaMax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import adam
+from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("train.AdaMaxOptimizer")
+class AdaMaxOptimizer(adam.AdamOptimizer):
+  """Optimizer that implements the AdaMax algorithm.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    return training_ops.apply_ada_max(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    return training_ops.resource_apply_ada_max(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    raise NotImplementedError()
+
+  def _apply_sparse(self, grad, var):
+    raise NotImplementedError()
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
new file mode 100644
index 0000000000..a1499118dd
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -0,0 +1,124 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AdaMax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import adamax
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**t)) * m_t / v_t
+  return param_t, m_t, v_t
+
+
+class AdaMaxOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if context.in_graph_mode():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if context.in_graph_mode():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 7d383d980a..b3b53d9ee0 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -346,7 +346,7 @@ struct ApplyAdaMaxNonCuda {
     // v == u
     v.device(d) = (beta2() * v).cwiseMax(grad.abs());
     // var == θ
-    var.device(d) -= (lr * m) / ((T(1) - beta1_power()) * v);
+    var.device(d) -= (lr() * m) / ((T(1) - beta1_power()) * v);
   }
 };
 
-- 
GitLab


From ba258d530f1af5fbcc8c1b72637dc7b2177a48c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 2 Mar 2018 19:33:30 +0800
Subject: [PATCH 0022/1262] ENH: support sparse grad

---
 .../contrib/opt/python/training/adamax.py     | 51 +++++++++++++++++--
 .../opt/python/training/adamax_test.py        |  2 +-
 tensorflow/core/kernels/training_ops.cc       |  4 +-
 .../core/kernels/training_ops_gpu.cu.cc       |  5 +-
 4 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 4e0c541d3a..137fce769f 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import optimizer
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import adam
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -65,8 +65,49 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
         grad, use_locking=self._use_locking)
 
-  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
-    raise NotImplementedError()
+  def _apply_sparse_shared(self, grad, var, indices,
+                           scatter_add, scatter_update):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta1_t + grad * (1 - beta1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = scatter_update(m, indices, m_t_slice)
+    # u_t = max(beta2 * u, abs(g_t))
+    v = self.get_slot(var, "v")
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
+    var_slice = -lr_t / (1 - beta1_power) * (m_t_slice /
+                                             (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var):
-    raise NotImplementedError()
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        lambda x, i, v: state_ops.scatter_update(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking))
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(
+        grad, var, indices,
+	self._resource_scatter_add, self._resource_scatter_update)
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index a1499118dd..0e2ba0987a 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -45,7 +45,7 @@ def adamax_update_numpy(param,
                       epsilon=1e-8):
   m_t = beta1 * m + (1 - beta1) * g_t
   v_t = np.maximum(beta2 * v, np.abs(g_t))
-  param_t = param - (alpha / (1 - beta1**t)) * m_t / v_t
+  param_t = param - (alpha / (1 - beta1**t)) * m_t / (v_t + epsilon)
   return param_t, m_t, v_t
 
 
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index b3b53d9ee0..0387e3011e 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -346,7 +346,7 @@ struct ApplyAdaMaxNonCuda {
     // v == u
     v.device(d) = (beta2() * v).cwiseMax(grad.abs());
     // var == θ
-    var.device(d) -= (lr() * m) / ((T(1) - beta1_power()) * v);
+    var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon()));
   }
 };
 
@@ -359,7 +359,7 @@ struct ApplyAdaMaxSYCL {
                   T epsilon, typename TTypes<T>::ConstFlat grad) {
     m.device(d) += (grad - m) * (T(1) - beta1);
     v.device(d) = (beta2 * v).cwiseMax(grad.abs());
-    var.device(d) -= (lr * m) / ((T(1) - beta1_power) * v);
+    var.device(d) -= lr / (T(1) - beta1_power) * (m / (v + epsilon));
   }
 };
 #endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 1776c108ab..54c06b130c 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -163,8 +163,9 @@ struct ApplyAdaMax<GPUDevice, T> {
     v.device(d) =
         (beta2.reshape(single).broadcast(bcast) * v).cwiseMax(grad.abs());
     var.device(d) -=
-        (lr * m) / ((beta1_power.constant(one) -
-                    beta1_power).reshape(single).broadcast(bcast) * v);
+        lr / (beta1_power.constant(one) -
+                 beta1_power).reshape(single).broadcast(bcast) *
+                     (m / (v + epsilon));
   }
 };
 
-- 
GitLab


From f6f5a6019970bb8d667819da7d6316a8088a0b78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 3 Mar 2018 10:02:43 +0800
Subject: [PATCH 0023/1262] DOC: add docment

---
 .../contrib/opt/python/training/adamax.py     | 51 ++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 137fce769f..ddae06bec7 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -29,7 +29,6 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdaMaxOptimizer")
 class AdaMaxOptimizer(adam.AdamOptimizer):
   """Optimizer that implements the AdaMax algorithm.
 
@@ -37,6 +36,56 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="AdaMax"):
+    """Construct a new AdaMax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section7.1 of the paper:
+
+    ```
+    t <- t + 1
+    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - lr_t / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    Contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior is contrast to the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdaMax".
+    """
+    super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2,
+                                          epsilon, use_locking, name)
+
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
-- 
GitLab


From f750e21a63c8836b9e7243ce786af2de3f65cc3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 3 Mar 2018 12:31:54 +0800
Subject: [PATCH 0024/1262] TST: add more tests

---
 .../contrib/opt/python/training/adamax.py     |   2 +-
 .../opt/python/training/adamax_test.py        | 243 +++++++++++++++++-
 2 files changed, 233 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index ddae06bec7..36d49d4cbf 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -159,4 +159,4 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
   def _resource_apply_sparse(self, grad, var, indices):
     return self._apply_sparse_shared(
         grad, var, indices,
-	self._resource_scatter_add, self._resource_scatter_update)
+        self._resource_scatter_add, self._resource_scatter_update)
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 0e2ba0987a..e91e5cb96a 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -35,22 +35,142 @@ from tensorflow.python.platform import test
 
 
 def adamax_update_numpy(param,
-                      g_t,
-                      t,
-                      m,
-                      v,
-                      alpha=0.001,
-                      beta1=0.9,
-                      beta2=0.999,
-                      epsilon=1e-8):
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
   m_t = beta1 * m + (1 - beta1) * g_t
   v_t = np.maximum(beta2 * v, np.abs(g_t))
-  param_t = param - (alpha / (1 - beta1**t)) * m_t / (v_t + epsilon)
+  param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - ((alpha / (1 - beta1**t)) *
+                                    (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
   return param_t, m_t, v_t
 
 
 class AdaMaxOptimizerTest(test.TestCase):
 
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.AdaMaxOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
   def doTestBasic(self, use_resource=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
       with self.test_session(graph=ops.Graph()):
@@ -93,7 +213,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
-        # Run 3 steps of Adam
+        # Run 3 steps of AdaMax
         for t in range(1, 4):
           if context.in_graph_mode():
             self.evaluate(update)
@@ -112,13 +232,114 @@ class AdaMaxOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
           if use_resource:
-            self.assertEqual("var0_%d/Adam:0" % (i,),
+            self.assertEqual("var0_%d/AdaMax:0" % (i,),
                              opt.get_slot(var=var0, name="m").name)
 
   def testBasic(self):
     with self.test_session():
       self.doTestBasic(use_resource=False)
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined AdaMax1 and AdaMax2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adamax.AdaMaxOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 8b5e4ad404ba16919ad4f17a763ee5383d61a400 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 3 Mar 2018 17:39:56 +0800
Subject: [PATCH 0025/1262] DOC: add apidef

---
 .../contrib/opt/python/training/adamax.py     |  3 +-
 .../base_api/api_def_ApplyAdaMax.pbtxt        | 89 +++++++++++++++++++
 .../api_def_ResourceApplyAdaMax.pbtxt         | 83 +++++++++++++++++
 3 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 36d49d4cbf..fe5522a170 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -53,11 +53,10 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
 
     ```
     t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
 
     m_t <- beta1 * m_{t-1} + (1 - beta1) * g
     v_t <- max(beta2 * v_{t-1}, abs(g))
-    variable <- variable - lr_t / (1 - beta1^t) * m_t / (v_t + epsilon)
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
     ```
 
     Similar to AdamOptimizer, the epsilon is added for numerical stability
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..106c30ca83
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,89 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+Always `False`, unsupported argument.
+END
+  }
+  summary: "Update \'*var\' according to the AdaMax algorithm."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+v_t <- max(beta2 * v_{t-1}, abs(g))
+variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..5b81e50a07
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,83 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+Always `False`, unsupported argument.
+END
+  }
+  summary: "Update \'*var\' according to the AdaMax algorithm."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+v_t <- max(beta2 * v_{t-1}, abs(g))
+variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+END
+}
-- 
GitLab


From 4b7db48218799ef172c7c9794d9d98e56d838ecb Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 5 Mar 2018 17:41:00 +0000
Subject: [PATCH 0026/1262] Update the documentation of `softmax_cross_entropy`

This fix updates the documentation of `softmax_cross_entropy`,
and removed the shape restrictions of `onehot_labels` and `logits`.
They only needs to be of the same shape, not necessary `[batch_size, num_classes]`.

This fix fixes 16263.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/losses/losses_impl.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 7386976e93..04c13cb6c6 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -710,11 +710,16 @@ def softmax_cross_entropy(
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes
 
+  Note that `onehot_labels` and `logits` must have the same shape,
+  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
+  broadcastable to loss, whose shape is decided by the shape of `logits`.
+  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
+  a `Tensor` of shape `[batch_size]`.
+
   Args:
-    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: `[batch_size, num_classes]` logits outputs of the network .
-    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
-      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
+    onehot_labels: One-hot-encoded labels.
+    logits: Logits outputs of the network.
+    weights: Optional `Tensor` that is broadcastable to loss.
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
-- 
GitLab


From 82e34cd19f554509113d438ca98ad76e42fdf4e9 Mon Sep 17 00:00:00 2001
From: Hovhannes Harutyunyan <hovhannes.harutyunyan@picsart.com>
Date: Wed, 7 Mar 2018 09:14:53 +0400
Subject: [PATCH 0027/1262] Remove quantized versiaon of Div till fixing it.

---
 .../internal/optimized/optimized_ops.h        | 47 -------------------
 1 file changed, 47 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index b19f46beaa..9c181fddad 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1973,53 +1973,6 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastDiv/8bit");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 unclamped_result =
-              output_offset +
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  input1_val / input2_val, output_multiplier, output_shift);
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, unclamped_result));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
-  }
-}
-
 // TODO(aselle): This is not actually optimized yet.
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
-- 
GitLab


From f82d009d878dc675a307e69f89ba9f4dfdcd6c71 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 7 Mar 2018 21:58:39 +0800
Subject: [PATCH 0028/1262] Fix broken link of typical distributed
 configuration in graphs.md

---
 tensorflow/docs_src/programmers_guide/graphs.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index e69b717432..ca74b17542 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -210,9 +210,8 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-If you are deploying TensorFlow in a @{$deploy/distributed$typical distributed configuration},
-you might specify the job name and task ID to place variables on
-a task in the parameter server job (`"/job:ps"`), and the other operations on
+
+If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration, you might specify the job name and task ID to place variables on a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
 
 ```python
-- 
GitLab


From 04b6127510793b4c5aaa540b60b68ffdf3fd48ce Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 7 Mar 2018 22:23:50 +0800
Subject: [PATCH 0029/1262] revert the minor space nit

---
 tensorflow/docs_src/programmers_guide/graphs.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index ca74b17542..3b5e3e5a9a 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -210,8 +210,9 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-
-If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration, you might specify the job name and task ID to place variables on a task in the parameter server job (`"/job:ps"`), and the other operations on
+If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration,
+you might specify the job name and task ID to place variables on
+a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
 
 ```python
-- 
GitLab


From 2548a3d2cf035a229d35ab6257bee511aa3a8e23 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Thu, 8 Mar 2018 00:15:22 +0800
Subject: [PATCH 0030/1262] fix some typo

---
 tensorflow/docs_src/programmers_guide/graphs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 3b5e3e5a9a..f28660d44a 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -505,10 +505,10 @@ multiple graphs in the same process.
 As noted above, TensorFlow provides a "default graph" that is implicitly passed
 to all API functions in the same context. For many applications, a single graph
 is sufficient. However, TensorFlow also provides methods for manipulating
-the default graph, which can be useful in more advanced used cases. For example:
+the default graph, which can be useful in more advanced use cases. For example:
 
 * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
-  operation in a single graph must have a unique name. TensorFlow will
+  operation in a single graph must have an unique name. TensorFlow will
   "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on to
   their names if the requested name is already taken. Using multiple explicitly
   created graphs gives you more control over what name is given to each
-- 
GitLab


From 955f41c5f2240495a086b503e54eac6928876aca Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 7 Mar 2018 14:04:26 -0800
Subject: [PATCH 0031/1262] Cleanup `astor` output to match `codegen` output.

The default `astor` output messes up the function signature docs for many docs without a bit of cleanup.

With this change the only differences I see are parens around lambdas and math expressions in default arguments.
---
 tensorflow/tools/docs/parser.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 1798378d55..0fcd0abc4a 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -650,6 +650,9 @@ def _remove_first_line_indent(string):
   return '\n'.join([line[indent:] for line in string.split('\n')])
 
 
+PAREN_NUMBER_RE = re.compile("^\(([0-9.e-]+)\)")
+
+
 def _generate_signature(func, reverse_index):
   """Given a function, returns a list of strings representing its args.
 
@@ -705,7 +708,11 @@ def _generate_signature(func, reverse_index):
       if id(default) in reverse_index:
         default_text = reverse_index[id(default)]
       elif ast_default is not None:
-        default_text = astor.to_source(ast_default)
+        default_text = (
+            astor.to_source(ast_default).rstrip('\n').replace('\t','\\t')
+                 .replace('\n','\\n').replace('"""',"'"))
+        default_text = PAREN_NUMBER_RE.sub('\\1',default_text)
+
         if default_text != repr(default):
           # This may be an internal name. If so, handle the ones we know about.
           # TODO(wicke): This should be replaced with a lookup in the index.
-- 
GitLab


From c22d11f4fcc2801d0a5de98a84461e03e1bcb674 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Wed, 7 Mar 2018 14:14:08 -0800
Subject: [PATCH 0032/1262] add back docs

---
 tensorflow/docs_src/community/documentation.md | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 8d55148e48..f7b7ba14e5 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -148,7 +148,19 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
 
-1. If bazel is not installed on your machine, install it now. If you are on
+1. If pip isn't installed on your machine, install it now by issuing the
+following command:
+
+        $ sudo easy_install pip
+
+2. Use pip to install mock and pandas by issuing the following
+   command (Note: If you are using
+   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
+   dependencies, you may not want to use sudo for these installations):
+
+        $ sudo pip install mock pandas
+
+3. If bazel is not installed on your machine, install it now. If you are on
    Linux, install bazel by issuing the following command:
 
         $ sudo apt-get install bazel  # Linux
@@ -156,10 +168,10 @@ doing the following:
     If you are on Mac OS, find bazel installation instructions on
     [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
 
-2. Change directory to the top-level `tensorflow` directory of the TensorFlow
+4. Change directory to the top-level `tensorflow` directory of the TensorFlow
    source code.
 
-3. Run the `configure` script and answer its prompts appropriately for your
+5. Run the `configure` script and answer its prompts appropriately for your
    system.
 
         $ ./configure
-- 
GitLab


From cbb517551964879dcb6eac2b00bf74db6c827975 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Wed, 7 Mar 2018 14:54:24 -0800
Subject: [PATCH 0033/1262] Revert "add back docs"

This reverts commit c22d11f4fcc2801d0a5de98a84461e03e1bcb674.
---
 tensorflow/docs_src/community/documentation.md | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index f7b7ba14e5..8d55148e48 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -148,19 +148,7 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
 
-1. If pip isn't installed on your machine, install it now by issuing the
-following command:
-
-        $ sudo easy_install pip
-
-2. Use pip to install mock and pandas by issuing the following
-   command (Note: If you are using
-   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
-   dependencies, you may not want to use sudo for these installations):
-
-        $ sudo pip install mock pandas
-
-3. If bazel is not installed on your machine, install it now. If you are on
+1. If bazel is not installed on your machine, install it now. If you are on
    Linux, install bazel by issuing the following command:
 
         $ sudo apt-get install bazel  # Linux
@@ -168,10 +156,10 @@ following command:
     If you are on Mac OS, find bazel installation instructions on
     [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
 
-4. Change directory to the top-level `tensorflow` directory of the TensorFlow
+2. Change directory to the top-level `tensorflow` directory of the TensorFlow
    source code.
 
-5. Run the `configure` script and answer its prompts appropriately for your
+3. Run the `configure` script and answer its prompts appropriately for your
    system.
 
         $ ./configure
-- 
GitLab


From d34eaf348848fe153a5fd245aa75c2ca32973b36 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Wed, 7 Mar 2018 21:53:25 -0800
Subject: [PATCH 0034/1262] fix encoding and lint

---
 tensorflow/tools/docs/build_docs_test.py |  1 -
 tensorflow/tools/docs/generate_lib.py    | 13 ++++++-------
 tensorflow/tools/docs/parser.py          |  6 +++---
 tensorflow/tools/docs/py_guide_parser.py |  2 +-
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index 2e8f634e7c..0cbf8b478f 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
 import textwrap
 
 import tensorflow as tf
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 635408d87f..a7ab0fa538 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import argparse
 import fnmatch
 import os
-import sys
 
 import six
 
@@ -134,8 +133,8 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
     try:
       if not os.path.exists(directory):
         os.makedirs(directory)
-      with open(path, 'w') as f:
-        f.write(pretty_docs.build_md_page(page_info))
+      with open(path, 'wb') as f:
+        f.write(pretty_docs.build_md_page(page_info).encode('utf-8'))
     except OSError as e:
       print('Cannot write documentation for %s to %s: %s' % (full_name,
                                                              directory, e))
@@ -434,19 +433,19 @@ def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
       full_out_path = os.path.join(output_dir, suffix)
       if not fnmatch.fnmatch(base_name, file_pattern):
         print('Copying un-matched file %s...' % suffix)
-        open(full_out_path, 'w').write(open(full_in_path).read())
+        open(full_out_path, 'wb').write(open(full_in_path, 'rb').read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
         content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        content = open(full_in_path).read()
+        content = open(full_in_path, 'rb').read().decode('utf-8')
 
       content = reference_resolver.replace_references(content,
                                                       relative_path_to_root)
-      with open(full_out_path, 'w') as f:
-        f.write(content)
+      with open(full_out_path, 'wb') as f:
+        f.write(content.encode('utf-8'))
 
   print('Done.')
 
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 0fcd0abc4a..dd0351b4c6 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -709,9 +709,9 @@ def _generate_signature(func, reverse_index):
         default_text = reverse_index[id(default)]
       elif ast_default is not None:
         default_text = (
-            astor.to_source(ast_default).rstrip('\n').replace('\t','\\t')
-                 .replace('\n','\\n').replace('"""',"'"))
-        default_text = PAREN_NUMBER_RE.sub('\\1',default_text)
+            astor.to_source(ast_default).rstrip('\n').replace('\t', '\\t')
+              .replace('\n', '\\n').replace('"""', "'"))
+        default_text = PAREN_NUMBER_RE.sub('\\1', default_text)
 
         if default_text != repr(default):
           # This may be an internal name. If so, handle the ones we know about.
diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py
index 216353ecee..328f42d18f 100644
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@@ -44,7 +44,7 @@ class PyGuideParser(object):
 
   def process(self, full_path):
     """Read and process the file at `full_path`."""
-    md_string = open(full_path).read()
+    md_string = open(full_path, 'rb').read().decode('utf-8')
     self._lines = md_string.split('\n')
     seen = set()
 
-- 
GitLab


From f7a04228e0368f3c9bad22a66fe7267e41ecb128 Mon Sep 17 00:00:00 2001
From: DavidNorman <davidn@graphcore.ai>
Date: Thu, 8 Mar 2018 07:05:53 +0000
Subject: [PATCH 0035/1262] Register half in some ops which support all
 floating point types

---
 tensorflow/core/ops/nn_ops.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 910fbaca9e..6d4a3fda51 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -472,7 +472,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -490,7 +490,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -589,7 +589,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("AvgPool3DGrad")
@@ -600,7 +600,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -618,7 +618,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {half, bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("MaxPool3DGrad")
@@ -630,8 +630,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float} = DT_FLOAT")
-    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
-- 
GitLab


From cee41f9d10b81ce3b49f566ddd448a7f3f2872c3 Mon Sep 17 00:00:00 2001
From: KB Sriram <kbsriram@gmail.com>
Date: Wed, 7 Mar 2018 08:11:03 -0800
Subject: [PATCH 0036/1262] C++ gradient for StridedSlice

See https://github.com/tensorflow/tensorflow/issues/9645
---
 tensorflow/cc/gradients/array_grad.cc      | 36 ++++++++++++++++++++++
 tensorflow/cc/gradients/array_grad_test.cc | 24 +++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 6545e4ee3e..ff348fadb2 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad);
 
+Status StridedSliceGradHelper(const Scope& scope, const Operation& op,
+                              const std::vector<Output>& grad_inputs,
+                              std::vector<Output>* grad_outputs) {
+  Input x = Shape(scope, op.input(0));
+  Input begin = op.input(1);
+  Input end = op.input(2);
+  Input strides = op.input(3);
+  int64 begin_mask;
+  int64 end_mask;
+  int64 ellipsis_mask;
+  int64 new_axis_mask;
+  int64 shrink_axis_mask;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask));
+  grad_outputs->push_back(
+      StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0],
+                       StridedSliceGrad::BeginMask(begin_mask)
+                           .EndMask(end_mask)
+                           .EllipsisMask(ellipsis_mask)
+                           .NewAxisMask(new_axis_mask)
+                           .ShrinkAxisMask(shrink_axis_mask)));
+  // No gradients returned for begin, end and strides
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 4a215fcc92..2a2180297c 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) {
   RunTest(x, x_shape, y, y_shape);
 }
 
+TEST_F(ArrayGradTest, StridedSliceGrad) {
+  TensorShape x_shape({6, 4, 4});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+
+  // y = x[2:6:2, 1:3, 1:3]
+  auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1});
+  // y.shape = [2, 2, 2];
+  RunTest(x, x_shape, y, {2, 2, 2});
+
+  // y = x[2:6:2, 1:3, 1:3]
+  // begin_mask = 1<<1 (ignore begin_index = 1)
+  // end_mask = 1<<2 (ignore end_index = 2)
+  y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
+                   StridedSlice::BeginMask(1<<1).EndMask(1<<2));
+  // y.shape = [2, 3, 3];
+  RunTest(x, x_shape, y, {2, 3, 3});
+
+  // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
+  y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
+                   StridedSlice::NewAxisMask(1<<0));
+  // y.shape = [1, 2, 2, 2];
+  RunTest(x, x_shape, y, {1, 2, 2, 2});
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From e31fb25f4e3989a846a8e54d789a3bf5efff0cea Mon Sep 17 00:00:00 2001
From: KB Sriram <kbsriram@gmail.com>
Date: Thu, 8 Mar 2018 07:40:24 -0800
Subject: [PATCH 0037/1262] Clang-format fixes.

---
 tensorflow/cc/gradients/array_grad_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 2a2180297c..de3bd0fc9e 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -367,13 +367,13 @@ TEST_F(ArrayGradTest, StridedSliceGrad) {
   // begin_mask = 1<<1 (ignore begin_index = 1)
   // end_mask = 1<<2 (ignore end_index = 2)
   y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
-                   StridedSlice::BeginMask(1<<1).EndMask(1<<2));
+                   StridedSlice::BeginMask(1 << 1).EndMask(1 << 2));
   // y.shape = [2, 3, 3];
   RunTest(x, x_shape, y, {2, 3, 3});
 
   // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
   y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
-                   StridedSlice::NewAxisMask(1<<0));
+                   StridedSlice::NewAxisMask(1 << 0));
   // y.shape = [1, 2, 2, 2];
   RunTest(x, x_shape, y, {1, 2, 2, 2});
 }
-- 
GitLab


From d6533df7cd3ef19b39081a64fcb0bed5f83c7ee0 Mon Sep 17 00:00:00 2001
From: Giuseppe <giuscri@gmail.com>
Date: Thu, 8 Mar 2018 17:49:29 +0100
Subject: [PATCH 0038/1262] Fix markdown error in layers tutorial.

---
 tensorflow/docs_src/tutorials/layers.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index ee03f440c9..b24d3f4cad 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -192,8 +192,7 @@ dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
 you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
-skip ahead to ["Training and Evaluating the CNN MNIST
-Classifier"](#training-and-evaluating-the-cnn-mnist-classifier).
+skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).
 
 ### Input Layer
 
@@ -534,9 +533,8 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining
-> the training op for the model"} in the @{$get_started/custom_estimators$"Creating Estimations in
-> tf.estimator"} tutorial.
+> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
+> in the @{$get_started/custom_estimators$"Creating Estimations in tf.estimator"} tutorial.
 
 ### Add evaluation metrics
 
-- 
GitLab


From e8cf1fb7dc9dabe1a2a0b181a7b587c1300888a3 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Thu, 8 Mar 2018 14:07:30 -0800
Subject: [PATCH 0039/1262] Use getfullargspec in signature parsing.

---
 tensorflow/python/util/tf_inspect.py | 36 ++++++++++++++++++++++------
 tensorflow/tools/docs/parser.py      | 34 +++++++++++++-------------
 2 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index c4168f7b1a..1fbc33ba0b 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -18,12 +18,22 @@ from __future__ import division
 from __future__ import print_function
 
 import inspect as _inspect
+import six
+from collections import namedtuple
 
 from tensorflow.python.util import tf_decorator
 
 ArgSpec = _inspect.ArgSpec
 
 
+if six.PY3:
+  FullArgSpec = _inspect.FullArgSpec
+else:
+  FullArgSpec = namedtuple(
+      'FullArgSpec', ['args', 'varargs', 'varkw', 'defaults',
+      'kwonlyargs', 'kwonlydefaults', 'annotations'])
+
+
 def currentframe():
   """TFDecorator-aware replacement for inspect.currentframe."""
   return _inspect.stack()[1][0]
@@ -46,20 +56,32 @@ def getargspec(object):  # pylint: disable=redefined-builtin
 
 
 def getfullargspec(obj):  # pylint: disable=redefined-builtin
-  """TFDecorator-aware replacement for inspect.getfullargspec and fallback to
-  inspect.getargspec in Python 2.
+  """TFDecorator-aware replacement for inspect.getfullargspec.
 
   Args:
     obj: A callable, possibly decorated.
 
   Returns:
-    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    The `FullArgSpec` that describes the signature of
     the outermost decorator that changes the callable's signature. If the
-    callable is not decorated, `inspect.getfullargspec()`
-    (`inspect.getargspec()` in Python 2) will be called directly on the
-    callable.
+    callable is not decorated, `inspect.getfullargspec()` will be called
+    directly on the callable.
   """
-  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  if six.PY2:
+    def spec_fn(target):
+      argspecs = _inspect.getargspec(target)
+      fullargspecs = FullArgSpec(
+          args=argspecs.args,
+          varargs=argspecs.varargs,
+          varkw=argspecs.keywords,
+          defaults=argspecs.defaults,
+          kwonlyargs=[],
+          kwonlydefaults={},
+          annotations={})
+      return fullargspecs
+  else:
+    spec_fn = _inspect.getfullargspec
+
   decorators, target = tf_decorator.unwrap(obj)
   return next((d.decorator_argspec for d in decorators
                if d.decorator_argspec is not None), spec_fn(target))
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index dd0351b4c6..16513d0ee1 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -601,20 +601,20 @@ def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
 def _get_arg_spec(func):
   """Extracts signature information from a function or functools.partial object.
 
-  For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects,
-  corrects the signature of the underlying function to take into account the
-  removed arguments.
+  For functions, uses `tf_inspect.getfullargspec`. For `functools.partial`
+  objects, corrects the signature of the underlying function to take into
+  account the removed arguments.
 
   Args:
     func: A function whose signature to extract.
 
   Returns:
-    An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned
-    by `tf_inspect.getargspec`.
+    An `FullArgSpec` namedtuple `(args, varargs, varkw, defaults, etc.)`,
+    as returned by `tf_inspect.getfullargspec`.
   """
-  # getargspec does not work for functools.partial objects directly.
+  # getfullargspec does not work for functools.partial objects directly.
   if isinstance(func, functools.partial):
-    argspec = tf_inspect.getargspec(func.func)
+    argspec = tf_inspect.getfullargspec(func.func)
     # Remove the args from the original function that have been used up.
     first_default_arg = (
         len(argspec.args or []) - len(argspec.defaults or []))
@@ -637,12 +637,14 @@ def _get_arg_spec(func):
           argspec_defaults.pop(i-first_default_arg)
         else:
           first_default_arg -= 1
-    return tf_inspect.ArgSpec(args=argspec_args,
-                              varargs=argspec.varargs,
-                              keywords=argspec.keywords,
-                              defaults=tuple(argspec_defaults))
+    # NOTE Some fields from FullArgSpec were removed here.
+    # Add them back if needed in the future.
+    return tf_inspect.FullArgSpec(args=argspec_args,
+                                  varargs=argspec.varargs,
+                                  varkw=argspec.varkw,
+                                  defaults=tuple(argspec_defaults))
   else:  # Regular function or method, getargspec will work fine.
-    return tf_inspect.getargspec(func)
+    return tf_inspect.getfullargspec(func)
 
 
 def _remove_first_line_indent(string):
@@ -657,7 +659,7 @@ def _generate_signature(func, reverse_index):
   """Given a function, returns a list of strings representing its args.
 
   This function produces a list of strings representing the arguments to a
-  python function. It uses tf_inspect.getargspec, which
+  python function. It uses tf_inspect.getfullargspec, which
   does not generalize well to Python 3.x, which is more flexible in how *args
   and **kwargs are handled. This is not a problem in TF, since we have to remain
   compatible to Python 2.7 anyway.
@@ -710,7 +712,7 @@ def _generate_signature(func, reverse_index):
       elif ast_default is not None:
         default_text = (
             astor.to_source(ast_default).rstrip('\n').replace('\t', '\\t')
-              .replace('\n', '\\n').replace('"""', "'"))
+            .replace('\n', '\\n').replace('"""', "'"))
         default_text = PAREN_NUMBER_RE.sub('\\1', default_text)
 
         if default_text != repr(default):
@@ -745,8 +747,8 @@ def _generate_signature(func, reverse_index):
   # Add *args and *kwargs.
   if argspec.varargs:
     args_list.append('*' + argspec.varargs)
-  if argspec.keywords:
-    args_list.append('**' + argspec.keywords)
+  if argspec.varkw:
+    args_list.append('**' + argspec.varkw)
 
   return args_list
 
-- 
GitLab


From 8cf2a1f0db40174cd6feab96c07e47ba8349d11c Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Thu, 8 Mar 2018 14:18:54 -0800
Subject: [PATCH 0040/1262] fix encoding again

---
 tensorflow/tools/docs/generate_lib.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index a7ab0fa538..d9e8069a61 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -133,8 +133,12 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
     try:
       if not os.path.exists(directory):
         os.makedirs(directory)
+      # This function returns raw bytes in PY2 or unicode in PY3.
+      text = pretty_docs.build_md_page(page_info)
+      if six.PY3:
+        text = text.encode('utf-8')
       with open(path, 'wb') as f:
-        f.write(pretty_docs.build_md_page(page_info).encode('utf-8'))
+        f.write(text)
     except OSError as e:
       print('Cannot write documentation for %s to %s: %s' % (full_name,
                                                              directory, e))
-- 
GitLab


From b4db970c338123ee3156bb0e216193bde35d4b17 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 13 Mar 2018 00:04:33 +0800
Subject: [PATCH 0041/1262] fix broken link of tensor-like type

---
 tensorflow/docs_src/programmers_guide/graphs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index f28660d44a..81fd99cb4a 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -362,7 +362,7 @@ operations that are needed to compute the result.
 
 @{tf.Session.run} requires you to specify a list of **fetches**, which determine
 the return values, and may be a @{tf.Operation}, a @{tf.Tensor}, or
-a [tensor-like type](#tensor-like-objects) such as @{tf.Variable}. These fetches
+a [tensor-like type](#tensor-like_objects) such as @{tf.Variable}. These fetches
 determine what **subgraph** of the overall @{tf.Graph} must be executed to
 produce the result: this is the subgraph that contains all operations named in
 the fetch list, plus all operations whose outputs are used to compute the value
-- 
GitLab


From 66b38c5e7af4b607f393973d18aaabb6e00f9723 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Mon, 12 Mar 2018 12:56:59 -0700
Subject: [PATCH 0042/1262] Block docs for str, repr, hash.

No python2 code is generating useful docs for these, and in python3 many useless docs are generated, so I've blocked them.
---
 tensorflow/tools/docs/parser.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 5f2a411bae..95155b1149 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -1127,7 +1127,8 @@ class _ClassPageInfo(object):
       # Remove builtin members that we never want to document.
       if short_name in ['__class__', '__base__', '__weakref__', '__doc__',
                         '__module__', '__dict__', '__abstractmethods__',
-                        '__slots__', '__getnewargs__']:
+                        '__slots__', '__getnewargs__', '__str__',
+                        '__repr__', '__hash__']:
         continue
 
       child_name = '.'.join([self.full_name, short_name])
@@ -1172,7 +1173,7 @@ class _ClassPageInfo(object):
         # obvious what they do, don't include them in the docs if there's no
         # docstring.
         if not child_doc.brief.strip() and short_name in [
-            '__str__', '__repr__', '__hash__', '__del__', '__copy__']:
+            '__del__', '__copy__']:
           print('Skipping %s, defined in %s, no docstring.' % (child_name,
                                                                defining_class))
           continue
-- 
GitLab


From 1f03b013ef00c128cf8331f274524a23d86ac458 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 13 Mar 2018 16:44:57 +0800
Subject: [PATCH 0043/1262] revert wrong typo fix

---
 tensorflow/docs_src/programmers_guide/graphs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 81fd99cb4a..69eb6df5f6 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -508,7 +508,7 @@ is sufficient. However, TensorFlow also provides methods for manipulating
 the default graph, which can be useful in more advanced use cases. For example:
 
 * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
-  operation in a single graph must have an unique name. TensorFlow will
+  operation in a single graph must have a unique name. TensorFlow will
   "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on to
   their names if the requested name is already taken. Using multiple explicitly
   created graphs gives you more control over what name is given to each
-- 
GitLab


From d751b6bfa84dae1be9835fc40cc3094a8205a74e Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 13 Mar 2018 23:11:47 +0800
Subject: [PATCH 0044/1262] Fix link of typical distributed configuration

---
 tensorflow/docs_src/programmers_guide/graphs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 69eb6df5f6..e4095cf7dd 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -210,7 +210,7 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration,
+If you are deploying TensorFlow in a @{$distributed$typical distributed configuration},
 you might specify the job name and task ID to place variables on
 a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
-- 
GitLab


From b618740a8754e85a2a6ee142028105f76a4d5d58 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Fri, 16 Mar 2018 00:11:38 +0900
Subject: [PATCH 0045/1262] implement matrix 2-norm

---
 tensorflow/python/ops/linalg_ops.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 37470e00d7..110b766a6e 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -454,7 +454,7 @@ def norm(tensor,
 
   This function can compute several different vector norms (the 1-norm, the
   Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
-  matrix norms (Frobenius, 1-norm, and inf-norm).
+  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
 
   Args:
     tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
@@ -465,7 +465,7 @@ def norm(tensor,
       Some restrictions apply:
         a) The Frobenius norm `fro` is not defined for vectors,
         b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
-           `np.inf` are supported.
+           `2`, `np.inf` are supported.
       See the description of `axis` on how to compute norms for a batch of
       vectors or matrices stored in a tensor.
     axis: If `axis` is `None` (the default), the input is considered a vector
@@ -521,8 +521,7 @@ def norm(tensor,
         axis[0] == axis[1]):
       raise ValueError(
           "'axis' must be None, an integer, or a tuple of 2 unique integers")
-    # TODO(rmlarsen): Implement matrix 2-norm using tf.svd().
-    supported_matrix_norms = ['euclidean', 'fro', 1, np.inf]
+    supported_matrix_norms = ['euclidean', 'fro', 1, 2, np.inf]
     if ord not in supported_matrix_norms:
       raise ValueError("'ord' must be a supported matrix norm in %s, got %s" %
                        (supported_matrix_norms, ord))
@@ -539,10 +538,20 @@ def norm(tensor,
 
   with ops.name_scope(name, 'norm', [tensor]):
     tensor = ops.convert_to_tensor(tensor)
+    rank = len(tensor.get_shape().as_list())
+    axis = tuple(map(lambda i: i if i >= 0 else i + rank, axis))
+
     if ord in ['fro', 'euclidean', 2, 2.0]:
-      # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for
-      # matrices.
-      result = math_ops.sqrt(
+      if is_matrix_norm and ord in [2, 2.0]:
+        axes = list(range(rank))
+        perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
+        perm_after = list(map(lambda i: perm_before.index(i), axes))
+        result = array_ops.transpose(array_ops.expand_dims(math_ops.reduce_max(
+            gen_linalg_ops.svd(array_ops.transpose(tensor, perm=perm_before),
+                               compute_uv=False)[0], axis=-1, keepdims=True),
+            axis=-1), perm=perm_after)
+      else:
+        result = math_ops.sqrt(
           math_ops.reduce_sum(
               tensor * math_ops.conj(tensor), axis, keepdims=True))
     else:
-- 
GitLab


From a280a1d0cfd64831857826db639a3ee0180094de Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Fri, 16 Mar 2018 00:32:34 +0900
Subject: [PATCH 0046/1262] follow python coding style

---
 tensorflow/python/ops/linalg_ops.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 110b766a6e..b467711e3b 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -546,14 +546,15 @@ def norm(tensor,
         axes = list(range(rank))
         perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
         perm_after = list(map(lambda i: perm_before.index(i), axes))
-        result = array_ops.transpose(array_ops.expand_dims(math_ops.reduce_max(
-            gen_linalg_ops.svd(array_ops.transpose(tensor, perm=perm_before),
-                               compute_uv=False)[0], axis=-1, keepdims=True),
-            axis=-1), perm=perm_after)
+        result = array_ops.transpose(array_ops.expand_dims(
+            math_ops.reduce_max(gen_linalg_ops.svd(
+                array_ops.transpose(tensor, perm=perm_before),
+                compute_uv=False)[0], axis=-1, keepdims=True), axis=-1),
+                                     perm=perm_after)
       else:
         result = math_ops.sqrt(
-          math_ops.reduce_sum(
-              tensor * math_ops.conj(tensor), axis, keepdims=True))
+            math_ops.reduce_sum(
+                tensor * math_ops.conj(tensor), axis, keepdims=True))
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
-- 
GitLab


From cc10ac9b7d593375a7cee0c167c20989dc29e8cf Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Fri, 16 Mar 2018 00:40:05 +0900
Subject: [PATCH 0047/1262] remove unnecessary lambda

---
 tensorflow/python/ops/linalg_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index b467711e3b..db6ce71125 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -545,7 +545,7 @@ def norm(tensor,
       if is_matrix_norm and ord in [2, 2.0]:
         axes = list(range(rank))
         perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
-        perm_after = list(map(lambda i: perm_before.index(i), axes))
+        perm_after = list(map(perm_before.index, axes))
         result = array_ops.transpose(array_ops.expand_dims(
             math_ops.reduce_max(gen_linalg_ops.svd(
                 array_ops.transpose(tensor, perm=perm_before),
-- 
GitLab


From b21ceeb518ca9462a247d8be05870f12bebad201 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 15 Mar 2018 23:13:25 -0700
Subject: [PATCH 0048/1262] Enhancement with deprecated_argument_lookup for
 argmax

This fix makes some enhancement for argmax, using
deprecated_argument_lookup instread of customerized logic.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index e18d0e9501..9a88b71398 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -208,11 +208,9 @@ def argmax(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
-- 
GitLab


From 82571ca199869f60fe2036d15d0071031d997b47 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 15 Mar 2018 23:15:37 -0700
Subject: [PATCH 0049/1262] Enhancement with deprecated_argument_lookup for
 argmin

This fix makes some enhancement for argmin, using
deprecated_argument_lookup instread of customerized logic.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 9a88b71398..a2892d206d 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -226,11 +226,9 @@ def argmin(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
-- 
GitLab


From 52fef7f6b8b41d4fffa92bddcb78d96eb6333051 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Fri, 16 Mar 2018 16:03:26 +0900
Subject: [PATCH 0050/1262] fix typo

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 272410c693..7651a03fe5 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -398,7 +398,7 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNorms) {
 }
 
 TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithConcat) {
-  // Test axis is not 3, so all weigths and offsets are fused to each of inputs
+  // Test axis is not 3, so all weights and offsets are fused to each of inputs
   // of conv2d.
   TestFoldFusedBatchNormsWithConcat(/*split=*/true);
   // Test axis = 3, BatchNorm weights and offsets will be split before fused
-- 
GitLab


From 20424e92417b520d7ea8c7323eee46538d2b909f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 17 Mar 2018 09:30:24 +0800
Subject: [PATCH 0051/1262] CLN: remove the unused import: tf_export

---
 tensorflow/contrib/opt/python/training/adamax.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index fe5522a170..65918831e9 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -26,7 +26,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import adam
 from tensorflow.python.training import training_ops
-from tensorflow.python.util.tf_export import tf_export
 
 
 class AdaMaxOptimizer(adam.AdamOptimizer):
-- 
GitLab


From b5ebb7e9e5f5ae59e6db93bb5950f4bb68bf9e18 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sun, 18 Mar 2018 00:48:46 +0900
Subject: [PATCH 0052/1262] update norm_op_test

---
 tensorflow/python/kernel_tests/norm_op_test.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index d85512fae6..d6625b69ef 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -85,8 +85,6 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
     if ((not is_matrix_norm and ord_ == "fro") or
         (is_matrix_norm and is_fancy_p_norm)):
       self.skipTest("Not supported by neither numpy.linalg.norm nor tf.norm")
-    if is_matrix_norm and ord_ == 2:
-      self.skipTest("Not supported by tf.norm")
     if ord_ == 'euclidean' or (axis_ is None and len(shape) > 2):
       self.skipTest("Not supported by numpy.linalg.norm")
     matrix = np.random.randn(*shape_).astype(dtype_)
-- 
GitLab


From c53160a2a5decdae30bda6e8f40b45f3b4dd9f8e Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sun, 18 Mar 2018 00:49:13 +0900
Subject: [PATCH 0053/1262] use tf function instead of np

---
 tensorflow/python/ops/linalg_ops.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index db6ce71125..d8150d85b9 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
@@ -538,19 +539,27 @@ def norm(tensor,
 
   with ops.name_scope(name, 'norm', [tensor]):
     tensor = ops.convert_to_tensor(tensor)
-    rank = len(tensor.get_shape().as_list())
-    axis = tuple(map(lambda i: i if i >= 0 else i + rank, axis))
 
     if ord in ['fro', 'euclidean', 2, 2.0]:
       if is_matrix_norm and ord in [2, 2.0]:
-        axes = list(range(rank))
-        perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
-        perm_after = list(map(perm_before.index, axes))
-        result = array_ops.transpose(array_ops.expand_dims(
-            math_ops.reduce_max(gen_linalg_ops.svd(
-                array_ops.transpose(tensor, perm=perm_before),
-                compute_uv=False)[0], axis=-1, keepdims=True), axis=-1),
-                                     perm=perm_after)
+        rank = array_ops.rank(tensor)
+        axis = functional_ops.map_fn(
+            lambda i: control_flow_ops.cond(i >= 0, lambda: i,
+                                            lambda: i + rank),
+            ops.convert_to_tensor(axis)).eval()
+        axes = math_ops.range(rank)
+        perm_before = array_ops.concat(
+            [array_ops.setdiff1d(axes, axis)[0], axis], axis=0)
+        perm_after = functional_ops.map_fn(
+            lambda i: math_ops.cast(
+                array_ops.squeeze(
+                    array_ops.where(math_ops.equal(perm_before, i))),
+                dtype=dtypes.int32), axes)
+        permed = array_ops.transpose(tensor, perm=perm_before)
+        matrix_2_norm = array_ops.expand_dims(
+            math_ops.reduce_max(gen_linalg_ops.svd(permed, compute_uv=False)[0],
+                                axis=-1, keepdims=True), axis=-1)
+        result = array_ops.transpose(matrix_2_norm, perm=perm_after)
       else:
         result = math_ops.sqrt(
             math_ops.reduce_sum(
-- 
GitLab


From fda633fb7187da8522ef79555d1267996fa983bc Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sun, 18 Mar 2018 21:29:16 +0900
Subject: [PATCH 0054/1262] remove test code

---
 tensorflow/python/ops/linalg_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index d8150d85b9..608b72c574 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -546,7 +546,7 @@ def norm(tensor,
         axis = functional_ops.map_fn(
             lambda i: control_flow_ops.cond(i >= 0, lambda: i,
                                             lambda: i + rank),
-            ops.convert_to_tensor(axis)).eval()
+            ops.convert_to_tensor(axis))
         axes = math_ops.range(rank)
         perm_before = array_ops.concat(
             [array_ops.setdiff1d(axes, axis)[0], axis], axis=0)
-- 
GitLab


From a34a3b2035ca0cfd48488c03bd4b088070bf9a25 Mon Sep 17 00:00:00 2001
From: Mahmoud Abuzaina <mahmoud.abuzaina@intel.com>
Date: Thu, 22 Mar 2018 14:32:12 -0700
Subject: [PATCH 0055/1262] Fixing the issue where MKL-DNN is getting built
 when not using --config=mkl

---
 tensorflow/tensorflow.bzl | 53 +++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 9b0db8a112..8549c34691 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -788,7 +788,33 @@ def tf_cc_test_mkl(srcs,
                    tags=[],
                    size="medium",
                    args=None):
-  if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
+  for src in srcs:
+    native.cc_test(
+      name=src_to_test_name(src),
+      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
+      copts=tf_copts(),
+      linkopts=select({
+        clean_dep("//tensorflow:android"): [
+            "-pie",
+          ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
+        "//conditions:default": [
+            "-lpthread",
+            "-lm"
+        ],
+      }) + _rpath_linkopts(src_to_test_name(src)),
+      deps=deps + if_mkl(
+          [
+              "//third_party/mkl:intel_binary_blob",
+          ],
+      ),
+      linkstatic=linkstatic,
+      tags=tags,
+      size=size,
+      args=args,
+      nocopts="-fno-exceptions")
+
 
 def tf_cc_tests_gpu(srcs,
                     deps,
@@ -1006,16 +1032,12 @@ register_extension_info(
 def tf_mkl_kernel_library(name,
                           prefix=None,
                           srcs=None,
-                          gpu_srcs=None,
                           hdrs=None,
                           deps=None,
                           alwayslink=1,
                           copts=tf_copts(),
-                          nocopts="-fno-exceptions",
-                          **kwargs):
+                          nocopts="-fno-exceptions"):
   """A rule to build MKL-based TensorFlow kernel libraries."""
-  gpu_srcs = gpu_srcs  # unused argument
-  kwargs = kwargs  # unused argument
 
   if not bool(srcs):
     srcs = []
@@ -1028,16 +1050,15 @@ def tf_mkl_kernel_library(name,
     hdrs = hdrs + native.glob(
         [prefix + "*.h"])
 
-  if_mkl(
-      native.cc_library(
-          name=name,
-          srcs=srcs,
-          hdrs=hdrs,
-          deps=deps,
-          alwayslink=alwayslink,
-          copts=copts,
-          nocopts=nocopts
-      ))
+  native.cc_library(
+      name=name,
+      srcs=if_mkl(srcs),
+      hdrs=hdrs,
+      deps=deps,
+      alwayslink=alwayslink,
+      copts=copts,
+      nocopts=nocopts
+  )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
-- 
GitLab


From 341f906e7b6011de4d4a10380a17040abc8bdf5e Mon Sep 17 00:00:00 2001
From: Sami Kama <samikama@users.noreply.github.com>
Date: Thu, 22 Mar 2018 23:49:47 -0700
Subject: [PATCH 0056/1262] Do not follow control edges in segmenter and in
 conversion and (#17936)

gracefully handle some failures

(cherry picked from commit 5daa95eeeae66b21fc60e08bf0f7c35b3df517f6)
(cherry picked from commit ee87a13583001dd9b19cb5272f85d227ad59297f)
(cherry picked from commit 9a1e6b0e9ca25da050f5a1866235189e6db528ae)

and squashed
---
 .../contrib/tensorrt/convert/convert_graph.cc | 19 +++--
 .../contrib/tensorrt/convert/convert_nodes.cc | 82 +++++++++++++------
 .../contrib/tensorrt/segment/segment.cc       | 55 +++++++++----
 tensorflow/contrib/tensorrt/segment/segment.h |  4 +-
 .../contrib/tensorrt/segment/segment_test.cc  |  8 +-
 5 files changed, 109 insertions(+), 59 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 90447ee666..ff8cc6374d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -49,7 +49,7 @@ namespace tensorrt {
 namespace convert {
 namespace {
 
-bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+bool IsTensorRTCandidate(const tensorflow::Node* node) {
   // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
@@ -75,7 +75,7 @@ bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
       // TODO(ben,jie): ...
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
-  return candidate_ops.count(node_def.op());
+  return candidate_ops.count(node->type_string());
 }
 
 void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
@@ -85,10 +85,10 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
     const tensorflow::Node* node = graph.FindNodeId(node_id);
     for (const tensorflow::Edge* edge : node->in_edges()) {
       if (!subgraph_node_ids.count(edge->src()->id()) &&
-          !edge->src()->IsSource()) {
+          !edge->src()->IsSource() && !edge->IsControlEdge()) {
         incoming_edges->insert(edge);
       } else {
-        VLOG(2) << edge->src()->name() << " N, ";
+        VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
       }
     }
   }
@@ -101,11 +101,11 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     const tensorflow::Node* node = graph.FindNodeId(node_id);
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
-          !edge->dst()->IsSink()) {
-        VLOG(2) << edge->dst()->name() << " Y, ";
+          !edge->dst()->IsSink() && !edge->IsControlEdge()) {
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
         outgoing_edges->insert(edge);
       } else {
-        VLOG(2) << edge->dst()->name() << " N, ";
+        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
       }
     }
   }
@@ -410,8 +410,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
       if (status != tensorflow::Status::OK()) {
         LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
-                     << " due to: \n"
-                     << status.ToString() << " SKIPPING......";
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
       }
       count++;
     }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 979b5648c2..f22502aaeb 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -53,8 +53,8 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
+using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
-
 namespace {
 
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
@@ -429,9 +429,8 @@ class Converter {
   tensorflow::tensorrt::TRTWeightStore* weight_store_;
   bool fp16_;
   void register_op_converters();
-  std::vector<TRT_TensorOrWeights> get_inputs(
-      const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs;
+  tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights>* inputs) {
     for (auto const& input_name : node_def.input()) {
       /*************************************************************************
        * TODO(jie) handle case 1) here
@@ -452,13 +451,17 @@ class Converter {
 
       VLOG(2) << "retrieve input: " << name;
       if (trt_tensors_.count(name)) {
-        inputs.push_back(trt_tensors_.at(name));
+        inputs->push_back(trt_tensors_.at(name));
       } else {
-        LOG(FATAL) << "input: " << name << " not availabled for node at, "
-                   << node_def.name();
+        string str("Node ");
+        StrAppend(&str, node_def.name(), " should have an input named '", name,
+                  "' but it is not available");
+        LOG(WARNING) << "input: " << name << " not available for node at "
+                     << node_def.name();
+        return tensorflow::errors::InvalidArgument(str);
       }
     }
-    return inputs;
+    return tensorflow::Status::OK();
   }
 
  public:
@@ -482,7 +485,8 @@ class Converter {
   }
 
   tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs = this->get_inputs(node_def);
+    std::vector<TRT_TensorOrWeights> inputs;
+    TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
     string op = node_def.op();
     if (!op_registry_.count(op)) {
       return tensorflow::errors::Unimplemented(
@@ -887,7 +891,7 @@ tensorflow::Status BinaryTensorOpWeight(
 
   // Check type consistency
   nvinfer1::DataType ttype;
-  TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
+  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype));
 
   // Check scale mode
   auto dims_w = weights.shape_;
@@ -1152,9 +1156,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
+    return tensorflow::errors::Unimplemented(
+        "binary op: " + node_def.op() +
+        " not supported at: " + node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1397,8 +1401,11 @@ tensorflow::Status ConvertConst(Converter& ctx,
           scalar_shape.d[0] = weights_tensor.float_val_size();
           scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
         } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
+          LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
+                       << " kUNIFORM, at: " << node_def.name();
+          string err_str("Broadcast method is not supported for '");
+          StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
+          return tensorflow::errors::InvalidArgument(err_str);
         }
       }
     } else {
@@ -1436,8 +1443,11 @@ tensorflow::Status ConvertConst(Converter& ctx,
           scalar_shape.d[0] = weights_tensor.int_val_size();
           scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
         } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
+          LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
+                       << " kUNIFORM, at: " << node_def.name();
+          string err_str("Broadcast method is not supported for '");
+          StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
+          return tensorflow::errors::InvalidArgument(err_str);
         }
       }
     } else {
@@ -2139,8 +2149,11 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   calib_res->thr_->join();
   delete calib_res->thr_;
   if (!calib_res->engine_) {
-    LOG(FATAL) << "Calibration failed!, engine is nullptr. Did you run "
+    LOG(ERROR) << "Calibration failed!, engine does not exist. Did you run "
                   "calibration graph?";
+    return tensorflow::errors::FailedPrecondition(
+        "Calibration graph needs to be executed on"
+        " calibration data before convertsion to inference graph");
   }
   auto weight_rmgr = trt_rm->getManager("WeightStore");
   TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
@@ -2177,7 +2190,7 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
     return status;
   }
   auto trt_engine_node = graph.AddNode(engine_node, &status);
-  TF_CHECK_OK(status);
+  TF_RETURN_IF_ERROR(status);
   for (size_t i = 0; i < out_edges.size(); i++) {
     VLOG(1) << "Connecting trt_engine_node output " << i << " with "
             << out_edges.at(i)->dst()->name() << " port "
@@ -2275,6 +2288,12 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
+    auto type_status = ConvertDType(tf_dtype, &dtype);
+    if (type_status != tensorflow::Status::OK()) {
+      LOG(WARNING) << "Data type conversion for input '" << node_name
+                   << "' failed";
+      return type_status;
+    }
     TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
     VLOG(2) << "accessing output index of: " << output_idx
@@ -2342,8 +2361,8 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
+      return tensorflow::errors::InvalidArgument("Output node'" + tensor_name +
+                                                 "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (!tensor) {
@@ -2500,7 +2519,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
+    auto type_status = ConvertDType(tf_dtype, &dtype);
+    if (type_status != tensorflow::Status::OK()) {
+      LOG(WARNING) << "Type conversion failed for " << node_name;
+      return type_status;
+    }
 
     VLOG(2) << "Accessing output index of: " << output_idx
             << ", at node: " << node_name
@@ -2511,8 +2534,12 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
     // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
     //            update the code once TRT 4.0 comes out.
-    if (op_info.shape().dim_size() != 4)
-      return tensorflow::errors::Unimplemented("require 4 dimensional input");
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
 
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
@@ -2573,8 +2600,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
+      return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
+                                                 "' is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (!tensor) {
@@ -2618,7 +2645,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
   TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
       engine_name, engine_name));
-  LOG(INFO) << "finished engine " << engine_name;
+  LOG(INFO) << "finished engine " << engine_name << " containing "
+            << s.subgraph_node_ids.size() << " nodes";
 
   // Build the TRT op
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 6193f0b0a1..8fc4697c51 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -80,13 +80,20 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
                                                 dst->in_edges().end());
   for (const tensorflow::Edge* in_edge : in_edges) {
-    if (in_edge->src() != src) {
-      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
-      if (e->src() == graph->source_node()) {
-        graph->AddEdge(e->src(), e->src_output(), src,
-                       tensorflow::Graph::kControlSlot);
-      } else {
-        graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+    if (in_edge->IsControlEdge()) {
+      if (in_edge->src() != src) {
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        graph->AddControlEdge(e->src(), src);
+      }
+    } else {
+      if (in_edge->src() != src) {
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+        if (e->src() == graph->source_node()) {
+          graph->AddEdge(e->src(), e->src_output(), src,
+                         tensorflow::Graph::kControlSlot);
+        } else {
+          graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+        }
       }
     }
   }
@@ -94,12 +101,19 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
   std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
                                                  dst->out_edges().end());
   for (const tensorflow::Edge* out_edge : out_edges) {
-    tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
-    if (e->dst() == graph->sink_node()) {
-      graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
-                     e->dst_input());
+    if (out_edge->IsControlEdge()) {
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      graph->AddControlEdge(src, e->dst());
     } else {
-      graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+      if (e->dst() == graph->sink_node()) {
+        VLOG(1) << " edge to sink node " << src->name() << " -> "
+                << e->dst()->name();
+        graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
+                       e->dst_input());
+      } else {
+        graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+      }
     }
   }
 
@@ -118,7 +132,7 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
 
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments) {
   // Create a Graph representation of the GraphDef.
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -136,7 +150,7 @@ tensorflow::Status SegmentGraph(
   for (int i = 0; i < graph.num_node_ids(); ++i) {
     tensorflow::Node* node = graph.FindNodeId(i);
     if (options.exclude_node_list.count(node->name()) != 0 ||
-        !candidate_fn(node->def())) {
+        !candidate_fn(node)) {
       node = nullptr;
     }
     node_segments.emplace_back(node);
@@ -155,7 +169,7 @@ tensorflow::Status SegmentGraph(
 
   for (const tensorflow::Node* node : order) {
     // All output nodes of 'node' have been visited...
-    VLOG(2) << "Trying node " << node->name();
+    VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
 
     // 'node' must be a TRT candidate...
     if (node_segments[node->id()].Value() == nullptr) {
@@ -169,8 +183,12 @@ tensorflow::Status SegmentGraph(
     while (true) {
       std::set<const tensorflow::Edge*> contract_edges;
       for (const tensorflow::Edge* out_edge : node->out_edges()) {
-        VLOG(2) << "... out node " << out_edge->dst()->name();
-
+        VLOG(2) << "... out node " << out_edge->dst()->name() << " ( "
+                << out_edge->dst()->id() << " <- " << node->id() << " )";
+        if (out_edge->IsControlEdge()) {
+          VLOG(2) << "... ... Control Edge, Skipping";
+          continue;
+        }
         // Out node must be TRT candidate...
         if (node_segments[out_edge->dst()->id()].Value() == nullptr) {
           VLOG(2) << "... ... not a TRT candidate";
@@ -196,7 +214,8 @@ tensorflow::Status SegmentGraph(
         const tensorflow::Node* src = contract_edge->src();
         const tensorflow::Node* dst = contract_edge->dst();
 
-        VLOG(2) << "Merge " << src->name() << " <- " << dst->name();
+        VLOG(2) << "Merge " << src->name() << " <- " << dst->name() << " ("
+                << src->id() << " <- " << dst->id();
         node_segments[src->id()].Merge(&node_segments[dst->id()]);
 
         // Contracting the edge leaves disconnected graph edges.
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index ee6e2b3ed2..7e8685f44a 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -20,10 +20,12 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+
 namespace tensorrt {
 namespace segment {
 
@@ -46,7 +48,7 @@ struct SegmentOptions {
 // @return the status.
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments);
 
 }  // namespace segment
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 74cbc5f2b3..7ddabec268 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -35,7 +35,7 @@ class SegmentTest : public ::testing::Test {
   TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                     TF_Status* s, const char* name);
 
-  std::function<bool(const NodeDef&)> MakeCandidateFn(
+  std::function<bool(const Node*)> MakeCandidateFn(
       const std::set<string>& node_names);
 
  protected:
@@ -60,10 +60,10 @@ bool SegmentTest::GetGraphDef(TF_Graph* graph,
   return ret;
 }
 
-std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
+std::function<bool(const Node*)> SegmentTest::MakeCandidateFn(
     const std::set<string>& node_names) {
-  return [node_names](const NodeDef& node) -> bool {
-    return node_names.find(node.name()) != node_names.end();
+  return [node_names](const Node* node) -> bool {
+    return node_names.find(node->name()) != node_names.end();
   };
 }
 
-- 
GitLab


From ad61950fb5db57aa5a4089203a1a4bf48df8c5f4 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Fri, 23 Mar 2018 14:12:34 -0700
Subject: [PATCH 0057/1262] Instead of depending on ctest to be in PATH,
 directly reference the binary. (#17964)

PiperOrigin-RevId: 190137278
---
 tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index b537192a94..97829892b1 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Set ctest binary location.
+IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe")
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -47,4 +50,4 @@ if %errorlevel% neq 0 exit /b %errorlevel%
 
 :: Run all python tests if the installation succeeded.
 echo Running tests...
-ctest -C Release --output-on-failure --jobs 1
+%CTEST_EXE% -C Release --output-on-failure --jobs 1
-- 
GitLab


From 3fbdba0c84941f34782a5e074b691916bca61a93 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Mon, 26 Mar 2018 11:49:03 -0700
Subject: [PATCH 0058/1262] update GPU installation instructions

---
 tensorflow/docs_src/install/install_linux.md | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 378946b459..3c5db9bced 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -33,7 +33,7 @@ must be installed on your system:
 
   * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
-    Ensure that you append the relevant Cuda pathnames to the
+    Ensure that you append the relevant CUDA pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
   * The NVIDIA drivers associated with CUDA Toolkit 9.0.
@@ -56,7 +56,7 @@ must be installed on your system:
     and add its path to your `LD_LIBRARY_PATH` environment variable:
 
     <pre>
-    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b>
+    $ <b>export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</b>
     </pre>
 
     For CUDA Toolkit <= 7.5 do:
@@ -64,6 +64,16 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
+  * **[OPTIONAL]**  For optimized inferencing performance, you can also install
+    NVIDIA TensorRT 3.0. For details, see
+    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
+    Only steps 1-4 in the TensorRT Tar File installation instructions are
+    required for compatibility with TensorFlow; the Python package installation
+    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+
+    **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
+    package, please use the Ubuntu **14.04** tar file package of TensorRT
+    even when installing onto an Ubuntu 16.04 system.   
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
-- 
GitLab


From ea644ac0783537a6ac8a2c8a2432829b3db69aeb Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 26 Mar 2018 13:05:52 -0700
Subject: [PATCH 0059/1262] Disabling the state_management_test.

For non-pip builds also.
---
 tensorflow/contrib/timeseries/python/timeseries/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 64f5cd8357..d72cc1b8a2 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -233,6 +233,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "manual",
         "no_pip",  # b/64527635
         "no_pip_gpu",  # b/63391119
     ],
-- 
GitLab


From 1fcef75aaa1989376324ff8dfc25033b443a69df Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 26 Mar 2018 13:48:00 -0700
Subject: [PATCH 0060/1262] Update BUILD

---
 tensorflow/contrib/timeseries/python/timeseries/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index d72cc1b8a2..67ee644d3b 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -233,7 +233,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
-        "manual",
+        "no_oss",
         "no_pip",  # b/64527635
         "no_pip_gpu",  # b/63391119
     ],
-- 
GitLab


From 083cf6b91a380641933457a4301f9b1efa13af92 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 25 Oct 2017 17:03:15 +0000
Subject: [PATCH 0061/1262] Add customerized kernel implementation for
 clip_by_value

This fix tries to address the issue raised in 7225 where
`tf.clip_by_value` does not have a custom kernel and reused
`tf.maximum` and `tf.mimimum`. In case scalar values are passed
to `tf.clip_by_value`, unnecessary memory might incur.

This fix adds the customerized kernel implementation for
`tf.clip_by_value`.

This fix fixes 7225.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/cwise_op_clip.cc | 150 +++++++++++++++++++++++
 tensorflow/core/ops/math_ops.cc          |  23 ++++
 2 files changed, 173 insertions(+)
 create mode 100644 tensorflow/core/kernels/cwise_op_clip.cc

diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
new file mode 100644
index 0000000000..6ce062b08f
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -0,0 +1,150 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+//#include "third_party/eigen3/Eigen/Core/CwiseTernaryOp.h"
+
+namespace tensorflow {
+
+// Unary functor for clip
+template <typename T>
+struct UnaryClipOp {
+  UnaryClipOp(const T& value_min, const T& value_max)
+      : value_min_(value_min), value_max_(value_max) {}
+  const T operator()(const T& value) const {
+    return std::max(std::min(value, value_max_), value_min_);
+  }
+  T value_min_;
+  T value_max_;
+};
+
+// Binary functor for clip
+template <typename T>
+struct BinaryClipMinOp {
+  BinaryClipMinOp(const T& value_min) : value_min_(value_min) {}
+  const T operator()(const T& value, const T& value_max) const {
+    return std::max(std::min(value, value_max), value_min_);
+  }
+  T value_min_;
+};
+
+// Binary functor for clip
+template <typename T>
+struct BinaryClipMaxOp {
+  BinaryClipMaxOp(const T& value_max) : value_max_(value_max) {}
+  const T operator()(const T& value, const T& value_min) const {
+    return std::max(std::min(value, value_max_), value_min);
+  }
+  T value_max_;
+};
+
+// Basic coefficient-wise tenary operations.
+// This is the case for example of the clip_by_value.
+//   Device: E.g., CPUDevice, GPUDevice.
+//   Functor: defined above. E.g., functor::clip.
+template <typename Device, typename T>
+class TenaryOp : public OpKernel {
+ public:
+  explicit TenaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& in0 = ctx->input(0);
+    const Tensor& in1 = ctx->input(1);
+    const Tensor& in2 = ctx->input(2);
+
+    auto in0_flat = in0.flat<T>();
+    auto in1_flat = in1.flat<T>();
+    auto in2_flat = in2.flat<T>();
+    const Device& d = ctx->eigen_device<Device>();
+
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    auto out_flat = out->flat<T>();
+    if (in1.shape() == in2.shape()) {
+      if (in0.shape() == in1.shape()) {
+        out_flat = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+      } else {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        out_flat = in0_flat.unaryExpr(UnaryClipOp<T>(in1_flat(0), in2_flat(0)));
+      }
+    } else {
+      if (in0.shape() == in1.shape()) {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        out_flat =
+            in0_flat.binaryExpr(in1_flat, BinaryClipMaxOp<T>(in2_flat(0)));
+
+      } else {
+        OP_REQUIRES(ctx, (in0.shape() == in2.shape() &&
+                          TensorShapeUtils::IsScalar(in1.shape())),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        out_flat =
+            in0_flat.binaryExpr(in2_flat, BinaryClipMinOp<T>(in1_flat(0)));
+      }
+    }
+  }
+};
+
+#define REGISTER_CPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      TenaryOp<CPUDevice, type>);
+
+REGISTER_CPU_KERNEL(Eigen::half);
+REGISTER_CPU_KERNEL(float);
+REGISTER_CPU_KERNEL(double);
+REGISTER_CPU_KERNEL(int8);
+REGISTER_CPU_KERNEL(int16);
+REGISTER_CPU_KERNEL(int32);
+REGISTER_CPU_KERNEL(int64);
+REGISTER_CPU_KERNEL(uint8);
+REGISTER_CPU_KERNEL(uint16);
+
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+// REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("ClipByValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("t")
+                            .HostMemory("clip_value_min")
+                            .HostMemory("clip_value_min")
+                            .TypeConstraint<int32>("T"),
+                        TenaryOp<CPUDevice, int32>);
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f33d51d5a..602a6ec115 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1558,6 +1558,29 @@ REGISTER_OP("Bucketize")
     .Attr("boundaries: list(float)")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("ClipByValue")
+    .Input("t: T")
+    .Input("clip_value_min: T")
+    .Input("clip_value_max: T")
+    .Output("output: T")
+    .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Clips tensor values to a specified min and max.
+
+Given a tensor `t`, this operation returns a tensor of the same type and
+shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+greater than `clip_value_max` are set to `clip_value_max`.
+
+t: A `Tensor`.
+clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+  as `t`. The minimum value to clip by.
+clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+  as `t`. The maximum value to clip by.
+output: A clipped `Tensor` with the same shape as input 't'.
+)doc");
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
     .Input("inputs: N * T")
-- 
GitLab


From daf0b206b5afde875a19270136ad22d9d2bb138c Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 25 Oct 2017 17:08:32 +0000
Subject: [PATCH 0062/1262] Add python wrapper for tf.clip_by_value

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/clip_ops.py    |  17 +-
 tensorflow/python/ops/hidden_ops.txt | 395 +++++++++++++++++++++++++++
 2 files changed, 400 insertions(+), 12 deletions(-)
 create mode 100644 tensorflow/python/ops/hidden_ops.txt

diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 49f8c66531..a5baebb3f6 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -58,18 +59,10 @@ def clip_by_value(t, clip_value_min, clip_value_max,
   """
   with ops.name_scope(name, "clip_by_value",
                       [t, clip_value_min, clip_value_max]) as name:
-    t = ops.convert_to_tensor(t, name="t")
-
-    # Go through list of tensors, for each value in each tensor clip
-    t_min = math_ops.minimum(t, clip_value_max)
-    # Assert that the shape is compatible with the initial shape,
-    # to prevent unintentional broadcasting.
-    _ = t.shape.merge_with(t_min.shape)
-
-    t_max = math_ops.maximum(t_min, clip_value_min, name=name)
-    _ = t.shape.merge_with(t_max.shape)
-
-  return t_max
+    return gen_math_ops._clip_by_value(t,
+                                       clip_value_min,
+                                       clip_value_max,
+                                       name=name)
 
 
 @tf_export("clip_by_norm")
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
new file mode 100644
index 0000000000..e1217e984c
--- /dev/null
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -0,0 +1,395 @@
+# array_ops
+BatchToSpace
+BroadcastArgs
+BroadcastGradientArgs
+ConcatOffset
+Concat
+ConcatV2
+ConjugateTranspose
+Const
+DebugGradientIdentity
+DebugGradientRefIdentity
+EditDistance
+ExpandDims
+ListDiff
+MirrorPad
+MirrorPadGrad
+OneHot
+Pack
+Pad
+PadV2
+ParallelConcat
+Placeholder
+RefIdentity
+Reverse
+Snapshot
+SpaceToBatch
+Split
+SplitV
+Squeeze
+Slice
+TileGrad  # Exported through array_grad instead of array_ops.
+ZerosLike  # TODO(josh11b): Use this instead of the Python version.
+Unique
+UniqueV2
+UniqueWithCounts
+UniqueWithCountsV2
+Unpack
+
+# candidate_sampling_ops
+AllCandidateSampler
+ComputeAccidentalHits
+FixedUnigramCandidateSampler
+LearnedUnigramCandidateSampler
+LogUniformCandidateSampler
+ThreadUnsafeUnigramCandidateSampler
+UniformCandidateSampler
+
+# checkpoint_ops
+GenerateVocabRemapping
+LoadAndRemapMatrix
+
+
+# control_flow_ops
+Switch
+Merge
+RefMerge
+Exit
+RefExit
+
+# ctc_ops
+CTCLoss
+CTCGreedyDecoder
+CTCBeamSearchDecoder
+
+# data_flow_ops
+Barrier
+BarrierClose
+BarrierIncompleteSize
+BarrierInsertMany
+BarrierReadySize
+BarrierTakeMany
+DeleteSessionTensor
+FakeQueue
+FIFOQueue
+FIFOQueueV2
+GetSessionHandle
+GetSessionHandleV2
+GetSessionTensor
+HashTable
+HashTableV2
+InitializeTable
+InitializeTableV2
+InitializeTableFromTextFile
+InitializeTableFromTextFileV2
+LookupTableExport
+LookupTableExportV2
+LookupTableFind
+LookupTableFindV2
+LookupTableImport
+LookupTableImportV2
+LookupTableInsert
+LookupTableInsertV2
+LookupTableSize
+LookupTableSizeV2
+MutableDenseHashTable
+MutableDenseHashTableV2
+MutableHashTable
+MutableHashTableV2
+MutableHashTableOfTensors
+MutableHashTableOfTensorsV2
+Mutex
+MutexAcquire
+MutexRelease
+PaddingFIFOQueue
+PaddingFIFOQueueV2
+PriorityQueue
+PriorityQueueV2
+QueueClose
+QueueCloseV2
+QueueDequeue
+QueueDequeueV2
+QueueDequeueMany
+QueueDequeueManyV2
+QueueDequeueUpTo
+QueueDequeueUpToV2
+QueueEnqueue
+QueueEnqueueV2
+QueueEnqueueMany
+QueueEnqueueManyV2
+QueueSize
+QueueSizeV2
+RandomShuffleQueue
+RandomShuffleQueueV2
+Stack
+StackClose
+StackPop
+StackPush
+StackV2
+StackCloseV2
+StackPopV2
+StackPushV2
+TensorArray
+TensorArrayClose
+TensorArrayCloseV2
+TensorArrayConcat
+TensorArrayConcatV2
+TensorArrayGather
+TensorArrayGatherV2
+TensorArrayGrad
+TensorArrayGradV2
+TensorArrayPack
+TensorArrayPackV2
+TensorArrayRead
+TensorArrayReadV2
+TensorArrayScatter
+TensorArrayScatterV2
+TensorArraySize
+TensorArraySizeV2
+TensorArraySplit
+TensorArraySplitV2
+TensorArrayUnpack
+TensorArrayUnpackV2
+TensorArrayV2
+TensorArrayWrite
+TensorArrayWriteV2
+TensorArrayV3
+TensorArrayCloseV3
+TensorArrayConcatV3
+TensorArrayGatherV3
+TensorArrayGradV3
+TensorArrayReadV3
+TensorArrayPackV3
+TensorArrayScatterV3
+TensorArraySizeV3
+TensorArraySplitV3
+TensorArrayUnpackV3
+TensorArrayWriteV3
+
+# functional_ops
+SymbolicGradient
+
+# image_ops
+AdjustContrastv2
+NonMaxSuppression
+NonMaxSuppressionV2
+RandomCrop
+ResizeBilinearGrad
+ResizeBicubicGrad
+ResizeNearestNeighborGrad
+SampleDistortedBoundingBox
+SampleDistortedBoundingBoxV2
+ScaleImageGrad
+
+# io_ops
+FixedLengthRecordReader
+IdentityReader
+ReaderNumRecordsProduced
+ReaderNumWorkUnitsCompleted
+ReaderRead
+ReaderReadUpTo
+ReaderReset
+ReaderRestoreState
+ReaderSerializeState
+ReaderWorkQueueLength
+FixedLengthRecordReaderV2
+IdentityReaderV2
+ReaderNumRecordsProducedV2
+ReaderNumWorkUnitsCompletedV2
+ReaderReadV2
+ReaderReadUpToV2
+ReaderResetV2
+ReaderRestoreStateV2
+ReaderSerializeStateV2
+ReaderWorkQueueLengthV2
+Restore
+RestoreSlice
+Save
+SaveSlices
+ShardedFilename
+ShardedFilespec
+TextLineReader
+TFRecordReader
+WholeFileReader
+TextLineReaderV2
+TFRecordReaderV2
+WholeFileReaderV2
+LMDBReader
+DecodeCSV
+
+# linalg_ops
+BatchCholesky
+BatchCholeskyGrad
+BatchMatrixDeterminant
+BatchMatrixInverse
+BatchMatrixSolve
+BatchMatrixSolveLs
+BatchMatrixTriangularSolve
+BatchSelfAdjointEig
+BatchSelfAdjointEigV2
+BatchSvd
+LogMatrixDeterminant
+MatrixExponential
+MatrixLogarithm
+MatrixSolveLs
+SelfAdjointEig
+SelfAdjointEigV2
+Svd
+
+# logging_ops
+Assert
+AudioSummary
+AudioSummaryV2
+HistogramSummary
+ImageSummary
+MergeSummary
+Print
+ScalarSummary
+TensorSummary
+TensorSummaryV2
+
+# math_ops
+Abs
+AccumulateNV2
+AddN
+AddV2
+All
+Any
+BatchMatMul
+BatchFFT
+BatchFFT2D
+BatchFFT3D
+BatchIFFT
+BatchIFFT2D
+BatchIFFT3D
+Bucketize
+ClipByValue
+Complex
+ComplexAbs
+Conj
+FloorDiv
+FloorMod
+HistogramFixedWidth
+Max
+Mean
+Min
+Mul
+Neg
+Pow
+Prod
+Range
+RealDiv
+Select
+SparseMatMul
+Sub
+Sum
+MatMul
+Sigmoid
+Tanh
+SigmoidGrad
+TanhGrad
+InvGrad
+ReciprocalGrad
+SqrtGrad
+RsqrtGrad
+TruncateDiv
+TruncateMod
+
+# nn_ops
+AvgPoolGrad  # "*Grad" accessible through nn_grad instead of nn_ops.
+AvgPool3DGrad
+BatchNormWithGlobalNormalization
+BatchNormWithGlobalNormalizationGrad
+FusedBatchNorm
+FusedBatchNormV2
+SoftmaxCrossEntropyWithLogits
+SparseSoftmaxCrossEntropyWithLogits
+LRNGrad
+MaxPoolGrad
+MaxPoolGradWithArgmax
+MaxPoolGradGrad
+MaxPoolGradGradWithArgmax
+MaxPool3DGrad
+MaxPool3DGradGrad
+ReluGrad
+Relu6Grad
+EluGrad
+SeluGrad
+SoftplusGrad
+SoftsignGrad
+TopK
+TopKV2
+BiasAdd
+BiasAddV1
+Relu6
+AvgPool
+MaxPool
+MaxPoolV2
+Softmax
+LogSoftmax
+FractionalAvgPoolGrad
+FractionalMaxPoolGrad
+InTopK
+InTopKV2
+
+# parsing_ops
+ParseExample
+ParseSingleSequenceExample
+
+# random_ops
+RandomGamma
+RandomPoisson
+RandomUniform
+RandomUniformInt
+RandomShuffle
+RandomStandardNormal
+ParameterizedTruncatedNormal
+TruncatedNormal
+
+# script_ops
+PyFunc
+PyFuncStateless
+EagerPyFunc
+
+# sdca_ops
+
+# state_ops
+Variable
+VariableV2
+TemporaryVariable
+DestroyTemporaryVariable
+
+# sparse_ops
+AddSparseToTensorsMap
+AddManySparseToTensorsMap
+TakeManySparseFromTensorsMap
+DeserializeManySparse
+DeserializeSparse
+SerializeManySparse
+SerializeSparse
+SparseAdd
+SparseAddGrad
+SparseConcat
+SparseCross
+SparseFillEmptyRows
+SparseFillEmptyRowsGrad
+SparseSplit
+SparseSelectLastK
+SparseReorder
+SparseReshape
+SparseToDense
+SparseTensorDenseAdd
+SparseTensorDenseMatMul
+
+# string_ops
+StringSplit
+
+# user_ops
+Fact
+
+# training_ops
+# (None)
+
+# word2vec deprecated ops
+NegTrain
+Skipgram
-- 
GitLab


From 90a271e7a37574fc1c90fd6042c3b3972645d114 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 25 Oct 2017 17:09:05 +0000
Subject: [PATCH 0063/1262] Update tests for `tf.clip_by_value`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/clip_ops_test.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 5c8b71da17..d47930350e 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.platform import test
@@ -42,10 +43,12 @@ class ClipTest(test.TestCase):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
-      with self.assertRaises(ValueError):
-        _ = clip_ops.clip_by_value(x, -clip, clip)
-      with self.assertRaises(ValueError):
-        _ = clip_ops.clip_by_value(x, 1.0, clip)
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        ans = clip_ops.clip_by_value(x, -clip, clip)
+        tf_ans = ans.eval()
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        ans = clip_ops.clip_by_value(x, 1.0, clip)
+        tf_ans = ans.eval()
 
   def testClipByValueNonFinite(self):
     with self.test_session():
-- 
GitLab


From cff8abcb1a9305491637dc44559316aa1d8184e6 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 26 Oct 2017 04:37:55 +0000
Subject: [PATCH 0064/1262] Add GPU kernel for tf.clip_by_value

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/cwise_op_clip.cc      | 162 +++++++++++++-----
 tensorflow/core/kernels/cwise_op_clip.h       |  61 +++++++
 .../core/kernels/cwise_op_clip_gpu.cu.cc      | 134 +++++++++++++++
 3 files changed, 313 insertions(+), 44 deletions(-)
 create mode 100644 tensorflow/core/kernels/cwise_op_clip.h
 create mode 100644 tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc

diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index 6ce062b08f..c2980acdd8 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -13,43 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/cwise_ops_common.h"
-
-//#include "third_party/eigen3/Eigen/Core/CwiseTernaryOp.h"
+#include "tensorflow/core/kernels/cwise_op_clip.h"
 
 namespace tensorflow {
 
-// Unary functor for clip
-template <typename T>
-struct UnaryClipOp {
-  UnaryClipOp(const T& value_min, const T& value_max)
-      : value_min_(value_min), value_max_(value_max) {}
-  const T operator()(const T& value) const {
-    return std::max(std::min(value, value_max_), value_min_);
-  }
-  T value_min_;
-  T value_max_;
-};
-
-// Binary functor for clip
-template <typename T>
-struct BinaryClipMinOp {
-  BinaryClipMinOp(const T& value_min) : value_min_(value_min) {}
-  const T operator()(const T& value, const T& value_max) const {
-    return std::max(std::min(value, value_max), value_min_);
-  }
-  T value_min_;
-};
-
-// Binary functor for clip
-template <typename T>
-struct BinaryClipMaxOp {
-  BinaryClipMaxOp(const T& value_max) : value_max_(value_max) {}
-  const T operator()(const T& value, const T& value_min) const {
-    return std::max(std::min(value, value_max_), value_min);
-  }
-  T value_max_;
-};
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 // Basic coefficient-wise tenary operations.
 // This is the case for example of the clip_by_value.
@@ -76,7 +45,8 @@ class TenaryOp : public OpKernel {
     auto out_flat = out->flat<T>();
     if (in1.shape() == in2.shape()) {
       if (in0.shape() == in1.shape()) {
-        out_flat = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+        functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                            out_flat);
       } else {
         OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
                     errors::InvalidArgument(
@@ -85,7 +55,8 @@ class TenaryOp : public OpKernel {
                         "input shape: ", in0.shape().DebugString(),
                         "clip_value_min shape: ", in1.shape().DebugString(),
                         "clip_value_max shape: ", in2.shape().DebugString()));
-        out_flat = in0_flat.unaryExpr(UnaryClipOp<T>(in1_flat(0), in2_flat(0)));
+        functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                          out_flat);
       }
     } else {
       if (in0.shape() == in1.shape()) {
@@ -96,9 +67,8 @@ class TenaryOp : public OpKernel {
                         "input shape: ", in0.shape().DebugString(),
                         "clip_value_min shape: ", in1.shape().DebugString(),
                         "clip_value_max shape: ", in2.shape().DebugString()));
-        out_flat =
-            in0_flat.binaryExpr(in1_flat, BinaryClipMaxOp<T>(in2_flat(0)));
-
+        functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                               out_flat);
       } else {
         OP_REQUIRES(ctx, (in0.shape() == in2.shape() &&
                           TensorShapeUtils::IsScalar(in1.shape())),
@@ -108,13 +78,103 @@ class TenaryOp : public OpKernel {
                         "input shape: ", in0.shape().DebugString(),
                         "clip_value_min shape: ", in1.shape().DebugString(),
                         "clip_value_max shape: ", in2.shape().DebugString()));
-        out_flat =
-            in0_flat.binaryExpr(in2_flat, BinaryClipMinOp<T>(in1_flat(0)));
+        functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                                out_flat);
       }
     }
   }
 };
 
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipFunc {
+  UnaryClipFunc(const T& value_min, const T& value_max)
+      : value_min_(value_min), value_max_(value_max) {}
+  const T operator()(const T& value) const {
+    return std::max(std::min(value, value_max_), value_min_);
+  }
+  T value_min_;
+  T value_max_;
+};
+template <typename T>
+struct UnaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat = in0_flat.unaryExpr(UnaryClipFunc<T>(in1_flat(0), in2_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipFunc {
+  BinaryRightClipFunc(const T& value_min) : value_min_(value_min) {}
+  const T operator()(const T& value, const T& value_max) const {
+    return std::max(std::min(value, value_max), value_min_);
+  }
+  T value_min_;
+};
+template <typename T>
+struct BinaryRightClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in2_flat, BinaryRightClipFunc<T>(in1_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipFunc {
+  BinaryLeftClipFunc(const T& value_max) : value_max_(value_max) {}
+  const T operator()(const T& value, const T& value_min) const {
+    return std::max(std::min(value, value_max_), value_min);
+  }
+  T value_max_;
+};
+template <typename T>
+struct BinaryLeftClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in1_flat, BinaryLeftClipFunc<T>(in2_flat(0)));
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_CPU(T)                         \
+  template struct UnaryClipOp<CPUDevice, T>;       \
+  template struct BinaryRightClipOp<CPUDevice, T>; \
+  template struct BinaryLeftClipOp<CPUDevice, T>;  \
+  template struct TernaryClipOp<CPUDevice, T>;
+INSTANTIATE_CPU(Eigen::half);
+INSTANTIATE_CPU(float);
+INSTANTIATE_CPU(double);
+INSTANTIATE_CPU(int8);
+INSTANTIATE_CPU(int16);
+INSTANTIATE_CPU(int32);
+INSTANTIATE_CPU(int64);
+INSTANTIATE_CPU(uint8);
+INSTANTIATE_CPU(uint16);
+#undef INSTANTIATE_CPU
+}  // namespace functor
+
 #define REGISTER_CPU_KERNEL(type)                                       \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
@@ -129,11 +189,22 @@ REGISTER_CPU_KERNEL(int32);
 REGISTER_CPU_KERNEL(int64);
 REGISTER_CPU_KERNEL(uint8);
 REGISTER_CPU_KERNEL(uint16);
-
 #undef REGISTER_CPU_KERNEL
 
 #if GOOGLE_CUDA
-// REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);
+
+#define REGISTER_GPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      TenaryOp<GPUDevice, type>);
+REGISTER_GPU_KERNEL(Eigen::half);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int64);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(uint16);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -142,9 +213,12 @@ REGISTER_KERNEL_BUILDER(Name("ClipByValue")
                             .Device(DEVICE_GPU)
                             .HostMemory("t")
                             .HostMemory("clip_value_min")
-                            .HostMemory("clip_value_min")
+                            .HostMemory("clip_value_max")
+                            .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         TenaryOp<CPUDevice, int32>);
+
+#undef REGISTER_GPU_KERNEL
 #endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_clip.h b/tensorflow/core/kernels/cwise_op_clip.h
new file mode 100644
index 0000000000..1a4bf8cf1d
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.h
@@ -0,0 +1,61 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
+#define TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename Device, typename T>
+struct UnaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename Device, typename T>
+struct BinaryRightClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename Device, typename T>
+struct BinaryLeftClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename Device, typename T>
+struct TernaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
new file mode 100644
index 0000000000..5c07847548
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+template <typename T>
+__global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
+                                      const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
+                                            const T *in1, const T *in2,
+                                            T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[i] < in0[i] ? in2[i] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
+                                           const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[i] ? in1[i] : value;
+  }
+}
+
+namespace functor {
+
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    UnaryClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryRightClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryLeftClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_GPU(T)                         \
+  template struct UnaryClipOp<GPUDevice, T>;       \
+  template struct BinaryRightClipOp<GPUDevice, T>; \
+  template struct BinaryLeftClipOp<GPUDevice, T>;  \
+  template struct TernaryClipOp<GPUDevice, T>;
+INSTANTIATE_GPU(Eigen::half);
+INSTANTIATE_GPU(float);
+INSTANTIATE_GPU(double);
+INSTANTIATE_GPU(int8);
+INSTANTIATE_GPU(int16);
+INSTANTIATE_GPU(int32);
+INSTANTIATE_GPU(int64);
+INSTANTIATE_GPU(uint8);
+INSTANTIATE_GPU(uint16);
+#undef INSTANTIATE_GPU
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
-- 
GitLab


From a3553d45b63fba1cd4eb8d1d5b6dd0d565c94879 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 26 Oct 2017 04:38:38 +0000
Subject: [PATCH 0065/1262] Update test cases for tf.clip_by_value

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/clip_ops_test.py      | 105 ++++++++++++++----
 1 file changed, 85 insertions(+), 20 deletions(-)

diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index d47930350e..2d03fb99e4 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
@@ -29,7 +30,7 @@ class ClipTest(test.TestCase):
 
   # ClipByValue test
   def testClipByValue(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
@@ -38,8 +39,72 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
+  # [Tensor, Scalar, Scalar]
+  def testClipByValue0Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = 2
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Scalar]
+  def testClipByValue1Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = constant_op.constant([2, 2, 2, 3, 3, 3], shape=[2, 3],
+                                              dtype=dtype)
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Scalar, Tensor]
+  def testClipByValue2Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[4, 4, 4], [4, 5, 6]]
+        clip_value_min = 4
+        clip_value_max = constant_op.constant([6, 6, 6, 6, 6, 6], shape=[2, 3],
+                                              dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Tensor]
+  def testClipByValue3Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [5, 5, 6]]
+        clip_value_min = constant_op.constant([2, 2, 2, 5, 5, 5], shape=[2, 3],
+                                              dtype=dtype)
+        clip_value_max = constant_op.constant([5, 5, 5, 7, 7, 7], shape=[2, 3],
+                                              dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
   def testClipByValueBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -51,7 +116,7 @@ class ClipTest(test.TestCase):
         tf_ans = ans.eval()
 
   def testClipByValueNonFinite(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([float('NaN'), float('Inf'), -float('Inf')])
       np_ans = [float('NaN'), 4.0, -4.0]
       clip_value = 4.0
@@ -63,7 +128,7 @@ class ClipTest(test.TestCase):
   # ClipByNorm tests
   def testClipByNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
@@ -79,7 +144,7 @@ class ClipTest(test.TestCase):
     self.assertAllClose(np_ans, tf_ans_tensor)
 
   def testClipByNormBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -88,7 +153,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -100,7 +165,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
@@ -112,7 +177,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim0(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
@@ -124,7 +189,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim1(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
@@ -136,7 +201,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClippedWithAxes(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
@@ -149,7 +214,7 @@ class ClipTest(test.TestCase):
   # ClipByGlobalNorm tests
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -170,7 +235,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -191,7 +256,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -214,7 +279,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = ops.IndexedSlices(
           constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@@ -247,7 +312,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -266,7 +331,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([0.0, 0.0])
       # Norm = 0, no changes
@@ -285,7 +350,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -297,7 +362,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClippedTensor(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -309,7 +374,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -321,7 +386,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormZero(self):
     # No norm clipping when average clip_norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Average norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
-- 
GitLab


From a5e9d9a387680b0b1d7d8ed08fc9c07477a7efe7 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 30 Oct 2017 23:42:08 +0000
Subject: [PATCH 0066/1262] Add grad registration for clip_by_value

and address review feedbacks.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/cwise_op_clip.cc      |  2 +-
 .../python/kernel_tests/clip_ops_test.py      | 16 ++++++++++++
 tensorflow/python/ops/clip_ops.py             | 25 +++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index c2980acdd8..f30c49fdf8 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 2d03fb99e4..cb1359be15 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -23,11 +23,27 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
 
 
 class ClipTest(test.TestCase):
 
+  def testClipByValueGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs_1 = clip_ops.clip_by_value(inputs, 0.5, 3.5)
+    min_val = constant_op.constant([0.5, 0.5, 0.5, 0.5], dtype=dtypes.float32)
+    max_val = constant_op.constant([3.5, 3.5, 3.5, 3.5], dtype=dtypes.float32)
+    outputs_2 = clip_ops.clip_by_value(inputs, min_val, max_val)
+    with self.test_session():
+      error_1 = gradient_checker.compute_gradient_error(inputs, [4],
+                                                        outputs_1, [4])
+      self.assertLess(error_1, 1e-4)
+
+      error_2 = gradient_checker.compute_gradient_error(inputs, [4],
+                                                        outputs_2, [4])
+      self.assertLess(error_2, 1e-4)
+
   # ClipByValue test
   def testClipByValue(self):
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index a5baebb3f6..e84cfc6944 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
@@ -64,6 +65,30 @@ def clip_by_value(t, clip_value_min, clip_value_max,
                                        clip_value_max,
                                        name=name)
 
+@ops.RegisterGradient("ClipByValue")
+def _ClipByValueGrad(op, grad):
+  """Returns grad of clip_by_value."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  z = op.inputs[2]
+  gdtype = grad.dtype
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  sz = array_ops.shape(z)
+  gradshape = array_ops.shape(grad)
+  zeros = array_ops.zeros(gradshape, gdtype)
+  xymask = math_ops.less(x, y)
+  xzmask = math_ops.greater(x, z)
+  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
+  rx, rz = gen_array_ops._broadcast_gradient_args(sx, sz)
+  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
+  ygrad = array_ops.where(xymask, grad, zeros)
+  zgrad = array_ops.where(xzmask, grad, zeros)
+  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
+  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
+  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
+  return (gx, gy, gz)
+
 
 @tf_export("clip_by_norm")
 def clip_by_norm(t, clip_norm, axes=None, name=None):
-- 
GitLab


From 71ddf90d3c8c49d4401c0d298bf63b92150dadaa Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 14 Dec 2017 04:06:58 +0000
Subject: [PATCH 0067/1262] Update with `TenaryOp` -> `ClipOp`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/cwise_op_clip.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index f30c49fdf8..bd22f5777c 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -25,9 +25,9 @@ typedef Eigen::GpuDevice GPUDevice;
 //   Device: E.g., CPUDevice, GPUDevice.
 //   Functor: defined above. E.g., functor::clip.
 template <typename Device, typename T>
-class TenaryOp : public OpKernel {
+class ClipOp : public OpKernel {
  public:
-  explicit TenaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit ClipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& in0 = ctx->input(0);
@@ -178,7 +178,7 @@ INSTANTIATE_CPU(uint16);
 #define REGISTER_CPU_KERNEL(type)                                       \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
-      TenaryOp<CPUDevice, type>);
+      ClipOp<CPUDevice, type>);
 
 REGISTER_CPU_KERNEL(Eigen::half);
 REGISTER_CPU_KERNEL(float);
@@ -196,7 +196,7 @@ REGISTER_CPU_KERNEL(uint16);
 #define REGISTER_GPU_KERNEL(type)                                       \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
-      TenaryOp<GPUDevice, type>);
+      ClipOp<GPUDevice, type>);
 REGISTER_GPU_KERNEL(Eigen::half);
 REGISTER_GPU_KERNEL(float);
 REGISTER_GPU_KERNEL(double);
@@ -216,7 +216,7 @@ REGISTER_KERNEL_BUILDER(Name("ClipByValue")
                             .HostMemory("clip_value_max")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
-                        TenaryOp<CPUDevice, int32>);
+                        ClipOp<CPUDevice, int32>);
 
 #undef REGISTER_GPU_KERNEL
 #endif
-- 
GitLab


From d1078b562532e2de60bc16fc544a94823149ae77 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 18 Dec 2017 17:42:37 +0000
Subject: [PATCH 0068/1262] Fix failing test //tensorflow/python:function_test

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/framework/function_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 65ca801cbe..24aaff3748 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -1333,7 +1333,7 @@ class UnrollLSTMTest(test.TestCase):
         value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
     new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
         i_g) * math_ops.tanh(i_i)
-    new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
+    new_c = math_ops.maximum(math_ops.minimum(new_c, 50.0), -50.0)
     new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
     return new_m, new_c
 
-- 
GitLab


From 14e9c14ecdb9e9ddb283c5ec9cf27b3c5dbb900e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 18 Dec 2017 18:58:42 +0000
Subject: [PATCH 0069/1262] Fix api_compatibility_test with `--update_goldens
 True`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../base_api/api_def_ClipByValue.pbtxt        | 36 +++++++++++++++++++
 .../python_api/api_def_ClipByValue.pbtxt      |  4 +++
 2 files changed, 40 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000..803d8970ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ClipByValue"
+  in_arg {
+    name: "t"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "clip_value_min"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The minimum value to clip by.
+END
+  }
+  in_arg {
+    name: "clip_value_max"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The maximum value to clip by.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A clipped `Tensor` with the same shape as input 't'.
+END
+  }
+  summary: "Clips tensor values to a specified min and max."
+  description: <<END
+Given a tensor `t`, this operation returns a tensor of the same type and
+shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+greater than `clip_value_max` are set to `clip_value_max`.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000..cacdd5c2ca
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ClipByValue"
+  visibility: HIDDEN
+}
-- 
GitLab


From 229b20326be0be956f08351ec5881b06e3cc4d88 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 8 Mar 2018 21:05:22 +0000
Subject: [PATCH 0070/1262] Update math_ops.cc

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/math_ops.cc   | 17 +----------------
 tensorflow/python/ops/clip_ops.py |  8 ++++----
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 602a6ec115..4548e59fbf 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1564,22 +1564,7 @@ REGISTER_OP("ClipByValue")
     .Input("clip_value_max: T")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Clips tensor values to a specified min and max.
-
-Given a tensor `t`, this operation returns a tensor of the same type and
-shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-greater than `clip_value_max` are set to `clip_value_max`.
-
-t: A `Tensor`.
-clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-  as `t`. The minimum value to clip by.
-clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-  as `t`. The maximum value to clip by.
-output: A clipped `Tensor` with the same shape as input 't'.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index e84cfc6944..c32726e91a 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -60,10 +60,10 @@ def clip_by_value(t, clip_value_min, clip_value_max,
   """
   with ops.name_scope(name, "clip_by_value",
                       [t, clip_value_min, clip_value_max]) as name:
-    return gen_math_ops._clip_by_value(t,
-                                       clip_value_min,
-                                       clip_value_max,
-                                       name=name)
+    return gen_math_ops.clip_by_value(t,
+                                      clip_value_min,
+                                      clip_value_max,
+                                      name=name)
 
 @ops.RegisterGradient("ClipByValue")
 def _ClipByValueGrad(op, grad):
-- 
GitLab


From ab74c5f30fa89b172f2d45d8877a523d58791352 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 8 Mar 2018 21:17:10 +0000
Subject: [PATCH 0071/1262] Fix python sanity check

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/framework/function_test.py            | 1 -
 tensorflow/python/keras/_impl/keras/integration_test.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 24aaff3748..e995e6f2e1 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_logging_ops
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 280f7ed1b1..53dd56ba9a 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -101,7 +101,7 @@ class KerasIntegrationTest(test.TestCase):
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
                           validation_data=(x_test, y_test),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.80)
 
   def test_image_classification_declarative(self):
     with self.test_session():
-- 
GitLab


From 5324b75ceb5751938d6a7f59c2fc1793933d38ad Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 27 Mar 2018 20:15:46 +0000
Subject: [PATCH 0072/1262] Remove unneeded _ in gen_array_ops

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/clip_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index c32726e91a..0829aa67ed 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -79,8 +79,8 @@ def _ClipByValueGrad(op, grad):
   zeros = array_ops.zeros(gradshape, gdtype)
   xymask = math_ops.less(x, y)
   xzmask = math_ops.greater(x, z)
-  rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  rx, rz = gen_array_ops._broadcast_gradient_args(sx, sz)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
   xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
   ygrad = array_ops.where(xymask, grad, zeros)
   zgrad = array_ops.where(xzmask, grad, zeros)
-- 
GitLab


From 3551d41e36c569a200d3cbaeb1074b4165fb8f0a Mon Sep 17 00:00:00 2001
From: Nick Felt <nfelt@users.noreply.github.com>
Date: Tue, 27 Mar 2018 15:35:37 -0700
Subject: [PATCH 0073/1262] Update tensorboard dependency to 1.7.0+

---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 7a3184d64d..8b83257887 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -39,7 +39,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorboard >= 1.6.0, < 1.7.0',
+    'tensorboard >= 1.7.0, < 1.8.0',
     'termcolor >= 1.1.0',
 ]
 
-- 
GitLab


From 07502453382cc007f42818118a592220a8c7d849 Mon Sep 17 00:00:00 2001
From: "wenhao.hu" <wenhao.hu@leapmind.io>
Date: Wed, 28 Mar 2018 10:25:47 +0900
Subject: [PATCH 0074/1262] clean the pollution of axis

---
 tensorflow/python/ops/linalg_ops.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 608b72c574..86be1e7752 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -543,13 +543,12 @@ def norm(tensor,
     if ord in ['fro', 'euclidean', 2, 2.0]:
       if is_matrix_norm and ord in [2, 2.0]:
         rank = array_ops.rank(tensor)
-        axis = functional_ops.map_fn(
-            lambda i: control_flow_ops.cond(i >= 0, lambda: i,
-                                            lambda: i + rank),
+        positive_axis = functional_ops.map_fn(
+            lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank),
             ops.convert_to_tensor(axis))
         axes = math_ops.range(rank)
         perm_before = array_ops.concat(
-            [array_ops.setdiff1d(axes, axis)[0], axis], axis=0)
+            [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis], axis=0)
         perm_after = functional_ops.map_fn(
             lambda i: math_ops.cast(
                 array_ops.squeeze(
@@ -557,8 +556,11 @@ def norm(tensor,
                 dtype=dtypes.int32), axes)
         permed = array_ops.transpose(tensor, perm=perm_before)
         matrix_2_norm = array_ops.expand_dims(
-            math_ops.reduce_max(gen_linalg_ops.svd(permed, compute_uv=False)[0],
-                                axis=-1, keepdims=True), axis=-1)
+            math_ops.reduce_max(
+                gen_linalg_ops.svd(permed, compute_uv=False)[0],
+                axis=-1,
+                keepdims=True),
+            axis=-1)
         result = array_ops.transpose(matrix_2_norm, perm=perm_after)
       else:
         result = math_ops.sqrt(
-- 
GitLab


From e9ea69058974d9155851c6325362dc3cb188cefb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Wed, 28 Mar 2018 10:22:31 +0800
Subject: [PATCH 0075/1262] CLN: remove no_oss, notsan tags

---
 tensorflow/contrib/opt/BUILD | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index a86d150f7a..aaf0012808 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -53,10 +53,6 @@ py_test(
     name = "adamax_test",
     srcs = ["python/training/adamax_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # b/73507407
-        "notsan",  # b/31055119
-    ],
     deps = [
         ":opt_py",
         "//tensorflow/python:array_ops",
-- 
GitLab


From 3a9d5e51bbb7f205a74cbfe5e6bae953d4fc2149 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Wed, 28 Mar 2018 10:28:21 +0800
Subject: [PATCH 0076/1262] CLN: add comment for variable

---
 tensorflow/contrib/opt/python/training/adamax.py | 2 +-
 tensorflow/core/kernels/training_ops.cc          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 65918831e9..403fdaa637 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -48,7 +48,7 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
     ```
 
     The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section7.1 of the paper:
+    described at the end of section 7.1 of the paper:
 
     ```
     t <- t + 1
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 0387e3011e..45c600fd40 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -343,9 +343,9 @@ struct ApplyAdaMaxNonCuda {
       LOG(WARNING) << "AdaMax doesn't support use_nesterov yet, ignore it.";
     }
     m.device(d) += (grad - m) * (T(1) - beta1());
-    // v == u
+    // Here v is u in section 7.1
     v.device(d) = (beta2() * v).cwiseMax(grad.abs());
-    // var == θ
+    // var is θ  in section 7.1
     var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon()));
   }
 };
-- 
GitLab


From 0a2a35e210d899d81a7e0478eeb49ea478b05bb8 Mon Sep 17 00:00:00 2001
From: hsm207 <hsm207@users.noreply.github.com>
Date: Wed, 28 Mar 2018 12:05:21 -0400
Subject: [PATCH 0077/1262] Fix typo (#17947)

---
 tensorflow/python/eager/execution_callbacks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 535361498a..9a08259653 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -253,7 +253,7 @@ def add_execution_callback(callback):
       `f(op_type, op_name, attrs, inputs, outputs)`.
       `op_type` is the type of the operation that was just executed (e.g.,
         `MatMul`).
-      `op_name` is the name of the operation that has was just executed. This
+      `op_name` is the name of the operation that was just executed. This
         name is set by the client who created the operation and can be `None` if
         it is unset.
       `attrs` contains the attributes of the operation as a `tuple` of
-- 
GitLab


From c3603d0a0d9a8390fd9e9423cd661717d2702bfd Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 28 Mar 2018 10:20:49 -0700
Subject: [PATCH 0078/1262] Trying to fix libtensorflow GPU build. (#18056)

CUDNN path error.
Invalid path to cuDNN 7 toolkit. None of the following files can be found:
C:/tools/cuda\lib/x64/cudnn.lib
C:/tools/cuda\lib/x64/cudnn.lib
---
 tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 7b2d7e1a56..d654b433e7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -120,7 +120,9 @@ function run_configure_for_gpu_build {
   export TF_CUDA_VERSION=9.0
   export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
   export TF_CUDNN_VERSION=7.0
-  export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  if [ -z "$CUDNN_INSTALL_PATH" ]; then
+    export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  fi
   export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
-- 
GitLab


From c15dbc39505de93770fd89cab4f4ae9a2a72b4e1 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Thu, 29 Mar 2018 02:33:24 +0900
Subject: [PATCH 0079/1262] fix test

---
 tensorflow/python/kernel_tests/norm_op_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index d6625b69ef..0e7d4fd9b9 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -37,17 +37,17 @@ class NormOpTest(test_lib.TestCase):
 
   def testBadOrder(self):
     matrix = [[0., 1.], [2., 3.]]
-    for ord_ in "foo", -7, -1.1, 0:
+    for ord_ in "fro", -7, -1.1, 0:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported vector norm"):
-        linalg_ops.norm(matrix, ord="fro")
+        linalg_ops.norm(matrix, ord=ord_)
 
-    for ord_ in "foo", -7, -1.1, 0:
+    for ord_ in "fro", -7, -1.1, 0:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported vector norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=-1)
 
-    for ord_ in 1.1, 2:
+    for ord_ in "foo", -7, -1.1, 1.1:
       with self.assertRaisesRegexp(ValueError,
                                    "'ord' must be a supported matrix norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
-- 
GitLab


From 0aaa61ab332611e9dcfd3d1cc25115a8972bd5fd Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 28 Mar 2018 10:55:00 -0700
Subject: [PATCH 0080/1262] Update version strings to 1.7

---
 tensorflow/core/public/version.h              |  2 +-
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       | 14 ++++++------
 tensorflow/tools/pip_package/setup.py         |  2 +-
 8 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 40eebd1db0..706968d347 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 9059b3f3b6..a3eca4bf37 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 2e47a6d212..1a0956634d 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index eff066d200..cdde45a6f4 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.7.0-rc1</version>
+                 <version>1.7.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.7.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.7.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.7.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 3c5db9bced..fdf9bf81e7 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -199,7 +199,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -304,7 +304,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -490,7 +490,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -657,14 +657,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -676,14 +676,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -695,14 +695,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -714,14 +714,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index fa6951a8f1..6f55e6a650 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -523,7 +523,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
 </pre>
 
 
@@ -531,5 +531,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 0454c172f8..73446663e9 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -359,10 +359,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.7.0rc1 on Linux:
+for TensorFlow 1.7.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -459,8 +459,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.7.0rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
@@ -480,7 +480,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -495,8 +495,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.7.0rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 8b83257887..a486631621 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.7.0-rc1'
+_VERSION = '1.7.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From ab4efde7162445f20c73bdd3419811ab9c324a24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Thu, 29 Mar 2018 06:48:19 +0800
Subject: [PATCH 0081/1262] DOC: explain difference between adamax and adam

---
 tensorflow/contrib/opt/python/training/adamax.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 403fdaa637..ea08a0931b 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -31,7 +31,8 @@ from tensorflow.python.training import training_ops
 class AdaMaxOptimizer(adam.AdamOptimizer):
   """Optimizer that implements the AdaMax algorithm.
 
-  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  Adamax is sometimes superior to adam, specially in models with embeddings,
+  see [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
-- 
GitLab


From 3e51f9ede54bc61a8d4f7797992ab78140467d08 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Wed, 28 Mar 2018 18:59:13 -0700
Subject: [PATCH 0082/1262] Default to disable including the coordinator in the
 job

---
 .../cluster_resolver/python/training/tpu_cluster_resolver.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 300b19733e..95c5c920aa 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -73,7 +73,7 @@ class TPUClusterResolver(ClusterResolver):
                zone=None,
                project=None,
                job_name='worker',
-               coordinator_name='coordinator',
+               coordinator_name=None,
                coordinator_address=None,
                credentials='default',
                service=None):
-- 
GitLab


From aef7d8b3e877924973e3d8d8e6266ba7b8322a66 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Wed, 28 Mar 2018 19:27:36 -0700
Subject: [PATCH 0083/1262] Fix the test

---
 .../python/training/tpu_cluster_resolver_test.py          | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 48c3f6bb4f..e1e3e6867a 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -117,7 +117,8 @@ class TPUClusterResolverTest(test.TestCase):
         zone=None,
         tpu=['test-tpu-1'],
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
@@ -170,6 +171,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -196,6 +198,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -239,7 +242,8 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
-- 
GitLab


From da0b3295c92d38a514123ec92086d807d207d647 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 28 Mar 2018 15:31:19 -0700
Subject: [PATCH 0084/1262] Refresh Community pages to surface new resources,
 SIGs and mailing lists.

PiperOrigin-RevId: 190845545
---
 tensorflow/docs_src/community/contributing.md | 64 +++++++++++++
 tensorflow/docs_src/community/groups.md       | 17 ++++
 tensorflow/docs_src/community/index.md        | 92 ++++++++++++++++---
 tensorflow/docs_src/community/leftnav_files   |  4 +-
 tensorflow/docs_src/community/lists.md        | 35 +++++++
 5 files changed, 198 insertions(+), 14 deletions(-)
 create mode 100644 tensorflow/docs_src/community/contributing.md
 create mode 100644 tensorflow/docs_src/community/groups.md
 create mode 100644 tensorflow/docs_src/community/lists.md

diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
new file mode 100644
index 0000000000..b0960df435
--- /dev/null
+++ b/tensorflow/docs_src/community/contributing.md
@@ -0,0 +1,64 @@
+# Contributing to TensorFlow
+
+TensorFlow is an open-source project, and we welcome your participation
+and contribution. This page describes how to get involved.
+
+## Repositories
+
+The code for TensorFlow is hosted in the [TensorFlow GitHub
+organization](https://github.com/tensorflow). Multiple projects are located
+inside the organization, including:
+
+* [TensorFlow](https://github.com/tensorflow/tensorflow)
+* [Models](https://github.com/tensorflow/models)
+* [TensorBoard](https://github.com/tensorflow/tensorboard)
+* [TensorFlow.js](https://github.com/tensorflow/tfjs)
+* [TensorFlow Serving](https://github.com/tensorflow/serving)
+* [TensorFlow Documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src)
+
+## Contributor checklist
+
+* Before contributing to TensorFlow source code, please review the [contribution
+guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
+
+* Join the
+[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/developers)
+mailing list, to coordinate and discuss with others contributing to TensorFlow.
+
+* For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
+
+* Finally, review @{$documentation$Writing TensorFlow Documentation}, which
+  explains documentation conventions.
+
+You may also wish to review our guide to @{$benchmarks$defining and running benchmarks}.
+
+## Special Interest Groups
+
+To enable focused collaboration on particular areas of TensorFlow, we host
+Special Interest Groups (SIGs). SIGs do their work in public: if you want to
+join and contribute, review the work of the group, and get in touch with the
+relevant SIG leader.
+
+* **SIG Build** focuses on issues surrounding building, packaging, and
+  distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/forum/#!forum/build).
+
+* **SIG TensorBoard** furthers the development and direction of TensorBoard and its plugins.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/forum/#!forum/tensorboard).
+
+* **SIG Rust** collaborates on the development of TensorFlow's Rust bindings.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/forum/#!forum/rust). 
+
+## Projects developed by the TensorFlow community
+
+The TensorFlow community has created many great projects around TensorFlow, including:
+
+* [Machine Learning with TensorFlow (Book & Code)](http://tensorflowbook.com)
+* [@jtoy's awesome "Awesome TensorFlow" list of awesome things](https://github.com/jtoy/awesome-tensorflow)
+* [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
+* [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
+* [Bitfusion's` GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
+* [Operator Vectorization Library](https://github.com/opveclib/opveclib)
+* [Swift language bindings](https://github.com/PerfectlySoft/Perfect-TensorFlow)
+* [Sublime Tensorflow - A plugin for Sublime Text](https://github.com/baptisteArnaud/Sublime-Tensorflow)
+* [GPflow - Gaussian processes in TensorFlow](https://github.com/GPflow/GPflow)
+* [CS 20SI: Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/) - please note, this course was designed with TensorFlow v0.12, so some of the notes may be out of date - but it's still a great resource.
diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md
new file mode 100644
index 0000000000..d92f5775fa
--- /dev/null
+++ b/tensorflow/docs_src/community/groups.md
@@ -0,0 +1,17 @@
+# User Groups
+
+TensorFlow has communities around the world.
+
+## Asia
+
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
+* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
+
+
+## Europe
+
+* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
+* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
+
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index b706d9b204..c08aeb7a97 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -1,15 +1,81 @@
 # Community
 
-This section contains the following documents:
-
-  * @{$welcome$Welcome to the TensorFlow Community}, which explains how
-    you can get involved, where to report issues, and where to join
-    like-minded TensorFlow enthusiasts online.
-  * @{$roadmap$Roadmap}, which summarizes upcoming additions to TensorFlow.
-  * @{$documentation$Writing TensorFlow Documentation}, which explains
-    TensorFlow's documentation conventions.  If you are modifying
-    TensorFlow source code or documentation, please read this guide.
-  * @{$style_guide$TensorFlow Style Guide}, which identifies coding style
-    conventions that TensorFlow developers and users should follow.
-  * @{$community/benchmarks$Benchmarks}, Benchmarks, a guide for defining and
-    running a TensorFlow benchmark.
+Welcome to the TensorFlow community! This page explains where to get help, and
+different ways to be part of the community. We are committed to fostering an
+open and welcoming environment, and request that you review our [code of
+conduct](https://github.com/tensorflow/tensorflow/blob/master/CODE_OF_CONDUCT.md).
+
+## Get Help
+
+### Technical Questions
+
+To ask or answer technical questions about TensorFlow, use [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow). For example,
+ask or search about a particular error message you encountered during
+installation.
+
+### Bugs and Feature Requests
+
+To report bugs or make feature requests, file an issue on GitHub. Please choose
+the appropriate repository for the project. Major repositories include:
+
+  * [TensorFlow](https://github.com/tensorflow/tensorflow/issues)
+  * [TensorBoard](https://github.com/tensorflow/tensorboard/issues)
+  * [TensorFlow models](https://github.com/tensorflow/models/issues)
+  
+### Security
+
+Before using TensorFlow, please take a look at our security model, list of
+recent security announcements, and ways you can report security issues to the
+TensorFlow team at the
+[Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
+
+## Stay Informed
+
+### Announcements Mailing List
+
+All major releases and important announcements are sent to
+[announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
+We recommend that you join this list if you depend on TensorFlow in any way.
+
+### Development Roadmap
+
+The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
+
+### Social Media
+
+For news and updates from around the universe of TensorFlow projects, follow
+[@tensorflow](https://twitter.com/tensorflow) on Twitter.
+
+### YouTube
+
+Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learing
+and AI with TensorFlow. On it we have a number of new shows, including:
+
+- TensorFlow Meets: meet with community contributors to learn and share what they're doing
+- Ask TensorFlow: the team answers the best questions tagged #AskTensorFlow from social media 
+- Coding TensorFlow: short bites with tips for success with TensorFlow
+
+
+## Community Support
+
+### Mailing Lists
+
+For general discussion about TensorFlow development and direction, please join
+the [TensorFlow discuss mailing
+list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
+
+A number of other mailing lists exist, focused on different project areas, which
+can be found at @{$lists$TensorFlow Mailing Lists}.
+
+### User Groups
+
+To meet with like-minded people local to you, check out the many
+@{$groups$TensorFlow user groups} around the world.
+
+
+## Contributing To TensorFlow
+
+We welcome contributions and collaboration on TensorFlow. For more information,
+please read [Contributing to TensorFlow](contributing.md).
+
diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files
index fab35024ad..0bd1f14de9 100644
--- a/tensorflow/docs_src/community/leftnav_files
+++ b/tensorflow/docs_src/community/leftnav_files
@@ -1,6 +1,8 @@
 index.md
-welcome.md
 roadmap.md
+contributing.md
+lists.md
+groups.md
 documentation.md
 style_guide.md
 benchmarks.md
diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md
new file mode 100644
index 0000000000..dc9240030e
--- /dev/null
+++ b/tensorflow/docs_src/community/lists.md
@@ -0,0 +1,35 @@
+# Mailing Lists
+
+As a community, we do much of our collaboration on public mailing lists.
+Please note that if you're looking for help using TensorFlow, [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow) and
+[GitHub issues](https://github.com/tensorflow/tensorflow/issues)
+are the best initial places to look. For more information,
+see [how to get help](/community/#get_help).
+
+## General TensorFlow lists
+
+* [announce](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce) - Low-volume announcements of new releases.
+* [discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) - General community discussion around TensorFlow.
+* [developers](https://groups.google.com/a/tensorflow.org/forum/#!forum/developers) - Discussion for developers contributing to TensorFlow.
+
+## Project-specific lists
+
+These projects inside the TensorFlow GitHub organization have lists dedicated to their communities:
+
+* [tensor2tensor](https://groups.google.com/forum/#!forum/tensor2tensor) - User
+  and peer support for Tensor2Tensor.
+
+## Special Interest Groups
+
+TensorFlow's [Special Interest
+Groups](/community/contributing#special_interest_groups) (SIGs) support
+community collaboration on particular project focuses. Members of these groups
+work together to build and support TensorFlow related projects.
+
+* [build](https://groups.google.com/a/tensorflow.org/forum/#!forum/build) -
+  Supporting SIG Build, for build, distribution and packaging of TensorFlow.
+* [tensorboard](https://groups.google.com/a/tensorflow.org/forum/#!forum/tensorboard) -
+  Supporting SIG TensorBoard, for plugin development and other contribution.
+* [rust](https://groups.google.com/a/tensorflow.org/forum/#!forum/rust) -
+  Supporting SIG Rust, for the Rust language bindings.
-- 
GitLab


From be554c147c1cb5a53e3d891342dcdeeeaa7b16ce Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Thu, 29 Mar 2018 13:28:05 -0700
Subject: [PATCH 0085/1262] Docs: Move TFLite docs into tensorflow.org

PiperOrigin-RevId: 190977057
---
 tensorflow/contrib/lite/README.md             | 237 +-----------------
 .../lite/g3doc/TFLite-Architecture.jpg        | Bin 48710 -> 0 bytes
 tensorflow/docs_src/mobile/leftnav_files      |   1 +
 .../docs_src/mobile/tflite/demo_android.md    | 156 ++++++++++--
 tensorflow/docs_src/mobile/tflite/demo_ios.md |   2 +-
 tensorflow/docs_src/mobile/tflite/devguide.md | 224 +++++++++++++++++
 tensorflow/docs_src/mobile/tflite/index.md    |   4 +-
 7 files changed, 363 insertions(+), 261 deletions(-)
 delete mode 100644 tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg
 create mode 100644 tensorflow/docs_src/mobile/tflite/devguide.md

diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index df8c1c623c..a676b705f1 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -1,235 +1,8 @@
 # TensorFlow Lite
-TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
 
-TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded
+devices. It enables low-latency inference of on-device machine learning models
+with a small binary size and fast performance supporting hardware acceleration.
 
-![image](g3doc/TFLite-Architecture.jpg)
-# Getting Started with an Android Demo App
-
-This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using either a quantized Mobilenet model or a floating point Inception-v3 model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
-
-There are 3 ways to get the demo app to your device
- - Download the prebuilt binary or
- - Use Android Studio to build the application or
- - Download the source code for TensorFlow Lite and the demo and build it using bazel
-
-## Description
-In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
-
-## Downloading the pre-built binary
-The fastest path to trying the demo, is to download the pre-built binary
-[TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
-
-Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
-
-## Building in Android Studio using TensorFlow Lite AAR from JCenter
-The simplest way to compile the demo app, and try out changes to the project code is to use AndroidStudio.
-
- - Install the latest version of Android Studio 3 as specified [here](https://developer.android.com/studio/index.html).
- - Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- - Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
- - Click through installing all the Gradle extensions it requests.
- - Either
-     - Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
-         - unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
-           `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
-     - Or download the floating point Inception-v3 model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
-         - unzip and copy inceptionv3_non_slim_2015.tflite to the assets directory
-         - change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java) from
-         `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`
-         to
-         `classifier = new ImageClassifierFloatInception(getActivity());`
- - Build and run the demo app
-
-## Building TensorFlow Lite and the demo app from source
-
-### Clone the TensorFlow repo
-- git clone
-  [https://github.com/tensorflow/tensorflow](https://github.com/tensorflow/tensorflow)
-
-### Install Bazel
-If bazel is not installed on your system, install it now by following [these directions](https://bazel.build/versions/master/docs/install.html)
-
-NOTE: Bazel does not fully support building Android on Windows yet. Full support for Gradle/CMake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
-
-### Install Android NDK and SDK
-Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and SDK must be installed on your system.
- - Install the latest version of Bazel as per the instructions on the [Bazel website](https://bazel.build/versions/master/docs/install.html)
- - The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The current recommended version is 14b, which can be found [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
- - The Android SDK and build tools may be obtained [here](https://developer.android.com/tools/revisions/build-tools.html), or alternatively as part of [Android Studio](https://developer.android.com/studio/index.html). Build tools API >= 23 is required to build the TF Android demo (though it will run on API >= 21 devices).
- - In the root of the TensorFlow repository update the `WORKSPACE` file with the `api_level` and location of the SDK and NDK. If you installed it with AndroidStudio the SDK path can be found in the SDK manager, and the default NDK path is:`{SDK path}/ndk-bundle.`
-
-```
-android_sdk_repository (
-    name = "androidsdk",
-    api_level = 23,
-    build_tools_version = "23.0.2",
-    path = "/home/xxxx/android-sdk-linux/",
-)
-
-android_ndk_repository(
-    name = "androidndk",
-    path = "/home/xxxx/android-ndk-r10e/",
-    api_level = 19,
-)
-```
-
-Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
-
-### Build the source code
-Run bazel with the following command to build the demo.
-
-Build the demo app:
-
-```
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
-```
-
-### Note
-
-Currently, we only support building the Android demo app within a Python 2
-environment (due to a Bazel bug).
-
-### More about the demo
-The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used (299 * 299 for Inception-v3). The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch. 224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. Both models have 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The model file must be downloaded and bundled within the assets directory of the app.
-
-# iOS Demo App
-
-Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
-
-This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
-
-1.   Run `third_party/tensorflow/contrib/lite/examples/ios/download_models.sh` to download the model files used by the demo app.
-1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
-1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
-1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
-
-# TensorFlow Lite Quick Start
-
-## Step 1. Decide which GraphDef to use
- Depending on the use case, the developer may choose to use one of the popular
- open-sourced models such as InceptionV3 or MobileNets, re-train these models
- with their own custom data set or even build their own custom model.
-
-### Using a pre-trained model
-
-[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) is a family of mobile-first computer vision models for [TensorFlow](https://www.tensorflow.org/) designed to effectively maximize accuracy while being mindful of the restricted resources for an on-device or embedded application. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as [Inception](https://arxiv.org/pdf/1602.07261.pdf), are used. Google provides 16 pre-trained [ImageNet](http://www.image-net.org/challenges/LSVRC/)  classification checkpoints for MobileNets for use in mobile projects of all sizes.
-
-[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model which achieves fairly high accuracy in recognizing general objects with 1000 classes, like "Zebra", "Dalmatian", and "Dishwasher". The model extracts general features from input images using a convolutional neural network and classifies them based on those features with fully-connected and softmax layers.
-
-[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)  is an on-device model which provides one-touch replies for an incoming text message by suggesting contextually relevant messages. The model is built specifically for memory constrained devices such as watches & phones and it has been successfully used to surface [Smart Replies on Android Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html). Note that this model only works on Android as of now.
-
-These pre-trained models can be downloaded from [here](g3doc/models.md).
-
-### Retrain Inception-V3 or MobileNet for a custom data set
-The above pre-trained models have been trained on the ImageNet data set, which consists of 1000 predefined classes. A model will need to be re-trained if these classes are not relevant or useful for a given use case. This technique is called transfer learning, which starts with a model that has been already trained on a problem and will then be retrained on a similar problem. Deep learning from scratch can take days, but transfer learning can be done fairly quickly. In order to do this, a developer will need to generate their custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) codelab walks through this process step-by-step. The retraining code supports retraining for both floating point and quantized inference.
-
-
-### Train a custom model
-A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow's Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
-
-TensorFlow Lite currently supports a subset of TensorFlow operators. Please refer to [this document](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for details of supported operators and their usage. This
-set will continue to expand in future releases of Tensorflow Lite.
-
-
-## Step 2. Model format conversion
-
-The model generated in Step 1 is a standard Tensorflow model. After the completion of Step 1 a user should have a standard .pb or .pbtxt GraphDef file. If the application developer is using a pre-trained model (as defined in Step 1 above), they can download a ready to use, already converted model for use from [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/models.md). Models generated using retraining (aka transfer learning) or custom models will need to be converted using the steps mentioned below.
-
-A prerequisite to converting the model to the Tensorflow Lite format is to freeze the graph.
-
-Since we employ several formats, the following definitions may be useful:
- - GraphDef (.pb) - a protobuf that represents the TensorFlow training and or computation graph. This contains operators, tensors, and variables definitions.
-
- - CheckPoint (.ckpt) - Serialized variables from a TensorFlow graph. Note, this does not contain the graph structure, so alone it cannot typically be interpreted.
-
- - FrozenGraphDef - a subclass of GraphDef that contains no variables. A GraphDef can be converted to a frozen graphdef by taking a checkpoint and a graphdef and converting every variable into a constant with the value looked up in the checkpoint.
-
- - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
-
- - TensorFlow lite model (.tflite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
-
-### Freeze Graph
-To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
-
-The developer should know where the checkpoints folder is present or checkpoints can also be downloaded for a pre-trained model (Example: Here is a link to the [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
-
-Graph freezing can be done using the command below (and modifying the arguments appropriately)
-
-```
-bazel build tensorflow/python/tools:freeze_graph
-
-bazel-bin/tensorflow/python/tools/freeze_graph\
-    --input_graph=/tmp/mobilenet_v1_224.pb \
-    --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
-    --input_binary=true --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
-    --output_node_names=MobileNet/Predictions/Reshape_1
-```
-
-The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
-graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
-
-This frozen Graphdef is now ready to be converted to flatbuffer format (.tflite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
-
-Here is a sample command line to convert the frozen Graphdef to '.tflite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
-(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
-
-```
-bazel build tensorflow/contrib/lite/toco:toco
-
-bazel-bin/tensorflow/contrib/lite/toco/toco \
-  --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
-  --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.tflite --inference_type=FLOAT \
-  --input_type=FLOAT --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
-```
-
-- The input_file argument should point to the frozen GraphDef file that holds the model architecture.
-- The output_file argument should point to where the TensorFlow Lite model file should be generated.
-- The input_type and inference_type arguments should be set to FLOAT, unless converted a [quantized](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/) model.
-- Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
-
-Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
-documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py). A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-out = tf.identity(val, name="out")
-with tf.Session() as sess:
-  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("converteds_model.tflite", "wb").write(tflite_model)
-
-```
-For detailed instructions on how to use the Tensorflow Optimizing Converter, please see [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
-
-You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
-
-If you would like to see a visual description of your TensorFlow Lite model after conversion, you can use tensorflow/contrib/lite/tools/visualize.py by running
-```sh
-bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
-```
-and then visualize the resulting HTML file in a browser.
-
-## Step 3. Use the TensorFlow Lite model for inference in a mobile app
-
-After completion of Step 2 the developer should have a .tflite model.
-
-### For Android
-Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
-
-The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it's a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-
-Note that you'd need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
-
-### For iOS
-Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
-
-## Core ML support
-
-Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+See the documentation: https://www.tensorflow.org/mobile/tflite/
+Documentation edits can be made here: [tensorflow/docs_src/mobile/tflite](../../docs_src/mobile/tflite)
diff --git a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg b/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg
deleted file mode 100644
index bc83946647c6a923a8a0bd3a041b42e4febe6a31..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 48710
zcmex=<NpH&0WUXCHwH!~1_nk3Mh1rew;7xnIM~?O*;qN)+1WWcIk<R4czL+Fc_f8|
z`9)-<<mF_gWMmXn^wbrUbd+UeG|V-13=B<7Oyt!qZ7qy!^o&i6K!z}Ka&q!;^GNXW
zN*F21C>oIr{vTiv<Y2nMe1MrziGfLwky()O{}Bdx1_nk}MlfK20!Aig7FITP4o)ua
z|3?_M3NSD+GBY!=Ftf6<urM$%)-p0NGq4D<3Mm>ovIz$!vMUve7&T5@$f4}C@t|nX
z#SbdRNkvVZTw>x9l2WQ_>Kd9_CZ=ZQ7M51dF0O9w9-dyoA)#U65s^{JDXD4c8JStd
zC8cHM6_r)ZEv;?s9i3g1CQq3<ZTgIvvlcC0vUJ(<6)RV5+Pr1!w(UE1?mBe%$kAiR
zPn<k;>GGAU*RJ2VdF$b$$4{O<d;a3(tB;>PfBE|D`;VW$K>lK6U<UgNL_j=-<}X18
zCPpR}7G@T9kiQt2${84$m<3r_6%E;h90S=C3x$=88aYIqCNA7~kW<+>=!0ld(M2vX
z6_bamA3<IN`;0h`HId~rxW^Fwy2Zf5%*enb$SlZU&+z@1#-au;5PrC)*!03xlhe`J
zJ9S&0XPeYkUoVc)@2!85bkY9)e+DUrm46#o)L%GU{gcangDL+niTrxS=q~|m?BCcH
z{%3IJeg8z`-+?RkFB13uY0barAR&K4s{JpI`Thxue=`UCXIR+w{gdc^SB8Z8ge>_l
z2KW9s^}peD{?{gb|3uhd5yr+*!$*T<G_{OoqJbz|G&f~LHMyPr&%nw1o?%|oE~&oy
zuR`^^>ZdX|KG2@sy;|tI(UwiTr}yrfW3cYnj`{8NpZM~>&;QThBzil5(NzQ$BbS-)
zR=0j!|EDKP{r7){#|~0c0$mwIsD?vIyT4vGS>3((s`yj6wYM7d1LMr)wyro{u(@{E
zgQ>YUjJDZpeLc;+W&ehzSN6M=*#1;^F>D08R+4?;q6RJ}rzTGZz9knpZ27XZ*Z=r3
z|KHB}PcoP+zG_P@5j<L%ue{wgl2QG`*{JoZAJ+fr`4<0v{*&x$MT`d(0+=pGUgY38
zwe#iVp3AE{Kbp6d#`=f<_T9W@qEQ30-Q5}g8EjWDZ(Fn~^04l_=^{ZVpPd$oNdKx3
zuDImK>41ILa^I?UG0IHJwGMgy^8$M**i8xwUzDabaA_=Ik!uNHy69T)>wZuEm%B-i
zE@><<Ja&NTRPlxC3+#oi?#wfF9~WPpA{+z@b|Lm4RtAwkM}<<01x#HU)WM7A|8<%F
zZvNAtkmCOgD<=QF{AUHz{KNZyvYY=t|DRzY*W1;dKg$0zC_MYmQ2C$Xi(>3njF=TY
z^yB(JH(lYs{~126d$l88>OaGYql@bI|1)7+3HCBt!XZ~IM!x^6?j^_NZg=$Gs~lgS
z_~-S`Ss_jAvz(ZerX^J~#+j~&+xl<0>64ju$<O8Pwm)@{-YzF^_(M<n?fm`spKX_U
zV^iO_ww~ctT<V@%HNBtR%8!4st&e{mS@h#kWX`OMTYlGTF8=g!$CqQ;!p>Ltrmwzz
zm-*A`#N(^x@iJ8Tcgs#bT<+5o+91>1d?kGD%k5Ho4|`@Ns=WKmV|LAwq59i;$B)VE
zANF3>JT*0lf5nM)$q&E(Y1ONL-TzgDarML1()amq$KQF?5oFNVeoorrT$R+br&mhr
zMPAKMZ*vPx@;b66G_=-#ZF!tFPmO8ycFS9w$CtSl*)eFD&TKsPnDh3}E{2sWUi>=W
zE~>ZhLfN!qKVm-AzFz;~-?m@pnhP#Jek-ujUHQk32iNXCzU7|h`?AG(;(TlA;vZKd
zo?L#iao4}|*-1>Y|J-c(jQfu@*sS;b9sg|&)A4_H-GcHPmDlEE{gb)2CU<tg&Yi-!
z2a7fdoqiwkNBcj6vhDQW&womN@fWT?vAy;`L(Kj|4F)y5Yh&*2xh8I8GA}B>cgB;(
zIfC<#ZQPStXj7*=H7HZGOrPrr-&FO6`X{#Pe+%-TY3=X*&+sI5`ro7fLKrW_XPVDC
z-d=Jzw@As*dFN@vpPSg17R;($vVGPRvyC1?k@8vF{Mq(Kd;MpaSakj0-u6%HO+P%h
zQ{VKTVcq@54zk~7^X!xQ@a1j2`JCjm$qJJvI2?F>Z^!;^*>>7j&KP<fD00cV9VmWp
zx4cxloz(1n-&fHuXFaU3&tG*X#%q@Efwq7J+s|*A=peTK%iQ$CYSs&joP>KnS{|$A
zFuh$-94obL!gb-dk%>Q3kNeF$TWlUD|1tKBn)K?a({!F^pFLKuwR=fMRNLf5uU;1j
z8y%j)aq|WLj|*&Si`QkvDP3h<JA0SQ+ieyT-k$iH{OeMdob0s{@BD2)|1)SkF6?ri
zy6lI}ijO+h8&7Ss+p*qvXNK6_TQ80&GC%)pxc%c<hQeERODiAca=rJ<Y`UiqyJ?=e
zUiIVs@@ors&7HGNYRb0FCsZaGF<YoS@A|r0vSg=9V#2kh2Y&A9-FbYU_F0=yP+YzL
z{rD#r!y6mph4)^zMR(6$I?XbXF~rc~{k<K3rJj|2)LpgZlc@27rsuNRS91R|Jdqdw
zTO@yK&vMuQ3=?Nv{<r%-Llnc8zd2JcyziRC(Rf1nMRwDN*dv!Jx=bsVq<*^HmXy8s
zy8RR8`u_}Z@Bj2xes8jWlG6Wg=T?S@k4KMOeEmBp-1XD+)+F1kC1r9-*K;4u6bf$@
zQAwSnp}=?3=5JQLET~-g&yX|y*Y)Cu?|+J~sej#dfh}zBI`0K9u1)*+`0XY)cAGZ`
zB<<%GeUsAL`bgu}hdXmy9@t!xpYfY(^Wz8q8Eywm{_dlk|18s_n_t*}*^yth54>|~
zR;eDd=LulVFx8DdbZh<1cPizs$~8H+r?#(H@7t8Jqsel@Q(0%`H`(Xj{ZW6U{nBN7
zkh{)|1L@P}gqJUg_D)}NJNkUTk;S2Hd{qIgw?I7vN-?Zf^$a}!Z;}6Z|4$3J>iPaN
zJdwHo-u_>U_O^BE5AXkJp8fave})UKWmgyiT}4oEIQQfDpAuXAzyD`AwtLmXn(Y4!
z%3_`W{xkfzz_wH*(1DdoSi(;Ccg`-ytsA#b3ZJ=c-@!ThRYyLQx2HF~s-Ej~W!Y7(
zZmqBCE7v{iVrYI-w*T{6{=dIP5C7PGc)C$mcJcCCCY?$(k4%3rssFk$_OA%ziAt-f
z*Bh5?o^>g)T!KSjy$+*<(&`Frul*aYq*t$wObqL1*m2A+d&i;%<yj&-8Xrz9S6&$=
zcWlqI&xfUsb=H5IYv8GqRW!-<d&3u{y;@xipS5S$1g{rqdc--C@tiD^hTZa|WmbAE
zCb>e<H9sexTDnAep~VaBqArHd@q8Jx=C1G1RhxG#<58IE%;}8>*F8QW;9b1no->#6
ziH{FMt0o`K40MnYZ#J3hbUboW^lHz<o<kwV>)494<i%dx>V3Cg<=VAkZ%J3isYO=d
z3okl|Y*`U!D|JmWWc3=YHOp5#+VkrB;akThuaaDw`*dR9cUK+f@;Zz4L5ms;OhWcu
zHSz8!-EqHU$vVlhxjX(dNIiM?M}N(W>-RG*?>QOj7Zl-thJB&@lklY%9pnnW%Nbl*
z{M)eaVQ3pChw|e)A7iIACYC2IS-SVjqfGX^-QmA?zK_}xz&<zYp3oJO*Z&!|zVy1}
zB3+eZZ<Buc-hv%=+SioDCLb=i_<TZuf_?DOs#Foi#)tL%bA_(nO5-gyGk^PHL5b*j
zP&8e7)nA**?JKiDjPt7hrQN?Ym?wRhdu*rbVdoV~PUp%*_#4^oX)-*2X@lv0o~$sb
z#djhOt8AElU;60D;5m<FvoC8b;I}sXu|0BYg__WDhTz>rEqC9(%Dg9ZajUd$rP|KT
zvox9v?wmeaB`U&rpyIg475%N}t*)JmO#XBwFZ9HD)9Wt^PUbI)JgOx!QHgQNSGFx0
z3wYuMUvRFA(g@=$l=I1}s^u*#`?gQxg6!h$Z3Ro7v;@4U{e69Tpo3g>e(IyWao;4<
zCuNuRZfCDih+ovV^#zAya=`Io#*@dRN>MWmdIlnv4{!6V@`B}AhG4YjPN6+=*2Qde
zA+>i*?;_{i22+hiT?|yk>&lNswO*UM#8r36F-^II*7lf$`7dLBc|Ouq4wpK}*j9gD
zv^U39OVg_F*p?L`t3PPZYjv&sR^$4g!Ey4xdajrs-T#`^|NIj@|95HYzvBEAkL;&>
z;nF_;`H}uFp821D1kM+%zmW2uVg8~1w>tWt+piSLpQ>UNeg5;K_+K{vpMNyYckF**
zRDb@V`?qQ0KM%jM=znU<)b;s4!;cmJ86XmWLh8Rr+CTrW__sUvpNC&%+&|50bgldc
zcY-m@s=td{{}ue|Q~v4iuxQVJ20Nwy3=2;FXRvRTgjx3Q;-UWx7Panzf0i!@jH`#c
ztq1B0k^0QU`Z;!sJN^Y<&{%H|bDJC7F8eJ7`_IV-9;w%SAre0S`H^^t%Mez{-?E5*
z&QGOK_yc#=QFmx~qdGwNkpll8KmMP8w9^k(O?|g8w@hE%*rbSINncgimqL@(@i%wh
zIJTB+%0AQN6^pvGixn1iX)sU&uXz5SAyEDI@}D77W&Sg~Qu)tN@}EJAVflym{~XW$
z`~08bh12e=M|Gg;*>eBA`oA3ExtLW<!9LUf3=_Lu{xj77$=<b2{Mi0ah8gqU|7YlB
z$Rako5vh*xRK<UWe-0DsuOIl&a3N}S^nZrqTjJmGKMkv?|IeV{VgLF;{R`dDwf`9&
z-_rli{%Jk4&5wj?OSexdi@LO*^B2IfoVL5)hAq20wtPR${O*90@%(F>SAMKt=l{Jm
z*YRjb%&8aWwxm4x8Qc4NK3`OnfZCO-+&m2@d;V?h%E>Di*!F4DSN6mwRoC8k|7W-!
zUB4si-sTnOm%qH0oflHs*>>uA57*c1#1%WwMo)FSon1XIR>w6}!uqpi`6o+{w|Q#5
zA03rfyj$h5dRJ`g-q&Zg3wcU9P7tl1TN*esci&Nw&7wMmPY;Tps_lH<f2zC0<+^@X
zgR#)$<8@gNrykPN-TmvApv|EoU)6m7;12J|yUy-B>y|taSn_PPj?sbNs=X8UN+j*-
z$V=P&p{Cg3b_$P!#W}m=aFvMneiKa(wWa^;tDd{_U+MI3DXL3S*Y&?Q{dZwb>{D6O
z_`>7T-c$R}WLL7(-_t7e?c!X=DRAhM;PtG@UGG(Pdc0rK<v#Pu`d<tA!sT`z4fxEv
zynX3x-x$jYb6QU_moLAxDl$arpL%Nb_X{(g1y9<p_icN!bl|1L&6jfX!s};iGB%ez
zn00*J>LdOtvqWc~F;q;5?kV1KT{|YS*CKfBY1Y(~vV?TaOM7;2Sk$H6@&-H>tFfp-
zaNo;c!FBTg8CLB7&#<of*X7yo<}WpW_@Cid_kV_8FYMRW^0|Kd=iWX0t?=eWUD_}F
z1z_<@DXnGspW%w<znA|)Ro(4>b=u#xKh>c5$NE3RgyQ-={~3M>th+U6#z0&dissq>
zXPBTO|Mi3Y7qL~>{xf`hEB?FrXXt$S{|qOD_<#M7|HU1eUH{`-_uqqmRv$QTCn0w(
za8IYkYB8rPcN#0sDIT}By6#}oEAsN$?=w3;u?Nqq*9q@;X16-IW9#y=s?v>ti{7f7
zi|e{L*+2U9q+KCJr#a#xj~Uc$W!aVcM4Xdlg3*cRT(<R6=OU_BhpgMZw!2Sh_T8J`
zs;6eGm474a&r^5mzW$CCx?66aK6HpfcH12D94UYPQ0t=oJL>LDU4N!)%Yvy2eT&w_
zM8rm&+UU!?dOMfloWEM-7fmG|evZ2Ot#O`#|5K;MVZU5o@NL!BKAGiMUSC{u>bg+A
zmVpIR_gbfCUzPhR%hx1x8L(-7`xW~3j#7j|x>MoAu4|hvtaY1jdTc>z@1rCcMH$9e
zlNCwX@ujt;maoDN2YILG9p(7rxYCuQH_t+rGr-p%dsf5DrNV#SER1*)=l(Q2dHY4x
zT}{@R>SdK8qJ@n;p98XX<=stvo3l{;DT~(UsB;o44$t)ab=$pI!=LBrsfT&H7EQdO
z*W0n#=-&gGx3_GIuTAsydKZz{xy)Me`1A(GnCi>#N@l-y;Wyo_HP84;klplZnTswL
zWIz2qIPq+;X=jx6x7VMn!mrL-b-yv;M|-egW!mFy=DCt9a=upI7C0o?C-J0BQu6CE
zz3Ek-y@d|ke)RjXa#wEat~;*;4yrlrPI$aFXrAgh^St*i-ffbuH?{MQ$G)0A{o<N(
z<NQ!d<_wvCp+VbA-}332Ni5&47ch72j^}byvko6F`<$6wTyTj~(wl*gL3Xug!bVZ8
zQ$kmlUO8v6$JWZLFwA$(QPI51K701D+&*w_@qWj^)2YR*2bw%$Q*6a$>!u!iam9P~
z)ElouS8mUK<>*=xW!I><Gk^PSdF?be|N29lj?FGCt+XlIV60^+cX)o(tq7})U6Z4~
zyh@dc)$~2~yHw+1)U;VK8()3<c721<>d6ORRD2bc+nOu7Y^9FDx+@X~{%EPT&B^@j
zzHFP8MTgg-<+3F^=iD#(tGS>5Kf{&y{|wh1|7M>3?tf|ahyM(JUH>!uEs?*rcTV@)
zKmEq$x75>JMZz!4Qy{X2LFsK_(=)^xhEHv<R*b9dkc*6MPip7s)i=~@eJDP1Co%cP
zDcL!l*~|O?xy-cN|J$_ar>MfN8O~2S@1L?ixa0As_0qi2H;dJ_huQl0>SrxZ-@NnM
z3u$SG(%#4=cWzHJyY>9zuaL{9{J&M%ZaMk;;nt-I{*TXP_cpzXPQTByR=R3+(p=TA
zXD;bXiaBv^f5|jUd9N>;&%8_?%Y`pIY{OhCvvW&&jGo$mhL)`hKgS$>`uFUek}r$y
zoLX5kKi%j+T3%xPoZ^=;4SRM7SKiN<eD;|6d&@sBllJdya;tck5|%to@w|PM@{A>4
zyYKGb6Lk7^l<B5Fe5YI@^54jL-Z<vTDYN6wa+zY+JylKF--Ue6f79KQx-UM5>(R5F
zvmW^yMb5obqI2`8*R83Kk3O}%vu4II?pf>KYTWy?HoHDPu+}MTyV0%l{}``u_bWT>
zn|1Zuqp%z9zpu=SeZ0Kwo!%0rbe<#a|Mcc&PZBp;qb6wF(I0a6;2+JPXDJ;=HhU=B
z23Ruu6P^F9JIL>>cJ#B%udJsYmwerS;#PCm@4BU_@;4LPa}I85o*aE)%D=n&0~}@?
zn_c|H^x%><bH2XN>k}LgimKmwX3W|1Nz(n}bjv@lG}44xa!t#NRwpEzzq8hsP3?+x
zUJ@@GIy;k{$?fFJu$(-_>7LuKs6KtWIPu40_MhR>wtvrFYMuA+3G-%iQK`E=8zsWh
z1?5&qytQx5mN`B3rP$S7?@~T}S6wsp{pF3f<InR%xgJ|6&RZ<ie_TTT+)iKdf6ik2
z-*>IbOUrQ4HJ;J#qEps?XvQRyJV}|=d_8;|>#Ba7&yEtcEQ#golJHOW3)r{jy}Dui
zz1WwVOAhV&&+v8Shr2qL?mM-AFMA=t__}%i=eIHHxAkn^Tfa5>Ipf&h=k{}zV$Vw^
zO&8Mp7tdFhzqa&0L)PoM@az8=Pye0uHZTA6%BzBf`fa+?eRWh=@&)I}*I!#dy;uEm
z(Az0@a}_4P-ah}@jt>2Gy2VfYJ~m&^4p!MQDYNFLYFhn!v1bd@0;5*!e!1M&TI2)o
z?qcn%KVPl0HNLW5*SlBrpCL{8$;qF!8#Hsy<S&cS@p^YC{LV(RYv*1)z8$(RfpO35
z#>RG~MQxudWKN2_>#Le@w{C0UnMbp{bk~;O5472pn|0m)z?-GBOOMU!&(}%u{IsP-
z_`=t9<wxp2NzeZ9FVyk<x|FG@)iZgzFK(CVU+SuIUi<wtw=!FAnORSdJ{HX=Qu%IW
zu*2Og%Xe+tcIPk4{@DL#2(16l5X=5`sg6OVtfmd)zh(a!{_Wy_9lKL%^ZXM}&)%w8
z4k_3)uoY6|hz+rV?cB936{Wisb4~gs!ucc(=RD|t9bx~WA)0Si>-i_z>laP-d}ng&
z#uNL8jNAS*ENDIdMB_h03)Ag?P2%<w*6r(x+4x50{LT~ghZuAJGdOb3KN0w!frEAX
zUlzT3#n@VrBKHkHm)?0||BwNzZcF7GF<{-yNV<9P>t=)74bgq<<b!C7#rkT?tu$Ww
zF|4qED0IpGh1q|G@0YIoKH~q$6jRIoc^%gd!-)CGdZ8j!KX{mqTrMukxc;A^#oOaQ
z!-CuY8SZ-rAAOLmIk_fk&b#Hi9&{aC_xP~<CszOOw|`zb^r&{r&zK-NYyF6<m{-0t
z<v%psuD{^){F6xiA(*Sf<6Wy{HomEpx%93GhrG<hH+t!K1EL0^IU0-MaLw4`z%u!U
z<^19ah<I7Mmfx}2sV(B+)F(z}&-A;lE}G?;!@GS)psUCcYE2r7B6V~p!KSzzME)W7
zabx_Er@2;ObhVNDxaFZpb6g8}y(k?aCQARR>(~9$^U@-uQsegsd}3d#X|26!vAFuF
zu6>IC878P6zff-&BU(~5z2o7&4b$!_O!R&{$zA5UWt^DR5-nc~>-phR16I0C?sd&|
z+TG7ne?rv$e*W_q+Y%}N4#mrB#O^+r*pXDkaol7>sIuSECvVzk1<uWq*>d&nxkFvc
zm)|%2V^@Cjo{D|R&foXFcE-I)S+cHw@*Ek})4GScG*+ol+V{eze`xi~L_(`?|Bkid
zw|bY_W%HJG_k6E&{~21=yZmRE|K&f!KK~y9Y-dG#|1)sLb^d3tfBB!GX8wo8|Ah7m
z|Es_J@2CAk2WgWq^FOLp>VN(-EdTqd{!sr9uP^>T|1<die)6B8l_BS1#QhIbU(Emf
zZ~pg_{~5UKA8KE;fBx70{p5cljGJAP>kox5sz3j${=WJjP~k_s#olhLw)uw-evi4S
zCcXNI_3!=<>q46NI1b9L-iGRdFtM$b3+}P)*tV}X{goRB|F6>Z=HkuIzQ24|y|XU!
zbXMuSxsu(ndYum+tKFV`^-!rte|>S}FVl1Fd9Fv_zDW!Go_tWT_H0Q!vM=VSalgIt
zyZel>_r1B|pUoH;7}yw=uE|I7x;(OmuO)UhHy3T(kK}t~pEIX*9Yb|>AZm~=;GLx{
zjTG$e$U)v<mVwW122_hh>y8I)zOpz|=XqqPpR{Z*LliYe>0o9Iz0tZFhyM%{WFP|*
zYgU-77HcvyJK-7NoAm?xFkYNoX;49rWo&=6R!iE`GwZ_to!s!BVSdQs-)e{d)h4f=
z65jZA*8Ow;zE#H^n*M41j-`JmC;n%!Tj~DIxBbs=_t>!R>YPi_H%~5iR7^WKaf<Mt
zRrl5xX6IM@s<DdQZM}l)p@XG;Yx%Lcp26R`Hov`IcbsRHo2l*Wr9X2D<8PYDe|{}1
z51~q%4_$i8alN3s$w=vaK-7<_?2cvSGZhUj!}x4%U&+dpzAgVgZS{j&`({<j+y`0y
zYbjXgtEm_7Rn$dSJe;IC>7i=Eg7q@<rdobD+<*7)Hu){Ncb_IrTHLogyuRY^HldR*
zzssyg)f$;0d}qldZ>8uXEi2Q%%4X)(*qiRTz^LUZaz->IuG!S?y!7d_<=^{nC0gz*
z`Q4Wf2^C1-1nv6I@Wbuj@j~IV#lroj?9;cuXF3ozZ9!*tfzW;Dwi4}02ifCK?MiGt
zTC_cAaoE@Ti7g*3>QWZ6ynnIZ`22r{O*$UuZwmA;TmlL5h}XLsr{(eo7w1fn+5Yap
z@BJEUPUrKU*4<O=Uo5+S=6BIwvpO#v&3yCh?$z%ZE5Z-IJ}<I%dGSn$W@vl`?TYwg
zS#bS))+Z&SWnV2$B)$#k<$L_w%=^*-7L}aiD$G6C;?7Io{rha+b<U^XB_IJfr9S0G
z{rOd4&z3H=n>As}q}3*itsS0(O`Dy~GEZ{%%!j>8f^H-&nE2zOpT$hSkUO_B<I6P9
zwInQCG%N46OglI{kAsuLucg^9yP|`)2Z&61+$Je{S#NTF`P%c}=BqyqVDUUwA#?TN
z%17CUXBk)DE(~Ak*0bipwoexJZ)WrV{OV=DDaZc#RqbVV^3}aw8M#mUPx*O2Z~gY`
zO8G63Y18&+c2&GnlrdJ%>I^=%<Y{^8GmEQ-woRRSf8x3S3}D9xLHrPUOMCij)BSV*
zJ`dlp^v~*d8gtbSv;P&f3f<rS$FYvx-=JREFMn6vl>6=f8D8oCXL!Bf-`00ewyN9J
zrPuZ=n=Y4F@oBZ}T#>-IkB_M}ugeM>&Rha7OafT#kymqt3!@DFW6V-%BQ58Oy^1{c
zpTQhNt))j$bNGUf|7ZBb()@29`@g2EQTPAZF8a?<P_MQ5@qdP&OpgB<;+g+5IA*QQ
z|IZ+Iv3`gB)C$Sxhna7lGM}g_5*Tk$-FjE?nr+z4xlJqTJS~{Elv<w%KJxL->Q3dL
zB~vZeNNKN|rxmdL#PsMnR}D^;THbz{yl2we)R}p??{bb<JiU|J{yK8mH_eh-|C*1I
zvbPyz@?Kc5zl>Qcdei1w&~MxS48oHvllpI2RlG}(liucBwBoha<mPEH<}qOlCaFuW
zd#Cek`|TrhrFMk&KCy63vpa0kB2eIFc;h0^xx(<m7t`+upMI>qWY?Wn4<=m>ytwT0
z!yOTKtaCnF_dE%HvTN(cZI|EJU6^F$v2=!<2=lBb87m@Zdq2GXMf<mH{G-do3$CtH
zntbHjr8TQAHp@krIXu6%^ds9woA49c!s~Z`I#wp&R=F<w<DyN|>~ekG{nCpg9xRws
zx%Fw7m9O0=<A-k@e2pqEiCH8W9T!b|Wnp^T+r8_ZMNfQf#Do6NT~Aj03Y&N)s;T91
zY5s}M1aadzv2)LDdmYhG(R9I8X-fu|b>E!dSC3y^D<15v@KHLwx7*Ck$p1gXx*b_s
zk7tMYzTCb450hM^?~?VGf2D2jcz5k}XufW<yHslspKHd8^(?bkzO647@q6QN??AYr
zYw_h{7T2amYJZkK&oNij)3*18$%&HRr>mP++&&@ZIAM*w-PVP&cX$2R=E-w%<0CE!
zbIC_LrL;YguUcyrY}$Kb9#`<SqJ7h>wK*Sb*t2`<v`4y^&1dw^S#R6xyy}i8-&7O-
z!qm@mI35L``19+!x@*X0^EY=ya{n{1v`*J6waY)Cv8e0Qc@MYW%U}BbsQ<SJ(#gii
zm0v8yZ~v6j%irR6u1jP6LK_EIl}srO&xONw0?rnB0<J^@GVh&TXxq0vV5gn@hOJ%a
zO!)pYRNhE8b$a)Xw={QApV0I(4r$zt@55C0T;94{BIw<N+vmk=N>@C&vgNE@bm*oI
zkBjE#HDZJeyS}eF>3R2TwRx`ny5*gIdiJTmU2EO{GaRx_NH<IRx$ow7>jJ}n^-@ij
z4}bLJy&UUzQ9@>=LgDeM{omFqWTl6Sw``368u8rXXt;0Mt!Q2s>y}C7s=_i88`wXs
znyRA`dn-0~rPealwbCzK?OM~eeTY3+svPa{FtXzbQ{k-OhfDumkB&F|(=7M*?<?i{
z-2V(5n{Ia6*GXF&)bE{jF<#&{uY|DruY~Rd3Cr{T|Mohy@dfSn%$Wa&>8VagS?;Ur
ze}tyU$!#y4RhcK7JF#me_p}_D^v{dp`IPQOttmE+_MafpBX}&URz7mORK(_K#kpF~
zb7q`)?Oz*qQ~%B3EM6mJv58#_>@0lE_tpj-+*O#Vo8!E)yXRK!M7!Rlk#~>FY(JBF
z&(4SW3ro_HYhQL(_qyHVP7*4Q3(ynKw>M463NJXl-t>y^KHXP0zaG<0`y8?%*6DUo
zSH0oXnr%INh38ETrb!jfk@AeKJ?drp=%MwtH~&7VeK@;1zfF53?<t-M;ZOg?Ut6^8
z!{5!Bk6yicy6~XluY`n0Tb5m2>uvISwSkjGXF!VlsZH-ntt<Cf<(}ESPV&28-+r^Y
zsiBz<9+pii^S{s$a@6bO_Wj9150(3#xpaO%#wWVh)!kL}^QZFMd2d1|Il77jNq~m3
zdeH{=(MSKYUli$Y|6~($|IOUFB7yNv6^-OBBt_Q;Tc))V-t!D$^haKzb@d49v>;?7
z334YMd6CxjQskx43|AIWX&xVH4)c_cKh&LD8V~G{{Lip2YUhhR?r9U1B%YsI!OmrW
zY_Cp(8wX2JW#-?be^$>HZdu;x8S<he;K;ewpH}k<|1)svPXF_H{=c8{kKO;MY>WSV
zuKxFv`ra!0Y4h*IpX%4O-?4vc;eUqYhr8|fy!>~0{a@vC_77Dz{b!h0{GZ{z@}G?V
z3@y`6|EoOzpP{Dz<9hc`mw$Kv41Bu$yZEQYzvX#v{(bm&dHgT;IrWFUQ~xvAJpa$I
z&;7?mWNYL4A*xUNztex}{>@H$^Y6pI{q}$9w?Hv^@jszx{h!b6|9-MRwpSja7~;k^
zHRkEx+rQ1Ne<??tf8=lc$UFa@|E<-3hD$b&k<9J?;hD4l`JDd@-%tE!kb3tu^6a#^
zx<@=_Uza%b@!HP%AD-*$pHGqhezf@7fd?zCJ&m{X21|GJo9HGRV;C|8YRKjP3@x+1
z|EqNT&rlQYx#-T;e{(jUnfPAE{nVa9uW8(@VPRe6Jgz=@ax$N{_9<f+19QuldT>n0
zMb#J9YrXt;{BT))<xw(2w*J6pRF9%WX!gZFTmC)z7XnT&E5J5Y)MsA*H}Sdrhe<c<
z&s+Xyct7!<2}*okU;Go4yjDOW|8IZWn|~k3Pb;(M3*8iOpAL@ct^edxZpWMP@ieo4
zSa%#8>s}0j4Gh1+!=%~0dF#`!JYV0nGVab_)<s>~Yo5W@5Kv6RR(}ol6`S~a2vN_{
zJL;cs7JiXG6V4j@zQZ2Ij#Eqi<5KJ9y;S^_=$(q>e-11E{jRrq_<F~O<9`~1|9<;d
zvgtp=qd#5e!hhSJxv{8A`}WRH3tdHm8K4CYP3WQ4Cc-*p9Gd-F<New*L1Mvwbe6OF
zMlPB%Y0_pbqr&p&dDSZ;@80l9+L4xOU%L4)*W$Rz<(a&1cRO3<Isa^arQLqDq;t#3
zeK~t{>O2|#^6#yfu}5d+zKmD>D^GvsxE3?%QvA%hVQ+HRtxpq(e)s+0f%sy-%Oxkj
zYJPJuXnw!*#iHG_r8!J~7m8^A`z*Jn_Eo`-$dH=zE@6rNtsI|pQ;hvCFTVWXhQvw5
z-{<c7#xCmmt}H5=QnTdQbG<Z?K-WM;^cs+w{2^AG&i$y~@%&<*?t<;xBIhOepYiyk
zZTwH@ZvMZ9mp^Bv_trl#U;4NHTVj~@$uotY51rD|`fe-3$7Rv6U45hYAJtv=|1w_w
zY*K$HdhLIP3$pgcK^No7zdenA8sB;M?_}@YO8*&JwnzVGSYTC`*8ju1bpJ2r<<GLB
zH2&RL{b%(@{x`E?Z(=hgJm=lM<3*b*J=GY0#-xOY^S&v{FnT4Qw4HC}&$5Ebb$85C
ztzPb4a&5QqWWmN>u{E>(FFgFu@KE>Ke})%)^JgxGd*H>Q&9#;DJ~{Hv-IJWDW$}1c
z&ObSgYwvkaon<*E;<0zL^RY*_p0x-&Y}k?VGhJ=w`Tq=XpSZ((XL_ankiPh>DE{s1
z=WZ`rcCYaL*|4o2exJ~jdz9Ly5wY6Lcux7^*A-VTJ&CW-vEyC+c{4NI{=R(StnUI_
zf?iDAWpI$)^upZ?BX=F9TPbtZEy84y?_8byaK>p9!~1U6y>=Qtntt7I)2?qB_2Gw(
zo%NkFcYDXzYzr|76OXIx3vI64|1j-!{TFusGuIuHA3oc)by8-!z>GVq*<=l`&$@A0
z=h3#y?_4-VZ!ezPvch_Y=vp2(tuQ|=t^3zn!k4y(XZiU&Jl<`3sK6ve{Z(f4(W+N-
z%z765lIeOKSADrSYr0g5*{jP7?kz}PYaV*($yL+ueMW*o29f<)sy*vZS7`6Poz~eW
zH{EpEo3;0ztN&-<%-jB-!J*{OS$VjlobB7Ezq#a_UN)Uc=RHG_p~pSBe5;Gevx{PQ
z-W|XD*mUx-<orCBtPMB(pQqbnHNk10O#Rkam(RH#A_leltCcpKknUh#njCI&<kF7^
z6W8jn$*ujVf;F`SU5xn8kQe$d^vA6K3@Ml1o!E~hB~0tG->zM+^~d?Q+VnC<^&Zn1
zi_>e>OhYAACLb-iCAaL>zNw;*k140Ah_Wj;cRelD|H*czen)ox!TL|Sj{iQ_TYdfX
zCuL(m(WI^Y=F)9_a-ObtUT7@p`ogC`t!iSZRSArTe_a3PTor%)SJ%eSKiU5o6!+@C
z{_7gCe%5*WPI>3Mr|dXcMFQgl->`}V`sj&XxPABb`j)5LGbi{Ix2VtCdcGws=+u(N
zD~*>XZC@A_l6UED#G$Pk*EZ#+7cc+tZ8=L-&9q5A8%|FOoKsL7eCL&u+P1FfA1Z%e
zP3Ks^z<93s)=nwo*n%_9rLx3sJewEGd8K&Y^MLEu_J5eBdtjcu`srhTKF$f*c71~M
z>dkD6HlDa7v|5eJeE(0=6OX&%wmhEc$M<J%^zW0?;yJa8OEzAQeHB%^_V1DdM?N!m
z&Qm<b@t;Aesy_Ypt4Xi!=l4#%-f{4`{gZH6_p16^Mf)~HzkG7|x0Tr?|IJp(E2DiE
z#y2aine1_9g}=q4nM>mj=Rf>C<KKmi=R7sc6L*!ZE1It&bcAt|dB9zBp0vfkofc2u
zvfaD>p#H0nE0y}Yca~ib*fBj|n|Zk2#U-nbSAVd49m{b|XL8gW$;qc?nKC&PF-oqp
zUS3t%7TV*LoY1~Fx&Pd<HmkK3H?_ZBDcw-=%KF}2zIp5$FMRpERJ_yr^xUrbeww$^
zwz61>zgP9&;<Pl-^U1Gpb=ORpQ-$TrrB&9X>##^|dAjyp>GY~ALO&|IPR-An%4F1h
ze&IU5`A<D=JM{Lw6Yty7FMZ^8iqjfvgQLZdHi{bZ`eo<x^_X%5?CQRgD-sx!9uVj%
z!WbU(pJ77o`M<@cTOaKIDYoE0L*@RtNw4BR@$UT3ur6)e9`XMSN>}9Hn}1%v?8JYD
zR{h`uXZLr?J1<z&rG3X{!f3uBkZ*)<{AW1fbpP_%_~%jov_pN}6c#W(<(nTI@vU#;
zJ*RgCtCc3zvSly6$~SRU%#yP!I=(Ht!+N%=B13CN<U!Nwwdd^x^?Y1XE^nR5*3(xa
z{Y<FUfzRzgb?EH3_P)MM=8+=xWdVD39`{<GTb!7_^p)7kGWiO-NabliGtAAx>!iEx
zuA8R0@Oh@pB<t`R+eu5`+@1aOU7`QnE5*H+_U9`qez>E_<M!#rQJwvzqQ%XTQq$(E
z%`QtQ<2@GIUKs0fZS8yAMJfBE-Q)MIYk2;7U2)Xz=-r=ATFw-8YSUsZPF|U~(ABDX
zu7uT&&d{yCeT&`NBuzMi0(MLgeSY6`=7N{=+!mLr9+1vG={T=t-hYPapO?9>a^!Dc
z&}G?vE4koy^@p3QtJ-#E>Xf}??(tOGa;Jp(+U}hn-X03d`sn?<xcAtm;-0WcIcaN`
zZd<%=iQz;k9mlyFFHa7YKm6^pqj5ys&r41PCtTxm%TC$m?3iF9<RY*`zND%C*rKNA
zeixH>PLtkPsB*tleEZL{sa6%4r-G)=eP3JlHtAuoAjgW;U$)KuzSHZzQ)7<uuKl||
z*&P1VVEkpRZrQ>2%PzlNq&(H$%;Lat)A+Ymb*mTMK3G_$`=nWtrHUuyyVcz(=GTH;
z)n`g6D^IXwcs&2otTri=J0^iOdrOmw*K2h}eb)Y^wD?CFo6zz6invFHlh*7I`F(Nw
z!D|nC1P;ibx%zUe2jiratJ6;>PVG%*dA4bN{@Z6C?ig@7&M}KDJRj_}rQpj$`}EY3
zUqAb%tiCd(^6O^j8jXcpEtwhIr}*0kMor%?b;PbR+QO#wj<Za9{`#n6AD%uDi)ubu
zsNiBaaq{blq5ApDr@eceduFY$b*}U|!}(s%mUg!4sD$2q_svS{;nQGYU6W0BUd&Y$
z%sBdlXPN5(?!K$h0hg9s`QvJmbx<(W+swF6^HNq>8m|WDb%R|q?QcF>(jT*>J$A>0
zlQU97nH;PHV>gv(=|!IQUVq|A*IM?otZU~#3*2=<QLHE?@~&jf`h9D5tdV-Hzg4C0
zRPC-+XU-XPOkDpYd-8g9gH{dSYWw+L%c?3Qug;0zQh)u9NT6%w^TN?$NCa!|3^b^S
z2a{5L`#q3Yyj3KKVewMYa*yrNYoqRo|6aPO_T$-1FTN#L#Dlavo%H+{id-`n`}Xm?
zmWZd_yQ-@NOZyq-FIa0Y5Mz35z0styqn|Xs9L@aU6TSKDr;YO^EnjZi^{}-y#(iV+
zruc=YJ_;>w|F$VS-8Mhl_Rb*-<xJ@dWvh!bs(SbM>P&kY=;nFC;&1lSi?jVA*GFAU
z4U69w{qbd>Yr%9Wu_YI6)=H@Gn|9i=mO1M#ee+&3_SKh9mZ|-#&u1Jv?fd6kP_%bP
zlIN1vYv#Y#xJUIgEoVvieLY^qVCk|+*M+=Z#1`y4Z}DjM+MCh$Uw!*_a;JLgX)_DX
z^RFXAPD}MZ_?`JGljrcgE!S&n`P|m{CKl%BiyoOcUn_8G;)!*0eiR*YR^6p~$YZfm
z*zYA*W2MY?Y`S7{@|;Vh_M7^@NlQ~Nbw8aS@;2}Ff_rvGF&AF$Uq10{LZIhc@3`bJ
zTh(<7A^pxE&<F$5x~2aa6mHM|^(;FZ-sU`dW&acD;{OaWXLBp`|GBQPznA|!-gn}E
z2Clf}jkD{8?Aje&MZ$MfQmsQf#5%9-AJ_kB`?7xhSJ9-^;Kpb;s4*IEIu8^yd#BV0
zacL~tqkJ7cKDKjd%-VU&S>Duz=GC=#EqBv;dG?T(_d?^QIWh6U)j`{~Z@<3h_13g4
zGv4!TIjbME>GIdiyDm*PuV+m@DIaxm(zD7$kGQam%Tt3YL`6&Yr0e=sWv~5u|DVK@
z2=B{JR{V<o_1Q|=*57}_u04iZ){2}mo3sZsylS;Jc~!LIqPw^Dy*CO|EwFf@E!Pzv
zxALOi{*Hf3{n!4gV|)8-bFS{Rc@x|9^G==NIA9@~^gU|B?;NStofDVGPo2=iyXW85
zv%#@f*<zdeg9@W`4(+Z|t+gsseUPy__HEtXbK7$BeakZcU9R5I|JCaV&lRa)X<4)H
zfwL7(A4%-jjVYP6wI{ynDgUNF_fIXFe>Ct^r&#*TfJ6~K*3!$XJ6HX@Bm3s<t6y_A
zFLjzz8S<riR!CS-mekbaf2uQuw^fM+{}Z)}tXTB;-HoFhU(@6t$9y<d7<4z{#kAD+
zzRo*)UBlvee@u$sUz;z!&i>b-rSWW8LG|md-KbWY|8ec(iPx%6O6<8c-{qvHIpdwW
zl7|l*?!PXLZu`amZ?8=Sv!DHIy?3`?oU&h<wMqS2ip|}UPR3cPua)>$bLHR4{m<}x
zPyP2Bt5-kJm95%)xi&@qLdg6N%^CYYpUeOKBkP*0{PRB?|E|}cO3E@9*fzI-<>I7(
z2W+DI5v@^^3T0RMPsf7)GvxZ_+_-W3<+je;oYth62g0+yY=|&z?d~eyd9drKjmwl-
zTf0pHUCZV`^Caxh5%TGwl$93sJ>y(b@px+F<@Xx4T2Y?MH#`*=Pb<2oeNq3b_G<6h
zqV5@tr#2_gX^wcaPo?>9-Xxi-<L5uA?PcpawoPhk{v+*KmyU!i{a`IuaBJ%w`;Q-&
z#w=fX@IM1r){nmHi;o-0m&!kmo9t7ucuJDT#pA!7O^@p==GANWI<t{`<@3{zcy0GT
zUCh5~*3{Cuw!321KUz}n_2$0U<t4wrnLYn^v#ars_2T{aF0cL1z;gW4nt7hii_=z{
zt=aDJ^gqK^9sfUjUH>!G2K9QK__C|}-!IV%^EqGSR9?RPp-leTj2gv-@;?iiUn<x4
zx_^_^f0kOGedS+B!d$;2r69#3msj23|5?@l!Sq5uM~3si7{*^G{xdud(r^9Z@^ZOL
z-A?;67dBbNpE>+HZuzIhQ)`R;<A3Jwez9zUozj+b%6*yqIr{O}cCYx7Yb5`*OziZ3
zhMc1@9*=vHOhV<l45QrD4zInPR&+Ld@0!Y2@mb;WAKxy%V+P4jH!9{mU%JHX!S^!x
zN7;K9?Mj^XaPc=D)xIO%Z)HO21h1(-dmkPmf44*UpJ<*u<I4waJ9qA!yY~e1pZcg*
zE7q;JxSYRLrTEm6r@IXweAw2V-@DClr|gr{^;VbKuHCznB%$0>zhObhm9Q4<1Lh&Y
zv5t8sHl0`z^U%@OSM5@Ga$r%J%a%Y_L+t8FJaS>sFTY`!z~=Aens7?eL!qb1`bv2J
z)0;m38SXmOdp+4wFJRh|%X=VG=~%FA$ggnQ7?(_6w^+YZ5|-DOJ>_dpR$0lDtI{Vb
zKIzB@?MYYvxP5-tbY@ET!n(I#zN~rc9lqnrlAq^pwBPM1k$)Wd_1UAsb#-sdY}fFG
zN|nvcjn6ysZTjTMJ8M2{laZ_a+gm;>Z>iYQ9F_id8LM)(J)gFgXyniOnX*}etFA16
zcfWJwgO5TjKg)e?8y&p7+JEkD(<|>@Eb98+KN()3A<w}QU~ApKe|Y(A$)$Fu_TIS8
zT3c)S_q4C(>x<PIU)I_S9<I^n+0I;ItE~Oq;Xi}a;r0B|aYfVBPRfT)eW-bP`CIe+
zm3x;|bVd7axbw`TitXM0$FU{G?uA09h1=a;IIcHdy{m3z;j+`a)J_S1R(z59aO0)+
z-L?T6cgAGhEBP|<MM~qfiIK6+nk;gTqQ_s|Q#PM=ns?Kdti#`1zWxgLTw?cE?Z)$e
zk&SUvZG5zRkJ<8b?c-W1<+apTTJNH(=%vC|N)G%O^r?|c>h#Yl*VFYITr0np+o{yf
zGTHxDz&`5X>Z51k-wWBzR}EWqe@38d_K*5EYJYTZH68yO_kQ>Ms)x(@DmCsMUm3N0
z)3WE;H>P`j__Dn<Z6}+BRpZ_Lhjw!)blno|65;5+z%{As7nejuVBEXsCz3X9H8lFZ
z?TtvFYl%!Er1?ow700a7tJb@CqO)=-OSt;#irfcxZ71l92g{fKs=6NUni$Q`b?VZc
zSBD<GN_<*jYb_Uld9CQ)N5M&YO$Qdl@PBQss{i$xZC~9QOSQ+<H-2-8)~+?Rj(Qk-
zzdgRLn}4J7*L6a>?YVa5YEJ*XN$_v|+OpL5-<Qvwv*g2n2II0WjrBP{6DS!q8WJO{
zJ*yt`ENovWxgzS=#eaX-=zi^yn%Cv0`K~K?N?Do4w%+CT(H8p|yIS{HFEc!U^Mu_w
z_pfWcv}}9LzXh&3axL2P?_;GNSE&U$PCZP=)#kg3F1m1Z+ua+1t|A=Jbx9bj9r1~4
z%9byTnK<vkgG(13)RT28s(pDP4n)jbI%odtV7=bu>fRRpSBw9=`YJkWv2X2^oR6mS
zZ`D`E=Hv>$i#o91<G{+RqHLm0{SJoSu0dVV%I22dWfV`q(W_y#FYia=imdPTLhDO@
zm|AyCuUe9&ni?u0+Zyoiz(@VR>`VVM9O%?we#NEz^gqKJexd&i7pmg_a7^k)VmH_=
zShO!5Sv@j)RPV3|E5H4-<1gsvZhvjA_oiIa^L~$f)vsm8+-Cc4n|LnvQCRXT4-v*B
z=xSf$4=!A}%HzXJ^G(N&#wwHrSqm)N7k*jg`q!J!KYg7&CB-u7JO8=5DU4s|ihlaf
z@QuH-{)MgnKNe5-Z|pFpgM2_>y#9ZNgo^3^nlJrlc;GUn9*NEPl@+WNu6|VOPz|Q3
z%Rk%>y83pX^6kveufl~DbtY+homt!XB7^w__&hgytnQIFt=Rp~TlV_j^7y|j;jS_7
zyZ1lYc(?v~>3@a`t|cLO1}N<`Qs#yBN!;?3ZdhgH|ImSF(UMu5YnK}O3m;W@wQh<0
zi7O=xfv&a>rztG+xXaA5?y0f~!zr!oYVYptnz<~S4J4IJbR7O@sRgYGZDf%VsFq^D
zw>{BfzI6Qwx8r{=|2bM~zwAH53BAkzcBubr(tgH|T|$HDDv7O6k*)K8GT#5saDB`C
zU!3b_+Hn78kT>7&`fux(dz=3=C|Lhz`1Nh_e+Ea>&+3i!kDq;ivHx0wbrjCl`|^`K
z8<WrUURG8vnmEycXVDzxqSixZ7ym>&n(e?U@~ro&vynmYg^353UBEb^7QKBJ^Piz{
z|C0@l{~2QCe}+4va0A#cl?1vn&}rwWUDn6)Khvz&|9ZFo7stA;o%hA-PbTmB&#><G
ze})BJFEDqbuHNM~@7J=mxv3H-ryiQ0cyDER>I3P$>fXC{UNMzk@pz(ej>gTC>MLJ5
z81J~gWw&nd;qO(p3*xs#x-E5(X=0Z@wZoqGhtu`Qi|f|ETYv9rLXz|o76x{e>yAtR
zxNn*rp<TG__AOUKCGVw{N!IConW<N|NZSg>Z`pb!v3_e_ML_zb!^{1n7>-{3!=EcQ
zm2L6j`&*v`omklxVB~qs{c-dz`{pfYxuY+i`E{&R=e<Dg!@aHPkw3cqHh+*^yZx5n
zdZoE42E~(@oBym%di0;+u=%g2{T+VWCYpG$PdNQq__powr)~!vSN`zg7y2<jbXxYY
z?Vp$uT3KY8uYX!IqlR_)y*Q6=vrgH~<w}cg2;G<WAi42q-r+~JqNTEB&MRDmN*o2B
zKRYyUPPp}hx#fP_FXxxk`sHsvUi>}Pmd%aBZc0Pi@&~6wF8QrqlvSU*y?0;1DI;5t
zwflu%Ouu#J7H|KpiGNO<7W>rnhJj}uziCvhzsrZ~t?8F?OhvD#?@01q?s|@`){1Ly
z<)go9_d_l`?|PZmr!mX2i^->wA#H|ujsAknD7B5Hvh!Bt<xP^^>~tWtO6Fr+@T&R~
zQJ??qw*S|XeMM*8ALFI3ukYIQdZWkn?js2)`hJgJzKzRNXnG%C`&ldcbV0G^3z6+n
zR{aZK++*3i^<s^csq}|c*Eg+FOuR9N@yD#^ruAIw|EQl{Ha%nQy8Bf|JrCseug%Yn
zv$|CBWy{758@F9~`^d5BuRtH05x;4fZMNq}*?;YoG9Qnxa94}sD`J#TUVh>7E8U5`
z{~5TxyJz1ldGYnQRYUA+LB|8%WnV3u{Xjdj<o4>d?t$6P*~adQ99#{C=c=v;mYL{k
zAN?+M_1?P9f{Aa<judf9ocQ`^_VSph*ZYL7KHo9>&9>{tb}7o%;tWi#9y0kId*t(%
zy%fy4mHKh_rX)rk24$P{&+FXwG9Q_>ZsPJER<g`~d!ui1vM%v!NLTsK@OAb17YvKK
zG#ij6X&8_Exc$$`%>Lc{XWB3Sn*VbwoB#Eo{TIP_g9@zT4q_`7k#@-5(l!4XPVoI_
z__ue-e})UbH76g$|M<E0m;TrF%f4bSx3)0ex`=i{C(Ok!K0njr<8!ULw{o#;@R2Ir
zt$$A2aC3VExNo;OE}MN);E}8PmGGsn;|w-zyKd5bTf22<pT?QumRF3&*V;9y?*6zs
z^2eHtixIbbZB+$~-ah$u;_daVI<xFLm;P~TIXB@r%ZHy6l^<xGo_}iLtow(=XFa%n
zZcB$bXWX40q5J1Q91Ym<(XV}V#m1#q-m=Up=y4U^VXw*lUhw@aXRF=6_a>ct^!n{B
zAM;<1!5g9(&av;X%|4zl#lCmu+Qk_U(p3d_t@yG&@KXL3fBw_9mwqg`6S?8<-Ff9X
zzj&U15q&Ohbo*7*@6$*3Zu$IV@<ow(`==_rsFH73^L9%v!>jV!q3cybd=GE5ke{!%
z^KJg27yGRH{dtnM?Wuk7%8JRT@W-`pn;-Y8`*vMA?!S54m9>u@Pk&h9?vqlwI&*EN
zo${`WdXp+1T~V5@c7INT<B6A{yIE4^x6d-%H`y<?@a&J+)1Y+ubyZY;!`i1C_UyQ_
zRcwpdW+M-$rX8}cSM81E_x)IN>+HRC6YsdoUCwd0wd#-4vNd~kJ@&)4tu~*fv!lD)
z&$Of`oH%GQ?Lgr_g%71$+q&h}FO8pl+DySP!N~Z2*AwjvrGLXcF55`TH7>ue`=-b3
zrSJmDllFhJKEHAh33N5U7+-q+hxb2&Y7_sj7ylVf?cze=HrP!Db;{}0xn}$vj?%UW
zJ@*5t<q^H?mhQiWyqgUJl@lgdg`D#eEUH#qa(k0jVC?ok3fKNKeEzNe_XB9BxYYfB
z^(y}v_OVvpzb)ba><{1kRiSSRO~OHYTuk@>`RDncp@z%;A=~!<4D(Ogzi)K^c1-x^
ze#Y9M^~s%!cIW?aT>GEl`6vJ14?u@%AT(?<n1A*M`&F%P7PGXs{by)_TO<FWIr=}t
z`6u(gA6Wc5ne)&62Es<|C)TKcE5kP)xd`*(PjoNJ-<F7f_J{SW*WNz0uDAP<JOv38
zu!es(4*h2^e;^;S`o^P-b+`WsWY_=vGx^^Sk@`ct`TrU0C)M9)>VMm#|GAF&*V4G-
z!mefak-UVYJSVY!_6Kor5Nut3|NPSXM=Nted<E|;ZF4Q1k4uaF><|2_Lf;haTDQD#
zP5j=sH=T_iTZc3EXKl@WWb1U@WwuA};@>Ja?{L-J`6cxNIbIN6(5%mX2sH;!MEVmO
z`c#Za>_)jJRy4czyG;}EjPf@S^5#hvUCVQvKdNRz)h}TiTMxZV<uf+9{}~je<-hxX
z7X9*X_J0P2-TuEG%>Tu)eg-a?b&u)PV%1*pA$RH4JJXvU-P3exy}g$?dHveb`8R&$
z|7Ct#|Nhc{hSo}pHMb<H*3IsHQ+nypsU?+@ZPs;t^?y@q{;%=d{_ijA5A`Flcm8MC
zzVbhVw#l!a-Sfg-tCoMWoBf}`;dlJ+FZ&<5qjJ95KTZ59S&q-R{<p8{PaS?`^sX{i
z^c9M$<dK~yfBVXRhN<mWQty1)rG3?Y<Cput*l+!3xW5#1)C4LI>^lAxr*}Rr313y8
z{Br*n{;mHR_Iv*mLFMEJ{|nP6PMJM%%H#v57oU1{w)M-c+%M+Sp4QyxdwR#nIBxpi
z{$>9eF4$fF_gDW%AS!3C|Idh8x7hO*SGPvbP5b*^@0;WdhB7^N(X{JF%D>H0`Q_tg
zzv+x;L9p)6>$Rv_Q4HPd|5E~Fj`pIx!N28w|1&J8UH|u2{D;M;oT~YsHVlb)`_F(J
zar1B7%K!5t>OaH%h5tnAK@kTI>eur>nE#o^VgGvZe+I4BUsqg`O?qW<F;L8jX~hB2
z6BW|`yjkSGUi{B6wYuc_c~R9X4<~Kj_+_q0V3@)HRf1*vWdAcv_`3f0*Q3bY@a%{8
zfAUBDXV@Q+RNrjxmg)W2efdRK(GRx%6&h=`S8F`p)Z}qxdQJMAE9zdm1>b-A&%o1j
zJ>)~^Vr|vBT@vnsZ0oB<UaY)#DW$>Z^d~vvS;Y}C>!<(TGyV4hsq?R{=I4dQZ*4Ey
zDw-E-_DbUWo*KR#mPbQYtn@0myKIfcIiqVj8f$0UYH8*iJ~`j8QTv?M+}1_jH3>6h
zc%I9Ihini{dNBXijgv_V@1F!M+UB}$)_1oXI!%5v&x_nJIpG|$W@fVdUCX1He`kij
z+47&^0r%^`Z7WVJYBDHvtvvG*qk)30`~0Y;_&<Yu*!*|qpG9wn3qPL!<JXq|42An=
ztoPUW&rlt2JYD>cTE+X`{ZEvW|GoT|v~T|9`X}O<{~2P0|1&s>erCrd^Ub}F)H6?*
zrs3-Q@0)$C{s}wiR)I_PFVyy>e9-?>bMLSCuM6y(D#ZWhe*eAlKf~tm^qBO<`fK;=
zcg_E8T9!Py>&Km|@#3m4swKj+G`m)v{~ogaLfQn-@ipJqo<7+(GoPjG#exk_CWhvz
z_XkB>ve;OBr;l;P>ZTUkbM>_)+n)%oSAQSvY}R$XR&n2f9sPgRZT?&poj27uhTqQq
zF3<C4OFrFtm1TA(TiEaH{baMQ-L;9Td-@(4^1Wf6)pm}#xMbqDbN-ihK5bhmweRFw
z-_VF1pO-IW<9MY3x>ZN}$K#z_rdj;^qqgu;d9PKl)VqS?8Fo9?eV=-3`{rktuJ1hm
zb$#)yV~Q-LEP>5E53W0Qrf-=2_M+AVzFX@n*uSZ;RCR?K-HX_{GGppDnZoeQ=(Rmr
z8jJQUUW0F*0$J|U+v<PL)8~J^a9#F6{!bl&`s?ohLT|jj|4C-ge};8!JKi(bpG;`}
zx6A(1`u&IhGjQ3pHwHgW2MvGZIMC^UxSy+!-v1QwZvX3D{~2B=?T+XB&v2sZ`rluV
z_Qd{YQ0;sDZ}<9tP1;*Oy#Ld_w*LMaL;HjC7n>~oxcKs`KUUc>QI!`xB>yw)4ADHP
ze5|PNRBfO3BMrYfDSwyb^=B<fxxVX7evCrIn+NGP9`w!KvAS^C?Y6(Slk<Na`g#6g
z%xY`nyfl?xll_g)IIx9V{AbA88<BRQJAcZP3QN<STT@OK<moPXAv5vIS^b&cdZSNA
za;?d3V6QsF_pqBS!N2AbTf*&T*?T+JOv`+COwZc3mh0ED^})+Rl%#u(1X~`9w%+@C
z-j;?tu`0<spUq02x$D&Pf2=-XT188mU-71P9Xqt=p<b-=kLg}5E0?X`dGTyvrs$ch
z%SKyjSeg5ecV#rFZP>K_PO1Ie;w958RVKg6d=$xMUva1U+I!QUxleiLUYvhbD#|$Q
zaj@9;AeQa@6&ClZE`PZ>D<f~ogg<BhiFSFat30{1deh%)$}h4do&|rKbLn=ht#?Jl
zr^q~$uZz00&s5D#L?0Ey#*gvutAG6J^50|sl6Geymm@Eq%D>NkR`lpUL(6&DkK6yW
z<ka7_KNEiWSN@--xAw1_>R%}BOTZz1;^XO%Kv#_WgYX^t7KFWQxPH0n{GW#R{~2E2
z^8d?n{meYp{|x=J?=Sk#5XJD#pXoot+I{~SO6|X8z4^(KwBtOo!0X4x={Yg8KH1Ju
z|1fdG=bNv@KCbyRV~N-L%EDYJm9pe#ahn*MBY!N~JkPGUJ7(4U^?}FvMZRC&xZu`o
zY1S5#rhv+tD`nF^Ia}?S<nh94UDZ`9EjhEzlV(p_vD8T{>-IFgck>>FTi-sduITwP
ztYmk0wd<tF=<hQQt<$#ZnpLs#(E4pv8>UL^C@L|_{W>MHW15Of-Wqlj*Ye!_+=7Xx
zPaA3;T$81}Fm3AEwX?6s2b;IFJ4NcH&q#b)Cf9QH_UsiYmln0g8|fJtWSez8Z_`>5
zGWq&!=g1)6^$Lr+v}asFE(Y+ihhP!GUw@(uTtdVnl@OaQ3NF2-dr>9h#&*6PC;q&a
z@&Aze;`H*5i?6I$bN|ih^}o1d|1;dz{LgUY;@_F2{}~*!_n)-?G3!TtI{$OqbLlrO
zE!teaS#SRf-SVI6{~3hczolOP%ewAAgUWw~N14?Y_4dzXF1^VNjNE^7di)o$-G3(k
zXXsq~+vxSbrt9^S>OY!5l$_tV)HUrtLt1S8g{b$R{Qs%-L#zh%p^sdA^YG7qh8e})
z#3-^mHdS;svP<oeUAMXY^S{KidW6-S`7IR^mXGY2jDKf#{byLP_5PFje|+W6$)Eq(
zZnB#QMaBetZ$Tx04L;X|9{KV7pWRyfJNeJFmq0>qFF5py>h!lyy7%acXQ}S7&<!bE
zhL<mJeJNVum=qke@41=g{<JSP(~sT%B)AcL(Sl*T|I7WKxVQXgSjPz(+yB{&TkhB6
zle(~>MIvcN@5uiQ6J-B0{H^x<&#++D&&db&fBaniCH{51?`!){%=`Z{tk1OnA{=j2
zA^y)Uzpndl2J=lD?tj~U|Gn~`;pXbi>x>WVf1SVouK(XzyRCklx&8N@`{tH^;Dy7N
zm*6Wx2+0k>B8KQz$csf=b#fD<-FNSZeNYnhqsLAz^!ceR8{dYkSg`KQF71H5t#@}%
z=w|8uUims8^5xqinT;v8j4WP9on|#xmCE({+Yx$o_N1*{AtCd8W`BE=xBu>H^Ig?Z
zZ*E0?=b2{}rp+IC^sKI6-Wt8B-!%i>x8~h?xuLexWtGi!mgaj~4n0)w%ynOAnyr;-
zmKYc%mAlFI_fexfE`GMPi*EXsUAv#G<>vM<I`H)KAT2-7*xS!sK5FS*Gt^kSLPsHf
ztF}h&{E~Th!gLcm4xMZAi}lsK8Y;crf#}LZyYu7uKebWyJMNze&qZn_?6q0^_ifF&
znD`GfAMZ2!&oH64|M&8rUBBvm|1(U8UH-Sg|1V4U897`sH{~p|NL(Sn6@{w_@GBoz
zo^X&m`5^x5_WLeKNq=AGpIak7<-;pi)aLj1Kkd1o=6C$T>5%64>!hQPcV3=%VTY%6
z+Qt{_yACJGa&FIFzVOY}<!?oMotGWX&ifSWW&WRmBRDwX)$(MWcU__dbHX%LzN%b|
z6+Hf1Q{BxhcK5HDd+e>Y+`YN@@5bV1?|$0az0cZQzv|Db4ORw;p9?Q-%vcfGr#vYj
zzwmF^e50}(8&d`5y%pUdc~Wg#`u;86TX|%*)G+ajo)7fr@0-d|bTI4gJ5hsIOV-`I
zxTbK~3iYtqb?r9aSD9Pwd1=zo$i~N<JnPh=Evskx9-lVr>9zk1l>&#Byw~-eYZQC!
zy3gBbc}4zF#};)xUEfif$g%aqKea6{i#+!|bQK9V+(1krjNGvX-54VCpW*n``M-Pr
z9IdK!|9$D_oE!Te8XvDAc^q^7CYvOBOugGq22}u%T<{oNGrR!xu}eSB-2VGYe>&$M
z70@=b<sQU@1r9ZgPj7wy)2?s-df__j1NT2gDEwzw*Z*hr#_Ri^SXTaLh~+K3$M&CL
z(n0pWCG}6^>)Zc{)bcYeKX!h}i$z^G8U|`iDN5&L{_5MO)!ug9w02bgBzo+}e#8oo
z_$B`tPWZ{MP43vfY3gsO!0;6lU&w!$7*T)u_}?G-pc8(j`F~aXXV}mCPt>XF$2~9m
z(?9+*9I%`G>e9cY{|tXr%;dj({LfG)4L-HE?eM>O_U{kZw+1ZggR10T?frBA`aSzW
zm-V*)wW+_~1{#rlsJP)j!}9*$AMQU4(O5jG|I<JF56rJF{mXy-UjIMCA?L*Z4F2-p
zAMQWY|HC84{^jF;f6RY`a%ntS{AYjvABJCB_Rs$H`}BW?4-;-+cc`Vte+K=>{}~wU
z*6Kg|x9`)Sb?MgYhp+1_Y}xRj=d1n0$k3IoOY<|A{`t@FK+wudUZe}B7t{X<o?id+
zHUG2vL;XM8_4YqsV?Xmh1K07_Gvxj=^#5V9J^%a<*L@t}gQR&nk~!-VszK3k=U@GS
z)i|RoZ2v=<UH=*8ZTOcCj`E_{|EdoEod03z*E9U}$Nw=@&p-b|v>qvb-I3CW*7ZNi
z)8jv1lRxvHp;i7vUu^yPh<f9HBL8+az6T{M{=5Gf>>q^wXJ~<^Bzc4%R>vcB)E}}e
z{m)>V_~-nOKuT<`EoQ#|@jpX@-1E=>SmP0i4Rq`>FFd_&{m;Ph^!lH#?Vs%*dLpsC
zp~21Y-G0tmlpuRr6J5{x=i6DfKS~qI7smTXy-E4-_e`Kon@7Q_hzIlTw-($}`Onbb
zVSm^D+}@md>*UrcTE*J@XLum<W5@cWJE_H|_mE!FuCocEOG%r3$f`w*zbyT;UBQeS
zt-I1MgoA=h6dqhpYD}*^|Hu3{yKY~&QLlODbfZ81)$ER^x-zH#XLzUt&-y4yqgC|q
zN%x=i@;}(auP*&_|N6c7{|qf%umS}s8LScsgeYYEnptoD^*hK+L~(*@rbtoXzxd<-
z7;2;9&;G4_`k#~NsH0&+4Uh8McZOsSs^N(u5|(?8e8SJwX%f|q<E^++Ysia>W&
z`KN#CAFv~wi7mBab0#jYV{s;|)I)P-SWeE8{WFeAYfad_>Bz7Ck1OYIXo*>}C9eJh
zs}`TuI~UX}uMID|k+M8;dGYMmGxq(Ua`M=Pe?{B>sDvSx=J3Fe{Ljz=%jb(R3Jmsl
z_4W_KP$LUof#ArqK`?v4<<x$Z3J2lnY$Uso3c3221K&a6zw^(3hQ{!JLU0Wt^@m*H
zHOE@yA_6JTx?eZ2{m&r(gAwFxP*DT-CQ`kJ!`Wy>4N7#vgAP|AhpVUwLQY+4|1%&;
zwovW2R<Sk<CQjY{T-tN>@{WU7rt6s2?f<~GSkl@poptq@=~7$IY<!+=sjSSlQy)GW
zzHW6OsN-_{_p_bxaKZJv7ff0%U1nFeYRgT~99PtY0a+rr-}67iggNr>!xbOB|EW_R
z|MhPD7s1%N$^RJ?E$r{DDU?4neX-fXKOVmpZ(Kbwb9(TNy^k~umX*u=3blIm<@`_K
zb^jS2gnqgaWw=^x>CMeAK87y!(mcM)(rc20>W(R{T;jzSxL&7c>)IVU<ebQ}R)Jv>
z`|?X`)`VS)Q|L_G=KA@vrTn4V)vGLvcKOBb7oHW_ynVOae}={*-ol5q-L<BcS}1&5
z7M?L}+pT$f3(~gxM1SmmKl#q(_c=Q6bW%SjY-<ZSKP@R|{_&;i(@rke*lWdfCMeU(
zYL2Vu%~$$~%Wg(*+|YY3^jKM+#nRo=PWyEonQJG>bAYLU&Ei*3WmS~rTKm8?)n!}c
zc4rlqgcr^7of36XiN)}wU~Wqa^V`rh4=;bsb!@u1rtsF2+z0HI@4M=6ZrbJN^``!o
z;npj$6OUi*)LECb-MdpUwe#O=xwhThv!yN9sv6yna#yK2{&as#)d$U8tw)L~>hAxQ
zH8c~^PdQ)c`}CCNXYXY#^%*-&Ps=SAb}c`>EGlwIz}mH&7xUk<S=;u<;@RrRm)iv5
z;*%tvwuVMJK9?<G<=39IeNWqFi!Fbz6^4Yq4SL6YdcwDFk5~eG7w-*RZ&esL+4Ig9
zMRkp3S7(Hmi~pMPpP?k2GxkpWtKh#E<M@iV*wrsQ+ILI&#i9j!_H4WSR(1ED9ewxi
zDeinawP{s<QT8Fv?J1Q{zJwbt`{VLq+WNX}W>HV~OxG)0WRP;FmN9|R-s|OSuPs^^
zYqlmm`g3)^q1wEIt|DR1;JHFAvEAFew`|FLEUvfon$qEFjaLq@``k@U&1T(U>e5&O
zS~|;OUjOmA{omsJr*WQ-{D1st|9kn*>uc}+Gdw<5|LfWPQ+p;Kng8)4|L^5LuV4Q#
z|KpOF{|tARf3n$)dprx@Sl3^kPu_Hx4fyKMTe|Z<L&JZDsQYX95$yNz+B5z$?Abr@
z%g*}WyYyl+r<)bHo19|u{O*@EFMH>nF3ag|eXcxfD%tm+cG7LmE_%8D=7-i#-^Gsy
zAOCUf^6ABAGfv&LD>G<IRF$tXuljpjns@HeBmOFe#*1nnS%-+;dD$<zciBRovKL}^
zXV%~Ub+uO8e_J*C{@*6NCOwg1&MkrS=4#fbYZk3IeWm^uZ?UDjU%RYp*`=4;{OeN<
zx<t#8RC_tJ>VgCJ?5fW8T&~)Zoc(0m#pdHny-&}&=UpwfS>#*yt>@CNWiS8EH(P5r
zt>i^h>$wkBv%2nGJ9XG&$E@s-*(awqoRGBn=>Dzxu5WZ~=?ne0qT99i&F?&Rd7_l3
zs6bwt1w*;%jhENTri)F=^3LR}<UciOj@Gv?%g?gT-FkNErRamttvl`tZepwMFYH?J
zbloYxsJf<UNs|i$uYa|8-JQ8M_0#j7#g8paMay>e7`&2GY&#bbp0(~<;nnt~5!3xQ
zPB@)!x^+z<V@qm*wEgpYvrkLsPm9uR*{m|@S>KumE7qM@+ctCI;bPTUeutm3XPo@L
z&M7@oy*%{Rnj@`W*4em8-?(XBc5aQG`fc;nlwCb*>-p+WIQjp2Qa@u&*Yyw2|Fo$5
zXISU{v+L8x`|mD)F*#Dt<?in)5`JQyMqotL`<^}44_6B5#)mfD3YTt4*ZJ}F)`AJP
zub%a;&x}*tVjkuD$Go7q&2V1GFTpGFYjZw)oob);;Ai#C+wVoMzR9@gD%x=fJRZf#
zAmqoeV(q%M?{lwJM?Rh<rlK=(L#l;^h4LM#va%gQ7hOdfKqH~r+(+iWTKjMLudug?
zKPLYT-T(Xeukf<yf1ibO%s(bo`Yry?peWt{{cO=bL{B$1Y$+De`1Ke5$<*>|ZT~30
z5?tZTY^BQ<PX1XabFb?!)jv^Z|9$lY@`4ITub2P(Y8m$rOJAH_Q7>e-;!SqY?a1s}
zrC1vW3D5cS%YA+)ZF;eO!t$VyyN}MVT<K>jx}?|pZ0fqVxe1?~eo8zqEABtEeD;wd
z?-kK^yGw%wZf`Fu=RdVkV{P5`u<tvkU)7Ad_HX8%dn%gs-&R+L^cNgi<yU^CJnHG5
z4W2SmC4naMYx_;RUCn#fTwSNyylBnajVo7Xvj=tEQ|9PD`Ph2%{7buJy@R)H+OpB;
z*3#prI8Odv!u4pSr}|2@brQEG2Gz#DaQP50T`Ni@Z++*pe;2BkH5h1a3@&5pTAaMv
zH?iis;0nc)ciI<9uKA^{-nHgd?whO)TP9`deldG*db;xMQ;(7tiz@4W3(pdd4_>M}
z=UTU-<0rd!?Hkv4Y0Wd*oO@?#^P;J1O1K}nOcRN|E%8ppf7g>M1{JLfC%rwos(uNB
z!P<uXOOr18KF;2IV;Wc2zJ^N6MP1qxUU9v9^q(PU{=2S++Pk~|de!gJK60+C{rndT
z!`m*kmE}j@e>v-a@R#Y~O)t0Knh?E!kD1{FgZhiNaf>7DbU)0S|5#T3x809DsWP5C
zA7yO(tOWmwK7YESBk^Ov>fqG+sh7il=2cGiJ$B;hKFuH5&g~D+e)*XBZQ7=rTDDsz
z99eOqe8mCfjqDQt8MuyhAGxP@W$S+i?u;_2C1v%uta!|WPwVbF6?*RPC5B3sRvF>G
z?8txOuXb(RzV)Mi&w5i|JvJU)7A9pitNh#}^Pj)dwm&K*`}F;>UMXFpSGk4@FaBpJ
zEah>WrnFX1WaWdN!hYwtXJX4_FMku7wc^s8b(b?U)-IWPZ`FJMJ?A{{ecQ<W$LabX
z-P}ie^&@7<dIw$-Q`@@hQ%jPgVu1OokH@}R{Ob9>VvqIXvq}1b83i#Ft)(|i%>-ke
zOdd0CE<1L2?Up=Ai$DBpOTNW(zm4|3u6XUM=2q*!>G`+vZ8l2AF4x>x$;{4vxxuh&
z<Dv@vm<?Cfs9pJ!SXrC7e4&SKQu~Q;i36X#Hs0^y+c!Vv%*M$4&9gi$tD^qBihIB7
z?fcmG^1B_L&b`?BP&fQooW@7CWzS>U?)zNY>fX8Nw~^R9gZr;eCpR#>zW&UvY3JJF
z<NuT{B~K1)K4w<cdZau$-hb2O@3+=eKB#=^{`vggs`j=&l46Z~JGt*|J(YDm$nQsg
zVvn6;KvD7a^`fWD4@tkQF}f!+`RJto3@OJor`;3X<Ph4O{yg_)-rD0bbG{w5k~p*X
zk;?N^3w8#@>|R}cKXv#0i{GjuWYTlFlrD!)|JJB0wr=C(l+AN)Yo6Wr!N)$xzJINE
z;<}34$HJuccWlYcyYI7M(=3m-Ilik-K4)S0^QJPU$*TN~)%C#o2h}(C<R(AL<}J8+
zR)jOZSp24Q=f!9hCjZmBmlxS)o7K0OfBW>x>=%F6I+MNXK@*qk+$45djl1Bs#Ds9=
zKI`PQUdIyEB|dxEoHC3$_A>JKsXOb#)~2tI4Lv-~<M7@e?T>Vm{Xh83(zy^B^ExOd
z`p)VjvdpWM*d$Lfq-<mSX1VlZe1}c=!|1izzE?|scj)x+e12_qE6eklSL}vXe=erC
zFVA1!9`T=nPbYcyq5X-ws;leyt#`R4-%HmEnG~(euI2CdwfKmg_|;O~C|!S#kE+LJ
zYaL0PbkF18#eAiC|Dpw$tABINcx=G6M498L`@Ox7*B{l3-WORrZI|i&TzQ=}<`4gE
zw%hvfvhBI8sjE*++SENe<4k(K1YZm9+8<T5*{jzjAFSi+J|tG&{cYRs-FxSpFSehi
z6qbB5pkVRj<NF`R+E&cC?iJCwOYh^pS<xRh{h7YVr1$Mg-ur(*ho;Be{uw^0V)8#b
zMeyXH)z{toe|(<*Z|C+;>s3Cg|EY2Q_wrxqmpiI|GNsP{y;ZMF&S=JkewD4*)^y+z
zW7SRn=TQEi;p<!Ve@)rXKr1MYpM8I^er>HeuI&;F_(MKikDmX7|LfXc2jHxqF{l4N
zbHAMWpW)q$JCl#(JUjDu2LEQ6{`v0<`~5UK*Kd05zvAy4p_ALw{>`<1d~4&1HE~9p
zukJVYUlkb3yIWsCI=g%Oq#K^Hr^PgPFV$V&;li!Mo7JE3WTIHzi!Z;zWJRu-ZQ45T
zP=%HL!fm&{WS8gahzTB>ke=0i^2G6&pqJYheLnKC=#}z4&tt`X+I)Ret{E+!RCHCn
z&_>o&Z1LVTrMKGli}o!nP~7ac<M+O@s=wLtzl)rgdZ&D3Gcrt8d-vq|(v->bwgj*4
zU2XPt(;KY{tpKlAZ?0W7J-%^!cILw9?y`Suk40i%Yii$~U2U--BD$#Y-lmKdzwYLR
za=+;QwN_oPIxFw)kDjxIyl2c-iykVid0y~k`PoMnD*Wo-a-1GsXFYlT%euO*`q;JW
zc2EAQWtF*cW$~J=$0mz5HfT8UeYjiCb$U_Pv8A3N<rDW>1uk1FfV0{2;CTJ+^B3m0
ze^^rTCD65|^U*x-57+%SUAGB-^fx}VNqeuWDO+pJ7yhRTIt&b7UiL4#BD|zd;>YDm
zzK+g(-fR6T-vwP5TqlNhSXc3d`v2khb=`Y)jn$5P!I<oQQXg4<$-ce+r&h~FwJ-Sf
zmWk)<{`^Y0xbz2~J%8lus<qD++O1uCFY3gkdp<j?%l=-CnZw6kcUWTm@`^h)%dPfZ
zdQ})0u=(PP{fAStSKCi)*S>XJG5wLe{M|p6a>i1BS40OLxj9?q=ppS1-rJLJGw`v;
z%$gthV{-M%*HyKx%Qo$k+fr+#ee+%2vAj8Zx+L7h*&8H^kJnXkt$r|{G3(ZUhQqt9
zBW+yElb6PNye(foJ5^=sgumA<v^W9^Pn@@Xz5U8Rd8=37s<yY4mR-!xeJgfizG9lE
zVgOUql0G+fyTg}uX<t44GD($l#a+u=7dQ1?^S_(<_1dKklV(TB#NI!$U!cG3(knr~
z?p0A)-5J-p^O)`&3z={*qHs<X-`ri{u4~ReI?rEG8Xx*`-puU6;xg`-(^<wm6_Oqk
zH1FL!A9tYSZ0waf>9eaU%xt+n9|@ms+`X|}_35FRoC<<lL$;sV^W|*DkJAS)Rm`hZ
z`*u0ls&3)ZGpXCB98!%ddm^fm{;vPpPC0#sS8fJ@fzKxAPx}>F^j>!MeqXC~twpDf
z&%60!e%!S!`=v7c*GZpvTXMOO^~<e_CO03ZSIG(qMw1rIb1Tx^rQT7xpQ}ds;;c*W
zv}dmkdlH@bQbjTBidfE*{{1J;+wJ`ryKe4>chR-ckylD(%c6glc8hJ<rFJ*jQi1n$
za|43}vwUyxwjU91A6$NT)<162D-G4ilAnvV3z{zGJ>KW{W}2=4<!x4~eW!0dxjcOq
z@4DE(x2>yJuUntJ`exQeo6U;8@ki_UKen|m%l>u7>gu#jXN7ibi-_Z7v&wyCbo%=J
z<yCP4*K7(OW%s)M$rkwA?Xz#1VV3UX(sOJssT)7_?>-+t<MV-a&0ZgPKYSFOxAl>~
zpzqo{5t`8^X0N&2{67kCp8Wnzwm{qc&^=y%-ps1C{kNaaE#+RR$J4P)t-q?VRbleP
zK7P@Y;o>d5mPIR0EuFNhuRpw6ZR+0J+BYl9HQmBL*_?L$(Y*HT%4ogikFH$)%3QeZ
zxzCl7gCdGXb)gd-6AJsH<>M^BM}78Q-+F7>%3QXYt1Gruzm=2Rb5LemS02YnYo8;N
z?|l9hzLY`KL59IVd$!r?)>RL6Y<JyNO?~<;`NzGxH$$#Wdvr~{)BbAoe}+BvOY_1I
z?4qap-y2$exPP?ssa5}<mOT6Wrl)=&R#cZpVG*p`@BE|ii_NV2N9vY?mT6|C4fZjp
zMP1sD`PYI+9<Z%U28khByvzIrBG+UsRXf2@d-0#?lV@A9LIa<KZ#}1zYU!&NTAb8%
z5;RD9{`cg1=nW)YP<Hn5Uwz6yC69is+J7oLFw*|R#K`{)Mfty<)c49GvjdORD~oSf
zdE?rmP4y3zHvfBaAAIA7Jrdh)amPP5`S8J~=ECm(4E>h>8D7t>*ZSlyRDb-4{k!?k
z<NEFPW~%O)xXO2$6I)2ReAMTDm{ff5p)XlMYp%Lxw`LW!NyVJHwV0e%#6FuMy3C0L
zACAqde_(%Y`)g)6>uIt6_nGp()BiK<2A!4M@D^u6LLLY>vHF+qn=o@<orPUfv_JKH
zp7>AnY){K_r@*e=8>gl`^<7f>CCvYiN?84!+0grMV9a>quQKkRj8FdB8~@aF#hUp)
zJl6bY*lGUvll`%NB=-C(h4Lp3-w1g_D6R1)25(&Jn)09FkZbzillvhTsKA-E?t*`M
zY+L<~Z55sTpMg_!{&zL_H8}_l$VCR<M6IQ+;Bd4%Nv?^gQ~A#jn*Z<Pzp3VkFo>+$
zw^Z~$!xSYxrgZBz?e&bUYk6E!HEv1=EDj9uU3~Vd{!flO_4~4?BJBrxwN)Nd@NZoE
zhsQ4}w(jrv>kpc>zj8*)@;}2B&wnrfg{s1aFB&vauTk1DzrFqwU;g*`{~4S_Z(|<A
zgnW(CyVXcn5;{mtfvzL2xr#g-o;PFNtsW<{(3%esDdDQfGHt#885ASzw)Uys>X|=r
zEo;@rxOD#~S@NGh=Kp7?>;G{PiGB5z_M?9r^-jODTYqES`}Z0@rP}}eiT=-UfAK$)
zN(9@qs>=?fia%-h{ymG4^rM=k@$1NL{T;Sl@BV%2pSYs_{L%e?f7w5FM=--zEPgHY
zX5WKv-n)OFvU6MbpTS=2Kg0JI{~3DaP<U~R`qy}G`rUj}51*r^hR%5yvhfSEiH^&?
zPW>IWQ^Lc)PqFEB{#U2_pW*$*{|r(noIMp5%IrH#0>5^a#TQho-u?SjUOC`D!~E|0
z-(TvF^&_#@2e|QZfSvjfzq(b{O`#Hv*O?~G-#$O}JNu^o^6yh>a5`YGtNt4QO~2dU
zOedgBK4Ln^wAlOi0zaAZ|NOE2Kf`|aKN(2utgj-qNwL%K$ek>&uXI0g#s2xD`~UvR
ze_V`UuD-HcI_>ZB{B;%!!)M>r-w}7t_Wr#9Br~v?XP-h$3{N65hW(IZIN|k4D}Ro8
zF2!3OCr$jDy=Gndk^3J%v;S`Yd41v3Yeh$vT>9N7b7)pC+q?5RI9Ao;T5Z4L=cOS~
zfP-2Bc=6(`OzGk)>rc2n{`)y*edB)k@g1Sq1aAd`#!e@${^S0tR^3%3Y*`|GPRn9`
zwjSjKk2fayKe-P6`&#%<??1z&<E8%@Vy}btRNct9|4Fp@@2kV2e<Xhy=h%NtepR+5
z(6vVM>qA!&@Hxl~t7pZ|+<PUdU0$!~zSr_t{`PmeCci7+J)3HCCo%c%XTjJCt-`sf
z=e|VExv{HHWWL^er*FH{a|@U4nYN<w?QPyI$*&{Ls<>h@Pgm;RI@I!cN6A`m-l}8e
z(<F5tZ}Vi-Il3eHedxPoTe}ZmigGU&-ch`TO;XwP_0=}1)umgq?#-GPYI%Lh-f1iE
ztSo<cQ)$|r=rgLGeWJVSOxG{X`0(=Abn#A=Q}azKo`htty5*R;T;5XRc*6sksgFfX
zoDS(Ger3C&c%=K#JJFDg#D%67QU7*l_(olQ$}YJ*w<<4vUF+Y4rgsnS5Y#$zpvhuI
z(uWu2Q_tSgT~kxJd~fvc((T{E7R%3<d#ReVxLij|*67)_&n~rQnb$atC!9TC+P~?r
z<%*lJtKVPWxM^iaK;AF4X?LwBoL*!*=h}4xzsU~o7;;LUt<9Phdu;LAHP(+M`c<bE
zHu}$ccj?%vTL<-o!i6-`WZ5lEQ@3WDUiYp%l{`sz_Q4bEri&K0c^>)|n^|n19lO;!
zcFEfXf3^MFPH`XYdYb;2xAD}hrzS^vS4C?xI8G?2+j=?jZ?kXoo>zjFvsHMyo|Rnp
z*s7gg#<f~OW!|0ZcKXwd7Ok1O=lzxYyk{PV?#<ipyyE(oiskL+w)KWZ>`V>p=~}%u
z;^(znC1D@6PO7mp%kI8(_jTdz*RM^2I(2U{?Ylhj-kPkaqqE{7&dRJ*QkFf)SHQ9E
z*8H#)%ic{}X}{-b`}TIV>37$EGQNG7%dt~`;<~1HhL2XBUSwnCxVO?HHEEem$#dE4
zrPBj@cFwc)x|yiF>_XHE*W%ojFFxJcd#BJVW>R!R<+;L$D~A$euas}zw*P~7o4W9x
zZM*i(|G@M3;inBZckcMl(BH&<_vOS{(;gr8<`i7}CRbB|#isD}wJpo{Zt698G3lMT
zzz!Z~J_d=iX|XYp-j7;mg}ZNmtb3(o#me?W2cu4hO?!H1pHlAYP5Ft>4;yYMY5OZz
z60<33>5cG%>o|TkKlsmZKXl#nZx?+PtAFRKPXC^<L*h@iSoZ1ShbtclyQLZliI`j5
zjrQy1GMwBt%h=-T+R&x8rS6ulBH?G}{eUen3^#sM|Kn%#-@|`iFI$&><Ua$n&|E&{
zKf_-AnFnXq@7{bG*A8*qd&k$Ww@VomV`mzRx*k|NfoGsJ77%}C8dPoHmB3q##@$!;
zKT$6H_j6r1(wfUv*aY8d{Ab{^Yd>-IpYm7GVHVoK1_QO|Qp)^KYr=nqsvZ9suokF^
zUH^FgN^X<?vGFVD%&2`LzdkJL(u5zc^-y}sJ;80i9(ReE7nCWR*|KGCzOZNh5BK)D
zTNINxO)_U?XI{F$P_lQr#=Wa=+_j5(Qs(t$KdCG_QyDWkYWeb+M+0L;=e06NN$);3
zqupN1Wy!4V`HOBvyqd7tdXwkBwUO1oq>f3Zrl)=Jxfa>0TCz3c$%=KGp8H&0Uz)j)
zSLu{^VW@<tPes9hh7^<k4EDDe?e({1dh4z&Kb_Vtb7<;x)615>v=>i4JW=|F?&66D
zHTMNobJ=e+dA(((PF{+!^YwCP@84$oC)8&IezrU{^R1q8V|v95m%9>uy!oz+GB3WI
z7#ruWQoXC8`dZJff-5E7(G~H6?<AA|-nup~Jjiopey-qLpJSZsLYIC`7j?~F*PmRt
zs<!V|xsriTk6)#1tN1Us`Mg{DMK-%?rLfLOu)nmSvNkSU*W|P5udmuGUaxHQ6Fd=K
z;dcA)n?jGS*hA}8<|OUjp}2kf)^5)l@kvVl*Y|&3-h6P6>)kigpWmq16Te}T*V{Qt
zi+3K<OnzhRmys5Fvh%=Kt?Y##-tLaho-SbURHXk(*xt=qx%Ec(+~sflJg89eHu~b}
zkOxcEE|<UD+N1UOj@p%=tm|Jb68b_mxM^8U(A;_>T1VOYYKuZ;;{80ksmHJDY4g4j
zDcyUn$4vEF(WPr#8jE(E+W<Om1%#2azHiL=&i@RLpYea^|GeH`^FKpXym7P9e}?k(
zY1q#W!gZ!ld|b^*`fNs6`EcEK`w#N3qrW!7S)X=J|9z%^+3EibcP|9GiXJE<`|w}c
z6+^x4q>rWT^cuXE-x+JuX;GJ<gY<+~(@s9zzDdvJxrK!Nxpgn^DK}&;lirr7wT_Wt
zPS?{F>wct(F1Wf&TPFCx<yqyg-j#}3J>C5B(XGrU+e=<5@jMAJ&iZzGWz;Tzn~N?*
zXa4P5wDXqMywu2}>KCR?zvR1l(T*!QiPx3(Z=XBIT!^PcVs-kFZT}f=9ca(r^yc7C
z|M;avA;EpAQTDvkwsWR_Su*wAmyH*e?U%~lbZ5fZyw{r*irCls&1##xtb69$lu40k
z@+Ey!PhSj{IkxJq+Slz<zsqb8xp#Ky&)Ma-mhL^)%DZW&v36?P<@e!AA9>7n#cm1M
zmg>a3GWha-(=&mdX`)l^Tzjkfo%cbe#R}W?#fz)&O1#Q0lXBcs^V?iA?CYatd#~y2
z-@LT>t$x`#j)2p>TTcgu{#_hA^{A`%<iGR%UT*uMpD7e|aD~ctA18HXE<fLF-$%>E
zm!EDk=Ez}Fc&D^;m$p{Ktt|@{@|x~k>!WJ5`T4x%VY`man6-0nqy&3+HgEg7Y^Kiv
zJLXlE-8q$*roww%;#a7Z=Cxy2SIdPjU$8Ra<)!lbSMFRc)y~cpdUY#IDbjdLuFc1Y
zlE8?o>$N8Kq<-_w4D7Y^@?H_>D%!XKN97Ld5RKODp!PuDl|YAi$A?Cb=syF}p@uIP
z&hoAd^O&8z-Z`?)`a^$L{3F(PE0?_#7Z=>U`lw3gPJf25@^9rw>VGlD)<3XSn;e$x
zyS${#{`Rc)Ki~f|9Cy|K_H5yQ2HXD(A1}t_BNWQtyf65l;RS2ye};yy?N>GGm45fu
zZI1pq^JaZsmi*`Y{~2UOz^sq|8Tz^&*KgjZ{GZ_h*X#cbj4%G3`_ccO!9jcdANIws
zrr1xcJ@|Fj{WIrI|1J*r&oJ*l!;cq>5zKYwzpWqH|7BRW|ASoroA(|68NM*>`p<Bn
z%5J6ePw_kQk!$~)d9#1JDgV#+Ae-#rEE4RT-~Xp1@bAtQ_2=vVl!QzFo%>P#Uqg8O
z4}Nzf4~H!L)BTQr&9#4NW%2OPkq5E<)IVOd<36(6?2*0uYbpPe_=4KB?VpV|{kwC;
z{`q5g;DCZj^w@uf^cwg73=2fB|6{(0WH~rgn_rvmpE-9rA{N}iOndnsFNhDCAE1~7
zyA>R#h5M0W?@K+B?O(m*Pwac}>umnB^i)`YKtksDKP&BC<d{Q>mQ?}&T)(ql%dR)J
z)_;4}`Cs*ahR2Kg;Ve?ZX8%7c4R|1t5Hz6hpfJ^d1Bry7ISzB`eiDMF&i>;?NDSi5
zQeSF=+CS;<umk0&8~+(VSqhwy))rUVKGyd8eX57K>*;?6<Ya~vH>=v@rpP`PnQ62(
z$m`PNj8izqurhXk=<h47ULp0~arY_N1wXAFSiY}cwq{X=PVU3Pc8{siuh_JcjjHet
z6fJ5%pC!=DM%@4{{m)LnuKREH%eP+QJ1)I@(3AS4^`iSqg~oW}8$Y~{N4Z?Co$k2j
z^c+KjcQOI&xw)0L%ib$~d-wWS*70v=|1&^Jp~C+RIFjtOpoM;ZpSpIg$qwn-G<oBV
z`ked!89@0s@bBh&Six0~l#h|j`D!1w_|J;^h}D12ys_Vw|DWMGIQiY&4=sk)*&`)#
z<os>7*8OKdZPNPB#vA^@RWJUX_MZWg_ILbeKvs@g9Bh?;>iD&5|Cw`#5gPj6&i>B;
zDZw7&jMl}kuGCLuzozw{!PuG~p#ko4?S7;*uo%hds{{Xq$dCFGM?Eqcmkf^=(LBXk
zS*m2FWXe39@nqKX8l+=AC;zs7WDcp`x7Nx1@IET=<KOMzD_^?1E2eBdsnWT%QgoMR
zMo;v<HQ#@MN+K#BE54_g_u^HP)2p*R^<3DCb!w|Gu+0^1{JVce{e`{NKfCO=f93xr
zzP{ejLHbJ=`?r4!|1&s!eg913-~B80FXHz8Im(cKF+%?KulB#(*Z0p@{JTEjKf}VR
z@1I2&_q!(4=f9Hwq96A!ssHU?=YOrQ@1F@^`#Wm*Xt0c?meEW^zp~}4%zuVgD&Uo`
zT1Vgomzi&QK*i1ahw^)$2wn+b>p#5zC%gIY^ZywZa=l&M_d#^)(Y2l4GiGV>J-cz&
PEh0SG0d`57{QsK(<sPdF

diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
index 4cf134cc3c..585470d5f0 100644
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ b/tensorflow/docs_src/mobile/leftnav_files
@@ -1,6 +1,7 @@
 index.md
 ### TensorFlow Lite
 tflite/index.md
+tflite/devguide.md
 tflite/demo_android.md
 tflite/demo_ios.md
 >>>
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/docs_src/mobile/tflite/demo_android.md
index c94b5597a6..7f2f8882a2 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_android.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_android.md
@@ -1,42 +1,144 @@
-# TensorFlow Lite Demo for Android
+# Android Demo App
 
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model.
+An example Android application using TensorFLow Lite is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+The demo is a sample camera app that classifies images continuously
+using either a quantized Mobilenet model or a floating point Inception-v3 model.
+To run the demo, a device running Android 5.0 ( API 21) or higher is required.
 
-You'll need an Android device running Android 5.0 or higher to run the demo.
+In the demo app, inference is done using the TensorFlow Lite Java API. The demo
+app classifies frames in real-time, displaying the top most probable
+classifications. It also displays the time taken to detect the object.
 
-To get you started working with TensorFlow Lite on Android, we'll walk you
-through building and deploying our TensorFlow demo app in Android Studio.
+There are three ways to get the demo app to your device:
 
-Note: For a more detailed guide see the
-[TFLite Codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/index.html#0)
+* Download the [prebuilt binary APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+* Use Android Studio to build the application.
+* Download the source code for TensorFlow Lite and the demo and build it using
+  bazel.
 
-It's also possible to build the demo app with Bazel, but we only recommend
-this for advanced users who are very familiar with the Bazel build
-environment. For more information on that, see our page [on Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite#building-tensorflow-lite-and-the-demo-app-from-source).
 
-## Build and deploy with Android Studio
+## Download the pre-built binary
 
-1. Clone the TensorFlow repository from GitHub if you haven't already:
+The easiest way to try the demo is to download the
+[pre-built binary APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
 
-        git clone https://github.com/tensorflow/tensorflow
+Once the APK is installed, click the app icon to start the program. The first
+time the app is opened, it asks for runtime permissions to access the device
+camera. The demo app opens the back-camera of the device and recognizes objects
+in the camera's field of view. At the bottom of the image (or at the left
+of the image if the device is in landscape mode), it displays top three objects
+classified and the classification latency.
 
-2. Install the latest version of Android Studio from [here](https://developer.android.com/studio/index.html).
 
-3. From the **Welcome to Android Studio** screen, use the **Import Project
-   (Gradle, Eclipse ADT, etc)** option to import the
-   `tensorflow/contrib/lite/java/demo` directory as an existing Android Studio
-   Project.
+## Build in Android Studio with TensorFlow Lite AAR from JCenter
 
-    Android Studio may prompt you to install Gradle upgrades and other tool
-    versions; you should accept these upgrades.
+Use Android Studio to try out changes in the project code and compile the demo
+app:
 
-4. Download the TensorFlow Lite MobileNet model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
+* Install the latest version of
+  [Android Studio](https://developer.android.com/studio/index.html).
+* Make sure the Android SDK version is greater than 26 and NDK version is greater
+  than 14 (in the Android Studio settings).
+* Import the `tensorflow/contrib/lite/java/demo` directory as a new
+  Android Studio project.
+* Install all the Gradle extensions it requests.
 
-    Unzip this and copy the `mobilenet_quant_v1_224.tflite` file to the assets
-    directory: `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
+To get a model, either:
 
-5. Build and run the app in Android Studio.
+* Download the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
+  and unzip and copy `mobilenet_quant_v1_224.tflite` to the assets directory:
+  `tensorflow/contrib/lite/java/demo/app/src/main/assets/`.
+* Or, download the floating point [Inception-v3 model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+  and unzip and copy `inceptionv3_non_slim_2015.tflite` to the assets
+  directory. Change the chosen classifier in
+  [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
+  from: `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`<br>
+  to: `classifier = new ImageClassifierFloatInception(getActivity());`.
 
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
+Now you can build and run the demo app.
+
+
+## Build TensorFlow Lite and the demo app from source
+
+### Clone the TensorFlow repo
+
+```sh
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Install Bazel
+
+If `bazel` is not installed on your system, see
+[Installing Bazel](https://bazel.build/versions/master/docs/install.html).
+
+Note: Bazel does not currently support Android builds on Windows. Windows users
+should download the
+[prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+
+### Install Android NDK and SDK
+
+The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The
+current recommended version is *14b* and can be found on the
+[NDK Archives](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads)
+page.
+
+The Android SDK and build tools can be
+[downloaded separately](https://developer.android.com/tools/revisions/build-tools.html)
+or used as part of
+[Android Studio](https://developer.android.com/studio/index.html). To build the
+TensorFlow Lite Android demo, build tools require API >= 23 (but it will run on
+devices with API >= 21).
+
+In the root of the TensorFlow repository, update the `WORKSPACE` file with the
+`api_level` and location of the SDK and NDK. If you installed it with
+Android Studio, the SDK path can be found in the SDK manager. The default NDK
+path is:`{SDK path}/ndk-bundle.` For example:
+
+```
+android_sdk_repository (
+    name = "androidsdk",
+    api_level = 23,
+    build_tools_version = "23.0.2",
+    path = "/home/xxxx/android-sdk-linux/",
+)
+
+android_ndk_repository(
+    name = "androidndk",
+    path = "/home/xxxx/android-ndk-r10e/",
+    api_level = 19,
+)
+```
+
+Some additional details are available on the
+[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+
+### Build the source code
+
+To build the demo app, run `bazel`:
+
+```
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+```
+
+Caution: Because of an bazel bug, we only support building the Android demo app
+within a Python 2 environment.
+
+
+## About the demo
+
+The demo app is resizing each camera image frame (224 width * 224 height) to
+match the quantized MobileNets model (299 * 299 for Inception-v3). The resized
+image is converted—row by row—into a
+[ByteBuffer](https://developer.android.com/reference/java/nio/ByteBuffer.html).
+Its size is  1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch.
+224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents
+the 3 colors of a pixel.
+
+This demo uses the TensorFlow Lite Java inference API
+for models which take a single input and provide a single output. This outputs a
+two-dimensional array, with the first dimension being the category index and the
+second dimension being the confidence of classification. Both models have 1001
+unique categories and the app sorts the probabilities of all the categories and
+displays the top three. The model file must be downloaded and bundled within the
+assets directory of the app.
diff --git a/tensorflow/docs_src/mobile/tflite/demo_ios.md b/tensorflow/docs_src/mobile/tflite/demo_ios.md
index 3ee9b1cbca..3be21da89f 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_ios.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_ios.md
@@ -1,4 +1,4 @@
-# TensorFlow Lite Demo for iOS
+# iOS Demo App
 
 The TensorFlow Lite demo is a camera app that continuously classifies whatever
 it sees from your device's back camera, using a quantized MobileNet model. These
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
new file mode 100644
index 0000000000..5b521dca7b
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -0,0 +1,224 @@
+# Developer Guide
+
+Using a TensorFlow Lite model in your mobile app requires multiple
+considerations: you must choose a pre-trained or custom model, convert the model
+to a TensorFLow Lite format, and finally, integrate the model in your app.
+
+## 1. Choose a model
+
+Depending on the use case, you can choose one of the popular open-sourced models,
+such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
+data set or even build your own custom model.
+
+### Use a pre-trained model
+
+[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+is a family of mobile-first computer vision models for TensorFlow designed to
+effectively maximize accuracy, while taking into consideration the restricted
+resources for on-device or embedded applications. MobileNets are small,
+low-latency, low-power models parameterized to meet the resource constraints for
+a variety of uses. They can be used for classification, detection, embeddings, and
+segmentation—similar to other popular large scale models, such as
+[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
+[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
+for MobileNets that can be used in mobile projects of all sizes.
+
+[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
+that achieves fairly high accuracy recognizing general objects with 1000 classes,
+for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
+features from input images using a convolutional neural network and classifies
+them based on those features with fully-connected and softmax layers.
+
+[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+is an on-device model that provides one-touch replies for incoming text messages
+by suggesting contextually relevant messages. The model is built specifically for
+memory constrained devices, such as watches and phones, and has been successfully
+used in Smart Replies on Android Wear. Currently, this model is Android-specific.
+
+These pre-trained models are [available for download](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md)
+
+### Re-train Inception-V3 or MobileNet for a custom data set
+
+These pre-trained models were trained on the *ImageNet* data set which contains
+1000 predefined classes. If these classes are not sufficient for your use case,
+the model will need to be re-trained. This technique is called
+*transfer learning* and starts with a model that has been already trained on a
+problem, then retrains the model on a similar problem. Deep learning from
+scratch can take days, but transfer learning is fairly quick. In order to do
+this, you need to generate a custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through the re-training process step-by-step. The code supports
+both floating point and quantized inference.
+
+### Train a custom model
+
+A developer may choose to train a custom model using Tensorflow (see the
+@{$tutorials} for examples of building and training models). If you have already
+written a model, the first step is to export this to a @{tf.GraphDef} file. This
+is required because some formats do not store the model structure outside the
+code, and we must communicate with other parts of the framework. See
+[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
+to create .pb file for the custom model.
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to the
+[TensorFlow Lite & TensorFlow Compatibility Guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for supported operators and their usage. This set of operators will continue to
+grow in future Tensorflow Lite releases.
+
+
+## 2. Convert the model format
+
+The model generated (or downloaded) in the previous step is a *standard*
+Tensorflow model and you should now have a .pb or .pbtxt @{tf.GraphDef} file.
+Models generated with transfer learning (re-training) or custom models must be
+converted—but, we must first freeze the graph to convert the model to the
+Tensorflow Lite format. This process uses several model formats:
+
+* @{tf.GraphDef} (.pb) —A protobuf that represents the TensorFlow training or
+  computation graph. It contains operators, tensors, and variables definitions.
+* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
+  does not contain a graph structure, it cannot be interpreted by itself.
+* `FrozenGraphDef` —A subclass of `GraphDef` that does not contain
+  variables. A `GraphDef` can be converted to a `FrozenGraphDef` by taking a
+  CheckPoint and a `GraphDef`, and converting each variable into a constant
+  using the value retrieved from the CheckPoint.
+* `SavedModel` —A `GraphDef` and CheckPoint with a signature that labels
+  input and output arguments to a model. A `GraphDef` and CheckPoint can be
+  extracted from a `SavedModel`.
+* *TensorFlow Lite model* (.tflite) —A serialized
+  [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
+  Lite operators and tensors for the TensorFlow Lite interpreter, similiar to a
+  `FrozenGraphDef`.
+
+### Freeze Graph
+
+To use the `GraphDef` .pb file with TensorFlow Lite, you must have checkpoints
+that contain trained weight parameters. The .pb file only contains the structure
+of the graph. The process of merging the checkpoint values with the graph
+structure is called *freezing the graph*.
+
+You should have a checkpoints folder or download them for a pre-trained model
+(for example,
+[MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
+
+To freeze the graph, use the following command (changing the arguments):
+
+```
+freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
+  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
+  --input_binary=true \
+  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
+  --output_node_names=MobileNetV1/Predictions/Reshape_1
+```
+
+The `input_binary` flag must be enabled so the protobuf is read and written in
+a binary format. Set the `input_graph` and `input_checkpoint` files.
+
+The `output_node_names` may not be obvious outside of the code that built the
+model. The easiest way to find them is to visualize the graph, either with
+[TensorBoard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3)
+or `graphviz`.
+
+The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
+(.tflite) for use on Android or iOS devices. For Android, the Tensorflow
+Optimizing Converter tool supports both float and quantized models. To convert
+the frozen `GraphDef` to the .tflite format:
+
+```
+toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+  --inference_type=FLOAT \
+  --input_type=FLOAT \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --input_shapes=1,224,224,3
+```
+
+The `input_file` argument should reference the frozen `GraphDef` file
+containing the model architecture. The [frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
+file used here is available for download. `output_file` is where the TensorFlow
+Lite model will get generated. The `input_type` and `inference_type`
+arguments should be set to `FLOAT`, unless converting a
+@{$performance/quantization$quantized model}. Setting the `input_array`,
+`output_array`, and `input_shape` arguments are not as straightforward. The
+easiest way to find these values is to explore the graph using Tensorboard. Reuse
+the arguments for specifying the output nodes for inference in the
+`freeze_graph` step.
+
+It is also possible to use the Tensorflow Optimizing Converter with protobufs
+from either Python or from the command line (see the 
+[toco_from_protos.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py)
+example). This allows you to integrate the conversion step into the model design
+workflow, ensuring the model is easily convertible to a mobile inference graph.
+For example:
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+out = tf.identity(val, name="out")
+
+with tf.Session() as sess:
+  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
+  open("converteds_model.tflite", "wb").write(tflite_model)
+```
+
+For usage, see the Tensorflow Optimizing Converter
+[command-line examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
+
+Refer to the
+[Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for troubleshooting help, and if that doesn't help, please
+[file an issue](https://github.com/tensorflow/tensorflow/issues).
+
+The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
+to visualize TensorFlow Lite models after conversion. To build the
+[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/visualize.py)
+tool:
+
+```sh
+bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
+```
+
+This generates an interactive HTML page listing subgraphs, operations, and a
+graph visualization.
+
+
+## 3. Use the TensorFlow Lite model for inference in a mobile app
+
+After completing the prior steps, you should now have a .tflite model file.
+
+### Android
+
+Since Android apps are written in Java and the core TensorFlow library is in C++,
+a JNI library is provided as an interface. This is only meant for inference—it
+provides the ability to load a graph, set up inputs, and run the model to
+calculate outputs.
+
+The open source Android demo app uses the JNI interface and is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+You can also download a
+[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+See the @{$tflite/demo_android} guide for details.
+
+The @{$mobile/android_build} guide has instructions for installing TensorFlow on
+Android and setting up `bazel` and Android Studio.
+
+### iOS
+
+To integrate a TensorFlow model in an iOS app, see the
+[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
+guide and @{$tflite/demo_ios} guide.
+
+#### Core ML support
+
+Core ML is a machine learning framework used in Apple products. In addition to
+using Tensorflow Lite models directly in your applications, you can convert
+trained Tensorflow models to the
+[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
+devices. To use the converter, refer to the
+[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index beb24794fc..11f11ea4dc 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -155,7 +155,9 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-![tensorflow lite architecture](https://www.tensorflow.org/images/tflite-architecture.jpg)
+<img src="/images/tflite-architecture.jpg"
+     alt="TensorFlow Lite architecture diagram"
+     style="max-width:600px;">
 
 Starting with a trained TensorFlow model on disk, you'll convert that model to
 the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
-- 
GitLab


From 266d701ea642394771d238566d9f3c00eab9ea19 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Thu, 29 Mar 2018 13:31:23 -0700
Subject: [PATCH 0086/1262] Docs: Add Eager Execution guide to Programmer's
 Guide.

PiperOrigin-RevId: 190977505
---
 tensorflow/contrib/eager/README.md            |  20 +-
 .../contrib/eager/python/g3doc/guide.md       | 906 +---------------
 .../docs_src/programmers_guide/eager.md       | 992 ++++++++++++++++++
 .../docs_src/programmers_guide/leftnav_files  |   3 +-
 4 files changed, 1015 insertions(+), 906 deletions(-)
 create mode 100644 tensorflow/docs_src/programmers_guide/eager.md

diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 9d2ca07c3a..9a3b780af8 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,12 +1,8 @@
 # Eager Execution
 
-> *WARNING*: This is a preview/pre-alpha version. The API and performance
-> characteristics are subject to change.
-
-Eager execution is an experimental interface to TensorFlow that provides an
-imperative programming style (à la [NumPy](http://www.numpy.org)). When you
-enable eager execution, TensorFlow operations execute immediately; you do not
-execute a pre-constructed graph with
+Eager execution provides an imperative interface to TensorFlow (similiar to
+[NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow
+operations execute immediately; you do not execute a pre-constructed graph with
 [`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session).
 
 For example, consider a simple computation in TensorFlow:
@@ -33,7 +29,7 @@ print(m)
 ## Caveats
 
 This feature is in early stages and work remains to be done in terms of smooth
-support for distributed and multi-GPU training and CPU performance.
+support for distributed and multi-GPU training and performance.
 
 - [Known issues](https://github.com/tensorflow/tensorflow/issues?q=is%3Aissue%20is%3Aopen%20label%3Acomp%3Aeager)
 - Feedback is welcome, please consider
@@ -41,21 +37,23 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Eager execution is included in TensorFlow versions 1.5 and above.
+Eager execution is included in TensorFlow versions 1.7 and above.
 Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
 For an introduction to eager execution in TensorFlow, see:
 
-- [User Guide](python/g3doc/guide.md)
+- [User Guide](https://www.tensorflow.org/programmers_guide/eager) ([source](../../docs_src/programmers_guide/eager.md))
 - Notebook: [Basic Usage](python/examples/notebooks/1_basics.ipynb)
 - Notebook: [Gradients](python/examples/notebooks/2_gradients.ipynb)
 - Notebook: [Importing Data](python/examples/notebooks/3_datasets.ipynb)
 
 ## Changelog
 
-- 2017/10/31: Initial preview release.
+- 2017/10/31: Initial preview release (in TensorFlow 1.5)
 - 2017/12/01: Example of dynamic neural network:
   [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
   See [README.md](python/examples/spinn/README.md) for details.
+- 2017/03: Core functionality moved out of the experimental tf.contrib namespace
+  in TensorFlow 1.7.
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index ebb05051f2..2d2aba6908 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -1,900 +1,18 @@
-# TensorFlow Eager Execution
-
-## What is this?
+# Eager execution
 
 Eager execution is a feature that makes TensorFlow execute operations
-immediately: concrete values are returned, instead of a computational graph to
-be executed later.
-
-As a result, enabling eager execution provides:
-
--   A [NumPy](http://www.numpy.org/)-like library for numerical computation with
-    support for GPU acceleration and automatic differentiation.
--   A flexible platform for machine learning research and experimentation.
-
-Eager execution is under active development. This guide walks through an
-alpha/preview release. In particular, not all TensorFlow APIs currently work
-with eager execution enabled, and some models may be slow to execute, compared
-to models defined without using eager execution.
-
-## Installation
-
-Eager execution is included in TensorFlow versions 1.5 and above.
-Installation instructions at https://www.tensorflow.org/install/
-
-The contents of this guide are compatible with TensorFlow 1.5. However, if you
-run into bugs that are fixed in source but not the release, you may want to
-either [build from source](https://www.tensorflow.org/install/install_sources)
-or try a nightly build. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-docker pull tensorflow/tensorflow:nightly-gpu
-docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment.
-
-## Getting Started
-
-With TensorFlow installed, eager execution is enabled via a single call:
-
-```python
-import tensorflow as tf
-
-import tensorflow.contrib.eager as tfe
-
-tfe.enable_eager_execution()
-```
-
-Enabling eager execution changes how TensorFlow functions behave (in particular,
-`Tensor` objects will reference concrete values instead of being symbolic
-handles to nodes in a computational graph). As a result, eager execution should
-be enabled at the beginning of a program and cannot be disabled afterwards in
-the same program.
-
-Code examples in the rest of this guide assume that eager execution has been
-enabled.
-
-## A library for numerical computation
-
-A significant fraction of the [TensorFlow
-API](https://www.tensorflow.org/api_docs/python/) consists of numerical
-operations:
-[arithmetic operations](https://www.tensorflow.org/api_guides/python/math_ops#Arithmetic_Operators),
-[matrix operations](https://www.tensorflow.org/api_guides/python/math_ops#Matrix_Math_Functions),
-[linear algebra operations](https://www.tensorflow.org/versions/master/api_docs/python/tf/linalg),
-etc.
-
-With eager execution enabled, these operations consume and return
-multi-dimensional arrays as `Tensor` objects, similar to NumPy
-[`ndarray`s](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.ndarray.html).
-For example:
-
-```python
-# Multiply two 2x2 matrices
-x = tf.matmul([[1, 2],
-               [3, 4]],
-              [[4, 5],
-               [6, 7]])
-# Add one to each element
-# (tf.add supports broadcasting)
-y = tf.add(x, 1)
-
-# Create a random random 5x3 matrix
-z = tf.random_uniform([5, 3])
-
-print(x)
-print(y)
-print(z)
-```
-
-Output:
-
-```
-tf.Tensor(
-[[16 19]
- [36 43]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[17 20]
- [37 44]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[ 0.25058532  0.0929395   0.54113817]
- [ 0.3108716   0.93350542  0.84909797]
- [ 0.53081679  0.12788558  0.01767385]
- [ 0.29725885  0.33540785  0.83588314]
- [ 0.38877153  0.39720535  0.78914213]], shape=(5, 3), dtype=float32)
-```
-
-For convenience, these operations can also be triggered via operator overloading
-of the `Tensor` object. For example, the `+` operator is equivalent to `tf.add`,
-`-` to `tf.subtract`, `*` to `tf.multiply`, etc.:
-
-```python
-x = (tf.ones([1], dtype=tf.float32) + 1) * 2 - 1
-print(x)
-```
-
-Output:
-
-```
-tf.Tensor([ 3.], shape=(1,), dtype=float32)
-```
-
-### Converting to and from NumPy
-
-The operations above automatically convert Python objects (like lists of
-numbers) and NumPy arrays to `Tensor` objects. `Tensor` objects can also be used
-as NumPy arrays by numpy operations.
-
-```python
-import numpy as np
-
-x = tf.add(1, 1)                     # tf.Tensor with a value of 2
-y = tf.add(np.array(1), np.array(1)) # tf.Tensor with a value of 2
-z = np.multiply(x, y)                # numpy.int64 with a value of 4
-```
-
-Alternatively, they can be explicitly converted using
-[`tf.constant`](https://www.tensorflow.org/api_docs/python/tf/constant), as
-shown in the next example.
-
-Conversely, you can call the `numpy()` method of a `Tensor` object' to obtain
-its NumPy `ndarray` value. For example:
-
-```python
-import numpy as np
-
-np_x = np.array(2., dtype=np.float32)
-x = tf.constant(np_x)
-
-py_y = 3.
-y = tf.constant(py_y)
-
-z = x + y + 1
-
-print(z)
-print(z.numpy())
-```
-
-Output:
-
-```
-tf.Tensor(6.0, shape=(), dtype=float32)
-6.0
-```
-
-### GPU acceleration
-
-Many TensorFlow operations support GPU acceleration. With eager execution
-enabled, [computation is *not* automatically
-offloaded](https://www.tensorflow.org/tutorials/using_gpu) to GPUs. Instead, you
-must explicitly specify when GPUs should be used.
-
-The simplest way to do this is to enclose your computation in a `with
-tf.device('/gpu:0')` block. Also of interest is the `tfe.num_gpus()` function,
-which returns the number of available GPUs.
-
-For example, consider this snippet to measure the time to multiply two 1000x1000
-matrices on CPU:
-
-```python
-import time
-
-def measure(x):
-  # The very first time a GPU is used by TensorFlow, it is initialized.
-  # So exclude the first run from timing.
-  tf.matmul(x, x)
-
-  start = time.time()
-  for i in range(10):
-    tf.matmul(x, x)
-  end = time.time()
-
-  return "Took %s seconds to multiply a %s matrix by itself 10 times" % (end - start, x.shape)
-
-# Run on CPU:
-with tf.device("/cpu:0"):
-  print("CPU: %s" % measure(tf.random_normal([1000, 1000])))
-
-# If a GPU is available, run on GPU:
-if tfe.num_gpus() > 0:
-  with tf.device("/gpu:0"):
-    print("GPU: %s" % measure(tf.random_normal([1000, 1000])))
-```
-
-Output (exact numbers will depend on the characteristics of the hardware):
-
-```python
-CPU: Took 0.145531892776 seconds to multiply a (1000, 1000) matrix by itself 10 times
-GPU: Took 0.000458955764771 seconds to multiply a (1000, 1000) matrix by itself 10 times
-```
-
-Alternatively, methods on the `Tensor` object can be used to explicitly copy the
-`Tensor` to a different device. Operations are typically executed on the device
-on which the inputs are placed. For example:
-
-```python
-x = tf.random_normal([10, 10])
-
-x_gpu0 = x.gpu()
-x_cpu = x.cpu()
-
-_ = tf.matmul(x_cpu, x_cpu)  # Runs on CPU
-_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
-
-if tfe.num_gpus() > 1:
-  x_gpu1 = x.gpu(1)
-  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
-```
-
-### Automatic Differentiation
-
-[Automatic
-differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) is
-very useful when implementing many machine learning algorithms (e.g.,
-[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
-neural networks). For this purpose, TensorFlow eager execution provides an
-[autograd](https://github.com/HIPS/autograd)-style API for automatic
-differentiation. Specifically, the functions:
-
--   `tfe.gradients_function(f)`: Returns a Python function that computes the
-    derivatives of the Python function `f` with respect to its arguments. `f`
-    must return a scalar value. When the returned function is invoked, it
-    returns a list of `Tensor` objects (one element for each argument of `f`).
--   `tfe.value_and_gradients_function(f)`: Similar to `tfe.gradients_function`,
-    except that when the returned function is invoked, it returns the value of
-    `f` in addition to the list of derivatives of `f` with respect to its
-    arguments.
-
-These functions naturally apply to higher order differentiation as well. For
-example:
-
-```python
-def f(x):
-  return tf.multiply(x, x)  # Or x * x
-assert 9 == f(3.).numpy()
-
-df = tfe.gradients_function(f)
-assert 6 == df(3.)[0].numpy()
-
-# Second order deriviative.
-d2f = tfe.gradients_function(lambda x: df(x)[0])
-assert 2 == d2f(3.)[0].numpy()
-
-# Third order derivative.
-d3f = tfe.gradients_function(lambda x : d2f(x)[0])
-assert 0 == d3f(3.)[0].numpy()
-```
-
-These functions can be used to train models. For example, consider the following
-simple linear regression model:
-
-```python
-def prediction(input, weight, bias):
-  return input * weight + bias
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# A loss function: Mean-squared error
-def loss(weight, bias):
-  error = prediction(training_inputs, weight, bias) - training_outputs
-  return tf.reduce_mean(tf.square(error))
-
-# Function that returns the derivative of loss with respect to
-# weight and bias
-grad = tfe.gradients_function(loss)
-
-# Train for 200 steps (starting from some random choice for W and B, on the same
-# batch of data).
-W = 5.
-B = 10.
-learning_rate = 0.01
-print("Initial loss: %f" % loss(W, B).numpy())
-for i in range(200):
-  (dW, dB) = grad(W, B)
-  W -= dW * learning_rate
-  B -= dB * learning_rate
-  if i % 20 == 0:
-    print("Loss at step %d: %f" % (i, loss(W, B).numpy()))
-print("Final loss: %f" % loss(W, B).numpy())
-print("W, B = %f, %f" % (W.numpy(), B.numpy()))
-```
-
-Output: (the exact numbers may vary depending on the randomness in noise)
-
-```
-Initial loss: 66.730003
-Loss at step 0: 64.200096
-Loss at step 20: 29.872814
-Loss at step 40: 14.233772
-Loss at step 60: 7.090570
-Loss at step 80: 3.819887
-Loss at step 100: 2.318821
-Loss at step 120: 1.628385
-Loss at step 140: 1.310142
-Loss at step 160: 1.163167
-Loss at step 180: 1.095162
-Final loss: 1.064711
-W, B = 3.094944, 2.161383
-```
-
-To utilize the GPU, place the code above within a `with tf.device("/gpu:0"):`
-block. (However, this particular model, with only two floating point parameters,
-is unlikely to benefit from GPU acceleration.)
-
-### Customizing gradients
-
-One may want to define custom gradients for an operation, or for a function.
-This may be useful for multiple reasons, including providing a more efficient
-or more [numerically stable](https://en.wikipedia.org/wiki/Numerical_stability)
-gradient for a sequence of operations.
-
-For example, consider the function `log(1 + e^x)`, which commonly occurs in the
-computation of cross entropy and log likelihoods.
-
-```python
-def log1pexp(x):
-  return tf.log(1 + tf.exp(x))
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works fine at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# Returns a `nan` at x = 100 due to numerical instability.
-import math
-assert math.isnan(float(grad_log1pexp(100.)[0]))
-```
-
-We can define a custom gradient for the above function that analytically
-simplifies the gradient expression.
-
-```python
-@tfe.custom_gradient
-def log1pexp(x):
-  e = tf.exp(x)
-  def grad(dy):
-    return dy * (1 - 1 / (1 + e))
-  return tf.log(1 + e), grad
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works as before at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# But now works at x = 100 as well.
-assert 1.0 == float(grad_log1pexp(100.)[0])
-```
-Also notice how the gradient function implementation reuses an expression
-(`tf.exp(x)`) computed during the forward pass, hence making the gradient
-computation more efficient by avoiding redundant computation.
-
-## Building and training models
-
-In practice, your computation may have many parameters to be optimized (by
-computing derivatives). Encapsulating them into re-usable classes/objects
-makes the code easier to follow than writing a single top-level function with
-many arguments.
-
-In fact, eager execution encourages use of the [Keras](https://keras.io)-style
-"Layer" classes in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module.
-
-Furthermore, you may want to apply more sophisticated techniques to compute
-parameter updates, such as those in
-[`tf.train.Optimizer`](https://www.tensorflow.org/api_guides/python/train#Optimizers)
-implementations.
-
-This next section walks through using the same `Optimizer` and `Layer` APIs used
-to build trainable TensorFlow graphs in an environment where eager execution is
-enabled.
-
-### Variables and Optimizers
-
-`tfe.Variable` objects store mutable `Tensor` values that can be accessed during
-training, making automatic differentiation easier. In particular, parameters of
-a model can be encapsulated in Python classes as variables.
-
-`tfe.gradients_function(f)` introduced earlier computes the derivatives of `f`
-with respect to its arguments. However, it requires all parameters of interest
-to be arguments of `f`, which becomes cumbersome when `f` depends on a large
-number of trainable parameters.
-
-`tfe.implicit_gradients` is an alternative function with some useful properties:
-
--   It computes the derivatives of `f` with respect to all the `tfe.Variable`s
-    used by `f`.
--   When the returned function is invoked, it returns a list of
-    (gradient value, Variable object) tuples.
-
-Representing model parameters as `Variable` objects, along with the use of
-`tfe.implicit_gradients`, typically results in better encapsulation. For
-example, the linear regression model described above can be written into a
-class:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.W = tfe.Variable(5., name='weight')
-    self.B = tfe.Variable(10., name='bias')
-
-  def predict(self, inputs):
-    return inputs * self.W + self.B
-
-
-# The loss function to be optimized
-def loss(model, inputs, targets):
-  error = model.predict(inputs) - targets
-  return tf.reduce_mean(tf.square(error))
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# Define:
-# 1. A model
-# 2. Derivatives of a loss function with respect to model parameters
-# 3. A strategy for updating the variables based on the derivatives
-model = Model()
-grad = tfe.implicit_gradients(loss)
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-
-# The training loop
-print("Initial loss: %f" %
-      loss(model, training_inputs, training_outputs).numpy())
-for i in range(201):
-  optimizer.apply_gradients(grad(model, training_inputs, training_outputs))
-  if i % 20 == 0:
-    print("Loss at step %d: %f" %
-          (i, loss(model, training_inputs, training_outputs).numpy()))
-print("Final loss: %f" % loss(model, training_inputs, training_outputs).numpy())
-print("W, B = %s, %s" % (model.W.numpy(), model.B.numpy()))
-```
-
-Output:
-
-```
-Initial loss: 69.693184
-Loss at step 0: 66.987854
-Loss at step 20: 30.553387
-Loss at step 40: 14.250237
-Loss at step 60: 6.955020
-Loss at step 80: 3.690550
-Loss at step 100: 2.229739
-Loss at step 120: 1.576032
-Loss at step 140: 1.283496
-Loss at step 160: 1.152584
-Loss at step 180: 1.093999
-Final loss: 1.067780
-W, B = 3.0114281, 2.0865183
-```
-
-Using `implicit_gradients` avoids the need to provide all the trainable
-parameters of the model as arguments to the `loss` function.
-
-### Using Keras and the Layers API
-
-[Keras](https://keras.io) is a popular API for defining model structures. The
-[`tf.keras.layers`](https://www.tensorflow.org/api_docs/python/tf/keras/layers)
-module provides a set of building blocks for models and is implemented using the
-`tf.layers.Layer` subclasses in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module. We encourage the use of these same building blocks when using
-TensorFlow's eager execution feature. For example, the very same linear
-regression model can be built using `tf.layers.Dense`:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.layer = tf.layers.Dense(1)
-
-  def predict(self, inputs):
-    return self.layer(inputs)
-```
-
-The `tf.layers` API makes it more convenient to define more sophisticated
-models. For example, the following will train an MNIST model:
-
-```python
-class MNISTModel(object):
-  def __init__(self, data_format):
-    # 'channels_first' is typically faster on GPUs
-    # while 'channels_last' is typically faster on CPUs.
-    # See: https://www.tensorflow.org/performance/performance_guide#data_formats
-    if data_format == 'channels_first':
-      self._input_shape = [-1, 1, 28, 28]
-    else:
-      self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = tf.layers.Conv2D(32, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.max_pool2d = tf.layers.MaxPooling2D(
-        (2, 2), (2, 2), padding='same', data_format=data_format)
-    self.conv2 = tf.layers.Conv2D(64, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.dense1 = tf.layers.Dense(1024, activation=tf.nn.relu)
-    self.dropout = tf.layers.Dropout(0.5)
-    self.dense2 = tf.layers.Dense(10)
-
-  def predict(self, inputs):
-    x = tf.reshape(inputs, self._input_shape)
-    x = self.max_pool2d(self.conv1(x))
-    x = self.max_pool2d(self.conv2(x))
-    x = tf.layers.flatten(x)
-    x = self.dropout(self.dense1(x))
-    return self.dense2(x)
-
-def loss(model, inputs, targets):
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(
-          logits=model.predict(inputs), labels=targets))
-
-
-# Load the training and validation data
-from tensorflow.examples.tutorials.mnist import input_data
-data = input_data.read_data_sets("./mnist_data", one_hot=True)
-
-# Train
-device = "gpu:0" if tfe.num_gpus() else "cpu:0"
-model = MNISTModel('channels_first' if tfe.num_gpus() else 'channels_last')
-optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
-grad = tfe.implicit_gradients(loss)
-for i in range(20001):
-  with tf.device(device):
-    (inputs, targets) = data.train.next_batch(50)
-    optimizer.apply_gradients(grad(model, inputs, targets))
-    if i % 100 == 0:
-      print("Step %d: Loss on training set : %f" %
-            (i, loss(model, inputs, targets).numpy()))
-print("Loss on test set: %f" % loss(model, data.test.images, data.test.labels).numpy())
-```
-
-For a more complete example, see [the example in the tensorflow/models
-repository](https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py).
-
-### Checkpointing trained variables
-
-TensorFlow Variables (`tfe.Variable`) provides a way to represent shared,
-persistent state of your model. The `tfe.Saver` class (which is a thin wrapper
-over the
-[`tf.train.Saver`](https://www.tensorflow.org/api_docs/python/tf/train/Saver)
-class) provides a means to save and restore variables to and from _checkpoints_.
-
-For example:
-
-```python
-# Create variables.
-x = tfe.Variable(10., name='x')
-y = tfe.Variable(5., name='y')
-
-# Create a Saver.
-saver = tfe.Saver([x, y])
-
-# Assign new values to the variables and save.
-x.assign(2.)
-saver.save('/tmp/ckpt')
-
-# Change the variable after saving.
-x.assign(11.)
-assert 16. == (x + y).numpy()  # 11 + 5
-
-# Restore the values in the checkpoint.
-saver.restore('/tmp/ckpt')
-
-assert 7. == (x + y).numpy()  # 2 + 5
-```
-
-### `tfe.Network`
-
-You may often want to organize your models using classes, like the `MNISTModel`
-class described above. We recommend inheriting from the `tfe.Network` class as
-it provides conveniences like keeping track of all model variables and methods
-to save and restore from checkpoints.
-
-Sub-classes of `tfe.Network` may register `Layer`s (like classes in
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers),
-or [Keras
-layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers))
-using a call to `self.track_layer()` and define the computation in an
-implementation of `call()`.
-
-Note that `tf.layers.Layer` objects (like `tf.layers.Dense`) create variables
-lazily, when the first input is encountered.
-
-For example, consider the following two-layer neural network:
-
-```python
-class TwoLayerNet(tfe.Network):
-  def __init__(self):
-    super(TwoLayerNet, self).__init__()
-    self.layer1 = self.track_layer(
-      tf.layers.Dense(2, activation=tf.nn.relu, use_bias=False))
-    self.layer2 = self.track_layer(tf.layers.Dense(3, use_bias=False))
-
-  def call(self, x):
-    return self.layer2(self.layer1(x))
-
-net = TwoLayerNet()
-
-# No variables created yet
-assert 0 == len(net.variables)
-
-# They are created on first input:
-inp = tf.constant([[1.]])
-
-# Since input is a 1x1 matrix, net.l1 has 2 units and net.l2 has 3 units,
-# the output is the product of a 1x1 matrix with a 1x2 matrix with a 2x3
-# matrix.
-assert [1, 3] == net(inp).shape.as_list()  # Invoke net; get output shape.
-assert 1 == len(net.layer1.variables)
-assert 1 == len(net.layer2.variables)
-assert 2 == len(net.variables)  # weights for each layer.
-assert [1, 2] == net.variables[0].shape.as_list()  # weights of layer1.
-assert [2, 3] == net.variables[1].shape.as_list()  # weights of layer2.
-```
-
-The `tfe.Network` class is itself a sub-class of `tf.layers.Layer`. This allows
-instances of `tfe.Network` to be embedded in other networks. For example:
-
-```python
-class ThreeLayerNet(tfe.Network):
-  def __init__(self):
-    super(ThreeLayerNet, self).__init__()
-    self.a = self.track_layer(TwoLayerNet())
-    self.b = self.track_layer(tf.layers.Dense(4, use_bias=False))
-
-  def call(self, x):
-    return self.b(self.a(x))
-
-net = ThreeLayerNet()
-
-assert [1, 4] == net(inp).shape.as_list()
-assert 3 == len(net.variables)
-assert [1, 2] == net.variables[0].shape.as_list()
-assert [2, 3] == net.variables[1].shape.as_list()
-assert [3, 4] == net.variables[2].shape.as_list()
-```
-
-See more examples in
-[`tensorflow/contrib/eager/python/examples`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples).
-
-`tfe.Saver` in combination with `tfe.restore_variables_on_create` provides a
-convenient way to save and load checkpoints without changing the program once
-the checkpoint has been created. For example, we can set an objective for the
-output of our network, choose an optimizer, and a location for the checkpoint:
-
-```python
-objective = tf.constant([[2., 3., 4., 5.]])
-optimizer = tf.train.AdamOptimizer(0.01)
-checkpoint_directory = '/tmp/tfe_example'
-checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
-net = ThreeLayerNet()
-```
-
-Note that variables have not been created yet. We want them to be restored from
-a checkpoint, if one exists, so we create them inside a
-`tfe.restore_variables_on_create` context manager. Then our training loop is the
-same whether starting training or resuming from a previous checkpoint:
-
-```python
-with tfe.restore_variables_on_create(
-    tf.train.latest_checkpoint(checkpoint_directory)):
-  global_step = tf.train.get_or_create_global_step()
-  for _ in range(100):
-    loss_fn = lambda: tf.norm(net(inp) - objective)
-    optimizer.minimize(loss_fn, global_step=global_step)
-    if tf.equal(global_step % 20, 0):
-      print("Step %d, output %s" % (global_step.numpy(),
-                                    net(inp).numpy()))
-      all_variables = (
-          net.variables
-          + optimizer.variables()
-          + [global_step])
-      # Save the checkpoint.
-      tfe.Saver(all_variables).save(checkpoint_prefix, global_step=global_step)
-```
-
-The first time it runs, `Network` variables are initialized randomly. Then the
-output is trained to match the objective we've set:
-
-```
-Step 20, output [[ 0.03575622  0.29863232  0.03474367  0.24735749]]
-Step 40, output [[ 0.40646029  0.9856872   0.46851286  0.95358551]]
-Step 60, output [[ 1.74541104  2.800704    1.79055595  2.74783421]]
-Step 80, output [[ 2.14977384  3.44340849  3.96120024  5.16242075]]
-Step 100, output [[ 1.99943113  3.02364397  3.93500996  4.9610076 ]]
-```
-
-In subsequent iterations, variables are initialized with the values read from
-the latest checkpoint. Running the same code again, we continue from where we
-left off:
-
-```
-Step 120, output [[ 1.99234128  3.0271616   3.98732996  4.96401167]]
-Step 140, output [[ 2.00133467  3.01270437  4.00616646  5.00406504]]
-Step 160, output [[ 1.99647415  2.9956708   3.99064088  4.99632359]]
-Step 180, output [[ 2.00699997  3.00904822  4.00706148  5.01193142]]
-Step 200, output [[ 1.98334622  2.98249531  3.97375059  4.97123432]]
-```
-
-
-### Summaries, metrics and TensorBoard
-
-[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
-is a popular tool for understanding, debugging and optimizing the model training
-process. To benefit from the visualizations offered by TensorBoard, summary
-events need to be written during the course of execution of your program. You
-might find many Tensorflow programs that include the
-[`tf.summary`](https://www.tensorflow.org/api_guides/python/summary) operations
-during graph construction.
-
-`tf.summary` operations are *not* compatible with eager execution, but an
-equivalent alternative exists in
-[`tf.contrib.summary`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/summary)
-that is compatible with both eager execution and graph construction.
-
-During model construction simply insert summary operations like
-`tf.contrib.summary.scalar`. These operations do nothing by default, unless a
-summary writer is currently active and a writing policy is set.
-
-For example, to record summaries once every 100 global steps, use:
-
-```python
-tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_file_writer(logdir)
-
-for _ in range(iterations):
-  with writer.as_default():
-    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
-      # your model code goes here
-      tf.contrib.summary.scalar('loss', loss)
-      # ...
-```
-
-See the full mnist example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
-for a full model using `tf.contrib.summary`.
-
-Similarly to summaries, the metrics in `tf.metrics` are currently not compatible
-with eager execution. We instead provide object-oriented metrics in the
-`tfe.metrics` package, which are compatible with graph construction as well.
-
-Metrics in the `tfe.metrics`, such as `tfe.metrics.Mean` and
-`tfe.Metrics.Accuracy`, all implement an intuitive object-oriented
-interface. Here's an example of how to use the `tfe.metrics.Mean` metric:
-
-```python
-# Metrics are objects, which can be created and destroyed.
-my_mean = tfe.metrics.Mean(name='my_mean')
-# While a metric is active, you can call it as a function to accumulate into its
-# internal state.
-my_mean(0.0)
-my_mean(10.0)
-# Once you've finished updating the metric, you can get its result. In this case
-# a simple average over all the calls to it. If a summary writer is active the
-# metric will write the appropriate summaries using the metric name.
-assert 5.0 == my_mean.result().numpy()
-```
-
-For a full example of a model using metrics for evaluation, see the mnist
-example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist).
-
-### Input Pipelines
-
-The discussion above has been centered around the computation executed by your
-model. The
-[`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data)
-module provides APIs to build complex input pipelines from simple, reusable
-pieces.
-
-If you're familiar with constructing `tf.data.Dataset` objects when building
-TensorFlow graphs, the same API calls are used when eager execution is enabled.
-However, the process of iterating over elements of the dataset differs between
-eager execution and graph construction. When eager execution is enabled, the
-discussion on iterator creation using `make_one_shot_iterator()` and
-`get_next()` in the
-[Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is
-*not* applicable. Instead, a more Pythonic `Iterator` class is available.
-
-For example:
-
-```python
-# Create a source Dataset from in-memory numpy arrays.
-# For reading from files on disk, you may want to use other Dataset classes
-# like the TextLineDataset or the TFRecordDataset.
-dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])
-
-# Apply transformations, shuffling, batching etc.
-dataset = dataset.map(tf.square).shuffle(2).batch(2)
-
-# Use tfe.Iterator to iterate over the dataset.
-for x in tfe.Iterator(dataset):
-  print(x)
-```
-
-Output:
-
-```
-tf.Tensor([4 9], shape=(2,), dtype=int32)
-tf.Tensor([16 25], shape=(2,), dtype=int32)
-tf.Tensor([36  1], shape=(2,), dtype=int32)
-```
-
-## Interoperating with Graphs
-
-Eager execution improves the process of model development in Python; however,
-because it is in its earliest stages, it does not yet support some features
-available to [TensorFlow
-graphs](https://www.tensorflow.org/get_started/get_started#the_computational_graph)
-that are desirable when deploying models in production. In particular, eager
-execution does not yet support distributed training, exporting models (to other
-[programming languages](https://www.tensorflow.org/api_docs/), [TensorFlow
-serving](https://www.tensorflow.org/serving/), and mobile applications), and
-various memory and computation optimizations that are applied to TensorFlow's
-dataflow graphs.
-
-That said, the APIs used to build modes are exactly the same whether executing
-eagerly or constructing graphs. This means that you can iteratively develop your
-model with eager execution enabled and later, if needed, use the same code to
-reap the benefits of representing models as computational graphs.
-
-For example, the same model definition used to construct a graph in
-[mnist.py`](https://github.com/tensorflow/models/tree/master/official/mnist/mnist.py)
-can be trained with eager execution enabled as in [`mnist_eager.py`](https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py).
-
-Other models in the [examples
-directory](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/)
-demonstrate this as well.
-
-Some differences worth noting:
-
--   There is no notion of a `tf.placeholder` or a `tf.Session` when eager
-    execution is enabled.
--   Many properties on the `tf.Tensor` object, like `tf.Tensor.name`,
-    `tf.Tensor.op`, `tf.Tensor.inputs` are not meaningful when eager execution
-    is enabled and their use will raise an `AttributeError`.
--   To use `tfe.implicit_gradients` in graph construction, variables must be
-    created with [`use_resource=True`] provided to
-    [`tf.get_variable()`](https://www.tensorflow.org/api_docs/python/tf/get_variable)
-    or
-    [`tf.variable_scope()`](https://www.tensorflow.org/api_docs/python/tf/variable_scope).
--   Some API calls (such as the functional-style `tf.layers.dense`,
-    `tf.layers.conv2d`) are not compatible with eager execution. Use of such
-    methods should raise an error indicating the alternative (e.g., the
-    `tf.layers.Dense` and `tf.layers.Conv2D` classes).
-
-## What next?
+immediately: concrete values are returned, instead of creating a computational
+graph that is executed later.
 
-Please give eager execution a spin. This feature is in early stages and is
-evolving, so we welcome your feedback via issues on GitHub (see [known
-issues](https://github.com/tensorflow/tensorflow/labels/comp:eager)).
+A user guide is available: https://www.tensorflow.org/programmers_guide/eager
+([source file](../../../../docs_src/programmers_guide/eager.md))
 
-You may want to browse through some sample code, including benchmarks for some:
+We welcome feedback through [GitHub issues](https://github.com/tensorflow/tensorflow/labels/comp:eager).
 
--   [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
--   [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
--   [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
--   [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
--   [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
+Sample code is available, including benchmarks for some:
 
+- [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
+- [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
+- [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
+- [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
+- [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
new file mode 100644
index 0000000000..9ae1e602f4
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -0,0 +1,992 @@
+# Eager Execution
+
+TensorFlow's eager execution is an imperative programming environment that
+evaluates operations immediately, without an extra graph-building step.
+Operations return concrete values instead of constructing a computational graph
+to run later. This makes it easy to get started with TensorFlow, debug models,
+reduce boilerplate code, and is fun! To follow along with this guide, run the
+code samples below in an interactive `python` interpreter.
+
+Eager execution supports most TensorFlow operations and GPU acceleration.
+Automatic differentiation uses a dynamically-constructed tape instead of a static
+graph to compute gradients. Eager execution is a flexible machine learning
+platform for research and experimentation that provides:
+
+* *An intuitive interface* —Structure your code naturally and use Python data
+  structures. Quickly iterate on small models and small data.
+* *Easier debugging* —Call ops directly to inspect running models and test
+  changes. Use standard Python debugging tools for immediate error reporting.
+* *Natural control flow* —Use Python control flow instead of graph control flow,
+  including support for dynamic models.
+
+For a collection of examples running in eager execution, see:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+Note: Some models may experience increased overhead with eager execution enabled.
+Performance improvements are ongoing, but please
+[file a bug](https://github.com/tensorflow/tensorflow/issues) if you find a
+problem and share your benchmarks.
+
+## Setup and basic usage
+
+Install TensorFlow 1.7 to include the updates for eager execution:
+
+```
+$ pip install --pre --upgrade tensorflow
+```
+
+To start eager execution, add `tf.enable_eager_execution()` to the beginning of
+the program or console session. Do not add this operation to other modules that
+the program calls.
+
+```py
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+tf.enable_eager_execution()
+```
+
+Now you can run TensorFlow operations and the results will return immediately:
+
+```py
+tf.executing_eagerly()        # => True
+
+x = [[2.]]
+m = tf.matmul(x, x)
+print("hello, {}".format(m))  # => "hello, [[4.]]"
+```
+
+Enabling eager execution changes how TensorFlow operations behave—now they
+immediately evaluate and return their values to Python. `tf.Tensor` objects
+reference concrete values instead of symbolic handles to nodes in a computational
+graph. Since there isn't a computational graph to build and run later in a
+session, it's easy to inspect results using `print()` or a debugger. Evaluating,
+printing, and checking tensor values does not break the flow for computing
+gradients.
+
+Eager execution works nicely with [NumPy](http://www.numpy.org/). NumPy
+operations accept `tf.Tensor` arguments. TensorFlow
+[math operations](https://www.tensorflow.org/api_guides/python/math_ops) convert
+Python objects and NumPy arrays to `tf.Tensor` objects. The
+`tf.Tensor.numpy` method returns the object's value as a NumPy `ndarray`.
+
+```py
+a = tf.constant([[1, 2],
+                 [3, 4]])
+print(a)
+# => tf.Tensor([[1 2]
+#               [3 4]], shape=(2, 2), dtype=int32)
+
+# Broadcasting support
+b = tf.add(a, 1)
+print(b)
+# => tf.Tensor([[2 3]
+#               [4 5]], shape=(2, 2), dtype=int32)
+
+# Operator overloading is supported
+print(a * b)
+# => tf.Tensor([[ 2  6]
+#               [12 20]], shape=(2, 2), dtype=int32)
+
+# Use NumPy values
+import numpy as np
+
+c = np.multiply(a, b)
+print(c)
+# => [[ 2  6]
+#     [12 20]]
+
+# Obtain numpy value from a tensor:
+print(a.numpy())
+# => [[1 2]
+#     [3 4]]
+```
+
+The `tfe` module contains symbols available to both eager and graph execution
+environments and is useful for writing code to [work with graphs](#work_with_graphs):
+
+```py
+import tensorflow.contrib.eager as tfe
+```
+
+## Eager training
+
+### Automatic differentiation
+
+[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
+is useful for implementing machine learning algorithms such as
+[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
+neural networks. During eager execution, use `tfe.GradientTape` to trace
+operations for computing gradients later.
+
+`tfe.GradientTape` is an opt-in feature to provide maximal performance when
+not tracing. Since different operations can occur during each call, all
+forward-pass operations get recorded to a "tape". To compute the gradient, play
+the tape backwards and then discard. A particular `tfe.GradientTape` can only
+be computed once, subsequent calls throw a runtime error.
+
+```py
+w = tfe.Variable([[1.0]])
+with tfe.GradientTape() as tape:
+  loss = w * w
+
+grad = tape.gradient(loss, [w])
+print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]
+```
+
+Here's an example of `tfe.GradientTape` that records forward-pass operations
+to train a simple model:
+
+```py
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 1000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+def prediction(input, weight, bias):
+  return input * weight + bias
+
+# A loss function using mean-squared error
+def loss(weights, biases):
+  error = prediction(training_inputs, weights, biases) - training_outputs
+  return tf.reduce_mean(tf.square(error))
+
+# Return the derivative of loss with respect to weight and bias
+def grad(weights, biases):
+  with tfe.GradientTape() as tape:
+    loss_value = loss(weights, biases) 
+  return tape.gradient(loss_value, [weights, biases])
+
+train_steps = 200
+learning_rate = 0.01
+# Start with arbitrary values for W and B on the same batch of data
+W = tfe.Variable(5.)
+B = tfe.Variable(10.)
+
+print("Initial loss: {:.3f}".format(loss(W, B)))
+
+for i in range(train_steps):
+  dW, dB = grad(W, B)
+  W.assign_sub(dW * learning_rate)
+  B.assign_sub(dB * learning_rate)
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B)))
+
+print("Final loss: {:.3f}".format(loss(W, B)))
+print("W = {}, B = {}".format(W.numpy(), B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 71.204
+Loss at step 000: 68.333
+Loss at step 020: 30.222
+Loss at step 040: 13.691
+Loss at step 060: 6.508
+Loss at step 080: 3.382
+Loss at step 100: 2.018
+Loss at step 120: 1.422
+Loss at step 140: 1.161
+Loss at step 160: 1.046
+Loss at step 180: 0.996
+Final loss: 0.974
+W = 3.01582956314, B = 2.1191945076
+```
+
+Replay the `tfe.GradientTape` to compute the gradients and apply them in a
+training loop. This is demonstrated in an excerpt from the
+[mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py)
+example:
+
+```py
+dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
+                                              data.train.labels))
+...
+for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
+  ...
+  with tfe.GradientTape() as tape:
+    logits = model(images, training=True)
+    loss_value = loss(logits, labels)
+  ...
+  grads = tape.gradient(loss_value, model.variables)
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+```
+
+#### Dynamic models
+
+`tfe.GradientTape` can also be used in dynamic models. This example for a
+[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
+algorithm looks like normal NumPy code, except there are gradients and is
+differentiable, despite the complex control flow:
+
+```py
+def line_search_step(fn, init_x, rate=1.0):
+  with tfe.GradientTape() as tape:
+    # Variables are automatically recorded, but manually watch a tensor
+    tape.watch(init_x)
+    value = fn(init_x)
+  grad, = tape.gradient(value, [init_x])
+  grad_norm = tf.reduce_sum(grad * grad)
+  init_value = value
+  while value > init_value - rate * grad_norm:
+    x = init_x - rate * grad
+    value = fn(x)
+    rate /= 2.0
+  return x, value
+```
+
+#### Additional functions to compute gradients
+
+`tfe.GradientTape` is a powerful interface for computing gradients, but there
+is another [Autograd](https://github.com/HIPS/autograd)-style API available for
+automatic differentiation. These functions are useful if writing math code with
+only tensors and gradient functions, and without `tfe.Variables`:
+
+* `tfe.gradients_function` —Returns a function that computes the derivatives
+  of its input function parameter with respect to its arguments. The input
+  function parameter must return a scalar value. When the returned function is
+  invoked, it returns a list of `tf.Tensor` objects: one element for each
+  argument of the input function. Since anything of interest must be passed as a
+  function parameter, this becomes unwieldy if there's a dependency on many
+  trainable parameters.
+* `tfe.value_and_gradients_function` —Similar to
+  `tfe.gradients_function`, but when the returned function is invoked, it
+  returns the value from the input function in addition to the list of
+  derivatives of the input function with respect to its arguments.
+
+In the following example, `tfe.gradients_function` takes the `square`
+function as an argument and returns a function that computes the partial
+derivatives of `square` with respect to its inputs. To calculate the derivative
+of `square` at `3`, `grad(3.0)` returns `6`.
+
+```py
+def square(x):
+  return tf.multiply(x, x)
+
+grad = tfe.gradients_function(square)
+
+square(3.)  # => 9.0
+grad(3.)    # => [6.0]
+
+# The second-order derivative of square:
+gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
+gradgrad(3.)  # => [2.0]
+
+# The third-order derivative is None:
+gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
+gradgradgrad(3.)  # => [None]
+
+
+# With flow control:
+def abs(x):
+  return x if x > 0. else -x
+
+grad = tfe.gradients_function(abs)
+
+grad(3.)   # => [1.0]
+grad(-3.)  # => [-1.0]
+```
+
+### Custom gradients
+
+Custom gradients are an easy way to override gradients in eager and graph
+execution. Within the forward function, define the gradient with respect to the
+inputs, outputs, or intermediate results. For example, here's an easy way to clip
+the norm of the gradients in the backward pass:
+
+```py
+@tf.custom_gradient
+def clip_gradient_by_norm(x, norm):
+  y = tf.identity(x)
+  def grad_fn(dresult):
+    return [tf.clip_by_norm(dresult, norm), None]
+  return y, grad_fn
+```
+
+Custom gradients are commonly used to provide a numerically stable gradient for a
+sequence of operations:
+
+```py
+def log1pexp(x):
+  return tf.log(1 + tf.exp(x))
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# The gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# However, x = 100 fails because of numerical instability.
+grad_log1pexp(100.)  # => [nan]
+```
+
+
+Here, the `log1pexp` function can be analytically simplified with a custom
+gradient. The implementation below reuses the value for `tf.exp(x)` that is
+computed during the forward pass—making it more efficient by eliminating
+redundant calculations:
+
+```py
+@tfe.custom_gradient
+def log1pexp(x):
+  e = tf.exp(x)
+  def grad(dy):
+    return dy * (1 - 1 / (1 + e))
+  return tf.log(1 + e), grad
+
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# As before, the gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# And the gradient computation also works at x = 100.
+grad_log1pexp(100.)  # => [1.0]
+```
+
+
+## Build and train models
+
+There are many parameters to optimize when calculating derivatives. TensorFlow
+code is easier to read when structured into reusable classes and objects instead
+of a single top-level function. Eager execution encourages the use of the
+Keras-style layer classes in the `tf.keras.layers` module. Additionally, the
+`tf.train.Optimizer` classes provide sophisticated techniques to calculate
+parameter updates.
+
+The following example creates a multi-layer model that classifies the standard
+[MNIST handwritten digits](https://www.tensorflow.org/tutorials/layers). It
+demonstrates the optimizer and layer APIs to build trainable graphs in an eager
+execution environment.
+
+### Build a model
+
+The `tf.keras.Sequential` model is a linear stack of layers. It is easy to
+use for basic models:
+
+```py
+model = tf.keras.Sequential([
+  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
+  tf.keras.layers.Dense(10)
+])
+```
+
+Alternatively, organize models in classes by inheriting from `tf.keras.Model`.
+This is a container for layers that is a layer itself, allowing `tf.keras.Model`
+objects to contain other `tf.keras.Model` objects.
+
+```py
+class MNISTModel(tf.keras.Model):
+  def __init__(self):
+    super(MNISTModel, self).__init__()
+    self.dense1 = tf.keras.layers.Dense(units=10)
+    self.dense2 = tf.keras.layers.Dense(units=10)
+
+  def call(self, input):
+    """Run the model."""
+    result = self.dense1(input)
+    result = self.dense2(result)
+    result = self.dense2(result)  # reuse variables from dense2 layer
+    return result
+
+model = MNISTModel()
+```
+
+It's not required to set an input shape for the `tf.keras.Model` class since
+the parameters are set the first time input is passed to the layer.
+
+`tf.keras.layers` classes create and contain their own model variables that
+are tied to the lifetime of their layer objects. To share layer variables, share
+their objects.
+
+### Train a model
+
+Even without training, call the model and inspect the output in eager execution:
+
+```py
+# Create a tensor representing a blank image
+batch = tf.zeros([1, 1, 784])
+print(batch.shape)  # => (1, 1, 784)
+
+result = model(batch)
+# => tf.Tensor([[[ 0.  0., ..., 0.]]], shape=(1, 1, 10), dtype=float32)
+```
+
+This example uses the
+[dataset.py module](https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py)
+from the
+[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist),
+download this file to your local directory. Run the following to download the
+MNIST data files to your working directory and prepare a `tf.data.Dataset`
+for training:
+
+```py
+import dataset  # download dataset.py file
+dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32)
+```
+
+To train a model, define a loss function to optimize and then calculate
+gradients. Use an optimizer to update the variables:
+
+```py
+def loss(model, x, y):
+  prediction = model(x)
+  return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction)
+
+def grad(model, inputs, targets):
+  with tfe.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, model.variables)
+
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+
+x, y = tfe.Iterator(dataset_train).next()
+print("Initial loss: {:.3f}".format(loss(model, x, y)))
+
+# Training loop
+for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+  # Calculate derivatives of the input function with respect to its parameters.
+  grads = grad(model, x, y)
+  # Apply the gradient to the model
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 200 == 0:
+    print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y)))
+
+print("Final loss: {:.3f}".format(loss(model, x, y)))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 2.674
+Loss at step 0000: 2.593
+Loss at step 0200: 2.143
+Loss at step 0400: 2.009
+Loss at step 0600: 2.103
+Loss at step 0800: 1.621
+Loss at step 1000: 1.695
+...
+Loss at step 6600: 0.602
+Loss at step 6800: 0.557
+Loss at step 7000: 0.499
+Loss at step 7200: 0.744
+Loss at step 7400: 0.681
+Final loss: 0.670
+```
+
+And for faster training, move the computation to a GPU:
+
+```py
+with tf.device("/gpu:0"):
+  for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+    # minimize() is equivalent to the grad() and apply_gradients() calls.
+    optimizer.minimize(lambda: loss(model, x, y),
+                       global_step=tf.train.get_or_create_global_step())
+```
+
+### Variables and optimizers
+
+`tfe.Variable` objects store mutable `tf.Tensor` values accessed during
+training to make automatic differentiation easier. The parameters of a model can
+be encapsulated in classes as variables.
+
+Better encapsulate model parameters by using `tfe.Variable` with
+`tfe.GradientTape`. For example, the automatic differentiation example above
+can be rewritten:
+
+```py
+class Model(tf.keras.Model):
+  def __init__(self):
+    super(Model, self).__init__()
+    self.W = tfe.Variable(5., name='weight')
+    self.B = tfe.Variable(10., name='bias')
+  def predict(self, inputs):
+    return inputs * self.W + self.B
+
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 2000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+# The loss function to be optimized
+def loss(model, inputs, targets):
+  error = model.predict(inputs) - targets
+  return tf.reduce_mean(tf.square(error))
+
+def grad(model, inputs, targets):
+  with tfe.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, [model.W, model.B])
+
+# Define:
+# 1. A model.
+# 2. Derivatives of a loss function with respect to model parameters.
+# 3. A strategy for updating the variables based on the derivatives.
+model = Model()
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+
+print("Initial loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+
+# Training loop
+for i in range(300):
+  grads = grad(model, training_inputs, training_outputs)
+  optimizer.apply_gradients(zip(grads, [model.W, model.B]),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs)))
+
+print("Final loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 69.066
+Loss at step 000: 66.368
+Loss at step 020: 30.107
+Loss at step 040: 13.959
+Loss at step 060: 6.769
+Loss at step 080: 3.567
+Loss at step 100: 2.141
+Loss at step 120: 1.506
+Loss at step 140: 1.223
+Loss at step 160: 1.097
+Loss at step 180: 1.041
+Loss at step 200: 1.016
+Loss at step 220: 1.005
+Loss at step 240: 1.000
+Loss at step 260: 0.998
+Loss at step 280: 0.997
+Final loss: 0.996
+W = 2.99431324005, B = 2.02129220963
+```
+
+## Use objects for state during eager execution
+
+With graph execution, program state (such as the variables) is stored in global
+collections and their lifetime is managed by the `tf.Session` object. In
+contrast, during eager execution the lifetime of state objects is determined by
+the lifetime of their corresponding Python object.
+
+### Variables are objects
+
+During eager execution, variables persist until the last reference to the object
+is removed, and is then deleted.
+
+```py
+with tf.device("gpu:0"):
+  v = tfe.Variable(tf.random_normal([1000, 1000]))
+  v = None  # v no longer takes up GPU memory
+```
+
+### Object-based saving
+
+`tfe.Checkpoint` can save and restore `tfe.Variable`s to and from
+checkpoints:
+
+```py
+x = tfe.Variable(10.)
+
+checkpoint = tfe.Checkpoint(x=x)  # save as "x"
+
+x.assign(2.)   # Assign a new value to the variables and save.
+save_path = checkpoint.save('./ckpt/')
+
+x.assign(11.)  # Change the variable after saving.
+
+# Restore values from the checkpoint
+checkpoint.restore(save_path)
+
+print(x)  # => 2.0
+```
+
+To save and load models, `tfe.Checkpoint` stores the internal state of objects,
+without requiring hiiden variables. To record the state of a `model`,
+an `optimizer`, and a global step, pass them to a `tfe.Checkpoint`:
+
+```py
+model = MyModel()
+optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+checkpoint_dir = ‘/path/to/model_dir’
+checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
+root = tfe.Checkpoint(optimizer=optimizer,
+                      model=model,
+                      optimizer_step=tf.train.get_or_create_global_step())
+
+root.save(file_prefix=checkpoint_prefix)
+# or
+root.restore(tf.train.latest_checkpoint(checkpoint_dir))
+```
+
+### Object-oriented metrics
+
+`tfe.metrics` are stored as objects. Update a metric by passing the new data to
+the callable, and retrieve the result using the `tfe.metrics.result` method,
+for example:
+
+```py
+m = tfe.metrics.Mean("loss")
+m(0)
+m(5)
+m.result()  # => 2.5
+m([8, 9])
+m.result()  # => 5.5
+```
+
+#### Summaries and TensorBoard
+
+@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+understanding, debugging and optimizing the model training process. It uses
+summary events that are written while executing the program.
+
+`tf.contrib.summary` is compatible with both eager and graph execution
+environments. Summary operations, such as `tf.contrib.summary.scalar`, are
+inserted during model construction. For example, to record summaries once every
+100 global steps:
+
+```py
+tf.train.get_or_create_global_step()  # return global step var
+writer = tf.contrib.summary.create_file_writer(logdir)
+global_step=tf.train.get_or_create_global_step()
+
+writer.set_as_default()
+
+for _ in range(iterations):
+  global_step.assign_add(1)
+  # Must include a record_summaries method
+  with tf.contrib.summary.record_summaries_every_n_global_steps(100):
+    # your model code goes here
+    tf.contrib.summary.scalar('loss', loss)
+     ...
+```
+
+## Performance
+
+Computation is not automatically offloaded to GPUs during eager execution. To
+explicitly direct a computation to a GPU, enclose it in a
+`tf.device('/gpu:0')` block:
+
+```py
+import time
+
+def measure(x, steps):
+  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
+  tf.matmul(x, x)
+  start = time.time()
+  for i in range(steps):
+    x = tf.matmul(x, x)
+    _ = x.numpy()  # Make sure to execute op and not just enqueue it
+  end = time.time()
+  return end - start
+
+shape = (1000, 1000)
+steps = 200
+print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))
+
+# Run on CPU:
+with tf.device("/cpu:0"):
+  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+
+# Run on GPU, if available:
+if tfe.num_gpus() > 0:
+  with tf.device("/gpu:0"):
+    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+else:
+  print("GPU: not found")
+```
+
+Output (exact numbers depend on hardware):
+
+```
+Time to multiply a (1000, 1000) matrix by itself 200 times:
+CPU: 4.614904403686523 secs
+GPU: 0.5581181049346924 secs
+```
+
+A `tf.Tensor` object can be copied to a different device to execute its
+operations:
+
+```py
+x = tf.random_normal([10, 10])
+
+x_gpu0 = x.gpu()
+x_cpu = x.cpu()
+
+_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
+_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
+
+if tfe.num_gpus() > 1:
+  x_gpu1 = x.gpu(1)
+  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
+```
+
+### Benchmarks
+
+For compute-heavy models, such as
+[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50)
+training on a GPU, eager execution performance is comparable to graph execution.
+But this gap grows larger for models with less computation and there is work to
+be done for optimizing hot code paths for models with lots of small operations.
+
+
+## Work with graphs
+
+While eager execution makes development and debugging more interactive,
+TensorFlow graph execution has advantages for distributed training, performance
+optimizations, and production deployment. However, writing graph code can feel
+different than writing regular Python code and more difficult to debug.
+
+For building and training graph-constructed models, the Python program first
+builds a graph representing the computation, then invokes `Session.run` to send
+the graph for execution on the C++-based runtime.  This provides:
+
+* Automatic differentiation using static autodiff.
+* Simple deployment to a platform independent server.
+* Graph-based optimizations (common subexpression elimination, constant-folding, etc.).
+* Compilation and kernel fusion.
+* Automatic distribution and replication (placing nodes on the distributed system).
+
+Deploying code written for eager execution is more difficult: either generate a
+graph from the model, or run the Python runtime and code directly on the server.
+
+### Write compatible code
+
+The same code written for eager execution will also build a graph during graph
+execution. Do this by simply running the same code in a new Python session where
+eager execution is not enabled.
+
+Most TensorFlow operations work during eager execution, but there are some things
+to keep in mind:
+
+* Use `tf.data` for input processing instead of queues. It's faster and easier.
+* Use object-oriented layer APIs—like `tf.keras.layers` and
+  `tf.keras.Model`—since they have explicit storage for variables.
+* Most model code works the same during eager and graph execution, but there are
+  exceptions. (For example, dynamic models using Python control flow to change the
+  computation based on inputs.)
+* Once eager execution is enabled with `tf.enable_eager_execution`, it
+  cannot be turned off. Start a new Python session to return to graph execution.
+
+It's best to write code for both eager execution *and* graph execution. This
+gives you eager's interactive experimentation and debuggability with the
+distributed performance benefits of graph execution.
+
+Write, debug, and iterate in eager execution, then import the model graph for
+production deployment. Use `tfe.Checkpoint` to save and restore model
+variables, this allows movement between eager and graph execution environments.
+See the examples in:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+### Use eager execution in a graph environment
+
+Selectively enable eager execution in a TensorFlow graph environment using
+`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not*
+been called.
+
+```py
+def my_py_func(x):
+  x = tf.matmul(x, x)  # You can use tf ops
+  print(x)  # but it's eager!
+  return x
+
+with tf.Session() as sess:
+  x = tf.placeholder(dtype=tf.float32)
+  # Call eager function in graph!
+  pf = tfe.py_func(my_py_func, [x], tf.float32)
+  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]
+```
+
+
+A `tfe.Checkpoint` stores the complete internal state of the objects passed to it. Nothing else is implicitly included. To record the state of a `model`, an `optimizer`, and a global step pass each one to the checkpoint's constructor:
+
+```py
+model = MyModel()
+optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+checkpoint_dir = ‘/path/to/model_dir’
+checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
+root = tfe.Checkpoint(optimizer=optimizer,
+                      model=model,
+                      optimizer_step=tf.train.get_or_create_global_step())
+
+root.save(file_prefix=checkpoint_prefix)
+# or
+root.restore(tf.train.latest_checkpoint(checkpoint_dir))
+```
+
+### Object-oriented metrics
+
+`tfe.metrics` are stored as objects. Update a metric by passing the new data to
+the callable, and retrieve the result using the `tfe.metrics.result` method,
+for example:
+
+```py
+m = tfe.metrics.Mean("loss")
+m(0)
+m(5)
+m.result()  # => 2.5
+m([8, 9])
+m.result()  # => 5.5
+```
+
+#### Summaries and TensorBoard
+
+@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+understanding, debugging and optimizing the model training process. It uses
+summary events that are written while executing the program.
+
+`tf.contrib.summary` is compatible with both eager and graph execution
+environments. Summary operations, such as `tf.contrib.summary.scalar`, are
+inserted during model construction. For example, to record summaries once every
+100 global steps:
+
+```py
+tf.train.get_or_create_global_step()  # return global step var
+writer = tf.contrib.summary.create_file_writer(logdir)
+
+for _ in range(iterations):
+  with writer.as_default():
+    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
+      # your model code goes here
+      tf.contrib.summary.scalar('loss', loss)
+      ...
+```
+
+## Performance
+
+Computation is not automatically offloaded to GPUs during eager execution. To
+explicitly direct a computation to a GPU, enclose it in a
+`tf.device('/gpu:0')` block:
+
+```py
+import time
+
+def measure(x, steps):
+  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
+  tf.matmul(x, x)
+  start = time.time()
+  for i in range(steps):
+    x = tf.matmul(x, x)
+    _ = x.numpy()  # Make sure to execute op and not just enqueue it
+  end = time.time()
+  return end - start
+
+shape = (1000, 1000)
+steps = 200
+print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))
+
+# Run on CPU:
+with tf.device("/cpu:0"):
+  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+
+# Run on GPU, if available:
+if tfe.num_gpus() > 0:
+  with tf.device("/gpu:0"):
+    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+else:
+  print("GPU: not found")
+```
+
+Output (exact numbers depend on hardware):
+
+```
+Time to multiply a (1000, 1000) matrix by itself 200 times:
+CPU: 4.614904403686523 secs
+GPU: 0.5581181049346924 secs
+```
+
+A `tf.Tensor` object can be copied to a different device to execute its
+operations:
+
+```py
+x = tf.random_normal([10, 10])
+
+x_gpu0 = x.gpu()
+x_cpu = x.cpu()
+
+_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
+_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
+
+if tfe.num_gpus() > 1:
+  x_gpu1 = x.gpu(1)
+  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
+```
+
+### Benchmarks
+
+For compute-heavy models, such as
+[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50)
+training on a GPU, eager execution performance is comparable to graph execution.
+But this gap grows larger for models with less computation and there is work to
+be done for optimizing hot code paths for models with lots of small operations.
+
+
+## Work with graphs
+
+While eager execution makes development and debugging more interactive,
+TensorFlow graph execution has advantages for distributed training, performance
+optimizations, and production deployment. However, writing graph code can feel
+different than writing regular Python code and more difficult to debug.
+
+For building and training graph-constructed models, the Python program first
+builds a graph representing the computation, then invokes `Session.run` to send
+the graph for execution on the C++-based runtime.  This provides:
+
+* Automatic differentiation using static autodiff.
+* Simple deployment to a platform independent server.
+* Graph-based optimizations (common subexpression elimination, constant-folding, etc.).
+* Compilation and kernel fusion.
+* Automatic distribution and replication (placing nodes on the distributed system).
+
+Deploying code written for eager execution is more difficult: either generate a
+graph from the model, or run the Python runtime and code directly on the server.
+
+### Write compatible code
+
+The same code written for eager execution will also build a graph during graph
+execution. Do this by simply running the same code in a new Python session where
+eager execution is not enabled.
+
+Most TensorFlow operations work during eager execution, but there are some things
+to keep in mind:
+
+* Use `tf.data` for input processing instead of queues. It's faster and easier.
+* Use object-oriented layer APIs—like `tf.keras.layers` and
+  `tf.keras.Model`—since they have explicit storage for variables.
+* Most model code works the same during eager and graph execution, but there are
+  exceptions. (For example, dynamic models using Python control flow to change the
+  computation based on inputs.)
+* Once eager execution is enabled with `tf.enable_eager_execution`, it
+  cannot be turned off. Start a new Python session to return to graph execution.
+
+It's best to write code for both eager execution *and* graph execution. This
+gives you eager's interactive experimentation and debuggability with the
+distributed performance benefits of graph execution.
+
+Write, debug, and iterate in eager execution, then import the model graph for
+production deployment. Use `tfe.Checkpoint` to save and restore model
+variables, this allows movement between eager and graph execution environments.
+See the examples in:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+### Use eager execution in a graph environment
+
+Selectively enable eager execution in a TensorFlow graph environment using
+`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not*
+been called.
+
+```py
+def my_py_func(x):
+  x = tf.matmul(x, x)  # You can use tf ops
+  print(x)  # but it's eager!
+  return x
+
+with tf.Session() as sess:
+  x = tf.placeholder(dtype=tf.float32)
+  # Call eager function in graph!
+  pf = tfe.py_func(my_py_func, [x], tf.float32)
+  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]
+```
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 3fe4cb2dda..7ac63bf2e0 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,8 +1,9 @@
 index.md
 
 ### High Level APIs
-estimators.md
+eager.md
 datasets.md
+estimators.md
 
 ### Low Level APIs
 low_level_intro.md
-- 
GitLab


From 02783a09b2e06d03a9d7356167458b3a87a5b64d Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Thu, 29 Mar 2018 16:01:20 -0700
Subject: [PATCH 0087/1262] Updated eager guide to use tensorflow 1.7. Code
 snippets still work.

PiperOrigin-RevId: 191001008
---
 .../docs_src/programmers_guide/eager.md       | 203 +-----------------
 1 file changed, 5 insertions(+), 198 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index 9ae1e602f4..8db65737dc 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -29,10 +29,10 @@ problem and share your benchmarks.
 
 ## Setup and basic usage
 
-Install TensorFlow 1.7 to include the updates for eager execution:
+Upgrade to TensorFlow 1.7 to include updates for eager execution:
 
 ```
-$ pip install --pre --upgrade tensorflow
+$ pip install --upgrade tensorflow
 ```
 
 To start eager execution, add `tf.enable_eager_execution()` to the beginning of
@@ -322,14 +322,13 @@ grad_log1pexp(0.)  # => [0.5]
 grad_log1pexp(100.)  # => [nan]
 ```
 
-
 Here, the `log1pexp` function can be analytically simplified with a custom
 gradient. The implementation below reuses the value for `tf.exp(x)` that is
 computed during the forward pass—making it more efficient by eliminating
 redundant calculations:
 
 ```py
-@tfe.custom_gradient
+@tf.custom_gradient
 def log1pexp(x):
   e = tf.exp(x)
   def grad(dy):
@@ -605,7 +604,7 @@ print(x)  # => 2.0
 ```
 
 To save and load models, `tfe.Checkpoint` stores the internal state of objects,
-without requiring hiiden variables. To record the state of a `model`,
+without requiring hidden variables. To record the state of a `model`,
 an `optimizer`, and a global step, pass them to a `tfe.Checkpoint`:
 
 ```py
@@ -649,9 +648,8 @@ inserted during model construction. For example, to record summaries once every
 100 global steps:
 
 ```py
-tf.train.get_or_create_global_step()  # return global step var
 writer = tf.contrib.summary.create_file_writer(logdir)
-global_step=tf.train.get_or_create_global_step()
+global_step=tf.train.get_or_create_global_step()  # return global step var
 
 writer.set_as_default()
 
@@ -733,197 +731,6 @@ But this gap grows larger for models with less computation and there is work to
 be done for optimizing hot code paths for models with lots of small operations.
 
 
-## Work with graphs
-
-While eager execution makes development and debugging more interactive,
-TensorFlow graph execution has advantages for distributed training, performance
-optimizations, and production deployment. However, writing graph code can feel
-different than writing regular Python code and more difficult to debug.
-
-For building and training graph-constructed models, the Python program first
-builds a graph representing the computation, then invokes `Session.run` to send
-the graph for execution on the C++-based runtime.  This provides:
-
-* Automatic differentiation using static autodiff.
-* Simple deployment to a platform independent server.
-* Graph-based optimizations (common subexpression elimination, constant-folding, etc.).
-* Compilation and kernel fusion.
-* Automatic distribution and replication (placing nodes on the distributed system).
-
-Deploying code written for eager execution is more difficult: either generate a
-graph from the model, or run the Python runtime and code directly on the server.
-
-### Write compatible code
-
-The same code written for eager execution will also build a graph during graph
-execution. Do this by simply running the same code in a new Python session where
-eager execution is not enabled.
-
-Most TensorFlow operations work during eager execution, but there are some things
-to keep in mind:
-
-* Use `tf.data` for input processing instead of queues. It's faster and easier.
-* Use object-oriented layer APIs—like `tf.keras.layers` and
-  `tf.keras.Model`—since they have explicit storage for variables.
-* Most model code works the same during eager and graph execution, but there are
-  exceptions. (For example, dynamic models using Python control flow to change the
-  computation based on inputs.)
-* Once eager execution is enabled with `tf.enable_eager_execution`, it
-  cannot be turned off. Start a new Python session to return to graph execution.
-
-It's best to write code for both eager execution *and* graph execution. This
-gives you eager's interactive experimentation and debuggability with the
-distributed performance benefits of graph execution.
-
-Write, debug, and iterate in eager execution, then import the model graph for
-production deployment. Use `tfe.Checkpoint` to save and restore model
-variables, this allows movement between eager and graph execution environments.
-See the examples in:
-[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
-
-### Use eager execution in a graph environment
-
-Selectively enable eager execution in a TensorFlow graph environment using
-`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not*
-been called.
-
-```py
-def my_py_func(x):
-  x = tf.matmul(x, x)  # You can use tf ops
-  print(x)  # but it's eager!
-  return x
-
-with tf.Session() as sess:
-  x = tf.placeholder(dtype=tf.float32)
-  # Call eager function in graph!
-  pf = tfe.py_func(my_py_func, [x], tf.float32)
-  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]
-```
-
-
-A `tfe.Checkpoint` stores the complete internal state of the objects passed to it. Nothing else is implicitly included. To record the state of a `model`, an `optimizer`, and a global step pass each one to the checkpoint's constructor:
-
-```py
-model = MyModel()
-optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
-checkpoint_dir = ‘/path/to/model_dir’
-checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
-root = tfe.Checkpoint(optimizer=optimizer,
-                      model=model,
-                      optimizer_step=tf.train.get_or_create_global_step())
-
-root.save(file_prefix=checkpoint_prefix)
-# or
-root.restore(tf.train.latest_checkpoint(checkpoint_dir))
-```
-
-### Object-oriented metrics
-
-`tfe.metrics` are stored as objects. Update a metric by passing the new data to
-the callable, and retrieve the result using the `tfe.metrics.result` method,
-for example:
-
-```py
-m = tfe.metrics.Mean("loss")
-m(0)
-m(5)
-m.result()  # => 2.5
-m([8, 9])
-m.result()  # => 5.5
-```
-
-#### Summaries and TensorBoard
-
-@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
-understanding, debugging and optimizing the model training process. It uses
-summary events that are written while executing the program.
-
-`tf.contrib.summary` is compatible with both eager and graph execution
-environments. Summary operations, such as `tf.contrib.summary.scalar`, are
-inserted during model construction. For example, to record summaries once every
-100 global steps:
-
-```py
-tf.train.get_or_create_global_step()  # return global step var
-writer = tf.contrib.summary.create_file_writer(logdir)
-
-for _ in range(iterations):
-  with writer.as_default():
-    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
-      # your model code goes here
-      tf.contrib.summary.scalar('loss', loss)
-      ...
-```
-
-## Performance
-
-Computation is not automatically offloaded to GPUs during eager execution. To
-explicitly direct a computation to a GPU, enclose it in a
-`tf.device('/gpu:0')` block:
-
-```py
-import time
-
-def measure(x, steps):
-  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
-  tf.matmul(x, x)
-  start = time.time()
-  for i in range(steps):
-    x = tf.matmul(x, x)
-    _ = x.numpy()  # Make sure to execute op and not just enqueue it
-  end = time.time()
-  return end - start
-
-shape = (1000, 1000)
-steps = 200
-print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))
-
-# Run on CPU:
-with tf.device("/cpu:0"):
-  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))
-
-# Run on GPU, if available:
-if tfe.num_gpus() > 0:
-  with tf.device("/gpu:0"):
-    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
-else:
-  print("GPU: not found")
-```
-
-Output (exact numbers depend on hardware):
-
-```
-Time to multiply a (1000, 1000) matrix by itself 200 times:
-CPU: 4.614904403686523 secs
-GPU: 0.5581181049346924 secs
-```
-
-A `tf.Tensor` object can be copied to a different device to execute its
-operations:
-
-```py
-x = tf.random_normal([10, 10])
-
-x_gpu0 = x.gpu()
-x_cpu = x.cpu()
-
-_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
-_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
-
-if tfe.num_gpus() > 1:
-  x_gpu1 = x.gpu(1)
-  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
-```
-
-### Benchmarks
-
-For compute-heavy models, such as
-[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50)
-training on a GPU, eager execution performance is comparable to graph execution.
-But this gap grows larger for models with less computation and there is work to
-be done for optimizing hot code paths for models with lots of small operations.
-
-
 ## Work with graphs
 
 While eager execution makes development and debugging more interactive,
-- 
GitLab


From 857dfa321d3689a87af9b6bc64c488a444ae80e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 29 Mar 2018 17:15:33 -0700
Subject: [PATCH 0088/1262] Add details of new mailing lists

PiperOrigin-RevId: 191011187
---
 tensorflow/docs_src/about/uses.md             |  6 ++--
 tensorflow/docs_src/community/contributing.md | 25 +++----------
 tensorflow/docs_src/community/index.md        |  6 +++-
 tensorflow/docs_src/community/leftnav_files   |  1 +
 tensorflow/docs_src/community/lists.md        | 32 ++++++++++++-----
 tensorflow/docs_src/community/swift.md        | 35 +++++++++++++++++++
 6 files changed, 73 insertions(+), 32 deletions(-)
 create mode 100644 tensorflow/docs_src/community/swift.md

diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md
index d646880bd3..d3db98203e 100644
--- a/tensorflow/docs_src/about/uses.md
+++ b/tensorflow/docs_src/about/uses.md
@@ -18,9 +18,9 @@ This section describes some of the current uses of the TensorFlow system.
 
 > If you are using TensorFlow for research, for education, or for production
 > usage in some product, we would love to add something about your usage here.
-> Please feel free to email us a brief description of how you're using
-> TensorFlow, or even better, send us a pull request to add an entry to this
-> file.
+> Please feel free to [email us](mailto:usecases@tensorflow.org) a brief
+> description of how you're using TensorFlow, or even better, send us a
+> pull request to add an entry to this file.
 
 * **Deep Speech**
 <ul>
diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
index b0960df435..afbb8bbdd0 100644
--- a/tensorflow/docs_src/community/contributing.md
+++ b/tensorflow/docs_src/community/contributing.md
@@ -22,7 +22,7 @@ inside the organization, including:
 guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
 
 * Join the
-[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/developers)
+[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers)
 mailing list, to coordinate and discuss with others contributing to TensorFlow.
 
 * For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
@@ -37,28 +37,13 @@ You may also wish to review our guide to @{$benchmarks$defining and running benc
 To enable focused collaboration on particular areas of TensorFlow, we host
 Special Interest Groups (SIGs). SIGs do their work in public: if you want to
 join and contribute, review the work of the group, and get in touch with the
-relevant SIG leader.
+relevant SIG leader.  Membership policies vary on a per-SIG basis.
 
 * **SIG Build** focuses on issues surrounding building, packaging, and
-  distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/forum/#!forum/build).
+  distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/build).
 
 * **SIG TensorBoard** furthers the development and direction of TensorBoard and its plugins.
-  [Mailing list](https://groups.google.com/a/tensorflow.org/forum/#!forum/tensorboard).
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard).
 
 * **SIG Rust** collaborates on the development of TensorFlow's Rust bindings.
-  [Mailing list](https://groups.google.com/a/tensorflow.org/forum/#!forum/rust). 
-
-## Projects developed by the TensorFlow community
-
-The TensorFlow community has created many great projects around TensorFlow, including:
-
-* [Machine Learning with TensorFlow (Book & Code)](http://tensorflowbook.com)
-* [@jtoy's awesome "Awesome TensorFlow" list of awesome things](https://github.com/jtoy/awesome-tensorflow)
-* [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
-* [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
-* [Bitfusion's` GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
-* [Operator Vectorization Library](https://github.com/opveclib/opveclib)
-* [Swift language bindings](https://github.com/PerfectlySoft/Perfect-TensorFlow)
-* [Sublime Tensorflow - A plugin for Sublime Text](https://github.com/baptisteArnaud/Sublime-Tensorflow)
-* [GPflow - Gaussian processes in TensorFlow](https://github.com/GPflow/GPflow)
-* [CS 20SI: Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/) - please note, this course was designed with TensorFlow v0.12, so some of the notes may be out of date - but it's still a great resource.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/rust).
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index c08aeb7a97..eec2e51a87 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -47,6 +47,11 @@ The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
 For news and updates from around the universe of TensorFlow projects, follow
 [@tensorflow](https://twitter.com/tensorflow) on Twitter.
 
+### Blog
+
+We post regularly to the [TensorFlow Blog](http://blog.tensorflow.org/),
+with content from the TensorFlow team and the best articles from the community.
+
 ### YouTube
 
 Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learing
@@ -56,7 +61,6 @@ and AI with TensorFlow. On it we have a number of new shows, including:
 - Ask TensorFlow: the team answers the best questions tagged #AskTensorFlow from social media 
 - Coding TensorFlow: short bites with tips for success with TensorFlow
 
-
 ## Community Support
 
 ### Mailing Lists
diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files
index 0bd1f14de9..2bae60d9dd 100644
--- a/tensorflow/docs_src/community/leftnav_files
+++ b/tensorflow/docs_src/community/leftnav_files
@@ -6,3 +6,4 @@ groups.md
 documentation.md
 style_guide.md
 benchmarks.md
+swift.md
diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md
index dc9240030e..7450ab36c4 100644
--- a/tensorflow/docs_src/community/lists.md
+++ b/tensorflow/docs_src/community/lists.md
@@ -9,27 +9,43 @@ see [how to get help](/community/#get_help).
 
 ## General TensorFlow lists
 
-* [announce](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce) - Low-volume announcements of new releases.
-* [discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) - General community discussion around TensorFlow.
-* [developers](https://groups.google.com/a/tensorflow.org/forum/#!forum/developers) - Discussion for developers contributing to TensorFlow.
+* [announce](https://groups.google.com/a/tensorflow.org/d/forum/announce) - Low-volume announcements of new releases.
+* [discuss](https://groups.google.com/a/tensorflow.org/d/forum/discuss) - General community discussion around TensorFlow.
+* [developers](https://groups.google.com/a/tensorflow.org/d/forum/developers) - Discussion for developers contributing to TensorFlow.
 
 ## Project-specific lists
 
 These projects inside the TensorFlow GitHub organization have lists dedicated to their communities:
 
-* [tensor2tensor](https://groups.google.com/forum/#!forum/tensor2tensor) - User
+* [hub](https://groups.google.com/a/tensorflow.org/d/forum/hub) -
+  Discussion and collaboration around [TensorFlow Hub](https://github.com/tensorflow/hub).
+* [magenta-discuss](https://groups.google.com/a/tensorflow.org/d/forum/magenta-discuss) -
+  General discussion about [Magenta](https://magenta.tensorflow.org/)
+  development and directions.
+* [swift](https://groups.google.com/a/tensorflow.org/d/forum/swift) -
+  Community and collaboration around Swift for TensorFlow.
+* [tensor2tensor](https://groups.google.com/d/forum/tensor2tensor) - Discussion
   and peer support for Tensor2Tensor.
+* [tfjs-announce](https://groups.google.com/a/tensorflow.org/d/forum/tfjs-announce) -
+  Announcements of new TensorFlow.js releases.
+* [tfjs](https://groups.google.com/a/tensorflow.org/d/forum/tfjs) - Discussion
+  and peer support for TensorFlow.js.
+* [tflite](https://groups.google.com/a/tensorflow.org/d/forum/tflite) - Discussion and
+  peer support for TensorFlow Lite.
+* [tpu-users](https://groups.google.com/a/tensorflow.org/d/forum/tpu-users) - Community discussion
+  and support for TPU users.
 
 ## Special Interest Groups
 
 TensorFlow's [Special Interest
 Groups](/community/contributing#special_interest_groups) (SIGs) support
 community collaboration on particular project focuses. Members of these groups
-work together to build and support TensorFlow related projects.
+work together to build and support TensorFlow related projects. While their
+archives are public, different SIGs have their own membership policies.
 
-* [build](https://groups.google.com/a/tensorflow.org/forum/#!forum/build) -
+* [build](https://groups.google.com/a/tensorflow.org/d/forum/build) -
   Supporting SIG Build, for build, distribution and packaging of TensorFlow.
-* [tensorboard](https://groups.google.com/a/tensorflow.org/forum/#!forum/tensorboard) -
+* [sig-tensorboard](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard) -
   Supporting SIG TensorBoard, for plugin development and other contribution.
-* [rust](https://groups.google.com/a/tensorflow.org/forum/#!forum/rust) -
+* [rust](https://groups.google.com/a/tensorflow.org/d/forum/rust) -
   Supporting SIG Rust, for the Rust language bindings.
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
new file mode 100644
index 0000000000..54d9960b23
--- /dev/null
+++ b/tensorflow/docs_src/community/swift.md
@@ -0,0 +1,35 @@
+# Swift Community
+
+Welcome to the Swift for TensorFlow development community!
+
+Swift for TensorFlow is a result of first-principles thinking applied to machine
+learning frameworks, and works quite differently than existing TensorFlow
+language bindings.  Whereas prior solutions are designed within the constraints
+of what can be achieved by a (typically Python or Lua) library, Swift for
+TensorFlow is based on the belief that machine learning is important enough to
+deserve first-class language and compiler support.
+
+First-class language and compiler support allows us to innovate in areas that
+have traditionally been out of bounds for machine learning libraries.  Our
+results provide the performance of TensorFlow graphs with the ease of use of
+define-by-run models, and provides a great user experience - for example, by
+catching more mistakes before you run your code.
+
+## Open Source
+
+As announced at the TensorFlow Developer Summit, we are planning to launch our
+open source project on GitHub in April.  In addition to releasing the code, we
+will be using an open design model, where design discussions happen in public.
+
+Between now and then, we are writing some technical white papers that explain in
+detail the design approach (e.g., the core compiler partitioning technique that
+underlies the whole thing, our approach to automatic differentiation, etc.),
+implementation tradeoffs, and the status of this work.  We can’t wait to engage
+with the broader community, but prefer to start the conversation when these
+white papers are ready.
+
+[Sign up here to join the community Google
+group](https://groups.google.com/a/tensorflow.org/d/forum/swift). We will
+initially use it for announcements, and then open it for general discussion when
+we are ready in April.
+
-- 
GitLab


From f77d1b63df9d62ba4bbadeb159b75c52ec9b4e31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 29 Mar 2018 17:30:09 -0700
Subject: [PATCH 0089/1262] * Added a JavaScript tab to DEVELOP. * Added a
 JavaScript file to DEPLOY. * Added a link to JavaScript API Ref.

PiperOrigin-RevId: 191012951
---
 tensorflow/docs_src/deploy/deploy_to_js.md   | 4 ++++
 tensorflow/docs_src/deploy/leftnav_files     | 1 +
 tensorflow/docs_src/javascript/index.md      | 5 +++++
 tensorflow/docs_src/javascript/leftnav_files | 1 +
 4 files changed, 11 insertions(+)
 create mode 100644 tensorflow/docs_src/deploy/deploy_to_js.md
 create mode 100644 tensorflow/docs_src/javascript/index.md
 create mode 100644 tensorflow/docs_src/javascript/leftnav_files

diff --git a/tensorflow/docs_src/deploy/deploy_to_js.md b/tensorflow/docs_src/deploy/deploy_to_js.md
new file mode 100644
index 0000000000..d7ce3ea90b
--- /dev/null
+++ b/tensorflow/docs_src/deploy/deploy_to_js.md
@@ -0,0 +1,4 @@
+# Deploy to JavaScript
+
+You can find details about deploying JavaScript TensorFlow programs
+in the separate [js.tensorflow.org site](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index c682e7add1..93f5bd1ed2 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -2,3 +2,4 @@ index.md
 distributed.md
 hadoop.md
 s3.md
+deploy_to_js.md
diff --git a/tensorflow/docs_src/javascript/index.md b/tensorflow/docs_src/javascript/index.md
new file mode 100644
index 0000000000..ad63eeb255
--- /dev/null
+++ b/tensorflow/docs_src/javascript/index.md
@@ -0,0 +1,5 @@
+# JavaScript 
+
+You may develop TensorFlow programs in JavaScript, training and deploying
+models right in your browser.  For details, see
+[js.tensorflow.org](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/javascript/leftnav_files b/tensorflow/docs_src/javascript/leftnav_files
new file mode 100644
index 0000000000..fc0ab8a543
--- /dev/null
+++ b/tensorflow/docs_src/javascript/leftnav_files
@@ -0,0 +1 @@
+index.md
-- 
GitLab


From 71ed936432ba96a3db1e8b742f58600243f9beb3 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Fri, 30 Mar 2018 00:18:18 -0700
Subject: [PATCH 0090/1262] add eager stub, fix titles and leftnav

---
 tensorflow/docs_src/get_started/eager.md                 | 4 ++++
 .../docs_src/get_started/get_started_for_beginners.md    | 2 +-
 tensorflow/docs_src/get_started/index.md                 | 9 +++++++--
 tensorflow/docs_src/get_started/leftnav_files            | 9 +++++----
 tensorflow/docs_src/get_started/premade_estimators.md    | 2 +-
 5 files changed, 18 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/docs_src/get_started/eager.md

diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md
new file mode 100644
index 0000000000..3c60a08b40
--- /dev/null
+++ b/tensorflow/docs_src/get_started/eager.md
@@ -0,0 +1,4 @@
+# Get Started
+
+This file is a place-holder for this
+[Colab notebook.](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/get_started/eager.ipynb)
diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
index b88483be69..1e61c9b05a 100644
--- a/tensorflow/docs_src/get_started/get_started_for_beginners.md
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -1,4 +1,4 @@
-# Getting Started for ML Beginners
+# Premade Estimators for ML Beginners
 
 This document explains how to use machine learning to classify (categorize)
 Iris flowers by species.  This document dives deeply into the TensorFlow
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index b7bd1286e3..30a10696bf 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -3,8 +3,13 @@
 TensorFlow is a tool for machine learning. While it contains a wide range of
 functionality, TensorFlow is mainly designed for deep neural network models.
 
-TensorFlow provides many APIs. This section focuses on the high-level APIs.
-If you are new to TensorFlow, begin by reading one of the following documents:
+The easiest way to get started with tensorflow is using Eager Execution.
+
+  * @{$get_started/eager}, is for anyone new to  machine learning or TensorFlow.
+
+TensorFlow provides many APIs. The remainder of this section focuses on the
+Estimator API which provide scalable, high-performance models.
+To get started with Estimators begin by reading one of the following documents:
 
   * @{$get_started/get_started_for_beginners}, which is aimed at readers
     new to machine learning.
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 437791d6a3..c63a9b4b63 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -1,10 +1,11 @@
 index.md
 
-### Getting Started
-get_started_for_beginners.md
-premade_estimators.md
+eager.md
 
-### Details
+### Estimators
+get_started_for_beginners.md: For Beginners
+premade_estimators.md: Premade Estimators
+>>>
 checkpoints.md
 feature_columns.md
 datasets_quickstart.md
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index 6bffd2e065..601f145df2 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -1,5 +1,5 @@
 
-# Getting Started with TensorFlow
+# Premade Estimators
 
 This document introduces the TensorFlow programming environment and shows you
 how to solve the Iris classification problem in TensorFlow.
-- 
GitLab


From 330c2a831dfff5640ebc2e2811749c6557f6198a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 08:12:16 -0700
Subject: [PATCH 0091/1262] Update test for DebugStripper to construct graph in
 scope.

PiperOrigin-RevId: 191067139
---
 tensorflow/core/grappler/optimizers/BUILD     |  1 +
 .../optimizers/debug_stripper_test.cc         | 57 ++++++++++---------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 2c365c467c..f865d0c159 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -630,6 +630,7 @@ cc_library(
 
 tf_cuda_cc_test(
     name = "debug_stripper_test",
+    size = "small",
     srcs = ["debug_stripper_test.cc"],
     deps = [
         ":debug_stripper",
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
index aacd55f136..c79c36841d 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -29,14 +29,13 @@ namespace {
 class DebugStripperTest : public GrapplerTest {};
 
 TEST_F(DebugStripperTest, OutputEqualToInput) {
-  constexpr char device[] = "/device:CPU:0";
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output add = ops::Add(s, x, y);
+  Output result = ops::Identity(s, add);
   GrapplerItem item;
-  item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "XTimesTwo", {"x"}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
-      {});
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   DebugStripper optimizer;
   GraphDef output;
@@ -45,19 +44,17 @@ TEST_F(DebugStripperTest, OutputEqualToInput) {
 }
 
 TEST_F(DebugStripperTest, StripAssertFromGraph) {
-  constexpr char device[] = "/device:CPU:0";
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto assert = ops::Assert(s.WithOpName("Assert"), greaterequal, {x, y});
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({assert.operation}), x, y);
   GrapplerItem item;
-  item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("GreaterEqual", "GreaterEqual", {"x", "y"},
-                            {{"T", DT_FLOAT}}, device),
-       test::function::NDef("Assert", "Assert", {"GreaterEqual"},
-                            {{"T", DT_FLOAT}}, device),
-       test::function::NDef("z", "Add", {"x", "y", "^Assert"}, {}, device)},
-      {});
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   DebugStripper optimizer;
   GraphDef output;
@@ -68,31 +65,27 @@ TEST_F(DebugStripperTest, StripAssertFromGraph) {
     if (node.name() == "x") {
       count++;
       EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(device, node.device());
       EXPECT_EQ(0, node.input_size());
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(device, node.device());
       EXPECT_EQ(0, node.input_size());
     } else if (node.name() == "GreaterEqual") {
       count++;
       EXPECT_EQ("GreaterEqual", node.op());
-      EXPECT_EQ(device, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("y", node.input(1));
     } else if (node.name() == "Assert") {
       count++;
       EXPECT_EQ("NoOp", node.op());
-      EXPECT_EQ(device, node.device());
-      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("^GreaterEqual", node.input(0));
-      EXPECT_EQ(0, node.attr_size());
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Add", node.op());
-      EXPECT_EQ(device, node.device());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("y", node.input(1));
@@ -100,6 +93,16 @@ TEST_F(DebugStripperTest, StripAssertFromGraph) {
     }
   }
   EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
 }  // namespace
-- 
GitLab


From ddbb2c52db5cfab02b80b2ef563d8d6251dcfe77 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 08:23:30 -0700
Subject: [PATCH 0092/1262] Fix a crash in Quantize() when
 tf.contrib.framework.get_name_scope() == None.

PiperOrigin-RevId: 191068059
---
 .../contrib/quantize/python/quantize.py       |  4 +++-
 .../contrib/quantize/python/quantize_test.py  | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 2889016a84..d53d4d7b10 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -416,7 +416,9 @@ def _InsertQuantOp(context,
   # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
   # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
   # breaks things later.
-  name_prefix = common.DropStringPrefix(name_prefix, ops.get_name_scope() + '/')
+  name_scope = ops.get_name_scope()
+  if name_scope:
+    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')
 
   inputs = producer.outputs[0]
   # Prevent ops from being quantized multiple times. Bypass ops can sometimes
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 98f05c8bfc..8d057d3710 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -247,6 +247,27 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       self.assertTrue(not op.name.startswith('name_scope/name_scope/'),
                       'Broken op: %s' % op.name)
 
+  def testWithNullNameScope(self):
+    self._RunTestOverParameters(self._TestWithNullNameScope)
+
+  def _TestWithNullNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope(None):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+        quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+        # Passes if Quantize() does not crash.
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
-- 
GitLab


From ab3b1705bc2c546eb3607876fcdcc45902552346 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sat, 31 Mar 2018 00:36:25 +0900
Subject: [PATCH 0093/1262] cast svd output to float32 and use keepdims in test
 cases

---
 tensorflow/python/kernel_tests/norm_op_test.py | 4 ++--
 tensorflow/python/ops/linalg_ops.py            | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index 0e7d4fd9b9..dde28007d4 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -69,12 +69,12 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
       if use_static_shape_:
         tf_matrix = constant_op.constant(matrix)
         tf_norm = linalg_ops.norm(
-            tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_)
+            tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm)
       else:
         tf_matrix = array_ops.placeholder(dtype_)
         tf_norm = linalg_ops.norm(
-            tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_)
+            tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
     self.assertAllClose(np_norm, tf_norm_val)
 
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 86be1e7752..bbc39f58db 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -548,7 +548,8 @@ def norm(tensor,
             ops.convert_to_tensor(axis))
         axes = math_ops.range(rank)
         perm_before = array_ops.concat(
-            [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis], axis=0)
+            [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis],
+            axis=0)
         perm_after = functional_ops.map_fn(
             lambda i: math_ops.cast(
                 array_ops.squeeze(
@@ -557,7 +558,9 @@ def norm(tensor,
         permed = array_ops.transpose(tensor, perm=perm_before)
         matrix_2_norm = array_ops.expand_dims(
             math_ops.reduce_max(
-                gen_linalg_ops.svd(permed, compute_uv=False)[0],
+                math_ops.cast(
+                    gen_linalg_ops.svd(permed, compute_uv=False)[0],
+                    dtype=dtypes.float32),
                 axis=-1,
                 keepdims=True),
             axis=-1)
-- 
GitLab


From 6b1d9e788305c41cf436a1873c59df8d0df87d44 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sat, 31 Mar 2018 01:27:05 +0900
Subject: [PATCH 0094/1262] use abs instead of cast

---
 tensorflow/python/ops/linalg_ops.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index bbc39f58db..b306042aff 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -558,9 +558,7 @@ def norm(tensor,
         permed = array_ops.transpose(tensor, perm=perm_before)
         matrix_2_norm = array_ops.expand_dims(
             math_ops.reduce_max(
-                math_ops.cast(
-                    gen_linalg_ops.svd(permed, compute_uv=False)[0],
-                    dtype=dtypes.float32),
+                math_ops.abs(gen_linalg_ops.svd(permed, compute_uv=False)[0]),
                 axis=-1,
                 keepdims=True),
             axis=-1)
-- 
GitLab


From e995271f8550bf6b62fb9089ea702f2114783904 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Fri, 30 Mar 2018 09:41:29 -0700
Subject: [PATCH 0095/1262] Fix the index to match the left nav.

PiperOrigin-RevId: 191075496
---
 tensorflow/docs_src/programmers_guide/index.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index e8c2fa6990..017db0e8cb 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,6 +5,7 @@ works. The units are as follows:
 
 ## High Level APIs
 
+  * @{$programmers_guide/eager}, which is the easiest way to use tensorflow.
   * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
   * @{$programmers_guide/datasets}, which explains how to
-- 
GitLab


From 3d9c27742693f9859e2fb75de57fe108520de712 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 09:56:29 -0700
Subject: [PATCH 0096/1262] Fix several data races by acquiring locks.

(The racy accesses were detected by the thread safety annotations.)

PiperOrigin-RevId: 191077376
---
 tensorflow/core/common_runtime/executor.cc                  | 6 ++++--
 .../core/common_runtime/process_function_library_runtime.h  | 5 ++++-
 tensorflow/core/kernels/queue_op.h                          | 1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index b06b75d658..195803fd7f 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1022,7 +1022,8 @@ class ExecutorState {
     int total_input_tensors = 0;
     std::vector<const Node*>* nodes = nullptr;
 
-    // Lock ordering: ExecutorState.mu_ < mu.
+    // Lock ordering: ExecutorState.mu_ < mu;
+    // during structured traversal: parent_frame->mu < mu.
     mutex mu;
 
     void InitializeFrameInfo(const string& enter_name) {
@@ -2333,8 +2334,9 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) {
   FrameState* parent_frame = frame->parent_frame;
   const int64 parent_iter = frame->parent_iter;
   if (parent_frame != nullptr) {
-    mutex_lock paranet_frame_lock(parent_frame->mu);
+    mutex_lock parent_frame_lock(parent_frame->mu);
     // Propagate all the dead exits to the parent frame.
+    mutex_lock this_frame_lock(frame->mu);
     for (const Node* node : frame->dead_exits) {
       auto parent_iter_state = parent_frame->GetIteration(parent_iter);
       for (const Edge* e : node->out_edges()) {
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index d69e8bc2a0..c7b8259f78 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -155,7 +155,10 @@ class ProcessFunctionLibraryRuntime {
 
     string target_device() { return target_device_; }
 
-    FunctionLibraryRuntime::LocalHandle local_handle() { return local_handle_; }
+    FunctionLibraryRuntime::LocalHandle local_handle() {
+      mutex_lock l(mu_);
+      return local_handle_;
+    }
 
     // Initializes the FunctionData object by potentially making an Initialize
     // call to the DistributedFunctionLibraryRuntime.
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index ad606803ee..6c19f9841c 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -43,6 +43,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
 
   void Compute(OpKernelContext* context) override {
     ResourceOpKernel<QueueInterface>::Compute(context);
+    mutex_lock l(mu_);
     if (resource_ && context->track_allocations()) {
       context->record_persistent_memory_allocation(resource_->MemoryUsed());
     }
-- 
GitLab


From 0ecaf4d775f96b6c050de478f17dd647ff7e80e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 10:15:11 -0700
Subject: [PATCH 0097/1262] [XLA] Add IsConstant to the local Python client.

PiperOrigin-RevId: 191080094
---
 .../xla/python/local_computation_builder.cc   |  11 ++
 .../xla/python/local_computation_builder.h    |   7 +
 .../xla/python/local_computation_builder.i    | 164 ++++++++++--------
 tensorflow/compiler/xla/python/xla_client.py  |  14 ++
 .../compiler/xla/python/xla_client_test.py    |  11 ++
 5 files changed, 130 insertions(+), 77 deletions(-)

diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index b21ab3044f..2bacc6a914 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -521,6 +521,17 @@ ComputationDataHandle LocalComputationBuilder::Conditional(
                               false_computation.computation());
 }
 
+StatusOr<bool> LocalComputationBuilder::IsConstant(
+    const ComputationDataHandle& operand, int64 num_parameters) {
+  return builder_.IsConstant(operand, num_parameters);
+}
+
+StatusOr<std::unique_ptr<Literal>> LocalComputationBuilder::ComputeConstant(
+    const ComputationDataHandle& operand, const Layout* output_layout,
+    tensorflow::gtl::ArraySlice<Literal> parameters) {
+  return builder_.ComputeConstant(operand, output_layout, parameters);
+}
+
 #define _FORWARD(method_name, return_sig, args_sig, args)    \
   return_sig LocalComputationBuilder::method_name args_sig { \
     return builder_.method_name args;                        \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index a7375c8965..31046e60f1 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -268,6 +268,13 @@ class LocalComputationBuilder {
                                     const ComputationDataHandle& false_operand,
                                     const LocalComputation& false_computation);
 
+  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
+                            int64 num_parameters);
+
+  StatusOr<std::unique_ptr<Literal> > ComputeConstant(
+      const ComputationDataHandle& operand, const Layout* output_layout,
+      tensorflow::gtl::ArraySlice<Literal> parameters);
+
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 8f231d1a12..ac792e8189 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -182,7 +182,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
   const int64 handle = numpy::PyIntOrPyLongToLong($input);
   if (handle == -1 && PyErr_Occurred()) {
-    return NULL;
+    SWIG_fail;
   }
   temp.set_handle(handle);
   $1 = &temp;
@@ -201,7 +201,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -211,7 +211,7 @@ tensorflow::ImportNumpy();
     $result = numpy::PyObjectFromXlaLiteral(*value);
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -224,7 +224,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -233,7 +233,16 @@ tensorflow::ImportNumpy();
     $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
   }
 }
 
@@ -241,7 +250,7 @@ tensorflow::ImportNumpy();
   if (!$1.ok()) {
     PyErr_SetString(
         PyExc_RuntimeError, $1.ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   Py_INCREF(Py_None);
   $result = Py_None;
@@ -253,7 +262,7 @@ tensorflow::ImportNumpy();
     (std::vector<int64> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -265,13 +274,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i] = numpy::PyIntOrPyLongToLong(py_int);
     if (temps[i] == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(py_int);
     Py_DECREF(o);
@@ -285,7 +294,7 @@ tensorflow::ImportNumpy();
     (std::vector<ComputationDataHandle> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -296,13 +305,13 @@ tensorflow::ImportNumpy();
       PyErr_SetString(
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
-      return NULL;
+      SWIG_fail;
     }
     const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
     if (handle == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i].set_handle(handle);
     Py_DECREF(py_int);
@@ -317,7 +326,7 @@ tensorflow::ImportNumpy();
     (std::vector<LocalShapedBuffer*> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
@@ -326,7 +335,7 @@ tensorflow::ImportNumpy();
     LocalShapedBuffer* lsbp;
     if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
                          SWIG_POINTER_EXCEPTION)) == -1) {
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(lsbp);
     Py_DECREF(o);
@@ -340,7 +349,7 @@ tensorflow::ImportNumpy();
   literal_status = numpy::XlaLiteralFromPyObject($input);
   if (!literal_status.ok()) {
     PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $1 = literal_status.ValueOrDie().get();
 }
@@ -352,7 +361,7 @@ tensorflow::ImportNumpy();
 %typemap(out) StatusOr< std::unique_ptr<Literal> > {
   if (!$1.ok()) {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $result = numpy::PyObjectFromXlaLiteral(*$1.ValueOrDie());
 }
@@ -360,7 +369,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -369,7 +378,7 @@ tensorflow::ImportNumpy();
     if (!literal_status.ok()) {
       PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::move(*literal_status.ConsumeValueOrDie()));
     Py_DECREF(o);
@@ -383,7 +392,7 @@ tensorflow::ImportNumpy();
   StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -395,7 +404,7 @@ tensorflow::ImportNumpy();
   StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -410,7 +419,7 @@ tensorflow::ImportNumpy();
     StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temp = std::move(statusor).ValueOrDie();
     $1 = &temp;
@@ -424,7 +433,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -433,7 +442,7 @@ tensorflow::ImportNumpy();
     Py_DECREF(o);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(statusor.ConsumeValueOrDie());
   }
@@ -444,7 +453,7 @@ tensorflow::ImportNumpy();
     std::vector<tensorflow::gtl::optional<Shape> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -456,7 +465,7 @@ tensorflow::ImportNumpy();
       Py_DECREF(o);
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        return NULL;
+        SWIG_fail;
       }
       temps.push_back(statusor.ConsumeValueOrDie());
     }
@@ -470,18 +479,18 @@ tensorflow::ImportNumpy();
   PyObject* py_int = numpy::PyNumberToPyInt($input);
   if (!py_int) {
     PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    return NULL;
+    SWIG_fail;
   }
   const long value = numpy::PyIntOrPyLongToLong(py_int);
   if (value == -1 && PyErr_Occurred()) {
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   if (!PrimitiveType_IsValid(value)) {
     PyErr_SetString(
         PyExc_TypeError, "Argument not valid for PrimitiveType enum");
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   $1 = static_cast<PrimitiveType>(value);
 }
@@ -492,19 +501,19 @@ tensorflow::ImportNumpy();
     (std::vector<std::pair<int64, int64> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
     if (!o) {
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first = PyTuple_GetItem(o, 0);
     if (!first) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first_pyint = numpy::PyNumberToPyInt(first);
     if (!first_pyint) {
@@ -512,13 +521,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "First pair item cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second = PyTuple_GetItem(o, 1);
     if (!second) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second_pyint = numpy::PyNumberToPyInt(second);
     if (!second_pyint) {
@@ -527,21 +536,21 @@ tensorflow::ImportNumpy();
           "Second pair item cannot be converted to int");
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
     if (first_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
     if (second_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::make_pair(first_value, second_value));
     Py_DECREF(o);
@@ -559,26 +568,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "lhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -589,26 +598,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "rhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -619,26 +628,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
       $input, "lhs_batch_dimensions");
   if (!lhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -649,26 +658,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
       $input, "rhs_batch_dimensions");
   if (!rhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -684,20 +693,20 @@ tensorflow::ImportNumpy();
     (PaddingConfig padding_config) {
   PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
   if (!dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   int length = PySequence_Size(dimensions);
   if (length == -1) {
     Py_DECREF(dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(dimensions, i);
     if (!item) {
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     int64 edge_padding_low, edge_padding_high, interior_padding;
     if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
@@ -705,7 +714,7 @@ tensorflow::ImportNumpy();
         || !GetIntAttr(item, "interior_padding", &interior_padding)) {
       Py_DECREF(item);
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(item);
 
@@ -727,32 +736,32 @@ tensorflow::ImportNumpy();
   int64 value;
 
   if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_batch_dimension(value);
 
   if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_feature_dimension(value);
 
   if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_batch_dimension(value);
 
   if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_input_feature_dimension(value);
 
@@ -761,24 +770,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "input_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_input_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -787,24 +796,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_kernel_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -813,24 +822,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "output_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_output_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -865,12 +874,12 @@ tensorflow::ImportNumpy();
 
     PyObject* o = PyObject_GetAttrString($input, "hlo_profile");
     if (o == NULL) {
-      return NULL;
+      SWIG_fail;
     }
     if (o != Py_None) {
       if (!PyBool_Check(o)) {
         PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
-        return NULL;
+        SWIG_fail;
       }
       build_options.set_hlo_profile(o == Py_True);
     }
@@ -885,7 +894,7 @@ tensorflow::ImportNumpy();
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str());
         Py_DECREF(o);
-        return NULL;
+        SWIG_fail;
       }
       build_options.set_result_layout(statusor.ValueOrDie());
     }
@@ -951,6 +960,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::RngBernoulli;
 %unignore xla::swig::LocalComputationBuilder::While;
 %unignore xla::swig::LocalComputationBuilder::Conditional;
+%unignore xla::swig::LocalComputationBuilder::IsConstant;
 %unignore xla::swig::LocalComputationBuilder::Eq;
 %unignore xla::swig::LocalComputationBuilder::Ne;
 %unignore xla::swig::LocalComputationBuilder::Ge;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index e548d420f4..9c81f6439d 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -1028,6 +1028,20 @@ class ComputationBuilder(object):
             _unwrap_data_handle(false_operand),
             false_computation.c_local_computation))
 
+  def IsConstant(self, operand, num_parameters=0):
+    """Enqueues an IsConstant operation onto the computation.
+
+    Args:
+      operand: a ComputationDataHandle to test.
+      num_parameters: optional int, number of computation parameters to treat as
+        constant (default 0).
+
+    Returns: bool indicating whether `operand` is a compile-time constant,
+      meaning its value does not depend on parameters with index greater than or
+      equal to `num_parameters`.
+    """
+    return self._client.IsConstant(_unwrap_data_handle(operand), num_parameters)
+
   def Dot(self, lhs, rhs):
     """Enqueues a dot operation onto the computation.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 4c16c1f8b0..d97264ea64 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -855,6 +855,17 @@ class SingleOpTest(LocalComputationTest):
     self.assertTrue(np.all(lo <= result))
     self.assertTrue(np.all(result < hi))
 
+  def testIsConstant(self):
+    c = self._NewComputation()
+    a = c.ConstantS32Scalar(3)
+    b = c.ConstantS32Scalar(1)
+    x = c.ParameterFromNumpy(NumpyArrayS32(0))
+    const_expr = c.Sub(b, a)
+    non_const_expr = c.Mul(const_expr, x)
+    self.assertTrue(c.IsConstant(const_expr))
+    self.assertFalse(c.IsConstant(non_const_expr))
+    # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
+
 
 class EmbeddedComputationsTest(LocalComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
-- 
GitLab


From a1aa918af9c5c1c3f777abcc7dc9ab2929bb1001 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 10:47:58 -0700
Subject: [PATCH 0098/1262] Check for errors returned by ExecuteOnStream before
 calling BlockHostUntilDone. If ExecuteOnStream fails, BlockHostUntilDone will
 also fail in a way that makes debugging much harder.

RELNOTES: n/a
PiperOrigin-RevId: 191085159
---
 tensorflow/compiler/xla/service/executable.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index be92b1629a..471d2fd6ce 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -80,6 +80,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
 
   StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
+  TF_RETURN_IF_ERROR(return_value.status());
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
-- 
GitLab


From 528c64665f2c3220acb59031926274403b96dddb Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Fri, 30 Mar 2018 10:50:18 -0700
Subject: [PATCH 0099/1262] Create a wrapper for bfloat16 scope so that users
 don't need the custom getter.

PiperOrigin-RevId: 191085552
---
 tensorflow/contrib/tpu/BUILD                  | 12 +++
 tensorflow/contrib/tpu/__init__.py            |  1 +
 tensorflow/contrib/tpu/python/tpu/bfloat16.py | 78 +++++++++++++++++++
 .../contrib/tpu/python/tpu/bfloat16_test.py   | 52 +++++++++++++
 4 files changed, 143 insertions(+)
 create mode 100644 tensorflow/contrib/tpu/python/tpu/bfloat16.py
 create mode 100644 tensorflow/contrib/tpu/python/tpu/bfloat16_test.py

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 3e32a7a85c..b267cceef1 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -159,6 +159,7 @@ py_library(
     name = "tpu_lib",
     srcs = [
         "python/tpu/__init__.py",
+        "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
@@ -240,6 +241,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["python/tpu/bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
 tf_py_test(
     name = "tpu_infeed_test",
     size = "small",
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index ea6e874f2d..bb60f3e2d7 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -53,6 +53,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.bfloat16 import *
 from tensorflow.contrib.tpu.python.tpu.device_assignment import *
 from tensorflow.contrib.tpu.python.tpu.topology import *
 from tensorflow.contrib.tpu.python.tpu.tpu import *
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
new file mode 100644
index 0000000000..929d1824c3
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -0,0 +1,78 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      'bfloat16', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
new file mode 100644
index 0000000000..fda4331e89
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for bfloat16 helper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import bfloat16
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.platform import test
+
+
+class BFloat16ScopeTest(test.TestCase):
+
+  def testScopeName(self):
+    """Test if name for the variable scope is propogated correctly.
+    """
+    with bfloat16.bfloat16_scope() as bf:
+      self.assertEqual(bf.name, "bfloat16")
+
+  def testRequestedDType(self):
+    """Test if requested dtype is honored in the getter.
+    """
+    with bfloat16.bfloat16_scope() as scope:
+      v1 = variable_scope.get_variable("v1", [])
+      self.assertEqual(v1.dtype.base_dtype, dtypes.float32)
+      v2 = variable_scope.get_variable("v2", [], dtype=dtypes.bfloat16)
+      self.assertEqual(v2.dtype.base_dtype, dtypes.bfloat16)
+      self.assertEqual([dtypes.float32, dtypes.float32],
+                       [v.dtype.base_dtype for v in scope.global_variables()])
+
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From c4acdccbb7284c6a63e6824a7ee45ce7a86606b9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 11:12:24 -0700
Subject: [PATCH 0100/1262] Rename distributed_apply to _distributed_apply in
 OptimizerV2 to match the Optimizer base class.

PiperOrigin-RevId: 191089407
---
 tensorflow/contrib/optimizer_v2/optimizer_v2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 471992fdac..25d19578ea 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -866,7 +866,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, v in grads_and_vars],))
     return distribute_lib.get_tower_context().merge_call(
-        self.distributed_apply, filtered, global_step=global_step, name=name)
+        self._distributed_apply, filtered, global_step=global_step, name=name)
 
   def _get_or_create_state(self, var_list=None):
     """Either looks up or creates `_OptimizerV2State`.
@@ -899,7 +899,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       self._per_graph_state[graph_key] = per_graph_state
     return per_graph_state
 
-  def distributed_apply(self, distribution, grads_and_vars, global_step, name):
+  def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
     """`apply_gradients` for use with a `DistributionStrategy`."""
     reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
-- 
GitLab


From ff451cae700d0f0f17ff9d2dde32299344b21fb1 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Fri, 30 Mar 2018 11:21:31 -0700
Subject: [PATCH 0101/1262] Internal change.

PiperOrigin-RevId: 191090993
---
 .../main/java/org/tensorflow/lite/Interpreter.java  | 13 +++++++++++++
 .../tensorflow/lite/NativeInterpreterWrapper.java   | 12 ++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 14f461f5f9..a33959dca4 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -67,6 +67,19 @@ public final class Interpreter implements AutoCloseable {
     wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath());
   }
 
+  /**
+   * Initializes a {@code Interpreter} and specifies the number of threads used for inference.
+   *
+   * @param modelFile: a file of a pre-trained TF Lite model
+   * @param numThreads: number of threads to use for inference
+   */
+  public Interpreter(@NonNull File modelFile, int numThreads) {
+    if (modelFile == null) {
+      return;
+    }
+    wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath(), numThreads);
+  }
+
   /**
    * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
    *
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index dbf8f8f7cc..fc8187acfe 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -32,9 +32,13 @@ import java.util.Map;
 final class NativeInterpreterWrapper implements AutoCloseable {
 
   NativeInterpreterWrapper(String modelPath) {
+    this(modelPath, /* numThreads= */ -1);
+  }
+
+  NativeInterpreterWrapper(String modelPath, int numThreads) {
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModel(modelPath, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle, /* numThreads= */ -1);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
     isMemoryAllocated = true;
   }
 
@@ -44,11 +48,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
    * NativeInterpreterWrapper}.
    */
   NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer) {
-    modelByteBuffer = mappedByteBuffer;
-    errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
-    modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle, /* numThreads= */ -1);
-    isMemoryAllocated = true;
+    this(mappedByteBuffer, /* numThreads= */ -1);
   }
 
   /**
-- 
GitLab


From 5eef3a21a0df996c407a78cdfbdcdd11ce4f6f34 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 11:32:58 -0700
Subject: [PATCH 0102/1262] Break FileSystem's dependency on ThreadPool.

PiperOrigin-RevId: 191092932
---
 .../android/asset_manager_filesystem.cc       |   6 +
 .../android/asset_manager_filesystem.h        |   3 +
 .../contrib/makefile/proto_text_cc_files.txt  |   1 +
 tensorflow/core/BUILD                         |   1 +
 tensorflow/core/platform/file_system.cc       |  93 -------------
 tensorflow/core/platform/file_system.h        |   4 +-
 .../core/platform/file_system_helper.cc       | 126 ++++++++++++++++++
 tensorflow/core/platform/file_system_helper.h |  51 +++++++
 .../platform/hadoop/hadoop_file_system.cc     |   6 +
 .../core/platform/hadoop/hadoop_file_system.h |   3 +
 tensorflow/core/platform/null_file_system.h   |   6 +
 .../core/platform/posix/posix_file_system.cc  |   6 +
 .../core/platform/posix/posix_file_system.h   |   3 +
 tensorflow/core/platform/s3/s3_file_system.cc |   6 +
 tensorflow/core/platform/s3/s3_file_system.h  |   3 +
 .../platform/windows/windows_file_system.cc   |   4 +-
 tensorflow/core/util/memmapped_file_system.cc |   6 +
 tensorflow/core/util/memmapped_file_system.h  |   2 +
 18 files changed, 233 insertions(+), 97 deletions(-)
 create mode 100644 tensorflow/core/platform/file_system_helper.cc
 create mode 100644 tensorflow/core/platform/file_system_helper.h

diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc
index 380a652435..fe2d13e636 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.cc
+++ b/tensorflow/contrib/android/asset_manager_filesystem.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 
 namespace tensorflow {
 namespace {
@@ -243,6 +244,11 @@ bool AssetManagerFileSystem::DirectoryExists(const std::string& fname) {
   return AAssetDir_getNextFileName(dir.get()) != NULL;
 }
 
+Status AssetManagerFileSystem::GetMatchingPaths(const string& pattern,
+                                                std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status AssetManagerFileSystem::NewWritableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   return errors::Unimplemented("Asset storage is read only.");
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.h b/tensorflow/contrib/android/asset_manager_filesystem.h
index 665304b5ee..a87ff42ae2 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.h
+++ b/tensorflow/contrib/android/asset_manager_filesystem.h
@@ -66,6 +66,9 @@ class AssetManagerFileSystem : public FileSystem {
   Status DeleteDir(const string& d) override;
   Status RenameFile(const string& s, const string& t) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
  private:
   string RemoveAssetPrefix(const string& name);
 
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 77c936d8c5..76428bc1d4 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -12,6 +12,7 @@ tensorflow/core/platform/posix/env.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/env_time.cc
 tensorflow/core/platform/file_system.cc
+tensorflow/core/platform/file_system_helper.cc
 tensorflow/core/platform/env.cc
 tensorflow/core/platform/env_time.cc
 tensorflow/core/platform/setround.cc
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 21f7866abd..7d5ae1c5b5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -349,6 +349,7 @@ cc_library(
         "platform/env.h",
         "platform/env_time.h",
         "platform/file_system.h",
+        "platform/file_system_helper.h",
         "platform/fingerprint.h",
         "platform/init_main.h",
         "platform/logging.h",
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index a2f42f44ac..b55e94d552 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -28,28 +27,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-
-constexpr int kNumThreads = 8;
-
-// Run a function in parallel using a ThreadPool, but skip the ThreadPool
-// on the iOS platform due to its problems with more than a few threads.
-void ForEach(int first, int last, const std::function<void(int)>& f) {
-#if TARGET_OS_IPHONE
-  for (int i = first; i < last; i++) {
-    f(i);
-  }
-#else
-  int num_threads = std::min(kNumThreads, last - first);
-  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
-  for (int i = first; i < last; i++) {
-    threads.Schedule([f, i] { f(i); });
-  }
-#endif
-}
-
-}  // anonymous namespace
-
 FileSystem::~FileSystem() {}
 
 string FileSystem::TranslateName(const string& name) const {
@@ -94,76 +71,6 @@ bool FileSystem::FilesExist(const std::vector<string>& files,
   return result;
 }
 
-Status FileSystem::GetMatchingPaths(const string& pattern,
-                                    std::vector<string>* results) {
-  results->clear();
-  // Find the fixed prefix by looking for the first wildcard.
-  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
-  string eval_pattern = pattern;
-  std::vector<string> all_files;
-  string dir = io::Dirname(fixed_prefix).ToString();
-  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
-  // include . as the top level directory.
-  if (dir.empty()) {
-    dir = ".";
-    fixed_prefix = io::JoinPath(dir, fixed_prefix);
-    eval_pattern = io::JoinPath(dir, pattern);
-  }
-
-  // Setup a BFS to explore everything under dir.
-  std::deque<string> dir_q;
-  dir_q.push_back(dir);
-  Status ret;  // Status to return.
-  // children_dir_status holds is_dir status for children. It can have three
-  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
-  // if we don't calculate IsDirectory (we might do that because there isn't
-  // any point in exploring that child path).
-  std::vector<Status> children_dir_status;
-  while (!dir_q.empty()) {
-    string current_dir = dir_q.front();
-    dir_q.pop_front();
-    std::vector<string> children;
-    Status s = GetChildren(current_dir, &children);
-    ret.Update(s);
-    if (children.empty()) continue;
-    // This IsDirectory call can be expensive for some FS. Parallelizing it.
-    children_dir_status.resize(children.size());
-    ForEach(0, children.size(),
-            [this, &current_dir, &children, &fixed_prefix,
-             &children_dir_status](int i) {
-              const string child_path = io::JoinPath(current_dir, children[i]);
-              // In case the child_path doesn't start with the fixed_prefix then
-              // we don't need to explore this path.
-              if (!str_util::StartsWith(child_path, fixed_prefix)) {
-                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
-                                                "Operation not needed");
-              } else {
-                children_dir_status[i] = IsDirectory(child_path);
-              }
-            });
-    for (int i = 0; i < children.size(); ++i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // If the IsDirectory call was cancelled we bail.
-      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
-        continue;
-      }
-      // If the child is a directory add it to the queue.
-      if (children_dir_status[i].ok()) {
-        dir_q.push_back(child_path);
-      }
-      all_files.push_back(child_path);
-    }
-  }
-
-  // Match all obtained files to the input pattern.
-  for (const auto& f : all_files) {
-    if (Env::Default()->MatchPath(f, eval_pattern)) {
-      results->push_back(f);
-    }
-  }
-  return ret;
-}
-
 Status FileSystem::DeleteRecursively(const string& dirname,
                                      int64* undeleted_files,
                                      int64* undeleted_dirs) {
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 8f99766e15..077b1d79cf 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -138,10 +138,8 @@ class FileSystem {
   ///  * OK - no errors
   ///  * UNIMPLEMENTED - Some underlying functions (like GetChildren) are not
   ///                    implemented
-  /// The default implementation uses a combination of GetChildren, MatchPath
-  /// and IsDirectory.
   virtual Status GetMatchingPaths(const string& pattern,
-                                  std::vector<string>* results);
+                                  std::vector<string>* results) = 0;
 
   /// \brief Obtains statistics for the given path.
   virtual Status Stat(const string& fname, FileStatistics* stat) = 0;
diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc
new file mode 100644
index 0000000000..22c5057281
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.cc
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/platform.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+
+constexpr int kNumThreads = 8;
+
+// Run a function in parallel using a ThreadPool, but skip the ThreadPool
+// on the iOS platform due to its problems with more than a few threads.
+void ForEach(int first, int last, const std::function<void(int)>& f) {
+#if TARGET_OS_IPHONE
+  for (int i = first; i < last; i++) {
+    f(i);
+  }
+#else
+  int num_threads = std::min(kNumThreads, last - first);
+  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
+  for (int i = first; i < last; i++) {
+    threads.Schedule([f, i] { f(i); });
+  }
+#endif
+}
+
+}  // namespace
+
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results) {
+  results->clear();
+  // Find the fixed prefix by looking for the first wildcard.
+  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
+  string eval_pattern = pattern;
+  std::vector<string> all_files;
+  string dir = io::Dirname(fixed_prefix).ToString();
+  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
+  // include . as the top level directory.
+  if (dir.empty()) {
+    dir = ".";
+    fixed_prefix = io::JoinPath(dir, fixed_prefix);
+    eval_pattern = io::JoinPath(dir, pattern);
+  }
+
+  // Setup a BFS to explore everything under dir.
+  std::deque<string> dir_q;
+  dir_q.push_back(dir);
+  Status ret;  // Status to return.
+  // children_dir_status holds is_dir status for children. It can have three
+  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
+  // if we don't calculate IsDirectory (we might do that because there isn't
+  // any point in exploring that child path).
+  std::vector<Status> children_dir_status;
+  while (!dir_q.empty()) {
+    string current_dir = dir_q.front();
+    dir_q.pop_front();
+    std::vector<string> children;
+    Status s = fs->GetChildren(current_dir, &children);
+    ret.Update(s);
+    if (children.empty()) continue;
+    // This IsDirectory call can be expensive for some FS. Parallelizing it.
+    children_dir_status.resize(children.size());
+    ForEach(0, children.size(),
+            [fs, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!str_util::StartsWith(child_path, fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = fs->IsDirectory(child_path);
+              }
+            });
+    for (int i = 0; i < children.size(); ++i) {
+      const string child_path = io::JoinPath(current_dir, children[i]);
+      // If the IsDirectory call was cancelled we bail.
+      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
+        continue;
+      }
+      // If the child is a directory add it to the queue.
+      if (children_dir_status[i].ok()) {
+        dir_q.push_back(child_path);
+      }
+      all_files.push_back(child_path);
+    }
+  }
+
+  // Match all obtained files to the input pattern.
+  for (const auto& f : all_files) {
+    if (env->MatchPath(f, eval_pattern)) {
+      results->push_back(f);
+    }
+  }
+  return ret;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system_helper.h b/tensorflow/core/platform/file_system_helper.h
new file mode 100644
index 0000000000..8d812b0e38
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+#define TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class FileSystem;
+class Env;
+
+namespace internal {
+
+// Given a pattern, stores in 'results' the set of paths (in the given file
+// system) that match that pattern.
+//
+// This helper may be used by implementations of FileSystem::GetMatchingPaths()
+// in order to provide parallel scanning of subdirectories (except on iOS).
+//
+// Arguments:
+//   fs: may not be null and will be used to identify directories and list
+//       their contents.
+//   env: may not be null and will be used to check if a match has been found.
+//   pattern: see FileSystem::GetMatchingPaths() for details.
+//   results: will be cleared and may not be null.
+//
+// Returns an error status if any call to 'fs' failed.
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results);
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 74863293a3..9a71fbe2b7 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/posix/error.h"
@@ -396,6 +397,11 @@ Status HadoopFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status HadoopFileSystem::GetMatchingPaths(const string& pattern,
+                                          std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status HadoopFileSystem::DeleteFile(const string& fname) {
   hdfsFS fs = nullptr;
   TF_RETURN_IF_ERROR(Connect(fname, &fs));
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.h b/tensorflow/core/platform/hadoop/hadoop_file_system.h
index 5f2b222622..6af7a698ff 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.h
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.h
@@ -49,6 +49,9 @@ class HadoopFileSystem : public FileSystem {
 
   Status GetChildren(const string& dir, std::vector<string>* result) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/null_file_system.h b/tensorflow/core/platform/null_file_system.h
index 008e6d54d0..420abc1ada 100644
--- a/tensorflow/core/platform/null_file_system.h
+++ b/tensorflow/core/platform/null_file_system.h
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 
 namespace tensorflow {
 
@@ -65,6 +66,11 @@ class NullFileSystem : public FileSystem {
     return errors::Unimplemented("GetChildren unimplemented");
   }
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override {
+    return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+  }
+
   Status DeleteFile(const string& fname) override {
     return errors::Unimplemented("DeleteFile unimplemented");
   }
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 9a8021565c..47bfa020ce 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/posix/posix_file_system.h"
@@ -225,6 +226,11 @@ Status PosixFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status PosixFileSystem::GetMatchingPaths(const string& pattern,
+                                         std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status PosixFileSystem::DeleteFile(const string& fname) {
   Status result;
   if (unlink(TranslateName(fname).c_str()) != 0) {
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index 98ffa43b8a..e8898d0a97 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -47,6 +47,9 @@ class PosixFileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stats) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 301fcb9dbf..ee423699b2 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/s3/aws_logging.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
@@ -497,6 +498,11 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   return Status::OK();
 }
 
+Status S3FileSystem::GetMatchingPaths(const string& pattern,
+                                      std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status S3FileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31264be621..5d0565b378 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -46,6 +46,9 @@ class S3FileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stat) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 682e46e0fc..dc2efbeaf5 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/windows/error.h"
@@ -494,7 +495,8 @@ Status WindowsFileSystem::GetMatchingPaths(const string& pattern,
   // but no code appears to rely on this behavior.
   string converted_pattern(pattern);
   std::replace(converted_pattern.begin(), converted_pattern.end(), '\\', '/');
-  TF_RETURN_IF_ERROR(FileSystem::GetMatchingPaths(converted_pattern, results));
+  TF_RETURN_IF_ERROR(internal::GetMatchingPaths(this, Env::Default(),
+                                                converted_pattern, results));
   for (string& result : *results) {
     std::replace(result.begin(), result.end(), '/', '\\');
   }
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index a0f43d2d4a..ea0a381f4f 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -157,6 +157,12 @@ Status MemmappedFileSystem::GetChildren(const string& filename,
   return errors::Unimplemented("memmapped format doesn't support GetChildren");
 }
 
+Status MemmappedFileSystem::GetMatchingPaths(const string& pattern,
+                                             std::vector<string>* results) {
+  return errors::Unimplemented(
+      "memmapped format doesn't support GetMatchingPaths");
+}
+
 Status MemmappedFileSystem::DeleteFile(const string& filename) {
   return errors::Unimplemented("memmapped format doesn't support DeleteFile");
 }
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 541587aeab..76cc4911f5 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -85,6 +85,8 @@ class MemmappedFileSystem : public FileSystem {
   Status NewAppendableFile(const string& fname,
                            std::unique_ptr<WritableFile>* result) override;
   Status GetChildren(const string& dir, std::vector<string>* r) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
   Status DeleteFile(const string& f) override;
   Status CreateDir(const string& d) override;
   Status DeleteDir(const string& d) override;
-- 
GitLab


From 8cd922f95241b2bd6ce96c399c34e3ad137c705e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 11:43:05 -0700
Subject: [PATCH 0103/1262] Enable writing of summaries in
 ExamplesPerSecondHook

PiperOrigin-RevId: 191094585
---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index fa56708f44..6834600b79 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2019,7 +2019,8 @@ class TPUEstimator(estimator_lib.Estimator):
                   host_ops,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator)),
-              ExamplesPerSecondHook(ctx.global_batch_size),
+              ExamplesPerSecondHook(ctx.global_batch_size,
+                                    output_dir=self.model_dir),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
-- 
GitLab


From 918bd556b880e3d372b9c19d96ac6cfee0e3a852 Mon Sep 17 00:00:00 2001
From: James Keeling <jtkeeling@google.com>
Date: Tue, 27 Mar 2018 09:36:52 -0700
Subject: [PATCH 0104/1262] Prevent warning every time someone imports
 contrib.learn.datasets.base

Everything in contrib/learn/python/learn/datasets/base.py has been deprecated. One of the function in there is a decorator, retry. Because another function in that file is decorated with retry, the function is called upon import, which prints a warning.

I have fixed this by adding a private function, _internal_retry, which is used internally, and redefining retry to simply call this. That way, using retry in user-code will still print the deprecated warning, but it's not printed upon every import.

I also cleaned up the docstrings slightly.

PiperOrigin-RevId: 190626717
---
 .../learn/python/learn/datasets/base.py       | 35 ++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 3b5c9b97c0..4676eedb20 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -139,15 +139,48 @@ def retry(initial_delay,
 
   Args:
     initial_delay: the initial delay.
+    max_delay: the maximum delay allowed (actual max is
+        max_delay * (1 + jitter).
     factor: each subsequent retry, the delay is multiplied by this value.
         (must be >= 1).
     jitter: to avoid lockstep, the returned delay is multiplied by a random
         number between (1-jitter) and (1+jitter). To add a 20% jitter, set
         jitter = 0.2. Must be < 1.
+    is_retriable: (optional) a function that takes an Exception as an argument
+        and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
+  """
+  return _internal_retry(
+      initial_delay=initial_delay,
+      max_delay=max_delay,
+      factor=factor,
+      jitter=jitter,
+      is_retriable=is_retriable)
+
+
+def _internal_retry(initial_delay,
+                    max_delay,
+                    factor=2.0,
+                    jitter=0.25,
+                    is_retriable=None):
+  """Simple decorator for wrapping retriable functions, for internal use only.
+
+  Args:
+    initial_delay: the initial delay.
     max_delay: the maximum delay allowed (actual max is
         max_delay * (1 + jitter).
+    factor: each subsequent retry, the delay is multiplied by this value.
+        (must be >= 1).
+    jitter: to avoid lockstep, the returned delay is multiplied by a random
+        number between (1-jitter) and (1+jitter). To add a 20% jitter, set
+        jitter = 0.2. Must be < 1.
     is_retriable: (optional) a function that takes an Exception as an argument
         and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
   """
   if factor < 1:
     raise ValueError('factor must be >= 1; was %f' % (factor,))
@@ -195,7 +228,7 @@ def _is_retriable(e):
 
 
 @deprecated(None, 'Please use urllib or similar directly.')
-@retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
+@_internal_retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
 def urlretrieve_with_retry(url, filename=None):
   return urllib.request.urlretrieve(url, filename)
 
-- 
GitLab


From f270e43f886cc39bc4c0fb11e147b9c38853b3a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 12:26:21 -0700
Subject: [PATCH 0105/1262] Disable tsan for
 tensorflow/python/estimator:replicate_model_fn_test

It gets flaky failures.

PiperOrigin-RevId: 191100692
---
 tensorflow/python/estimator/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index f93bc221cc..5d8b19223f 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -966,5 +966,6 @@ cuda_py_test(
     tags = [
         "multi_gpu",
         "noasan",  # flaky time outs
+        "notsan",  # flaky
     ],
 )
-- 
GitLab


From 42b42fcab0031e36105cb1db105b70cbcfa6b125 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 30 Mar 2018 14:00:02 -0700
Subject: [PATCH 0106/1262] Remove unused imports

PiperOrigin-RevId: 191112880
---
 tensorflow/contrib/tpu/python/tpu/bfloat16.py      | 1 -
 tensorflow/contrib/tpu/python/tpu/bfloat16_test.py | 2 --
 2 files changed, 3 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
index 929d1824c3..5e49af6408 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16.py
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_contextlib
 
 
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
index fda4331e89..48a01c7308 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
@@ -20,9 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.tpu.python.tpu import bfloat16
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 
 from tensorflow.python.platform import test
-- 
GitLab


From 5bc8b00f3c62549ff6954fa8929afce85c4da46b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 14:17:49 -0700
Subject: [PATCH 0107/1262] tf.maximum: Correctly calculate output shape when
 broadcast is necessary.

PiperOrigin-RevId: 191115726
---
 tensorflow/contrib/lite/kernels/maximum.cc     | 18 +++++++++++++++---
 .../contrib/lite/kernels/maximum_test.cc       | 14 ++++++++++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/maximum.cc b/tensorflow/contrib/lite/kernels/maximum.cc
index 9fdf2b47ea..13c40603ce 100644
--- a/tensorflow/contrib/lite/kernels/maximum.cc
+++ b/tensorflow/contrib/lite/kernels/maximum.cc
@@ -52,9 +52,21 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   MaximumContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type);
-  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(op_context.input2->dims);
-  op_context.output->type = op_context.input2->type;
-  return context->ResizeTensor(context, op_context.output, output_dims);
+  op_context.output->type = op_context.input1->type;
+
+  bool requires_broadcast =
+      !HaveSameShapes(op_context.input1, op_context.input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(
+        context, CalculateShapeForBroadcast(context, op_context.input1,
+                                            op_context.input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(op_context.input1->dims);
+  }
+
+  return context->ResizeTensor(context, op_context.output, output_size);
 }
 
 template <KernelType kernel_type>
diff --git a/tensorflow/contrib/lite/kernels/maximum_test.cc b/tensorflow/contrib/lite/kernels/maximum_test.cc
index b3fd7d4e6f..df2bf29c20 100644
--- a/tensorflow/contrib/lite/kernels/maximum_test.cc
+++ b/tensorflow/contrib/lite/kernels/maximum_test.cc
@@ -71,6 +71,20 @@ TEST(MaximumOpTest, FloatTest) {
       ElementsAreArray(ArrayFloatNear({1.0, 0.0, 1.0, 12.0, -2.0, -1.43})));
 }
 
+TEST(MaximumOpTest, FloatWithBroadcastTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
+  std::initializer_list<float> data2 = {0.5, 2.0};
+  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}}, {TensorType_FLOAT32, {2}},
+                   TensorType_FLOAT32);
+  m.SetInput1<float>(data1);
+  m.SetInput2<float>(data2);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
+  EXPECT_THAT(
+      m.GetOutput<float>(),
+      ElementsAreArray(ArrayFloatNear({1.0, 2.0, 0.5, 2.0, 0.5, 11.0})));
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 79d38e6a570c351366481ab6d27a1e310da49046 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 30 Mar 2018 14:34:56 -0700
Subject: [PATCH 0108/1262] Updating release notes: changing minimum Cudnn
 supported version and adding tensorrt note.

---
 RELEASE.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index c63d9f20c9..886541572f 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,8 @@
 * Distributed Mutex / CriticalSection added to `tf.contrib.framework.CriticalSection`.
 * Better text processing with `tf.regex_replace`.
 * Easy, efficient sequence input with `tf.contrib.data.bucket_by_sequence_length`
+* Initial support for `tf.contrib.tensorrt` that enables native TensorRT in
+  TensorFlow.
 
 ## Bug Fixes and Other Changes
 * Accelerated Linear Algebra (XLA):
@@ -50,6 +52,12 @@
   * Support `float16` `dtype` in `tf.linalg.*`.
   * Add `tf.estimator.export.TensorServingInputReceiver` that allows `tf.estimator.Estimator.export_savedmodel` to pass raw tensors to model functions.
 
+## Deprecations
+
+* TensorFlow 1.7 may be the last time we support cuDNN versions below 6.0.
+  Starting with TensorFlow 1.8 release, 6.0 will be the minimum supported
+  version.
+
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
-- 
GitLab


From 57de6ef5b168ea4d9ef81eb2d187d74aa7617ff6 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 30 Mar 2018 14:37:34 -0700
Subject: [PATCH 0109/1262] Also adding a note about Cuda supported version

---
 RELEASE.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index 886541572f..e845953174 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -54,6 +54,9 @@
 
 ## Deprecations
 
+* TensorFlow 1.7 may be the last time we support Cuda versions below 8.0.
+  Starting with TensorFlow 1.8 release, 8.0 will be the minimum supported
+  version.
 * TensorFlow 1.7 may be the last time we support cuDNN versions below 6.0.
   Starting with TensorFlow 1.8 release, 6.0 will be the minimum supported
   version.
-- 
GitLab


From 15c10899c9c0e1717251b380330cc248b2c76c9c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 14:45:21 -0700
Subject: [PATCH 0110/1262]  show breakdown of total execution time with
 compute and memory time

PiperOrigin-RevId: 191119550
---
 tensorflow/core/grappler/costs/virtual_scheduler.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 3ac3ae0f8f..0e5c654acf 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -44,6 +44,8 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  result.compute_time += right.compute_time;
+  result.memory_time += right.memory_time;
   if (right.inaccurate) {
     result.inaccurate = true;
   }
@@ -841,6 +843,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
 Costs VirtualScheduler::Summary() const {
   // Print out basic execution summary.
   VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected compute time: " << graph_costs_.compute_time.count();
+  VLOG(1) << "Expected memory time: " << graph_costs_.memory_time.count();
   VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
   VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
   VLOG(1) << "Expected max per-op streaming buffers: "
-- 
GitLab


From 97731cb122f53552bd15351e046a256f78cca444 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Fri, 30 Mar 2018 14:56:08 -0700
Subject: [PATCH 0111/1262] Raise exception in SWIG on bad TF_Status from C
 API.

This change provides an alternative mechanism to
tf.raise_exception_on_not_ok_status(), which is inefficient and
error-prone (people often use the status multiple times in the with
block, but it's only checked when the context manager exits). Instead,
it uses SWIG to automatically raise an exception when a C API method
fails. Note that this removes the status argument from affected
methods.

For now, I've only applied this typemap to C API methods. It would be
good to expand this to all uses of raise_exception_on_not_ok_status.

PiperOrigin-RevId: 191121016
---
 tensorflow/c/c_api.h                          |  8 +-
 tensorflow/contrib/cmake/tf_python.cmake      |  2 +
 tensorflow/python/BUILD                       | 13 +++
 tensorflow/python/client/session.py           | 91 +++++++++----------
 .../client/session_list_devices_test.py       | 19 +---
 tensorflow/python/client/tf_session.i         | 33 ++++++-
 tensorflow/python/client/tf_session_helper.h  | 12 +--
 tensorflow/python/eager/context.py            |  8 +-
 tensorflow/python/eager/function.py           | 38 +++-----
 tensorflow/python/framework/errors_impl.py    |  3 +
 tensorflow/python/framework/function.py       | 34 +++----
 tensorflow/python/framework/importer.py       |  5 +-
 tensorflow/python/framework/load_library.py   |  7 +-
 tensorflow/python/framework/ops.py            | 87 +++++++-----------
 tensorflow/python/framework/smart_cond.py     |  6 +-
 .../python/lib/core/py_exception_registry.cc  | 50 ++++++++++
 .../python/lib/core/py_exception_registry.h   | 73 +++++++++++++++
 .../python/lib/core/py_exception_registry.i   | 28 ++++++
 tensorflow/python/tensorflow.i                |  2 +-
 19 files changed, 324 insertions(+), 195 deletions(-)
 create mode 100644 tensorflow/python/lib/core/py_exception_registry.cc
 create mode 100644 tensorflow/python/lib/core/py_exception_registry.h
 create mode 100644 tensorflow/python/lib/core/py_exception_registry.i

diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index b32f574628..fe85f8ee0e 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -1496,7 +1496,8 @@ TF_CAPI_EXPORT extern int TF_DeviceListCount(const TF_DeviceList* list);
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieves the type of the device at the given index.
 //
@@ -1506,14 +1507,15 @@ TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieve the amount of memory associated with a given device.
 //
 // If index is out of bounds, an error code will be set in the status object,
 // and -1 will be returned.
 TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes(
-    const TF_DeviceList* list, int index, TF_Status*);
+    const TF_DeviceList* list, int index, TF_Status* status);
 
 // --------------------------------------------------------------------------
 // Load plugins containing custom ops and kernels
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index b776307924..fae45ead5c 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -474,6 +474,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor_bridge.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index aa0acd243c..c502a3a42b 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -283,6 +283,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_exception_registry",
+    srcs = ["lib/core/py_exception_registry.cc"],
+    hdrs = ["lib/core/py_exception_registry.h"],
+    deps = [
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "kernel_registry",
     srcs = ["util/kernel_registry.cc"],
@@ -3313,6 +3324,7 @@ tf_py_wrap_cc(
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
         "lib/core/bfloat16.i",
+        "lib/core/py_exception_registry.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -3344,6 +3356,7 @@ tf_py_wrap_cc(
         ":kernel_registry",
         ":numpy_lib",
         ":safe_ptr",
+        ":py_exception_registry",
         ":py_func_lib",
         ":py_record_reader_lib",
         ":py_record_writer_lib",
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 5c9ed9ccaf..4c84d78f2e 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -27,7 +27,6 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -629,14 +628,12 @@ class BaseSession(SessionInterface):
     self._session = None
     opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          # pylint: disable=protected-access
-          self._session = tf_session.TF_NewSession(self._graph._c_graph, opts,
-                                                   status)
-          # pylint: enable=protected-access
-        else:
-          self._session = tf_session.TF_NewDeprecatedSession(opts, status)
+      if self._created_with_new_api:
+        # pylint: disable=protected-access
+        self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+        # pylint: enable=protected-access
+      else:
+        self._session = tf_session.TF_NewDeprecatedSession(opts)
     finally:
       tf_session.TF_DeleteSessionOptions(opts)
 
@@ -663,22 +660,20 @@ class BaseSession(SessionInterface):
     Returns:
       A list of devices in the session.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        raw_device_list = tf_session.TF_SessionListDevices(
-            self._session, status)
-      else:
-        raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
-            self._session, status)
-      device_list = []
-      size = tf_session.TF_DeviceListCount(raw_device_list)
-      for i in range(size):
-        name = tf_session.TF_DeviceListName(raw_device_list, i, status)
-        device_type = tf_session.TF_DeviceListType(raw_device_list, i, status)
-        memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i, status)
-        device_list.append(_DeviceAttributes(name, device_type, memory))
-      tf_session.TF_DeleteDeviceList(raw_device_list)
-      return device_list
+    if self._created_with_new_api:
+      raw_device_list = tf_session.TF_SessionListDevices(self._session)
+    else:
+      raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
+          self._session)
+    device_list = []
+    size = tf_session.TF_DeviceListCount(raw_device_list)
+    for i in range(size):
+      name = tf_session.TF_DeviceListName(raw_device_list, i)
+      device_type = tf_session.TF_DeviceListType(raw_device_list, i)
+      memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i)
+      device_list.append(_DeviceAttributes(name, device_type, memory))
+    tf_session.TF_DeleteDeviceList(raw_device_list)
+    return device_list
 
   def close(self):
     """Closes this session.
@@ -692,15 +687,13 @@ class BaseSession(SessionInterface):
     if self._created_with_new_api:
       if self._session and not self._closed:
         self._closed = True
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_CloseSession(self._session, status)
+        tf_session.TF_CloseSession(self._session)
 
     else:
       with self._extend_lock:
         if self._opened and not self._closed:
           self._closed = True
-          with errors.raise_exception_on_not_ok_status() as status:
-            tf_session.TF_CloseDeprecatedSession(self._session, status)
+          tf_session.TF_CloseDeprecatedSession(self._session)
 
   def __del__(self):
     # cleanly ignore all exceptions
@@ -710,11 +703,10 @@ class BaseSession(SessionInterface):
       pass
     if self._session is not None:
       try:
-        status = c_api_util.ScopedTFStatus()
         if self._created_with_new_api:
-          tf_session.TF_DeleteSession(self._session, status)
+          tf_session.TF_DeleteSession(self._session)
         else:
-          tf_session.TF_DeleteDeprecatedSession(self._session, status)
+          tf_session.TF_DeleteDeprecatedSession(self._session)
       except AttributeError:
         # At shutdown, `c_api_util` or `tf_session` may have been garbage
         # collected, causing the above method calls to fail. In this case,
@@ -1031,11 +1023,11 @@ class BaseSession(SessionInterface):
     # Set up a graph with feeds and fetches for partial run.
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          return tf_session.TF_SessionPRunSetup_wrapper(
-              session, feed_list, fetch_list, target_list, status)
-        else:
+      if self._created_with_new_api:
+        return tf_session.TF_SessionPRunSetup_wrapper(
+            session, feed_list, fetch_list, target_list)
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
           return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
                                          target_list, status)
 
@@ -1345,8 +1337,7 @@ class BaseSession(SessionInterface):
   def _extend_graph(self):
     if self._created_with_new_api:
       with self._graph._lock:  # pylint: disable=protected-access
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.ExtendSession(self._session, status)
+        tf_session.ExtendSession(self._session)
     else:
       # Ensure any changes to the graph are reflected in the runtime.
       with self._extend_lock:
@@ -1412,22 +1403,22 @@ class BaseSession(SessionInterface):
 
   def _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list,
                           run_metadata):
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        return tf_session.TF_SessionRun_wrapper(
-            self._session, options, feed_dict, fetch_list, target_list,
-            run_metadata, status)
-      else:
+    if self._created_with_new_api:
+      return tf_session.TF_SessionRun_wrapper(
+          self._session, options, feed_dict, fetch_list, target_list,
+          run_metadata)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
         return tf_session.TF_Run(
             self._session, options, feed_dict, fetch_list, target_list,
             status, run_metadata)
 
   def _call_tf_sessionprun(self, handle, feed_dict, fetch_list):
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        return tf_session.TF_SessionPRun_wrapper(
-            self._session, handle, feed_dict, fetch_list, status)
-      else:
+    if self._created_with_new_api:
+      return tf_session.TF_SessionPRun_wrapper(
+          self._session, handle, feed_dict, fetch_list)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
         return tf_session.TF_PRun(
             self._session, handle, feed_dict, fetch_list, status)
 
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 5a7413c12e..38a3acb2dc 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -23,7 +23,6 @@ from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
 from tensorflow.python.client import session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -42,21 +41,13 @@ class SessionListDevicesTestMethods(object):
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_session = tf_session.TF_NewSession(
-          ops.get_default_graph()._c_graph, opts, status)
-      raw_device_list = tf_session.TF_SessionListDevices(
-          c_session, status)
+    c_session = tf_session.TF_NewSession(ops.get_default_graph()._c_graph, opts)
+    raw_device_list = tf_session.TF_SessionListDevices(c_session)
     size = tf_session.TF_DeviceListCount(raw_device_list)
-    # Test that invalid device numbers return -1 rather than a Swig-wrapped
-    # pointer.
-    status_no_exception = c_api_util.ScopedTFStatus()
-    memory = tf_session.TF_DeviceListMemoryBytes(
-        raw_device_list, size, status_no_exception)
-    self.assertEqual(memory, -1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      tf_session.TF_DeviceListMemoryBytes(raw_device_list, size)
     tf_session.TF_DeleteDeviceList(raw_device_list)
-    with errors.raise_exception_on_not_ok_status() as status:
-      tf_session.TF_CloseSession(c_session, status)
+    tf_session.TF_CloseSession(c_session)
 
   def testListDevicesGrpcSession(self):
     server = server_lib.Server.create_local_server()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 77ce9195ee..5dcd0c192e 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -18,11 +18,12 @@ limitations under the License.
 %{
 
 #include "tensorflow/c/python_api.h"
-#include "tensorflow/python/client/tf_session_helper.h"
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/python/client/tf_session_helper.h"
+#include "tensorflow/python/lib/core/py_exception_registry.h"
 
 // Helper function to convert a Python list of Tensors to a C++ vector of
 // TF_Outputs.
@@ -352,6 +353,27 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
       reinterpret_cast<const char*>($1.data), $1.length);
 }
 
+// Typemaps to automatically raise a Python exception from bad output TF_Status.
+// TODO(b/77295559): expand this to all TF_Status* output params and deprecate
+// raise_exception_on_not_ok_status (currently it only affects the C API).
+%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
+  status = TF_NewStatus();
+  $1 = status;
+}
+
+%typemap(argout) TF_Status* status {
+  TF_Code code = TF_GetCode($1);
+  if (code != TF_OK) {
+    PyObject* exc = tensorflow::PyExceptionRegistry::Lookup(code);
+    // Arguments to OpError.
+    PyObject* exc_args = Py_BuildValue("sss", nullptr, nullptr, TF_Message($1));
+    TF_DeleteStatus($1);
+    SWIG_SetErrorObj(exc, exc_args);
+    SWIG_fail;
+  }
+  TF_DeleteStatus($1);
+}
+
 // Converts input Python list of wrapped TF_Outputs into a single array
 %typemap(in) (const TF_Output* inputs, int num_inputs)
     (std::vector<TF_Output> inputs) {
@@ -499,9 +521,8 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
       _TF_SetTarget(opts, target)
     if config is not None:
       from tensorflow.python.framework import errors
-      with errors.raise_exception_on_not_ok_status() as status:
-        config_str = config.SerializeToString()
-        _TF_SetConfig(opts, config_str, status)
+      config_str = config.SerializeToString()
+      _TF_SetConfig(opts, config_str)
     return opts
 %}
 
@@ -758,3 +779,7 @@ def TF_Reset(target, containers=None, config=None):
 %include "tensorflow/python/client/tf_session_helper.h"
 
 %unignoreall
+
+// Clear "TF_Status* status" typemap so it doesn't affect other modules and
+// unexpectedly remove the TF_Status* argument from wrappers.
+%clear TF_Status* status;
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 603d03e315..5416d41376 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -136,8 +136,7 @@ string EqualAttrValueWrapper(const string& actual, const string& expected);
 //
 // If shape is unknown, sets unknown_shape to true.
 tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
-    TF_Graph* graph, TF_Output output, TF_Status* out_status,
-    bool* unknown_shape);
+    TF_Graph* graph, TF_Output output, TF_Status* status, bool* unknown_shape);
 
 // Runs the graph associated with the session starting with the supplied inputs.
 // On success, `py_outputs` is populated with a numpy ndarray for each output
@@ -149,7 +148,7 @@ void TF_SessionRun_wrapper(TF_Session* session, const TF_Buffer* run_options,
                            const std::vector<PyObject*>& input_ndarrays,
                            const std::vector<TF_Output>& outputs,
                            const std::vector<TF_Operation*>& targets,
-                           TF_Buffer* run_metadata, TF_Status* out_status,
+                           TF_Buffer* run_metadata, TF_Status* status,
                            std::vector<PyObject*>* py_outputs);
 
 // Set up the graph with the intended feeds (inputs) and fetches (output) for
@@ -165,8 +164,7 @@ void TF_SessionPRunSetup_wrapper(TF_Session* session,
                                  const std::vector<TF_Output>& inputs,
                                  const std::vector<TF_Output>& outputs,
                                  const std::vector<TF_Operation*>& targets,
-                                 const char** out_handle,
-                                 TF_Status* out_status);
+                                 const char** out_handle, TF_Status* status);
 
 // Continue to run the graph with additional feeds and fetches. The
 // execution state is uniquely identified by the handle.
@@ -182,7 +180,7 @@ void TF_SessionPRun_wrapper(TF_Session* session, const char* handle,
                             const std::vector<TF_Output>& inputs,
                             const std::vector<PyObject*>& input_ndarrays,
                             const std::vector<TF_Output>& outputs,
-                            TF_Status* out_status,
+                            TF_Status* status,
                             std::vector<PyObject*>* py_outputs);
 
 // Retrieves the inputs of this operation.
@@ -204,7 +202,7 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
-    const char* description, TF_Status* out_status);
+    const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
 //
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 8c1bb06bc3..6ad9e0d88f 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -244,13 +244,9 @@ class Context(object):
       try:
         self._num_gpus = 0
         for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_name = pywrap_tensorflow.TF_DeviceListName(
-                device_list, i, status)
+          dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
           self._context_devices.append(pydev.canonical_name(dev_name))
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_type = pywrap_tensorflow.TF_DeviceListType(
-                device_list, i, status)
+          dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
           if dev_type == "GPU":
             self._num_gpus += 1
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 343012e552..711eddcec1 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -34,7 +34,6 @@ from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -79,14 +78,10 @@ def capture_value(tensor_map, value, dtype, name):
         ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
         shapes = [[d.size for d in s.dim]
                   if not s.unknown_rank else None for s in shapes]
-        with errors.raise_exception_on_not_ok_status() as status:
-          pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
-              captured_value._op._graph._c_graph,  # pylint: disable=protected-access
-              captured_value._as_tf_output(),  # pylint: disable=protected-access
-              shapes,
-              ranks,
-              types,
-              status)
+        pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+            captured_value._op._graph._c_graph,  # pylint: disable=protected-access
+            captured_value._as_tf_output(),  # pylint: disable=protected-access
+            shapes, ranks, types)
 
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
@@ -275,23 +270,20 @@ class _EagerDefinedFunction(object):
       inputs: the tensors in the graph to be used as inputs to the function
       outputs: the tensors in the graph which will be outputs to the function
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
-          graph._c_graph,  # pylint: disable=protected-access
-          compat.as_str(name),
-          False,
-          [o._c_op for o in operations],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
-          [],
-          None,
-          compat.as_str(""),
-          status)
+    fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+        graph._c_graph,  # pylint: disable=protected-access
+        compat.as_str(name),
+        False,
+        [o._c_op for o in operations],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+        [],
+        None,
+        compat.as_str(""))
     # TODO(apassos) avoid creating a FunctionDef (specially to grab the
     # signature, but also in general it's nice not to depend on it.
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+      pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     function_def = function_pb2.FunctionDef()
     function_def.ParseFromString(compat.as_bytes(proto_data))
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 2a40316d51..84106c32c6 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -473,6 +473,8 @@ _CODE_TO_EXCEPTION_CLASS = {
     DATA_LOSS: DataLossError,
 }
 
+c_api.PyExceptionRegistry_Init(_CODE_TO_EXCEPTION_CLASS)
+
 _EXCEPTION_CLASS_TO_CODE = dict((
     (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
 
@@ -499,6 +501,7 @@ def _make_specific_exception(node_def, op, message, error_code):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
+# TODO(b/77295559): expand use of TF_Status* SWIG typemap and deprecate this.
 @tf_export("errors.raise_exception_on_not_ok_status")  # pylint: disable=invalid-name
 class raise_exception_on_not_ok_status(object):
   """Context manager to check for C API status."""
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 82dd2a3356..c5caf9ebc0 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -30,7 +30,6 @@ from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -275,8 +274,7 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     if self._c_func:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_FunctionToFunctionDef(self._c_func, buf, status)
+        c_api.TF_FunctionToFunctionDef(self._c_func, buf)
         fdef = function_pb2.FunctionDef()
         proto_data = c_api.TF_GetBuffer(buf)
         fdef.ParseFromString(compat.as_bytes(proto_data))
@@ -399,18 +397,16 @@ class _DefinedFunction(object):
                       if self._out_names else [])
       description = self._func.__doc__ or None
       # pylint: disable=protected-access
-      with errors.raise_exception_on_not_ok_status() as status:
-        self._c_func = c_api.TF_GraphToFunction_wrapper(
-            temp_graph._c_graph,
-            base_func_name,
-            self._func_name is None,  # append_hash_to_fn_name
-            None,  # opers
-            [t._as_tf_output() for t in inputs],
-            [t._as_tf_output() for t in outputs],
-            output_names,
-            None,  # opts
-            description,
-            status)
+      self._c_func = c_api.TF_GraphToFunction_wrapper(
+          temp_graph._c_graph,
+          base_func_name,
+          self._func_name is None,  # append_hash_to_fn_name
+          None,  # opers
+          [t._as_tf_output() for t in inputs],
+          [t._as_tf_output() for t in outputs],
+          output_names,
+          None,  # opts
+          description)
       # pylint: enable=protected-access
       self._set_c_attrs(kwargs_attr)
 
@@ -433,9 +429,8 @@ class _DefinedFunction(object):
       serialized = attr_value.SerializeToString()
       # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
       # It might be worth creating a convenient way to re-use the same status.
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
-                                           serialized, status)
+      c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
+                                         serialized)
 
   def _create_hash_str(self, input_arg, output_arg, node_def):
     """Creates an 8-character string unique to this input.
@@ -830,8 +825,7 @@ def _from_definition(fdef, grad_func=None):
   # pylint: disable=protected-access
   if ops._USE_C_API:
     serialized = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      result._c_func = c_api.TF_FunctionImportFunctionDef(serialized, status)
+    result._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
     result._extra_inputs = []
   else:
     result._definition = fdef
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 4ea34d7bb2..23f529b988 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -485,9 +485,8 @@ def import_graph_def(graph_def,
     with graph._lock:  # pylint: disable=protected-access
       with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
         try:
-          with errors.raise_exception_on_not_ok_status() as status:
-            results = c_api.TF_GraphImportGraphDefWithResults(
-                graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+          results = c_api.TF_GraphImportGraphDefWithResults(
+              graph._c_graph, serialized, options)  # pylint: disable=protected-access
         except errors.InvalidArgumentError as e:
           # Convert to ValueError for backwards compatibility.
           raise ValueError(str(e))
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 1f2aa264c1..535c6017f5 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -26,7 +26,6 @@ import threading  # pylint: disable=unused-import
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-import
 from tensorflow.python import pywrap_tensorflow as py_tf
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,8 +53,7 @@ def load_op_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library or get the python wrappers.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  lib_handle = py_tf.TF_LoadLibrary(library_filename)
 
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
@@ -99,5 +97,4 @@ def load_file_system_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  py_tf.TF_LoadLibrary(library_filename)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 6930737a0c..7ca0b836dd 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -373,15 +373,12 @@ class Tensor(_TensorLike):
     """
     graph = self._op._graph._c_graph # pylint: disable=protected-access
     if graph and _USE_C_SHAPES:
-      with errors.raise_exception_on_not_ok_status() as status:
-        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
-                                                  status)
+      num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output())
       if num_dims == -1:
         dim_list = None
       else:
-        with errors.raise_exception_on_not_ok_status() as status:
-          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
-              graph, self._as_tf_output(), num_dims, status)
+        dim_list = c_api.TF_GraphGetTensorShape_wrapper(
+            graph, self._as_tf_output(), num_dims)
         dim_list = [None if i == -1 else i for i in dim_list]
       return tensor_shape.TensorShape(dim_list)
     return self._shape_val
@@ -489,13 +486,11 @@ class Tensor(_TensorLike):
         else:
           dim_list.append(dim.value)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_GraphSetTensorShape_wrapper(
-            self._op._graph._c_graph,  # pylint: disable=protected-access
-            self._as_tf_output(),
-            dim_list,
-            unknown_shape,
-            status)
+      c_api.TF_GraphSetTensorShape_wrapper(
+          self._op._graph._c_graph,  # pylint: disable=protected-access
+          self._as_tf_output(),
+          dim_list,
+          unknown_shape)
     except errors.InvalidArgumentError as e:
       # Convert to ValueError for backwards compatibility.
       raise ValueError(str(e))
@@ -1514,13 +1509,10 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
     serialized = attr_value.SerializeToString()
     # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
     # It might be worth creating a convenient way to re-use the same status.
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_api.TF_SetAttrValueProto(op_desc,
-                                 compat.as_str(name), serialized, status)
+    c_api.TF_SetAttrValueProto(op_desc, compat.as_str(name), serialized)
 
   try:
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_op = c_api.TF_FinishOperation(op_desc, status)
+    c_op = c_api.TF_FinishOperation(op_desc)
   except errors.InvalidArgumentError as e:
     # Convert to ValueError for backwards compatibility.
     raise ValueError(str(e))
@@ -1943,12 +1935,10 @@ class Operation(object):
     if self._c_op:
       # Reset cached inputs.
       self._inputs_val = None
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.UpdateEdge(
-            self._graph._c_graph,  # pylint: disable=protected-access
-            tensor._as_tf_output(),  # pylint: disable=protected-access
-            self._tf_input(index),
-            status)
+      c_api.UpdateEdge(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._tf_input(index))
     else:
       self._inputs_val[index].consumers().remove(self)
       self._inputs_val[index] = tensor
@@ -2169,8 +2159,7 @@ class Operation(object):
     # pylint: enable=line-too-long
     if self._c_op:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_OperationToNodeDef(self._c_op, buf, status)
+        c_api.TF_OperationToNodeDef(self._c_op, buf)
         data = c_api.TF_GetBuffer(buf)
       node_def = node_def_pb2.NodeDef()
       node_def.ParseFromString(compat.as_bytes(data))
@@ -2228,11 +2217,9 @@ class Operation(object):
       buf = c_api.TF_NewBufferFromString(
           compat.as_bytes(attr_value.SerializeToString()))
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf,
-                        status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf)
+        # pylint: enable=protected-access
       finally:
         c_api.TF_DeleteBuffer(buf)
     else:
@@ -2254,8 +2241,7 @@ class Operation(object):
     if self._c_op:
       try:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf, status)
+          c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
           data = c_api.TF_GetBuffer(buf)
       except errors.InvalidArgumentError as e:
         # Convert to ValueError for backwards compatibility.
@@ -2469,11 +2455,10 @@ def _set_shapes_for_outputs_c_api(op):
   # The C API computes the shapes when the TF_Operation is created. Fetch the
   # output shapes from the C object.
   for output in op.outputs:
-    with errors.raise_exception_on_not_ok_status() as status:
-      # pylint: disable=protected-access
-      shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
-          op._graph._c_graph, output._as_tf_output(), status)
-      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+        op._graph._c_graph, output._as_tf_output())
+    # pylint: enable=protected-access
     if unknown_shape:
       output.set_shape(tensor_shape.unknown_shape())
     elif not shape_vector:
@@ -2994,8 +2979,7 @@ class Graph(object):
     # pylint: enable=line-too-long
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_GraphVersions(self._c_graph, buf, status)
+        c_api.TF_GraphVersions(self._c_graph, buf)
         data = c_api.TF_GetBuffer(buf)
       version_def = versions_pb2.VersionDef()
       version_def.ParseFromString(compat.as_bytes(data))
@@ -3098,8 +3082,7 @@ class Graph(object):
     if self._c_graph:
       with self._lock:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_GraphToGraphDef(self._c_graph, buf, status)
+          c_api.TF_GraphToGraphDef(self._c_graph, buf)
           data = c_api.TF_GetBuffer(buf)
         graph = graph_pb2.GraphDef()
         graph.ParseFromString(compat.as_bytes(data))
@@ -3208,14 +3191,10 @@ class Graph(object):
       # remove this when all functions are generated using the C API by default
       # as this will be unnecessary.
       if not function._c_func:
-        with errors.raise_exception_on_not_ok_status() as status:
-          serialized = function.definition.SerializeToString()
-          function._c_func = c_api.TF_FunctionImportFunctionDef(
-              serialized, status)
-      with errors.raise_exception_on_not_ok_status() as status:
-        gradient = function._grad_func._c_func if function._grad_func else None
-        c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
-                                   status)
+        serialized = function.definition.SerializeToString()
+        function._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+      gradient = function._grad_func._c_func if function._grad_func else None
+      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient)
     else:
       # If there is already a function with the same name, raise an error
       # if bodies are different. Else, do nothing. The C API version above
@@ -3732,11 +3711,9 @@ class Graph(object):
     """Returns the `OpDef` proto for `type`. `type` is a string."""
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.TF_GraphGetOpDef(self._c_graph,
-                                 compat.as_bytes(type), buf, status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf)
+        # pylint: enable=protected-access
         data = c_api.TF_GetBuffer(buf)
       op_def = op_def_pb2.OpDef()
       op_def.ParseFromString(compat.as_bytes(data))
diff --git a/tensorflow/python/framework/smart_cond.py b/tensorflow/python/framework/smart_cond.py
index c7ff23e4ff..48a834392b 100644
--- a/tensorflow/python/framework/smart_cond.py
+++ b/tensorflow/python/framework/smart_cond.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow as c_api
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -83,9 +82,8 @@ def smart_constant_value(pred):
     # wanted to limit the change hidden behind _USE_C_API).
     # pylint: disable=protected-access
     if pred_value is None and ops._USE_C_API:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pred_value = c_api.TF_TryEvaluateConstant_wrapper(
-            pred.graph._c_graph, pred._as_tf_output(), status)
+      pred_value = c_api.TF_TryEvaluateConstant_wrapper(pred.graph._c_graph,
+                                                        pred._as_tf_output())
     # pylint: enable=protected-access
 
   else:
diff --git a/tensorflow/python/lib/core/py_exception_registry.cc b/tensorflow/python/lib/core/py_exception_registry.cc
new file mode 100644
index 0000000000..6637de632b
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+
+#include <Python.h>
+
+namespace tensorflow {
+
+PyExceptionRegistry* PyExceptionRegistry::singleton_ = nullptr;
+
+void PyExceptionRegistry::Init(PyObject* code_to_exc_type_map) {
+  DCHECK(singleton_ == nullptr) << "PyExceptionRegistry::Init() already called";
+  singleton_ = new PyExceptionRegistry;
+
+  DCHECK(PyDict_Check(code_to_exc_type_map));
+  PyObject* key;
+  PyObject* value;
+  Py_ssize_t pos = 0;
+  while (PyDict_Next(code_to_exc_type_map, &pos, &key, &value)) {
+    TF_Code code = static_cast<TF_Code>(PyLong_AsLong(key));
+    singleton_->exc_types_[code] = value;
+    // The exception classes should also have the lifetime of the process, but
+    // incref just in case.
+    Py_INCREF(value);
+  }
+}
+
+PyObject* PyExceptionRegistry::Lookup(TF_Code code) {
+  DCHECK(singleton_ != nullptr) << "Must call PyExceptionRegistry::Init() "
+                                   "before PyExceptionRegistry::Lookup()";
+  DCHECK_NE(code, TF_OK);
+  DCHECK(singleton_->exc_types_.find(code) != singleton_->exc_types_.end())
+      << "Unknown error code passed to PyExceptionRegistry::Lookup: " << code;
+  return singleton_->exc_types_[code];
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_exception_registry.h b/tensorflow/python/lib/core/py_exception_registry.h
new file mode 100644
index 0000000000..2b0f23b548
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+
+#include <map>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/platform/logging.h"
+
+#ifndef PyObject_HEAD
+struct _object;
+typedef _object PyObject;
+#endif
+
+namespace tensorflow {
+
+// Global registry mapping C API error codes to the corresponding custom Python
+// exception type. This is used to expose the exception types to C extension
+// code (i.e. so we can raise custom exceptions via SWIG).
+//
+// Init() must be called exactly once at the beginning of the process before
+// Lookup() can be used.
+//
+// Example usage:
+//   TF_Status* status = TF_NewStatus();
+//   TF_Foo(..., status);
+//
+//   if (TF_GetCode(status) != TF_OK) {
+//     PyObject* exc_type = PyExceptionRegistry::Lookup(TF_GetCode(status));
+//     // Arguments to OpError base class. Set `node_def` and `op` to None.
+//     PyObject* args =
+//       Py_BuildValue("sss", nullptr, nullptr, TF_Message(status));
+//     PyErr_SetObject(exc_type, args);
+//     Py_DECREF(args);
+//     TF_DeleteStatus(status);
+//     return NULL;
+//   }
+class PyExceptionRegistry {
+ public:
+  // Initializes the process-wide registry. Should be called exactly once near
+  // the beginning of the process. The arguments are the various Python
+  // exception types (e.g. `cancelled_exc` corresponds to
+  // errors.CancelledError).
+  static void Init(PyObject* code_to_exc_type_map);
+
+  // Returns the Python exception type corresponding to `code`. Init() must be
+  // called before using this function. `code` should not be TF_OK.
+  static PyObject* Lookup(TF_Code code);
+
+ private:
+  static PyExceptionRegistry* singleton_;
+  PyExceptionRegistry() = default;
+
+  // Maps error codes to the corresponding Python exception type.
+  std::map<TF_Code, PyObject*> exc_types_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
diff --git a/tensorflow/python/lib/core/py_exception_registry.i b/tensorflow/python/lib/core/py_exception_registry.i
new file mode 100644
index 0000000000..e872b74985
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.i
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+%}
+
+%ignoreall
+
+%unignore tensorflow::PyExceptionRegistry;
+%unignore tensorflow::PyExceptionRegistry::Init;
+
+%include "tensorflow/python/lib/core/py_exception_registry.h"
+%unignoreall
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index 82b908ac0e..26e8acd897 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -25,6 +25,7 @@ limitations under the License.
 %include "tensorflow/python/util/tfprof.i"
 
 %include "tensorflow/python/lib/core/py_func.i"
+%include "tensorflow/python/lib/core/py_exception_registry.i"
 
 %include "tensorflow/python/lib/io/py_record_reader.i"
 %include "tensorflow/python/lib/io/py_record_writer.i"
@@ -54,4 +55,3 @@ limitations under the License.
 %include "tensorflow/python/grappler/tf_optimizer.i"
 %include "tensorflow/python/grappler/cost_analyzer.i"
 %include "tensorflow/python/grappler/model_analyzer.i"
-
-- 
GitLab


From 32133ec21ff01f829bc2f2ac19a5f78632bc07dc Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 30 Mar 2018 15:00:41 -0700
Subject: [PATCH 0112/1262] Make tfe.Iterator work with async mode.

PiperOrigin-RevId: 191121622
---
 tensorflow/contrib/eager/python/datasets.py   | 24 +++++++-----
 .../data/kernel_tests/iterator_ops_test.py    |  9 +++++
 tensorflow/python/data/ops/iterator_ops.py    | 38 ++++++++++---------
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 60453006f4..99b1e098d5 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -107,16 +107,20 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    if self._buffer_resource_handle is not None:
-      with ops.device(self._device):
-        ret = prefetching_ops.function_buffering_resource_get_next(
-            function_buffer_resource=self._buffer_resource_handle,
-            output_types=self._flat_output_types)
-      return sparse.deserialize_sparse_tensors(
-          nest.pack_sequence_as(self._output_types, ret), self._output_types,
-          self._output_shapes, self._output_classes)
-    else:
-      return super(Iterator, self)._next_internal()
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      if self._buffer_resource_handle is not None:
+        with ops.device(self._device):
+          ret = prefetching_ops.function_buffering_resource_get_next(
+              function_buffer_resource=self._buffer_resource_handle,
+              output_types=self._flat_output_types)
+        return sparse.deserialize_sparse_tensors(
+            nest.pack_sequence_as(self._output_types, ret), self._output_types,
+            self._output_shapes, self._output_classes)
+      else:
+        return super(Iterator, self)._next_internal()
 
   # TODO(shivaniagrawal): Expose checkpointable stateful objects from dataset
   # attributes(potential).
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 4a14a915bd..0af282a024 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -717,6 +718,14 @@ class IteratorTest(test.TestCase):
       self.assertTrue(
           iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
 
+  def testEagerIteratorAsync(self):
+    with context.eager_mode(), context.execution_mode(context.ASYNC):
+      val = 0
+      dataset = dataset_ops.Dataset.range(10)
+      for foo in dataset:
+        self.assertEqual(val, foo.numpy())
+        val += 1
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d79b9d6011..0c76afd29d 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -488,23 +488,27 @@ class EagerIterator(object):
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    with ops.device(self._device):
-      # TODO(ashankar): Consider removing this ops.device() contextmanager
-      # and instead mimic ops placement in graphs: Operations on resource
-      # handles execute on the same device as where the resource is placed.
-      # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
-      # because in eager mode this code will run synchronously on the calling
-      # thread. Therefore we do not need to make a defensive context switch
-      # to a background thread, and can achieve a small constant performance
-      # boost by invoking the iterator synchronously.
-      ret = gen_dataset_ops.iterator_get_next_sync(
-          self._resource,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self._output_types, ret), self._output_types,
-        self._output_shapes, self._output_classes)
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        # TODO(ashankar): Consider removing this ops.device() contextmanager
+        # and instead mimic ops placement in graphs: Operations on resource
+        # handles execute on the same device as where the resource is placed.
+        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
+        # because in eager mode this code will run synchronously on the calling
+        # thread. Therefore we do not need to make a defensive context switch
+        # to a background thread, and can achieve a small constant performance
+        # boost by invoking the iterator synchronously.
+        ret = gen_dataset_ops.iterator_get_next_sync(
+            self._resource,
+            output_types=self._flat_output_types,
+            output_shapes=self._flat_output_shapes)
+
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
 
   def next(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
-- 
GitLab


From ef20c62273a5c20d5fce28fd779aa47fb2382ac5 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Fri, 30 Mar 2018 15:15:31 -0700
Subject: [PATCH 0113/1262] Improve the precision of Reduce in the interpreter
 when adding floats, by accumulating in a double.

Also, speed it up by not creating intermediate Literals. The microbenchmark now runs in < 1 ns, as opposed to 30 sec before.

PiperOrigin-RevId: 191123983
---
 .../compiler/xla/service/hlo_evaluator.cc     | 46 +++++++++---
 .../xla/service/hlo_evaluator_test.cc         | 75 +++++++++++++++++++
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 693004d364..9d7251b6ae 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1520,14 +1520,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       arg_dim_counts[dim] = arg_dimensions[dim];
     }
 
-    // Create mapping from result index to arg index.
-    const int64 result_rank = ShapeUtil::Rank(result->shape());
-    int64 result_dim = 0;
-    std::vector<int64> result_to_arg_index(result_rank);
+    // Map each dimension in the result to a dimension in arg that isn't
+    // being reduced.
+    std::vector<int64> result_to_arg_index;
     for (int64 i = 0; i < arg_dimensions.size(); ++i) {
       if (arg_dim_steps[i] == 0) {
-        result_to_arg_index[result_dim] = i;
-        ++result_dim;
+        result_to_arg_index.push_back(i);
       }
     }
 
@@ -1542,6 +1540,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             base[result_to_arg_index[i]] = multi_index[i];
           }
 
+          // When the reduction is addition of floats, accumulate in a double
+          // for better precision. Also, avoid creating Literals for the
+          // intermediate results; it's much faster.
+          if (ShapeUtil::ElementIsFloating(init_literal.shape()) &&
+              IsScalarAdd(function)) {
+            double computed_result = 0;
+            auto func = [&](ArraySlice<int64> input_index) {
+              computed_result += arg_literal.Get<float>(input_index);
+              return true;
+            };
+            ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
+                                    arg_dim_steps, func);
+            return static_cast<ReturnT>(computed_result);
+          }
           auto func = [&](ArraySlice<int64> input_index) {
             auto curr_val = arg_literal.Get<ReturnT>(input_index);
 
@@ -1554,19 +1566,17 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             std::unique_ptr<Literal> computed_result =
                 embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
-            // Clear visit states so that the we can use the evaluate again on
+            // Clear visit states so that we can use the evaluator again on
             // the same computation.
             embedded_evaluator.ResetVisitStates();
-
             // Assign computed result to result_val.
             result_val = computed_result->Get<ReturnT>({});
-
             return true;
           };
-
+          // Computes one element of the result, reducing all dimensions that
+          // contribute to that element.
           ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
                                   arg_dim_steps, func);
-
           return result_val;
         }));
 
@@ -1574,6 +1584,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  bool IsScalarAdd(HloComputation* computation) {
+    HloInstruction* instruction = computation->root_instruction();
+    if (instruction->opcode() == HloOpcode::kAdd &&
+        computation->num_parameters() == 2) {
+      const HloInstruction* lhs = instruction->operand(0);
+      const HloInstruction* rhs = instruction->operand(1);
+      return lhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(lhs->shape()) &&
+             rhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs;
+    }
+    return false;
+  }
+
   Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
     auto operand = select_and_scatter->operand(0);
     auto source = select_and_scatter->operand(1);
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 685cacd7f7..dd14dd3853 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1205,6 +1206,80 @@ TEST_P(HloEvaluatorTest,
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+
+// Tests that Reduce doesn't lose precision when adding many numbers (because
+// it accumulates its result in a double).
+TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
+  HloComputation::Builder b(TestName());
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  HloInstruction* init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module().AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  std::unique_ptr<Literal> result =
+      hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  LiteralTestUtil::ExpectR0Equal<float>(kNumElements, *result);
+}
+
+// Reducing many numbers should be fast because it doesn't create
+// intermediate Literals; the microbenchmark should finish in < 1 msec.
+void BM_ReducePrecisely(int num_iters) {
+  tensorflow::testing::StopTiming();
+  HloComputation::Builder b("BM_ReducePrecisely");
+  HloModuleConfig config;
+  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  HloModule module("BM_ReducePrecisely", VersionedComputationHandle(), config);
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  auto init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module.AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module.AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  tensorflow::testing::StartTiming();
+  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  tensorflow::testing::StopTiming();
+}
+
+BENCHMARK(BM_ReducePrecisely);
+
 TEST_P(HloEvaluatorTest, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
-- 
GitLab


From 954cffeb889671104dcbd53298c555a1bdb3639a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 15:18:18 -0700
Subject: [PATCH 0114/1262] Add attr when rewriter adds an XlaHostCompute Op
 indicating which outside_compilation_subgraph it corresponds to.

PiperOrigin-RevId: 191124345
---
 .../jit/encapsulate_subgraphs_pass.cc         |  1 +
 .../jit/encapsulate_subgraphs_pass_test.cc    | 30 ++++++++++++-------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 53ec6c1e60..b04b333141 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -825,6 +825,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       builder.Attr("key",
                    strings::StrCat("host_compute_channel_", subgraph_name, "_",
                                    oc_subgraph_name));
+      builder.Attr("_outside_compilation_subgraph", oc_subgraph_name);
       Status s = builder.Finalize(&host_compute_def);
       if (!s.ok()) return s;
 
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 56efe98fdb..8599a7038a 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -902,7 +902,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1046,7 +1047,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O2"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O2"}},
            {"F"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
@@ -1056,7 +1058,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"i_0_retval", "I:o:0"}});
@@ -1193,7 +1196,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
@@ -1214,7 +1218,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"key", "host_compute_channel_F2_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -1321,7 +1326,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1403,7 +1409,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1482,7 +1489,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
             {"Toutputs", gtl::ArraySlice<DataType>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1561,7 +1569,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
             {"Toutputs", gtl::ArraySlice<DataType>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1725,7 +1734,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
-- 
GitLab


From d0e883aceb21c611cadd6712f139164d18989568 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 15:18:48 -0700
Subject: [PATCH 0115/1262]   Add the waiting time and cross-replica-sum time
 to StepInfoResult.

PiperOrigin-RevId: 191124408
---
 tensorflow/contrib/tpu/profiler/tf_op_stats.proto | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 590db2c376..2a15875627 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -79,6 +79,10 @@ message StepInfoResult {
   optional uint64 infeed_duration_ps = 3;
   // The start time of this step in picoseconds.
   optional uint64 begin_ps = 4;
+  // The waiting time within this step in picoseconds.
+  optional uint64 wait_duration_ps = 5;
+  // The time spent on cross-replica-sum in picoseconds.
+  optional uint64 crs_duration_ps = 6;
 }
 
 // Result proto for a sequence of steps.
-- 
GitLab


From 03b07d7549dc0ee6f90c206c64a3d64906669975 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 15:35:14 -0700
Subject: [PATCH 0116/1262] [XLA] Add a reduce-window test. REL_NOTES:n/a
 PiperOrigin-RevId: 191126542

---
 .../compiler/xla/tests/reduce_window_test.cc  | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 9c317fe579..d6f580a6f9 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -252,6 +252,31 @@ TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
                            DefaultErrorSpec());
 }
 
+// Tests the super windowing logic w.r.t handling prime number of windows in a
+// major dimension with reduction.
+TEST_P(ReduceWindowTest, PrimeWindowsInReductionDimension) {
+  Array4D<float> input_array(15, 15, 4, 128);
+  input_array.FillRandom(2.f, 4.f);
+
+  int win_len = 3;
+  int win_stride = 2;
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  // Reduce only along the x and y dimensions, according to the win_len.
+  ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
+                  {win_stride, win_stride, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {win_len, win_len, 1, 1},
+      {win_stride, win_stride, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
 // Tests a reduction function that is not a simple add/min/max/etc.
 XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
-- 
GitLab


From 9bcecca7304473aeeba191776eeb6c15e96ad335 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 15:41:02 -0700
Subject: [PATCH 0117/1262] Restore definitions of static members in
 MklCpuAllocator.

These were removed in #17396 which made the static member variables of
MklCpuAllocator into inline variables, which are a C++17 feature, and not
properly restored in #18006 which reverted the inline declarations, leading to
an ODR violation that is apparently ignored with some compilers.

PiperOrigin-RevId: 191127281
---
 tensorflow/core/common_runtime/mkl_cpu_allocator.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
index 829c19204a..43a909466e 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr const char* MklCPUAllocator::kMaxLimitStr;
+constexpr const size_t MklCPUAllocator::kDefaultMaxLimit;
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
-- 
GitLab


From a90a47f86082878c911d18529728f81bbd50d33c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 15:51:12 -0700
Subject: [PATCH 0118/1262] Doc string clean-ups for class
 DistributionStrategy.

PiperOrigin-RevId: 191128500
---
 tensorflow/python/training/distribute.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index b11412cac7..c44627eadb 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -376,7 +376,9 @@ class DistributionStrategy(object):
     update. Allreduce is an algorithm for performing a reduction on
     values from multiple devices and making the result available on
     all of those devices.
-  * TODO(josh11b): Future: partitioned variables
+  * In the future we will have support for TensorFlows' partitioned
+    variables, where a single variable is split across multiple
+    devices.
 
   We have then a few approaches we want to support:
   * Code written (as if) with no knowledge of class `DistributionStrategy`.
@@ -390,7 +392,6 @@ class DistributionStrategy(object):
     ```
     with my_distribution.scope():
       iterator = my_distribution.distribute_dataset(dataset)
-      # TODO(josh11b): iterator = dataset.make_one_shot_iterator()
       tower_train_ops = my_distribution.call_for_each_tower(
           tower_fn, iterator.get_next())
       train_op = tf.group(my_distribution.unwrap(tower_train_ops))
@@ -402,6 +403,10 @@ class DistributionStrategy(object):
     using `my_distribution`'s policy, and library functions called by
     `tower_fn` can use the `get_tower_context()` API to get enhanced
     behavior in this case.
+
+    Note that in the future we will add support for initializable
+    Dataset iterators, at which point this example code will change.
+
   * If you want to write a distributed algorithm, you may use any of
     the `DistributionStrategy` APIs inside a
     `with my_distribution.scope():` block of code.
@@ -514,7 +519,7 @@ class DistributionStrategy(object):
 
   Steps 3 and 4 are done automatically by class `Optimizer` if you call
   its `apply_gradients` method in a tower context. Otherwise you can
-  manually call its `distributed_apply` method in a cross-tower context.
+  manually call its `_distributed_apply` method in a cross-tower context.
 
   Another thing you might want to do in the middle of your tower function
   is an all-reduce of some intermediate value, using `d.reduce()` or
-- 
GitLab


From 36fef8baaaa461dc7bcb65d9c7c7f8796fc80c21 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 16:16:13 -0700
Subject: [PATCH 0119/1262] Implement strip CheckNumerics in DebugStripper.

PiperOrigin-RevId: 191131935
---
 tensorflow/core/grappler/op_types.cc          |  4 ++
 tensorflow/core/grappler/op_types.h           |  1 +
 .../grappler/optimizers/debug_stripper.cc     |  5 ++
 .../optimizers/debug_stripper_test.cc         | 59 +++++++++++++++++++
 4 files changed, 69 insertions(+)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index c31ac9b59c..e0ee49d157 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -68,6 +68,10 @@ bool IsBitcast(const NodeDef& node) { return node.op() == "Bitcast"; }
 
 bool IsCast(const NodeDef& node) { return node.op() == "Cast"; }
 
+bool IsCheckNumerics(const NodeDef& node) {
+  return node.op() == "CheckNumerics";
+}
+
 bool IsComplex(const NodeDef& node) { return node.op() == "Complex"; }
 
 bool IsComplexAbs(const NodeDef& node) { return node.op() == "ComplexAbs"; }
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 39affcbc24..aa6750d5c3 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -37,6 +37,7 @@ bool IsBiasAdd(const NodeDef& node);
 bool IsBiasAddGrad(const NodeDef& node);
 bool IsBitcast(const NodeDef& node);
 bool IsCast(const NodeDef& node);
+bool IsCheckNumerics(const NodeDef& node);
 bool IsComplex(const NodeDef& node);
 bool IsComplexAbs(const NodeDef& node);
 bool IsConj(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc
index 0e058e3435..8bd10171f1 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -39,6 +40,10 @@ Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item,
           inp = AsControlDependency(inp);
         }
       }
+    } else if (IsCheckNumerics(node)) {
+      // Replace with Identity op which will be pruned later.
+      node.set_op("Identity");
+      node.mutable_attr()->erase("message");
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
index c79c36841d..3f11febc64 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -105,6 +105,65 @@ TEST_F(DebugStripperTest, StripAssertFromGraph) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
+TEST_F(DebugStripperTest, StripCheckNumericsFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto check1 = ops::CheckNumerics(s.WithOpName("CheckNumerics1"), x, "foo");
+  auto check2 = ops::CheckNumerics(s.WithOpName("CheckNumerics2"), y, "foo");
+  Output add = ops::Add(s.WithOpName("z"), check1, check2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "CheckNumerics1") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "CheckNumerics2") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("CheckNumerics1", node.input(0));
+      EXPECT_EQ("CheckNumerics2", node.input(1));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From 5f6f3198dcba3fcf38aab4a44093fc8c02c49bbc Mon Sep 17 00:00:00 2001
From: Neal Wu <wun@google.com>
Date: Fri, 30 Mar 2018 16:31:31 -0700
Subject: [PATCH 0120/1262] Simple fixes for documentation on
 tf.nn.softmax_cross_entropy_with_logits (and v2).

PiperOrigin-RevId: 191134015
---
 tensorflow/python/ops/nn_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 0c55386241..07ca32953f 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1808,7 +1808,7 @@ def softmax_cross_entropy_with_logits_v2(
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
-  backpropagation into `labels`, pass label tensors through a `stop_gradients`
+  backpropagation into `labels`, pass label tensors through @{tf.stop_gradient}
   before feeding it to this function.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
@@ -1895,7 +1895,7 @@ _XENT_DEPRECATION = """
 Future major versions of TensorFlow will allow gradients to flow
 into the labels input on backprop by default.
 
-See tf.nn.softmax_cross_entropy_with_logits_v2.
+See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
 """
 
 
-- 
GitLab


From e75f554199f13d13775cdde8394fdf8d0683b6af Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Fri, 30 Mar 2018 17:22:41 -0700
Subject: [PATCH 0121/1262] Support int64 slice spec for StridedSliceGrad.
 Currently this doesn't work since _StridedSliceGrad always passes a int32
 shape which restricts indices values to be of the same type. Add a test that
 fails before and passes after the change.

PiperOrigin-RevId: 191140718
---
 tensorflow/python/kernel_tests/array_ops_test.py | 8 ++++++++
 tensorflow/python/ops/array_grad.py              | 7 ++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 64c1760d5e..78bdb7eda7 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -780,6 +780,14 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       grad = GradSliceChecker(self, sess, var, np.array(8))
       _ = grad[tuple()]
 
+  def testInt64Indices(self):
+    with self.test_session(use_gpu=True) as sess:
+      a = math_ops.range(3)
+      index = constant_op.constant(1, dtype=dtypes.int64)
+      b = 2 * a[index]
+      grad, = gradients_impl.gradients(b, a)
+      self.assertAllEqual(sess.run(grad), [0, 2, 0])
+
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
   """Test varied index types and host located memory."""
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 3c6a5c9e56..57d2657838 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -255,10 +255,15 @@ def _SliceGrad(op, grad):
 @ops.RegisterGradient("StridedSlice")
 def _StridedSliceGrad(op, grad):
   """Gradient for StridedSlice op."""
-  x = array_ops.shape(op.inputs[0])
   begin = op.inputs[1]
   end = op.inputs[2]
   strides = op.inputs[3]
+  # StridedSliceGrad requires `x`, `begin`, `end` and `strides` to be of the
+  # same dtype so we build a shape of the same type as other args.
+  # Note that the choice of `begin` for specifying `out_type` is arbitrary.
+  # We could choose any of {begin|end|strides}.dtype since they are required to
+  # be the same.
+  x = array_ops.shape(op.inputs[0], out_type=begin.dtype)
 
   return array_ops.strided_slice_grad(
       x,
-- 
GitLab


From 7690f8e3a3fdc9dd3da64a41093aac0acbbf1fdc Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Fri, 30 Mar 2018 20:36:03 -0700
Subject: [PATCH 0122/1262] Log large allocations and total memory usage for
 the CPU device.

PiperOrigin-RevId: 191152060
---
 tensorflow/core/framework/allocator.cc     | 38 +++++++++++++++++++++-
 tensorflow/core/grappler/clusters/utils.cc |  2 +-
 tensorflow/core/platform/mem.h             |  2 +-
 tensorflow/core/platform/posix/port.cc     |  2 +-
 tensorflow/core/platform/windows/port.cc   |  2 +-
 5 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index a382b8be95..6182f95f28 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -61,6 +61,26 @@ static bool cpu_allocator_collect_stats = false;
 // If true, cpu allocator collects full stats.
 static bool cpu_allocator_collect_full_stats = false;
 
+// Individual allocations large than this amount will trigger a warning.
+static const double kLargeAllocationWarningThreshold = 0.1;
+
+// If cpu_allocator_collect_stats is true, warn when the total allocated memory
+// exceeds this threshold.
+static const double kTotalAllocationWarningThreshold = 0.5;
+
+// Cache first invocation to port::AvailableRam, as it can be expensive.
+static int64_t LargeAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kLargeAllocationWarningThreshold);
+  return value;
+}
+
+static int64_t TotalAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kTotalAllocationWarningThreshold);
+  return value;
+}
+
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
@@ -70,7 +90,8 @@ void EnableCPUAllocatorFullStats(bool enable) {
 
 class CPUAllocator : public VisitableAllocator {
  public:
-  CPUAllocator() : allocation_begun_(false) {}
+  CPUAllocator()
+      : total_allocation_warning_triggered_(false), allocation_begun_(false) {}
 
   ~CPUAllocator() override {}
 
@@ -81,6 +102,12 @@ class CPUAllocator : public VisitableAllocator {
       allocation_begun_ = true;
     }
 
+    if (num_bytes > LargeAllocationWarningBytes()) {
+      LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
+                   << 100 * kLargeAllocationWarningThreshold
+                   << "% of system memory.";
+    }
+
     void* p = port::AlignedMalloc(num_bytes, alignment);
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
@@ -91,6 +118,14 @@ class CPUAllocator : public VisitableAllocator {
           std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
       stats_.max_alloc_size =
           std::max<int64>(stats_.max_alloc_size, alloc_size);
+
+      if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
+          !total_allocation_warning_triggered_) {
+        LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use
+                     << "exceeds " << 100 * kTotalAllocationWarningThreshold
+                     << "% of system memory";
+        total_allocation_warning_triggered_ = true;
+      }
     }
 
     // visit each Visitor in alloc_visitors_
@@ -162,6 +197,7 @@ class CPUAllocator : public VisitableAllocator {
  private:
   mutex mu_;
   AllocatorStats stats_ GUARDED_BY(mu_);
+  bool total_allocation_warning_triggered_ GUARDED_BY(mu_);
 
   // visitor_mutex_ protects write access to alloc_visitors_ and free_visitors_.
   // While write access is mutually exclusive, reads may happen concurrently.
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index b54b34959a..50d6e6468f 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -54,7 +54,7 @@ DeviceProperties GetLocalCPUInfo() {
 
   int64 free_mem = port::AvailableRam();
   if (free_mem < INT64_MAX) {
-    device.set_memory_size(free_mem * 1024);
+    device.set_memory_size(free_mem);
   }
 
   (*device.mutable_environment())["cpu_instruction_set"] =
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index 7bb9fc264f..fca3a2332d 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -59,7 +59,7 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
-// Returns the amount of RAM available in kB, or INT64_MAX if unknown.
+// Returns the amount of RAM available in bytes, or INT64_MAX if unknown.
 int64 AvailableRam();
 
 }  // namespace port
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 494acde803..8e316472fe 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -177,7 +177,7 @@ int64 AvailableRam() {
   struct sysinfo info;
   int err = sysinfo(&info);
   if (err == 0) {
-    return info.freeram / 1024;
+    return info.freeram;
   }
 #endif
   return INT64_MAX;
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index f3b27ea394..174f41a993 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -166,7 +166,7 @@ int64 AvailableRam() {
   MEMORYSTATUSEX statex;
   statex.dwLength = sizeof(statex);
   if (GlobalMemoryStatusEx(&statex)) {
-    return statex.ullAvailPhys / 1024;
+    return statex.ullAvailPhys;
   }
   return INT64_MAX;
 }
-- 
GitLab


From d2a4eee2012b3f5b0a221d00384971dc7b5e425b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 30 Mar 2018 22:55:58 -0700
Subject: [PATCH 0123/1262] Add a unit test to verify that dependency optimizer
 could remove noop and greaterequal node in a simple graph.

PiperOrigin-RevId: 191157450
---
 .../optimizers/dependency_optimizer_test.cc   | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 57b3118245..6a297da52d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -678,6 +678,50 @@ TEST_F(DependencyOptimizerTest, Identity_DeviceCrossing_ConsumerOnSameDevice) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveGreaterEqualWithNoOp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto noop =
+      ops::NoOp(s.WithOpName("NoOp").WithControlDependencies(greaterequal));
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({noop.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  item.fetch.push_back("z");
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+    } else if (node.name() == "NoOp") {
+      count++;
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+  }
+  EXPECT_EQ(3, count);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From d2c9c95566ad1945c7ac24da41953effc6f3cf68 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Fri, 30 Mar 2018 23:37:48 -0700
Subject: [PATCH 0124/1262] Fix typo

PiperOrigin-RevId: 191159820
---
 tensorflow/docs_src/mobile/tflite/devguide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
index 5b521dca7b..96392a3c9b 100644
--- a/tensorflow/docs_src/mobile/tflite/devguide.md
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -88,7 +88,7 @@ Tensorflow Lite format. This process uses several model formats:
   extracted from a `SavedModel`.
 * *TensorFlow Lite model* (.tflite) —A serialized
   [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
-  Lite operators and tensors for the TensorFlow Lite interpreter, similiar to a
+  Lite operators and tensors for the TensorFlow Lite interpreter, similar to a
   `FrozenGraphDef`.
 
 ### Freeze Graph
-- 
GitLab


From 3bf08422a2cdd732e9b00debe3d217d04473902d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sun, 1 Apr 2018 09:56:48 +0800
Subject: [PATCH 0125/1262] CLN: remove use_nesterov argument

---
 .../base_api/api_def_ApplyAdaMax.pbtxt        |   6 -
 .../api_def_ResourceApplyAdaMax.pbtxt         |   6 -
 tensorflow/core/kernels/training_ops.cc       | 204 +++++++++++-------
 tensorflow/core/kernels/training_ops.h        |   2 +-
 .../core/kernels/training_ops_gpu.cu.cc       |   2 +-
 tensorflow/core/ops/training_ops.cc           |   2 -
 6 files changed, 133 insertions(+), 89 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
index 106c30ca83..57938b42ae 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -72,12 +72,6 @@ END
 If `True`, updating of the var, m, and v tensors will be protected
 by a lock; otherwise the behavior is undefined, but may exhibit less
 contention.
-END
-  }
-  attr {
-    name: "use_nesterov"
-    description: <<END
-Always `False`, unsupported argument.
 END
   }
   summary: "Update \'*var\' according to the AdaMax algorithm."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
index 5b81e50a07..57fae3cb57 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -66,12 +66,6 @@ END
 If `True`, updating of the var, m, and v tensors will be protected
 by a lock; otherwise the behavior is undefined, but may exhibit less
 contention.
-END
-  }
-  attr {
-    name: "use_nesterov"
-    description: <<END
-Always `False`, unsupported argument.
 END
   }
   summary: "Update \'*var\' according to the AdaMax algorithm."
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 45c600fd40..1a8d08288b 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -338,10 +338,7 @@ struct ApplyAdaMaxNonCuda {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
-    if (use_nesterov) {
-      LOG(WARNING) << "AdaMax doesn't support use_nesterov yet, ignore it.";
-    }
+                  typename TTypes<T>::ConstFlat grad) {
     m.device(d) += (grad - m) * (T(1) - beta1());
     // Here v is u in section 7.1
     v.device(d) = (beta2() * v).cwiseMax(grad.abs());
@@ -350,20 +347,6 @@ struct ApplyAdaMaxNonCuda {
   }
 };
 
-#ifdef TENSORFLOW_USE_SYCL
-template <typename T>
-struct ApplyAdaMaxSYCL {
-  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
-                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
-                  T beta1_power, T beta2_power, T lr, T beta1, T beta2,
-                  T epsilon, typename TTypes<T>::ConstFlat grad) {
-    m.device(d) += (grad - m) * (T(1) - beta1);
-    v.device(d) = (beta2 * v).cwiseMax(grad.abs());
-    var.device(d) -= lr / (T(1) - beta1_power) * (m / (v + epsilon));
-  }
-};
-#endif  // TENSORFLOW_USE_SYCL
-
 template <typename T>
 struct ApplyAdaMax<CPUDevice, T> : ApplyAdaMaxNonCuda<CPUDevice, T> {};
 
@@ -2516,12 +2499,10 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-template <typename Device, typename T,
-          template <typename Device2, typename T2>
-          class Functor>
-class ApplyAdamBaseOp : public OpKernel {
+template <typename Device, typename T>
+class ApplyAdamOp : public OpKernel {
  public:
-  explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
@@ -2594,11 +2575,11 @@ class ApplyAdamBaseOp : public OpKernel {
                                 grad.shape().DebugString()));
 
     const Device& device = ctx->template eigen_device<Device>();
-    auto functor = Functor<Device, T>();
-    functor(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-            beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
-            beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
-            grad.flat<T>(), use_nesterov_);
+    functor::ApplyAdam<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>(), use_nesterov_);
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -2609,11 +2590,10 @@ class ApplyAdamBaseOp : public OpKernel {
 };
 
 #ifdef TENSORFLOW_USE_SYCL
-template <typename T,
-          template <typename T2> class Functor>
-class ApplyAdamBaseOp<SYCLDevice, T, Functor> : public OpKernel {
+template <typename T>
+class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
  public:
-  explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
@@ -2714,10 +2694,9 @@ class ApplyAdamBaseOp<SYCLDevice, T, Functor> : public OpKernel {
                                 var.shape().DebugString(), " ",
                                 grad.shape().DebugString()));
 
-    auto functor = Functor<T>();
-    functor(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-            beta1_power, beta2_power, lr, beta1, beta2,
-            epsilon, grad.flat<T>());
+    functor::ApplyAdamSYCL<T>()(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+                                beta1_power, beta2_power, lr, beta1, beta2,
+                                epsilon, grad.flat<T>());
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -2727,28 +2706,28 @@ class ApplyAdamBaseOp<SYCLDevice, T, Functor> : public OpKernel {
 };
 #endif  // TENSORFLOW_USE_SYCL
 
-#define REGISTER_KERNELS(D, T, F)                                  \
+#define REGISTER_KERNELS(D, T)                                     \
   REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
-      ApplyAdamBaseOp<D##Device, T, F>);                           \
+      ApplyAdamOp<D##Device, T>);                                  \
   REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdam")                \
                               .HostMemory("var")                   \
                               .HostMemory("m")                     \
                               .HostMemory("v")                     \
                               .Device(DEVICE_##D)                  \
                               .TypeConstraint<T>("T"),             \
-                          ApplyAdamBaseOp<D##Device, T, F>);
-#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdam);
+                          ApplyAdamOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
-#undef REGISTER_CPU_KERNELS
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdamSYCL);
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
+
 TF_CALL_float(REGISTER_SYCL_KERNELS);
 TF_CALL_double(REGISTER_SYCL_KERNELS);
-#undef REGISTER_SYCL_KERNELS
 #endif
 
 #if GOOGLE_CUDA
@@ -2773,44 +2752,124 @@ DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
-#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdam);
-REGISTER_GPU_KERNELS(Eigen::half);
-REGISTER_GPU_KERNELS(float);
-REGISTER_GPU_KERNELS(double);
-#undef REGISTER_GPU_KERNELS
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-#define REGISTER_KERNELS(D, T, F)                                    \
-  REGISTER_KERNEL_BUILDER(                                           \
+template <typename Device, typename T>
+class ApplyAdaMaxOp : public OpKernel {
+ public:
+  explicit ApplyAdaMaxOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 2, use_exclusive_lock_, false, &v));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+
+    const Tensor& beta1_power = ctx->input(3);
+    const Tensor& beta2_power = ctx->input(4);
+    const Tensor& lr = ctx->input(5);
+    const Tensor& beta1 = ctx->input(6);
+    const Tensor& beta2 = ctx->input(7);
+    const Tensor& epsilon = ctx->input(8);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(9);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdaMax<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                     \
+  REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint<T>("T"), \
-      ApplyAdamBaseOp<D##Device, T, F>);                             \
+      ApplyAdaMaxOp<D##Device, T>);                                  \
   REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax")                \
-                              .HostMemory("var")                     \
-                              .HostMemory("m")                       \
-                              .HostMemory("v")                       \
-                              .Device(DEVICE_##D)                    \
-                              .TypeConstraint<T>("T"),               \
-                          ApplyAdamBaseOp<D##Device, T, F>);
-#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdaMax);
+                              .HostMemory("var")                   \
+                              .HostMemory("m")                     \
+                              .HostMemory("v")                     \
+                              .Device(DEVICE_##D)                  \
+                              .TypeConstraint<T>("T"),             \
+                          ApplyAdaMaxOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
-#undef REGISTER_CPU_KERNELS
-
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdaMaxSYCL);
-TF_CALL_float(REGISTER_SYCL_KERNELS);
-TF_CALL_double(REGISTER_SYCL_KERNELS);
-#undef REGISTER_SYCL_KERNELS
-#endif
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T)                                   \
   template <>                                                 \
-  void ApplyAdaMax<GPUDevice, T>::operator()(                 \
+  void ApplyAdaMax<GPUDevice, T>::operator()(                   \
       const GPUDevice& d, typename TTypes<T>::Flat var,       \
       typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
       typename TTypes<T>::ConstScalar beta1_power,            \
@@ -2819,7 +2878,7 @@ namespace functor {
       typename TTypes<T>::ConstScalar beta1,                  \
       typename TTypes<T>::ConstScalar beta2,                  \
       typename TTypes<T>::ConstScalar epsilon,                \
-      typename TTypes<T>::ConstFlat grad, bool use_nesterov); \
+      typename TTypes<T>::ConstFlat grad); \
   extern template struct ApplyAdaMax<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -2827,12 +2886,11 @@ DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
-#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdaMax);
-REGISTER_GPU_KERNELS(Eigen::half);
-REGISTER_GPU_KERNELS(float);
-REGISTER_GPU_KERNELS(double);
-#undef REGISTER_GPU_KERNELS
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 46a5290210..74acc12d50 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -149,7 +149,7 @@ struct ApplyAdaMax {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad, bool use_nesterov);
+                  typename TTypes<T>::ConstFlat grad);
 };
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 54c06b130c..1a6fc26422 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -152,7 +152,7 @@ struct ApplyAdaMax<GPUDevice, T> {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
+                  typename TTypes<T>::ConstFlat grad) {
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 6f107db3ea..99176cec55 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -751,7 +751,6 @@ REGISTER_OP("ApplyAdaMax")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
-    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
@@ -769,7 +768,6 @@ REGISTER_OP("ResourceApplyAdaMax")
     .Input("grad: T")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
-    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
-- 
GitLab


From f4850641530017a3b2b294974298ae13028b8583 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sun, 1 Apr 2018 10:21:46 +0800
Subject: [PATCH 0126/1262] CLN: code style

---
 tensorflow/core/kernels/training_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 1a8d08288b..aedca80c31 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -342,7 +342,7 @@ struct ApplyAdaMaxNonCuda {
     m.device(d) += (grad - m) * (T(1) - beta1());
     // Here v is u in section 7.1
     v.device(d) = (beta2() * v).cwiseMax(grad.abs());
-    // var is θ  in section 7.1
+    // var is θ in section 7.1
     var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon()));
   }
 };
-- 
GitLab


From 0d343fbb0e8c66622bc21aab39e225c6d895a78b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sun, 1 Apr 2018 10:42:10 +0800
Subject: [PATCH 0127/1262] CLN: remove unused argument beta2_power

---
 .../contrib/opt/python/training/adamax.py     | 42 ++++++++++++++++---
 .../opt/python/training/adamax_test.py        | 17 +++-----
 .../base_api/api_def_ApplyAdaMax.pbtxt        |  6 ---
 .../api_def_ResourceApplyAdaMax.pbtxt         |  6 ---
 tensorflow/core/kernels/training_ops.cc       | 18 +++-----
 tensorflow/core/kernels/training_ops.h        |  1 -
 .../core/kernels/training_ops_gpu.cu.cc       |  1 -
 tensorflow/core/ops/training_ops.cc           | 24 +++++++++--
 8 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index ea08a0931b..ba9e79be99 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -85,14 +86,35 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
     super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2,
                                           epsilon, use_locking, name)
 
+  def _get_beta_accumulators(self):
+    if context.in_graph_mode():
+      graph = ops.get_default_graph()
+    else:
+      graph = None
+    return self._get_non_slot_variable("beta1_power", graph=graph)
+
+  def _create_slots(self, var_list):
+    # Create the beta1 accumulators on the same device as the first
+    # variable. Sort the var_list to make sure this device is consistent across
+    # workers (these need to go on the same PS, otherwise some updates are
+    # silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
-    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = self._get_beta_accumulators()
     return training_ops.apply_ada_max(
         var, m, v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
-        math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
@@ -102,11 +124,10 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
-    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = self._get_beta_accumulators()
     return training_ops.resource_apply_ada_max(
         var.handle, m.handle, v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
@@ -115,9 +136,8 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
 
   def _apply_sparse_shared(self, grad, var, indices,
                            scatter_add, scatter_update):
-    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = self._get_beta_accumulators()
     beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
@@ -159,3 +179,13 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
     return self._apply_sparse_shared(
         grad, var, indices,
         self._resource_scatter_add, self._resource_scatter_update)
+
+  def _finish(self, update_ops, name_scope):
+    # Update the power accumulators.
+    with ops.control_dependencies(update_ops):
+      beta1_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        update_beta1 = beta1_power.assign(
+          beta1_power * self._beta1_t, use_locking=self._use_locking)
+    return control_flow_ops.group(*update_ops + [update_beta1],
+                                  name=name_scope)
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index e91e5cb96a..ccd08c0934 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -105,12 +105,11 @@ class AdaMaxOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
         self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
 
-        beta1_power, beta2_power = opt._get_beta_accumulators()
+        beta1_power = opt._get_beta_accumulators()
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
           self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
           update.run()
 
           var0_np, m0, v0 = adamax_sparse_update_numpy(
@@ -195,11 +194,9 @@ class AdaMaxOptimizerTest(test.TestCase):
         opt = adamax.AdaMaxOptimizer()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         opt_variables = opt.variables()
-        beta1_power, beta2_power = opt._get_beta_accumulators()
+        beta1_power = opt._get_beta_accumulators()
         self.assertTrue(beta1_power is not None)
-        self.assertTrue(beta2_power is not None)
         self.assertIn(beta1_power, opt_variables)
-        self.assertIn(beta2_power, opt_variables)
 
         with ops.Graph().as_default():
           # Shouldn't return non-slot variables from other graphs.
@@ -211,7 +208,7 @@ class AdaMaxOptimizerTest(test.TestCase):
           self.assertAllClose([1.0, 2.0], self.evaluate(var0))
           self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        beta1_power, beta2_power = opt._get_beta_accumulators()
+        beta1_power = opt._get_beta_accumulators()
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
@@ -222,8 +219,6 @@ class AdaMaxOptimizerTest(test.TestCase):
 
           self.assertAllCloseAccordingToType(0.9**(t + 1),
                                              self.evaluate(beta1_power))
-          self.assertAllCloseAccordingToType(0.999**(t + 1),
-                                             self.evaluate(beta2_power))
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
@@ -265,12 +260,11 @@ class AdaMaxOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], var0.eval())
         self.assertAllClose([3.0, 4.0], var1.eval())
 
-        beta1_power, beta2_power = opt._get_beta_accumulators()
+        beta1_power = opt._get_beta_accumulators()
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
           self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
           update.run()
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
@@ -299,7 +293,7 @@ class AdaMaxOptimizerTest(test.TestCase):
         update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        beta1_power, beta2_power = opt._get_beta_accumulators()
+        beta1_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
         self.assertAllClose([1.0, 2.0], var0.eval())
@@ -308,7 +302,6 @@ class AdaMaxOptimizerTest(test.TestCase):
         # Run 3 steps of intertwined AdaMax1 and AdaMax2.
         for t in range(1, 4):
           self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
           if t % 2 == 0:
             update1.run()
           else:
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
index 57938b42ae..5e705c009c 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -22,12 +22,6 @@ END
     name: "beta1_power"
     description: <<END
 Must be a scalar.
-END
-  }
-  in_arg {
-    name: "beta2_power"
-    description: <<END
-Must be a scalar.
 END
   }
   in_arg {
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
index 57fae3cb57..ad99b78af1 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -22,12 +22,6 @@ END
     name: "beta1_power"
     description: <<END
 Must be a scalar.
-END
-  }
-  in_arg {
-    name: "beta2_power"
-    description: <<END
-Must be a scalar.
 END
   }
   in_arg {
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index aedca80c31..2e193b0c0e 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -333,7 +333,6 @@ struct ApplyAdaMaxNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
                   typename TTypes<T>::ConstScalar beta1_power,
-                  typename TTypes<T>::ConstScalar beta2_power,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
@@ -2793,18 +2792,14 @@ class ApplyAdaMaxOp : public OpKernel {
             "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& beta1_power = ctx->input(3);
-    const Tensor& beta2_power = ctx->input(4);
-    const Tensor& lr = ctx->input(5);
-    const Tensor& beta1 = ctx->input(6);
-    const Tensor& beta2 = ctx->input(7);
-    const Tensor& epsilon = ctx->input(8);
+    const Tensor& lr = ctx->input(4);
+    const Tensor& beta1 = ctx->input(5);
+    const Tensor& beta2 = ctx->input(6);
+    const Tensor& epsilon = ctx->input(7);
 
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
                 errors::InvalidArgument("beta1_power is not a scalar: ",
                                         beta1_power.shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
-                errors::InvalidArgument("beta2_power is not a scalar: ",
-                                        beta2_power.shape().DebugString()));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
                 errors::InvalidArgument("lr is not a scalar : ",
                                         lr.shape().DebugString()));
@@ -2818,7 +2813,7 @@ class ApplyAdaMaxOp : public OpKernel {
                 errors::InvalidArgument("epsilon is not a scalar: ",
                                         epsilon.shape().DebugString()));
 
-    const Tensor& grad = ctx->input(9);
+    const Tensor& grad = ctx->input(8);
     OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
                 errors::InvalidArgument("var and m do not have the same shape",
                                         var.shape().DebugString(), " ",
@@ -2836,7 +2831,7 @@ class ApplyAdaMaxOp : public OpKernel {
     const Device& device = ctx->template eigen_device<Device>();
     functor::ApplyAdaMax<Device, T>()(
         device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1_power.scalar<T>(), lr.scalar<T>(),
         beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
         grad.flat<T>());
 
@@ -2873,7 +2868,6 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T>::Flat var,       \
       typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
       typename TTypes<T>::ConstScalar beta1_power,            \
-      typename TTypes<T>::ConstScalar beta2_power,            \
       typename TTypes<T>::ConstScalar lr,                     \
       typename TTypes<T>::ConstScalar beta1,                  \
       typename TTypes<T>::ConstScalar beta2,                  \
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 74acc12d50..f536a61eb0 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -144,7 +144,6 @@ struct ApplyAdaMax {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
                   typename TTypes<T>::ConstScalar beta1_power,
-                  typename TTypes<T>::ConstScalar beta2_power,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 1a6fc26422..2aa17f2a0f 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -147,7 +147,6 @@ struct ApplyAdaMax<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
                   typename TTypes<T>::ConstScalar beta1_power,
-                  typename TTypes<T>::ConstScalar beta2_power,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 99176cec55..dc7b588898 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -737,12 +737,29 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s));  // v
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // beta1_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta1
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));       // beta2
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));       // epsilon
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
 REGISTER_OP("ApplyAdaMax")
     .Input("var: Ref(T)")
     .Input("m: Ref(T)")
     .Input("v: Ref(T)")
     .Input("beta1_power: T")
-    .Input("beta2_power: T")
     .Input("lr: T")
     .Input("beta1: T")
     .Input("beta2: T")
@@ -752,7 +769,7 @@ REGISTER_OP("ApplyAdaMax")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
-      return ApplyAdamShapeFn(c, false /* sparse */);
+      return ApplyAdaMaxShapeFn(c, false /* sparse */);
     });
 
 REGISTER_OP("ResourceApplyAdaMax")
@@ -760,7 +777,6 @@ REGISTER_OP("ResourceApplyAdaMax")
     .Input("m: resource")
     .Input("v: resource")
     .Input("beta1_power: T")
-    .Input("beta2_power: T")
     .Input("lr: T")
     .Input("beta1: T")
     .Input("beta2: T")
@@ -769,7 +785,7 @@ REGISTER_OP("ResourceApplyAdaMax")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
-      return ApplyAdamShapeFn(c, false /* sparse */);
+      return ApplyAdaMaxShapeFn(c, false /* sparse */);
     });
 
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
-- 
GitLab


From aae991b259f1acd91dc6fa21ea74bba4c4710530 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 1 Apr 2018 01:15:35 -0700
Subject: [PATCH 0128/1262] [XLA] Add a ReduceWindow test to reduce along the
 lane dimension.

PiperOrigin-RevId: 191214828
---
 .../compiler/xla/tests/reduce_window_test.cc    | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index d6f580a6f9..8dd24f1237 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -277,6 +277,23 @@ TEST_P(ReduceWindowTest, PrimeWindowsInReductionDimension) {
                            DefaultErrorSpec());
 }
 
+TEST_P(ReduceWindowTest, ReduceAlongLaneDimension) {
+  Array4D<float> input_array(19, 17, 8, 256);
+  input_array.FillWithMinorDimNum();
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input_data_handle, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
 // Tests a reduction function that is not a simple add/min/max/etc.
 XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
-- 
GitLab


From 926bd44844d36bbefcbd620eb65ba0019e0a6dde Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 1 Apr 2018 16:01:20 -0700
Subject: [PATCH 0129/1262] [XLA] Redesign: implement and test concat.

PiperOrigin-RevId: 191244047
---
 .../xla/client/xla_client/xla_builder.cc      |  38 +++--
 tensorflow/compiler/xla/tests/BUILD           |   4 +-
 tensorflow/compiler/xla/tests/concat_test.cc  | 145 +++++++++---------
 3 files changed, 103 insertions(+), 84 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index e51a8b14c0..7a9ff0c441 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -52,6 +52,16 @@ bool CanBeRoot(HloOpcode opcode) {
   }
 }
 
+StatusOr<std::vector<Shape>> GetOperandShapes(
+    tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  std::vector<Shape> operand_shapes;
+  for (const XlaOp& operand : operands) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
+    operand_shapes.push_back(shape);
+  }
+  return operand_shapes;
+}
+
 }  // namespace
 
 StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
@@ -362,11 +372,7 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
-    std::vector<Shape> operand_shapes;
-    for (const auto& operand : operands) {
-      TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
-      operand_shapes.push_back(shape);
-    }
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
     c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                 [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
@@ -457,7 +463,21 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
 
 XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
                               int64 dimension) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
+
+    instr.add_dimensions(dimension);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands);
+  }());
 }
 
 XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
@@ -508,11 +528,7 @@ XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
-    std::vector<Shape> operand_shapes;
-    for (const XlaOp& e : elements) {
-      TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(e));
-      operand_shapes.push_back(shape);
-    }
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
     c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                 [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index e337669aeb..283efbb707 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1444,9 +1444,9 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index fb0e9c724a..a4c8a83eb1 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -38,9 +38,9 @@ using ::testing::HasSubstr;
 
 // Concatenate expects at least one argument.
 XLA_TEST_F(ConcatTest, Concat_Nothing) {
-  ComputationBuilder builder(client_, TestName());
-  auto concatenated = builder.ConcatInDim({}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  XlaBuilder builder(TestName());
+  builder.ConcatInDim({}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("Concatenate expects at least one argument"));
@@ -48,18 +48,18 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) {
 
 // Concatenate with one argument works.
 XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -68,51 +68,51 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
 // Show that we can't concatenate R0 with R0 because we can't name the dimension
 // to concatenate on.
 XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<float>(42.0);
   auto b = builder.ConstantR0<float>(64.0);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({a, b}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("out of bounds: 0"));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -129,20 +129,20 @@ XLA_TEST_F(ConcatTest, Concat_R1_L253_With_R1_L7) {
     expected[253 + i] = rhs[i] = 253 + i + 1;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>(lhs);
   auto b = builder.ConstantR1<float>(rhs);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
   for (int dim : {0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto a = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
     auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
-    auto concatenated = builder.ConcatInDim({a, b}, dim);
+    builder.ConcatInDim({a, b}, dim);
 
     ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {},
                                ErrorSpec(0.0001));
@@ -150,26 +150,27 @@ XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
-      {0}, {64},
+      {0},
+      {64},
   });
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 64},
@@ -178,22 +179,22 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
 }
 
 XLA_TEST_F(ConcatTest, Concat2x0With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(Array2D<float>(2, 0));
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   ComputeAndCompareR2<float>(&builder, *b_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(2, 3);
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 1, 2, 64, 65, 66, 67, 68},
@@ -203,22 +204,22 @@ XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 2));
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR2<float>(&builder, *a_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto b_array = CreatePatternedMatrix(5, 2, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
       {0, 1},
@@ -234,16 +235,16 @@ XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x0x2_3x0x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 2));
   auto b = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 1));
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
   ComputeAndCompareR3<float>(&builder, Array3D<float>(3, 0, 3), {},
                              ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -258,27 +259,29 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
   });
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 6}}, {{2, 3, 7}}, {{4, 5, 8}},
+      {{0, 1, 6}},
+      {{2, 3, 7}},
+      {{4, 5, 8}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_1x1_1x1_1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b, c}, 0);
+  builder.ConcatInDim({a, b, c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -300,35 +303,35 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
   auto c = builder.ConstantR3FromArray3D(c_array);
-  auto concatenated = builder.ConcatInDim({a, b, c}, 2);
+  builder.ConcatInDim({a, b, c}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 2, 3}}, {{4, 5, 6, 7}}, {{8, 9, 10, 11}},
+      {{0, 1, 2, 3}},
+      {{4, 5, 6, 7}},
+      {{8, 9, 10, 11}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatLeftAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = (a concat b) concat c
-  auto concatenated =
-      builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
+  builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatRightAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = a concat (b concat c)
-  auto concatenated =
-      builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
+  builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -342,7 +345,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim0) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 0);
@@ -363,7 +366,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim1) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -388,7 +391,7 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -404,13 +407,13 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
 
 // Show that we can't concatenate with an opaques.
 XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto opaque_shape = ShapeUtil::MakeOpaqueShape();
   auto r1f32 = xla::ShapeUtil::MakeShape(xla::F32, {1});
   auto x = builder.Parameter(0, r1f32, "x");
   auto y = builder.Parameter(1, opaque_shape, "y");
-  auto concatenated = builder.ConcatInDim({x, y}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({x, y}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -418,23 +421,23 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto p0 = builder.ConstantR1<bool>({true});
   auto p1 = builder.ConstantR1<bool>({false});
   auto p2 = builder.ConstantR1<bool>({true});
-  auto concatenated = builder.ConcatInDim({p0, p1, p2}, 0);
+  builder.ConcatInDim({p0, p1, p2}, 0);
 
   bool expected[] = {true, false, true};
   ComputeAndCompareR1<bool>(&builder, expected, {});
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR1<int32>({1});
   auto a1 = builder.ConstantR1<int32>({2, 3});
   auto a2 = builder.ConstantR1<int32>({4, 5, 6});
   auto a3 = builder.ConstantR1<int32>({7, 8, 9, 10});
-  auto concatenated = builder.ConcatInDim({a0, a1, a2, a3}, 0);
+  builder.ConcatInDim({a0, a1, a2, a3}, 0);
 
   std::vector<int32> expected(10);
   std::iota(expected.begin(), expected.end(), 1);
@@ -442,7 +445,7 @@ XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array3D<float> arr0(9, 17, 1);
   arr0.Fill(1);
@@ -462,14 +465,14 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
     }
   }
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto p0 = CreateR3Parameter<float>(arr0, /*parameter_number=*/0, "p0",
                                      &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto p1 = CreateR3Parameter<float>(arr1, /*parameter_number=*/1, "p1",
                                      &builder, &h1);
 
-  auto concatenated = builder.ConcatInDim({h0, h1}, 2);
+  builder.ConcatInDim({h0, h1}, 2);
 
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
@@ -495,7 +498,7 @@ TEST_P(ConcatR2BinaryTest, DoIt) {
   Array2D<int32> rhs(spec.rhs_dim0, spec.rhs_dim1);
   rhs.FillUnique(1000);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR2FromArray2D<int32>(lhs);
   auto a1 = builder.ConstantR2FromArray2D<int32>(rhs);
   builder.ConcatInDim({a0, a1}, spec.concat_dimension);
@@ -521,7 +524,7 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) {
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, f32_scalar, "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto mul = builder.Mul(x, y);
@@ -545,7 +548,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "z");
@@ -573,7 +576,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "y");
-- 
GitLab


From 5ca9fedc6b3f9619a3bcf7a5a4a523668055f57d Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 2 Apr 2018 13:02:01 +0800
Subject: [PATCH 0130/1262] Fix adam optimizer related math equation rendering
 format

---
 .../opt/python/training/lazy_adam_optimizer.py   |  6 +++---
 tensorflow/contrib/optimizer_v2/adam.py          | 16 ++++++++--------
 .../api_def/base_api/api_def_ApplyAdam.pbtxt     |  8 ++++----
 .../base_api/api_def_ResourceApplyAdam.pbtxt     |  8 ++++----
 tensorflow/python/training/adam.py               | 16 ++++++++--------
 5 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
index aeca900bc8..72117c1e81 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
@@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
     epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
     lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
 
-    # m := beta1 * m + (1 - beta1) * g_t
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
     m = self.get_slot(var, "m")
     m_t = state_ops.scatter_update(m, grad.indices,
                                    beta1_t * array_ops.gather(m, grad.indices) +
                                    (1 - beta1_t) * grad.values,
                                    use_locking=self._use_locking)
 
-    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
     v = self.get_slot(var, "v")
     v_t = state_ops.scatter_update(v, grad.indices,
                                    beta2_t * array_ops.gather(v, grad.indices) +
                                    (1 - beta2_t) * math_ops.square(grad.values),
                                    use_locking=self._use_locking)
 
-    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
     m_t_slice = array_ops.gather(m_t, grad.indices)
     v_t_slice = array_ops.gather(v_t, grad.indices)
     denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 42b7f92a76..e863ca1244 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -41,21 +41,21 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
     Initialization:
 
     ```
-    m_0 <- 0 (Initialize initial 1st moment vector)
-    v_0 <- 0 (Initialize initial 2nd moment vector)
-    t <- 0 (Initialize timestep)
+    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
+    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
+    \\(t <- 0\\) (Initialize timestep)
     ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
     ```
-    t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    $$t <- t + 1$$
+    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
 
-    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
-    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
+    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index c2858a1bfb..9bffaa79f5 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
+$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
+$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index bea1fd6762..109b68e472 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,9 +76,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
+$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
+$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
 END
 }
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 006e360389..178eddc664 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -44,21 +44,21 @@ class AdamOptimizer(optimizer.Optimizer):
     Initialization:
 
     ```
-    m_0 <- 0 (Initialize initial 1st moment vector)
-    v_0 <- 0 (Initialize initial 2nd moment vector)
-    t <- 0 (Initialize timestep)
+    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
+    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
+    \\(t <- 0\\) (Initialize timestep)
     ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
     ```
-    t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    $$t <- t + 1$$
+    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
 
-    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
-    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
+    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
-- 
GitLab


From 85763f5192bc772daf672b183ec63edef4e0047c Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 2 Apr 2018 13:11:26 +0800
Subject: [PATCH 0131/1262] Fix minor typo

---
 tensorflow/contrib/optimizer_v2/adam.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index e863ca1244..9bc160c0b9 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -51,11 +51,11 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     ```
     $$t <- t + 1$$
-    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
-    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
+    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
-- 
GitLab


From 41074cd435a5d8b3831db8333b3669877b15a2c9 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 2 Apr 2018 13:14:48 +0800
Subject: [PATCH 0132/1262] Fix minor typo

---
 tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++----
 tensorflow/python/training/adam.py                       | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index 9bffaa79f5..fc2cb09471 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
-$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
-$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
-$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
+$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 178eddc664..1f2c40f18e 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -54,11 +54,11 @@ class AdamOptimizer(optimizer.Optimizer):
 
     ```
     $$t <- t + 1$$
-    $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
+    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$
-    $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
+    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
     ```
 
     The default value of 1e-8 for epsilon might not be a good default in
-- 
GitLab


From 1fbad5034a8ea531e496b0ecbf9e2c3839b62311 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Sun, 1 Apr 2018 22:46:11 -0700
Subject: [PATCH 0133/1262] Make batch_sequences_with_states_test.py work with
 the C API enabled.

The C API improves static shape inference, making more errors caught
at graph construction time instead of runtime.

PiperOrigin-RevId: 191260634
---
 .../training/batch_sequences_with_states_test.py    | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index dbdbb08a82..16c260edb0 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -526,10 +526,15 @@ class PaddingTest(test.TestCase):
           "key_2": constant_op.constant([1.5, 2.5])  # length 2
       }
 
-      _, padded_seq = sqss._padding(sequences, 2)
-      with self.assertRaisesOpError(
-          ".*All sequence lengths must match, but received lengths.*"):
-        padded_seq["key_1"].eval()
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            ValueError, "Fill dimensions must be >= 0"):
+          _, padded_seq = sqss._padding(sequences, 2)
+      else:
+        _, padded_seq = sqss._padding(sequences, 2)
+        with self.assertRaisesOpError(
+            ".*All sequence lengths must match, but received lengths.*"):
+          padded_seq["key_1"].eval()
 
   def testPadding(self):
     with ops.Graph().as_default() as g, self.test_session(graph=g):
-- 
GitLab


From 3be80b5d31ecadd4f4a883b1ebe159d005408c1d Mon Sep 17 00:00:00 2001
From: Chikanaga Tomoyuki <nagachika@ruby-lang.org>
Date: Mon, 2 Apr 2018 17:59:05 +0900
Subject: [PATCH 0134/1262] Remove warnings for copy_op_to_graph.

---
 tensorflow/contrib/copy_graph/python/util/copy_elements.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index b806799202..102bc460fd 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
     #stores String-based info such as name, device and type of the op.
     #Unique to every Operation instance.
-    new_node_def = deepcopy(op._node_def)
+    new_node_def = deepcopy(op.node_def)
     #Change the name
     new_node_def.name = new_name
 
@@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
 
     #Make a copy of the op_def too.
     #Its unique to every _type_ of Operation.
-    op_def = deepcopy(op._op_def)
+    op_def = deepcopy(op.op_def)
 
     #Initialize a new Operation instance
     new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
-- 
GitLab


From be571938196fb191f260a2c45176d406e6c19a13 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 08:00:03 -0700
Subject: [PATCH 0135/1262] Adding support for RandomUniform. Basic support for
 op import/export of RandomUniform, and constant resolution with static seeds.

PiperOrigin-RevId: 191293897
---
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 .../contrib/lite/toco/export_tensorflow.cc    |  21 +++
 .../graph_transformations.h                   |   1 +
 .../propagate_array_data_types.cc             | 164 +++++++++++-------
 .../propagate_fixed_sizes.cc                  |  10 +-
 .../resolve_constant_random_uniform.cc        | 116 +++++++++++++
 .../contrib/lite/toco/import_tensorflow.cc    |  23 ++-
 tensorflow/contrib/lite/toco/model.h          |   8 +
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   1 +
 tensorflow/contrib/lite/toco/tooling_util.cc  |   1 +
 10 files changed, 274 insertions(+), 72 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index d552de313c..2dd689ad4c 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -259,6 +259,7 @@ cc_library(
         "graph_transformations/resolve_constant_fake_quant.cc",
         "graph_transformations/resolve_constant_fill.cc",
         "graph_transformations/resolve_constant_gather.cc",
+        "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 22a23357b3..e88357f7dd 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1711,6 +1711,23 @@ void ConvertTopKV2Operator(const Model& model, const TopKV2Operator& src_op,
   (*topk_op->mutable_attr())["sorted"].set_b(true);
 }
 
+void ConvertRandomUniformOperator(const Model& model,
+                                  const RandomUniformOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  CHECK(tensorflow_graph != nullptr);
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("RandomUniform");
+  CHECK_EQ(src_op.inputs.size(), 1);
+  new_op->set_name(src_op.outputs[0]);
+  *new_op->add_input() = src_op.inputs[0];
+  const auto shape_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(shape_type);
+  (*new_op->mutable_attr())["dtype"].set_type(
+      GetTensorFlowDataType(src_op.dtype));
+  (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
+  (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -1897,6 +1914,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertTransposeConvOperator(
         model, static_cast<const TransposeConvOperator&>(src_op),
         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRandomUniform) {
+    ConvertRandomUniformOperator(
+        model, static_cast<const RandomUniformOperator&>(src_op),
+        tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 640afc7c74..76ec02aa07 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -173,6 +173,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTransposeAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRandomUniform)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRange)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantShapeOrRank)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStack)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 778da39bf1..89ad58f887 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -50,78 +50,108 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     old_output_data_types[output] = model->GetArray(output).data_type;
   }
   // Do the actual output data types propagation.
-  if (op->type == OperatorType::kDequantize ||
-      op->type == OperatorType::kResizeBilinear) {
-    // These operators unconditionally produce float outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
-  } else if (op->type == OperatorType::kTensorFlowLess ||
-             op->type == OperatorType::kTensorFlowLessEqual ||
-             op->type == OperatorType::kTensorFlowGreater ||
-             op->type == OperatorType::kTensorFlowGreaterEqual) {
-    // These operators unconditionally produce bool outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
-  } else if (op->type == OperatorType::kRank ||
-             op->type == OperatorType::kTensorFlowShape) {
-    // These operators only produce int32 outputs.
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
-  } else if (op->type == OperatorType::kTensorFlowSplit ||
-             op->type == OperatorType::kTensorFlowConcat ||
-             op->type == OperatorType::kFill) {
-    // These operators produce an output with the same type as their 2nd input
-    CHECK_GE(op->inputs.size(), 2);
-    const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTransposeConv) {
-    // These operators produce an output with the same type as their 3rd input
-    CHECK_GE(op->inputs.size(), 3);
-    const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kCast) {
-    // Data type of the Cast op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* cast_op = static_cast<CastOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
-  } else if (op->type == OperatorType::kArgMax) {
-    // Data type of the ArgMax op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* argmax_op = static_cast<ArgMaxOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
-  } else if (op->type == OperatorType::kRange) {
-    auto* range_op = static_cast<RangeOperator*>(op);
-    // Output type of the Range op can be set via an attribute
-    ArrayDataType data_type;
-    if (range_op->dtype != ArrayDataType::kNone) {
-      // Use the type if specified
-      data_type = range_op->dtype;
-    } else {
-      // Otherwise use the first input
-      CHECK_GE(op->inputs.size(), 1);
-      data_type = model->GetArray(op->inputs[0]).data_type;
+  switch (op->type) {
+    case OperatorType::kDequantize:
+    case OperatorType::kResizeBilinear:
+      // These operators unconditionally produce float outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
+      break;
+    case OperatorType::kTensorFlowLess:
+    case OperatorType::kTensorFlowLessEqual:
+    case OperatorType::kTensorFlowGreater:
+    case OperatorType::kTensorFlowGreaterEqual:
+      // These operators unconditionally produce bool outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
+      break;
+    case OperatorType::kRank:
+    case OperatorType::kTensorFlowShape:
+      // These operators only produce int32 outputs.
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
+      break;
+    case OperatorType::kTensorFlowSplit:
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kFill: {
+      // These operators produce an output with the same type as their 2nd input
+      CHECK_GE(op->inputs.size(), 2);
+      const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-    CHECK_EQ(op->outputs.size(), 1);
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTensorFlowUnsupported) {
-    auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
-    // Some output tensors from the op could be eliminated by optimization.
-    // This can make unsupported_op->output_data_types have more elements than
-    // op->outputs.
-    if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+    case OperatorType::kTransposeConv: {
+      // These operators produce an output with the same type as their 3rd input
+      CHECK_GE(op->inputs.size(), 3);
+      const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kCast: {
+      // Data type of the Cast op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* cast_op = static_cast<CastOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
+      break;
+    }
+    case OperatorType::kArgMax: {
+      // Data type of the ArgMax op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* argmax_op = static_cast<ArgMaxOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
+      break;
+    }
+    case OperatorType::kRange: {
+      auto* range_op = static_cast<RangeOperator*>(op);
+      // Output type of the Range op can be set via an attribute
+      ArrayDataType data_type;
+      if (range_op->dtype != ArrayDataType::kNone) {
+        // Use the type if specified
+        data_type = range_op->dtype;
+      } else {
+        // Otherwise use the first input
+        CHECK_GE(op->inputs.size(), 1);
+        data_type = model->GetArray(op->inputs[0]).data_type;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kRandomUniform: {
+      auto* rand_op = static_cast<RandomUniformOperator*>(op);
+      // The output type of RandomUniform is specified with an attribute
+      if (rand_op->dtype == ArrayDataType::kNone) {
+        return false;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, rand_op->dtype);
+      break;
+    }
+    case OperatorType::kTensorFlowUnsupported: {
+      auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
+      // Some output tensors from the op could be eliminated by optimization.
+      // This can make unsupported_op->output_data_types have more elements than
+      // op->outputs.
+      if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+        return false;
+      }
+      for (int i = 0; i < op->outputs.size(); ++i) {
+        auto output = op->outputs[i];
+        auto data_type = unsupported_op->output_data_types[i];
+        model->GetArray(output).data_type = data_type;
+      }
+      break;
+    }
+    case OperatorType::kExpandDims: {
+      // Yield on ExpandDim until it is converted to Reshape
       return false;
     }
-    for (int i = 0; i < op->outputs.size(); ++i) {
-      auto output = op->outputs[i];
-      auto data_type = unsupported_op->output_data_types[i];
-      model->GetArray(output).data_type = data_type;
+    default: {
+      // These operators produce outputs with the same type as their 1st input
+      CHECK_GT(op->inputs.size(), 0);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-  } else if (op->type == OperatorType::kExpandDims) {
-    // Yield on ExpandDim until it is converted to Reshape
-    return false;
-  } else {
-    // These operators produce outputs with the same type as their 1st input
-    CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
   }
+
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
     if (old_output_data_types[output] != model->GetArray(output).data_type) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 676736cfc5..b96d698675 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -392,8 +392,7 @@ void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
                          depth * block_size * block_size}));
 }
 
-void ProcessFillOperator(Model* model, FillOperator* op) {
-  CHECK_EQ(op->inputs.size(), 2);
+void ProcessOpWithShapeInput(Model* model, Operator* op) {
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
@@ -1529,7 +1528,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
                                   static_cast<SpaceToDepthOperator*>(op));
       break;
     case OperatorType::kFill:
-      ProcessFillOperator(model, static_cast<FillOperator*>(op));
+      CHECK_EQ(op->inputs.size(), 2);
+      ProcessOpWithShapeInput(model, op);
       break;
     case OperatorType::kFullyConnected:
       ProcessFullyConnectedOperator(model,
@@ -1659,6 +1659,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       // transforms that remove them, so we avoid propagating shapes through
       // them and let things settle once they've been removed.
       break;
+    case OperatorType::kRandomUniform:
+      CHECK_EQ(op->inputs.size(), 1);
+      ProcessOpWithShapeInput(model, op);
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
new file mode 100644
index 0000000000..88d06d7dc7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace toco {
+
+template <ArrayDataType Type>
+bool ComputeRandomUniformArray(Model* model, RandomUniformOperator* op) {
+  typedef tensorflow::random::UniformDistribution<
+      tensorflow::random::PhiloxRandom, DataType<Type>>
+      Distribution;
+
+  // Allocate output
+  auto& output_array = model->GetArray(op->outputs[0]);
+  CHECK(output_array.data_type == Type);
+  std::vector<DataType<Type>>& data =
+      output_array.GetMutableBuffer<Type>().data;
+  data.resize(RequiredBufferSizeForShape(output_array.shape()));
+
+  // We use the same random number generator and distribution as TensorFlow to
+  // produce the exact same values given the same seeds. See
+  // tensorflow::functor::FillPhiloxRandomTask<Distribution, false> in
+  // //third_party/tensorflow/core/kernels/random_op.cc for the implementation.
+  tensorflow::random::PhiloxRandom generator(op->seed, op->seed2);
+  Distribution dist;
+
+  // The generator creates Distribution::kResultElementCount samples at a time.
+  size_t offset = 0;
+  size_t num_samples = Distribution::kResultElementCount;
+  while (offset < data.size()) {
+    const typename Distribution::ResultType samples = dist(&generator);
+    std::copy(&samples[0],
+              &samples[0] + std::min(num_samples, data.size() - offset),
+              &data[0] + offset);
+    offset += num_samples;
+  }
+
+  return true;
+}
+
+bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* base_op = it->get();
+  if (base_op->type != OperatorType::kRandomUniform) {
+    return false;
+  }
+  auto* op = static_cast<RandomUniformOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  if ((op->seed == 0) && (op->seed2 == 0)) {
+    LOG(WARNING) << "RandomUniform op outputting \"" << op->outputs[0]
+                 << "\" is truly random (using /dev/random system entropy). "
+                    "Therefore, cannot resolve as constant. Set \"seed\" or "
+                    "\"seed2\" attr non-zero to fix this";
+    return false;
+  }
+
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      if (!ComputeRandomUniformArray<ArrayDataType::kFloat>(model, op)) {
+        return false;
+      }
+      break;
+    // For future support of double or half.
+    // case ArrayDataType::kDouble...
+    default:
+      LOG(FATAL)
+          << "Unsupported data type given to RandomUniform op with output \""
+          << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used
+  toco::DeleteArrayIfUsedOnce(op->inputs[0], model);
+
+  // Erase the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index c26e4bddff..876479079b 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -74,7 +74,7 @@ const string& GetStringAttr(const NodeDef& node, const string& attr_name) {
   return attr.s();
 }
 
-int GetIntAttr(const NodeDef& node, const string& attr_name) {
+int64 GetIntAttr(const NodeDef& node, const string& attr_name) {
   CHECK(HasAttr(node, attr_name)) << attr_name << " not found in:\n"
                                   << node.DebugString();
   const auto& attr = node.attr().at(attr_name);
@@ -569,6 +569,23 @@ void ConvertBiasAddOperator(const NodeDef& node,
   model->operators.emplace_back(biasadd);
 }
 
+void ConvertRandomUniform(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "RandomUniform");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_INT32);
+  auto op = absl::make_unique<RandomUniformOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  op->dtype = ConvertDataType(GetDataTypeAttr(node, "dtype"));
+  op->seed = GetIntAttr(node, "seed");
+  op->seed2 = GetIntAttr(node, "seed2");
+  CHECK(model != nullptr);
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertReluOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
@@ -1931,7 +1948,7 @@ void ConvertTopKV2Operator(const NodeDef& node,
   // K can be encoded as attr (TopK) convert it to a const.
   if (HasAttr(node, "k")) {
     string k_array = CreateConstArray<ArrayDataType::kInt32>(
-        model, node.name() + "k", {GetIntAttr(node, "k")});
+        model, node.name() + "k", {static_cast<int32>(GetIntAttr(node, "k"))});
     op->inputs.push_back(k_array);
   } else {
     CheckInputsCount(node, tf_import_flags, 2);
@@ -2168,6 +2185,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
     } else if (node.op() == "DynamicStitch" ||
                node.op() == "ParallelDynamicStitch") {
       ConvertDynamicStitchOperator(node, tf_import_flags, model);
+    } else if (node.op() == "RandomUniform") {
+      ConvertRandomUniform(node, tf_import_flags, model);
     } else {
       ConvertUnsupportedOperator(node, tf_import_flags, model);
     }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 5199e292e1..64269d369d 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -60,6 +60,7 @@ enum class OperatorType {
   kMaxPool,
   kFakeQuant,
   kMul,
+  kRandomUniform,
   kRange,
   kRank,
   kRelu,
@@ -946,6 +947,13 @@ struct FloorModOperator : Operator {
   FloorModOperator() : Operator(OperatorType::kFloorMod) {}
 };
 
+struct RandomUniformOperator : Operator {
+  RandomUniformOperator() : Operator(OperatorType::kRandomUniform) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+  int64 seed;
+  int64 seed2;
+};
+
 // Creates a sequence of numbers that begins at start and extends by increments
 // of delta up to but not including limit.
 //
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 30dd6fab9e..0c52f50e39 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -79,6 +79,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
   transformations->Add(new ResolveConstantGather);
+  transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index f3f50487ff..060c52e9e3 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -297,6 +297,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(L2Pool)
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
+    HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
-- 
GitLab


From f4672ca59b259436dd1cb60b9e12ba9c523e17f6 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Mon, 2 Apr 2018 08:29:42 -0700
Subject: [PATCH 0136/1262] Update tests to enable flipping on bfloat16 types.

Skip individual tests that are currently failing with bfloat16.

PiperOrigin-RevId: 191296618
---
 tensorflow/compiler/tests/binary_ops_test.py  | 43 +++++++++++--------
 .../compiler/tests/spacetobatch_op_test.py    |  7 +++
 .../compiler/tests/variable_ops_test.py       | 12 ++++--
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index ba7b9bacd2..d1d7379c0a 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -190,19 +190,24 @@ class BinaryOpsTest(XLATestCase):
           ],
           equality_test=self.ListsAreClose)
 
-      self._testBinary(
-          gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
-          np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
-                    [0.9, 1.0, 1.1, 1.2]], dtype=dtype),
-          np.array([2, 1, 7], dtype=np.int32),
-          expected=[
-              np.array([1.342536, 1.442536, np.nan], dtype=dtype),
-              np.array([[0.213838, 0.236328, -0.738817, 0.288651],
-                        [0.213838, -0.763672, 0.261183, 0.288651],
-                        [np.nan, np.nan, np.nan, np.nan]],
-                       dtype=dtype),
-          ],
-          equality_test=self.ListsAreClose)
+      # TODO(b/68813416): Fails with bfloat16.
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        self._testBinary(
+            gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
+            np.array(
+                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
+                 [0.9, 1.0, 1.1, 1.2]],
+                dtype=dtype),
+            np.array([2, 1, 7], dtype=np.int32),
+            expected=[
+                np.array([1.342536, 1.442536, np.nan], dtype=dtype),
+                np.array(
+                    [[0.213838, 0.236328, -0.738817, 0.288651], [
+                        0.213838, -0.763672, 0.261183, 0.288651
+                    ], [np.nan, np.nan, np.nan, np.nan]],
+                    dtype=dtype),
+            ],
+            equality_test=self.ListsAreClose)
 
   def testIntOps(self):
     for dtype in self.int_types:
@@ -260,12 +265,6 @@ class BinaryOpsTest(XLATestCase):
           np.array([[1], [2]], dtype=dtype),
           dtype(7),
           expected=np.array([[8], [9]], dtype=dtype))
-      self._testBinary(
-          math_ops.add,
-          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-          expected=np.array(
-              [1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
 
       self._testBinary(
           math_ops.subtract,
@@ -361,6 +360,12 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
+    self._testBinary(
+        math_ops.add,
+        np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+        np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+        expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
+
   def testComplexOps(self):
     for dtype in self.complex_types:
       ctypes = {np.complex64: np.float32}
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index 92518aadc4..6083981493 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.platform import test
@@ -156,6 +157,12 @@ class SpaceToBatchNDTest(XLATestCase):
     paddings = np.array(paddings).reshape((len(block_shape), 2))
     with self.test_session() as sess, self.test_scope():
       for dtype in self.float_types:
+        # TODO(b/68813416): Skip bfloat16's as the input type for direct is
+        # float32 and results in a mismatch, while making testDirect provide the
+        # correctly typed input results in 'no fill-function for data-type'
+        # error.
+        if dtype == dtypes.bfloat16.as_numpy_dtype:
+          continue
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
         x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index b08d6ab21e..8ecad00f6e 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -230,7 +230,10 @@ class SliceAssignTest(XLATestCase):
       # shrink shape changes
       checker[1:2, 1] = [66]
       checker[1, 1:2] = [66]
-      checker[1, 1] = 66
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker[1, 1] = 66
       # newaxis shape changes
       checker[:, None, :] = [[[10, 20, 30]], [[40, 50, 50]]]
       # shrink and newaxis
@@ -243,8 +246,11 @@ class SliceAssignTest(XLATestCase):
 
       # Assign vector to scalar (rank-0) using newaxis
       checker2 = StridedSliceAssignChecker(self, 222, dtype=dtype)
-      checker2[()] = 6  # no indices
-      checker2[...] = 6  # ellipsis
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker2[()] = 6  # no indices
+        checker2[...] = 6  # ellipsis
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
-- 
GitLab


From aa7bb027a7cac837a3b774e9f443139b85c82aa8 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 3 Apr 2018 00:18:32 +0800
Subject: [PATCH 0137/1262] Fix minor typo

---
 .../api_def/base_api/api_def_ResourceApplyAdam.pbtxt     | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 109b68e472..5c60fa3aa1 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,9 +76,8 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$
-$$m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t$$
-$$v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t$$
-$$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$
-END
+$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 }
-- 
GitLab


From 43c6dd98f1a69c0515f0769b997cfac576a195e5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 09:25:33 -0700
Subject: [PATCH 0138/1262] Add CycleGAN specific summary.

PiperOrigin-RevId: 191302480
---
 .../gan/python/eval/python/summaries_impl.py  | 64 ++++++++++++++-----
 .../gan/python/eval/python/summaries_test.py  | 20 +++---
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 0d1afad72d..508f487722 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -31,6 +31,7 @@ __all__ = [
     'add_image_comparison_summaries',
     'add_gan_model_summaries',
     'add_regularization_loss_summaries',
+    'add_cyclegan_image_summaries',
 ]
 
 
@@ -51,14 +52,9 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     ValueError: If real and generated data aren't images.
   """
   if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_y2x, **saved_params)
-    return
-
+    raise ValueError(
+        '`add_gan_model_image_summaries` does not take CycleGANModels. Please '
+        'use `add_cyclegan_image_summaries` instead.')
   _assert_is_image(gan_model.real_data)
   _assert_is_image(gan_model.generated_data)
 
@@ -89,6 +85,49 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     add_gan_model_summaries(gan_model)
 
 
+def add_cyclegan_image_summaries(cyclegan_model):
+  """Adds image summaries for CycleGAN.
+
+  There are two summaries, one for each generator. The first image is the
+  generator input, the second is the generator output, and the third is G(F(x)).
+
+  Args:
+    cyclegan_model: A CycleGANModel tuple.
+
+  Raises:
+    ValueError: If `cyclegan_model` isn't a CycleGANModel.
+    ValueError: If generated data, generator inputs, and reconstructions aren't
+      images.
+    ValueError: If the generator input, generated data, and reconstructions
+      aren't all the same size.
+  """
+  if not isinstance(cyclegan_model, namedtuples.CycleGANModel):
+    raise ValueError('`cyclegan_model` was not a CycleGANModel. Instead, was '
+                     '%s' % type(cyclegan_model))
+
+  _assert_is_image(cyclegan_model.model_x2y.generator_inputs)
+  _assert_is_image(cyclegan_model.model_x2y.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_x)
+  _assert_is_image(cyclegan_model.model_y2x.generator_inputs)
+  _assert_is_image(cyclegan_model.model_y2x.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_y)
+
+  def _add_comparison_summary(gan_model, reconstructions):
+    image_list = (array_ops.unstack(gan_model.generator_inputs[:1]) +
+                  array_ops.unstack(gan_model.generated_data[:1]) +
+                  array_ops.unstack(reconstructions[:1]))
+    summary.image(
+        'image_comparison', eval_utils.image_reshaper(
+            image_list, num_cols=len(image_list)), max_outputs=1)
+
+  with ops.name_scope('x2y_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_x2y, cyclegan_model.reconstructed_x)
+  with ops.name_scope('y2x_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_y2x, cyclegan_model.reconstructed_y)
+
+
 def add_image_comparison_summaries(gan_model, num_comparisons=2,
                                    display_diffs=False):
   """Adds image summaries to compare triplets of images.
@@ -109,15 +148,6 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2,
     ValueError: If the generator input, real, and generated data aren't all the
       same size.
   """
-  if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_y2x, **saved_params)
-    return
-
   _assert_is_image(gan_model.generator_inputs)
   _assert_is_image(gan_model.generated_data)
   _assert_is_image(gan_model.real_data)
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 45eb108586..33d51bfc21 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -65,15 +65,14 @@ def get_cyclegan_model():
   return namedtuples.CycleGANModel(
       model_x2y=model_x2y,
       model_y2x=model_y2x,
-      reconstructed_x=array_ops.zeros([3, 30, 35, 6]),
-      reconstructed_y=array_ops.zeros([3, 30, 35, 6]))
+      reconstructed_x=array_ops.zeros([4, 32, 32, 3]),
+      reconstructed_y=array_ops.zeros([4, 32, 32, 3]))
 
 
 class SummariesTest(test.TestCase):
 
-  def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
-                                               expected_num_summary_ops,
-                                               model_summaries):
+  def _test_add_gan_model_image_summaries_impl(
+      self, get_model_fn, expected_num_summary_ops, model_summaries):
     summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
                                             model_summaries=model_summaries)
 
@@ -89,8 +88,9 @@ class SummariesTest(test.TestCase):
   def test_add_gan_model_image_summaries_no_model(self):
     self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False)
 
-  def test_add_gan_model_image_summaries_for_cyclegan(self):
-    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10, True)
+  def test_cyclegan_image_summaries_dont_work(self):
+    with self.assertRaises(ValueError):
+      summaries.add_gan_model_image_summaries(get_cyclegan_model())
 
   def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                          expected_num_summary_ops):
@@ -137,7 +137,11 @@ class SummariesTest(test.TestCase):
     self._test_add_image_comparison_summaries_impl(get_gan_model, 1)
 
   def test_add_image_comparison_summaries_for_cyclegan(self):
-    self._test_add_image_comparison_summaries_impl(get_cyclegan_model, 2)
+    summaries.add_cyclegan_image_summaries(get_cyclegan_model())
+
+    self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+    with self.test_session(use_gpu=True):
+      summary.merge_all().eval()
 
 
 if __name__ == '__main__':
-- 
GitLab


From 7ca4101eac5cb580400d48ef683d812b44bc45bd Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Mon, 2 Apr 2018 09:29:33 -0700
Subject: [PATCH 0139/1262] Only test types supported and change log_eps for
 bfloat16.

PiperOrigin-RevId: 191302894
---
 tensorflow/compiler/tests/cholesky_op_test.py             | 7 +++++++
 .../compiler/tests/matrix_triangular_solve_op_test.py     | 8 ++++++++
 tensorflow/compiler/tests/unary_ops_test.py               | 5 ++++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py
index 5010fe5e21..1a8989d7c2 100644
--- a/tensorflow/compiler/tests/cholesky_op_test.py
+++ b/tensorflow/compiler/tests/cholesky_op_test.py
@@ -34,6 +34,13 @@ from tensorflow.python.platform import test
 
 class CholeskyOpTest(XLATestCase):
 
+  # Cholesky defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/cholesky)
+  @property
+  def float_types(self):
+    return set(super(CholeskyOpTest, self).float_types).intersection(
+        (np.float64, np.float32, np.complex64, np.complex128))
+
   def _verifyCholeskyBase(self, sess, placeholder, x, chol, verification, atol):
     chol_np, verification_np = sess.run([chol, verification], {placeholder: x})
     self.assertAllClose(x, verification_np, atol=atol)
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
index cccb7f5789..5819b2bf2b 100644
--- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -37,6 +37,14 @@ def MakePlaceholder(x):
 
 class MatrixTriangularSolveOpTest(XLATestCase):
 
+  #  MatrixTriangularSolve defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/matrix_triangular_solve)
+  @property
+  def float_types(self):
+    return set(super(MatrixTriangularSolveOpTest,
+                     self).float_types).intersection(
+                         (np.float64, np.float32, np.complex64, np.complex128))
+
   def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
                                  placeholder_b, a, clean_a, b, verification,
                                  atol):
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index a8ab235378..17149aa1c8 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -793,7 +793,10 @@ class UnaryOpsTest(XLATestCase):
       self._assertSoftplusMatchesExpected([[-2, 0, 8]], dtype)
       self._assertSoftplusMatchesExpected(
           [[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]], dtype)
-      log_eps = np.log(np.finfo(dtype).eps)
+      if dtype == dtypes.bfloat16.as_numpy_dtype:
+        log_eps = np.log(np.finfo(np.float32).eps)
+      else:
+        log_eps = np.log(np.finfo(dtype).eps)
       one = dtype(1)
       ten = dtype(10)
       self._assertSoftplusMatchesExpected([
-- 
GitLab


From 861f7a3ecc3c5ecbc22d2fa731956f4ff0469c25 Mon Sep 17 00:00:00 2001
From: Sergio Guadarrama <sguada@google.com>
Date: Mon, 2 Apr 2018 09:49:35 -0700
Subject: [PATCH 0140/1262] Automated g4 rollback of changelist 191127281

PiperOrigin-RevId: 191305220
---
 tensorflow/core/common_runtime/mkl_cpu_allocator.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
index 43a909466e..829c19204a 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -19,9 +19,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-constexpr const char* MklCPUAllocator::kMaxLimitStr;
-constexpr const size_t MklCPUAllocator::kDefaultMaxLimit;
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
-- 
GitLab


From 136b6095c42f6591c70a9f640598ff8398348b40 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 10:14:35 -0700
Subject: [PATCH 0141/1262] Additional arg scope test that demonstrate how
 nested arg_scope objects behave.

PiperOrigin-RevId: 191308666
---
 .../framework/python/ops/arg_scope_test.py    | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
index 7ba9d4ffa9..4c3879d4fc 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
@@ -170,6 +170,30 @@ class ArgScopeTest(test.TestCase):
         self.assertTupleEqual(args, func1_args)
         self.assertDictEqual(kwargs, func1_kwargs)
 
+  def testNestedArgScopeObjectCreatedOutsideScopeOverridesArgScope(self):
+
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    scope_object = get_scope_object()
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(scope_object):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1]})
+
+  def testArgScopeObjectCreatedWithinScopeInheritsArgScope(self):
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(get_scope_object()):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1], 'd': 10})
+
   def testSharedArgScope(self):
     func1_args = (0,)
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
-- 
GitLab


From 3edab0abb1213f88507692042a320abc695ff674 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 10:27:12 -0700
Subject: [PATCH 0142/1262] Remove reshape of sparse tensor indices in for
 maybe_batch.

PiperOrigin-RevId: 191310753
---
 tensorflow/python/training/input.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 44f00a96de..caa26581e8 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -515,8 +515,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
     def _sparse_values_to_keep(t, keep_input):
       """Convert a per-row `keep_input` vector to a per-value one."""
       # Get the rows of every value in the sparse Tensor.
-      row_values = array_ops.reshape(
-          t.indices, [array_ops.shape(t.indices)[0], -1])[:, 0]
+      row_values = t.indices[:, 0]
       # The value should be kept iff the row should be kept.
       return array_ops.gather(keep_input, row_values)
     if keep_input.shape.ndims == 1:
-- 
GitLab


From 82bb63b958f5e39fb57ad45147b7d2c662c9b06c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 10:50:09 -0700
Subject: [PATCH 0143/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191314576
---
 tensorflow/core/framework/attr_value_util.cc  |  26 +--
 .../core/framework/common_shape_fns_test.cc   |  18 +-
 tensorflow/core/framework/dataset.h           |   2 +-
 tensorflow/core/framework/function.cc         |   5 +-
 tensorflow/core/framework/function_test.cc    |   2 +-
 tensorflow/core/framework/graph_def_util.cc   |   3 +-
 .../core/framework/node_def_builder_test.cc   |   5 +-
 tensorflow/core/framework/node_def_util.cc    |   7 +-
 .../core/framework/node_def_util_test.cc      |   5 +-
 tensorflow/core/framework/op.cc               |   3 +-
 .../core/framework/op_compatibility_test.cc   |   7 +-
 tensorflow/core/framework/op_def_builder.cc   |  49 ++--
 tensorflow/core/framework/op_def_util.cc      |  12 +-
 tensorflow/core/framework/op_def_util_test.cc |   2 +-
 tensorflow/core/framework/op_gen_lib.cc       |  21 +-
 tensorflow/core/framework/op_kernel.cc        |   2 +-
 tensorflow/core/framework/op_kernel_test.cc   |  19 +-
 .../core/framework/resource_mgr_test.cc       |   3 +-
 .../core/framework/shape_inference_test.cc    | 209 ++++++++----------
 .../framework/shape_inference_testutil.cc     |   8 +-
 .../core/framework/shape_inference_testutil.h |  23 +-
 .../shape_inference_testutil_test.cc          |  14 +-
 tensorflow/core/framework/types.cc            |   2 +-
 tensorflow/core/framework/types_test.cc       |   6 +-
 .../core/framework/variant_op_copy_test.cc    |  10 +-
 .../framework/variant_op_registry_test.cc     |  11 +-
 26 files changed, 242 insertions(+), 232 deletions(-)

diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index ebb56d525e..87c1ddd15d 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -186,7 +186,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   // check if has_list is false and some other field in attr_value is
   // set to flag the error.  This test can be made more strict once
   // support for GraphDef versions <= 4 is dropped.
-  if (StringPiece(type).starts_with("list(") && !attr_value.has_list()) {
+  if (str_util::StartsWith(type, "list(") && !attr_value.has_list()) {
     if (num_set) {
       return errors::InvalidArgument(
           "AttrValue missing value with expected type '", type, "'");
@@ -197,7 +197,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   }
 
   // Okay to have an empty list, but not to be missing a non-list value.
-  if (num_set == 0 && !StringPiece(type).starts_with("list(")) {
+  if (num_set == 0 && !str_util::StartsWith(type, "list(")) {
     return errors::InvalidArgument(
         "AttrValue missing value with expected type '", type, "'");
   }
@@ -241,29 +241,29 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
 bool ParseAttrValue(StringPiece type, StringPiece text, AttrValue* out) {
   // Parse type.
   string field_name;
-  bool is_list = type.Consume("list(");
-  if (type.Consume("string")) {
+  bool is_list = str_util::ConsumePrefix(&type, "list(");
+  if (str_util::ConsumePrefix(&type, "string")) {
     field_name = "s";
-  } else if (type.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&type, "int")) {
     field_name = "i";
-  } else if (type.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&type, "float")) {
     field_name = "f";
-  } else if (type.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&type, "bool")) {
     field_name = "b";
-  } else if (type.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&type, "type")) {
     field_name = "type";
-  } else if (type.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&type, "shape")) {
     field_name = "shape";
-  } else if (type.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&type, "tensor")) {
     field_name = "tensor";
-  } else if (type.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&type, "func")) {
     field_name = "func";
-  } else if (type.Consume("placeholder")) {
+  } else if (str_util::ConsumePrefix(&type, "placeholder")) {
     field_name = "placeholder";
   } else {
     return false;
   }
-  if (is_list && !type.Consume(")")) {
+  if (is_list && !str_util::ConsumePrefix(&type, ")")) {
     return false;
   }
 
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 5f3e5ad457..13d429b895 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -140,9 +141,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 1"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 1"));
   }
 
   {
@@ -161,10 +161,9 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5}), S({3, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains(
-                "Invalid argument: Dimensions must be equal, but are 5 and 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Invalid argument: Dimensions must be equal, but are 5 and 3"));
   }
 
   {
@@ -173,9 +172,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5, 3}), S({3, 5, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 3"));
   }
 
   {
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index beaf0adbc5..fb1fe9c51f 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -201,7 +201,7 @@ class GraphDefBuilderWrapper {
   // Also looks up the `op_def->name` in the global
   // `WhitelistedStatefulOpRegistry`.
   bool IsOpWhitelisted(const OpDef* op_def) const {
-    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+    return (str_util::EndsWith(op_def->name(), "Dataset") &&
             op_def->output_arg_size() == 1 &&
             op_def->output_arg(0).type() == DT_VARIANT) ||
            dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 3e7b89d4eb..bdc1af9fda 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -278,7 +279,7 @@ class FunctionInstantiationHelper {
       auto it = index_.lower_bound(node_name);
       while (it != index_.end() && it->first <= node_colon_bound) {
         if (it->first == node_name ||
-            tensorflow::StringPiece(it->first).starts_with(node_colon)) {
+            tensorflow::str_util::StartsWith(it->first, node_colon)) {
           nid = it->second.nid;
           break;
         }
@@ -502,7 +503,7 @@ string Print(const NodeDef& n) {
   std::vector<StringPiece> dat;
   std::vector<string> dep;
   for (StringPiece s : n.input()) {
-    if (s.Consume("^")) {
+    if (str_util::ConsumePrefix(&s, "^")) {
       dep.push_back(s.ToString());
     } else {
       dat.push_back(s);
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 23685e9c53..44e1383719 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -496,7 +496,7 @@ MySelect(x:float) -> (z:float) {
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index 896cb3cd7f..f7539d37be 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb_text.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -94,7 +95,7 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
   std::vector<string> to_remove;
   for (const auto& attr : node_def->attr()) {
     // If the attr is not in consumer_op_def and doesn't start with '_'...
-    if (!StringPiece(attr.first).starts_with("_") &&
+    if (!str_util::StartsWith(attr.first, "_") &&
         FindAttr(attr.first, *consumer_op_def) == nullptr) {
       const OpDef::AttrDef* producer_attr_def =
           FindAttr(attr.first, *producer_op_def);
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index e836873f66..cc583df348 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,7 +83,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
     for (const string& message : messages) {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
           << status << ", " << message;
     }
   }
@@ -103,7 +104,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     }
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
-    EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
         << "Actual error: " << status.error_message()
         << "\nDoes not contain: " << message;
   }
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 95fb386314..bad92ca9b3 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -131,7 +132,7 @@ Status AttrSlice::Find(StringPiece attr_name,
   // Skip AttachDef for internal attrs since it is a little bit
   // expensive and it is common for them to correctly not be included
   // in a NodeDef.
-  if (!attr_name.starts_with("_") && ndef_ != nullptr) {
+  if (!str_util::StartsWith(attr_name, "_") && ndef_ != nullptr) {
     s = AttachDef(s, *ndef_);
   }
   return s;
@@ -399,7 +400,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   size_t num_inputs = 0;
   // TODO(josh11b): Unify the input field validation.
   for (const string& input : node_def.input()) {
-    if (StringPiece(input).starts_with("^")) {
+    if (str_util::StartsWith(input, "^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
         return errors::InvalidArgument(
@@ -425,7 +426,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   }
   for (const auto& attr : node_def.attr()) {
     // Allow internal optional attributes with names starting with "_".
-    if (StringPiece(attr.first).starts_with("_")) {
+    if (str_util::StartsWith(attr.first, "_")) {
       continue;
     }
     auto iter = op_attrs.find(attr.first);
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index ae3a93eafe..2a49425dba 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -65,7 +66,7 @@ void ExpectFailure(const NodeDef& bad, const OpDef& op_def,
       << "; OpDef: " << SummarizeOpDef(op_def);
 
   LOG(INFO) << "Message: " << status.error_message();
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
       << "NodeDef: " << SummarizeNodeDef(bad)
       << "; OpDef: " << SummarizeOpDef(op_def) << "\nActual error: " << status
       << "\nDoes not contain: " << message;
@@ -265,7 +266,7 @@ void ExpectInvalidSyntax(const NodeDef& bad, const string& message) {
   EXPECT_TRUE(errors::IsInvalidArgument(status))
       << status << "; NodeDef: " << SummarizeNodeDef(bad);
 
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.ToString()), message))
       << "NodeDef: " << SummarizeNodeDef(bad) << ", " << status << ", "
       << message;
 }
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index fc5467b3c8..5f68c59fe9 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -142,7 +143,7 @@ void OpRegistry::Export(bool include_internal, OpList* ops) const {
   out->Reserve(sorted.size());
 
   for (const auto& item : sorted) {
-    if (include_internal || !StringPiece(item.first).starts_with("_")) {
+    if (include_internal || !str_util::StartsWith(item.first, "_")) {
       *out->Add() = item.second->op_def;
     }
   }
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index b57bdcb841..c782480f1f 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -96,7 +97,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeOpDef(old_op_def) << " vs. "
                     << SummarizeOpDef(new_op_def);
     } else {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(error))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
           << status << " does not contain " << error;
     }
   }
@@ -118,7 +119,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeNodeDef(*node_def());
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(validation_error))
+          str_util::StrContains(status.error_message(), validation_error))
           << status << " does not contain " << validation_error;
     }
 
@@ -179,7 +180,7 @@ class OpCompatibilityTest : public OpsTestBase {
                     << SummarizeOpDef(*new_op_def);
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(compatibility_error))
+          str_util::StrContains(status.error_message(), compatibility_error))
           << status << " does not contain " << compatibility_error;
     }
   }
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 962bc11ccb..403bd0b5e2 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -112,9 +112,11 @@ bool ConsumeAttrNumber(StringPiece* sp, int64* out) {
 
 bool ConsumeCompoundAttrType(StringPiece* sp, StringPiece* out) {
   auto capture_begin = sp->begin();
-  if (sp->Consume("numbertype") || sp->Consume("numerictype") ||
-      sp->Consume("quantizedtype") || sp->Consume("realnumbertype") ||
-      sp->Consume("realnumberictype")) {
+  if (str_util::ConsumePrefix(sp, "numbertype") ||
+      str_util::ConsumePrefix(sp, "numerictype") ||
+      str_util::ConsumePrefix(sp, "quantizedtype") ||
+      str_util::ConsumePrefix(sp, "realnumbertype") ||
+      str_util::ConsumePrefix(sp, "realnumberictype")) {
     *out = StringPiece(capture_begin, sp->begin() - capture_begin);
     return true;
   }
@@ -155,32 +157,32 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   bool is_list = ConsumeListPrefix(&spec);
   string type;
   StringPiece type_string;  // Used if type == "type"
-  if (spec.Consume("string")) {
+  if (str_util::ConsumePrefix(&spec, "string")) {
     type = "string";
-  } else if (spec.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&spec, "int")) {
     type = "int";
-  } else if (spec.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&spec, "float")) {
     type = "float";
-  } else if (spec.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&spec, "bool")) {
     type = "bool";
-  } else if (spec.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&spec, "type")) {
     type = "type";
-  } else if (spec.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&spec, "shape")) {
     type = "shape";
-  } else if (spec.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&spec, "tensor")) {
     type = "tensor";
-  } else if (spec.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&spec, "func")) {
     type = "func";
   } else if (ConsumeCompoundAttrType(&spec, &type_string)) {
     type = "type";
     AttrValue* allowed = attr->mutable_allowed_values();
     VERIFY(ProcessCompoundType(type_string, allowed),
            "Expected to see a compound type, saw: ", type_string);
-  } else if (spec.Consume("{")) {
+  } else if (str_util::ConsumePrefix(&spec, "{")) {
     // e.g. "{ int32, float, bool }" or "{ \"foo\", \"bar\" }"
     AttrValue* allowed = attr->mutable_allowed_values();
     str_util::RemoveLeadingWhitespace(&spec);
-    if (spec.starts_with("\"") || spec.starts_with("'")) {
+    if (str_util::StartsWith(spec, "\"") || str_util::StartsWith(spec, "'")) {
       type = "string";  // "{ \"foo\", \"bar\" }" or "{ 'foo', 'bar' }"
       while (true) {
         StringPiece escaped_string;
@@ -193,11 +195,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                "Trouble unescaping \"", escaped_string,
                "\", got error: ", error);
         allowed->mutable_list()->add_s(unescaped);
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after strings in list, not: '", spec, "'");
           break;
         }
@@ -215,11 +218,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                  "Unrecognized type string '", type_string, "'");
           allowed->mutable_list()->add_type(dt);
         }
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after types in list, not: '", spec, "'");
           break;
         }
@@ -232,7 +236,8 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
 
   // Write the type into *attr.
   if (is_list) {
-    VERIFY(spec.Consume(")"), "Expected ) to close 'list(', not: '", spec, "'");
+    VERIFY(str_util::ConsumePrefix(&spec, ")"),
+           "Expected ) to close 'list(', not: '", spec, "'");
     str_util::RemoveLeadingWhitespace(&spec);
     attr->set_type(strings::StrCat("list(", type, ")"));
   } else {
@@ -240,7 +245,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Read optional minimum constraint at the end.
-  if ((is_list || type == "int") && spec.Consume(">=")) {
+  if ((is_list || type == "int") && str_util::ConsumePrefix(&spec, ">=")) {
     int64 min_limit = -999;
     VERIFY(ConsumeAttrNumber(&spec, &min_limit),
            "Could not parse integer lower limit after '>=', found '", spec,
@@ -250,7 +255,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Parse default value, if present.
-  if (spec.Consume("=")) {
+  if (str_util::ConsumePrefix(&spec, "=")) {
     str_util::RemoveLeadingWhitespace(&spec);
     VERIFY(ParseAttrValue(attr->type(), spec, attr->mutable_default_value()),
            "Could not parse default value '", spec, "'");
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index c80802aad3..9be0dc69d2 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -239,7 +240,7 @@ static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
 Status ValidateOpDef(const OpDef& op_def) {
   using ::tensorflow::strings::Scanner;
 
-  if (!StringPiece(op_def.name()).starts_with("_")) {
+  if (!str_util::StartsWith(op_def.name(), "_")) {
     VALIDATE(Scanner(op_def.name())
                  .One(Scanner::UPPERLETTER)
                  .Any(Scanner::LETTER_DIGIT)
@@ -259,11 +260,11 @@ Status ValidateOpDef(const OpDef& op_def) {
 
     // Validate type
     StringPiece type(attr.type());
-    bool is_list = type.Consume("list(");
+    bool is_list = str_util::ConsumePrefix(&type, "list(");
     bool found = false;
     for (StringPiece valid : {"string", "int", "float", "bool", "type", "shape",
                               "tensor", "func"}) {
-      if (type.Consume(valid)) {
+      if (str_util::ConsumePrefix(&type, valid)) {
         found = true;
         break;
       }
@@ -271,8 +272,9 @@ Status ValidateOpDef(const OpDef& op_def) {
     VALIDATE(found, "Unrecognized type '", type, "' in attr '", attr.name(),
              "'");
     if (is_list) {
-      VALIDATE(type.Consume(")"), "'list(' is missing ')' in attr ",
-               attr.name(), "'s type ", attr.type());
+      VALIDATE(str_util::ConsumePrefix(&type, ")"),
+               "'list(' is missing ')' in attr ", attr.name(), "'s type ",
+               attr.type());
     }
     VALIDATE(type.empty(), "Extra '", type, "' at the end of attr ",
              attr.name(), "'s type ", attr.type());
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 2b9812d4fc..4514d92e38 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -57,7 +57,7 @@ class ValidateOpDefTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << "Did not see error with: " << message;
     if (!status.ok()) {
       LOG(INFO) << "message: " << status;
-      EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
           << "Actual: " << status << "\nExpected to contain: " << message;
     }
   }
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 5f2eb9d99a..7f23272871 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -50,10 +50,10 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
     StringPiece to_append = str.substr(0, space);
     str.remove_prefix(space + 1);
     // Remove spaces at break.
-    while (to_append.ends_with(" ")) {
+    while (str_util::EndsWith(to_append, " ")) {
       to_append.remove_suffix(1);
     }
-    while (str.Consume(" ")) {
+    while (str_util::ConsumePrefix(&str, " ")) {
     }
 
     // Go on to the next line.
@@ -65,8 +65,9 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
 }
 
 bool ConsumeEquals(StringPiece* description) {
-  if (description->Consume("=")) {
-    while (description->Consume(" ")) {  // Also remove spaces after "=".
+  if (str_util::ConsumePrefix(description, "=")) {
+    while (str_util::ConsumePrefix(description,
+                                   " ")) {  // Also remove spaces after "=".
     }
     return true;
   }
@@ -98,7 +99,7 @@ static bool StartsWithFieldName(StringPiece line,
                                 const std::vector<string>& multi_line_fields) {
   StringPiece up_to_colon;
   if (!SplitAt(':', &line, &up_to_colon)) return false;
-  while (up_to_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&up_to_colon, " "))
     ;  // Remove leading spaces.
   for (const auto& field : multi_line_fields) {
     if (up_to_colon == field) {
@@ -119,9 +120,9 @@ static bool ConvertLine(StringPiece line,
   StringPiece up_to_colon;
   StringPiece after_colon = line;
   SplitAt(':', &after_colon, &up_to_colon);
-  while (after_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&after_colon, " "))
     ;  // Remove leading spaces.
-  if (!after_colon.Consume("\"")) {
+  if (!str_util::ConsumePrefix(&after_colon, "\"")) {
     // We only convert string fields, so don't convert this line.
     return false;
   }
@@ -181,9 +182,9 @@ string PBTxtToMultiline(StringPiece pbtxt,
 static bool FindMultiline(StringPiece line, size_t colon, string* end) {
   if (colon == StringPiece::npos) return false;
   line.remove_prefix(colon + 1);
-  while (line.Consume(" ")) {
+  while (str_util::ConsumePrefix(&line, " ")) {
   }
-  if (line.Consume("<<")) {
+  if (str_util::ConsumePrefix(&line, "<<")) {
     *end = line.ToString();
     return true;
   }
@@ -228,7 +229,7 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
     string suffix;
     while (!multiline_pbtxt.empty()) {
       SplitAt('\n', &multiline_pbtxt, &line);
-      if (line.Consume(end)) break;
+      if (str_util::ConsumePrefix(&line, end)) break;
       if (first) {
         first = false;
       } else {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 9ec1c213c3..6ba196cc34 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -96,7 +96,7 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       output_memory_types_(context->output_memory_types().begin(),
                            context->output_memory_types().end()),
       graph_def_version_(context->graph_def_version()),
-      is_internal_(StringPiece(type_string()).starts_with("_")),
+      is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index b53b877f28..bcd409e5c5 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -546,9 +546,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
                                             {"T|list(type)|[DT_FLOAT]"}));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
-  EXPECT_TRUE(
-      StringPiece(GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}))
-          .contains("Invalid argument: "));
+  EXPECT_TRUE(str_util::StrContains(
+      GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}),
+      "Invalid argument: "));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {"T|int|7"},
                 error::INVALID_ARGUMENT);
@@ -565,8 +565,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernel", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
 }
@@ -585,8 +585,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernelForT", DEVICE_CPU, {"T|type|DT_FLOAT"},
                 error::INVALID_ARGUMENT);
@@ -606,8 +606,9 @@ TEST_F(OpKernelBuilderTest, BadConstraint) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("OpKernel 'BadConstraint' has constraint on attr "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "OpKernel 'BadConstraint' has constraint on attr "
                             "'T' not in NodeDef"));
 
   ExpectFailure("BadConstraint", DEVICE_CPU, {"dtype|type|DT_FLOAT"},
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 07272e2374..798220d4c3 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -71,7 +72,7 @@ string LookupOrCreate(ResourceMgr* rm, const string& container,
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index f48a7b9c47..da103bfec9 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -152,10 +153,9 @@ TEST_F(ShapeInferenceTest, Run) {
     };
     Status s = c.Run(fn);
     // Extra error message is attached when Run fails.
-    EXPECT_TRUE(StringPiece(s.ToString())
-                    .contains("Shape must be at most rank 0 but "
-                              "is rank 1 for 'foo' (op: "
-                              "'foo_op')"))
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Shape must be at most rank 0 but is rank 1 for 'foo' (op: 'foo_op')"))
         << s;
   }
 }
@@ -367,10 +367,9 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtMost(in1, 2, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at most rank 2 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtMost(in1, 2, &s1).ToString(),
+      "Invalid argument: Shape must be at most rank 2 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtMost(in1, 3, &s1).ok());
@@ -406,10 +405,9 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtLeast(in1, 4, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 4 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtLeast(in1, 4, &s1).ToString(),
+      "Invalid argument: Shape must be at least rank 4 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtLeast(in1, 3, &s1).ok());
@@ -449,12 +447,14 @@ TEST_F(ShapeInferenceTest, WithValue) {
   // WithValue on dimension with known size.
   out1 = d0;
 
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 0, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 0 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 0, &out1).ToString(),
+                            "Invalid argument: Dimension must be 0 but is 1"));
   EXPECT_FALSE(IsSet(out1));
   out1 = d0;
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 2, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 2 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 2, &out1).ToString(),
+                            "Invalid argument: Dimension must be 2 but is 1"));
 
   EXPECT_FALSE(IsSet(out1));
   EXPECT_TRUE(c.WithValue(d0, 1, &out1).ok());
@@ -513,16 +513,14 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_EQ(3, merged_dims.size());
 
   // Merging unequal values is an error.
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d2, d1, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 2 and 1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d2, d1, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 2 and 1"));
 
   EXPECT_FALSE(IsSet(out));
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d1, d2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d1, d2, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -729,26 +727,23 @@ TEST_F(ShapeInferenceTest, MergeShape) {
 
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_u_2, s_1_3, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 2 and 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_u_2, s_1_3, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 2 and 3"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1_3, s_u_2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 3 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1_3, s_u_2, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 3 and 2"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1, s_1_2, &out).ToString())
-          .contains(
-              "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1, s_1_2, &out).ToString(),
+      "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -795,22 +790,18 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   // Incompatible merges give errors and set outs to nullptr.
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 3 but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Shape must be at least rank 3 but is rank 2"));
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 }
@@ -868,24 +859,21 @@ TEST_F(ShapeInferenceTest, Subshape) {
 
   // Errors.
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 6, -3, &out).ToString())
-                  .contains("Invalid argument: Subshape must have computed "
-                            "start <= end, but is 5 "
-                            "and 2 (computed from start 6 and end -3 over "
-                            "shape with rank 5)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subshape(in0, 6, -3, &out).ToString(),
+      "Invalid argument: Subshape must have computed start <= end, but is 5 "
+      "and 2 (computed from start 6 and end -3 over shape with rank 5)"));
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, -50, 100, &out).ToString())
-                  .contains("Invalid argument: Subshape start out of "
-                            "bounds: -50, for shape with "
-                            "rank 5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, -50, 100, &out).ToString(),
+                                    "Invalid argument: Subshape start out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 0, -50, &out).ToString())
-                  .contains("Invalid argument: Subshape end out of bounds: "
-                            "-50, for shape with rank "
-                            "5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, 0, -50, &out).ToString(),
+                                    "Invalid argument: Subshape end out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1094,27 +1082,26 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) {
   EXPECT_EQ("[]", create(&t));
 
   t = ::tensorflow::test::AsTensor<float>({1, 2, 3});
-  EXPECT_TRUE(
-      StringPiece(create(&t))
-          .contains("Input tensor must be int32 or int64, but was float"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be int32 or int64, but was float"));
 
   t = ::tensorflow::test::AsScalar<int32>(1);
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be rank 1, but was rank 0"));
 
   t = ::tensorflow::test::AsTensor<int32>({1, 2}, TensorShape{2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be rank 1, but was rank 2"));
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int64>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int32>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test when the input shape is wrong.
   {
@@ -1172,9 +1159,9 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
   EXPECT_TRUE(c.MakeShapeFromShapeProto(proto, &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
   proto.add_dim()->set_size(0);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("An unknown shape must not have any dimensions set."));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "An unknown shape must not have any dimensions set."));
   EXPECT_FALSE(IsSet(out));
 
   // With known rank.
@@ -1188,10 +1175,10 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
 
   // With invalid dimension value.
   proto.add_dim()->set_size(-2);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("Shape [0,?,1000,-2] has dimensions with values below -1 "
-                    "(where -1 means unknown)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "Shape [0,?,1000,-2] has dimensions with values below -1 "
+      "(where -1 means unknown)"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1257,9 +1244,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 
   // Same tests, with int64 values.
   t1 = tensorflow::test::AsScalar<int64>(20);
@@ -1267,9 +1255,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, GetAttr) {
@@ -1322,33 +1311,33 @@ TEST_F(ShapeInferenceTest, Divide) {
   EXPECT_TRUE(c.Divide(d_6, d_2, evenly_divisible, &out).ok());
   EXPECT_EQ("3", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 5, evenly_divisible, &out).error_message())
-          .contains("Dimension size must be evenly divisible by 5 but is 6"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 5, evenly_divisible, &out).error_message(),
+      "Dimension size must be evenly divisible by 5 but is 6"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, d_0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, d_0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 
   // Repeat error cases above with evenly_divisible=false.
   evenly_divisible = false;
   EXPECT_TRUE(c.Divide(d_6, 5, evenly_divisible, &out).ok());
   EXPECT_EQ("1", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, Add) {
@@ -1396,11 +1385,9 @@ TEST_F(ShapeInferenceTest, Add) {
   EXPECT_TRUE(c.Add(d_0, d_6, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out)
-                      .error_message())
-          .contains(
-              "Dimension size overflow from adding 6 and 9223372036854775802"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out).error_message(),
+      "Dimension size overflow from adding 6 and 9223372036854775802"));
 }
 
 TEST_F(ShapeInferenceTest, Subtract) {
@@ -1448,9 +1435,9 @@ TEST_F(ShapeInferenceTest, Subtract) {
   EXPECT_TRUE(c.Subtract(d_6, d_0, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Subtract(d_5, d_6, &out).error_message())
-          .contains("Negative dimension size caused by subtracting 6 from 5"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subtract(d_5, d_6, &out).error_message(),
+      "Negative dimension size caused by subtracting 6 from 5"));
 }
 
 TEST_F(ShapeInferenceTest, Multiply) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc
index b4765ab0b2..b54dd220ab 100644
--- a/tensorflow/core/framework/shape_inference_testutil.cc
+++ b/tensorflow/core/framework/shape_inference_testutil.cc
@@ -100,7 +100,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
       }
     }
 
-    if (expected.starts_with("in")) {
+    if (str_util::StartsWith(expected, "in")) {
       if (in_index == -1) {
         return Unknown(err_prefix,
                        " should have matched an input shape by "
@@ -135,7 +135,9 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
     }
 
     // Verify the dimensions.
-    CHECK(expected.starts_with("[") && expected.ends_with("]")) << expected;
+    CHECK(str_util::StartsWith(expected, "[") &&
+          str_util::EndsWith(expected, "]"))
+        << expected;
     expected.remove_prefix(1);
     expected.remove_suffix(1);
 
@@ -176,7 +178,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
           return Unknown(err_prefix, " expected to be unknown but was ",
                          c.Value(out_dim), err_suffix);
         }
-      } else if (expected_dim.starts_with("d")) {
+      } else if (str_util::StartsWith(expected_dim, "d")) {
         // Compare the dimension values.
         auto v = str_util::Split(expected_dim, '|');
         if (in_dim_idx.first == -1) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index 7977841482..2a99af7659 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/version.h"
 
@@ -83,17 +84,17 @@ class ShapeInferenceTestutil {
       "", ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
               op, i, o)                                                       \
               .error_message())
-#define INFER_ERROR(error_substring, op, i)                                 \
-  {                                                                         \
-    string error_message =                                                  \
-        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
-            op, i, "e")                                                     \
-            .error_message();                                               \
-    const string& substring = error_substring;                              \
-    EXPECT_NE("", error_message);                                           \
-    EXPECT_TRUE(StringPiece(error_message).contains(substring))             \
-        << "Expected to see '" << substring << "' in '" << error_message    \
-        << "'";                                                             \
+#define INFER_ERROR(error_substring, op, i)                                    \
+  {                                                                            \
+    string error_message =                                                     \
+        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes(    \
+            op, i, "e")                                                        \
+            .error_message();                                                  \
+    const string& substring = error_substring;                                 \
+    EXPECT_NE("", error_message);                                              \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(error_message, substring)) \
+        << "Expected to see '" << substring << "' in '" << error_message       \
+        << "'";                                                                \
   }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/shape_inference_testutil_test.cc b/tensorflow/core/framework/shape_inference_testutil_test.cc
index 20a6807064..a4405b502c 100644
--- a/tensorflow/core/framework/shape_inference_testutil_test.cc
+++ b/tensorflow/core/framework/shape_inference_testutil_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -25,10 +26,11 @@ namespace shape_inference {
 
 namespace {
 
-#define EXPECT_CONTAINS(str, substr)                                 \
-  do {                                                               \
-    string s = (str);                                                \
-    EXPECT_TRUE(StringPiece(s).contains(substr)) << "String: " << s; \
+#define EXPECT_CONTAINS(str, substr)                            \
+  do {                                                          \
+    string s = (str);                                           \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(s, substr)) \
+        << "String: " << s;                                     \
   } while (false)
 
 static OpShapeInferenceFn* global_fn_ptr = nullptr;
@@ -97,8 +99,8 @@ TEST(ShapeInferenceTestutilTest, Failures) {
   auto error_message = ShapeInferenceTestutil::InferShapes(
                            ShapeInferenceTestOp("NoSuchOp"), "", "")
                            .error_message();
-  EXPECT_TRUE(StringPiece(error_message)
-                  .starts_with("Op type not registered 'NoSuchOp'"));
+  EXPECT_TRUE(
+      str_util::StartsWith(error_message, "Op type not registered 'NoSuchOp'"));
 
   // Wrong shape error messages.
   EXPECT_CONTAINS(RunInferShapes(op, "[1];[2];[1]", "?", fn_copy_input_0),
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index adf4e1bae3..2280114de5 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -114,7 +114,7 @@ string DataTypeString(DataType dtype) {
 }
 
 bool DataTypeFromString(StringPiece sp, DataType* dt) {
-  if (sp.ends_with("_ref")) {
+  if (str_util::EndsWith(sp, "_ref")) {
     sp.remove_suffix(4);
     DataType non_ref;
     if (DataTypeFromString(sp, &non_ref) && !IsRefType(non_ref)) {
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 60f2b4135a..16b069c70a 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 
 #include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -140,9 +141,8 @@ TEST(TypesTest, ComplexTypes) {
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
-    const StringPiece n = name;
-    EXPECT_EQ(DataTypeIsInteger(dt),
-              n.starts_with("int") || n.starts_with("uint"))
+    EXPECT_EQ(DataTypeIsInteger(dt), str_util::StartsWith(name, "int") ||
+                                         str_util::StartsWith(name, "uint"))
         << "DataTypeInteger failed for " << name;
   }
 }
diff --git a/tensorflow/core/framework/variant_op_copy_test.cc b/tensorflow/core/framework/variant_op_copy_test.cc
index 85e014f804..60fa7bd559 100644
--- a/tensorflow/core/framework/variant_op_copy_test.cc
+++ b/tensorflow/core/framework/variant_op_copy_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/port.h"
 
@@ -259,8 +260,8 @@ TEST(VariantOpCopyTest, CreateConstOnGPUFailsGracefully) {
   ClientSession session(root);
   std::vector<Tensor> outputs;
   Status s = session.Run({create_const}, &outputs);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("GPU copy from non-DMA string tensor"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "GPU copy from non-DMA string tensor"))
       << s.ToString();
 }
 
@@ -365,8 +366,9 @@ TEST(VariantOpCopyTest, CreateCopyCPUToGPUStringFailsSafely) {
   std::vector<Tensor> outputs;
   Status err = session.Run({create_op, identity}, &outputs);
   EXPECT_EQ(err.code(), errors::Code::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(err.error_message())
-                  .contains("During Variant Host->Device Copy: non-DMA-copy "
+  EXPECT_TRUE(
+      str_util::StrContains(err.error_message(),
+                            "During Variant Host->Device Copy: non-DMA-copy "
                             "attempted of tensor type: string"))
       << err.error_message();
 }
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index 06ca211c76..7055e62c0e 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include "tensorflow/core/lib/strings/str_util.h"
 
 #define EIGEN_USE_THREADS
 
@@ -130,7 +131,7 @@ TEST(VariantOpShapeRegistryTest, TestBasic) {
   Variant v = vv_early_exit;
   Status s0 = (*shape_fn)(v, &shape);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit!"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
 
   VariantValue vv_ok{false /* early_exit */};
   v = vv_ok;
@@ -229,7 +230,7 @@ TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -254,7 +255,7 @@ TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -299,7 +300,7 @@ TEST(VariantOpAddRegistryTest, TestBasicCPU) {
   Status s0 = BinaryOpVariants<CPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
@@ -325,7 +326,7 @@ TEST(VariantOpAddRegistryTest, TestBasicGPU) {
   Status s0 = BinaryOpVariants<GPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
-- 
GitLab


From 5617c9b4be759b62f3cbd29b0a58c41a43ac47a0 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 2 Apr 2018 10:51:13 -0700
Subject: [PATCH 0144/1262] Improves the documentation of control_dependencies.

PiperOrigin-RevId: 191314766
---
 tensorflow/python/framework/ops.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 7ca0b836dd..22b621e4cb 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -4489,6 +4489,22 @@ class Graph(object):
         return tf.matmul(tensor, tensor)
     ```
 
+    Also note that though execution of ops created under this scope will trigger
+    execution of the dependencies, the ops created under this scope might still
+    be pruned from a normal tensorflow graph. For example, in the following
+    snippet of code the dependencies are never executed:
+
+    ```python
+      loss = model.loss()
+      with tf.control_dependencies(dependencies):
+        loss = loss + tf.constant(1)  # note: dependencies ignored in the
+                                      # backward pass
+      return tf.gradients(loss, model.variables)
+    ```
+
+    This is because evaluating the gradient graph does not require evaluating
+    the constant(1) op created in the forward pass.
+
     Args:
       control_inputs: A list of `Operation` or `Tensor` objects which
         must be executed or computed before running the operations
-- 
GitLab


From 4118b7c8432ca770191ca22fc01fd435e4e4571e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 11:03:18 -0700
Subject: [PATCH 0145/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191316903
---
 .../common_runtime/direct_session_test.cc     |  25 ++--
 .../core/common_runtime/function_test.cc      |   9 +-
 .../function_threadpool_test.cc               |   3 +-
 tensorflow/core/common_runtime/placer.cc      |   4 +-
 tensorflow/core/common_runtime/placer_test.cc | 110 +++++++++---------
 .../process_function_library_runtime_test.cc  |   3 +-
 .../core/common_runtime/session_test.cc       |  16 +--
 .../core/common_runtime/shape_refiner_test.cc |   9 +-
 tensorflow/core/graph/graph_constructor.cc    |  15 +--
 .../core/graph/graph_constructor_test.cc      |   4 +-
 tensorflow/core/graph/graph_partition.cc      |   3 +-
 tensorflow/core/graph/graph_partition_test.cc |   5 +-
 tensorflow/core/graph/graph_test.cc           |   3 +-
 .../core/graph/quantize_training_test.cc      |   5 +-
 tensorflow/core/graph/subgraph_test.cc        |   4 +-
 tensorflow/core/graph/tensor_id.cc            |   3 +-
 tensorflow/core/graph/validate_test.cc        |   7 +-
 17 files changed, 122 insertions(+), 106 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index ee38960618..f95cecfc66 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -155,22 +156,22 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
 
     Status s = session->RunCallable(handle, {}, nullptr, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("`fetch_tensors` must be provided"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                      "`fetch_tensors` must be provided"));
 
     TF_ASSERT_OK(session->ReleaseCallable(handle));
 
     std::vector<Tensor> outputs;
     s = session->RunCallable(handle, {}, &outputs, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(
-        StringPiece(s.error_message())
-            .contains("Attempted to run callable after handle was released"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Attempted to run callable after handle was released"));
 
     s = session->RunCallable(handle + 1, {}, &outputs, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
     EXPECT_TRUE(
-        StringPiece(s.error_message()).contains("No such callable handle"));
+        str_util::StrContains(s.error_message(), "No such callable handle"));
   }
 }
 
@@ -567,7 +568,7 @@ TEST(DirectSessionTest, MultipleFeedTest) {
       {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
       &outputs);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
 TEST(DirectSessionTest, MultipleFeedTest_Callable) {
@@ -650,7 +651,7 @@ TEST(DirectSessionTest, MultipleFeedTest_Callable) {
           {first_identity->name() + ":0", second_identity->name() + ":0"}, {}),
       &handle);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
 TEST(DirectSessionTest, FetchMultipleTimes) {
@@ -845,8 +846,8 @@ TEST(DirectSessionTest, PartialRunMissingFeed) {
   s = session->PRun(handle, {{first_const->name(), value_11}},
                     {third_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 }
 
 TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
@@ -875,8 +876,8 @@ TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
   // Fetch fourth_identity without feeds.
   s = session->PRun(handle, {}, {fourth_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 
   // Feed switch_node:1 and fetch fourth_identity.
   s = session->PRun(handle, {{switch_node->name() + ":1", bool_value}},
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index d17ef4d459..61b2f0e60f 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -53,8 +54,8 @@ Status GetOpSig(const string& op, const OpDef** sig) {
   return OpRegistry::Global()->LookUpOpDef(op, sig);
 }
 
-void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+void HasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
@@ -240,7 +241,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
@@ -310,7 +311,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 6223a4e648..2d09e83d01 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -153,7 +154,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index e128b9257f..86851c2c07 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -151,7 +152,8 @@ class ColocationGraph {
       if (attr_value != nullptr && attr_value->has_list()) {
         for (const string& class_spec : attr_value->list().s()) {
           StringPiece spec(class_spec);
-          if (spec.Consume(kColocationGroupPrefixStringPiece)) {
+          if (str_util::ConsumePrefix(&spec,
+                                      kColocationGroupPrefixStringPiece)) {
             found_spec = true;
             TF_RETURN_IF_ERROR(
                 ColocateNodeToGroup(&colocation_group_root, node, spec));
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 098024d219..5ad251c892 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -262,9 +263,9 @@ class PlacerTest : public ::testing::Test {
                 ->attributes()                                          \
                 .device_type())
 
-#define EXPECT_DEVICE_CONTAINS(g, name, device_substr)                        \
-  EXPECT_TRUE(StringPiece(GetNodeByName((g), (name))->assigned_device_name()) \
-                  .contains(device_substr))
+#define EXPECT_DEVICE_CONTAINS(g, name, device_substr) \
+  EXPECT_TRUE(::tensorflow::str_util::StrContains(     \
+      GetNodeByName((g), (name))->assigned_device_name(), device_substr))
 
 // Test that a graph with no constraints will successfully assign nodes to the
 // "best available" device (i.e. prefer GPU over CPU).
@@ -488,11 +489,10 @@ TEST_F(PlacerTest, TestAssignedGpuDeviceToCpuDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
-              "does not have registered OpKernel support for TestInput"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
+      "does not have registered OpKernel support for TestInput"));
 }
 
 // Test that graphs with reference connections are correctly placed.
@@ -541,15 +541,15 @@ TEST_F(PlacerTest, TestReferenceConnection) {
   {
     Status s = ReferenceTestHelper("VariableCPU", "AssignGPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "TestAssign", "FakeGPU"));
   {
     Status s = ReferenceTestHelper("VariableGPU", "AssignCPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "AssignGPU", "FakeGPU"));
 }
@@ -760,8 +760,9 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'foo' and 'in' because no "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "Cannot colocate nodes 'foo' and 'in' because no "
                             "device type supports both of those nodes and the "
                             "other nodes colocated with them"));
 }
@@ -824,11 +825,11 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Cannot colocate nodes 'var3' and 'assign3' because no "
-                    "device type supports both of those nodes and the other "
-                    "nodes colocated with them."));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Cannot colocate nodes 'var3' and 'assign3' because no "
+      "device type supports both of those nodes and the other "
+      "nodes colocated with them."));
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -888,7 +889,7 @@ TEST_F(PlacerTest, TestEmptyDeviceSet) {
 
   Status s = Place(&g, &empty);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("No devices are registered"));
+      str_util::StrContains(s.error_message(), "No devices are registered"));
 }
 
 // Test that placement fails when the requested device forces an
@@ -913,16 +914,17 @@ TEST_F(PlacerTest, TestHeterogeneousDeviceSetFailure) {
   heterogeneous.AddDevice(cpu.get());
   Status s = Place(&g, &heterogeneous);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("colocated with a group of nodes that required "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "colocated with a group of nodes that required "
                             "incompatible device"));
 
   // The error message should contain information that indicates which
   // op types have which registered device types.
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: FakeGPU"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "VariableGPU: FakeGPU"))
       << s;
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("TestAssign: FakeGPU FakeCPU"))
+      str_util::StrContains(s.error_message(), "TestAssign: FakeGPU FakeCPU"))
       << s;
 }
 
@@ -937,7 +939,7 @@ TEST_F(PlacerTest, TestUnknownDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -952,7 +954,7 @@ TEST_F(PlacerTest, TestUnknownMergedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -969,9 +971,9 @@ TEST_F(PlacerTest, TestUnknownAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:foo' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:foo' does not match any device"));
 }
 
 // Test that placement fails when an op with no registered kernels is
@@ -986,12 +988,11 @@ TEST_F(PlacerTest, TestNoKernelsRegistered) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableNoKernels'"));
   EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No OpKernel was registered to support Op 'VariableNoKernels'"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("<no registered kernels>"));
+      str_util::StrContains(s.error_message(), "<no registered kernels>"));
 }
 
 // Test that placement fails when a kernel is registered but no known
@@ -1011,10 +1012,10 @@ TEST_F(PlacerTest, TestNoDevicesRegistered) {
 
   Status s = Place(&g, &cpu_only);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("No OpKernel was registered to support "
-                            "Op 'VariableGPU'"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("device='FakeGPU'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableGPU'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "device='FakeGPU'"));
 }
 
 // Test that placement fails when a requested device is malformed.
@@ -1028,8 +1029,8 @@ TEST_F(PlacerTest, TestMalformedDeviceSpecification) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed device specification '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Malformed device specification '/foo:bar'"));
 }
 
 // Test that placement fails when a previously-assigned device is malformed.
@@ -1045,8 +1046,8 @@ TEST_F(PlacerTest, TestMalformedAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed assigned device '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "Malformed assigned device '/foo:bar'"));
 }
 
 // Test that placement fails when a device was previously assigned to
@@ -1063,9 +1064,8 @@ TEST_F(PlacerTest, TestNonUniqueAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:a' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Assigned device '/job:a' does not match any device"));
 }
 
 // Test that ops request to be placed on non-existent devices will be relocated
@@ -1099,7 +1099,7 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakegpu:11"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakegpu:11"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1116,10 +1116,10 @@ TEST_F(PlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakecpu:0"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("no supported kernel for fakecpu devices is available"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakecpu:0"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "no supported kernel for fakecpu devices is available"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1137,9 +1137,9 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("was explicitly assigned to /job:foo/replica:17 "
-                            "but available devices"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "was explicitly assigned to /job:foo/replica:17 but available devices"));
 }
 
 TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
@@ -1205,8 +1205,8 @@ TEST_F(PlacerTest, TestUnsatisfiableConstraintWithReferenceConnections) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'var' and 'assign'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Cannot colocate nodes 'var' and 'assign'"));
 }
 
 // Test that a generator node follows its consumers (where there are several
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 2da67b084a..4fbf2abc67 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -132,7 +133,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                    });
     done2.WaitForNotification();
     EXPECT_TRUE(errors::IsNotFound(status));
-    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
 
     return Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/session_test.cc b/tensorflow/core/common_runtime/session_test.cc
index a074154450..feaf29c7bb 100644
--- a/tensorflow/core/common_runtime/session_test.cc
+++ b/tensorflow/core/common_runtime/session_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/public/session.h"
 
 #include "tensorflow/core/common_runtime/session_factory.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -31,10 +32,9 @@ TEST(SessionTest, InvalidTargetReturnsNull) {
   Session* session;
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::NOT_FOUND);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No session factory registered for the given session options"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No session factory registered for the given session options"));
 }
 
 // Register a fake session factory to test error handling paths in
@@ -44,7 +44,7 @@ class FakeSessionFactory : public SessionFactory {
   FakeSessionFactory() {}
 
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with("fake");
+    return str_util::StartsWith(options.target, "fake");
   }
 
   Session* NewSession(const SessionOptions& options) override {
@@ -68,9 +68,9 @@ TEST(SessionTest, MultipleFactoriesForTarget) {
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::INTERNAL);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("Multiple session factories"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_1"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_2"));
+      str_util::StrContains(s.error_message(), "Multiple session factories"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_1"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_2"));
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index adf5a9afff..f48638afc0 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -143,8 +144,8 @@ TEST_F(ShapeRefinerTest, BadShapes) {
   // an error.
   Status s = m.AddNode(mm.node());
   ASSERT_FALSE(s.ok());
-  ASSERT_TRUE(StringPiece(s.error_message())
-                  .contains("Dimensions must be equal, but are 1 and 2"));
+  ASSERT_TRUE(str_util::StrContains(
+      s.error_message(), "Dimensions must be equal, but are 1 and 2"));
 }
 
 TEST_F(ShapeRefinerTest, SetShape) {
@@ -1032,8 +1033,8 @@ TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
     TF_ASSERT_OK(m.AddNode(input.node()));
   }
   TF_ASSERT_OK(m.AddNode(pack.node()));
-  EXPECT_TRUE(
-      StringPiece(m.AddNode(result).error_message()).contains("but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(m.AddNode(result).error_message(),
+                                    "but is rank 2"));
 }
 
 TEST_F(ShapeRefinerTest, ConstantValueAsShape_Concat) {
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 76ee88e684..f15e2ce9fa 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/version.h"
 
@@ -73,7 +74,7 @@ class GraphConstructor {
     Options(const ImportGraphDefOptions& in)  // NOLINT(runtime/explicit)
         : allow_internal_ops(false),
           expect_device_spec(false),
-          prefix(in.prefix.empty() || StringPiece(in.prefix).ends_with("/")
+          prefix(in.prefix.empty() || str_util::EndsWith(in.prefix, "/")
                      ? in.prefix
                      : in.prefix + "/"),
           uniquify_names(in.uniquify_names),
@@ -436,7 +437,7 @@ Status GraphConstructor::BuildNodeIndex() {
     bool in_control_dependence = false;
     for (int i = 0; i < node_def.input_size(); ++i) {
       StringPiece input_name = node_def.input(i);
-      if (!input_name.empty() && input_name.starts_with("^")) {
+      if (!input_name.empty() && str_util::StartsWith(input_name, "^")) {
         in_control_dependence = true;
       } else if (in_control_dependence) {
         return errors::InvalidArgument(
@@ -484,7 +485,7 @@ Status GraphConstructor::InitFromEdges() {
       bool has_loop_back_edge = false;
       for (int i = 0; i < node_def.input_size(); ++i) {
         StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(input_name, "^")) {
           num_control_edges++;
         } else {
           TensorId id(ParseTensorName(input_name));
@@ -534,7 +535,7 @@ Status GraphConstructor::ValidateColocationConstraints(
   if (iter == node_def.attr().end()) return Status::OK();
   for (const string& c : iter->second.list().s()) {
     StringPiece s(c);
-    if (s.Consume(kColocationGroupPrefix) &&
+    if (str_util::ConsumePrefix(&s, kColocationGroupPrefix) &&
         gdef_nodes_.find(s) == gdef_nodes_.end()) {
       return errors::InvalidArgument(
           "Node '", node_def.name(),
@@ -764,7 +765,7 @@ void GraphConstructor::AddPrefixToNodeDef(
     // Skip remapped inputs (which already exist in g_ and are not being
     // imported).
     if (input_already_exists[i]) continue;
-    if (input.Consume("^")) {
+    if (str_util::ConsumePrefix(&input, "^")) {
       node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
       node_def->set_input(i, strings::StrCat(prefix_, input));
@@ -776,7 +777,7 @@ void GraphConstructor::AddPrefixToNodeDef(
         node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
-      if (v.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&v, kColocationGroupPrefix)) {
         list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
@@ -819,7 +820,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     bool updated = false;
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
-      if (val.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
         const auto& name_pair = uniquified_names_.find(val.ToString());
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 963c1dc024..c18ccf6ce4 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -156,7 +156,9 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     StringPiece loc(value[0]);
-    return loc.Consume(kColocationGroupPrefix) ? loc.ToString() : "";
+    return str_util::ConsumePrefix(&loc, kColocationGroupPrefix)
+               ? loc.ToString()
+               : "";
   }
 
   string GraphDebugString() const {
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 17a174101b..877e4f1b44 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -372,7 +373,7 @@ string ControlLoopName(const string& name) {
 
 bool IsControlLoop(const Node* node) {
   const string& name = node->name();
-  return StringPiece(name).starts_with("_cloop");
+  return str_util::StartsWith(name, "_cloop");
 }
 
 // An enter node for control flow.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 6841f29149..83b24cafe2 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -120,7 +121,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
       if (ndef.op() == "_Recv") {
         bool has_control = false;
         for (const string& input_name : ndef.input()) {
-          if (StringPiece(input_name).starts_with("^")) {
+          if (str_util::StartsWith(input_name, "^")) {
             has_control = true;
             break;
           }
@@ -128,7 +129,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
         EXPECT_TRUE(has_control);
       }
       // Must have a control loop
-      if (StringPiece(ndef.name()).starts_with("_cloop")) {
+      if (str_util::StartsWith(ndef.name(), "_cloop")) {
         if (ndef.op() == "Enter") {
           has_control_enter = true;
         }
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e2ce0ba046..c8c2b225fe 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -408,7 +409,7 @@ TEST_F(GraphTest, NewName) {
   EXPECT_NE(a1, a2);
   EXPECT_NE(a1, b1);
   EXPECT_NE(a2, b1);
-  EXPECT_TRUE(StringPiece(a1).starts_with("A")) << a1;
+  EXPECT_TRUE(str_util::StartsWith(a1, "A")) << a1;
 }
 
 TEST_F(GraphTest, IsValidNode) {
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index 2ad69dbd0c..e46f92bc24 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -215,7 +216,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
@@ -269,7 +270,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index 7219d9812f..6c014a8d44 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -312,8 +312,8 @@ TEST_F(SubgraphTest, ChainOfFools) {
   EXPECT_TRUE(HasEdge("e", 0, "_send_e_0", 0));
 }
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/core/graph/tensor_id.cc b/tensorflow/core/graph/tensor_id.cc
index 089ea5e527..8af1936d64 100644
--- a/tensorflow/core/graph/tensor_id.cc
+++ b/tensorflow/core/graph/tensor_id.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -45,7 +46,7 @@ TensorId ParseTensorName(StringPiece name) {
   if (p > base && *p == ':' && mul > 1) {
     id.first = StringPiece(base, p - base);
     id.second = index;
-  } else if (name.starts_with("^")) {
+  } else if (str_util::StartsWith(name, "^")) {
     // Control edge
     id.first = StringPiece(base + 1);
     id.second = Graph::kControlSlot;
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index cb6d107cad..d58cdc3c5b 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -60,7 +61,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -83,7 +84,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -91,7 +92,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   // Validation should still fail.
   s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 }
 
 TEST(ValidateGraphDefAgainstOpListTest, GraphWithOpOnlyInOpList) {
-- 
GitLab


From 6fa7b699dc5d09d38c2706ef794a3874bc76e979 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 11:17:56 -0700
Subject: [PATCH 0146/1262] Switched to using TensorFlow's true dilated
 convolution support now that it is implemented, and removed emulated dilated
 convolution support.

PiperOrigin-RevId: 191319505
---
 .../contrib/lite/toco/export_tensorflow.cc    | 95 ++-----------------
 .../convert_pure_conv_to_depthwise.cc         |  5 +
 2 files changed, 15 insertions(+), 85 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index e88357f7dd..5d51431005 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -357,6 +357,14 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   strides.mutable_list()->add_i(src_op.stride_height);
   strides.mutable_list()->add_i(src_op.stride_width);
   strides.mutable_list()->add_i(1);
+  if ((src_op.dilation_width_factor != 1) ||
+      (src_op.dilation_height_factor != 1)) {
+    auto& dilations = (*conv2d_op->mutable_attr())["dilations"];
+    dilations.mutable_list()->add_i(1);
+    dilations.mutable_list()->add_i(src_op.dilation_height_factor);
+    dilations.mutable_list()->add_i(src_op.dilation_width_factor);
+    dilations.mutable_list()->add_i(1);
+  }
   string padding;
   if (src_op.padding.type == PaddingType::kSame) {
     padding = "SAME";
@@ -391,84 +399,6 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   }
 }
 
-void ConvertDilatedConvOperator(const Model& model, const ConvOperator& src_op,
-                                GraphDef* tensorflow_graph) {
-  CHECK((src_op.dilation_width_factor > 1) ||
-        (src_op.dilation_height_factor > 1))
-      << "Conv operator must have height or width dilation factor > 1. "
-         "Otherwise, use regular conv op.";
-  CHECK_EQ(src_op.stride_width, 1)
-      << "Dilated AND strided convolution is unsupported";
-  CHECK_EQ(src_op.stride_height, 1)
-      << "Dilated AND strided convolution is unsupported";
-
-  // Emulate dilated convolution with a chain of SpaceToBatchND -> Conv ->
-  // BatchToSpaceND ops.
-
-  // Compute padding
-  const auto& input_array = model.GetArray(src_op.inputs[0]);
-  const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
-  int height_mod_dilation = input_shape.dims(1) % src_op.dilation_height_factor;
-  int pad_height;
-  if (height_mod_dilation) {
-    pad_height = src_op.dilation_height_factor - height_mod_dilation;
-  } else {
-    pad_height = 0;
-  }
-  int pad_width;
-  int width_mod_dilation = input_shape.dims(2) % src_op.dilation_width_factor;
-  if (width_mod_dilation) {
-    pad_width = src_op.dilation_width_factor - width_mod_dilation;
-  } else {
-    pad_width = 0;
-  }
-
-  // SpaceToBatchND op "collapses" the spatially separated elements together
-  string stb_output = src_op.outputs[0] + "/dilated_conv_SpaceToBatch";
-  auto* stb_op = tensorflow_graph->add_node();
-  stb_op->set_op("SpaceToBatchND");
-  stb_op->set_name(stb_output);
-  *stb_op->add_input() = src_op.inputs[0];
-  (*stb_op->mutable_attr())["T"].set_type(DT_FLOAT);
-  string block_shape = src_op.outputs[0] + "/dilated_conv_block_shape";
-  CreateIntTensorConst(
-      block_shape,
-      {src_op.dilation_height_factor, src_op.dilation_width_factor}, {2},
-      tensorflow_graph);
-  *stb_op->add_input() = block_shape;
-  (*stb_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
-  string stb_paddings = src_op.outputs[0] + "/dilated_conv_paddings";
-  CreateIntTensorConst(stb_paddings, {0, pad_height, pad_width, 0}, {2, 2},
-                       tensorflow_graph);
-  *stb_op->add_input() = stb_paddings;
-  (*stb_op->mutable_attr())["Tpaddings"].set_type(DT_INT32);
-
-  // Perform a regular conv on the "collapsed" elements
-  ConvOperator conv_op;
-  string conv_output = src_op.outputs[0] + "/dilated_conv_Conv2D";
-  conv_op.inputs = src_op.inputs;
-  conv_op.inputs[0] = stb_output;
-  conv_op.outputs = {conv_output};
-  conv_op.padding.type = src_op.padding.type;
-  conv_op.stride_width = src_op.stride_width;
-  conv_op.stride_height = src_op.stride_height;
-  conv_op.dilation_width_factor = 1;
-  conv_op.dilation_height_factor = 1;
-  ConvertConvOperator(model, conv_op, tensorflow_graph);
-
-  // BatchToSpaceND op restores elements to their original layout
-  auto* bts_op = tensorflow_graph->add_node();
-  bts_op->set_op("BatchToSpaceND");
-  bts_op->set_name(src_op.outputs[0]);
-  *bts_op->add_input() = conv_output;
-  (*bts_op->mutable_attr())["T"].set_type(DT_FLOAT);
-  *bts_op->add_input() = block_shape;
-  (*bts_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
-  *bts_op->add_input() = stb_paddings;
-  (*bts_op->mutable_attr())["Tcrops"].set_type(DT_INT32);
-}
-
 void ConvertDepthwiseConvOperator(const Model& model,
                                   const DepthwiseConvOperator& src_op,
                                   GraphDef* tensorflow_graph) {
@@ -1736,13 +1666,8 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   }
 
   if (src_op.type == OperatorType::kConv) {
-    const ConvOperator& conv_op = static_cast<const ConvOperator&>(src_op);
-    if ((conv_op.dilation_width_factor != 1) ||
-        (conv_op.dilation_height_factor != 1)) {
-      return ConvertDilatedConvOperator(model, conv_op, tensorflow_graph);
-    } else {
-      ConvertConvOperator(model, conv_op, tensorflow_graph);
-    }
+    ConvertConvOperator(model, static_cast<const ConvOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kDepthwiseConv) {
     ConvertDepthwiseConvOperator(
         model, static_cast<const DepthwiseConvOperator&>(src_op),
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index d38db85280..0fffab574d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -33,6 +33,11 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   if (conv_op->stride_width != conv_op->stride_height) {
     return false;
   }
+  if ((conv_op->dilation_width_factor != 1) ||
+      (conv_op->dilation_height_factor != 1)) {
+    // Depthwise conv does not support dilation
+    return false;
+  }
   auto& weights_array = model->GetArray(conv_op->inputs[1]);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
-- 
GitLab


From 5d81b72b9c1a7edd1a84c13b1dc753b310545e56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 11:35:00 -0700
Subject: [PATCH 0147/1262] [XLA] Redesign: improve error handling: - For every
 op creation method, check whether there's any existing error, if so, don't do
 anything and returns an empty op. To do this efficiently, make the
 NoteErrorOrReturn method accept a lambda, and check first_error_ before
 evaluating the lambda. - Return error instead of TF_CHECK_RET, because the
 second seems to always print ERROR logs.

PiperOrigin-RevId: 191322082
---
 .../xla/client/xla_client/xla_builder.cc      | 76 ++++++++++++++-----
 .../xla/client/xla_client/xla_builder.h       |  8 +-
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 7a9ff0c441..04091ecb11 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 
+#include <functional>
 #include <numeric>
 #include <string>
 #include <utility>
@@ -65,12 +66,17 @@ StatusOr<std::vector<Shape>> GetOperandShapes(
 }  // namespace
 
 StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op));
   return instr->shape();
 }
 
 StatusOr<Shape> XlaOp::GetShape() const {
-  TF_RET_CHECK(builder_ != nullptr);
+  if (builder_ == nullptr) {
+    return InvalidArgument(
+        "cannot GetShape for an invalid XlaOp with handle %lld", handle());
+  }
   return builder_->GetShape(*this);
 }
 
@@ -91,7 +97,22 @@ void XlaBuilder::NoteError(const Status& error) {
   }
 }
 
+XlaOp XlaBuilder::NoteErrorOrReturn(
+    const std::function<StatusOr<XlaOp>()>& op_creator) {
+  if (!first_error_.ok()) {
+    return {};
+  }
+  auto op = op_creator();
+  if (!op.ok()) {
+    NoteError(op.status());
+    return {};
+  }
+  return op.ConsumeValueOrDie();
+}
+
 StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_RET_CHECK(root_id != nullptr);
   ProgramShape program_shape;
 
@@ -197,6 +218,8 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
 StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
     const Shape& shape, const XlaOp& operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   HloInstructionProto instr;
   *instr.mutable_shape() = shape;
   for (int64 dim : broadcast_dimensions) {
@@ -207,6 +230,8 @@ StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
 
 StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
                                                  const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
 
   CHECK(ShapeUtil::IsScalar(operand_shape) ||
@@ -250,7 +275,7 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferUnaryOpShape(unop, operand_shape));
     return AddInstruction(std::move(instr), unop, {operand});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::BinaryOp(
@@ -307,7 +332,7 @@ XlaOp XlaBuilder::BinaryOp(
     }
 
     return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
@@ -345,7 +370,7 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
     }
     return AddInstruction(std::move(instr), triop,
                           {updated_lhs, updated_rhs, updated_ehs});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
@@ -364,7 +389,7 @@ XlaOp XlaBuilder::ConstantLiteral(const Literal& literal) {
     *instr.mutable_shape() = literal.shape();
     *instr.mutable_literal() = literal.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kConstant);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Call(const XlaComputation& computation,
@@ -390,7 +415,7 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
     }
 
     return AddInstruction(std::move(instr), HloOpcode::kCall, operands);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
@@ -406,7 +431,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
     instr.set_name(name);
     *instr.mutable_shape() = shape;
     return AddInstruction(std::move(instr), HloOpcode::kParameter);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Broadcast(
@@ -430,10 +455,12 @@ XlaOp XlaBuilder::Broadcast(
       dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank;
     }
     return InDimBroadcast(shape, operand, dimensions);
-  }());
+  });
 }
 
 StatusOr<XlaOp> XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   HloInstructionProto instr;
   *instr.mutable_shape() = shape;
   return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand});
@@ -477,7 +504,7 @@ XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
     instr.add_dimensions(dimension);
 
     return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
@@ -497,7 +524,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                            ? operand
                            : Transpose(operand, dimensions);
     return Reshape(shape, transposed);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
@@ -507,7 +534,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
     std::vector<int64> dimensions(shape.dimensions_size());
     std::iota(dimensions.begin(), dimensions.end(), 0);
     return Reshape(operand, dimensions, new_sizes);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Collapse(const XlaOp& operand,
@@ -535,7 +562,7 @@ XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements) {
                         ShapeInference::InferVariadicOpShape(
                             HloOpcode::kTuple, operand_shape_ptrs));
     return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
@@ -554,7 +581,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
 
     return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement,
                           {tuple_data});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
@@ -804,7 +831,7 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
       instr.add_dimensions(dim);
     }
     return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Rev(const XlaOp& operand,
@@ -1071,6 +1098,8 @@ XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
 StatusOr<XlaOp> XlaBuilder::AddInstruction(
     HloInstructionProto&& instr, HloOpcode opcode,
     tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   const int64 handle = instructions_.size();
   instr.set_id(handle);
   instr.set_opcode(HloOpcodeString(opcode));
@@ -1081,10 +1110,15 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
     instr.set_name(StrCat(instr.name(), ".", handle));
   }
   for (const auto& operand : operands) {
-    TF_RET_CHECK(operand.builder_ != nullptr);
-    TF_RET_CHECK(operand.builder_ == this)
-        << "Do not add XlaOp from builder " << operand.builder_->name()
-        << " to builder " << this->name();
+    if (operand.builder_ == nullptr) {
+      return InvalidArgument("invalid XlaOp with handle %lld",
+                             operand.handle());
+    }
+    if (operand.builder_ != this) {
+      return InvalidArgument("Do not add XlaOp from builder %s to builder %s",
+                             operand.builder_->name().c_str(),
+                             this->name().c_str());
+    }
     instr.add_operand_ids(operand.handle());
   }
 
@@ -1101,6 +1135,12 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
 
 StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
     const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  if (op.builder_ != this) {
+    return InvalidArgument("invalid XlaOp with handle %lld", op.handle());
+  }
+
   TF_RET_CHECK(op.builder_ == this);
   if (op.handle() >= instructions_.size() || op.handle() < 0) {
     return InvalidArgument("no XlaOp value %lld", op.handle());
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index f66feb93ce..f43101db34 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -809,13 +809,7 @@ class XlaBuilder {
   // * dying if die_immediately_on_error_ is true
   void NoteError(const Status& error);
 
-  XlaOp NoteErrorOrReturn(StatusOr<XlaOp>&& op) {
-    if (!op.ok()) {
-      NoteError(op.status());
-      return XlaOp();
-    }
-    return op.ConsumeValueOrDie();
-  }
+  XlaOp NoteErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
 
   // Helper method that creates an empty op and notes error.
   XlaOp UnimplementedOp();
-- 
GitLab


From 6c4095c7353666c4b75ce189e68860be1159b40a Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 2 Apr 2018 11:53:37 -0700
Subject: [PATCH 0148/1262] TFTS: Clean up the cold start SignatureDef.

Removes state where it wasn't used.

PiperOrigin-RevId: 191324834
---
 .../timeseries/python/timeseries/estimators_test.py  |  7 +++++++
 .../contrib/timeseries/python/timeseries/head.py     | 12 +++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index 51d0c0ca3f..9f161c1695 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import tempfile
 
 import numpy
+import six
 
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import estimators
@@ -127,6 +128,12 @@ class TimeSeriesRegressorTest(test.TestCase):
             session=sess)
 
         # Test cold starting
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES],
+            signatures.signature_def[
+                feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys())
         batch_numpy_times = numpy.tile(
             numpy.arange(30, dtype=numpy.int64)[None, :], (10, 1))
         batch_numpy_values = numpy.ones([10, 30, 1])
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 4cf6bbcfd4..71085f9de8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -58,6 +58,16 @@ def time_series_regression_head(model,
                                    input_statistics_generator)
 
 
+class _NoStatePredictOutput(export_lib.PredictOutput):
+
+  def as_signature_def(self, receiver_tensors):
+    no_state_receiver_tensors = {
+        key: value for key, value in receiver_tensors.items()
+        if not key.startswith(feature_keys.State.STATE_PREFIX)}
+    return super(_NoStatePredictOutput, self).as_signature_def(
+        receiver_tensors=no_state_receiver_tensors)
+
+
 class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
   """See `time_series_regression_head`."""
 
@@ -167,7 +177,7 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                 export_lib.PredictOutput(
                     state_to_dictionary(filtering_outputs.end_state)),
             feature_keys.SavedModelLabels.COLD_START_FILTER:
-                export_lib.PredictOutput(
+                _NoStatePredictOutput(
                     state_to_dictionary(cold_filtering_outputs.end_state))
         },
         # Likely unused, but it is necessary to return `predictions` to satisfy
-- 
GitLab


From 46df4a1afd50f69966e63245e7758cc0d5656c4e Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Mon, 2 Apr 2018 12:01:32 -0700
Subject: [PATCH 0149/1262] Fix #18180

tf.size() was not respecting the `out_type` argument when eager execution was
enabled.

PiperOrigin-RevId: 191326039
---
 .../python/kernel_tests/array_ops_test.py     | 48 +++++++++++--------
 tensorflow/python/ops/array_ops.py            |  5 +-
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 78bdb7eda7..5a20eebbc5 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1007,30 +1007,38 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testDenseShape(self):
-    with self.test_session():
-      t_value = [[0, 42], [24, 0]]
-      self.assertAllEqual((2, 2), array_ops.shape(t_value).eval())
-      self.assertEqual(4, array_ops.size(t_value).eval())
-      self.assertEqual(2, array_ops.rank(t_value).eval())
+    t_value = [[0, 42], [24, 0]]
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t_value)))
 
-      t = constant_op.constant(t_value)
-      self.assertAllEqual((2, 2), array_ops.shape(t).eval())
-      self.assertEqual(4, array_ops.size(t).eval())
-      self.assertEqual(2, array_ops.rank(t).eval())
+    t = constant_op.constant(t_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t)))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSparseShape(self):
-    with self.test_session():
-      sp_value = sparse_tensor.SparseTensorValue(
-          indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
-      self.assertAllEqual((2, 2), array_ops.shape(sp_value).eval())
-      self.assertEqual(4, array_ops.size(sp_value).eval())
-      self.assertEqual(2, array_ops.rank(sp_value).eval())
-
-      sp = sparse_tensor.SparseTensor.from_value(sp_value)
-      self.assertAllEqual((2, 2), array_ops.shape(sp).eval())
-      self.assertEqual(4, array_ops.size(sp).eval())
-      self.assertEqual(2, array_ops.rank(sp).eval())
+    sp_value = sparse_tensor.SparseTensorValue(
+        indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp_value)))
+
+    sp = sparse_tensor.SparseTensor.from_value(sp_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSizeDtype(self):
+    tensor = [1]
+    self.assertEqual(dtypes.int32, self.evaluate(array_ops.size(tensor)).dtype)
+    self.assertEqual(
+        dtypes.int64,
+        self.evaluate(array_ops.size(tensor, out_type=dtypes.int64)).dtype)
 
 
 @test_util.with_c_api
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 207866610b..68d446602e 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -387,7 +387,10 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
   """
   if context.executing_eagerly() and not isinstance(
       input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
-    return np.prod(ops.convert_to_tensor(input)._shape_tuple())  # pylint: disable=protected-access
+    input = ops.convert_to_tensor(input)
+    np_out_type = out_type.as_numpy_dtype
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
                           sparse_tensor.SparseTensorValue)):
-- 
GitLab


From 43f5b27f6064b64b7dbcfcae865829e3617a7112 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 2 Apr 2018 12:07:16 -0700
Subject: [PATCH 0150/1262] Switch to the TensorFlow API generation based on
 ApiDef's and tf_export decorators.

PiperOrigin-RevId: 191326767
---
 tensorflow/BUILD                              |  8 --
 tensorflow/__init__.py                        |  7 +-
 tensorflow/contrib/cmake/python_modules.txt   |  2 +
 tensorflow/contrib/cmake/tf_python.cmake      | 91 +++++++++++++++----
 tensorflow/experimental_api.py                | 38 --------
 tensorflow/python/framework/dtypes.py         |  2 +-
 tensorflow/tools/api/generator/BUILD          |  2 +
 .../tools/api/generator/create_python_api.py  | 21 ++++-
 tensorflow/tools/api/tests/BUILD              |  1 -
 .../tools/api/tests/api_compatibility_test.py | 48 +---------
 .../ci_build/windows/cpu/cmake/run_py.bat     |  6 +-
 11 files changed, 105 insertions(+), 121 deletions(-)
 delete mode 100644 tensorflow/experimental_api.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 3d5737a9d7..cfafffdd13 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -540,14 +540,6 @@ py_library(
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
-
-py_library(
-    name = "experimental_tensorflow_py",
-    srcs = ["experimental_api.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow/tools/api/tests:__subpackages__"],
     deps = [
         "//tensorflow/python",
         "//tensorflow/tools/api/generator:python_api",
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 78ad6aec19..c8683e3976 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -20,14 +20,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # pylint: disable=wildcard-import
-from tensorflow.python import *  # pylint: disable=redefined-builtin
+from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 02c456c199..d5cf42b641 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -104,6 +104,8 @@ tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
 tensorflow/tools
+tensorflow/tools/api
+tensorflow/tools/api/generator
 tensorflow/tools/graph_transforms
 tensorflow/contrib
 tensorflow/contrib/all_reduce
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index fae45ead5c..20eeded8d3 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -686,6 +686,77 @@ AddUserOps(TARGET _beam_search_ops
     DEPENDS pywrap_tensorflow_internal tf_python_ops
     DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
+else()
+  add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
+endif()
+
+
+########################################################
+# Generate API __init__.py files.
+########################################################
+
+# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
+
+set(api_init_files "")
+foreach(api_init_file ${api_init_files_list})
+    string(STRIP "${api_init_file}" api_init_file)
+    if(api_init_file)
+        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
+    endif()
+endforeach(api_init_file)
+set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
+file(WRITE "${api_init_list_file}" "${api_init_files}")
+
+# Run create_python_api.py to generate __init__.py files.
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+      # this step is running since the files aren't there yet.
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
+
+      # Re-add tensorflow/__init__.py back.
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+)
+
+add_custom_target(tf_python_api SOURCES ${api_init_files})
+add_dependencies(tf_python_api tf_python_ops)
+
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -695,6 +766,7 @@ add_dependencies(tf_python_build_pip_package
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
+    tf_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
@@ -707,25 +779,6 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  else()
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  endif()
-else()
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
-endif()
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/experimental_api.py b/tensorflow/experimental_api.py
deleted file mode 100644
index 63a8aa9cb1..0000000000
--- a/tensorflow/experimental_api.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# Bring in all of the public TensorFlow interface into this
-# module.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-# pylint: disable=wildcard-import
-from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
-# pylint: enable=wildcard-import
-
-from tensorflow.python.util.lazy_loader import LazyLoader
-contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
-del LazyLoader
-
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 0edae92fd4..a31c424263 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -345,7 +345,7 @@ tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 tf_export("uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
-tf_export("uint64").export_constant(__name__, "uint32")
+tf_export("uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
 tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 6722536358..f8063ae0fb 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -32,6 +32,7 @@ genrule(
     # api/module1/module2/__init__.py and api/module3/__init__.py.
     # keep sorted
     outs = [
+        # BEGIN GENERATED FILES
         "api/__init__.py",
         "api/app/__init__.py",
         "api/bitwise/__init__.py",
@@ -116,6 +117,7 @@ genrule(
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
         "api/user_ops/__init__.py",
+        # END GENERATED FILES
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 183c4731b8..1505dc69b9 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -195,16 +195,19 @@ def create_api_files(output_files):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
+    # Convert path separators to '/' for easier parsing below.
+    normalized_output_file = output_file.replace(os.sep, '/')
     if _API_DIR not in output_file:
       raise ValueError(
           'Output files must be in api/ directory, found %s.' % output_file)
     # Get the module name that corresponds to output_file.
     # First get module directory under _API_DIR.
     module_dir = os.path.dirname(
-        output_file[output_file.rfind(_API_DIR)+len(_API_DIR):])
+        normalized_output_file[
+            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
     # Convert / to .
     module_name = module_dir.replace('/', '.').strip('.')
-    module_name_to_file_path[module_name] = output_file
+    module_name_to_file_path[module_name] = os.path.normpath(output_file)
 
   # Create file for each expected output in genrule.
   for module, file_path in module_name_to_file_path.items():
@@ -241,6 +244,16 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
-      help='Python files that we expect this script to output.')
+      help='If a single file is passed in, then we we assume it contains a '
+      'semicolon-separated list of Python files that we expect this script to '
+      'output. If multiple files are passed in, then we assume output files '
+      'are listed directly as arguments.')
   args = parser.parse_args()
-  main(args.outputs)
+  if len(args.outputs) == 1:
+    # If we only get a single argument, then it must be a file containing
+    # list of outputs.
+    with open(args.outputs[0]) as output_list_file:
+      outputs = [line.strip() for line in output_list_file.read().split(';')]
+  else:
+    outputs = args.outputs
+  main(outputs)
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 0dc154b6d2..724b12cd47 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -23,7 +23,6 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:experimental_tensorflow_py",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:lib",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 603b2a4327..26d5bca637 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -34,7 +34,6 @@ import sys
 import unittest
 
 import tensorflow as tf
-from tensorflow import experimental_api as api
 
 from google.protobuf import text_format
 
@@ -47,8 +46,6 @@ from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 
-if hasattr(tf, 'experimental_api'):
-  del tf.experimental_api
 
 # FLAGS defined at the bottom:
 FLAGS = None
@@ -205,51 +202,12 @@ class ApiCompatibilityTest(test.TestCase):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
-    public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    traverse.traverse(tf, public_api_visitor)
-
-    proto_dict = visitor.GetProtos()
-
-    # Read all golden files.
-    expression = os.path.join(
-        resource_loader.get_root_dir_with_all_resources(),
-        _KeyToFilePath('*'))
-    golden_file_list = file_io.get_matching_files(expression)
-
-    def _ReadFileToProto(filename):
-      """Read a filename, create a protobuf from its contents."""
-      ret_val = api_objects_pb2.TFAPIObject()
-      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
-      return ret_val
-
-    golden_proto_dict = {
-        _FileNameToKey(filename): _ReadFileToProto(filename)
-        for filename in golden_file_list
-    }
-
-    # Diff them. Do not fail if called with update.
-    # If the test is run to update goldens, only report diffs but do not fail.
-    self._AssertProtoDictEquals(
-        golden_proto_dict,
-        proto_dict,
-        verbose=FLAGS.verbose_diffs,
-        update_goldens=FLAGS.update_goldens)
-
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
-  def testNewAPIBackwardsCompatibility(self):
-    # Extract all API stuff.
-    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
-
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     # TODO(annarev): Make slide_dataset available in API.
     public_api_visitor.private_map['tf'] = ['slide_dataset']
-    traverse.traverse(api, public_api_visitor)
+    traverse.traverse(tf, public_api_visitor)
 
     proto_dict = visitor.GetProtos()
 
@@ -276,9 +234,7 @@ class ApiCompatibilityTest(test.TestCase):
         golden_proto_dict,
         proto_dict,
         verbose=FLAGS.verbose_diffs,
-        update_goldens=False,
-        additional_missing_object_message=
-        'Check if tf_export decorator/call is missing for this symbol.')
+        update_goldens=FLAGS.update_goldens)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 3c3b223a00..30554a084c 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -37,9 +40,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
-- 
GitLab


From fc34c057d9d1118477b3e02870b97305c2d1af86 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 12:36:08 -0700
Subject: [PATCH 0151/1262] Fix a bug in AvgPoolGrad op cost in extracting
 input x's shape. AvgPoolGrad takes a shape tensor; hence, a value should be
 parsed from inputs(0) to extract correct shape of x.

PiperOrigin-RevId: 191330762
---
 .../grappler/costs/op_level_cost_estimator.cc | 29 ++++++-
 .../costs/op_level_cost_estimator_test.cc     | 85 ++++++++++---------
 2 files changed, 70 insertions(+), 44 deletions(-)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 0f6307cfdf..75258d0547 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -817,6 +817,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
   if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    input_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       input_shape.add_dim()->set_size(1);
     }
@@ -859,6 +860,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
   }
   if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    filter_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       filter_shape.add_dim()->set_size(1);
     }
@@ -1242,10 +1244,31 @@ Costs OpLevelCostEstimator::PredictAvgPoolGrad(
     const OpContext& op_context) const {
   bool found_unknown_shapes = false;
   const auto& op_info = op_context.op_info;
-  // x: op_info.inputs(0)
+  // x's shape: op_info.inputs(0)
   // y_grad: op_info.inputs(1)
-  ConvolutionDimensions dims = OpDimensionsFromInputs(
-      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  // Extract x_shape from op_info.inputs(0).value() or op_info.outputs(0).
+  bool shape_found = false;
+  TensorShapeProto x_shape;
+  if (op_info.inputs_size() >= 1 && op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
+    shape_found = GetTensorShapeProtoFromTensorProto(value, &x_shape);
+  }
+  if (!shape_found && op_info.outputs_size() > 0) {
+    x_shape = op_info.outputs(0).shape();
+    shape_found = true;
+  }
+  if (!shape_found) {
+    // Set the minimum shape that's feasible.
+    x_shape.Clear();
+    for (int i = 0; i < 4; ++i) {
+      x_shape.add_dim()->set_size(1);
+    }
+    found_unknown_shapes = true;
+  }
+
+  ConvolutionDimensions dims =
+      OpDimensionsFromInputs(x_shape, op_info, &found_unknown_shapes);
 
   int64 ops = 0;
   if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 56915ed821..4758bbfee7 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -217,6 +217,39 @@ std::vector<int> GetPoolingOutputSize(const std::vector<int>& input,
   return output;
 }
 
+// Helper functions for testing GetTensorShapeProtoFromTensorProto().
+void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
+                    const std::vector<int64> values, const bool tensor_content,
+                    TensorProto* tensor_proto) {
+  tensor_proto->Clear();
+  TensorProto temp_tensor_proto;
+  temp_tensor_proto.set_dtype(dtype);
+  for (const auto& x : shape) {
+    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
+  }
+  for (const auto& x : values) {
+    if (dtype == DT_INT64) {
+      temp_tensor_proto.add_int64_val(x);
+    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
+               dtype == DT_UINT8) {
+      temp_tensor_proto.add_int_val(x);
+    } else if (dtype == DT_UINT32) {
+      temp_tensor_proto.add_uint32_val(x);
+    } else if (dtype == DT_UINT64) {
+      temp_tensor_proto.add_uint64_val(x);
+    } else {
+      CHECK(false) << "Unsupported dtype: " << dtype;
+    }
+  }
+  Tensor tensor(dtype);
+  CHECK(tensor.FromProto(temp_tensor_proto));
+  if (tensor_content) {
+    tensor.AsProtoTensorContent(tensor_proto);
+  } else {
+    tensor.AsProtoField(tensor_proto);
+  }
+}
+
 OpContext DescribePoolingOp(const string& op_name, const std::vector<int>& x,
                             const std::vector<int>& ksize,
                             const std::vector<int>& strides,
@@ -233,8 +266,11 @@ OpContext DescribePoolingOp(const string& op_name, const std::vector<int>& x,
     DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
     DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_outputs());
   } else if (op_name == "AvgPoolGrad") {
-    // input: x, y_grad, output: x_grad.
-    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    // input: x's shape, y_grad, output: x_grad.
+    DescribeArbitraryRankInput({4}, DT_INT32, &op_info);
+    auto* tensor_proto = op_info.mutable_inputs(0)->mutable_value();
+    GetTensorProto(DT_INT32, {4}, {x[0], x[1], x[2], x[3]},
+                   /*tensor_content=*/false, tensor_proto);
     DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
     DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
   } else if (op_name == "MaxPoolGrad") {
@@ -510,39 +546,6 @@ TEST_F(OpLevelCostEstimatorTest, BatchMatMul) {
   EXPECT_NE(matmul_inaccurate, batch_matmul_inaccurate);
 }
 
-// Helper functions for testing GetTensorShapeProtoFromTensorProto().
-void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
-                    const std::vector<int64> values, const bool tensor_content,
-                    TensorProto* tensor_proto) {
-  tensor_proto->Clear();
-  TensorProto temp_tensor_proto;
-  temp_tensor_proto.set_dtype(dtype);
-  for (const auto& x : shape) {
-    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
-  }
-  for (const auto& x : values) {
-    if (dtype == DT_INT64) {
-      temp_tensor_proto.add_int64_val(x);
-    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
-               dtype == DT_UINT8) {
-      temp_tensor_proto.add_int_val(x);
-    } else if (dtype == DT_UINT32) {
-      temp_tensor_proto.add_uint32_val(x);
-    } else if (dtype == DT_UINT64) {
-      temp_tensor_proto.add_uint64_val(x);
-    } else {
-      CHECK(false) << "Unsupported dtype: " << dtype;
-    }
-  }
-  Tensor tensor(dtype);
-  CHECK(tensor.FromProto(temp_tensor_proto));
-  if (tensor_content) {
-    tensor.AsProtoTensorContent(tensor_proto);
-  } else {
-    tensor.AsProtoField(tensor_proto);
-  }
-}
-
 void ExpectTensorShape(const std::vector<int64>& expected,
                        const TensorShapeProto& tensor_shape_proto) {
   TensorShape tensor_shape_expected(expected);
@@ -746,25 +749,25 @@ TEST_F(OpLevelCostEstimatorTest, PredictAvgPoolGrad) {
   {
     // Typical 3xz3 window with 2x2 stride.
     auto costs = predict_avg_pool_grad(10, 20, 384, 3, 2, "SAME");
-    EXPECT_EQ(Costs::Duration(1920000), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(1305602), costs.execution_time);
     EXPECT_EQ(Costs::Duration(537600), costs.compute_time);
-    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
     EXPECT_FALSE(costs.inaccurate);
   }
   {
     // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
     auto costs = predict_avg_pool_grad(10, 20, 384, 1, 2, "SAME");
-    EXPECT_EQ(Costs::Duration(1574400), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(960002), costs.execution_time);
     EXPECT_EQ(Costs::Duration(192000), costs.compute_time);
-    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
     EXPECT_FALSE(costs.inaccurate);
   }
   {
     // 2x2 window with 3x3 stride.
     auto costs = predict_avg_pool_grad(10, 20, 384, 2, 3, "VALID");
-    EXPECT_EQ(Costs::Duration(1476480), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(862082), costs.execution_time);
     EXPECT_EQ(Costs::Duration(172416), costs.compute_time);
-    EXPECT_EQ(Costs::Duration(1304064), costs.memory_time);
+    EXPECT_EQ(Costs::Duration(689666), costs.memory_time);
     EXPECT_FALSE(costs.inaccurate);
   }
 }
-- 
GitLab


From c8064f1ac3c42951aa1593260346b75d306ffe95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 12:41:59 -0700
Subject: [PATCH 0152/1262] Rewrite Add/AddN subgraph, minimizing number of
 required broadcasts.

1) Collect to AddOpsGroup inputs of symbolically defined
   shapes, that can be broadcasted to the root shape
2) Rewrite equal shapes with AddN(s)
3) Build Add tree from aggegations of different shapes,
   minimizing the cost of broadcast

PiperOrigin-RevId: 191331566
---
 tensorflow/core/grappler/optimizers/BUILD     |   2 +
 .../optimizers/arithmetic_optimizer.cc        | 297 +++++++-----
 .../optimizers/arithmetic_optimizer.h         |   8 +-
 .../optimizers/arithmetic_optimizer_test.cc   | 441 ++++++++++++------
 .../grappler/optimizers/symbolic_shapes.h     |   4 +-
 5 files changed, 500 insertions(+), 252 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index f865d0c159..4ce3e73911 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -251,6 +251,7 @@ cc_library(
         ":constant_folding",
         ":graph_optimizer",
         ":graph_optimizer_stage",
+        ":symbolic_shapes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -260,6 +261,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index d155e0b289..882e4d9a40 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
@@ -31,8 +32,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -197,39 +200,6 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
 
 bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 
-// Shape is symbolically defined if it has a known rank, and each dimension is
-// defined, or is an unknown symbol (dim.size <= -2).
-bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape) {
-  return !shape.unknown_rank() &&
-         std::all_of(
-             shape.dim().begin(), shape.dim().end(),
-             [](const TensorShapeProto::Dim& dim) { return dim.size() != -1; });
-}
-
-bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties) {
-  return ShapeIsSymbolicallyDefined(properties.shape());
-}
-
-bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
-                             const TensorShapeProto& right) {
-  if (left.unknown_rank() || right.unknown_rank() ||
-      left.dim_size() != right.dim_size()) {
-    return false;
-  }
-  for (int i = 0; i < left.dim_size(); ++i) {
-    if (left.dim(i).size() == -1 || right.dim(i).size() == -1 ||
-        left.dim(i).size() != right.dim(i).size()) {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
-                             const OpInfo::TensorProperties& right) {
-  return ShapesSymbolicallyEqual(left.shape(), right.shape());
-}
-
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
@@ -348,17 +318,30 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 // Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
 // original inputs of absorbed nodes.
 //
-// All nodes in a Add/AddN subgraph must have symbolically equal shape. All
-// nodes must have the same device placement.
+// 1) All nodes must have the same device placement.
+//
+// 2) If All nodes in a Add/AddN subgraph have symbolically equal shape, tree is
+//    optimized to a single AddN node.
 //
-// Example:
 //                AddN_1
 //             /    |    \
-//          Add_1   z   Add_2       -> AddN(z, y, z, w, q, e)
+//          Add_1   z   Add_2       -> AddN(x, y, z, w, q, e)
 //          /  \        /  \
 //         x    y      w    Add_3
 //                          / \
 //                         q   e
+//
+// 3) If some nodes have different shape (it needs to be broadcastable to the
+//    shape of a "root), tree is optimized to AddNs for symbolically equal
+//    shapes, and a tree of Add ops, that minimize broadcasts.
+//
+//                AddN_1                                 Add
+//             /    |    \                              /  \
+//          Add_1   z   Add_2       ->               Add    w
+//          /  \        /  \                        /   \
+//         x    y      w    Add_3      AddN(x, y, q, e)  z
+//                          / \
+//                         q   e
 class AddOpsRewriteStage : public ArithmeticOptimizerStage {
  public:
   explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
@@ -379,7 +362,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     OpInfo::TensorProperties properties;
     Status has_properties = GetTensorProperties(node->name(), &properties);
     return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
-           HasAllInputsOfSymbolicallyEqualShape(*node, properties);
+           HasAllInputsOfBroadcastableShape(*node, properties);
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
@@ -387,7 +370,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     AddOpsGroup group;
     TF_RETURN_IF_ERROR(CreateAddOpsGroup(node, &group));
 
-    if (!group.absorbed_nodes.empty() && !IsRewritten(group)) {
+    if (!group.absorbed_nodes.empty()) {
       *simplified_node_name = RewriteAddOpsGroup(group);
     }
 
@@ -395,6 +378,14 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
   }
 
  private:
+  // Input name with a statically inferred shape from GraphProperties
+  struct InputAndShape {
+    InputAndShape(const string& input, const TensorShapeProto& shape)
+        : input(input), shape(shape) {}
+    string input;
+    TensorShapeProto shape;
+  };
+
   // Holds together an add ops subgraph that we want to rewrite together.
   //
   // For the graph above the AddOpsGroup will be:
@@ -406,12 +397,12 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     TensorShapeProto root_shape;
     // Add/AddN operations below the root level that were absorbed by this group
     std::vector<NodeDef*> absorbed_nodes;
-    // Inputs of absorbed nodes that will be forwarded to rewritten AddN node
-    std::vector<string> inputs;
+    // Inputs of absorbed nodes that will be forwarded to optimized AddN ops
+    std::vector<InputAndShape> inputs;
   };
 
-  // Check if all inputs have symbolically equal shapes
-  bool HasAllInputsOfSymbolicallyEqualShape(
+  // Check if all inputs can be broadcasted to the same shape
+  bool HasAllInputsOfBroadcastableShape(
       const NodeDef& node, const OpInfo::TensorProperties& properties) const {
     const AddOpsRewriteStage* self = this;
     return std::all_of(
@@ -421,7 +412,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
           Status has_input_properties =
               self->GetTensorProperties(input, &input_properties);
           return has_input_properties.ok() &&
-                 ShapesSymbolicallyEqual(properties, input_properties);
+                 ShapesBroadcastable(properties, input_properties);
         });
   }
 
@@ -467,11 +458,11 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (node->device() != group.root_node->device()) {
       return false;
     }
-    // All input shapes must be symbolically defined and equal to the node shape
+    // All input shapes must be broadcastable to the node shape
     OpInfo::TensorProperties properties;
     Status has_properties = GetTensorProperties(name, &properties);
     return has_properties.ok() &&
-           HasAllInputsOfSymbolicallyEqualShape(*node, properties);
+           HasAllInputsOfBroadcastableShape(*node, properties);
   }
 
   // Node requirements both for a root node and an absorbed node
@@ -490,18 +481,16 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (rewritten_nodes_.find(node->name()) != rewritten_nodes_.end()) {
       return false;
     }
+    // it must not be created by this stage at any of previous optimization runs
+    if (StringPiece(node->name()).contains(stage_name_)) {
+      return false;
+    }
     // should not drive or be driven by control dependency
     // TODO(ezhulenev): relax this condition for root node
     return !(IsDrivenByControlDependency(*node) ||
              DrivesControlDependency(*node));
   }
 
-  // Check that optimized group node name doesn't exists. It might happen if
-  // graph optimized multiple times without pruning between invocations.
-  bool IsRewritten(const AddOpsGroup& group) const {
-    return ctx_.node_map->NodeExists(AddOpsGroupName(group));
-  }
-
   // Create an AddOpsGroup with a root in a given node
   Status CreateAddOpsGroup(const NodeDef* root_node, AddOpsGroup* group) {
     OpInfo::TensorProperties root_node_output_properties;
@@ -513,7 +502,10 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
 
     group->absorbed_nodes.reserve(root_node->input_size());
     for (int i = 0; i < root_node->input_size(); ++i) {
-      TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(root_node->input(i), group));
+      const string& input_i = root_node->input(i);
+      if (!IsControlInput(input_i)) {
+        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
+      }
     }
 
     return Status::OK();
@@ -526,71 +518,159 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (IsAbsorbableByAddOpsGroup(input, *group)) {
       group->absorbed_nodes.push_back(node);
       for (int i = 0; i < node->input_size(); ++i) {
-        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(node->input(i), group));
+        const string& input_i = node->input(i);
+        if (!IsControlInput(input)) {
+          TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
+        }
       }
     } else {
       // If node can't be absorbed, add it to AddOpsGroup input
-      group->inputs.push_back(input);
+      OpInfo::TensorProperties properties;
+      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
+      group->inputs.emplace_back(input, properties.shape());
     }
     return Status::OK();
   }
 
-  // New node for AddOpsGroup is added to the same scope as a root_node. All
-  // absorbed nodes are stripped of their scope, and only names are used in a
-  // new node name.
-  //
-  // Example: AddOpsGroup(root="a/b/c/Add_2", absorbed=["d/Add_1", "e/Add"])
-  //          node_name="a/b/c/AddOpsGroup_Add_2_Add_1_Add
-  string AddOpsGroupName(const AddOpsGroup& group) const {
-    CHECK_NOTNULL(group.root_node);
-
-    auto root = ParseNodeScopeAndName(group.root_node->name());
+  // Rewrite an add ops group into a single AddN if all input shapes are
+  // symbolically equal. If not, create AddN for equal shapes first, and then
+  // build an Add tree, minimizing the cost of broadcasts.
+  string RewriteAddOpsGroup(const AddOpsGroup& group) {
+    // all new nodes will be placed under the scope of a root node
+    auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
+
+    auto shape_sig = [](const TensorShapeProto& shape) {
+      string name = strings::StrCat("r:", shape.dim_size(), ":d");
+      for (int i = 0; i < shape.dim_size(); ++i)
+        strings::StrAppend(&name, ":", shape.dim(i).size());
+      return name;
+    };
+
+    // Find what shapes are present in the inputs of absorbed nodes
+    std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
+    for (const auto& input : group.inputs) {
+      shape_sig_to_inputs[shape_sig(input.shape)].push_back(input);
+    }
 
-    std::vector<string> absorbed_node_names(group.absorbed_nodes.size());
-    std::transform(group.absorbed_nodes.begin(), group.absorbed_nodes.end(),
-                   absorbed_node_names.begin(),
-                   [](const NodeDef* node) { return node->name(); });
+    // Collect all the shapes from representative elements
+    std::vector<TensorShapeProto> shapes;
+    shapes.reserve(shape_sig_to_inputs.size());
+    for (const auto& el : shape_sig_to_inputs)
+      shapes.push_back(el.second[0].shape);
+
+    // If all inputs have the same shape, rewrite whole group with a single AddN
+    if (shapes.size() == 1) {
+      string node_name = OptimizedNodeName(root_scope_and_name);
+      AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
+                                        group.inputs);
+      // keep track of nodes that were created or absorbed as a part of rewrite
+      rewritten_nodes_.insert(node_name);
+      return node_name;
+    }
 
-    return OptimizedNodeName(root, absorbed_node_names);
-  }
+    // For inputs of different shapes:
+    // 1. Rewrite inputs of the same shape using AddN (leaf nodes)
+    // 2. Build a tree of Add nodes, minimizing cost of broadcast
+    std::sort(shapes.begin(), shapes.end(),
+              [](const TensorShapeProto& left, const TensorShapeProto& right) {
+                return CompareSymbolicallyShapedTensorSizes(left, right);
+              });
+
+    // optimized name for leaf AddN nodes
+    auto leaf_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Leaf_", i));
+    };
+    // optimized name for internal nodes of a tree built up from AddN leaves
+    auto internal_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Internal_", i));
+    };
+
+    // Add/AddN nodes that must be added to the tree
+    std::deque<InputAndShape> add_ops;
+
+    // Prepare leaf AddN nodes for inputs of equal shape
+    for (int i = 0; i < shapes.size(); ++i) {
+      const auto node_name = leaf_node_name(i);
+      const auto& inputs = shape_sig_to_inputs[shape_sig(shapes[i])];
+      add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node,
+                                                          node_name, inputs));
+    }
 
-  // Create a new node for a AddOpsGroup and return it's name.
-  string RewriteAddOpsGroup(const AddOpsGroup& group) {
-    CHECK_GT(group.absorbed_nodes.size(), 0)
-        << "AddOpsGroup must have non empty absorbed nodes";
+    // Build up a tree of Add ops
+    int internal_nodes = 0;
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+      string name = add_ops.empty() ? OptimizedNodeName(root_scope_and_name)
+                                    : internal_node_name(internal_nodes++);
+      InputAndShape add = AddAggregatedInputs(*group.root_node, name, lhs, rhs);
+      add_ops.push_front(add);
+    } while (add_ops.size() > 1);
+
+    InputAndShape optimized_root_node = add_ops.front();
+    return optimized_root_node.input;
+  }
+
+  // Add 'AddN' node to aggregate inputs of symbolically equal shape
+  InputAndShape AddInputsOfSymbolicallyEqualShape(
+      const NodeDef& root_node, const string& node_name,
+      const std::vector<InputAndShape>& inputs) {
+    CHECK(!inputs.empty()) << "Inputs must be non-empty";
+
+    // Do not create redundant AddN nodes
+    if (inputs.size() == 1) {
+      return inputs[0];
+    }
 
-    // name for a new node constructed from AddOpsGroup
-    string node_name = AddOpsGroupName(group);
+    // get shape from representative element
+    auto shape = inputs[0].shape;
 
     // copy attributes from a root node
-    DataType dtype = group.root_node->attr().at("T").type();
+    DataType dtype = root_node.attr().at("T").type();
 
     // add new AddN node
-    NodeDef* added_node = AddEmptyNode(node_name);
-    added_node->set_op("AddN");
-    added_node->set_device(group.root_node->device());
-    (*added_node->mutable_attr())["T"].set_type(dtype);
-    (*added_node->mutable_attr())["N"].set_i(group.inputs.size());
-
-    // all inputs of absorbed nodes are added to the new node
-    for (const string& input : group.inputs) {
-      ctx_.node_map->AddOutput(input, node_name);
-      added_node->add_input(input);
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("AddN");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+    (*node->mutable_attr())["N"].set_i(inputs.size());
+
+    for (const auto& inputAndShape : inputs) {
+      ctx_.node_map->AddOutput(inputAndShape.input, node_name);
+      node->add_input(inputAndShape.input);
     }
 
-    // Add frame dependencies that the original node might have had.
-    AddFrameControlDeps(group.root_node, {added_node}, "", {});
+    rewritten_nodes_.insert(node_name);
+    return InputAndShape(node_name, shape);
+  }
 
-    VLOG(1) << "Absorbed " << group.absorbed_nodes.size()
-            << " Add/AddN nodes from the graph";
+  // Add a single 'Add' node to sum two inputs
+  InputAndShape AddAggregatedInputs(const NodeDef& root_node,
+                                    const string& node_name,
+                                    const InputAndShape& left,
+                                    const InputAndShape& right) {
+    // copy attributes from a root node
+    DataType dtype = root_node.attr().at("T").type();
 
-    // keep track of nodes that were created or absorbed as a part of rewrite
-    rewritten_nodes_.insert(node_name);
-    for (const NodeDef* absorbed : group.absorbed_nodes) {
-      rewritten_nodes_.insert(absorbed->name());
-    }
+    // add new Add node
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("Add");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+
+    ctx_.node_map->AddOutput(left.input, node_name);
+    ctx_.node_map->AddOutput(right.input, node_name);
 
-    return node_name;
+    node->add_input(left.input);
+    node->add_input(right.input);
+
+    rewritten_nodes_.insert(node_name);
+    return InputAndShape(
+        node_name, TensorShapeProto());  // shape is not important at this point
   }
 
   // keep nodes that were added or absorbed as a part of AddOpsGroup rewrite
@@ -1686,24 +1766,33 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  optimized_graph_ = optimized_graph;
-  *optimized_graph_ = item.graph;
+  GrapplerItem optimized_item(item);
+  optimized_graph_ = &optimized_item.graph;
 
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
   node_map_.reset(new NodeMap(optimized_graph_));
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map_, &num_frames));
+
+  DedupComputations();
+
+  // Perform topological sort on the graph in order to help AddOpsRewrite to
+  // optimize larger subgraphs starting from the roots with more inputs.
+  TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
+
   // Shapes are only needed in aggressive mode.
   graph_properties_.reset(new GraphProperties(item));
   TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
 
+  // Identify loop frames
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map_, &num_frames));
+
   // Perform the optimizations.
-  DedupComputations();
   TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
 
+  optimized_graph->Swap(optimized_graph_);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 965f0e9ea2..63a7b55893 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -69,7 +69,13 @@ class ArithmeticOptimizer : public GraphOptimizer {
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
-      return ArithmeticOptimizerOptions();
+      ArithmeticOptimizerOptions options;
+      // TODO(ezhulenev): enable combine_add_to_addn by default after 1.8
+      // release cut
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.combine_add_to_addn = true;
+      }
+      return options;
     }
   };
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index ad3edc144a..ef3ed35fa6 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -156,25 +156,23 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
 
-  ArithmeticOptimizer optimizer;
-  GraphDef output;
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {});
   EXPECT_EQ(1, tensors_expected.size());
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
 
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
   EXPECT_EQ(2, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_div = output.node(1);
-  EXPECT_EQ("div", new_div.name());
-  EXPECT_EQ(2, new_div.input_size());
-  EXPECT_EQ("c1", new_div.input(0));
-  EXPECT_EQ("c1", new_div.input(1));
+
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(2, new_div->input_size());
+  EXPECT_EQ("c1", new_div->input(0));
+  EXPECT_EQ("c1", new_div->input(1));
 
   auto tensors = EvaluateNodes(output, item.fetch, {});
   EXPECT_EQ(1, tensors.size());
@@ -198,20 +196,18 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_div = output.node(3);
-  EXPECT_EQ(4, new_div.input_size());
-  EXPECT_EQ("check1", new_div.input(0));
-  EXPECT_EQ("check1", new_div.input(1));
-  EXPECT_EQ("^assert1", new_div.input(2));
-  EXPECT_EQ("^assert1", new_div.input(3));
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(4, new_div->input_size());
+  EXPECT_EQ("check1", new_div->input(0));
+  EXPECT_EQ("check1", new_div->input(1));
+  EXPECT_EQ("^assert1", new_div->input(2));
+  EXPECT_EQ("^assert1", new_div->input(3));
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
@@ -227,28 +223,24 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(4, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_c2 = output.node(1);
-  EXPECT_EQ("c2", new_c2.name());
-  const NodeDef& new_mul1 = output.node(2);
-  EXPECT_EQ("mul1", new_mul1.name());
-  EXPECT_EQ(2, new_mul1.input_size());
-  EXPECT_EQ("c1", new_mul1.input(0));
-  EXPECT_EQ("c2", new_mul1.input(1));
-  const NodeDef& new_div1 = output.node(3);
-  EXPECT_EQ("div1", new_div1.name());
-  EXPECT_EQ(2, new_div1.input_size());
-  EXPECT_EQ("mul1", new_div1.input(0));
-  EXPECT_EQ("mul1", new_div1.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+  const NodeDef* new_c2 = node_map.GetNode("c2");
+  ASSERT_NE(new_c2, nullptr);
+  const NodeDef* new_mul1 = node_map.GetNode("mul1");
+  ASSERT_NE(new_mul1, nullptr);
+  EXPECT_EQ(2, new_mul1->input_size());
+  EXPECT_EQ("c1", new_mul1->input(0));
+  EXPECT_EQ("c2", new_mul1->input(1));
+  const NodeDef* new_div1 = node_map.GetNode("div1");
+  ASSERT_NE(new_div1, nullptr);
+  EXPECT_EQ(2, new_div1->input_size());
+  EXPECT_EQ("mul1", new_div1->input(0));
+  EXPECT_EQ("mul1", new_div1->input(1));
 }
 
 TEST_F(ArithmeticOptimizerTest, MulToSquare) {
@@ -364,26 +356,25 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_const = output.node(3);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(4);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  const NodeDef& new_id = output.node(2);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
@@ -398,27 +389,26 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(6, output.node_size());
-  const NodeDef& new_const = output.node(4);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(5);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  EXPECT_EQ("^y", new_mul.input(2));
-  const NodeDef& new_id = output.node(3);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+  EXPECT_EQ("^y", new_mul->input(2));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
@@ -458,25 +448,25 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ(17, output.node_size());
 
   const NodeDef* id_node = node_map.GetNode("id");
-  ASSERT_TRUE(id_node != nullptr);
+  ASSERT_NE(id_node, nullptr);
   EXPECT_EQ(1, id_node->input_size());
   EXPECT_EQ(HoistMulName("Add_6"), id_node->input(0));
 
   const NodeDef* mul_node = node_map.GetNode(HoistMulName("Add_6"));
-  ASSERT_TRUE(mul_node != nullptr);
+  ASSERT_NE(mul_node, nullptr);
   EXPECT_EQ(2, mul_node->input_size());
   EXPECT_EQ("Placeholder", mul_node->input(0));
   EXPECT_EQ(HoistAddName("Add_6"), mul_node->input(1));
 
   const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6"));
-  ASSERT_TRUE(add_6_node != nullptr);
+  ASSERT_NE(add_6_node, nullptr);
   EXPECT_EQ(3, add_6_node->input_size());
   EXPECT_EQ(HoistAddName("Add_4"), add_6_node->input(0));
   EXPECT_EQ(HoistAddName("Add_5"), add_6_node->input(1));
   EXPECT_EQ("^Placeholder", add_6_node->input(2));
 
   const NodeDef* add_4_node = node_map.GetNode(HoistAddName("Add_4"));
-  ASSERT_TRUE(add_4_node != nullptr);
+  ASSERT_NE(add_4_node, nullptr);
   EXPECT_EQ("Add", add_4_node->op());
   EXPECT_EQ(3, add_4_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_4_node->input(0));
@@ -484,7 +474,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ("^Placeholder", add_4_node->input(2));
 
   const NodeDef* add_5_node = node_map.GetNode(HoistAddName("Add_5"));
-  ASSERT_TRUE(add_5_node != nullptr);
+  ASSERT_NE(add_5_node, nullptr);
   EXPECT_EQ("Add", add_5_node->op());
   EXPECT_EQ(3, add_5_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_5_node->input(0));
@@ -492,14 +482,14 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ("^Placeholder", add_5_node->input(2));
 
   const NodeDef* add_const_node = node_map.GetNode(OptimizedName("Add_const"));
-  ASSERT_TRUE(add_const_node != nullptr);
+  ASSERT_NE(add_const_node, nullptr);
   EXPECT_EQ("Const", add_const_node->op());
   EXPECT_EQ(1, add_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_const_node->input(0));
 
   const NodeDef* add_1_const_node =
       node_map.GetNode(OptimizedName("Add_1_const"));
-  ASSERT_TRUE(add_1_const_node != nullptr);
+  ASSERT_NE(add_1_const_node, nullptr);
   EXPECT_EQ("Const", add_1_const_node->op());
   EXPECT_EQ(1, add_1_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_1_const_node->input(0));
@@ -550,17 +540,17 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
         EXPECT_EQ(9, output.node_size());
 
         const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
-        ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+        ASSERT_NE(new_add_node, nullptr) << "Hoisted Add node not found";
         EXPECT_EQ("y1", new_add_node->input(0));
         EXPECT_EQ("y2", new_add_node->input(1));
 
         const NodeDef* new_mul_node = node_map.GetNode(HoistMulName("add"));
-        ASSERT_TRUE(new_mul_node != nullptr) << "Hoisted Mul node not found";
+        ASSERT_NE(new_mul_node, nullptr) << "Hoisted Mul node not found";
         EXPECT_EQ("x", new_mul_node->input(0));
         EXPECT_EQ(new_add_node->name(), new_mul_node->input(1));
 
         const NodeDef* id_node = node_map.GetNode("id");
-        ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+        ASSERT_NE(id_node, nullptr) << "Id node not found";
         EXPECT_EQ("id", id_node->name());
         EXPECT_EQ(HoistMulName("add"), id_node->input(0));
       }
@@ -581,18 +571,17 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("trans_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* trans_fused_node =
+      node_map.GetNode(OptimizedName("trans_fused"));
+  ASSERT_NE(trans_fused_node, nullptr);
+  EXPECT_EQ("ConjugateTranspose", trans_fused_node->op());
+  EXPECT_EQ("z", trans_fused_node->input(0));
+  EXPECT_EQ("perm", trans_fused_node->input(1));
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
@@ -609,14 +598,16 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conjugate_trans_fused"), output.node(6).name());
-  EXPECT_EQ("Transpose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conjugate_trans_fused_node =
+      node_map.GetNode(OptimizedName("conjugate_trans_fused"));
+  EXPECT_EQ("Transpose", conjugate_trans_fused_node->op());
+  EXPECT_EQ("z", conjugate_trans_fused_node->input(0));
+  EXPECT_EQ("perm", conjugate_trans_fused_node->input(1));
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
@@ -632,18 +623,16 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conj_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conj_fused_node =
+      node_map.GetNode(OptimizedName("conj_fused"));
+  EXPECT_EQ("ConjugateTranspose", conj_fused_node->op());
+  EXPECT_EQ("z", conj_fused_node->input(0));
+  EXPECT_EQ("perm", conj_fused_node->input(1));
 }
 
 TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
@@ -668,23 +657,22 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
 
     ArithmeticOptimizer optimizer;
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
-    // Run the optimizer twice to make sure the rewrite is idempotent.
-    item.graph.Swap(&output);
-    status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
+    OptimizeTwice(&optimizer, &item, &output);
+    NodeMap node_map(&output);
 
     EXPECT_EQ(7, output.node_size());
-    EXPECT_EQ(OptimizedName("matmul_fused"), output.node(6).name());
-    EXPECT_EQ("a", output.node(6).input(0));
-    EXPECT_EQ("b", output.node(6).input(1));
+
+    const NodeDef* matmul_fused_node =
+        node_map.GetNode(OptimizedName("matmul_fused"));
+    ASSERT_NE(matmul_fused_node, nullptr);
+    EXPECT_EQ("a", matmul_fused_node->input(0));
+    EXPECT_EQ("b", matmul_fused_node->input(1));
     if (matmul_type == "BatchMatMul") {
-      EXPECT_TRUE(output.node(6).attr().at("adj_x").b());
-      EXPECT_TRUE(output.node(6).attr().at("adj_y").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_x").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_y").b());
     } else {
-      EXPECT_TRUE(output.node(6).attr().at("transpose_a").b());
-      EXPECT_TRUE(output.node(6).attr().at("transpose_b").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_a").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_b").b());
     }
   }
 }
@@ -1322,8 +1310,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
 
   // check add tree was replaced with AddN
   const NodeDef* collapsed_add =
-      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_add != nullptr);
+      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
 
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(3, collapsed_add->input_size());
@@ -1333,7 +1321,7 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
 
   // check output was re-wired to new node
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
-  ASSERT_TRUE(updated_outputs != nullptr);
+  ASSERT_NE(updated_outputs, nullptr);
 
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
 }
@@ -1381,8 +1369,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
 
   // check left Add subtree replaced with AddN
   const NodeDef* collapsed_left =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_left != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_left, nullptr);
 
   EXPECT_EQ("AddN", collapsed_left->op());
   EXPECT_EQ(3, collapsed_left->input_size());
@@ -1392,8 +1380,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
 
   // check right Add subtree replaced with AddN
   const NodeDef* collapsed_right =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz_Add_xy");
-  ASSERT_TRUE(collapsed_right != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz");
+  ASSERT_NE(collapsed_right, nullptr);
 
   EXPECT_EQ("AddN", collapsed_right->op());
   EXPECT_EQ(3, collapsed_right->input_size());
@@ -1403,7 +1391,7 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
 
   // check that Mul inputs re-wired to new Nodes
   const NodeDef* updated_mul = node_map.GetNode("Mul");
-  ASSERT_TRUE(updated_mul != nullptr);
+  ASSERT_NE(updated_mul, nullptr);
 
   EXPECT_EQ("Mul", updated_mul->op());
   EXPECT_EQ(2, updated_mul->input_size());
@@ -1444,9 +1432,9 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
   NodeMap node_map(&output);
 
   // check Add tree replaced with AddN
-  const NodeDef* collapsed_add = node_map.GetNode(
-      "ArithmeticOptimizer/AddOpsRewrite_Add_all_Add_ab_Add_bc");
-  ASSERT_TRUE(collapsed_add != nullptr);
+  const NodeDef* collapsed_add =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_all");
+  ASSERT_NE(collapsed_add, nullptr);
 
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(4, collapsed_add->input_size());
@@ -1496,8 +1484,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
 
   // check add tree was replaced with AddN
   const NodeDef* collapsed_add =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_add != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(3, collapsed_add->input_size());
   EXPECT_EQ("a", collapsed_add->input(0));
@@ -1506,10 +1494,173 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
 
   // check output was re-wired to new node
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
-  ASSERT_TRUE(updated_outputs != nullptr);
+  ASSERT_NE(updated_outputs, nullptr);
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
 }
 
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32, 32, 32}, DT_FLOAT);
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto x = ops::Variable(s.WithOpName("x"), {32}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {32, 32}, DT_FLOAT);
+  auto z = ops::Variable(s.WithOpName("z"), {32, 32, 32}, DT_FLOAT);
+  auto add_xy = ops::Add(s.WithOpName("Add_xy"), x, y);
+  auto add_xyz = ops::Add(s.WithOpName("Add_xyz"), add_xy, z);
+
+  auto add_all = ops::Add(s.WithOpName("AddAll"), add_abc, add_xyz);
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_all);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //  1) [a, x], [b, y], [c, z] - aggregate same shapes first
+  //  2) Build an aggregation tree minimizing cost of broadcast
+  //
+  //         +                              +
+  //      /     \                       /       \
+  //     +       +                     +       AddN(c, z)
+  //    / \     / \                 /     \
+  //   +   c   x   + -->    AddN(a, x)  AddN(b, y)
+  //  / \         / \
+  // a   b       y   z
+  EXPECT_EQ(12, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_AddAll";
+  string outer_0_add_name =
+      "ArithmeticOptimizer/AddOpsRewrite_Internal_0_AddAll";
+  string inner_0_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_AddAll";
+  string inner_1_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_1_AddAll";
+  string inner_2_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_2_AddAll";
+
+  // Add [a, x] first
+  const NodeDef* add_ax_node = node_map.GetNode(inner_0_add_name);
+  ASSERT_NE(add_ax_node, nullptr);
+  EXPECT_EQ("AddN", add_ax_node->op());
+  EXPECT_EQ(2, add_ax_node->input_size());
+  EXPECT_EQ("a", add_ax_node->input(0));
+  EXPECT_EQ("x", add_ax_node->input(1));
+
+  // Then add [b, y]
+  const NodeDef* add_by_node = node_map.GetNode(inner_1_add_name);
+  ASSERT_NE(add_by_node, nullptr);
+  EXPECT_EQ("AddN", add_by_node->op());
+  EXPECT_EQ(2, add_by_node->input_size());
+  EXPECT_EQ("b", add_by_node->input(0));
+  EXPECT_EQ("y", add_by_node->input(1));
+
+  // Then add [c, z]
+  const NodeDef* add_cz_node = node_map.GetNode(inner_2_add_name);
+  ASSERT_NE(add_cz_node, nullptr);
+  EXPECT_EQ("AddN", add_cz_node->op());
+  EXPECT_EQ(2, add_cz_node->input_size());
+  EXPECT_EQ("c", add_cz_node->input(0));
+  EXPECT_EQ("z", add_cz_node->input(1));
+
+  // Then add results together starting from smaller shapes [a, x] + [b, y]
+  const NodeDef* outer_0_node = node_map.GetNode(outer_0_add_name);
+  ASSERT_NE(outer_0_node, nullptr);
+  EXPECT_EQ("Add", outer_0_node->op());
+  EXPECT_EQ(2, outer_0_node->input_size());
+  EXPECT_EQ(inner_0_add_name, outer_0_node->input(0));
+  EXPECT_EQ(inner_1_add_name, outer_0_node->input(1));
+
+  // And finally top level Add node
+  const NodeDef* outer_node = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_node, nullptr);
+  EXPECT_EQ("Add", outer_node->op());
+  EXPECT_EQ(2, outer_node->input_size());
+  EXPECT_EQ(outer_0_add_name, outer_node->input(0));
+  EXPECT_EQ(inner_2_add_name, outer_node->input(1));
+
+  // And outputs reading new top level Add node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // We have a small input with one unknown dimension
+  auto small = ops::Variable(s.WithOpName("small"), {-1, 1, 1}, DT_FLOAT);
+
+  // And second input which is larger, but has the same unknown dimension
+  // device spec prevents this node from rewriting
+  auto d = "/job:do_not_rewrite_me";
+  auto v = ops::Variable(s.WithOpName("v"), {1, 32, 32}, DT_FLOAT);
+  auto large = ops::Add(s.WithOpName("large").WithDevice(d), small, v);
+
+  // [a, c] have {?, 1, 1} shape, [b] has {?, 32, 32}
+  auto a = ops::Sqrt(s.WithOpName("a"), small);
+  auto b = ops::Square(s.WithOpName("b"), large);
+  auto c = ops::Round(s.WithOpName("c"), small);
+
+  // [add_ab, add_abc] shape must be inferred from inputs
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: it's much cheaper to add small
+  // tensors, and do the broadcast just once
+  //
+  //     +                  +
+  //    / \                / \
+  //   +   c      -->     +   b
+  //  / \                / \
+  // a   b              a   c
+  EXPECT_EQ(9, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_Add_abc";
+  string inner_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_Add_abc";
+
+  // outer Add node
+  const NodeDef* outer_add = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_add, nullptr);
+  EXPECT_EQ("Add", outer_add->op());
+  EXPECT_EQ(inner_add_name, outer_add->input(0));
+  EXPECT_EQ("b", outer_add->input(1));
+
+  // inner AddN node
+  const NodeDef* inner_add = node_map.GetNode(inner_add_name);
+  ASSERT_NE(inner_add, nullptr);
+  EXPECT_EQ(2, inner_add->input_size());
+  EXPECT_EQ("a", inner_add->input(0));
+  EXPECT_EQ("c", inner_add->input(1));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
 TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto x = ops::Variable(s.WithOpName("x"), {2, 2}, DT_FLOAT);
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.h b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
index a9dcf44e23..eb79bab314 100644
--- a/tensorflow/core/grappler/optimizers/symbolic_shapes.h
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
@@ -31,8 +31,8 @@ bool IsUnknown(const TensorShapeProto::Dim& dim);
 bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape);
 bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties);
 
-// Shapes are symbolically equal, if they have the same rank, they are
-// they are known or symbolically defined, and have matching dimensions.
+// Shapes are symbolically equal, if they have the same rank, they are known or
+// symbolically defined, and have matching dimensions.
 bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
                              const TensorShapeProto& right);
 bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
-- 
GitLab


From 5531482360f803030564d768752aa38800b39636 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Mon, 2 Apr 2018 12:58:52 -0700
Subject: [PATCH 0153/1262] eager: Tweak error message.

Motivated by
https://stackoverflow.com/questions/49616532/a-tensorflow-eager-gpu-error/49617069

PiperOrigin-RevId: 191334050
---
 tensorflow/c/eager/c_api.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index bb1492fca2..c96a38dec3 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -496,9 +496,11 @@ tensorflow::Status ValidateInputTypeAndPlacement(
               expected_device->name(), " but is actually on ",
               actual_device->name(), " (operation running on ",
               op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu(),"
-              " or transparently copied by using tfe.enable_eager_execution("
-              "tfe.DEVICE_PLACEMENT_SILENT). Copying tensors between devices"
+              " Tensors can be copied explicitly using .gpu() or .cpu() "
+              "methods,"
+              " or transparently copied by using tf.enable_eager_execution("
+              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+              "between devices"
               " may slow down your model");
         case tensorflow::DEVICE_PLACEMENT_WARN:
           LOG(WARNING) << "before computing " << op->name << " input #" << i
-- 
GitLab


From dbb11a8593570ad4d7af96da9a7e314f0209844c Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 2 Apr 2018 13:17:14 -0700
Subject: [PATCH 0154/1262] Reduce overhead for eager ops

- Call _context_handle in the fastpath. Fall back to slow path if it is not
  initialized.
  A better fix would be to not initialize handle and devices lazily (and not
  have to pay that function call in the slow path either), but that
  seems to break all GPU/TPU tests. I'm not as yet really familiar with how
  devices are recognized, but I'd be happy to hear any ideas you may have to
  fix this.
- context.context() is monkey patched to remove the "is None" check once we
  know the context is correctly initialized. Ideally we would be able to remove
  this function call as well.
- Maintain is_eager instead of doing the comparison every time. Also, in the
  fastpath, inline the check directly instead of paying the function call cost.
- Inline _eager_context.device_name instead of get the device_name property to
  not pay the function call cost

gen_array_ops.identity Old: 216706.923837 examples/sec (4.61452722549)
gen_array_ops.identity New: 290819.129714 examples/sec (3.43856334686)

PiperOrigin-RevId: 191336857
---
 tensorflow/python/eager/context.py            | 25 +++++++++++--------
 .../python/eager/python_eager_op_gen.cc       |  8 +++---
 tensorflow/python/eager/pywrap_tfe_src.cc     |  9 +++++++
 tensorflow/python/framework/ops.py            |  4 +++
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 6ad9e0d88f..99ec895b54 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -86,6 +86,7 @@ class _EagerContext(threading.local):
     self.device_spec = pydev.DeviceSpec.from_string("")
     self.device_name = self.device_spec.to_string()
     self.mode = _default_mode
+    self.is_eager = _default_mode == EAGER_MODE
     self.scope_name = ""
     self.recording_summaries = False
     self.summary_writer_resource = None
@@ -283,9 +284,12 @@ class Context(object):
 
   @tf_contextlib.contextmanager
   def _mode(self, mode):
+    """A context manager to allow setting the mode to EAGER/GRAPH."""
     ctx = self._eager_context
     old_mode = ctx.mode
+    old_is_eager = ctx.is_eager
     ctx.mode = mode
+    ctx.is_eager = mode == EAGER_MODE
     if mode == EAGER_MODE:
       # Entering graph mode does not provide us with sufficient information to
       # record a context switch; graph-based context switches are only logged
@@ -294,13 +298,14 @@ class Context(object):
     try:
       yield
     finally:
+      ctx.is_eager = old_is_eager
       ctx.mode = old_mode
       if mode == EAGER_MODE:
         self.context_switches.pop()
 
   def executing_eagerly(self):
     """Returns True if current thread has eager executing enabled."""
-    return self._eager_context.mode == EAGER_MODE
+    return self._eager_context.is_eager
 
   def scalar_cache(self):
     """Per-device cache for scalars."""
@@ -508,23 +513,19 @@ class Context(object):
     To retrieve the accumulated metadata call context.export_run_metadata()
     and to stop tracing call context.disable_run_metadata().
     """
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
 
   @tf_contextlib.contextmanager
   def device_policy(self, policy):
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-        self._context_handle)
+    handle = self._handle
+    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(handle)
     pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-        self._handle, policy)
+        handle, policy)
     try:
       yield
     finally:
       pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-          self._handle, old)
+          handle, old)
 
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
@@ -575,6 +576,10 @@ def context():
   return _context
 
 
+def context_safe():
+  return _context
+
+
 # TODO(agarwal): remove this.
 def get_default_context():
   """Same as context."""
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index c2ce8efd7f..06185904e7 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -367,7 +367,7 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   // Handle graph-mode case
   strings::StrAppend(&result_,
                      "  _ctx = _context.context()\n"
-                     "  if not _ctx.executing_eagerly():\n",
+                     "  if not _ctx._eager_context.is_eager:\n",
                      function_setup,
                      "    _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
@@ -712,9 +712,9 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
 }
 
 void GenEagerPythonOp::AddEagerFastPathExecute() {
-  string fastpath_execute_params =
-      strings::StrCat("_ctx._handle, _ctx.device_name, \"", op_def_.name(),
-                      "\", ", "name, _ctx._post_execution_callbacks");
+  string fastpath_execute_params = strings::StrCat(
+      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
+      op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
   string fallback_params;
 
   for (int i = 0; i < api_def_.in_arg_size(); i++) {
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 8a398f6447..d99bd0b0ff 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1844,6 +1844,15 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
   op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+
+  if (op_exec_info.ctx == nullptr) {
+    // The context hasn't been initialized. It will be in the slow path.
+    RaiseFallbackException(
+        "This function does not handle the case of the path where "
+        "all inputs are not already EagerTensors.");
+    return nullptr;
+  }
+
   op_exec_info.device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
   op_exec_info.op_name = PyTuple_GET_ITEM(args, 2);
   op_exec_info.op_def = GetOpDef(op_exec_info.op_name);
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 22b621e4cb..c0baeb98ac 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -5343,6 +5343,10 @@ def enable_eager_execution(config=None, device_policy=None,
     raise ValueError(
         "tf.enable_eager_execution must be called at program startup.")
 
+  # Monkey patch to get rid of an unnecessary conditional since the context is
+  # now initialized.
+  context.context = context.context_safe
+
 
 def eager_run(main=None, argv=None):
   """Runs the program with an optional main function and argv list.
-- 
GitLab


From 2f64599f850f1560e4c6ef2c5869156f39c2ad44 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 13:41:26 -0700
Subject: [PATCH 0155/1262] Verify that a %send and a %recv on the same
 channel, don't end up landing on the same device. Also verify that
 send/send-done and recv/recv-done are on the same device/module.

PiperOrigin-RevId: 191340724
---
 .../xla/service/hlo_module_group_metadata.cc  | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index fa5dcb0b36..54c34ce116 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -313,6 +313,27 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() {
     if (!ShapeUtil::Compatible(send_shape, recv_shape)) {
       return FailedPrecondition("send/recv shapes do not match");
     }
+    const HloModule* send_module = channel.send->parent()->parent();
+    const HloModule* send_done_module = channel.send_done->parent()->parent();
+    if (send_module != send_done_module) {
+      return FailedPrecondition(
+          "send and send-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(send_module), GetModuleId(send_done_module));
+    }
+    const HloModule* recv_module = channel.recv->parent()->parent();
+    const HloModule* recv_done_module = channel.recv_done->parent()->parent();
+    if (recv_module != recv_done_module) {
+      return FailedPrecondition(
+          "recv and recv-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(recv_module), GetModuleId(recv_done_module));
+    }
+    if (send_module == recv_module) {
+      return FailedPrecondition(
+          "send and recv (channel=%lld) must be on different devices: %lld",
+          channel.id, GetModuleId(send_module));
+    }
   }
 
   // Check if channel instructions are used only in allowed computations.
-- 
GitLab


From cf7d4f0ecfbdd7002d3c89a7da75d534b1c7d806 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 2 Apr 2018 13:53:52 -0700
Subject: [PATCH 0156/1262] Don't bypass reshape nodes that anchor control
 dependencies

PiperOrigin-RevId: 191342646
---
 .../grappler/optimizers/arithmetic_optimizer.cc     | 13 ++++++-------
 tensorflow/core/grappler/utils.cc                   |  8 ++++++++
 tensorflow/core/grappler/utils.h                    |  3 +++
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 882e4d9a40..6e27259998 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1344,19 +1344,18 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     int output_pos = 0;
     string input_node_name = ParseNodeName(node->input(0), &output_pos);
     const NodeDef* input = node_map_->GetNode(input_node_name);
-    if (input->op() == "Reshape") {
+    if (input->op() == "Reshape" && !HasControlInputs(*input)) {
       reshape->set_input(0, input->input(0));
       node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
       nodes_to_simplify->PushBack(reshape);
       return reshape->name();
     }
 
-    // If the reshape is a no-op, forward its input to its consumers. This is
-    // considered aggressive, because users may state that the placeholder
-    // outputs tensors of shape [M, N] while feeding it with tensors of shape
-    // [M*N] (or worse). The reshape nodes are then necessary to update the
-    // tensor metadata to the required shape.
-    if (ReshapeIsIdentity(*reshape, *input, output_pos, *graph_properties_)) {
+    // If the reshape is a no-op, forward its input to its consumers, unless it
+    // anchors a control dependency since we want to make sure that control
+    // dependency is triggered.
+    if (ReshapeIsIdentity(*reshape, *input, output_pos, *graph_properties_) &&
+        !HasControlInputs(*reshape)) {
       return reshape->input(0);
     }
   }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 86a6d5000d..5893f286ed 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -255,6 +255,14 @@ int NumOutputs(const NodeDef& node, GraphDef* graph) {
   return num_outputs;
 }
 
+bool HasControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  if (num_inputs > 0 && IsControlInput(node.input(num_inputs - 1))) {
+    return true;
+  }
+  return false;
+}
+
 int NumNonControlInputs(const NodeDef& node) {
   int num_inputs = node.input_size();
   for (const string& input : node.input()) {
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 7aa31939f5..11555d712a 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -138,6 +138,9 @@ string AsControlDependency(const string& node);
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
 
+// Returns true iff the node has at least one control input.
+bool HasControlInputs(const NodeDef& node);
+
 // Number of connected non-control inputs.
 int NumNonControlInputs(const NodeDef& node);
 
-- 
GitLab


From e4c5a755967a6b5442d164e43f03f0672610c998 Mon Sep 17 00:00:00 2001
From: Patrick Nguyen <drpng@google.com>
Date: Mon, 2 Apr 2018 14:03:53 -0700
Subject: [PATCH 0157/1262] Export the rest of If, While, and For.

We keep _If and _While. This moves the tests and python generators.
The operators are not part of the public tensorflow API.

PiperOrigin-RevId: 191344237
---
 .../core/api_def/base_api/api_def_For.pbtxt   |  29 ++
 .../core/api_def/base_api/api_def_If.pbtxt    |  40 +++
 .../core/api_def/base_api/api_def_While.pbtxt |  33 +++
 .../core/api_def/python_api/api_def_For.pbtxt |   1 +
 .../core/api_def/python_api/api_def_If.pbtxt  |   1 +
 .../api_def/python_api/api_def_While.pbtxt    |   1 +
 tensorflow/core/kernels/functional_ops.cc     | 189 ++++++++++--
 tensorflow/core/ops/functional_ops.cc         |  41 ++-
 .../kernel_tests/functional_ops_test.py       | 272 ++++++++++++++++++
 tensorflow/python/ops/functional_ops.py       | 260 ++++++++++++++++-
 10 files changed, 838 insertions(+), 29 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_For.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_If.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_While.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_For.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_If.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_While.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_For.pbtxt b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
new file mode 100644
index 0000000000..a7cd8e1a26
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "For"
+  in_arg { name: "start" description: "The lower bound. An int32" }
+  in_arg { name: "limit" description: "The upper bound. An int32" }
+  in_arg { name: "delta" description: "The increment. An int32" }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "A list of dtypes." }
+  attr {
+    name: "body"
+    description: <<END
+    A function that takes a list of tensors (int32, T) and returns another
+    list of tensors (T).
+END
+  }
+  summary: <<END
+  ```python
+   output = input;
+   for i in range(start, limit, delta)
+     output = body(i, output);
+  ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_If.pbtxt b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
new file mode 100644
index 0000000000..7ba5a3f37e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "If"
+  in_arg { name: "cond"  description: "The predicate." }
+  in_arg {
+    name: "cond"
+    description: <<END
+      A Tensor. If the tensor is a scalar of non-boolean type, the
+      scalar is converted to a boolean according to the
+      following rule: if the scalar is a numerical value, non-zero means
+      `True` and zero means False; if the scalar is a string, non-empty
+      means `True` and empty means `False`. If the tensor is not a scalar,
+      being empty means False and being non-empty means True.
+END
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "then_branch"
+    description: <<END
+      A function that takes 'inputs' and returns a list of tensors, whose
+      types are the same as what else_branch returns.
+END
+  }
+  attr {
+    name: "else_branch"
+    description: <<END
+    A function that takes 'inputs' and returns a list of tensors, whose
+    types are the same as what then_branch returns.
+END
+  }
+  summary: "output = cond ? then_branch(input) : else_branch(input)"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_While.pbtxt b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
new file mode 100644
index 0000000000..95a19c6dff
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "While"
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "dtype in use." }
+  attr {
+    name: "cond"
+    description: <<END
+      A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+END
+  }
+  attr {
+    name: "body"
+    description: <<END
+      A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+END
+  }
+  summary: "output = input; While (Cond(output)) { output = Body(output) }"
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_For.pbtxt b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
new file mode 100644
index 0000000000..a58ddf56fe
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "For" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_If.pbtxt b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
new file mode 100644
index 0000000000..a44db5da08
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "If" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_While.pbtxt b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
new file mode 100644
index 0000000000..f47a9b0fce
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "While" visibility: HIDDEN }
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index b687088db1..911aa3a78f 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -21,10 +20,12 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/mutex.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
 
+namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef FunctionLibraryRuntime::Handle FHandle;
@@ -106,11 +107,9 @@ void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
   opts->runner = ctx->runner();
 }
 
-}  // end namespace
-
-class FunctionalIf : public AsyncOpKernel {
+class IfOp : public AsyncOpKernel {
  public:
-  explicit FunctionalIf(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit IfOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     auto lib = ctx->function_library();
     OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
     const NameAttrList* func;
@@ -120,7 +119,7 @@ class FunctionalIf : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &else_handle_));
   }
 
-  ~FunctionalIf() override {}
+  ~IfOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     bool cond;
@@ -134,8 +133,7 @@ class FunctionalIf : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalIf* kernel, OpKernelContext* ctx, bool cond,
-          DoneCallback done)
+    State(IfOp* kernel, OpKernelContext* ctx, bool cond, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
           cond_(cond),
@@ -168,7 +166,7 @@ class FunctionalIf : public AsyncOpKernel {
     }
 
    private:
-    FunctionalIf* const kernel_;
+    IfOp* const kernel_;
     OpKernelContext* const ctx_;
     const bool cond_;
     const DoneCallback done_;
@@ -179,18 +177,22 @@ class FunctionalIf : public AsyncOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), FunctionalIf);
+// TODO(drpng): remove this.
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
-                        FunctionalIf);
+                        IfOp);
+
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
-class FunctionalWhile : public AsyncOpKernel {
+class WhileOp : public AsyncOpKernel {
  public:
-  explicit FunctionalWhile(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit WhileOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &cond_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &body_func_));
   }
 
-  ~FunctionalWhile() override {}
+  ~WhileOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     auto lib = ctx->function_library();
@@ -234,7 +236,7 @@ class FunctionalWhile : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalWhile* kernel, OpKernelContext* ctx, FHandle cond_handle,
+    State(WhileOp* kernel, OpKernelContext* ctx, FHandle cond_handle,
           FHandle body_handle, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
@@ -253,7 +255,7 @@ class FunctionalWhile : public AsyncOpKernel {
     void Start() { EvalCond(); }
 
    private:
-    FunctionalWhile* const kernel_;
+    WhileOp* const kernel_;
     OpKernelContext* const ctx_;
     const FHandle cond_handle_;
     const FHandle body_handle_;
@@ -316,7 +318,152 @@ class FunctionalWhile : public AsyncOpKernel {
     }
   };
 };
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), FunctionalWhile);
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), FunctionalWhile);
+// TODO(drpng): remove these.
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), WhileOp);
+
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_GPU), WhileOp);
+
+Status GetScalar(OpKernelContext* ctx, int index, int32* value,
+                 const char* label) {
+  Tensor t = ctx->input(index);
+  if (!TensorShapeUtils::IsScalar(t.shape())) {
+    return errors::InvalidArgument(label, " must be a scalar, but ",
+                                   t.shape().DebugString());
+  }
+  *value = t.scalar<int32>()();
+  return Status::OK();
+}
+
+class ForOp : public AsyncOpKernel {
+ public:
+  explicit ForOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &body_handle_));
+  }
+
+  ~ForOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    (new State(this, ctx, done))->Start();
+  }
+
+ private:
+  FHandle body_handle_;
+
+  class State {
+   public:
+    State(ForOp* kernel, OpKernelContext* ctx, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())),
+          args_(1 + ctx_->num_inputs() - 3) {
+      args_[0] = Tensor(DT_INT32, {});
+      iter_ = &args_[0].scalar<int32>()();
+
+      const int32 num_loop_inputs = ctx_->num_inputs() - 3;
+      rets_.reserve(num_loop_inputs);
+      for (int i = 0; i < num_loop_inputs; ++i) {
+        rets_.push_back(ctx_->input(3 + i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      Status s = StartLoop();
+      if (!s.ok()) Finish(s);
+    }
+
+   private:
+    ForOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+
+    int32* iter_;  // points to args_[0].
+    int32 limit_;
+    int32 delta_;
+
+    // If an error e is returned, caller must call Finish(e).
+    // If OK is returned, the async loop execution has been started.
+    Status StartLoop() {
+      SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
+
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 0, iter_, "start"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 1, &limit_, "limit"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 2, &delta_, "delta"));
+
+      if ((delta_ > 0 && *iter_ <= limit_) ||
+          (delta_ < 0 && *iter_ >= limit_) ||
+          (delta_ == 0 && *iter_ == limit_)) {
+        RunNext();
+        return Status::OK();
+      } else {
+        return errors::InvalidArgument("Invalid start/limit/delta: ", *iter_,
+                                       " ", limit_, " ", delta_);
+      }
+    }
+
+    void RunNext() {
+      bool done_loop;
+      if (delta_ > 0) {
+        done_loop = *iter_ >= limit_;
+      } else {
+        done_loop = *iter_ <= limit_;
+      }
+      if (done_loop) {
+        Finish(Status::OK());
+        return;
+      }
+
+      if (rets_.size() >= args_.size()) {
+        Finish(errors::InvalidArgument(
+            "For loop body returned ", rets_.size(),
+            " arguments. Expected: ", args_.size() - 1));
+        return;
+      }
+      for (int i = 0; i < rets_.size(); ++i) {
+        args_[1 + i] = std::move(rets_[i]);
+      }
+      rets_.clear();
+      lib_->Run(opts_, kernel_->body_handle_, args_, &rets_,
+                [this](const Status& s) {
+                  if (s.ok()) {
+                    *iter_ += delta_;
+                    RunNext();
+                  } else {
+                    Finish(s);
+                  }
+                });
+    }
+
+    void Finish(Status s) {
+      if (s.ok()) {
+        s = SetOutputs(kernel_, ctx_, rets_);
+      }
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    }
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("For").Device(DEVICE_CPU), ForOp);
+REGISTER_KERNEL_BUILDER(Name("For")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("start")
+                            .HostMemory("limit")
+                            .HostMemory("delta"),
+                        ForOp);
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 4b21fac80a..792686cae1 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -50,6 +50,7 @@ REGISTER_OP("RemoteCall")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
 
+// TODO(drpng): remove this.
 REGISTER_OP("_If")
     .Input("cond: Tcond")
     .Input("input: Tin")
@@ -76,8 +77,18 @@ else_branch: A function that takes 'inputs' and returns a list of
     tensors.  whose types are the same as what then_branch returns.
 )doc");
 
-// TODO(b/37549631) setting the While Op to always be stateful is too
-// conservative.
+REGISTER_OP("If")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type)")
+    .Attr("Tout: list(type)")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+// TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
     .Output("output: T")
@@ -108,4 +119,30 @@ body: A function that takes a list of tensors and returns another
       by T.
 )doc");
 
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("While")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    });
+
+REGISTER_OP("For")
+    .Input("start: int32")
+    .Input("limit: int32")
+    .Input("delta: int32")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("body: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 1301ef9d19..10aea89173 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -39,6 +40,7 @@ import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+# pylint: disable=invalid-name
 def simple_scoped_fn(a, x):
   """Simple function: (a, x) -> 2(x+a), but with "2" as a variable in scope."""
   with variable_scope.variable_scope("body"):
@@ -607,6 +609,276 @@ class FunctionalOpsTest(test.TestCase):
       mul = sess.run(remote_op)
       self.assertEqual(mul, 9)
 
+  def testIf(self):
+
+    @function.Defun(dtypes.float32)
+    def Twice(x):
+      return x * 2
+
+    @function.Defun(dtypes.float32)
+    def Thrice(x):
+      return x * 3 + 1
+
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(x):
+        return sess.run(
+            functional_ops.If(math_ops.greater(x, 0), [x], Twice, Thrice))[0]
+
+      self.assertAllEqual(Run(9.), 18.)
+      self.assertAllEqual(Run(-8.), -23.)
+      self.assertAllEqual(Run(0.), 1.)
+
+  def testWhile(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(n):
+        return sess.run(functional_ops.While([n, 0.], Cond, Body))[1]
+
+      self.assertAllEqual(Run(20.), 210.)
+      self.assertAllEqual(Run(100.), 5050.)
+
+  def testWhileError(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def CondReturnsTooManyArgs(n, x):
+      return n > 0, x
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def BodyReturnsTooManyArgs(n, x):
+      return n - 1, x + n, x
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Expected a single scalar.*got 2 tensors."):
+        functional_ops.While([5., 0.], CondReturnsTooManyArgs, Body)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "While loop body returned 3 arguments. Expected: 2"):
+        functional_ops.While([5., 0.], Cond, BodyReturnsTooManyArgs)[0].eval()
+
+  def testWhileInMultipleSubgraphs(self):
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Cond(n, x):  # pylint: disable=unused-argument
+      return n > 0
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+      n = array_ops.placeholder(dtypes.float32)
+      _, result = functional_ops.While([n, 0.], Cond, Body)
+      c = constant_op.constant(37.)
+
+      self.assertAllEqual(210., sess.run(result, feed_dict={n: 20.}))
+      self.assertAllEqual(5050., sess.run(result, feed_dict={n: 100.}))
+      # Test that the result is the same when we run a different subgraph.
+      self.assertAllEqual(5050., sess.run([result, c], feed_dict={n: 100.})[0])
+
+  def _tfSum(self, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu) as sess:
+
+      @function.Defun(dtypes.int32, dtypes.float32)
+      def Body(n, x):
+        return x + math_ops.to_float(n)
+
+      xs = [
+          # 1 + 2  + ... + 20
+          functional_ops.For(
+              1, 21, 1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+          # 100 + 99 + ... + 1
+          functional_ops.For(
+              100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+      ]
+      xvals = sess.run(xs)
+    self.assertAllEqual(210, xvals[0])
+    self.assertAllEqual(5050, xvals[1])
+
+  def testFor(self):
+    self._tfSum(False)
+
+  def testForWithWhile(self):
+    self._tfSum(True)
+
+  def testForWithWhileNaming(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="TestBody")
+      def TestBody(n, x):
+        return x + math_ops.to_float(n)
+
+      _ = functional_ops.For(
+          1, 21, 1, [0.], TestBody, rewrite_with_while=True)[0]
+
+    names = []
+    for func in g.as_graph_def().library.function:
+      names.append(func.signature.name)
+    self.assertTrue("TestBody" in names)
+    self.assertTrue("TestBody_Cond" in names)
+    self.assertTrue("TestBody_Body" in names)
+
+  def testForCapturedInputs(self):
+    v = variables.Variable(1.0)
+
+    @function.Defun(dtypes.int32)
+    def TestNullary(n):
+      v + math_ops.to_float(n)  # pylint: disable=expression-not-assigned
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def TestUnary(n, x):
+      return x + math_ops.to_float(n) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32, dtypes.float32)
+    def TestBinary(n, x, x2):
+      return x + math_ops.to_float(n) + v, x2 + v
+
+    for rewrite_with_while in (True, False):
+      # TODO(b/65752372): Set `use_gpu=False` because
+      # `functional_ops.While()` does not reliably work on GPU (apparently
+      # because the result of evaluating the condition may be in device
+      # memory, but it is read on the host).
+      use_gpu = not rewrite_with_while
+      with self.test_session(use_gpu=use_gpu) as sess:
+        result_nullary = functional_ops.For(
+            1, 10, 1, [], TestNullary,
+            rewrite_with_while=rewrite_with_while)
+        result_unary = functional_ops.For(
+            1, 10, 1, [0.], TestUnary,
+            rewrite_with_while=rewrite_with_while)
+        result_binary = functional_ops.For(
+            1, 10, 1, [0., 0.], TestBinary,
+            rewrite_with_while=rewrite_with_while)
+        sess.run(variables.global_variables_initializer())
+        assert not result_nullary
+        # The nullary variant doesn't return anything so we can't easily run it.
+        # As a total hack, fetch the operation by name and run it.
+        sess.run(ops.get_default_graph().get_operation_by_name(
+            "While" if rewrite_with_while else "For"))
+        assert len(result_unary) == 1
+        self.assertEqual([54.0], sess.run(result_unary))
+        assert len(result_binary) == 2
+        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+
+  def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu):
+
+      @function.Defun(dtypes.int32, *[dtypes.float64] * 3)
+      def MLP(i, a, ws, bs):
+        a = math_ops.tanh(math_ops.matmul(a, ws[i, :]) + bs[i, :])
+        return a, ws, bs
+
+      ret = functional_ops.For(
+          0,
+          wsval.shape[0],
+          1, [xval, wsval, bsval],
+          MLP,
+          rewrite_with_while=rewrite_with_while)[0]
+
+      return ret.eval()
+
+  def _npMLP(self, xval, wsval, bsval):
+    for i in range(wsval.shape[0]):
+      xval = np.tanh(np.dot(xval, wsval[i, :]) + bsval[i, :])
+    return xval
+
+  def _testForMLP(self, rewrite_with_while):
+    # We construct a 5-layer Multi-Layer Perceptron network here.
+    # Each layer have the same number of hidden unites (3), and the
+    # activation function is tanh().  We feed the input (xval) with
+    # batch size 2.
+    xval = np.random.normal(size=(2, 3))
+    wsval = np.random.normal(size=(5, 3, 3))
+    bsval = np.random.normal(size=(5, 3))
+    np_ans = self._npMLP(xval, wsval, bsval)
+    tf_for_ans = self._tfMLP(xval, wsval, bsval, rewrite_with_while)
+    self.assertAllClose(np_ans, tf_for_ans)
+
+  def testForMLP(self):
+    self._testForMLP(False)
+
+  def testForMLPWhile(self):
+    self._testForMLP(True)
+
+  def testForError(self):
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def Foo(i, v):
+      return math_ops.to_float(i) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def ReturnsTooManyArgs(unused_i, v):
+      return v, v
+
+    with self.test_session(use_gpu=True):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a scalar"):
+        functional_ops.For([0], 10, 1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Invalid start/limit/delta"):
+        functional_ops.For(0, 10, -1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "For loop body returned 2 arguments. Expected: 1"):
+        functional_ops.For(0, 10, 1, [0.0], ReturnsTooManyArgs)[0].eval()
+
+  def testGradient(self):
+
+    @function.Defun(dtypes.float32)
+    def Poly(x):
+      # y = 2x^3+3x^2+4x+8
+      return 2 * x * x * x + 3 * x * x + 4 * x + 8
+
+    @function.Defun(dtypes.float32)
+    def Grad(x):
+      # dy/dx = dy/dy * dy/dx = 1.0 * (6x^2+6x+4)
+      return functional_ops.Gradient([x, 1.0], Poly)[0]
+
+    with self.test_session(use_gpu=False) as sess:
+      a = constant_op.constant(0.)
+      avals = [Poly(a), Grad(a)]
+      b = constant_op.constant(1.)
+      bvals = [Poly(b), Grad(b)]
+      self.assertAllEqual(sess.run(avals), [8., 4.])
+      self.assertAllEqual(sess.run(bvals), [17., 16.])
+
 
 if __name__ == "__main__":
   test.main()
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index a840b1eddf..4d95ca262c 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,22 +27,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_functional_ops import *
-# pylint: enable=wildcard-import
 # pylint: disable=unused-import
-from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
+from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
+from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -634,3 +636,249 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
       varscope.set_caching_device(None)
 
     return output_pack(results_flat)
+
+
+# pylint: disable=invalid-name
+def If(cond, inputs, then_branch, else_branch, name=None):
+  r"""output = Cond(inputs) ? then_branch(inputs) : else_branch(inputs).
+
+  Args:
+    cond: A `Tensor`. A scalar. If the scalar is not a boolean, the scalar is
+      converted to a boolean according to the following rule: if the
+      scalar is a numerical value, non-zero means True and zero means
+      False; if the scalar is a string, non-empty means True and empty
+      means False.
+    inputs: A list of input tensors.
+    then_branch: A function takes 'inputs' and returns a list of tensors,
+        whose types are the same as what else_branch returns.
+    else_branch: A function takes 'inputs' and returns a list of tensors.
+        whose types are the same as what then_branch returns.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors returned by either then_branch(inputs)
+    or else_branch(inputs).
+  """
+  # pylint: disable=protected-access
+  return gen_functional_ops._if(
+      cond,
+      inputs, [_.type for _ in then_branch.definition.signature.output_arg],
+      then_branch,
+      else_branch,
+      name=name)
+
+
+def Gradient(inputs, f, name=None):
+  r"""Computes the gradient function for function f via backpropagation.
+
+  Args:
+    inputs: A list of tensors of size N + M.
+    f: The function we want to compute the gradient for.
+
+      The function 'f' must be a numerical function which takes N inputs and
+      produces M outputs. Its gradient function 'g', which is  a function
+      taking N + M inputs and produces N outputs.
+
+      I.e. if we have
+         (y1, y2, ..., yM) = f(x1, x2, ..., xN),
+      then, g is
+         (dL/dx1, dL/dx2, ..., dL/dxN) = g(x1, x2, ..., xN,
+                                           dL/dy1, dL/dy2, ..., dL/dyM),
+
+      where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
+      loss function). dL/dxi is the partial derivative of L with respect
+      to xi.
+
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors of size N.
+  """
+  # TODO(zhifengc): Pretty-print the above spec in latex.
+  # TODO(zhfiengc): Needs some math expert to say the comment above better.
+  tlist = [_.type for _ in f.definition.signature.input_arg]
+  return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name)
+
+
+# pylint: disable=invalid-name,protected-access
+def While(input_, cond, body, name=None, hostmem=None):
+  r"""output = input; While (Cond(output)) { output = Body(output) }.
+
+  Args:
+    input_: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    cond: . A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+    body: . A funcion takes a list of tensors and returns another
+      list tensors. Both lists have the same types as specified
+      by T.
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, input[i] is a
+      host memory tensor.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  ret = gen_functional_ops._while(input_, cond, body, name=name)
+  if hostmem:
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# b/36459430
+#
+# Ideally, we do not need this rewrite For loop into a While loop.
+# However, today, if a While runs on GPU and the condition returns a
+# boolean, the While kernel crashes. Even if we fix the crash, the
+# bool needs to be copied between GPU and CPU. So, a for loop is much
+# preferred when running on GPU.
+#
+# On the other hand, For op has no directly XLA kernel. So, when we run
+# a for loop, we need to rewrite it using a While op.
+#
+# It should be possible and probably better to write a XLA C++ kernel
+# implementing the logic in _ForUsingWhile.
+def _ForUsingWhile(start,
+                   limit,
+                   delta,
+                   inputs,
+                   forbody,
+                   name=None,
+                   hostmem=None):
+  """Helper to implement a For loop using a While."""
+  # To support negative delta (e.g., range(100, 0, -3)), we iterate
+  # over the range(n) and use iter * delta + start as the real
+  # iteration index. (e.g., for i in range(34): iter = i * (-3) +
+  # 100).
+  d = math_ops.abs(delta)
+  # XLA on TPUs doesn't support integer division
+  n = math_ops.cast(
+      math_ops.cast((math_ops.abs(limit - start) + d - 1), dtypes.float32) /
+      math_ops.cast(d, dtypes.float32), dtypes.int32)
+
+  # Carried loop variables ("extra_args") are implicitly added to the input list
+  # of the WhileBody function. WhileCond does not call forbody, and so does not
+  # depend on any of forbody's extra_args. Since WhileCond and WhileBody
+  # must have identical inputs, we have to augment the cond signature to take
+  # the same types as the carried loop variables.
+  body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:]
+  cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs]
+
+  cond_name = "%s_Cond" % forbody.name
+
+  @function.Defun(*cond_sig, func_name=cond_name)
+  def WhileCond(i, n, *args):
+    del args
+    return i < n
+
+  body_name = "%s_Body" % forbody.name
+
+  @function.Defun(*body_sig, func_name=body_name)
+  def WhileBody(i, n, start, delta, *args):
+    """A While wrapper for forbody that handles loop-carried captured inputs."""
+    for_result = forbody(start + i * delta, *args)
+    # Nullary functions return an Operation. Normal functions can't do this
+    # because their return values are converted to Tensors.
+    if isinstance(for_result, ops.Operation):
+      for_result = ()
+    # Unary functions return a single Tensor value.
+    elif isinstance(for_result, ops.Tensor):
+      for_result = (for_result,)
+    extra_args = tuple(function.get_extra_args())
+    return (i + 1, n, start, delta) + tuple(for_result) + extra_args
+
+  if hostmem is not None:
+    hostmem = [(4 + _) for _ in hostmem]
+
+  results = While(
+      input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
+      cond=WhileCond,
+      body=WhileBody,
+      name=name,
+      hostmem=hostmem)
+  # Slice off the loop-carried captured inputs.
+  return list(results[4:len(results) - len(WhileBody.captured_inputs)])
+
+
+def For(start,
+        limit,
+        delta,
+        inputs,
+        body,
+        name=None,
+        hostmem=None,
+        rewrite_with_while=None):
+  r"""out = input; for i in range(start, limit, delta) out = body(i, out).
+
+  Args:
+    start: A `Tensor` of type `int32`.
+    limit: A `Tensor` of type `int32`.
+    delta: A `Tensor` of type `int32`.
+    inputs: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    body: A function takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as (int32, T...).
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, inputs[i] is a
+      host memory tensor. In other words, (i+1)-th argument of the body
+      function is expecting a host memory.
+    rewrite_with_while: If True, using While op to implement the For.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  if rewrite_with_while:
+    return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem)
+  if body.captured_inputs:
+    wrapper_name = "%s_BodyWrapper" % body.name
+
+    @function.Defun(*body.declared_input_types, func_name=wrapper_name)
+    def BodyWrapper(*args):
+      """A wrapper for body that handles loop-carried captured inputs."""
+      body_result = body(*args)
+      extra_args = tuple(function.get_extra_args())
+      # Nullary functions return an Operation. Normal functions can't do this
+      # because their return values are converted to Tensors.
+      if isinstance(body_result, ops.Operation):
+        return extra_args
+      # Unary functions return a single Tensor value.
+      elif not isinstance(body_result, tuple):
+        return (body_result,) + extra_args
+      # N-ary functions return a tuple of Tensors.
+      else:
+        return body_result + extra_args
+
+    inputs += BodyWrapper.captured_inputs
+    ret = gen_functional_ops._for(
+        start, limit, delta, inputs, BodyWrapper, name=name)
+    # Slice off the loop-carried captured inputs.
+    ret = ret[:-len(BodyWrapper.captured_inputs)]
+  else:
+    ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name)
+  if hostmem:
+    num_for_params = 3  # start/limit/delta
+
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend([num_for_params + i for i in hostmem])
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# pylint: enable=invalid-name,protected-access
-- 
GitLab


From e0488c7f4ee1d3d996c6e87076f4683a41883fd3 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 2 Apr 2018 14:19:42 -0700
Subject: [PATCH 0158/1262] Export "VERSION" as "__version__" as well

PiperOrigin-RevId: 191346647
---
 tensorflow/python/framework/versions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 06955b8858..d08b4bf48a 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -29,7 +29,7 @@ __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
 __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
-tf_export("VERSION").export_constant(__name__, "VERSION")
+tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
 tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
-- 
GitLab


From 817882c28fd6f0dbbbf35b6ac0764ccbd38430d0 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 2 Apr 2018 14:40:37 -0700
Subject: [PATCH 0159/1262] ResourceHandleShapeAndType returns bytes, not
 unicode.

This could cause failures when enabling the C API with python3.

PiperOrigin-RevId: 191350031
---
 tensorflow/python/client/tf_session.i | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 5dcd0c192e..ee76e29c05 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -460,6 +460,11 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
   $1 = PyLong_AsLongLong($input);
 }
 
+// Override default py3 behavior of attempting to encode into Unicode.
+%typemap(out) std::string tensorflow::ResourceHandleShapeAndType {
+  $result = PyBytes_FromStringAndSize($1.data(), $1.size());
+}
+
 // TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
 // skip for now
 %ignore TF_WhileParams;
-- 
GitLab


From 8f543ed7e3e2775aedb5c953f7f5cbff2139663a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 14:46:13 -0700
Subject: [PATCH 0160/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191350894
---
 tensorflow/c/c_api_test.cc                        |  2 +-
 tensorflow/cc/saved_model/loader_test.cc          | 15 ++++++++-------
 tensorflow/cc/tutorials/example_trainer.cc        |  6 ++++--
 .../compiler/xla/service/llvm_ir/llvm_util.cc     |  3 ++-
 .../compiler/xla/tools/parser/hlo_parser_test.cc  |  3 ++-
 .../contrib/android/asset_manager_filesystem.cc   |  5 ++---
 tensorflow/contrib/nccl/kernels/nccl_rewrite.cc   |  3 ++-
 .../tensorboard/db/summary_file_writer_test.cc    |  3 ++-
 .../distributed_runtime/base_rendezvous_mgr.cc    |  4 ++--
 .../core/distributed_runtime/rpc/grpc_session.cc  |  3 ++-
 tensorflow/core/grappler/op_types.cc              |  3 ++-
 tensorflow/core/lib/wav/wav_io_test.cc            |  3 ++-
 tensorflow/core/ops/math_grad_test.cc             |  3 ++-
 tensorflow/core/ops/math_ops_test.cc              | 14 ++++++++------
 .../platform/hadoop/hadoop_file_system_test.cc    |  3 ++-
 .../internal/advisor/tfprof_advisor_test.cc       | 14 ++++++++------
 .../core/util/tensor_bundle/tensor_bundle_test.cc | 13 +++++++------
 tensorflow/examples/label_image/main.cc           |  7 ++++---
 tensorflow/examples/multibox_detector/main.cc     |  7 ++++---
 tensorflow/python/eager/python_eager_op_gen.cc    |  4 ++--
 tensorflow/python/lib/core/py_seq_tensor.cc       |  7 ++++---
 21 files changed, 72 insertions(+), 53 deletions(-)

diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 028f146be3..ca80db23ed 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -53,7 +53,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 4c64d2cfe3..72b8bc1871 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -133,9 +134,9 @@ TEST_F(LoaderTest, NoTagMatch) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {"missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message())
-                  .contains("Could not find meta graph def matching supplied "
-                            "tags: { missing-tag }"))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: { missing-tag }"))
       << st.error_message();
 }
 
@@ -149,9 +150,9 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe, "missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(
-      StringPiece(st.error_message())
-          .contains("Could not find meta graph def matching supplied tags: "))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: "))
       << st.error_message();
 }
 
@@ -169,7 +170,7 @@ TEST_F(LoaderTest, SessionCreationFailure) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+  EXPECT_TRUE(str_util::StrContains(st.error_message(), kInvalidTarget))
       << st.error_message();
 }
 
diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc
index 3675d72ee3..5dbc4f5f6a 100644
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -166,7 +167,8 @@ namespace {
 
 bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     int32* dst) {
-  if (arg.Consume(flag) && arg.Consume("=")) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag) &&
+      tensorflow::str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     return (sscanf(arg.data(), "%d%c", dst, &extra) == 1);
   }
@@ -176,7 +178,7 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 
 bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    bool* dst) {
-  if (arg.Consume(flag)) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *dst = true;
       return true;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 2a282f3be7..ec04239b4f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -762,7 +763,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
     fake_argv_storage.push_back("");
     for (const auto& it : options) {
       // Skip options the XLA backend itself consumes.
-      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
+      if (!tensorflow::str_util::StartsWith(it.first, "xla_")) {
         if (it.second.empty()) {
           fake_argv_storage.push_back(it.first);
         } else {
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 863081d654..adc8b1d620 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -894,7 +895,7 @@ class HloParserTest : public ::testing::Test,
                       public ::testing::WithParamInterface<TestData> {
  protected:
   static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-    EXPECT_TRUE(StringPiece(s).contains(expected))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(s, expected))
         << "'" << s << "' does not contain '" << expected << "'";
   }
 
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc
index fe2d13e636..513d519eab 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.cc
+++ b/tensorflow/contrib/android/asset_manager_filesystem.cc
@@ -229,9 +229,8 @@ string AssetManagerFileSystem::NormalizeDirectoryPath(const string& fname) {
 }
 
 string AssetManagerFileSystem::RemoveAssetPrefix(const string& name) {
-  string output(name);
-  StringPiece piece(output);
-  piece.Consume(prefix_);
+  StringPiece piece(name);
+  str_util::ConsumePrefix(&piece, prefix_);
   return piece.ToString();
 }
 
diff --git a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
index a4de46a93f..4676e937e5 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #include <forward_list>
@@ -254,7 +255,7 @@ class NcclReplacePass : public GraphOptimizationPass {
     // Find reduction and broadcast ops and replace them with Send/Recv ops.
     for (Node* node : graph->op_nodes()) {
       StringPiece type = node->type_string();
-      if (!type.starts_with("Nccl")) {
+      if (!str_util::StartsWith(type, "Nccl")) {
         continue;
       }
       if (type == "NcclReduce") {
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index c61b465596..cd3f712256 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/event.pb.h"
@@ -58,7 +59,7 @@ class SummaryFileWriterTest : public ::testing::Test {
     TF_CHECK_OK(env_.GetChildren(testing::TmpDir(), &files));
     bool found = false;
     for (const string& f : files) {
-      if (StringPiece(f).contains(test_name)) {
+      if (str_util::StrContains(f, test_name)) {
         if (found) {
           return errors::Unknown("Found more than one file for ", test_name);
         }
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 049eec347c..bafd9bfc68 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -144,9 +144,9 @@ BaseRemoteRendezvous::~BaseRemoteRendezvous() {
 // Returns true if "device_name" is a valid full name of local device
 // of the "worker".  This helper is purely based on the worker name
 // and device name and does no lookups in the worker->device_mgr.
-static bool IsLocalDevice(const string& worker_name,
+static bool IsLocalDevice(const StringPiece worker_name,
                           const StringPiece device_name) {
-  return device_name.starts_with(worker_name);
+  return str_util::StartsWith(device_name, worker_name);
 }
 
 Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 120a33f17b..3e79a40683 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -402,7 +403,7 @@ Status GrpcSession::Reset(const SessionOptions& options,
 class GrpcSessionFactory : public SessionFactory {
  public:
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with(kSchemePrefix);
+    return str_util::StartsWith(options.target, kSchemePrefix);
   }
 
   Session* NewSession(const SessionOptions& options) override {
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index e0ee49d157..e12e432a33 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -409,7 +410,7 @@ bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
   string op_name = node.op();
   std::transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
-  if (StringPiece(op_name).contains("inplace")) {
+  if (str_util::StrContains(op_name, "inplace")) {
     return true;
   }
   return GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace");
diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc
index d8a83fc464..9e41da6a20 100644
--- a/tensorflow/core/lib/wav/wav_io_test.cc
+++ b/tensorflow/core/lib/wav/wav_io_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -203,7 +204,7 @@ TEST(WavIO, ChunkSizeOverflow) {
       wav_data_string, &decoded_audio, &decoded_sample_count,
       &decoded_channel_count, &decoded_sample_rate);
   EXPECT_FALSE(decode_status.ok());
-  EXPECT_TRUE(StringPiece(decode_status.error_message()).contains("too large"))
+  EXPECT_TRUE(str_util::StrContains(decode_status.error_message(), "too large"))
       << decode_status.error_message();
 }
 
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 8dcd3e815f..da38a6bc24 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -362,7 +363,7 @@ class MathGradTest : public ::testing::Test {
 };
 
 void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index ca3772e6f8..8f974d5367 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -239,20 +240,21 @@ TEST(MathOpsTest, Select_ShapeFn) {
 
   // Expect an error when the shapes can't be merged.
   handle_data[2]->at(0).first = shape_proto({2, 2});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "must be equal, but are 1 and 2"));
   handle_data[2]->at(0).first = i1;  // restore to valid
 
   // Expect an error when the types can't be merged.
   handle_data[2]->at(1).second = DT_INT64;
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different dtypes"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "pointing to different dtypes"));
   handle_data[2]->at(1).second = DT_INT32;  // restore to valid
 
   // Expect an error when different numbers of tensors are merged.
   handle_data[2]->push_back({i1, DT_FLOAT});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different numbers of tensors"));
+  EXPECT_TRUE(
+      str_util::StrContains(run_inference_for_handles().error_message(),
+                            "pointing to different numbers of tensors"));
   handle_data[2]->pop_back();  // restore to valid.
 }
 
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index 6ba2f04d0f..b207d34749 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -197,7 +198,7 @@ TEST_F(HadoopFileSystemTest, WriteWhileReading) {
   // Skip the test if we're not testing on HDFS. Hadoop's local filesystem
   // implementation makes no guarantees that writable files are readable while
   // being written.
-  if (!StringPiece(fname).starts_with("hdfs://")) {
+  if (!str_util::StartsWith(fname, "hdfs://")) {
     return;
   }
 
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index e968b9c97e..96b6cc30bd 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
 
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,8 +83,8 @@ TEST_F(TFProfAdvisorTest, OperationChecker) {
   (*options.mutable_checkers())[kCheckers[1]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[1]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[1]).reports(0))
-                  .contains("NCHW"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[1]).reports(0), "NCHW"));
 }
 
 TEST_F(TFProfAdvisorTest, UtilizationChecker) {
@@ -91,16 +92,17 @@ TEST_F(TFProfAdvisorTest, UtilizationChecker) {
   (*options.mutable_checkers())[kCheckers[0]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[0]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[0]).reports(0))
-                  .contains("low utilization"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[0]).reports(0), "low utilization"));
 }
 
 TEST_F(TFProfAdvisorTest, ExpensiveOperationChecker) {
   AdvisorOptionsProto options;
   (*options.mutable_checkers())[kCheckers[2]];
   AdviceProto advice = advisor_->Advise(options);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[2]).reports(0))
-                  .contains("top 1 operation type: Conv2D"));
+  EXPECT_TRUE(
+      str_util::StrContains(advice.checkers().at(kCheckers[2]).reports(0),
+                            "top 1 operation type: Conv2D"));
 }
 
 }  // namespace tfprof
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index 08f1aa7125..7f166f0ec0 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -293,7 +294,7 @@ void VersionTest(const VersionDef& version, StringPiece expected_error) {
   BundleReader reader(Env::Default(), path);
   EXPECT_TRUE(errors::IsInvalidArgument(reader.status()));
   EXPECT_TRUE(
-      StringPiece(reader.status().error_message()).starts_with(expected_error));
+      str_util::StartsWith(reader.status().error_message(), expected_error));
 }
 
 }  // namespace
@@ -588,7 +589,7 @@ TEST(TensorBundleTest, Error) {
     TF_EXPECT_OK(writer.Add("foo", Constant_2x3(1.f)));
     EXPECT_FALSE(writer.Add("foo", Constant_2x3(2.f)).ok());
     EXPECT_TRUE(
-        StringPiece(writer.status().ToString()).contains("duplicate key"));
+        str_util::StrContains(writer.status().ToString(), "duplicate key"));
     EXPECT_FALSE(writer.Finish().ok());
   }
   {  // Double finish
@@ -598,7 +599,7 @@ TEST(TensorBundleTest, Error) {
   }
   {  // Not found.
     BundleReader reader(Env::Default(), Prefix("nonexist"));
-    EXPECT_TRUE(StringPiece(reader.status().ToString()).contains("Not found"));
+    EXPECT_TRUE(str_util::StrContains(reader.status().ToString(), "Not found"));
   }
 }
 
@@ -629,7 +630,7 @@ TEST(TensorBundleTest, Checksum) {
     BundleReader reader(Env::Default(), Prefix(prefix));
     Status status = reader.Lookup(key, &val);
     EXPECT_TRUE(errors::IsDataLoss(status));
-    EXPECT_TRUE(StringPiece(status.ToString()).contains(expected_msg));
+    EXPECT_TRUE(str_util::StrContains(status.ToString(), expected_msg));
   };
 
   // Corrupts a float tensor.
@@ -680,8 +681,8 @@ TEST(TensorBundleTest, Endianness) {
 
   BundleReader reader(Env::Default(), Prefix("end"));
   EXPECT_TRUE(errors::IsUnimplemented(reader.status()));
-  EXPECT_TRUE(StringPiece(reader.status().ToString())
-                  .contains("different endianness from the reader"));
+  EXPECT_TRUE(str_util::StrContains(reader.status().ToString(),
+                                    "different endianness from the reader"));
 }
 
 TEST(TensorBundleTest, TruncatedTensorContents) {
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 63bc39de6c..baa65d3243 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -137,15 +138,15 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     // gif decoder returns 4-D tensor, remove the first dim
     image_reader =
         Squeeze(root.WithOpName("squeeze_first_dim"),
                 DecodeGif(root.WithOpName("gif_reader"), file_reader));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".bmp")) {
     image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index e38704fd98..96ea525a4e 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -84,10 +85,10 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
@@ -131,7 +132,7 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
 
 Status SaveImage(const Tensor& tensor, const string& file_path) {
   LOG(INFO) << "Saving image to " << file_path;
-  CHECK(tensorflow::StringPiece(file_path).ends_with(".png"))
+  CHECK(tensorflow::str_util::EndsWith(file_path, ".png"))
       << "Only saving of png files is supported.";
 
   auto root = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 06185904e7..15d20bdd1a 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -117,7 +117,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
                    const string& function_name)
       : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
     op_name_ = function_name_;
-    op_name_.Consume("_");
+    str_util::ConsumePrefix(&op_name_, "_");
   }
   ~GenEagerPythonOp() override {}
 
@@ -492,7 +492,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
                          " = ", default_value, "\n");
     }
-    if (attr_type.starts_with("list(")) {
+    if (str_util::StartsWith(attr_type, "list(")) {
       ExpectListArg(indentation, attr_api_name, function_setup);
     }
 
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 8247d354db..32ea737a99 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
 #include "tensorflow/python/lib/core/py_util.h"
@@ -77,9 +78,9 @@ string PyRepr(PyObject* obj) {
 bool IsPyDimension(PyObject* obj) {
   const char* tp_name = obj->ob_type->tp_name;
   if (strcmp(tp_name, "Dimension") != 0) return false;
-  bool ret =
-      StringPiece(PyRepr(PyType(obj)))
-          .ends_with("tensorflow.python.framework.tensor_shape.Dimension'>");
+  bool ret = str_util::EndsWith(
+      PyRepr(PyType(obj)),
+      "tensorflow.python.framework.tensor_shape.Dimension'>");
   return ret;
 }
 
-- 
GitLab


From 1ec02d2a1cf74ab7c0889b6fc6c678c31274d659 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Mon, 2 Apr 2018 14:48:21 -0700
Subject: [PATCH 0161/1262] [TF] Copy-on-write for Resource Variant assign op.

PiperOrigin-RevId: 191351293
---
 .../core/kernels/resource_variable_ops.cc     | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index d1675f27dd..082b57b8e2 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -252,6 +252,7 @@ class AssignVariableOp : public OpKernel {
     // tensor, we can just adopt the input tensor's buffer instead.
     std::unique_ptr<Tensor> input_alias =
         context->forward_input(1, dtype_, value.shape(), DEVICE_MEMORY, attr);
+
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     if (input_alias) {
@@ -363,9 +364,33 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(DT_VARIANT)));
 
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+
+    // Copying is unnecessary if we are the last user of the value
+    // tensor, we can just adopt the input tensor's buffer instead.
+    // Note that Variant objects themselves always reside on host.
+    std::unique_ptr<Tensor> input_alias =
+        context->forward_input(1, DT_VARIANT, value.shape(), HOST_MEMORY, attr);
+
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
-    *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+    if (input_alias) {
+      *variable->tensor() = *input_alias;
+      return;
+    }
+
+    // Need to copy, but maybe we can re-use variable's buffer?
+    if (!variable->tensor()->RefCountIsOne() ||
+        !variable->tensor()->shape().IsSameSize(value.shape())) {
+      PersistentTensor unused;
+      Tensor* tmp;
+      OP_REQUIRES_OK(context,
+                     context->allocate_persistent(DT_VARIANT, value.shape(),
+                                                  &unused, &tmp, attr));
+      *variable->tensor() = *tmp;
+    }
+
     const auto elements_in = value.flat<Variant>();
     auto elements_out = variable->tensor()->flat<Variant>();
     auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
-- 
GitLab


From 2ded65fbbb5724b6ecfca05fb6727f27bb29a14a Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 2 Apr 2018 15:00:40 -0700
Subject: [PATCH 0162/1262] Fix assertion error in
 Graph._create_op_from_tf_operation.

_create_op_from_tf_operation would raise an assertion error if a node
was imported with the same name as an unused name_scope. This isn't
really an error, so this change removes the assert.

Note that with this change, the affected node will have a different
name with the C API enabled, but this seems ok especially considering
this is an edge case.

PiperOrigin-RevId: 191353116
---
 tensorflow/python/framework/importer_test.py | 17 +++++++++++++++++
 tensorflow/python/framework/ops.py           |  8 ++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 369669c2e6..2c913d1e02 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -219,6 +219,23 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(outer_inner.name, "outer/inner_1")
       self.assertEqual(outer_inner_c.name, "outer/inner/c_1")
 
+  def testEmptyNameScope(self):
+    with ops.Graph().as_default():
+      # Create name scope but don't create any ops with it
+      with ops.name_scope("foo"):
+        pass
+
+      # Import graph def that uses name scope name
+      op, = importer.import_graph_def(
+          self._MakeGraphDef("node { name: 'foo' op: 'IntOutput' }"),
+          return_elements=["foo"],
+          name="")
+
+      if ops._USE_C_API:
+        self.assertEqual(op.name, "foo")
+      else:
+        self.assertEqual(op.name, "foo_1")
+
   def testInputMap(self):
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index c0baeb98ac..be0fe5ee44 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -3344,8 +3344,12 @@ class Graph(object):
     """
     self._check_not_finalized()
     ret = Operation(c_op, self)
-    assert ret.name not in self._names_in_use
-    self._names_in_use[ret.name] = 1
+    # If a name_scope was created with ret.name but no nodes were created in it,
+    # the name will still appear in _names_in_use even though the name hasn't
+    # been used. This is ok, just leave _names_in_use as-is in this case.
+    # TODO(skyewm): make the C API guarantee no name conflicts.
+    if ret.name not in self._names_in_use:
+      self._names_in_use[ret.name] = 1
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
-- 
GitLab


From 5bb819f64deaa9a641abd95b17c00a843dcb3ce8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 15:17:59 -0700
Subject: [PATCH 0163/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 191356007
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 96 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 96 +++++++++++++++++++
 2 files changed, 192 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 7cdf36f423..10b24c2d34 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -20671,6 +20671,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -22754,6 +22786,45 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
@@ -68075,6 +68146,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 42a68cb712..5764976aee 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -9779,6 +9779,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -11183,6 +11215,45 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
@@ -32936,6 +33007,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
-- 
GitLab


From 11c0faed23ec32c3f1532f5154dd3c7bb38847d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 15:27:24 -0700
Subject: [PATCH 0164/1262] [XLA] Set trace for the operand of a trace
 instruction when creating the instruction directly or creating from proto.
 Also implement XlaBuidler::Trace.

PiperOrigin-RevId: 191357376
---
 tensorflow/compiler/xla/client/xla_client/xla_builder.cc | 7 ++++++-
 tensorflow/compiler/xla/service/hlo_instruction.cc       | 8 ++++++++
 tensorflow/compiler/xla/service/user_computation.cc      | 1 -
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 04091ecb11..ec2362179e 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -543,7 +543,12 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
 }
 
 void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+    *instr.mutable_literal() = Literal::CreateR1U8(tag)->ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a2a2c1e615..fcf9ebf5f7 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -98,6 +98,13 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     }
   }
 
+  if (instruction->opcode() == HloOpcode::kTrace) {
+    TF_RET_CHECK(instruction->operands().size() == 1)
+        << "Trace instruction should have 1 operand but sees "
+        << instruction->operands().size();
+    instruction->mutable_operand(0)->set_tracing(instruction.get());
+  }
+
   TF_RET_CHECK(!proto.name().empty());
   instruction->name_ = proto.name();
 
@@ -170,6 +177,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
   instruction->operands_.push_back(operand);
   instruction->literal_ = Literal::CreateR1U8(tag);
+  operand->set_tracing(instruction.get());
   return instruction;
 }
 
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index fcdb2e01fb..532f7fd5bf 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -3491,7 +3491,6 @@ void ComputationLowerer::Visit(
       HloInstruction* operand = lookup_instruction(trace_request.operand());
       hlo_instruction = add_instruction(
           HloInstruction::CreateTrace(trace_request.tag(), operand));
-      operand->set_tracing(hlo_instruction);
       break;
     }
 
-- 
GitLab


From 84fc3becd9c18b3f93ec60f35a3b474a8f192bd9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 15:39:35 -0700
Subject: [PATCH 0165/1262] Add tf.math.polyval that evaluates an element-wise
 polynomial using Horner's method. This is equivalent to numpy.polyval.

PiperOrigin-RevId: 191359241
---
 .../python/kernel_tests/cwise_ops_test.py     | 42 ++++++++++
 tensorflow/python/ops/math_ops.py             | 83 ++++++++++++++-----
 tensorflow/tools/api/generator/BUILD          |  1 +
 .../tools/api/golden/tensorflow.math.pbtxt    |  7 ++
 tensorflow/tools/api/golden/tensorflow.pbtxt  |  4 +
 5 files changed, 115 insertions(+), 22 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/tensorflow.math.pbtxt

diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 8db0bb6f0d..34e7751243 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -2165,5 +2165,47 @@ class AccumulateTest(test.TestCase):
         math_ops.accumulate_n([a], tensor_dtype=np.int32)
 
 
+class PolyvalTest(test.TestCase):
+
+  def _runtest(self, dtype, degree):
+    x = np.random.rand(2, 2).astype(dtype)
+    coeffs = [np.random.rand(2, 2).astype(dtype) for _ in range(degree + 1)]
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+  def testSimple(self):
+    for dtype in [
+        np.int32, np.float32, np.float64, np.complex64, np.complex128
+    ]:
+      for degree in range(5):
+        self._runtest(dtype, degree)
+
+  def testBroadcast(self):
+    dtype = np.float32
+    degree = 3
+    shapes = [(1,), (2, 1), (1, 2), (2, 2)]
+    for x_shape in shapes:
+      for coeff_shape in shapes:
+        x = np.random.rand(*x_shape).astype(dtype)
+        coeffs = [
+            np.random.rand(*coeff_shape).astype(dtype)
+            for _ in range(degree + 1)
+        ]
+        np_val = np.polyval(coeffs, x)
+        with self.test_session():
+          tf_val = math_ops.polyval(coeffs, x)
+          self.assertAllClose(np_val, tf_val.eval())
+
+  def testEmpty(self):
+    x = np.random.rand(2, 2).astype(np.float32)
+    coeffs = []
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 276897ab99..1c20d004cb 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -71,6 +71,7 @@ See the @{$python/math_ops} guide.
 @@igammac
 @@zeta
 @@polygamma
+@@polyval
 @@betainc
 @@rint
 @@diag
@@ -174,6 +175,7 @@ from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -184,7 +186,6 @@ arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylin
 tf_export("arg_max")(arg_max)
 tf_export("arg_min")(arg_min)
 
-
 # This is set by resource_variable_ops.py. It is included in this way since
 # there is a circular dependency between math_ops and resource_variable_ops
 _resource_variable_type = None
@@ -1343,8 +1344,7 @@ def _ReductionDims(x, axis, reduction_indices):
   else:
     # Fast path: avoid creating Rank and Range ops if ndims is known.
     if isinstance(x, ops.Tensor) and x._rank() is not None:  # pylint: disable=protected-access
-      return constant_op.constant(
-          np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
+      return constant_op.constant(np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
     if (isinstance(x, sparse_tensor.SparseTensor) and
         x.dense_shape.get_shape().is_fully_defined()):
       rank = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
@@ -2273,10 +2273,11 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
     ValueError: If `inputs` don't all have same shape and dtype or the shape
     cannot be inferred.
   """
+
   def _input_error():
-    return ValueError(
-        "inputs must be a list of at least one Tensor with the "
-        "same dtype and shape")
+    return ValueError("inputs must be a list of at least one Tensor with the "
+                      "same dtype and shape")
+
   if not inputs or not isinstance(inputs, (list, tuple)):
     raise _input_error()
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
@@ -2294,8 +2295,8 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
 
   # tensor_dtype is for safety only; operator's output type computed in C++
   if tensor_dtype is not None and tensor_dtype != inputs[0].dtype:
-    raise TypeError("tensor_dtype is {}, but input is of type {}"
-                    .format(tensor_dtype, inputs[0].dtype))
+    raise TypeError("tensor_dtype is {}, but input is of type {}".format(
+        tensor_dtype, inputs[0].dtype))
 
   if len(inputs) == 1 and name is None:
     return inputs[0]
@@ -2761,14 +2762,14 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sum(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_mean")
-def sparse_segment_mean(data, indices, segment_ids, name=None,
+def sparse_segment_mean(data,
+                        indices,
+                        segment_ids,
+                        name=None,
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
@@ -2805,14 +2806,14 @@ def sparse_segment_mean(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_mean(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_sqrt_n")
-def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+def sparse_segment_sqrt_n(data,
+                          indices,
+                          segment_ids,
+                          name=None,
                           num_segments=None):
   r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
 
@@ -2842,10 +2843,7 @@ def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sqrt_n(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("tensordot", "linalg.tensordot")
@@ -3016,6 +3014,47 @@ def tensordot(a, b, axes, name=None):
       return product
 
 
+@tf_export("math.polyval")
+def polyval(coeffs, x, name=None):
+  r"""Computes the elementwise value of a polynomial.
+
+  If `x` is a tensor and `coeffs` is a list n + 1 tensors, this function returns
+  the value of the n-th order polynomial
+
+     p(x) = coeffs[n-1] + coeffs[n-2] * x + ...  + coeffs[0] * x**(n-1)
+
+  evaluated using Horner's method, i.e.
+
+     p(x) = coeffs[n-1] + x * (coeffs[n-2] + ... + x * (coeffs[1] +
+            x * coeffs[0]))
+
+  Args:
+    coeffs: A list of `Tensor` representing the coefficients of the polynomial.
+    x: A `Tensor` representing the variable of the polynomial.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as the expression p(x) with usual broadcasting rules
+    for element-wise addition and multiplication applied.
+
+  @compatibility(numpy)
+  Equivalent to numpy.polyval.
+  @end_compatibility
+  """
+
+  with ops.name_scope(name, "polyval", nest.flatten(coeffs) + [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    if len(coeffs) < 1:
+      return array_ops.zeros_like(x, name=name)
+    coeffs = [
+        ops.convert_to_tensor(coeff, name=("coeff_%d" % index))
+        for index, coeff in enumerate(coeffs)
+    ]
+    p = coeffs[0]
+    for c in coeffs[1:]:
+      p = c + p * x
+    return p
+
 # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
 # 1.0 API so we leave these here for backwards compatibility.
 fft = gen_spectral_ops.fft
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index f8063ae0fb..a1c569951e 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -94,6 +94,7 @@ genrule(
         "api/logging/__init__.py",
         "api/losses/__init__.py",
         "api/manip/__init__.py",
+        "api/math/__init__.py",
         "api/metrics/__init__.py",
         "api/nn/__init__.py",
         "api/nn/rnn_cell/__init__.py",
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
new file mode 100644
index 0000000000..897718c05e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.math"
+tf_module {
+  member_method {
+    name: "polyval"
+    argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 937044aece..afa3b78eb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -404,6 +404,10 @@ tf_module {
     name: "manip"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "math"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
-- 
GitLab


From bf813f8ecf03b0a550593a530e2fec05b491cf9b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 15:46:34 -0700
Subject: [PATCH 0166/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 191360220

---
 tensorflow/go/op/wrappers.go | 250 +++++++++++++++++------------------
 1 file changed, 125 insertions(+), 125 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a33703ad6f..0fd2177df7 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -1720,6 +1720,131 @@ func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output
 	return op.Output(0)
 }
 
+// Returns the rank of a tensor.
+//
+// This operation returns an integer representing the rank of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rank",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+//
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
+	}
+}
+
+// Reverses variable length slices.
+//
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
+//
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+//
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
+//
+// For example:
+//
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
+//
+// In contrast, if:
+//
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
+//
+// Arguments:
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
+//
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ReverseSequence",
+		Input: []tf.Input{
+			input, seq_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the complex conjugate of a complex number.
 //
 // Given a tensor `input` of complex numbers, this operation returns a tensor of
@@ -5128,102 +5253,6 @@ func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
-
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
-//
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
-	return func(m optionalAttr) {
-		m["batch_dim"] = value
-	}
-}
-
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
-//
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
-//
-// Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
-//
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
-		Input: []tf.Input{
-			input, seq_lengths,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
 type DepthwiseConv2dNativeAttr func(optionalAttr)
 
@@ -5808,35 +5837,6 @@ func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
-//
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rank",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
-- 
GitLab


From da92e74fcb28f31c2a4163c58e6e585f561b1c33 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 2 Apr 2018 15:48:04 -0700
Subject: [PATCH 0167/1262] Add support for resource variables

PiperOrigin-RevId: 191360477
---
 tensorflow/core/grappler/op_types.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index e12e432a33..584008b0c1 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -409,6 +409,16 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
 bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
   string op_name = node.op();
+
+  // Ops that modify resource variables effectively modify one of their inputs.
+  if (op_name == "AssignVariableOp" || op_name == "AssignAddVariableOp" ||
+      op_name == "AssignSubVariableOp" || op_name == "ResourceScatterUpdate" ||
+      op_name == "ResourceScatterAdd" || op_name == "ResourceScatterSub" ||
+      op_name == "ResourceScatterMul" || op_name == "ResourceScatterDiv" ||
+      op_name == "ResourceScatterMin" || op_name == "ResourceScatterMax") {
+    return false;
+  }
+
   std::transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
   if (str_util::StrContains(op_name, "inplace")) {
     return true;
-- 
GitLab


From 0042e54608eedf1f10d761bc9ca68f030feeed04 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 2 Apr 2018 15:51:21 -0700
Subject: [PATCH 0168/1262] Automated g4 rollback of changelist 191326767

PiperOrigin-RevId: 191360905
---
 tensorflow/BUILD                              |  8 ++
 tensorflow/__init__.py                        |  7 +-
 tensorflow/contrib/cmake/python_modules.txt   |  2 -
 tensorflow/contrib/cmake/tf_python.cmake      | 91 ++++---------------
 tensorflow/experimental_api.py                | 38 ++++++++
 tensorflow/python/framework/dtypes.py         |  2 +-
 tensorflow/tools/api/generator/BUILD          |  2 -
 .../tools/api/generator/create_python_api.py  | 21 +----
 tensorflow/tools/api/tests/BUILD              |  1 +
 .../tools/api/tests/api_compatibility_test.py | 48 +++++++++-
 .../ci_build/windows/cpu/cmake/run_py.bat     |  6 +-
 11 files changed, 121 insertions(+), 105 deletions(-)
 create mode 100644 tensorflow/experimental_api.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index cfafffdd13..3d5737a9d7 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -540,6 +540,14 @@ py_library(
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = ["//tensorflow/python"],
+)
+
+py_library(
+    name = "experimental_tensorflow_py",
+    srcs = ["experimental_api.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow/tools/api/tests:__subpackages__"],
     deps = [
         "//tensorflow/python",
         "//tensorflow/tools/api/generator:python_api",
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index c8683e3976..78ad6aec19 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -20,19 +20,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # pylint: disable=wildcard-import
-from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
+from tensorflow.python import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
-
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index d5cf42b641..02c456c199 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -104,8 +104,6 @@ tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
 tensorflow/tools
-tensorflow/tools/api
-tensorflow/tools/api/generator
 tensorflow/tools/graph_transforms
 tensorflow/contrib
 tensorflow/contrib/all_reduce
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 20eeded8d3..fae45ead5c 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -686,77 +686,6 @@ AddUserOps(TARGET _beam_search_ops
     DEPENDS pywrap_tensorflow_internal tf_python_ops
     DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  else()
-    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  endif()
-else()
-  add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
-endif()
-
-
-########################################################
-# Generate API __init__.py files.
-########################################################
-
-# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
-FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
-STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
-string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
-string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
-string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
-
-set(api_init_files "")
-foreach(api_init_file ${api_init_files_list})
-    string(STRIP "${api_init_file}" api_init_file)
-    if(api_init_file)
-        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
-        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
-    endif()
-endforeach(api_init_file)
-set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
-file(WRITE "${api_init_list_file}" "${api_init_files}")
-
-# Run create_python_api.py to generate __init__.py files.
-add_custom_command(
-      OUTPUT ${api_init_files}
-      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
-
-      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
-      # this step is running since the files aren't there yet.
-      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
-      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-
-      # Run create_python_api.py to generate API init files.
-      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
-              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
-
-      # Re-add tensorflow/__init__.py back.
-      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
-                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-
-      COMMENT "Generating __init__.py files for Python API."
-      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
-)
-
-add_custom_target(tf_python_api SOURCES ${api_init_files})
-add_dependencies(tf_python_api tf_python_ops)
-
-
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -766,7 +695,6 @@ add_dependencies(tf_python_build_pip_package
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
-    tf_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
@@ -779,6 +707,25 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
+else()
+  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
+endif()
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/experimental_api.py b/tensorflow/experimental_api.py
new file mode 100644
index 0000000000..63a8aa9cb1
--- /dev/null
+++ b/tensorflow/experimental_api.py
@@ -0,0 +1,38 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Bring in all of the public TensorFlow interface into this
+# module.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
+# pylint: disable=wildcard-import
+from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
+# pylint: enable=wildcard-import
+
+from tensorflow.python.util.lazy_loader import LazyLoader
+contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
+del LazyLoader
+
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index a31c424263..0edae92fd4 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -345,7 +345,7 @@ tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 tf_export("uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
-tf_export("uint64").export_constant(__name__, "uint64")
+tf_export("uint64").export_constant(__name__, "uint32")
 int16 = DType(types_pb2.DT_INT16)
 tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index a1c569951e..9f1bdd8aae 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -32,7 +32,6 @@ genrule(
     # api/module1/module2/__init__.py and api/module3/__init__.py.
     # keep sorted
     outs = [
-        # BEGIN GENERATED FILES
         "api/__init__.py",
         "api/app/__init__.py",
         "api/bitwise/__init__.py",
@@ -118,7 +117,6 @@ genrule(
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
         "api/user_ops/__init__.py",
-        # END GENERATED FILES
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 1505dc69b9..183c4731b8 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -195,19 +195,16 @@ def create_api_files(output_files):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
-    # Convert path separators to '/' for easier parsing below.
-    normalized_output_file = output_file.replace(os.sep, '/')
     if _API_DIR not in output_file:
       raise ValueError(
           'Output files must be in api/ directory, found %s.' % output_file)
     # Get the module name that corresponds to output_file.
     # First get module directory under _API_DIR.
     module_dir = os.path.dirname(
-        normalized_output_file[
-            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
+        output_file[output_file.rfind(_API_DIR)+len(_API_DIR):])
     # Convert / to .
     module_name = module_dir.replace('/', '.').strip('.')
-    module_name_to_file_path[module_name] = os.path.normpath(output_file)
+    module_name_to_file_path[module_name] = output_file
 
   # Create file for each expected output in genrule.
   for module, file_path in module_name_to_file_path.items():
@@ -244,16 +241,6 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
-      help='If a single file is passed in, then we we assume it contains a '
-      'semicolon-separated list of Python files that we expect this script to '
-      'output. If multiple files are passed in, then we assume output files '
-      'are listed directly as arguments.')
+      help='Python files that we expect this script to output.')
   args = parser.parse_args()
-  if len(args.outputs) == 1:
-    # If we only get a single argument, then it must be a file containing
-    # list of outputs.
-    with open(args.outputs[0]) as output_list_file:
-      outputs = [line.strip() for line in output_list_file.read().split(';')]
-  else:
-    outputs = args.outputs
-  main(outputs)
+  main(args.outputs)
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 724b12cd47..0dc154b6d2 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -23,6 +23,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow:experimental_tensorflow_py",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:lib",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 26d5bca637..603b2a4327 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -34,6 +34,7 @@ import sys
 import unittest
 
 import tensorflow as tf
+from tensorflow import experimental_api as api
 
 from google.protobuf import text_format
 
@@ -46,6 +47,8 @@ from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 
+if hasattr(tf, 'experimental_api'):
+  del tf.experimental_api
 
 # FLAGS defined at the bottom:
 FLAGS = None
@@ -205,8 +208,6 @@ class ApiCompatibilityTest(test.TestCase):
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    # TODO(annarev): Make slide_dataset available in API.
-    public_api_visitor.private_map['tf'] = ['slide_dataset']
     traverse.traverse(tf, public_api_visitor)
 
     proto_dict = visitor.GetProtos()
@@ -236,6 +237,49 @@ class ApiCompatibilityTest(test.TestCase):
         verbose=FLAGS.verbose_diffs,
         update_goldens=FLAGS.update_goldens)
 
+  @unittest.skipUnless(
+      sys.version_info.major == 2,
+      'API compabitility test goldens are generated using python2.')
+  def testNewAPIBackwardsCompatibility(self):
+    # Extract all API stuff.
+    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
+
+    public_api_visitor = public_api.PublicAPIVisitor(visitor)
+    public_api_visitor.do_not_descend_map['tf'].append('contrib')
+    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
+    # TODO(annarev): Make slide_dataset available in API.
+    public_api_visitor.private_map['tf'] = ['slide_dataset']
+    traverse.traverse(api, public_api_visitor)
+
+    proto_dict = visitor.GetProtos()
+
+    # Read all golden files.
+    expression = os.path.join(
+        resource_loader.get_root_dir_with_all_resources(),
+        _KeyToFilePath('*'))
+    golden_file_list = file_io.get_matching_files(expression)
+
+    def _ReadFileToProto(filename):
+      """Read a filename, create a protobuf from its contents."""
+      ret_val = api_objects_pb2.TFAPIObject()
+      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
+      return ret_val
+
+    golden_proto_dict = {
+        _FileNameToKey(filename): _ReadFileToProto(filename)
+        for filename in golden_file_list
+    }
+
+    # Diff them. Do not fail if called with update.
+    # If the test is run to update goldens, only report diffs but do not fail.
+    self._AssertProtoDictEquals(
+        golden_proto_dict,
+        proto_dict,
+        verbose=FLAGS.verbose_diffs,
+        update_goldens=False,
+        additional_missing_object_message=
+        'Check if tf_export decorator/call is missing for this symbol.')
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 30554a084c..3c3b223a00 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -28,9 +28,6 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -40,6 +37,9 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
-- 
GitLab


From b5b87738dc03015939add7820da3d5dd2a744c84 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Mon, 2 Apr 2018 15:55:01 -0700
Subject: [PATCH 0169/1262] Java: Update to 1.7.0

PiperOrigin-RevId: 191361364
---
 tensorflow/java/maven/libtensorflow/pom.xml         | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml     | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +-
 tensorflow/java/maven/pom.xml                       | 2 +-
 tensorflow/java/maven/proto/pom.xml                 | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml            | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 0b69a8cbe5..c99d04869a 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 541876f7f5..4561c2c8ad 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index d8933e5238..82a2b8e769 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 6286fd73df..4c1ec0cc80 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 4e881f5a63..fcd8236bad 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index d512a7eda9..241581713a 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From 817eddd18b7c1c569bb6d284f13bd9d496b415a9 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Mon, 2 Apr 2018 16:21:37 -0700
Subject: [PATCH 0170/1262] Clarify ResourceVariable specification

PiperOrigin-RevId: 191365224
---
 tensorflow/python/ops/resource_variable_ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 2f39ea2e7d..07e25e540c 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -171,7 +171,9 @@ class ResourceVariable(variables.Variable):
   to see all modifications to the value of the variable which happen in any
   operation on which the read_value depends on (either directly, indirectly, or
   via a control dependency) and guaranteed to not see any modification to the
-  value of the variable on which the read_value operation does not depend on.
+  value of the variable from operations that depend on the read_value operation.
+  Updates from operations that have no dependency relationship to the read_value
+  operation might or might not be visible to read_value.
 
   For example, if there is more than one assignment to a ResourceVariable in
   a single session.run call there is a well-defined value for each operation
-- 
GitLab


From 19f7990d06b672e9a8f5085b42bb6822e4877a8b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 16:45:16 -0700
Subject: [PATCH 0171/1262] Add a config option to run Grappler optimizers more
 than once. Don't crash in layout optimizer if no cluster is given. Clean up
 Cluster::DisableOptimizer() so it actually turns all current optimizers off.

PiperOrigin-RevId: 191368433
---
 tensorflow/core/grappler/clusters/cluster.cc  |  4 +
 .../grappler/optimizers/layout_optimizer.cc   |  4 +
 .../grappler/optimizers/meta_optimizer.cc     | 82 +++++++++++--------
 .../optimizers/meta_optimizer_test.cc         | 14 ++++
 .../core/protobuf/rewriter_config.proto       | 12 +++
 5 files changed, 80 insertions(+), 36 deletions(-)

diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 39bfca244e..8d8c6084ec 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -62,6 +62,10 @@ void Cluster::DisableOptimizer(bool disable) {
         options_.config.mutable_graph_options()->mutable_rewrite_options();
     rewriter_config->set_layout_optimizer(RewriterConfig::OFF);
     rewriter_config->set_disable_model_pruning(true);
+    rewriter_config->set_function_optimization(RewriterConfig::OFF);
+    rewriter_config->set_arithmetic_optimization(RewriterConfig::OFF);
+    rewriter_config->set_loop_optimization(RewriterConfig::OFF);
+    rewriter_config->set_dependency_optimization(RewriterConfig::OFF);
     rewriter_config->set_constant_folding(RewriterConfig::OFF);
     rewriter_config->set_memory_optimization(RewriterConfig::NO_MEM_OPT);
     rewriter_config->mutable_auto_parallel()->set_enable(false);
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 254c1edf7b..308eecd420 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2119,6 +2119,10 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
 
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
+  if (cluster == nullptr) {
+    return errors::InvalidArgument("cluster == nullptr");
+  }
+
   if (GetNumGPUs(*cluster) < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     *output = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index ad655db727..ce27d3d95c 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -44,16 +44,15 @@ int64 NumEdges(const GraphDef& graph) {
 }
 
 string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
-  return strings::StrCat("Graph size before: ", before.node_size(), " nodes, ",
-                         NumEdges(before),
-                         " edges. Graph size after: ", after.node_size(),
-                         " nodes, ", NumEdges(after), " edges.");
+  return strings::StrCat("Graph size after: ", after.node_size(), " nodes (",
+                         after.node_size() - before.node_size(), "), ",
+                         NumEdges(after), " edges (",
+                         NumEdges(after) - NumEdges(before), ")");
 }
 }  // namespace
 
 std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     const string& optimizer) {
-  VLOG(1) << "Adding graph optimization pass: " << optimizer;
   std::unique_ptr<GraphOptimizer> graph_optimizer;
   if (optimizer == "pruning") {
     graph_optimizer.reset(new ModelPruner());
@@ -171,41 +170,52 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     return Status::OK();
   }
 
+  // Some optimizers should be run only once.
+  const std::set<string> run_once_optimizers = {"layout"};
   bool already_optimized = false;
-  for (const auto& optimizer : optimizers) {
-    if (!already_optimized) {
-      Status status = optimizer->Optimize(cluster, item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        result = status.ToString();
-      } else {
-        already_optimized = true;
-        result = strings::StrCat(
-            "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
+  const int num_iterations =
+      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+          ? 1
+          : cfg_.meta_optimizer_iterations();
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+    for (const auto& optimizer : optimizers) {
+      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
+        continue;
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
-    } else {
-      GrapplerItem optimized_item(item, std::move(*optimized_graph));
-      Status status =
-          optimizer->Optimize(cluster, optimized_item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        optimized_graph->Swap(&optimized_item.graph);
-        result = status.ToString();
+      if (!already_optimized) {
+        Status status = optimizer->Optimize(cluster, item, optimized_graph);
+        string result;
+        if (!status.ok()) {
+          VLOG(1) << "Not able to apply optimizer " << optimizer->name()
+                  << ". Return status: " << status.ToString();
+          result = status.ToString();
+        } else {
+          already_optimized = true;
+          result = strings::StrCat(
+              "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
+        }
+        result_.push_back(std::make_pair(optimizer->name(), result));
+        VLOG(1) << "Optimizer " << optimizer->name()
+                << " return status: " << result;
       } else {
-        result = strings::StrCat(
-            "OK. ",
-            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+        GrapplerItem optimized_item(item, std::move(*optimized_graph));
+        Status status =
+            optimizer->Optimize(cluster, optimized_item, optimized_graph);
+        string result;
+        if (!status.ok()) {
+          VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
+                  << status.ToString();
+          optimized_graph->Swap(&optimized_item.graph);
+          result = status.ToString();
+        } else {
+          result = strings::StrCat(
+              optimizer->name(), ": ",
+              PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+        }
+        result_.push_back(std::make_pair(optimizer->name(), result));
+        VLOG(1) << result;
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
     }
   }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 536347d834..d9a386b9be 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -72,6 +72,20 @@ TEST(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
+TEST(MetaOptimizerTest, RunOptimizersTwice) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index bb772460b0..9b6202e7b4 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -29,6 +29,14 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
+  // Enum controling the number of times to run optimizers. The default is to
+  // run them once.
+  enum NumIterationsType {
+    DEFAULT_NUM_ITERS = 0;
+    ONE = 1;
+    TWO = 2;
+  }
+
   // Optimize tensor layouts (default is ON)
   // e.g. This will try to use NCHW layout on GPU which is faster.
   Toggle layout_optimizer = 1;
@@ -51,6 +59,10 @@ message RewriterConfig {
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
+  // Controls how many times we run the optimizers in meta optimizer (default
+  // is once).
+  NumIterationsType meta_optimizer_iterations = 12;
+
   enum MemOptType {
     // The default setting (SCHEDULING and SWAPPING HEURISTICS only)
     DEFAULT_MEM_OPT = 0;
-- 
GitLab


From 7e9113ab912caff9ad15195b15771ff20bde6080 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 17:08:27 -0700
Subject: [PATCH 0172/1262] [XLA] Redesign: implement ExecuteGraphParallel.

PiperOrigin-RevId: 191371793
---
 tensorflow/compiler/xla/client/client.cc     |  34 ++-
 tensorflow/compiler/xla/service/service.cc   | 229 ++++++++++++++++---
 tensorflow/compiler/xla/service/service.h    |  22 ++
 tensorflow/compiler/xla/tests/BUILD          |   2 +
 tensorflow/compiler/xla/tests/client_test.cc |  10 +-
 5 files changed, 259 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index c4c8894374..3f45167fcb 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -324,8 +324,38 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
 
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
     tensorflow::gtl::ArraySlice<XlaComputationInstance> computations) {
-  return Unimplemented(
-      "ExecuteParallel is not yet implemented for XlaComputation.");
+  ExecuteGraphParallelRequest request;
+
+  for (const XlaComputationInstance& computation : computations) {
+    ExecuteGraphRequest single_request;
+    *single_request.mutable_computation() = computation.computation.proto();
+    for (GlobalData* argument : computation.arguments) {
+      *single_request.add_arguments() = argument->handle();
+    }
+    *single_request.mutable_execution_options() = computation.execution_options;
+    *request.add_requests() = single_request;
+  }
+
+  ExecuteParallelResponse response;
+  VLOG(1) << "making execute-graph-parallel request: "
+          << request.ShortDebugString();
+  tensorflow::Status s = stub_->ExecuteGraphParallel(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::unique_ptr<GlobalData>> outputs;
+  for (size_t i = 0; i < computations.size(); ++i) {
+    outputs.push_back(
+        MakeUnique<GlobalData>(stub_, response.responses(i).output()));
+    if (computations[i].execution_profile != nullptr) {
+      *computations[i].execution_profile = response.responses(i).profile();
+    }
+  }
+
+  return std::move(outputs);
 }
 
 StatusOr<std::vector<DeviceHandle>> Client::GetDeviceHandles(
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index ca8071b7bb..ec883a6cf3 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -409,6 +409,37 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
   return std::move(executables);
 }
 
+StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
+    const std::vector<const HloModuleProto*>& module_protos,
+    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+    Backend* backend,
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
+  VLOG(1) << Printf("BuildExecutable on service %p", this);
+
+  VLOG(1) << "Computations:";
+  for (const HloModuleProto* proto : module_protos) {
+    VLOG(1) << proto->name();
+  }
+
+  CHECK_EQ(module_protos.size(), module_configs.size());
+  std::vector<std::unique_ptr<HloModule>> modules;
+  for (int64 i = 0; i < module_protos.size(); ++i) {
+    const HloModuleProto* proto = module_protos[i];
+    const HloModuleConfig& config = *module_configs[i];
+    TF_ASSIGN_OR_RETURN(auto module,
+                        HloModule::CreateFromProto(*proto, config));
+    modules.push_back(std::move(module));
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<Executable>> executables,
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
+
+  return std::move(executables);
+}
+
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -703,6 +734,47 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
   return computation->SetReturnValue(arg->operand());
 }
 
+StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
+Service::GetExecutors(const ExecutionOptions& execution_options,
+                      int64 requests_size, int64 request_index) const {
+  if (execution_options.device_handles().empty()) {
+    return FailedPrecondition(
+        "device handles must be given to execute parallel computations");
+  }
+  if (requests_size > 1 && execution_options.device_handles_size() > 1) {
+    return InvalidArgument(
+        "Parallel requests with multiple device handles is not supported. "
+        "Found %lld parallel requests, with request %lld containing %d device "
+        "handles.",
+        requests_size, request_index, execution_options.device_handles_size());
+  }
+  std::vector<perftools::gputools::StreamExecutor*> executors;
+  for (const auto& device_handle : execution_options.device_handles()) {
+    TF_ASSIGN_OR_RETURN(auto replicas,
+                        Replicas(*execute_backend_, device_handle));
+    se::StreamExecutor* executor = replicas[0];
+    CHECK(executor != nullptr);
+    executors.push_back(executor);
+  }
+  return executors;
+}
+
+StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
+    const ExecutionOptions& execution_options,
+    tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments) {
+  // Resolve the allocations for the arguments of the computation, and create
+  // a vector of device memory offsets for the arguments from the allocations.
+  // In the case of partitioned computations, assume all arguments go on the
+  // zeroth core.
+  TF_ASSIGN_OR_RETURN(
+      auto replicas,
+      Replicas(*execute_backend_, execution_options.device_handles(0)));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arguments, replicas));
+  return replicated_arguments;
+}
+
 tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
                                             ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
@@ -731,26 +803,10 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // is one of the executors to run the replicated computation.
     const ExecutionOptions& execution_options =
         arg->requests(i).execution_options();
-    if (execution_options.device_handles().empty()) {
-      return FailedPrecondition(
-          "device handles must be given to execute parallel computations");
-    }
-    if (arg->requests_size() > 1 &&
-        execution_options.device_handles_size() > 1) {
-      return InvalidArgument(
-          "Parallel requests with multiple device handles is not supported. "
-          "Found %d parallel requests, with request %lld containing %d device "
-          "handles.",
-          arg->requests_size(), i, execution_options.device_handles_size());
-    }
-    std::vector<perftools::gputools::StreamExecutor*> executors;
-    for (const auto& device_handle : execution_options.device_handles()) {
-      TF_ASSIGN_OR_RETURN(auto replicas,
-                          Replicas(*execute_backend_, device_handle));
-      se::StreamExecutor* executor = replicas[0];
-      CHECK(executor != nullptr);
-      executors.push_back(executor);
-    }
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
 
     // Resolve the UserComputation object associated with the requested
     // computation and compute the program shape.
@@ -767,16 +823,9 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
         std::shared_ptr<const ProgramShape> program_shape,
         user_computation->ComputeProgramShape(versioned_handle.version));
 
-    // Resolve the allocations for the arguments of the computation, and create
-    // a vector of device memory offsets for the arguments from the allocations.
-    // In the case of partitioned computations, assume all arguments go on the
-    // zeroth core.
-    TF_ASSIGN_OR_RETURN(
-        auto replicas,
-        Replicas(*execute_backend_, execution_options.device_handles(0)));
-    TF_ASSIGN_OR_RETURN(
-        std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-        ResolveAndValidateArguments(request.arguments(), replicas));
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations. Here, we care only about the
@@ -839,7 +888,103 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
 tensorflow::Status Service::ExecuteGraphParallel(
     const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) {
-  return Unimplemented("execute-graph-parallel is not yet implemented");
+  VLOG(1) << "running execute-graph-parallel request";
+
+  std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
+  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<const HloModuleProto*> module_protos;
+  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
+  std::vector<string> computation_names;
+  std::vector<DeviceHandle> device_handles;
+
+  int num_requested_devices =
+      std::accumulate(arg->requests().begin(), arg->requests().end(), 0,
+                      [](int a, const ExecuteGraphRequest& r) -> int {
+                        return a + r.execution_options().device_handles_size();
+                      });
+  if (num_requested_devices * options_.number_of_replicas() >
+      execute_backend_->device_count()) {
+    return FailedPrecondition(
+        "there are not enough stream executors to execute %d computations",
+        num_requested_devices);
+  }
+
+  for (int64 i = 0; i < arg->requests_size(); ++i) {
+    // Get the stream executor for the i'th computation. This stream executor
+    // is one of the executors to run the replicated computation.
+    const ExecutionOptions& execution_options =
+        arg->requests(i).execution_options();
+    const ExecuteGraphRequest& request = arg->requests(i);
+    TF_RET_CHECK(request.has_computation()) << "computations may not be empty";
+    TF_RET_CHECK(request.computation().has_program_shape())
+        << "programe shape may not be empty";
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
+
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
+
+    // Create an HloModuleConfig object for the computation, given the shape of
+    // the program and the argument allocations. Here, we care only about the
+    // shapes of the arguments, so, it is sufficient to use the arguments of
+    // replica 0.
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(request.computation().program_shape(),
+                           replicated_arguments.front(),
+                           request.execution_options(),
+                           /*user_computation=*/nullptr));
+    VLOG(3)
+        << "ExecuteGraphParallel created HloModuleConfig computation layout: "
+        << module_config->entry_computation_layout().ToString();
+
+    // Adds to the vectors to build and execute the computations after the loop.
+    all_arguments.push_back(replicated_arguments);
+    all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}});
+    module_protos.push_back(&request.computation());
+    module_configs.push_back(std::move(module_config));
+    computation_names.insert(computation_names.end(), executors.size(),
+                             request.computation().name());
+    all_executors.push_back(executors);
+    device_handles.insert(device_handles.end(),
+                          execution_options.device_handles().begin(),
+                          execution_options.device_handles().end());
+  }
+
+  // Build the HloModules and compile to generate the executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteGraphParallel, so we have to pass a null device_allocator below.
+  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
+                      BuildExecutables(module_protos, std::move(module_configs),
+                                       execute_backend_.get(), all_executors,
+                                       /*device_allocator=*/nullptr));
+  std::vector<Executable*> executable_ptrs;
+  executable_ptrs.reserve(executables.size());
+  for (const auto& executable : executables) {
+    executable_ptrs.push_back(executable.get());
+  }
+
+  // Execute the generated executables in parallel and return the device
+  // handles for each computation's output.
+  ExecutionProfile profile;
+  TF_ASSIGN_OR_RETURN(
+      std::vector<GlobalDataHandle> outputs,
+      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
+                                       execute_backend_.get(), device_handles,
+                                       computation_names, &profile));
+  for (const GlobalDataHandle& output : outputs) {
+    ExecuteResponse response;
+    *response.mutable_output() = output;
+    *response.mutable_profile() = profile;
+    *result->add_responses() = response;
+  }
+
+  VLOG(1) << "successfully completed 'execute-graph-parallel' request";
+  return tensorflow::Status::OK();
 }
 
 tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
@@ -872,6 +1017,20 @@ tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg,
   *parallel_arg.add_requests() = *arg;
   ExecuteParallelResponse parallel_result;
   TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                          ExecuteResponse* result) {
+  ExecuteGraphParallelRequest parallel_arg;
+  *parallel_arg.add_requests() = *arg;
+  ExecuteParallelResponse parallel_result;
+  TF_RETURN_IF_ERROR(ExecuteGraphParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::PickParallelResponse(
+    const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
   // The "result device" selection is a bit hacky, but better than assuming it
   // is device 0. We have b/76035356 for restructuring the client API to clean
   // up the current asymmetries and support more functionalities.
@@ -999,8 +1158,14 @@ tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
   if (!arg->has_computation()) {
     return InvalidArgument("computations may not be empty");
   }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("programe shape may not be empty");
+  }
 
-  // TODO(b/74197823): Handle partitioning.
+  // If we received multiple device handles, we must partition the module.
+  if (arg->execution_options().device_handles_size() > 1) {
+    return ExecuteOneToN(arg, result);
+  }
 
   TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
                                               SingleComputationDeviceHandle()));
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index ebe4a2e043..e09d58bbe7 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -278,6 +278,20 @@ class Service : public ServiceInterface {
       const ExecutionOptions& execution_options,
       const UserComputation* user_computation = nullptr);
 
+  // Picks a parallel response and fills the result.
+  Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
+                              ExecuteResponse* result);
+
+  // Prepare the executors for executing parallel.
+  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
+      const ExecutionOptions& execution_options, int64 requests_size,
+      int64 request_index) const;
+
+  // Prepare the arguments for executing parallel.
+  StatusOr<std::vector<std::vector<const ShapedBuffer*>>> GetArguments(
+      const ExecutionOptions& execution_options,
+      tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments);
+
  protected:
   friend class LocalExecutable;
 
@@ -334,6 +348,12 @@ class Service : public ServiceInterface {
       Backend* backend,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
+  StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
+      const std::vector<const HloModuleProto*>& module_protos,
+      std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+      Backend* backend,
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
   // executable first. If the executable is not in the cache, it is built and
@@ -378,6 +398,8 @@ class Service : public ServiceInterface {
   // will be the result of this computation.
   tensorflow::Status ExecuteOneToN(const ExecuteRequest* arg,
                                    ExecuteResponse* result);
+  tensorflow::Status ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                   ExecuteResponse* result);
 
   // Convenience function which checks whether the given shape_with_layout
   // (presumably passed by the client to set the result layout) is valid for the
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 283efbb707..9cead12eba 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1566,6 +1566,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 045148cdd1..32e2f2c084 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -109,14 +111,14 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
 
 XLA_TEST_F(ClientTest,
         DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
-  Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
+  XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GlobalData> const_arg,
       client_->TransferToServer(*Literal::CreateR2<int32>({{5, 6}, {7, 8}})));
 
-  ComputationBuilder b(client_, TestName() + ".add");
+  XlaBuilder b(TestName() + ".add");
   b.Add(b.Parameter(0, shape, "param_0"),
         b.ConstantR2<int32>({{1, 2}, {3, 4}}));
   TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build());
@@ -124,14 +126,14 @@ XLA_TEST_F(ClientTest,
   // We can't really test parallel execution on CPU since all of the cores in a
   // CPU are presented as a single device.  So for now we test "parallel"
   // execution on a single device.
-  std::vector<Client::ComputationInstance> computation_instances;
+  std::vector<Client::XlaComputationInstance> computation_instances;
   TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
                           client_->GetDeviceHandles(1));
   ASSERT_EQ(devices.size(), 1);
 
   ExecutionOptions options = execution_options_;
   *options.add_device_handles() = devices[0];
-  computation_instances.push_back(Client::ComputationInstance(
+  computation_instances.push_back(Client::XlaComputationInstance(
       add_with_one_arg, {const_arg.get()}, options, nullptr));
 
   TF_ASSERT_OK_AND_ASSIGN(auto results,
-- 
GitLab


From 03613455c1c1c3957aedc4edcedd96a21bf9a514 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 2 Apr 2018 17:21:41 -0700
Subject: [PATCH 0173/1262] TFTS: Add a OneShotPredictionHead with no model
 state in its serving signature.

PiperOrigin-RevId: 191373516
---
 .../timeseries/python/timeseries/BUILD        | 15 +++-
 .../python/timeseries/estimators.py           |  8 +-
 .../timeseries/python/timeseries/head.py      | 84 +++++++++++++-----
 .../timeseries/python/timeseries/head_test.py | 86 ++++++++++++++++++-
 4 files changed, 166 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 55a25e39fe..86022f46ce 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -88,10 +88,14 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:export",
+        "//tensorflow/python/feature_column",
     ],
 )
 
@@ -132,7 +136,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -141,6 +144,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:estimator_py",
@@ -160,19 +164,28 @@ py_test(
         "no_pip_gpu",  # b/63391119
     ],
     deps = [
+        ":estimators",
         ":feature_keys",
         ":head",
+        ":input_pipeline",
         ":model",
         ":state_management",
+        "//tensorflow/contrib/timeseries/examples:lstm",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 469cea4fd2..886e1846e2 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -44,7 +44,7 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
   """An Estimator to fit and evaluate a time series model."""
 
   def __init__(self, model, state_manager=None, optimizer=None, model_dir=None,
-               config=None):
+               config=None, head_type=ts_head_lib.TimeSeriesRegressionHead):
     """Initialize the Estimator.
 
     Args:
@@ -55,6 +55,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
           from tf.train.Optimizer. Defaults to Adam with step size 0.02.
       model_dir: See `Estimator`.
       config: See `Estimator`.
+      head_type: The kind of head to use for the model (inheriting from
+          `TimeSeriesRegressionHead`).
     """
     input_statistics_generator = math_utils.InputStatisticsFromMiniBatch(
         dtype=model.dtype, num_features=model.num_features)
@@ -63,8 +65,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
     if optimizer is None:
       optimizer = train.AdamOptimizer(0.02)
     self._model = model
-    ts_regression_head = ts_head_lib.time_series_regression_head(
-        model, state_manager, optimizer,
+    ts_regression_head = head_type(
+        model=model, state_manager=state_manager, optimizer=optimizer,
         input_statistics_generator=input_statistics_generator)
     model_fn = ts_regression_head.create_estimator_spec
     super(TimeSeriesRegressor, self).__init__(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 71085f9de8..a28a5872b8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -39,25 +39,6 @@ from tensorflow.python.util import nest
 from tensorflow.python.summary import summary
 
 
-def time_series_regression_head(model,
-                                state_manager,
-                                optimizer,
-                                input_statistics_generator=None):
-  """Creates a `_Head` for time series regression.
-
-  Args:
-    model: A model for time series regression.
-    state_manager: A state manager.
-    optimizer: An optimizer.
-    input_statistics_generator: A input statistics generator.
-
-  Returns:
-    An instance of `_Head` for time series regression.
-  """
-  return _TimeSeriesRegressionHead(model, state_manager, optimizer,
-                                   input_statistics_generator)
-
-
 class _NoStatePredictOutput(export_lib.PredictOutput):
 
   def as_signature_def(self, receiver_tensors):
@@ -68,8 +49,8 @@ class _NoStatePredictOutput(export_lib.PredictOutput):
         receiver_tensors=no_state_receiver_tensors)
 
 
-class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
-  """See `time_series_regression_head`."""
+class TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
+  """Determines input and output signatures for a time series model."""
 
   def __init__(self,
                model,
@@ -77,6 +58,15 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                optimizer,
                input_statistics_generator=None,
                name=None):
+    """Creates a `_Head` for time series regression.
+
+    Args:
+      model: A model for time series regression.
+      state_manager: A state manager.
+      optimizer: An optimizer.
+      input_statistics_generator: A input statistics generator.
+      name: An optional name for the model.
+    """
     self.model = model
     self.state_manager = state_manager
     self.optimizer = optimizer
@@ -265,6 +255,58 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
         return self._serving_ops(features)
 
 
+class OneShotPredictionHead(TimeSeriesRegressionHead):
+  """A time series head which exports a single stateless serving signature.
+
+  The serving default signature exported by this head expects `times`, `values`,
+  and any exogenous features, but no state. `values` has shape `[batch_size,
+  filter_length, num_features]` and `times` has shape `[batch_size,
+  total_length]`, where `total_length > filter_length`. Any exogenous features
+  must have their shapes prefixed by the shape of the `times` feature.
+
+  When serving, first performs filtering on the series up to `filter_length`
+  starting from the default start state for the model, then computes predictions
+  on the remainder of the series, returning them.
+
+  Model state is neither accepted nor returned, so filtering must be performed
+  each time predictions are requested when using this head.
+  """
+
+  def _serving_ops(self, features):
+    """Add ops for serving to the graph."""
+    with variable_scope.variable_scope("model", use_resource=True):
+      filtering_features = {}
+      prediction_features = {}
+      values_length = array_ops.shape(
+          features[feature_keys.FilteringFeatures.VALUES])[1]
+      for key, value in features.items():
+        if key == feature_keys.State.STATE_TUPLE:
+          # Ignore state input. The model's default start state is replicated
+          # across the batch.
+          continue
+        if key == feature_keys.FilteringFeatures.VALUES:
+          filtering_features[key] = value
+        else:
+          filtering_features[key] = value[:, :values_length]
+          prediction_features[key] = value[:, values_length:]
+      cold_filtering_outputs = self.model.define_loss(
+          features=filtering_features, mode=estimator_lib.ModeKeys.EVAL)
+      prediction_features[feature_keys.State.STATE_TUPLE] = (
+          cold_filtering_outputs.end_state)
+    with variable_scope.variable_scope("model", reuse=True):
+      prediction_outputs = self.model.predict(
+          features=prediction_features)
+    return estimator_lib.EstimatorSpec(
+        mode=estimator_lib.ModeKeys.PREDICT,
+        export_outputs={
+            feature_keys.SavedModelLabels.PREDICT:
+                _NoStatePredictOutput(prediction_outputs),
+        },
+        # Likely unused, but it is necessary to return `predictions` to satisfy
+        # the Estimator's error checking.
+        predictions={})
+
+
 def _check_feature_shapes_compatible_with(features,
                                           compatible_with_name,
                                           compatible_with_value,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 3415061cfd..c606db76a6 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -18,12 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy
+import six
+
+from tensorflow.contrib.timeseries.examples import lstm as lstm_example
+from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import state_management
 
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -31,6 +39,9 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import adam
 from tensorflow.python.training import coordinator as coordinator_lib
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import training as train
@@ -90,7 +101,7 @@ class EvaluationMetricsTests(test.TestCase):
                       .count_up_to(10),
                       dtype=dtypes.float32), (1, 1, 1))
       }
-      model_fn = ts_head_lib.time_series_regression_head(
+      model_fn = ts_head_lib.TimeSeriesRegressionHead(
           model=_TickerModel(),
           state_manager=state_management.PassthroughStateManager(),
           optimizer=train.GradientDescentOptimizer(0.001)).create_estimator_spec
@@ -127,7 +138,7 @@ class _StubModel(object):
 
 
 def _stub_model_fn():
-  return ts_head_lib.time_series_regression_head(
+  return ts_head_lib.TimeSeriesRegressionHead(
       model=_StubModel(),
       state_manager=state_management.PassthroughStateManager(),
       optimizer=train.AdamOptimizer(0.001)).create_estimator_spec
@@ -263,5 +274,76 @@ class PredictFeatureCheckingTests(test.TestCase):
           mode=estimator_lib.ModeKeys.PREDICT)
 
 
+class OneShotTests(test.TestCase):
+
+  def test_one_shot_prediction_head_export(self):
+    model_dir = self.get_temp_dir()
+    categorical_column = feature_column.categorical_column_with_hash_bucket(
+        key="categorical_exogenous_feature", hash_bucket_size=16)
+    exogenous_feature_columns = [
+        feature_column.numeric_column(
+            "2d_exogenous_feature", shape=(2,)),
+        feature_column.embedding_column(
+            categorical_column=categorical_column, dimension=10)]
+    estimator = ts_estimators.TimeSeriesRegressor(
+        model=lstm_example._LSTMModel(
+            num_features=5, num_units=128,
+            exogenous_feature_columns=exogenous_feature_columns),
+        optimizer=adam.AdamOptimizer(0.001),
+        config=estimator_lib.RunConfig(tf_random_seed=4),
+        state_manager=state_management.ChainingStateManager(),
+        head_type=ts_head_lib.OneShotPredictionHead,
+        model_dir=model_dir)
+    train_features = {
+        feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
+            20, dtype=numpy.int64),
+        feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+            20, dtype=numpy.float32)[:, None], [1, 5]),
+        "2d_exogenous_feature": numpy.ones([20, 2]),
+        "categorical_exogenous_feature": numpy.array(
+            ["strkey"] * 20)[:, None]
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(train_features), shuffle_seed=2,
+        num_threads=1, batch_size=16, window_size=16)
+    estimator.train(input_fn=train_input_fn, steps=5)
+    input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+    export_location = estimator.export_savedmodel(self.get_temp_dir(),
+                                                  input_receiver_fn)
+    graph = ops.Graph()
+    with graph.as_default():
+      with session_lib.Session() as session:
+        signatures = loader.load(
+            session, [tag_constants.SERVING], export_location)
+        self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
+                         list(signatures.signature_def.keys()))
+        predict_signature = signatures.signature_def[
+            feature_keys.SavedModelLabels.PREDICT]
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES,
+             "2d_exogenous_feature",
+             "categorical_exogenous_feature"],
+            predict_signature.inputs.keys())
+        features = {
+            feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
+                numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
+            feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+                20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
+            "2d_exogenous_feature": numpy.ones([2, 35, 2]),
+            "categorical_exogenous_feature": numpy.tile(numpy.array(
+                ["strkey"] * 35)[None, :, None], [2, 1, 1])
+        }
+        feeds = {
+            graph.as_graph_element(input_value.name): features[input_key]
+            for input_key, input_value in predict_signature.inputs.items()}
+        fetches = {output_key: graph.as_graph_element(output_value.name)
+                   for output_key, output_value
+                   in predict_signature.outputs.items()}
+        output = session.run(fetches, feed_dict=feeds)
+        self.assertAllEqual((2, 15, 5), output["mean"].shape)
+
+
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From ce1e141e5bbe0b6894a0bf600c3835c532145ba0 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 2 Apr 2018 17:31:47 -0700
Subject: [PATCH 0174/1262] Internal change.

PiperOrigin-RevId: 191374719
---
 tensorflow/python/ops/math_ops.py                    |  1 -
 tensorflow/tools/api/tests/api_compatibility_test.py | 10 ++++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 1c20d004cb..39f40882db 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -71,7 +71,6 @@ See the @{$python/math_ops} guide.
 @@igammac
 @@zeta
 @@polygamma
-@@polyval
 @@betainc
 @@rint
 @@diag
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 603b2a4327..7eeae05847 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -145,6 +145,9 @@ class ApiCompatibilityTest(test.TestCase):
       verbose_diff_message = ''
       # First check if the key is not found in one or the other.
       if key in only_in_expected:
+        # TODO(annarev): remove once we switch to using tf_export decorators.
+        if key == 'tensorflow.math':
+          continue
         diff_message = 'Object %s expected but not found (removed). %s' % (
             key, additional_missing_object_message)
         verbose_diff_message = diff_message
@@ -229,6 +232,13 @@ class ApiCompatibilityTest(test.TestCase):
         for filename in golden_file_list
     }
 
+    # TODO(annarev): remove once we switch to using tf_export decorators.
+    tf_module = golden_proto_dict['tensorflow'].tf_module
+    for i in range(len(tf_module.member)):
+      if tf_module.member[i].name == 'math':
+        del tf_module.member[i]
+        break
+
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
     self._AssertProtoDictEquals(
-- 
GitLab


From e1f8206f8d2704132d546175c29c0693911ec240 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 2 Apr 2018 17:50:12 -0700
Subject: [PATCH 0175/1262] Don't use session context manager when we need
 session to outlive the context block.

PiperOrigin-RevId: 191376772
---
 tensorflow/python/keras/_impl/keras/estimator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 8426d84df9..5d370ebbb5 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -466,8 +466,8 @@ def model_to_estimator(keras_model=None,
       keras_model_fn, model_dir=model_dir, config=config)
 
   # Pass the config into keras backend's default session.
-  with session.Session(config=estimator._session_config) as sess:
-    K.set_session(sess)
+  sess = session.Session(config=estimator._session_config)
+  K.set_session(sess)
 
   keras_weights = keras_model.get_weights()
   if keras_model._is_graph_network:
-- 
GitLab


From 556b4055a42eaaee248239e90b35f90cf2e89dcf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 18:18:11 -0700
Subject: [PATCH 0176/1262] [XLA] Redesign: migrate while_test to use
 XlaBuilder, and implement the ops needed to pass the tests, including: While,
 Slice, DynamicUpdateSlice, Dot, DotGeneral, ConvertElementType, Rng, and
 Reduce.

Also, when a module has embedded computaitons, the service side would complain if the instruction names are not unique in the scope of the module. To ensure instruction names are unique in module, use both the computation id and instruction id as suffix.

PiperOrigin-RevId: 191379697
---
 tensorflow/compiler/xla/client/lib/BUILD      |   2 +
 .../compiler/xla/client/lib/arithmetic.cc     |  95 ++++-
 .../compiler/xla/client/lib/arithmetic.h      |  44 +++
 .../xla/client/xla_client/xla_builder.cc      | 163 ++++++--
 .../xla/client/xla_client/xla_builder.h       |  10 +-
 tensorflow/compiler/xla/tests/BUILD           |   4 +-
 tensorflow/compiler/xla/tests/reduce_test.cc  |  69 ++--
 tensorflow/compiler/xla/tests/while_test.cc   | 368 +++++++++---------
 8 files changed, 513 insertions(+), 242 deletions(-)

diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index d02972f2c0..f4673a8204 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -24,6 +24,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 24048a1e5a..63df449e0b 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 namespace xla {
 namespace {
+
 using InstructionGenerator =
     ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&,
                               const ComputationDataHandle&);
@@ -47,6 +48,27 @@ Computation CreateScalarComputation(const string& name, PrimitiveType type,
   generator(b.get(), lhs, rhs);
   return b->BuildAndNoteError();
 }
+
+using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&);
+
+XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,
+                                       XlaBuilder* builder,
+                                       XlaOpGenerator generator) {
+  std::unique_ptr<XlaBuilder> b;
+  if (type == PRED) {
+    b = builder->CreateSubBuilder(name);
+  } else {
+    b = builder->CreateSubBuilder(
+        tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
+  }
+
+  const Shape scalar = ShapeUtil::MakeShape(type, {});
+  auto lhs = b->Parameter(0, scalar, "lhs");
+  auto rhs = b->Parameter(1, scalar, "rhs");
+  generator(b.get(), lhs, rhs);
+  return b->BuildAndNoteError();
+}
+
 }  // namespace
 
 Computation CreateScalarAddComputation(PrimitiveType type,
@@ -60,7 +82,7 @@ Computation CreateScalarAddComputation(PrimitiveType type,
 Computation CreateScalarMultiplyComputation(PrimitiveType type,
                                             ComputationBuilder* builder) {
   return CreateScalarComputation(
-      "add", type, builder,
+      "mul", type, builder,
       [](ComputationBuilder* b, const ComputationDataHandle& lhs,
          const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); });
 }
@@ -114,4 +136,75 @@ StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
   return builder->Reduce(predicates, f, logical_or, all_dimensions);
 }
 
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "add", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Add(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "mul", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Mul(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "ge", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Ge(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "max", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Max(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "min", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Min(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "and", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->And(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "or", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Or(lhs, rhs);
+      });
+}
+
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder) {
+  auto f = builder->ConstantR0<bool>(false);
+  XlaComputation logical_or = CreateScalarOrComputation(builder);
+  TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
+                      builder->GetShape(predicates));
+  std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
+  std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
+  return builder->Reduce(predicates, f, logical_or, all_dimensions);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index ae89784bc2..f4d3fc8015 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -56,6 +58,48 @@ Computation CreateScalarOrComputation(ComputationBuilder* builder);
 StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
                                     ComputationBuilder* builder);
 
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar add computation and returns it.
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar multiply computation and returns it.
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar ge computation and returns it.
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar max computation and returns it.
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar min computation and returns it.
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical AND computation and returns it.
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical OR computation and returns it.
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Returns whether any predicate in "predicates" is set.
+//
+// Note: if predicates is zero-sized, Any() vacuously returns false.
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index ec2362179e..c2e661cb3d 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -81,7 +81,7 @@ StatusOr<Shape> XlaOp::GetShape() const {
 }
 
 XlaBuilder::XlaBuilder(const string& computation_name)
-    : name_(computation_name) {}
+    : name_(computation_name), unique_id_(GetUniqueId()) {}
 
 XlaBuilder::~XlaBuilder() {}
 
@@ -179,7 +179,6 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
   }
 
   HloComputationProto entry;
-  entry.set_name(name_);
 
   {
     int64 root_id;
@@ -193,9 +192,9 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
     entry.add_instructions()->Swap(&instruction);
   }
 
-  const int64 id = GetUniqueId();
-  entry.set_id(id);
-  XlaComputation computation(id);
+  entry.set_id(unique_id_);
+  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
+  XlaComputation computation(entry.id());
   HloModuleProto* module = computation.mutable_proto();
   module->set_name(entry.name());
   module->set_id(entry.id());
@@ -407,12 +406,7 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
         ShapeInference::InferCallShape(operand_shape_ptrs,
                                        /*to_apply=*/called_program_shape));
 
-    // Add called computation.
-    instr.add_called_computation_ids(
-        computation.proto().entry_computation_id());
-    for (const HloComputationProto& e : computation.proto().computations()) {
-      embedded_.insert({e.id(), e});
-    }
+    AddCalledComputation(computation, &instr);
 
     return AddInstruction(std::move(instr), HloOpcode::kCall, operands);
   });
@@ -470,7 +464,22 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand,
                         tensorflow::gtl::ArraySlice<int64> start_indices,
                         tensorflow::gtl::ArraySlice<int64> limit_indices,
                         tensorflow::gtl::ArraySlice<int64> strides) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferSliceShape(operand_shape, start_indices,
+                                        limit_indices, strides));
+    for (int i = 0; i < start_indices.size(); i++) {
+      auto* slice_config = instr.add_slice_dimensions();
+      slice_config->set_start(start_indices[i]);
+      slice_config->set_limit(limit_indices[i]);
+      slice_config->set_stride(strides[i]);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand});
+  });
 }
 
 XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
@@ -485,7 +494,20 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
 
 XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                      const XlaOp& start_indices) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicUpdateSliceShape(
+                            operand_shape, update_shape, start_indices_shape));
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
 }
 
 XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
@@ -620,12 +642,29 @@ XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
 }
 
 XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+
+    DotDimensionNumbers dimension_numbers;
+    dimension_numbers.add_lhs_contracting_dimensions(
+        lhs_shape.dimensions_size() == 1 ? 0 : 1);
+    dimension_numbers.add_rhs_contracting_dimensions(0);
+    return DotGeneral(lhs, rhs, dimension_numbers);
+  });
 }
 
 XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                              const DotDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
+                                                        dimension_numbers));
+    *instr.mutable_dot_dimension_numbers() = dimension_numbers;
+    return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs});
+  });
 }
 
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
@@ -860,7 +899,14 @@ XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
 
 XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand});
+  });
 }
 
 XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
@@ -894,19 +940,64 @@ XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
   return UnimplementedOp();
 }
 
+XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
+                        tensorflow::gtl::ArraySlice<XlaOp> parameters,
+                        const Shape& shape) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Check the number of parameters per RNG distribution.
+    switch (distribution) {
+      case RandomDistribution::RNG_NORMAL:
+      case RandomDistribution::RNG_UNIFORM:
+        if (parameters.size() != 2) {
+          return InvalidArgument(
+              "RNG distribution (%s) expects 2 parameters, but got %ld",
+              RandomDistribution_Name(distribution).c_str(), parameters.size());
+        }
+        break;
+      default:
+        LOG(FATAL) << "unhandled distribution " << distribution;
+    }
+
+    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
+    *instr.mutable_shape() = shape;
+
+    instr.set_distribution(distribution);
+
+    return AddInstruction(std::move(instr), HloOpcode::kRng, parameters);
+  });
+}
+
 XlaOp XlaBuilder::RngNormal(const XlaOp& mu, const XlaOp& sigma,
                             const Shape& shape) {
-  return UnimplementedOp();
+  return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape);
 }
 
 XlaOp XlaBuilder::RngUniform(const XlaOp& a, const XlaOp& b,
                              const Shape& shape) {
-  return UnimplementedOp();
+  return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape);
 }
 
 XlaOp XlaBuilder::While(const XlaComputation& condition,
                         const XlaComputation& body, const XlaOp& init) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Infer shape.
+    TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
+                        condition.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferWhileShape(condition_program_shape,
+                                        body_program_shape, init_shape));
+    // Body comes before condition computation in the vector.
+    AddCalledComputation(body, &instr);
+    AddCalledComputation(condition, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kWhile, {init});
+  });
 }
 
 XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
@@ -926,7 +1017,27 @@ XlaOp XlaBuilder::Reduce(
     const XlaOp& operand, const XlaOp& init_value,
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReduceShape(
+                            operand_shape, init_shape, dimensions_to_reduce,
+                            called_program_shape));
+
+    for (int64 dim : dimensions_to_reduce) {
+      instr.add_dimensions(dim);
+    }
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kReduce,
+                          {operand, init_value});
+  });
 }
 
 XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
@@ -1109,10 +1220,10 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
   instr.set_id(handle);
   instr.set_opcode(HloOpcodeString(opcode));
   if (instr.name().empty()) {
-    instr.set_name(StrCat(instr.opcode(), ".", handle));
+    instr.set_name(StrCat(instr.opcode(), ".", unique_id_, ".", handle));
   } else {
     // Append the handle to make sure the name is unique.
-    instr.set_name(StrCat(instr.name(), ".", handle));
+    instr.set_name(StrCat(instr.name(), ".", unique_id_, ".", handle));
   }
   for (const auto& operand : operands) {
     if (operand.builder_ == nullptr) {
@@ -1138,6 +1249,14 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
   return op;
 }
 
+void XlaBuilder::AddCalledComputation(const XlaComputation& computation,
+                                      HloInstructionProto* instr) {
+  instr->add_called_computation_ids(computation.proto().entry_computation_id());
+  for (const HloComputationProto& e : computation.proto().computations()) {
+    embedded_.insert({e.id(), e});
+  }
+}
+
 StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
     const XlaOp& op) const {
   TF_RETURN_IF_ERROR(first_error_);
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index f43101db34..0673b86646 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -803,6 +803,9 @@ class XlaBuilder {
       HloInstructionProto&& instr, HloOpcode opcode,
       tensorflow::gtl::ArraySlice<XlaOp> operands = {});
 
+  void AddCalledComputation(const XlaComputation& computation,
+                            HloInstructionProto* instr);
+
   // Notes that the error occurred by:
   // * storing it internally and capturing a backtrace if it's the first error
   //   (this deferred value will be produced on the call to Build())
@@ -829,6 +832,10 @@ class XlaBuilder {
   XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
                   const XlaOp& ehs);
 
+  XlaOp RngOp(RandomDistribution distribution,
+              tensorflow::gtl::ArraySlice<XlaOp> parameters,
+              const Shape& shape);
+
   StatusOr<XlaOp> InDimBroadcast(
       const Shape& shape, const XlaOp& operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
@@ -846,7 +853,8 @@ class XlaBuilder {
   // computation and fills the root_id in the pointer.
   StatusOr<ProgramShape> GetProgramShape(int64* root_id);
 
-  string name_;  // Name to use for the built computation.
+  string name_;      // Name to use for the built computation.
+  int64 unique_id_;  // The unique id for the built computation.
 
   // The first error encountered while building the computation.
   // This is OK until the first error is encountered.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 9cead12eba..aba61fbca4 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -347,10 +347,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 3a097a01ab..d24927d22b 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -57,6 +57,11 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using FuncGeneratorForType = Computation (*)(PrimitiveType,
+                                             ComputationBuilder*);
+
+using FuncGenerator = Computation (*)(ComputationBuilder*);
+
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
@@ -755,53 +760,57 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Add) {
-  RunVectorizedReduceTest(CreateScalarAddComputation,
-                          [](float a, float b) { return a + b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) +
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarAddComputation),
+      [](float a, float b) { return a + b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) +
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Multiply) {
-  RunVectorizedReduceTest(CreateScalarMultiplyComputation,
-                          [](float a, float b) { return a * b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) *
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMultiplyComputation),
+      [](float a, float b) { return a * b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) *
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Max) {
-  RunVectorizedReduceTest(CreateScalarMaxComputation,
-                          [](float a, float b) { return std::max(a, b); },
-                          [](int32 a, int32 b) { return std::max(a, b); },
-                          [](uint32 a, uint32 b) { return std::max(a, b); },
-                          std::numeric_limits<float>::min(),
-                          std::numeric_limits<int32>::min(),
-                          std::numeric_limits<uint32>::min());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMaxComputation),
+      [](float a, float b) { return std::max(a, b); },
+      [](int32 a, int32 b) { return std::max(a, b); },
+      [](uint32 a, uint32 b) { return std::max(a, b); },
+      std::numeric_limits<float>::min(), std::numeric_limits<int32>::min(),
+      std::numeric_limits<uint32>::min());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Min) {
-  RunVectorizedReduceTest(CreateScalarMinComputation,
-                          [](float a, float b) { return std::min(a, b); },
-                          [](int32 a, int32 b) { return std::min(a, b); },
-                          [](uint32 a, uint32 b) { return std::min(a, b); },
-                          std::numeric_limits<float>::max(),
-                          std::numeric_limits<int32>::max(),
-                          std::numeric_limits<uint32>::max());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMinComputation),
+      [](float a, float b) { return std::min(a, b); },
+      [](int32 a, int32 b) { return std::min(a, b); },
+      [](uint32 a, uint32 b) { return std::min(a, b); },
+      std::numeric_limits<float>::max(), std::numeric_limits<int32>::max(),
+      std::numeric_limits<uint32>::max());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanAnd) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarAndComputation, [](bool a, bool b) { return a && b; }, true);
+      static_cast<FuncGenerator>(CreateScalarAndComputation),
+      [](bool a, bool b) { return a && b; }, true);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanOr) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarOrComputation, [](bool a, bool b) { return a || b; }, false);
+      static_cast<FuncGenerator>(CreateScalarOrComputation),
+      [](bool a, bool b) { return a || b; }, false);
 }
 
 class ReduceR3ToR2Test : public ReduceTest,
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 33d457c70b..89ce2ce797 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -54,29 +54,28 @@ TEST_F(WhileTest, WhileWithScalarS32Result) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -91,29 +90,28 @@ TEST_F(WhileTest, WhileWithScalarS64Result) {
   auto result_shape = ShapeUtil::MakeShape(S64, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int64>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int64>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int64>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int64>(&builder, 5, {});
 }
@@ -123,31 +121,30 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
   auto orig_shape = ShapeUtil::MakeShape(S32, {2});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Reduce(builder.ConstantR1<int32>(2, 1),
                              builder.ConstantR0<int32>(0),
                              CreateScalarAddComputation(S32, &builder), {0});
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -156,28 +153,28 @@ TEST_F(WhileTest, WhileWithPredicateResult) {
   auto result_shape = ShapeUtil::MakeShape(PRED, {});
 
   // Create a computation for the condition: run until condition is true.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Ne(builder.ConstantR0<bool>(true), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: or condition with true.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
-    auto result = builder.Or(prev, builder.ConstantR0<bool>(true));
+    builder.Or(prev, builder.ConstantR0<bool>(true));
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Ne(builder.ConstantR0<bool>(false),
                          builder.ConstantR0<bool>(true));
-  auto result = builder.While(condition, body, init);
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
@@ -194,9 +191,9 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -205,33 +202,34 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 15.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>({});
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>({});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   ComputeAndCompareR1<float>(&builder, {}, {}, ErrorSpec(0.0001));
 }
@@ -247,9 +245,9 @@ TEST_F(WhileTest, WhileWithVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -258,33 +256,34 @@ TEST_F(WhileTest, WhileWithVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   // Individual elements with increase by 1/8 each time through the loop, so
   // the sum will increase by 1.0.  It will first be >15.5 when the elements
@@ -306,9 +305,9 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -317,34 +316,34 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   builder.Tuple({result});
 
   // Individual elements with increase by 1/8 each time through the loop, so
@@ -366,9 +365,9 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -377,28 +376,28 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(N);
   auto expected_w1 = Literal::CreateR1<float>({1.0f, 1.0f, 1.0f});
@@ -419,9 +418,9 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -430,21 +429,21 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
@@ -455,7 +454,7 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   auto result = builder.Add(add12, builder.GetTupleElement(xla_while, 3));
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   std::vector<float> expected = {6.f, 6.f, 6.f};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
@@ -474,9 +473,9 @@ TEST_F(WhileTest, WhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -486,26 +485,27 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -523,9 +523,9 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -534,27 +534,27 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and or the predicate with true
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto pred = builder.GetTupleElement(prev, 1);
     auto new_pred = builder.Or(pred, builder.ConstantR0<bool>(true));
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_pred});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple({builder.ConstantR0<int32>(0),
                              builder.Ne(builder.ConstantR0<bool>(false),
                                         builder.ConstantR0<bool>(true))});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_predicate = Literal::CreateR0<bool>(true);
@@ -570,9 +570,9 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -582,25 +582,24 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and set the other tuple element to a
   // constant.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
-    auto result =
-        builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
-                       builder.ConstantR0<int32>(7)});
+    builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
+                   builder.ConstantR0<int32>(7)});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR0<int32>(7)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR0<int32>(7);
@@ -631,20 +630,20 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -654,34 +653,34 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  Computation body2;
+  XlaComputation body2;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body2, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -692,11 +691,11 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -710,20 +709,20 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -733,21 +732,21 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -758,11 +757,11 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -777,20 +776,20 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -800,21 +799,21 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -824,11 +823,11 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -844,9 +843,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -856,9 +855,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -873,18 +872,18 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
 
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -915,18 +914,18 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
 
   // Create a computation for the condition: repeat for count iterations.
   auto build_condition = [this, v6s32](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto prev = builder.Reshape(
         builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
-          {});
+        {});
     builder.Gt(builder.ConstantR0<int32>(count), prev);
     return builder.Build().ConsumeValueOrDie();
   };
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, v6s32, "prev");
     auto inc = builder.ConcatInDim(
         {builder.ConstantR1<int32>({1}),
@@ -934,16 +933,15 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
                             builder.ConstantR0<int32>(100),
                             ShapeUtil::MakeShape(S32, {5}))},
         0);
-    auto result = builder.Add(inc, prev);
+    builder.Add(inc, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
   auto while_loop = [this, &body, build_condition](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto init = builder.ConstantR1<int32>({0, 0, 0, 0, 0, 0});
-    auto result = builder.While(build_condition(count), body, init);
-    auto shape = builder.GetShape(result).ConsumeValueOrDie();
+    builder.While(build_condition(count), body, init);
     return builder.Build();
   };
 
@@ -1107,9 +1105,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
   auto inner_result_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
 
-  Computation inner_condition;
+  XlaComputation inner_condition;
   {
-    ComputationBuilder builder(client_, "inner_condition");
+    XlaBuilder builder("inner_condition");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     builder.Lt(i, builder.ConstantR0<int32>(7));
@@ -1118,9 +1116,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the outer loop condition:
   // repeat while result < 30.
-  Computation outer_condition;
+  XlaComputation outer_condition;
   {
-    ComputationBuilder builder(client_, "outer_condition");
+    XlaBuilder builder("outer_condition");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     builder.Lt(prev, builder.ConstantR0<int32>(30));
     outer_condition = builder.Build().ConsumeValueOrDie();
@@ -1128,34 +1126,33 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the inner loop body: add 1 to `i`, and add 2 to
   // `result`.
-  Computation inner_body;
+  XlaComputation inner_body;
   {
-    ComputationBuilder builder(client_, "inner_body");
+    XlaBuilder builder("inner_body");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     auto result = builder.GetTupleElement(params, 1);
     i = builder.Add(builder.ConstantR0<int32>(1), i);
     result = builder.Add(builder.ConstantR0<int32>(2), result);
-    auto output = builder.Tuple({i, result});
+    builder.Tuple({i, result});
     inner_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Creates a computation for the outer loop: run the inner loop with i = 0.
-  Computation outer_body;
+  XlaComputation outer_body;
   {
-    ComputationBuilder builder(client_, "outer_body");
+    XlaBuilder builder("outer_body");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     auto init = builder.Tuple({builder.ConstantR0<int32>(0), prev});
     auto result = builder.While(inner_condition, inner_body, init);
-    auto output = builder.GetTupleElement(result, 1);
+    builder.GetTupleElement(result, 1);
     outer_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(outer_condition, outer_body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(outer_condition, outer_body, init);
 
   ComputeAndCompareR0<int32>(&builder, 42, {});
 }
@@ -1170,18 +1167,18 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition_callee;
+  XlaComputation condition_callee;
   {
-    ComputationBuilder builder(client_, "condition_callee");
+    XlaBuilder builder("condition_callee");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Tuple({builder.Gt(builder.ConstantR0<int32>(5), prev)});
 
     condition_callee = builder.Build().ConsumeValueOrDie();
   }
 
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto result = builder.Call(condition_callee, {prev});
     builder.GetTupleElement(result, 0);
@@ -1189,20 +1186,19 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -1214,28 +1210,28 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
       {scalar_s32, matrix_shape, matrix_shape, matrix_shape});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto state = builder.Parameter(0, while_shape, "state");
     builder.Gt(builder.ConstantR0<int32>(5), builder.GetTupleElement(state, 0));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto state = builder.Parameter(0, while_shape, "state");
     auto indvar = builder.GetTupleElement(state, 0);
     auto input_0 = builder.GetTupleElement(state, 1);
     auto input_1 = builder.GetTupleElement(state, 2);
     auto output = builder.Tanh(builder.Dot(input_0, input_1));
     auto indvar_next = builder.Add(indvar, builder.ConstantR0<int32>(1));
-    auto tuple_result = builder.Tuple({indvar_next, input_0, input_1, output});
+    builder.Tuple({indvar_next, input_0, input_1, output});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto matrix_input = builder.Parameter(0, matrix_shape, "matrix");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), matrix_input, matrix_input, matrix_input});
@@ -1268,9 +1264,9 @@ void BM_WhileLoop(int num_iters) {
 
   // Create while condition computation with 'loop_limit'.
   const int32 loop_limit = 100;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(loop_limit));
@@ -1278,9 +1274,9 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Create while body computation with unit loop increment.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -1294,12 +1290,12 @@ void BM_WhileLoop(int num_iters) {
     auto starts = builder.ConstantR1<int32>({0, 0, 0});
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While instruction.
-  ComputationBuilder builder(client, "while");
+  XlaBuilder builder("while");
   auto zero = builder.ConstantR0<float>(0.0);
   auto input = builder.Broadcast(zero, {seq_len, 1024, 1024});
   auto init = builder.Tuple({builder.ConstantR0<int32>(0), input});
-- 
GitLab


From 9ba97cf96e8e2bd7a43db660b56932927d65d5bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 18:33:11 -0700
Subject: [PATCH 0177/1262] Add int64 to Variant mutable hash table variants.

PiperOrigin-RevId: 191380970
---
 tensorflow/core/kernels/lookup_table_op.cc | 32 +++++++++++-----------
 tensorflow/core/kernels/lookup_table_op.h  | 17 ++++++------
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e3872fee0e..57b7798ba0 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -62,8 +63,7 @@ class MutableHashTableOfScalars final : public LookupInterface {
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
 
     return Status::OK();
@@ -78,9 +78,8 @@ class MutableHashTableOfScalars final : public LookupInterface {
       table_.clear();
     }
     for (int64 i = 0; i < key_values.size(); ++i) {
-      gtl::InsertOrUpdate(&table_,
-                          SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-                          SubtleMustCopyUnlessStringOrFloat(value_values(i)));
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          SubtleMustCopyIfIntegral(value_values(i)));
     }
     return Status::OK();
   }
@@ -172,8 +171,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
 
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
-      ValueArray* value_vec = gtl::FindOrNull(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)));
+      ValueArray* value_vec =
+          gtl::FindOrNull(table_, SubtleMustCopyIfIntegral(key_values(i)));
       if (value_vec != nullptr) {
         for (int64 j = 0; j < value_dim; j++) {
           value_values(i, j) = value_vec->at(j);
@@ -203,8 +202,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
         V value = value_values(i, j);
         value_vec.push_back(value);
       }
-      gtl::InsertOrUpdate(
-          &table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)), value_vec);
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          value_vec);
     }
     return Status::OK();
   }
@@ -379,15 +378,14 @@ class MutableDenseHashTable final : public LookupInterface {
           for (int64 j = 0; j < value_size; ++j) {
             // TODO(andreasst): check if we can get rid of SubtleMustCopy
             // here and elsewhere in this file.
-            value_matrix(i, j) = SubtleMustCopyUnlessStringOrFloat(
-                value_buckets_matrix(bucket_index, j));
+            value_matrix(i, j) =
+                SubtleMustCopyIfIntegral(value_buckets_matrix(bucket_index, j));
           }
           break;
         }
         if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_matrix, 0)) {
           for (int64 j = 0; j < value_size; ++j) {
-            value_matrix(i, j) =
-                SubtleMustCopyUnlessStringOrFloat(default_flat(j));
+            value_matrix(i, j) = SubtleMustCopyIfIntegral(default_flat(j));
           }
           break;
         }
@@ -531,7 +529,7 @@ class MutableDenseHashTable final : public LookupInterface {
         if (IsEqualKey(key_buckets_matrix, bucket_index, key_matrix, i)) {
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -539,11 +537,11 @@ class MutableDenseHashTable final : public LookupInterface {
           ++num_entries_;
           for (int64 j = 0; j < key_size; ++j) {
             key_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(key_matrix(i, j));
+                SubtleMustCopyIfIntegral(key_matrix(i, j));
           }
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -849,6 +847,7 @@ REGISTER_KERNEL(string, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
@@ -899,6 +898,7 @@ REGISTER_KERNEL(int64, double);
 REGISTER_KERNEL(string, float);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, bool);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 3657fd5b6a..29a0cc91fe 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -125,19 +125,21 @@ namespace lookup {
 // integral types. However non-integer variables are not allowed and therefore
 // the local copy is unnecessary.
 template <typename T>
-T SubtleMustCopyUnlessStringOrFloat(const T& value) {
+T SubtleMustCopyIfIntegral(const T& value) {
   return internal::SubtleMustCopy(value);
 }
 
-inline const string& SubtleMustCopyUnlessStringOrFloat(const string& value) {
+inline const string& SubtleMustCopyIfIntegral(const string& value) {
   return value;
 }
 
-inline const float SubtleMustCopyUnlessStringOrFloat(const float value) {
+inline const float SubtleMustCopyIfIntegral(const float value) { return value; }
+
+inline const double SubtleMustCopyIfIntegral(const double value) {
   return value;
 }
 
-inline const double SubtleMustCopyUnlessStringOrFloat(const double value) {
+inline const Variant& SubtleMustCopyIfIntegral(const Variant& value) {
   return value;
 }
 
@@ -204,8 +206,8 @@ class HashTable : public InitializableLookupTable {
     const auto key_values = keys.flat<K>();
     const auto value_values = values.flat<V>();
     for (int64 i = 0; i < key_values.size(); ++i) {
-      const K key = SubtleMustCopyUnlessStringOrFloat(key_values(i));
-      const V value = SubtleMustCopyUnlessStringOrFloat(value_values(i));
+      const K key = SubtleMustCopyIfIntegral(key_values(i));
+      const V value = SubtleMustCopyIfIntegral(value_values(i));
       const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
       if (previous_value != value) {
         return errors::FailedPrecondition(
@@ -224,8 +226,7 @@ class HashTable : public InitializableLookupTable {
 
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          *table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          *table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
     return Status::OK();
   }
-- 
GitLab


From ec16dd449482f38100083505314cb4179f7d1cb8 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 2 Apr 2018 19:20:53 -0700
Subject: [PATCH 0178/1262] Automated g4 rollback of changelist 191037166

PiperOrigin-RevId: 191385075
---
 tensorflow/compiler/tests/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 204a2a2f90..edabdc218a 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -375,7 +375,6 @@ tf_xla_py_test(
     name = "momentum_test",
     size = "small",
     srcs = ["momentum_test.py"],
-    tags = ["no_oss"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-- 
GitLab


From fa2e70d7fae8466c4006bd29334a3cc440ee6d3a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 2 Apr 2018 19:32:45 -0700
Subject: [PATCH 0179/1262] Deleted a special case

PiperOrigin-RevId: 191385909
---
 tensorflow/core/grappler/optimizers/function_optimizer.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 2a6b8a325f..6cdc51f1bd 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -286,12 +286,6 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (func.attr().count("_noinline") != 0) {
       continue;
     }
-    // Don't touch anything marked XLA to prevent XLA failures further down the
-    // road.
-    if (func.attr().count("_XlaCompile") > 0 &&
-        func.attr().at("_XlaCompile").b()) {
-      continue;
-    }
     // Can't create IdentityN nodes with no input or output: skip these
     // functions for now.
     if (func.signature().input_arg_size() == 0 ||
-- 
GitLab


From 38e0139329482d8e44629dea2e87853808eacd0d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 20:50:39 -0700
Subject: [PATCH 0180/1262] Windows: Enable tensorflow/contrib in Bazel build
 (Second try)

This reverts commit 4e108ef30d7cd7ae5e1c550ec5ae27e79b8c6e39.

PiperOrigin-RevId: 191391075
---
 tensorflow/contrib/BUILD                      |   8 +-
 tensorflow/contrib/__init__.py                |   7 +-
 .../boosted_trees/lib/utils/batch_features.h  |   6 +-
 tensorflow/contrib/distributions/BUILD        |   2 +
 .../python/examples/linear_regression/BUILD   |   1 +
 tensorflow/contrib/gan/BUILD                  |   1 +
 .../contrib/kfac/python/kernel_tests/BUILD    |   1 +
 tensorflow/contrib/labeled_tensor/BUILD       |   1 +
 tensorflow/contrib/layers/BUILD               |   2 +
 tensorflow/contrib/learn/BUILD                |   5 +
 tensorflow/contrib/lookup/BUILD               |   1 +
 .../contrib/remote_fused_graph/pylib/BUILD    |   1 -
 tensorflow/contrib/saved_model/BUILD          |   1 +
 tensorflow/contrib/session_bundle/BUILD       |   1 +
 .../contrib/slim/python/slim/data/BUILD       |   1 +
 tensorflow/contrib/tensor_forest/BUILD        |   1 -
 tensorflow/contrib/tensorboard/BUILD          |   1 +
 tensorflow/contrib/timeseries/examples/BUILD  |   5 +-
 .../timeseries/python/timeseries/BUILD        |   5 +-
 .../timeseries/state_space_models/BUILD       |   1 +
 tensorflow/contrib/tpu/BUILD                  |   1 +
 tensorflow/contrib/util/loader.py             |   7 +-
 tensorflow/core/framework/dataset.h           |   4 +-
 tensorflow/core/lib/core/stringpiece.cc       |   2 -
 tensorflow/core/lib/core/stringpiece.h        |   2 +-
 tensorflow/core/platform/abi.cc               |   8 +-
 tensorflow/core/platform/cpu_info.h           |   2 +-
 tensorflow/core/platform/tracing.h            |   4 +-
 tensorflow/python/BUILD                       |  77 ++++++--
 tensorflow/python/debug/BUILD                 |   1 +
 tensorflow/python/keras/BUILD                 |   5 +-
 tensorflow/python/kernel_tests/BUILD          |   4 -
 tensorflow/tensorflow.bzl                     |  34 +++-
 .../windows/cpu/pip/build_tf_windows.sh       |   3 +-
 tensorflow/tools/def_file_filter/BUILD        |   9 +
 tensorflow/tools/def_file_filter/BUILD.tpl    |  15 ++
 .../def_file_filter/def_file_filter.py.tpl    | 168 ++++++++++++++++++
 .../def_file_filter_configure.bzl             |  56 ++++++
 tensorflow/tools/pip_package/BUILD            | 130 ++++++--------
 tensorflow/workspace.bzl                      |   6 +
 40 files changed, 469 insertions(+), 121 deletions(-)
 create mode 100644 tensorflow/tools/def_file_filter/BUILD
 create mode 100644 tensorflow/tools/def_file_filter/BUILD.tpl
 create mode 100644 tensorflow/tools/def_file_filter/def_file_filter.py.tpl
 create mode 100644 tensorflow/tools/def_file_filter/def_file_filter_configure.bzl

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 0cebb49afb..bf69144ad8 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -8,6 +8,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
     name = "contrib_py",
@@ -40,7 +41,6 @@ py_library(
         "//tensorflow/contrib/estimator:estimator_py",
         "//tensorflow/contrib/factorization:factorization_py",
         "//tensorflow/contrib/feature_column:feature_column_py",
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/fused_conv:fused_conv_py",
         "//tensorflow/contrib/gan",
@@ -63,7 +63,6 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
-        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -117,7 +116,10 @@ py_library(
             "//tensorflow/contrib/kafka",
         ],
         "//conditions:default": [],
-    }),
+    }) + if_not_windows([
+        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
+        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
+    ]),
 )
 
 cc_library(
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index a8e05df708..1c5b00f92e 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -1,3 +1,4 @@
+# pylint: disable=g-import-not-at-top
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
@@ -84,7 +87,8 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-from tensorflow.contrib.lite.python import lite
+if os.name != "nt":
+  from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
@@ -94,6 +98,7 @@ from tensorflow.contrib.summary import summary
 from tensorflow.python.util.lazy_loader import LazyLoader
 ffmpeg = LazyLoader("ffmpeg", globals(),
                     "tensorflow.contrib.ffmpeg")
+del os
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index da5e744851..a3b1b013e3 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -48,9 +48,9 @@ class BatchFeatures {
   Status GetFeatureColumnSizes(int64* const num_dense_float_features,
                                int64* const num_sparse_float_features,
                                int64* const num_sparse_int_features) const {
-    QCHECK_NE(num_dense_float_features, nullptr);
-    QCHECK_NE(num_sparse_float_features, nullptr);
-    QCHECK_NE(num_sparse_int_features, nullptr);
+    QCHECK_NE(num_dense_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_int_features, static_cast<int64*>(nullptr));
     *num_dense_float_features = dense_float_feature_columns_.size();
     *num_sparse_float_features = sparse_float_feature_columns_.size();
     *num_sparse_int_features = sparse_int_feature_columns_.size();
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index de08eb491b..514638ecbb 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -454,6 +454,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
@@ -1128,6 +1129,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index f86331af6f..2f6cfdf31e 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -22,6 +22,7 @@ cuda_py_test(
         ":linear_regression",
         "//tensorflow:tensorflow_py",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 9e56d3c039..461066bbb4 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -354,6 +354,7 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index f73c24f8fb..2477d2bfc1 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -114,6 +114,7 @@ py_test(
     name = "utils_test",
     srcs = ["utils_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/contrib/tpu",
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 18b265ae80..c8812d4b23 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -70,6 +70,7 @@ py_test(
         "python/ops/core_test.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":_typecheck",
         ":core",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 4be55468db..d5b3b279a1 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -188,6 +188,7 @@ py_test(
     size = "small",
     srcs = ["python/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/contrib/framework:framework_py",
@@ -353,6 +354,7 @@ py_test(
     size = "small",
     srcs = ["python/ops/sparse_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index ba55365c14..d665fc9335 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -117,6 +117,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/data_feeder_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
@@ -172,6 +173,7 @@ tf_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -190,6 +192,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/graph_actions_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -591,6 +594,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/io_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/learn/python/learn/datasets",
@@ -820,6 +824,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/utils/saved_model_export_utils_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index 02b4f80252..f616207d46 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -46,4 +46,5 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     grpc_enabled = True,
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 996b55f9b8..3aa8a14f44 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -38,7 +38,6 @@ py_test(
     size = "small",
     srcs = ["python/ops/remote_fused_graph_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":remote_fused_graph_ops_py",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index faad40d335..e431c464ef 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -53,6 +53,7 @@ py_test(
     size = "small",
     srcs = ["python/saved_model/reader_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
     deps = [
         ":saved_model_py",
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index 31717305e7..9c08859180 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -151,6 +151,7 @@ py_test(
     name = "gc_test",
     srcs = ["gc_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
     deps = [
         ":gc",
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index dc12e67fc6..eef043e832 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -61,6 +61,7 @@ py_test(
     name = "dataset_data_provider_test",
     srcs = ["dataset_data_provider_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":dataset",
         ":dataset_data_provider",
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 11a59ec22b..136856c015 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -539,7 +539,6 @@ py_test(
     srcs = ["client/random_forest_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_windows",
         "nomac",  # b/63258195
         "notsan",
     ],
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index f4efd9717d..c955b13244 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -9,6 +9,7 @@ exports_files(["LICENSE"])
 
 # For platform specific build config
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 tf_proto_library(
     name = "protos_all",
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 40cf9147b3..32e948a009 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -25,7 +25,10 @@ py_test(
     srcs = ["predict_test.py"],
     data = ["data/period_trend.csv"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67513579
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/67513579
+    ],
     deps = [
         ":predict",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 86022f46ce..af572d8124 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -160,9 +160,7 @@ py_test(
         "head_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip_gpu",  # b/63391119
-    ],
+    tags = ["no_pip_gpu"],  # b/63391119
     deps = [
         ":estimators",
         ":feature_keys",
@@ -440,6 +438,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip_gpu",  # b/63391119
+        "no_windows",  # TODO: needs investigation on Windows
     ],
     deps = [
         ":feature_keys",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index ca25ccd2b8..5d33e23a42 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,6 +40,7 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index b267cceef1..d4830b6bcf 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -228,6 +228,7 @@ tf_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:layers",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/util/loader.py b/tensorflow/contrib/util/loader.py
index f4283cd9ed..dca01d26f4 100644
--- a/tensorflow/contrib/util/loader.py
+++ b/tensorflow/contrib/util/loader.py
@@ -42,9 +42,10 @@ def load_op_library(path):
     plugin.
   """
   if os.name == 'nt':
-    # To avoid makeing every user_ops aware of windows, re-write
-    # the file extension from .so to .dll.
-    path = re.sub(r'\.so$', '.dll', path)
+    # To avoid making every user_ops aware of windows, re-write
+    # the file extension from .so to .dll if .so file doesn't exist.
+    if not os.path.exists(path):
+      path = re.sub(r'\.so$', '.dll', path)
 
     # Currently we have only some user_ops as dlls on windows - don't try
     # to load them if the dll is not found.
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index fb1fe9c51f..9e7ffe6c0b 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -474,11 +474,11 @@ class GraphDatasetBase : public DatasetBase {
   }
 
   // Key for storing the Dataset graph in the serialized format.
-  static const char kDatasetGraphKey[];
+  TF_EXPORT static const char kDatasetGraphKey[];
 
   // Key for storing the output node of the Dataset graph in the serialized
   // format.
-  static const char kDatasetGraphOutputNodeKey[];
+  TF_EXPORT static const char kDatasetGraphOutputNodeKey[];
 
  private:
   Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 5bd79778a6..0b006fa2b4 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -55,6 +55,4 @@ StringPiece StringPiece::substr(size_t pos, size_t n) const {
   return StringPiece(data_ + pos, n);
 }
 
-const StringPiece::size_type StringPiece::npos = size_type(-1);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 79409cce4b..835b938cbf 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -65,7 +65,7 @@ class StringPiece {
   iterator begin() const { return data_; }
   iterator end() const { return data_ + size_; }
 
-  static const size_t npos;
+  static const size_t npos = size_type(-1);
 
   // Return the ith byte in the referenced data.
   // REQUIRES: n < size()
diff --git a/tensorflow/core/platform/abi.cc b/tensorflow/core/platform/abi.cc
index 4df62734e9..e597a490d6 100644
--- a/tensorflow/core/platform/abi.cc
+++ b/tensorflow/core/platform/abi.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/abi.h"
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include <windows.h>
 #include <cstring>
 #else
@@ -26,19 +26,19 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 
 extern "C" char* __unDName(char* output_string, const char* name,
                            int max_string_length, void* (*p_alloc)(std::size_t),
                            void (*p_free)(void*), unsigned short disable_flags);
 
-#endif  // defined(PLATFORM_WINDOWS)
+#endif  // defined(_MSC_VER)
 
 namespace tensorflow {
 namespace port {
 
 std::string MaybeAbiDemangle(const char* name) {
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
   std::unique_ptr<char> demangled{__unDName(nullptr, name, 0, std::malloc,
                                             std::free,
                                             static_cast<unsigned short>(0))};
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 331f3e5251..bb77650e26 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include "tensorflow/core/platform/windows/cpu_info.h"
 #endif
 
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index 8f7bff1bb0..3c6e7b0db5 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -103,7 +103,9 @@ class Tracing {
   friend class ScopedAnnotation;
   friend class TraceMe;
 
-  static std::atomic<Tracing::Engine*> tracing_engine_;
+  // TODO: TF_EXPORT is for building //tensorflow/contrib/data:_dataset_ops.so
+  //       on Windows. Figure out a way to remove TF_EXPORT here.
+  TF_EXPORT static std::atomic<Tracing::Engine*> tracing_engine_;
   static Tracing::Engine* engine() {
     return tracing_engine_.load(std::memory_order_acquire);
   }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c502a3a42b..9d1e9bdc7e 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -28,6 +28,8 @@ load("//tensorflow:tensorflow.bzl", "py_tests")
 load("//tensorflow:tensorflow.bzl", "tf_py_build_info_genrule")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
 load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
@@ -58,9 +60,10 @@ py_library(
         "//tensorflow/tools/api/generator:__pkg__",
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
-    deps = [":no_contrib"] + if_not_windows([
+    deps = [
+        ":no_contrib",
         "//tensorflow/contrib:contrib_py",
-    ]),
+    ],
 )
 
 py_library(
@@ -971,7 +974,6 @@ py_test(
     srcs = ["framework/contrib_test.py"],
     main = "framework/contrib_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -1341,7 +1343,6 @@ py_test(
     srcs = ["framework/dtypes_test.py"],
     main = "framework/dtypes_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1717,7 +1718,6 @@ py_test(
     size = "small",
     srcs = ["ops/clip_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":clip_ops",
@@ -2786,7 +2786,6 @@ cuda_py_test(
     ],
     data = ["//tensorflow/core:image_testdata"],
     shard_count = 5,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -3391,6 +3390,65 @@ tf_py_wrap_cc(
          tf_additional_gdr_deps()),
 )
 
+# ** Targets for Windows build (start) **
+# We need the following targets to expose symbols from _pywrap_tensorflow.dll
+
+# Build a cc_binary from tf_custom_op_library_additional_deps_impl,
+# it contains all object code from its dependencies.
+tf_native_cc_binary(
+    name = "tf_custom_op_library_additional_deps.so",
+    linkshared = 1,
+    linkstatic = 1,
+    deps = tf_custom_op_library_additional_deps_impl(),
+)
+
+# Get a DEF file generated by parsing all object files
+# of tf_custom_op_library_additional_deps.so
+filegroup(
+    name = "pywrap_tensorflow_def_file",
+    srcs = [":tf_custom_op_library_additional_deps.so"],
+    output_group = "def_file",
+)
+
+# Filter the DEF file to reduce the number of symbols to 64K or less.
+# Note that we also write the name of the pyd file into DEF file so that
+# the dynamic libraries of custom ops can find it at runtime.
+genrule(
+    name = "pywrap_tensorflow_filtered_def_file",
+    srcs = [":pywrap_tensorflow_def_file"],
+    outs = ["pywrap_tensorflow_filtered_def_file.def"],
+    cmd = select({
+        "//tensorflow:windows": """
+              $(location @local_config_def_file_filter//:def_file_filter) \\
+              --input $(location :pywrap_tensorflow_def_file) \\
+              --output $@ \\
+              --target _pywrap_tensorflow_internal.pyd
+          """,
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    tools = ["@local_config_def_file_filter//:def_file_filter"],
+)
+
+# Get the import library of  _pywrap_tensorflow_internal.dll
+filegroup(
+    name = "pywrap_tensorflow_import_lib_file",
+    srcs = [":_pywrap_tensorflow_internal.so"],
+    output_group = "interface_library",
+)
+
+# Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll
+# so that custom ops' dynamic libraries can link against it.
+cc_import(
+    name = "pywrap_tensorflow_import_lib",
+    interface_library = select({
+        "//tensorflow:windows": ":pywrap_tensorflow_import_lib_file",
+        "//conditions:default": "not_exsiting_on_unix.lib",  # Just a placeholder for Unix platforms
+    }),
+    system_provided = 1,
+)
+
+# ** Targets for Windows build (end) **
+
 py_library(
     name = "lib",
     srcs = [
@@ -3763,7 +3821,6 @@ py_test(
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":lib",
@@ -4071,7 +4128,6 @@ py_test(
     size = "small",
     srcs = ["training/checkpoint_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":checkpoint_ops_gen",
         ":client",
@@ -4112,10 +4168,7 @@ py_test(
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # b/67945581
-    ],
+    tags = ["notsan"],  # b/67945581
     deps = [
         ":array_ops",
         ":client_testlib",
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 4195586313..b81aa3745c 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -913,6 +913,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 2a06907f49..57f5097639 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -637,7 +637,10 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/utils/io_utils_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index ea210346c1..d6f97fc4c3 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -295,7 +295,6 @@ tf_py_test(
         "//tensorflow/python:nn_grad",
     ],
     data = ["//tensorflow/core:image_testdata"],
-    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -1142,7 +1141,6 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     data = ["//tensorflow/core:lmdb_testdata"],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2332,7 +2330,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 4,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2463,7 +2460,6 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index fcc57d506e..e9d2f279cd 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -342,6 +342,22 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}.*",
 )
 
+# A simple wrap around native.cc_binary rule.
+# When using this rule, you should realize it doesn't link to any tensorflow
+# dependencies by default.
+def tf_native_cc_binary(name,
+                        copts=tf_copts(),
+                        **kwargs):
+  native.cc_binary(
+      name=name,
+      copts=copts,
+      **kwargs)
+
+register_extension_info(
+    extension_name = "tf_native_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
+
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
                          pkg="",
@@ -1178,6 +1194,20 @@ def tf_custom_op_library_additional_deps():
       "@protobuf_archive//:protobuf_headers",
       clean_dep("//third_party/eigen3"),
       clean_dep("//tensorflow/core:framework_headers_lib"),
+  ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
+
+# A list of targets that contains the implemenation of
+# tf_custom_op_library_additional_deps. It's used to generate a DEF file for
+# exporting symbols from _pywrap_tensorflow.dll on Windows.
+def tf_custom_op_library_additional_deps_impl():
+  return [
+      "@protobuf_archive//:protobuf",
+      "@nsync//:nsync_cpp",
+      # for //third_party/eigen3
+      clean_dep("//third_party/eigen3"),
+      # for //tensorflow/core:framework_headers_lib
+      clean_dep("//tensorflow/core:framework"),
+      clean_dep("//tensorflow/core:reader_base"),
   ]
 
 # Traverse the dependency graph along the "deps" attribute of the
@@ -1264,6 +1294,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
       deps=deps + if_cuda(cuda_deps),
       data=[name + "_check_deps"],
       copts=tf_copts(is_external=True),
+      features = ["windows_export_all_symbols"],
       linkopts=linkopts + select({
           "//conditions:default": [
               "-lm",
@@ -1410,7 +1441,8 @@ def tf_py_wrap_cc(name,
       ]) + tf_extension_copts()),
       linkopts=tf_extension_linkopts() + extra_linkopts,
       linkstatic=1,
-      deps=deps + extra_deps)
+      deps=deps + extra_deps,
+      **kwargs)
   native.genrule(
       name="gen_" + cc_library_pyd_name,
       srcs=[":" + cc_library_name],
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 8b8ba31a0d..40189a6d1b 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -65,4 +65,5 @@ bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-  //${PY_TEST_DIR}/tensorflow/python/...
+  //${PY_TEST_DIR}/tensorflow/python/... \
+  //${PY_TEST_DIR}/tensorflow/contrib/...
diff --git a/tensorflow/tools/def_file_filter/BUILD b/tensorflow/tools/def_file_filter/BUILD
new file mode 100644
index 0000000000..e390e0fb05
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/BUILD
@@ -0,0 +1,9 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
diff --git a/tensorflow/tools/def_file_filter/BUILD.tpl b/tensorflow/tools/def_file_filter/BUILD.tpl
new file mode 100644
index 0000000000..3cb72f4979
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/BUILD.tpl
@@ -0,0 +1,15 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
+
+py_binary(
+    name = "def_file_filter",
+    srcs = ["def_file_filter.py"],
+    srcs_version = "PY2AND3",
+)
diff --git a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
new file mode 100644
index 0000000000..8bdc03eb0f
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
@@ -0,0 +1,168 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""def_file_filter.py - tool to filter a windows def file.
+
+The def file can be used to export symbols from the tensorflow dll to enable
+tf.load_library().
+
+Because the linker allows only 64K symbols to be exported per dll
+we filter the symbols down to the essentials. The regular expressions
+we use for this are specific to tensorflow.
+
+TODO: this works fine but there is an issue with exporting
+'const char * const' and importing it from a user_ops. The problem is
+on the importing end and using __declspec(dllimport) works around it.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import io
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+# External tools we use that come with visual studio sdk
+UNDNAME = "%{undname_bin_path}"
+
+# Exclude if matched
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
+
+# Include if matched before exclude
+INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
+                           r"google::protobuf::internal::ArenaImpl::AllocateAligned|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::internal::ArenaImpl::AddCleanup|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::Arena::OnArenaAllocation|" # for contrib/data/_prefetching_ops
+                           r"tensorflow::internal::LogMessage|"
+                           r"tensorflow::internal::LogString|"
+                           r"tensorflow::internal::CheckOpMessageBuilder|"
+                           r"tensorflow::internal::MakeCheckOpValueString|"
+                           r"tensorflow::internal::PickUnusedPortOrDie|"
+                           r"tensorflow::internal::ValidateDevice|"
+                           r"tensorflow::ops::internal::Enter|"
+                           r"tensorflow::strings::internal::AppendPieces|"
+                           r"tensorflow::strings::internal::CatPieces|"
+                           r"tensorflow::io::internal::JoinPathImpl")
+
+# Include if matched after exclude
+INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
+                        r"^(TFE_\w*)$|"
+                        r"nsync::|"
+                        r"tensorflow::|"
+                        r"functor::|"
+                        r"perftools::gputools")
+
+# We want to identify data members explicitly in the DEF file, so that no one
+# can implicitly link against the DLL if they use one of the variables exported
+# from the DLL and the header they use does not decorate the symbol with
+# __declspec(dllimport). It is easier to detect what a data symbol does
+# NOT look like, so doing it with the below regex.
+DATA_EXCLUDE_RE = re.compile(r"[)(]|"
+                             r"vftable|"
+                             r"vbtable|"
+                             r"vcall|"
+                             r"RTTI|"
+                             r"protobuf::internal::ExplicitlyConstructed")
+
+def get_args():
+  """Parse command line."""
+  filename_list = lambda x: x.split(";")
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--input", type=filename_list,
+                      help="paths to input def file",
+                      required=True)
+  parser.add_argument("--output", help="output deffile", required=True)
+  parser.add_argument("--target", help="name of the target", required=True)
+  args = parser.parse_args()
+  return args
+
+
+def main():
+  """main."""
+  args = get_args()
+
+  # Pipe dumpbin to extract all linkable symbols from libs.
+  # Good symbols are collected in candidates and also written to
+  # a temp file.
+  candidates = []
+  tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
+  for def_file_path in args.input:
+    def_file = open(def_file_path, 'r')
+    for line in def_file:
+      cols = line.split()
+      sym = cols[0]
+      tmpfile.file.write(sym + "\n")
+      candidates.append(sym)
+  tmpfile.file.close()
+
+  # Run the symbols through undname to get their undecorated name
+  # so we can filter on something readable.
+  with open(args.output, "w") as def_fp:
+    # track dupes
+    taken = set()
+
+    # Header for the def file.
+    def_fp.write("LIBRARY " + args.target + "\n")
+    def_fp.write("EXPORTS\n")
+    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+
+    # Each symbols returned by undname matches the same position in candidates.
+    # We compare on undname but use the decorated name from candidates.
+    dupes = 0
+    proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
+    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+      decorated = candidates[idx]
+      if decorated in taken:
+        # Symbol is already in output, done.
+        dupes += 1
+        continue
+
+      if not INCLUDEPRE_RE.search(line):
+        if EXCLUDE_RE.search(line):
+          continue
+        if not INCLUDE_RE.search(line):
+          continue
+
+      if "deleting destructor" in line:
+        # Some of the symbols convered by INCLUDEPRE_RE export deleting
+        # destructor symbols, which is a bad idea.
+        # So we filter out such symbols here.
+        continue
+
+      if DATA_EXCLUDE_RE.search(line):
+        def_fp.write("\t" + decorated + "\n")
+      else:
+        def_fp.write("\t" + decorated + " DATA\n")
+      taken.add(decorated)
+    def_fp.close()
+
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(UNDNAME, exit_code))
+    return exit_code
+
+  os.unlink(tmpfile.name)
+
+  print("symbols={}, taken={}, dupes={}"
+        .format(len(candidates), len(taken), dupes))
+  return 0
+
+
+if __name__ == "__main__":
+  sys.exit(main())
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
new file mode 100644
index 0000000000..47539b2423
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -0,0 +1,56 @@
+"""Repository rule for def file filter autoconfiguration.
+
+This repository reuses Bazel's VC detect mechanism to find undname.exe,
+which is a tool used in def_file_filter.py.
+
+def_file_filter.py is for filtering the DEF file for TensorFlow on Windows.
+On Windows, we use a DEF file generated by Bazel to export symbols from the
+tensorflow dynamic library(_pywrap_tensorflow.dll). The maximum number of
+symbols that can be exported per DLL is 64K, so we have to filter some useless
+symbols through this python script.
+
+`def_file_filter_config` depends on the following environment variables:
+  * `BAZEL_VC`
+  * `BAZEL_VS`
+  * `VS90COMNTOOLS`
+  * `VS100COMNTOOLS`
+  * `VS110COMNTOOLS`
+  * `VS120COMNTOOLS`
+  * `VS140COMNTOOLS`
+"""
+
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_vc_path")
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool")
+load("@bazel_tools//tools/cpp:lib_cc_configure.bzl", "auto_configure_fail")
+
+def _def_file_filter_configure_impl(repository_ctx):
+  if repository_ctx.os.name.lower().find("windows") == -1:
+    repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+    repository_ctx.file("def_file_filter.py", "")
+    return
+  vc_path = find_vc_path(repository_ctx)
+  if vc_path == "visual-studio-not-found":
+    auto_configure_fail("Visual C++ build tools not found on your machine")
+  undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\")
+
+  repository_ctx.template(
+    "def_file_filter.py",
+    Label("//tensorflow/tools/def_file_filter:def_file_filter.py.tpl"),
+    {
+      "%{undname_bin_path}": undname_bin_path,
+    })
+  repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+
+
+def_file_filter_configure = repository_rule(
+    implementation = _def_file_filter_configure_impl,
+    environ = [
+        "BAZEL_VC",
+        "BAZEL_VS",
+        "VS90COMNTOOLS",
+        "VS100COMNTOOLS",
+        "VS110COMNTOOLS",
+        "VS120COMNTOOLS",
+        "VS140COMNTOOLS"
+    ],
+)
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 62fec2c402..4a70f666b6 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -48,36 +48,66 @@ py_binary(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
+COMMON_PIP_DEPS = [
+    ":licenses",
+    "MANIFEST.in",
+    "README",
+    "setup.py",
+    ":included_headers",
+    "//tensorflow:tensorflow_py",
+    "//tensorflow/contrib/autograph:autograph",
+    "//tensorflow/contrib/autograph/converters:converters",
+    "//tensorflow/contrib/autograph/converters:test_lib",
+    "//tensorflow/contrib/autograph/impl:impl",
+    "//tensorflow/contrib/autograph/pyct:pyct",
+    "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
+    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
+    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+    "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+    "//tensorflow/contrib/eager/python/examples:examples_pip",
+    "//tensorflow/contrib/eager/python:checkpointable_utils",
+    "//tensorflow/contrib/eager/python:evaluator",
+    "//tensorflow/contrib/gan:gan",
+    "//tensorflow/contrib/graph_editor:graph_editor_pip",
+    "//tensorflow/contrib/keras:keras",
+    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+    "//tensorflow/contrib/nn:nn_py",
+    "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/session_bundle:session_bundle_pip",
+    "//tensorflow/contrib/signal:signal_py",
+    "//tensorflow/contrib/signal:test_util",
+    "//tensorflow/contrib/slim:slim",
+    "//tensorflow/contrib/slim/python/slim/data:data_pip",
+    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
+    "//tensorflow/contrib/specs:specs",
+    "//tensorflow/contrib/summary:summary_test_util",
+    "//tensorflow/contrib/tensor_forest:init_py",
+    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
+    "//tensorflow/contrib/timeseries:timeseries_pip",
+    "//tensorflow/contrib/tpu",
+    "//tensorflow/examples/tutorials/mnist:package",
+    "//tensorflow/python:distributed_framework_test_lib",
+    "//tensorflow/python:meta_graph_testdata",
+    "//tensorflow/python:spectral_ops_test_util",
+    "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/debug:debug_pip",
+    "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
+    "//tensorflow/python/saved_model:saved_model",
+    "//tensorflow/python/tools:tools_pip",
+    "//tensorflow/python:test_ops",
+    "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
+]
+
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = [
-        "MANIFEST.in",
-        "README",
-        "setup.py",
-        ":included_headers",
-        "//tensorflow/contrib/nn:nn_py",
-        "//tensorflow/contrib/session_bundle:session_bundle_pip",
-        "//tensorflow/contrib/signal:signal_py",
-        "//tensorflow/contrib/slim/python/slim/data:data_pip",
-        "//tensorflow/python:util_example_parser_configuration",
-        "//tensorflow/python/debug:debug_pip",
-        "//tensorflow/python/saved_model",
-        "//tensorflow/python:spectral_ops_test_util",
-        "//tensorflow/python/tools:tools_pip",
-        "//tensorflow/python/eager:eager_pip",
-        "//tensorflow/contrib/summary:summary_test_util",
-        # These targets don't build on Windows yet. Exclude them for now.
-        # "//tensorflow/contrib/slim",
-        # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-        # "//tensorflow/contrib/specs",
-        # "//tensorflow/contrib/tensor_forest:init_py",
-        # "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-        # "//tensorflow/examples/tutorials/mnist:package",
-    ],
+    data = COMMON_PIP_DEPS,
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -138,63 +168,13 @@ sh_binary(
     data = select({
         "//tensorflow:windows": [":simple_console_for_windows"],
         "//tensorflow:windows_msvc": [":simple_console_for_windows"],
-        "//conditions:default": [
-            ":licenses",
-            "MANIFEST.in",
-            "README",
-            "setup.py",
-            ":included_headers",
+        "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
-            "//tensorflow:tensorflow_py",
-            "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-            "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
-            "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
-            "//tensorflow/contrib/data/python/ops:contrib_op_loader",
-            "//tensorflow/contrib/eager/python/examples:examples_pip",
-            "//tensorflow/contrib/eager/python:checkpointable_utils",
-            "//tensorflow/contrib/eager/python:evaluator",
-            "//tensorflow/contrib/gan:gan",
-            "//tensorflow/contrib/graph_editor:graph_editor_pip",
-            "//tensorflow/contrib/keras:keras",
-            "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
             "//tensorflow/contrib/lite/python:interpreter_test_data",
             "//tensorflow/contrib/lite/python:tf_lite_py_pip",
             "//tensorflow/contrib/lite/toco:toco",
             "//tensorflow/contrib/lite/toco/python:toco_wrapper",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-            "//tensorflow/contrib/nn:nn_py",
-            "//tensorflow/contrib/predictor:predictor_pip",
-            "//tensorflow/contrib/autograph:autograph",
-            "//tensorflow/contrib/autograph/converters:converters",
-            "//tensorflow/contrib/autograph/converters:test_lib",
-            "//tensorflow/contrib/autograph/impl:impl",
-            "//tensorflow/contrib/autograph/pyct:pyct",
-            "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
-            "//tensorflow/contrib/receptive_field:receptive_field_pip",
-            "//tensorflow/contrib/session_bundle:session_bundle_pip",
-            "//tensorflow/contrib/signal:signal_py",
-            "//tensorflow/contrib/signal:test_util",
-            "//tensorflow/contrib/slim:slim",
-            "//tensorflow/contrib/slim/python/slim/data:data_pip",
-            "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-            "//tensorflow/contrib/specs:specs",
-            "//tensorflow/contrib/summary:summary_test_util",
-            "//tensorflow/contrib/tensor_forest:init_py",
-            "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-            "//tensorflow/contrib/timeseries:timeseries_pip",
-            "//tensorflow/contrib/tpu",
-            "//tensorflow/examples/tutorials/mnist:package",
-            "//tensorflow/python:distributed_framework_test_lib",
-            "//tensorflow/python:meta_graph_testdata",
-            "//tensorflow/python:spectral_ops_test_util",
-            "//tensorflow/python:util_example_parser_configuration",
-            "//tensorflow/python/debug:debug_pip",
-            "//tensorflow/python/eager:eager_pip",
-            "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
-            "//tensorflow/python/saved_model:saved_model",
-            "//tensorflow/python/tools:tools_pip",
-            "//tensorflow/python:test_ops",
-            "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index fe6b9407d6..5dd27bcda0 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -13,6 +13,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
 load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
+     "def_file_filter_configure")
 
 
 # Sanitize a dependency so that it works correctly from code that includes
@@ -33,6 +35,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
 
+  # For windows bazel build
+  # TODO: Remove def file filter when TensorFlow can export symbols properly on Windows.
+  def_file_filter_configure(name = "local_config_def_file_filter")
+
   # Point //external/local_config_arm_compiler to //external/arm_compiler
   arm_compiler_configure(
       name="local_config_arm_compiler",
-- 
GitLab


From 3027f580046866cb74d5edf4e41c9406e007234c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 2 Apr 2018 21:02:40 -0700
Subject: [PATCH 0181/1262] BUG_FIX: Allow Uniform pdf to work on float64
 inputs. PiperOrigin-RevId: 191391778

---
 .../kernel_tests/distributions/uniform_test.py   | 16 ++++++++++++++++
 tensorflow/python/ops/distributions/uniform.py   |  3 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/distributions/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
index df99a0ed25..a8def95b14 100644
--- a/tensorflow/python/kernel_tests/distributions/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -281,6 +281,22 @@ class UniformTest(test.TestCase):
       expected_pdf = [1.0, 0.1]
       self.assertAllClose(expected_pdf, pdf.eval())
 
+  def testUniformFloat64(self):
+    uniform = uniform_lib.Uniform(
+        low=np.float64(0.), high=np.float64(1.))
+
+    self.assertAllClose(
+        [1., 1.],
+        self.evaluate(uniform.prob(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(
+        [0.5, 0.6],
+        self.evaluate(uniform.cdf(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(0.5, self.evaluate(uniform.mean()))
+    self.assertAllClose(1 / 12., self.evaluate(uniform.variance()))
+    self.assertAllClose(0., self.evaluate(uniform.entropy()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index ec623b55eb..0891bffdd5 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -166,7 +166,8 @@ class Uniform(distribution.Distribution):
     return self.low + self.range() * samples
 
   def _prob(self, x):
-    broadcasted_x = x * array_ops.ones(self.batch_shape_tensor())
+    broadcasted_x = x * array_ops.ones(
+        self.batch_shape_tensor(), dtype=x.dtype)
     return array_ops.where(
         math_ops.is_nan(broadcasted_x),
         broadcasted_x,
-- 
GitLab


From 53eeeb7ac4a876a59ae975a8d6dd8a48f645b7b7 Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Mon, 2 Apr 2018 21:03:06 -0700
Subject: [PATCH 0182/1262] Re-enable Gather and Slice estimators with output
 size check.

PiperOrigin-RevId: 191391805
---
 .../grappler/costs/op_level_cost_estimator.cc | 10 ++-
 .../costs/op_level_cost_estimator_test.cc     | 75 +++++++++++--------
 2 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 75258d0547..14e46ecdd9 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -202,12 +202,9 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
 
-      // TODO(76227186): re-enable with output size check & test
-      /*
       {kGather, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
       {kGatherV2, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
       {kSlice, wrap(&OpLevelCostEstimator::PredictGatherOrSlice)},
-      */
 
       {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
@@ -1058,6 +1055,13 @@ Costs OpLevelCostEstimator::PredictGatherOrSlice(
   // part of it. For these op the size of the output determines the memory cost.
   const auto& op_info = op_context.op_info;
 
+  const int inputs_needed = op_info.op() == "Slice" ? 3 : 2;
+  if (op_info.outputs_size() == 0 || op_info.inputs_size() < inputs_needed) {
+    Costs costs = Costs::ZeroCosts();
+    costs.inaccurate = true;
+    return costs;
+  }
+
   bool unknown_shapes = false;
 
   // Each output element is a copy of some element from input.
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 4758bbfee7..d797a8a8c1 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -401,43 +401,56 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
   OpLevelCostEstimator estimator_;
 };
 
-// TODO(76227186): re-enable with output size check & test
-/*
 TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
-OpContext op_context;
-SetCpuDevice(&op_context.op_info);
-op_context.op_info.set_op("Gather");
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
 
-// Huge first input shouldn't affect Gather execution and memory costs.
-DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
-DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
-DescribeArbitraryRankOutput({16, 10}, DT_FLOAT, &op_context.op_info);
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankOutput({16, 10}, DT_FLOAT, &op_context.op_info);
 
-auto cost = estimator_.PredictCosts(op_context);
-EXPECT_EQ(Costs::Duration(130), cost.memory_time);
-EXPECT_EQ(Costs::Duration(16), cost.compute_time);
-EXPECT_EQ(Costs::Duration(146), cost.execution_time);
-EXPECT_FALSE(cost.inaccurate);
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(130), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(16), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(146), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest, TestGatherCostsWithoutOutput) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
+
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_TRUE(cost.inaccurate);
 }
 
 TEST_F(OpLevelCostEstimatorTest, TestSliceCosts) {
-OpContext op_context;
-SetCpuDevice(&op_context.op_info);
-op_context.op_info.set_op("Slice");
-
-// Huge first input shouldn't affect Slice execution and memory costs.
-DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
-DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
-DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
-DescribeArbitraryRankOutput({10, 10}, DT_FLOAT, &op_context.op_info);
-
-auto cost = estimator_.PredictCosts(op_context);
-EXPECT_EQ(Costs::Duration(81), cost.memory_time);
-EXPECT_EQ(Costs::Duration(10), cost.compute_time);
-EXPECT_EQ(Costs::Duration(91), cost.execution_time);
-EXPECT_FALSE(cost.inaccurate);
-}
-*/
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Slice");
+
+  // Huge first input shouldn't affect Slice execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankOutput({10, 10}, DT_FLOAT, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(81), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(10), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(91), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
 
 TEST_F(OpLevelCostEstimatorTest, BiasAddExecutionTime) {
   auto cost = PredictCosts(DescribeBiasAdd(1000, 10));
-- 
GitLab


From 89f86f518dc0317b9e4e7b2932d73209e636fa72 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Mon, 2 Apr 2018 22:24:14 -0700
Subject: [PATCH 0183/1262] Clarify OpDef.is_stateful flag definition

PiperOrigin-RevId: 191396824
---
 tensorflow/core/framework/op_def.proto | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index ba545a1994..ca0e5e7133 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -126,6 +126,12 @@ message OpDef {
   // -------------------------------------------------------------------------
   // Optimization constraints.
 
+  // Ops are marked as stateful if their behavior depends on some state beyond
+  // their input tensors (e.g. variable reading op) or if they have
+  // a side-effect (e.g. printing or asserting ops). Equivalently, stateless ops
+  // must always produce the same output for the same input and have
+  // no side-effects.
+  //
   // By default Ops may be moved between devices.  Stateful ops should
   // either not be moved, or should only be moved if that state can also
   // be moved (e.g. via some sort of save / restore).
-- 
GitLab


From 4e6d0fa3969f93415175b1bf85a0068da9210c45 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Mon, 2 Apr 2018 23:44:24 -0700
Subject: [PATCH 0184/1262] Automated g4 rollback of changelist 191385909

PiperOrigin-RevId: 191401933
---
 tensorflow/core/grappler/optimizers/function_optimizer.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 6cdc51f1bd..2a6b8a325f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -286,6 +286,12 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (func.attr().count("_noinline") != 0) {
       continue;
     }
+    // Don't touch anything marked XLA to prevent XLA failures further down the
+    // road.
+    if (func.attr().count("_XlaCompile") > 0 &&
+        func.attr().at("_XlaCompile").b()) {
+      continue;
+    }
     // Can't create IdentityN nodes with no input or output: skip these
     // functions for now.
     if (func.signature().input_arg_size() == 0 ||
-- 
GitLab


From 7dbb2b52d67ad584e7bb85d61e82c4a136ec5de0 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Tue, 3 Apr 2018 01:01:16 -0700
Subject: [PATCH 0185/1262] Remove "-lpthread" when building tests on macos. In
 most cases it seems to be not used.

PiperOrigin-RevId: 191407383
---
 tensorflow/tensorflow.bzl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index e9d2f279cd..098ae7e6e3 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -638,9 +638,12 @@ def tf_cc_test(name,
       linkopts=select({
         clean_dep("//tensorflow:android"): [
             "-pie",
-          ],
+        ],
         clean_dep("//tensorflow:windows"): [],
         clean_dep("//tensorflow:windows_msvc"): [],
+        clean_dep("//tensorflow:darwin"): [
+            "-lm",
+        ],
         "//conditions:default": [
             "-lpthread",
             "-lm"
-- 
GitLab


From 03afed33b2f1e9edc8890920b2f8bcdae7db6de3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Tue, 3 Apr 2018 18:09:46 +0800
Subject: [PATCH 0186/1262] CLN: fix wrong hanging indentation

---
 tensorflow/contrib/opt/python/training/adamax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index ba9e79be99..4692f88349 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -186,6 +186,6 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
       beta1_power = self._get_beta_accumulators()
       with ops.colocate_with(beta1_power):
         update_beta1 = beta1_power.assign(
-          beta1_power * self._beta1_t, use_locking=self._use_locking)
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
     return control_flow_ops.group(*update_ops + [update_beta1],
                                   name=name_scope)
-- 
GitLab


From c3c3fb62f34213f96a6c9bb4174240168d8b5873 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Tue, 3 Apr 2018 18:10:18 +0800
Subject: [PATCH 0187/1262] CLN: add deps: egaer:context

---
 tensorflow/contrib/opt/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index aaf0012808..39a86dbd71 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -44,6 +44,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
-- 
GitLab


From 62f61fbdb992eb6257f9002c7580373f8c08e758 Mon Sep 17 00:00:00 2001
From: Silver Chan <chenchuanyinuestc@gmail.com>
Date: Tue, 3 Apr 2018 19:05:06 +0800
Subject: [PATCH 0188/1262] fix problem: ld libgrpc symbols for MacOS

fix problem: ld libgrpc symbols for MacOS when cmake build
---
 tensorflow/contrib/cmake/external/grpc.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index abfc69243e..1fefb731a7 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -35,6 +35,7 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
-- 
GitLab


From c3821d7f4f72dd6f03942b776a2887c44e12710b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 05:52:03 -0700
Subject: [PATCH 0189/1262] Update LLVM API usage to match upstream change.

PiperOrigin-RevId: 191428965
---
 tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc | 1 -
 tensorflow/compiler/xla/service/cpu/simple_orc_jit.h  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 80c24eaccf..4198260a22 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -87,7 +87,6 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      execution_session_(string_pool_),
       symbol_resolver_(llvm::orc::createLegacyLookupResolver(
           [this](const std::string& name) -> llvm::JITSymbol {
             return this->ResolveRuntimeSymbol(name);
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index aaeff2de87..f4260a95bc 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -102,7 +102,6 @@ class SimpleOrcJIT {
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
-  llvm::orc::SymbolStringPool string_pool_;
   llvm::orc::ExecutionSession execution_session_;
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
-- 
GitLab


From b036452ca347b7ab809711f71041d0457b0062d0 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Tue, 3 Apr 2018 07:28:40 -0700
Subject: [PATCH 0190/1262] [TF:XLA] Bump open source llvm revision to r329057

DataTypes.h is no longer a generated header. X86DisassemblerDecoderCommon.h is now part of :support.

PiperOrigin-RevId: 191438031
---
 tensorflow/workspace.bzl    |  8 ++++----
 third_party/llvm/llvm.BUILD | 12 +-----------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5dd27bcda0..0bb297e72e 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -460,11 +460,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/1c3cdea2f181d8e14ee184466c5fb237f1b4cda8.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/1c3cdea2f181d8e14ee184466c5fb237f1b4cda8.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
       ],
-      sha256 = "1efbb9b05af88368be984d2f6526061d4a857181ef10f8841889a3a46869bb01",
-      strip_prefix = "llvm-1c3cdea2f181d8e14ee184466c5fb237f1b4cda8",
+      sha256 = "a6d94bd9de23515a1e3792a830421e3885977ea43d03427cdbe68f98cb7e0045",
+      strip_prefix = "llvm-7e78daafdd22f3f17720a103d29d89590534004e",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 28293a3659..075b46896e 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -162,13 +162,6 @@ all_cmake_vars = select({
 })
 
 # Performs CMake variable substitutions on configuration header files.
-expand_cmake_vars(
-    name = "datatypes_gen",
-    src = "include/llvm/Support/DataTypes.h.cmake",
-    cmake_vars = all_cmake_vars,
-    dst = "include/llvm/Support/DataTypes.h",
-)
-
 expand_cmake_vars(
     name = "config_gen",
     src = "include/llvm/Config/config.h.cmake",
@@ -305,9 +298,7 @@ cc_binary(
     srcs = glob([
         "utils/TableGen/*.cpp",
         "utils/TableGen/*.h",
-    ]) + [
-        "lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h",
-    ],
+    ]),
     linkopts = [
         "-lm",
         "-ldl",
@@ -2014,7 +2005,6 @@ cc_library(
         "include/llvm/Support/WasmRelocs/*.def",
     ]) + [
         "include/llvm/BinaryFormat/MachO.def",
-        "include/llvm/Support/DataTypes.h",
         "include/llvm/Support/VCSRevision.h",
         "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
     ],
-- 
GitLab


From caf6706067718e5b8856036893977085b3b18fcb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 08:54:01 -0700
Subject: [PATCH 0191/1262] Changes loss_reduction default to
 SUM_OVER_BATCH_SIZE for regression_head and poisson_regression_head.

PiperOrigin-RevId: 191446787
---
 tensorflow/contrib/estimator/BUILD                 |  3 +++
 .../python/estimator/dnn_linear_combined_test.py   |  9 +++++++--
 .../contrib/estimator/python/estimator/dnn_test.py |  5 ++++-
 .../contrib/estimator/python/estimator/head.py     | 12 ++++++++----
 .../estimator/python/estimator/head_test.py        |  4 ++--
 .../estimator/python/estimator/linear_test.py      |  5 ++++-
 .../estimator/python/estimator/multi_head_test.py  | 14 +++++++-------
 7 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 2be62c9438..bec0329ebb 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -89,6 +89,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -129,6 +130,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -266,6 +268,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
index b5e4d34dc7..dd009a6753 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -52,7 +53,9 @@ def _dnn_only_estimator_fn(
     config=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       dnn_feature_columns=feature_columns,
       dnn_optimizer=optimizer,
@@ -100,7 +103,9 @@ def _linear_only_estimator_fn(
     partitioner=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       linear_feature_columns=feature_columns,
       linear_optimizer=optimizer,
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
index 71f810acec..75e3107670 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -41,7 +42,9 @@ def _dnn_estimator_fn(weight_column=None, label_dimension=1, *args, **kwargs):
   """Returns a DNNEstimator that uses regression_head."""
   return dnn.DNNEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 74da2cbb3f..85ef3291ba 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -178,7 +178,7 @@ def binary_classification_head(
 
 def regression_head(weight_column=None,
                     label_dimension=1,
-                    loss_reduction=losses.Reduction.SUM,
+                    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                     loss_fn=None,
                     inverse_link_fn=None,
                     name=None):
@@ -218,7 +218,9 @@ def regression_head(weight_column=None,
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
     loss_fn: Optional loss function. Defaults to `mean_squared_error`.
     inverse_link_fn: Optional inverse link function, also known as 'mean
       function'. Defaults to identity.
@@ -243,7 +245,7 @@ def regression_head(weight_column=None,
 def poisson_regression_head(
     weight_column=None,
     label_dimension=1,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     compute_full_loss=True,
     name=None):
   """Creates a `_Head` for poisson regression using `tf.nn.log_poisson_loss`.
@@ -275,7 +277,9 @@ def poisson_regression_head(
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
     compute_full_loss: Whether to include the constant `log(z!)` term in
       computing the poisson loss. See `tf.nn.log_poisson_loss` for the full
       documentation.
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 8837dfdc6c..98962ca427 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -1162,8 +1162,8 @@ class PoissonRegressionHead(test.TestCase):
     #         exp(-1) - 2 * (-1) + 2*ln(2) - 2 + 0.5*ln(2*pi*2),
     #         exp(1) - 3 * 1 + 3*ln(3) - 3 + 0.5*ln(2*pi*3)]
     #      = [1.0, 3.020, 1.482]
-    # sum_loss = 5.502
-    expected_loss = 5.502
+    # training_loss = (1.0 + 3.020 + 1.482) / 3
+    expected_loss = 1.834
     atol = 0.001
     expected_train_result = b'my_train_op'
     def _train_op_fn(loss):
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
index c63514eb68..c41996b9c6 100644
--- a/tensorflow/contrib/estimator/python/estimator/linear_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/linear_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -42,7 +43,9 @@ def _linear_estimator_fn(
   """Returns a LinearEstimator that uses regression_head."""
   return linear.LinearEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 74d3d6d728..d9e5aca295 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -483,14 +483,14 @@ class MultiHeadTest(test.TestCase):
                            [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
     }
     # Loss for the first head:
-    # loss1 = (1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
-    #         (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2
-    #       = 28
+    # loss1 = ((1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
+    #          (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2) / 8
+    #       = 3.5
     # Loss for the second head:
-    # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
-    #         (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
-    #       = 74
-    expected_training_loss = 28. + 74.
+    # loss2 = ((0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
+    #          (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2) / 12
+    #       = 6.167
+    expected_training_loss = 3.5 + 6.167
 
     training_loss = multi_head.create_loss(
         features={},
-- 
GitLab


From 5d1086ae98ccfe691161ff50c93036d432866741 Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Tue, 3 Apr 2018 08:59:08 -0700
Subject: [PATCH 0192/1262] cholesky_solve_with_broadcast,
 matrix_solve_with_broadcast and matrix_triangular_solve_with_broadcast added
 to linear_operator_util.py

PiperOrigin-RevId: 191447378
---
 .../linalg/linear_operator_util_test.py       | 136 ++++++++++++++++--
 .../python/ops/linalg/linear_operator_util.py |  85 +++++++++--
 2 files changed, 202 insertions(+), 19 deletions(-)

diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index e1edffc3d9..7b291e29de 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -94,8 +95,8 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 class BroadcastMatrixBatchDimsTest(test.TestCase):
 
   def test_zero_batch_matrices_returned_as_empty_list(self):
-    self.assertAllEqual(
-        [], linear_operator_util.broadcast_matrix_batch_dims([]))
+    self.assertAllEqual([],
+                        linear_operator_util.broadcast_matrix_batch_dims([]))
 
   def test_one_batch_matrix_returned_after_tensor_conversion(self):
     arr = rng.rand(2, 3, 4)
@@ -194,6 +195,44 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
       linear_operator_util.broadcast_matrix_batch_dims([y, x])
 
 
+class CholeskySolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    chol = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    chol = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    chol_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.cholesky_solve_with_broadcast(
+                  chol_ph, rhs_ph),
+              linalg_ops.cholesky_solve(chol_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              chol_ph: chol,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
 class MatmulWithBroadcastTest(test.TestCase):
 
   def test_static_dims_broadcast(self):
@@ -209,7 +248,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       expected = math_ops.matmul(x, y_broadcast)
       self.assertAllEqual(expected.eval(), result.eval())
 
-  def test_dynamic_dims_broadcast_32bit(self):
+  def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
     x = rng.rand(2, 1, 3)
@@ -221,9 +260,90 @@ class MatmulWithBroadcastTest(test.TestCase):
 
     with self.test_session() as sess:
       result, expected = sess.run(
-          [linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
-           math_ops.matmul(x, y_broadcast)],
-          feed_dict={x_ph: x, y_ph: y})
+          [
+              linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
+              math_ops.matmul(x, y_broadcast)
+          ],
+          feed_dict={
+              x_ph: x,
+              y_ph: y
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_solve(matrix_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_triangular_solve_with_broadcast(
+          matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_triangular_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
       self.assertAllEqual(expected, result)
 
 
@@ -244,7 +364,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       # Should not raise
       linear_operator_util.assert_compatible_matrix_dimensions(
-          operator, x).run()
+          operator, x).run()  # pyformat: disable
 
   def test_incompatible_dimensions_raise(self):
     with self.test_session():
@@ -252,7 +372,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       with self.assertRaisesOpError("Incompatible matrix dimensions"):
         linear_operator_util.assert_compatible_matrix_dimensions(
-            operator, x).run()
+            operator, x).run()  # pyformat: disable
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 427bd1e890..9dd40765c2 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -102,6 +103,22 @@ def assert_is_batch_matrix(tensor):
         "%s" % tensor)
 
 
+def shape_tensor(shape, name=None):
+  """Convert Tensor using default type, unless empty list or tuple."""
+  # Works just like random_ops._ShapeTensor.
+  if isinstance(shape, (tuple, list)) and not shape:
+    dtype = dtypes.int32
+  else:
+    dtype = None
+  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+
+
+################################################################################
+# Broadcasting versions of common linear algebra functions.
+# TODO(b/77519145) Do this more efficiently in some special cases.
+################################################################################
+
+
 def broadcast_matrix_batch_dims(batch_matrices, name=None):
   """Broadcast leading dimensions of zero or more [batch] matrices.
 
@@ -170,7 +187,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = batch_matrices[0].get_shape()[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_static_shape(
-          bcast_batch_shape, mat.get_shape()[:-2])
+          bcast_batch_shape,
+          mat.get_shape()[:-2])
     if bcast_batch_shape.is_fully_defined():
       # The [1, 1] at the end will broadcast with anything.
       bcast_shape = bcast_batch_shape.concatenate([1, 1])
@@ -183,7 +201,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = array_ops.shape(batch_matrices[0])[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_dynamic_shape(
-          bcast_batch_shape, array_ops.shape(mat)[:-2])
+          bcast_batch_shape,
+          array_ops.shape(mat)[:-2])
     bcast_shape = array_ops.concat([bcast_batch_shape, [1, 1]], axis=0)
     for i, mat in enumerate(batch_matrices):
       batch_matrices[i] = _broadcast_to_shape(mat, bcast_shape)
@@ -195,6 +214,13 @@ def _broadcast_to_shape(x, shape):
   return x + array_ops.zeros(shape=shape, dtype=x.dtype)
 
 
+def cholesky_solve_with_broadcast(chol, rhs, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "CholeskySolveWithBroadcast", [chol, rhs]):
+    chol, rhs = broadcast_matrix_batch_dims([chol, rhs])
+    return linalg_ops.cholesky_solve(chol, rhs)
+
+
 def matmul_with_broadcast(a,
                           b,
                           transpose_a=False,
@@ -206,6 +232,11 @@ def matmul_with_broadcast(a,
                           name=None):
   """Multiplies matrix `a` by matrix `b`, producing `a @ b`.
 
+  Works identically to `tf.matmul`, but broadcasts batch dims
+  of `a` and `b` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
   The inputs must be matrices (or tensors of rank > 2, representing batches of
   matrices).
 
@@ -276,7 +307,7 @@ def matmul_with_broadcast(a,
     ValueError: If transpose_a and adjoint_a, or transpose_b and adjoint_b
       are both set to True.
   """
-  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]) as name:
+  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]):
     a, b = broadcast_matrix_batch_dims([a, b])
     return math_ops.matmul(
         a,
@@ -289,11 +320,43 @@ def matmul_with_broadcast(a,
         b_is_sparse=b_is_sparse)
 
 
-def shape_tensor(shape, name=None):
-  """Convert Tensor using default type, unless empty list or tuple."""
-  # Works just like random_ops._ShapeTensor.
-  if isinstance(shape, (tuple, list)) and not shape:
-    dtype = dtypes.int32
-  else:
-    dtype = None
-  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+def matrix_solve_with_broadcast(matrix, rhs, adjoint=False, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "MatrixSolveWithBroadcast", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
+
+
+def matrix_triangular_solve_with_broadcast(matrix,
+                                           rhs,
+                                           lower=True,
+                                           adjoint=False,
+                                           name=None):
+  """Solves triangular systems of linear equations with by backsubstitution.
+
+  Works identically to `tf.matrix_triangular_solve`, but broadcasts batch dims
+  of `matrix` and `rhs` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
+  Args:
+    matrix: A Tensor. Must be one of the following types:
+      `float64`, `float32`, `complex64`, `complex128`. Shape is `[..., M, M]`.
+    rhs: A `Tensor`. Must have the same `dtype` as `matrix`.
+      Shape is `[..., M, K]`.
+    lower: An optional `bool`. Defaults to `True`. Indicates whether the
+      innermost matrices in `matrix` are lower or upper triangular.
+    adjoint: An optional `bool`. Defaults to `False`. Indicates whether to solve
+      with matrix or its (block-wise) adjoint.
+    name: A name for the operation (optional).
+
+  Returns:
+    `Tensor` with same `dtype` as `matrix` and shape `[..., M, K]`.
+  """
+  with ops.name_scope(name, "MatrixTriangularSolve", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_triangular_solve(
+        matrix,
+        rhs,
+        lower=lower,
+        adjoint=adjoint)
-- 
GitLab


From 4657e5336160e019379c373a369e3a9b199bc680 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 3 Apr 2018 09:09:13 -0700
Subject: [PATCH 0193/1262] Make batch_sequences_with_states_test.py work with
 the C API enabled, take 2.

It turns out the error can depend on what sequence comes first in the
input dict. This change internally sorts the input to make the error
predictable (this is useful for this test, as well as any users who
may run into this).

PiperOrigin-RevId: 191449214
---
 .../training/batch_sequences_with_states_test.py  | 15 ++++++---------
 .../training/sequence_queueing_state_saver.py     |  5 +++--
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 16c260edb0..f305197c19 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -517,6 +518,7 @@ class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
     ops._USE_C_API = self._prev_value
 
 
+@test_util.with_c_api
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
@@ -526,15 +528,10 @@ class PaddingTest(test.TestCase):
           "key_2": constant_op.constant([1.5, 2.5])  # length 2
       }
 
-      if ops._USE_C_API:
-        with self.assertRaisesRegexp(
-            ValueError, "Fill dimensions must be >= 0"):
-          _, padded_seq = sqss._padding(sequences, 2)
-      else:
-        _, padded_seq = sqss._padding(sequences, 2)
-        with self.assertRaisesOpError(
-            ".*All sequence lengths must match, but received lengths.*"):
-          padded_seq["key_1"].eval()
+      _, padded_seq = sqss._padding(sequences, 2)
+      with self.assertRaisesOpError(
+          ".*All sequence lengths must match, but received lengths.*"):
+        padded_seq["key_1"].eval()
 
   def testPadding(self):
     with ops.Graph().as_default() as g, self.test_session(graph=g):
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 7223194885..99d486b183 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -1574,8 +1574,9 @@ def _padding(sequences, num_unroll):
   if not sequences:
     return 0, {}
 
-  sequences_dict = {}
-  for key, value in sequences.items():
+  # Sort 'sequences_dict' so 'length' will have a predictable value below.
+  sequences_dict = collections.OrderedDict()
+  for key, value in sorted(sequences.items()):
     if not (isinstance(value, sparse_tensor.SparseTensor) or
             isinstance(value, sparse_tensor.SparseTensorValue)):
       sequences_dict[key] = ops.convert_to_tensor(value)
-- 
GitLab


From 9e1be727f1427284df4dda77f47a686cac07d098 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Wed, 4 Apr 2018 01:33:08 +0900
Subject: [PATCH 0194/1262] add functional_ops to BUILD

---
 tensorflow/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 3cbeb34c54..8b65b3f057 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1916,6 +1916,7 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
+        ":functional_ops",
         ":linalg_ops_gen",
         ":math_ops",
         "//third_party/py/numpy",
-- 
GitLab


From 27c762c336bb11c8f74694e3d3ea5c8c47a28003 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Tue, 3 Apr 2018 09:47:18 -0700
Subject: [PATCH 0195/1262] Enable C++ warnings on a few targets.

PiperOrigin-RevId: 191454435
---
 tensorflow/contrib/lite/arena_planner.h   | 2 +-
 tensorflow/contrib/lite/nnapi_delegate.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index f84b3dad95..e9d0fbc5a9 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -25,7 +25,7 @@ limitations under the License.
 
 namespace tflite {
 
-class AllocationInfo;
+struct AllocationInfo;
 
 // A memory planner that makes all the allocations using arenas.
 //
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index decaf9f160..bc13444dc7 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -162,7 +162,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     };
 
     auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &augmented_inputs, &next_id](int tensor_id) {
+        [interpreter, &nn_model, &augmented_inputs](int tensor_id) {
           const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
           CHECK_NN(ANeuralNetworksModel_setOperandValue(
               nn_model, tensor_id, tensor->data.raw, tensor->bytes));
-- 
GitLab


From cf8c504688c5f5813c8772eb107ed3d4a1385888 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 10:00:00 -0700
Subject: [PATCH 0196/1262] Bug Fix: If num_uses > 0 the the inputs tensor need
 not be a list but can be reshaped to [batch_size*num_uses, input_size].
 `num_uses` should be incremented by one in this case.'

PiperOrigin-RevId: 191456184
---
 .../kfac/python/ops/layer_collection.py       | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 586a004f88..19608aca47 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -990,9 +990,11 @@ class LayerCollection(object):
                                                    num_uses=num_uses),
                                 reuse=reuse)
     block.register_additional_tower(inputs, outputs)
-
-    assert len(inputs) == len(outputs)
-    self._add_uses(params, len(inputs))
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   def register_conv2d_multi(self,
                             params,
@@ -1066,9 +1068,11 @@ class LayerCollection(object):
         reuse=reuse)
 
     block.register_additional_tower(inputs, outputs)
-
-    assert len(inputs) == len(outputs)
-    self._add_uses(params, len(inputs))
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   # TODO(b/74108452): change the loss registration functions names to refer
   # to "loss functions" instead of distributions.  Following naming convention
@@ -1088,7 +1092,7 @@ class LayerCollection(object):
       inputs: A list of Tensors, each of shape [batch_size, input_size] and
         dtype int32. Indices into embedding matrix. The list indexes each use
         in the graph (which might correspond to a "time-step" in an RNN).
-        OR, can be single Tensor, of shape [num_uses, batch_size, input_size],
+        OR, can be single Tensor, of shape [num_uses*batch_size, input_size],
         which is a reshaped version of a Tensor of shape [num_uses, batch_size,
         input_size].
       outputs: A list of Tensors, each of shape [batch_size, embedding_size].
@@ -1129,7 +1133,10 @@ class LayerCollection(object):
         params, block_type(self, vocab_size, num_uses=num_uses), reuse=reuse)
     block.register_additional_tower(inputs, outputs)
 
-    self._add_uses(params, len(inputs))
+    if isinstance(inputs, (tuple, list)):
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   def register_categorical_predictive_distribution(self,
                                                    logits,
-- 
GitLab


From cfc886ac6064a04c71dd6c52e8c21ebec91eae50 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 3 Apr 2018 10:00:02 -0700
Subject: [PATCH 0197/1262] [tf.data] Fix handling of nested structures in
 `tf.contrib.data.prefetch_to_device()`.

PiperOrigin-RevId: 191456191
---
 .../kernel_tests/prefetching_ops_test.py      | 32 +++++++++++++++++++
 .../data/python/ops/prefetching_ops.py        | 17 +++++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 676959a900..f2c57f92e2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -231,6 +231,37 @@ class StagingAreaOpsTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchDictToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element["a"].dtype)
+    self.assertEqual([], next_element["a"].shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual({"a": i}, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -248,5 +279,6 @@ class StagingAreaOpsTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 98651bb568..554bfaa2cf 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -28,6 +28,7 @@ from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -77,10 +78,24 @@ class _PrefetchToDeviceIterator(object):
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
           handle, input_iterator.output_types, input_iterator.output_shapes,
           input_iterator.output_classes)
-      return remote_iterator.get_next()
+      ret = remote_iterator.get_next()
+
+      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      ret = nest.pack_sequence_as(ret, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+      ])
+
+      # Serialize any sparse tensors and convert result to tensors.
+      ret = nest.pack_sequence_as(ret, [
+          ops.convert_to_tensor(t)
+          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
+      ])
+      return nest.flatten(ret)
 
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
-- 
GitLab


From 655b2663e7d2609b0f578b9ef3c1401de22dc5c2 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 3 Apr 2018 10:04:47 -0700
Subject: [PATCH 0198/1262] Apply "Raise exception in SWIG on bad TF_Status" to
 base.i

Minor fixes to make this work.

PiperOrigin-RevId: 191457070
---
 tensorflow/python/client/tf_session.i      | 26 ----------
 tensorflow/python/eager/backprop.py        | 11 ++---
 tensorflow/python/eager/context.py         | 56 +++++++++-------------
 tensorflow/python/eager/imperative_grad.py |  6 +--
 tensorflow/python/grappler/item.py         |  4 +-
 tensorflow/python/lib/io/tf_record.py      |  3 +-
 tensorflow/python/platform/base.i          | 22 +++++++++
 tensorflow/python/pywrap_tfe.i             |  2 +
 8 files changed, 54 insertions(+), 76 deletions(-)

diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index ee76e29c05..68768f2b4c 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/python/client/tf_session_helper.h"
-#include "tensorflow/python/lib/core/py_exception_registry.h"
 
 // Helper function to convert a Python list of Tensors to a C++ vector of
 // TF_Outputs.
@@ -353,27 +352,6 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
       reinterpret_cast<const char*>($1.data), $1.length);
 }
 
-// Typemaps to automatically raise a Python exception from bad output TF_Status.
-// TODO(b/77295559): expand this to all TF_Status* output params and deprecate
-// raise_exception_on_not_ok_status (currently it only affects the C API).
-%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
-  status = TF_NewStatus();
-  $1 = status;
-}
-
-%typemap(argout) TF_Status* status {
-  TF_Code code = TF_GetCode($1);
-  if (code != TF_OK) {
-    PyObject* exc = tensorflow::PyExceptionRegistry::Lookup(code);
-    // Arguments to OpError.
-    PyObject* exc_args = Py_BuildValue("sss", nullptr, nullptr, TF_Message($1));
-    TF_DeleteStatus($1);
-    SWIG_SetErrorObj(exc, exc_args);
-    SWIG_fail;
-  }
-  TF_DeleteStatus($1);
-}
-
 // Converts input Python list of wrapped TF_Outputs into a single array
 %typemap(in) (const TF_Output* inputs, int num_inputs)
     (std::vector<TF_Output> inputs) {
@@ -784,7 +762,3 @@ def TF_Reset(target, containers=None, config=None):
 %include "tensorflow/python/client/tf_session_helper.h"
 
 %unignoreall
-
-// Clear "TF_Status* status" typemap so it doesn't affect other modules and
-// unexpectedly remove the TF_Status* argument from wrappers.
-%clear TF_Status* status;
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 209b012621..92774d4d50 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -31,7 +31,6 @@ from tensorflow.python.eager import imperative_grad
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -50,12 +49,10 @@ def op_attr_type(op_type, attr_name):
   try:
     return _op_attr_type_cache[(op_type, attr_name)]
   except KeyError:
-    with errors.raise_exception_on_not_ok_status() as status:
-      h = context.context()._handle  # pylint: disable=protected-access
-      attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(
-          h, op_type, attr_name, status)
-    _op_attr_type_cache[(op_type, attr_name)] = attr_type
-    return attr_type
+    h = context.context()._handle  # pylint: disable=protected-access
+    attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(h, op_type, attr_name)
+  _op_attr_type_cache[(op_type, attr_name)] = attr_type
+  return attr_type
 
 
 def make_attr(attr_type, value):
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 99ec895b54..9e146f021e 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -28,7 +28,6 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
@@ -224,24 +223,21 @@ class Context(object):
       assert self._context_devices is None
       opts = pywrap_tensorflow.TFE_NewContextOptions()
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._config is not None:
-            config_str = self._config.SerializeToString()
-            pywrap_tensorflow.TFE_ContextOptionsSetConfig(
-                opts, config_str, len(config_str), status)
-          if self._device_policy is not None:
-            pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
-                opts, self._device_policy)
-          if self._execution_mode == ASYNC:
-            pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
-          self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
+        if self._config is not None:
+          config_str = self._config.SerializeToString()
+          pywrap_tensorflow.TFE_ContextOptionsSetConfig(opts, config_str)
+        if self._device_policy is not None:
+          pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
+              opts, self._device_policy)
+        if self._execution_mode == ASYNC:
+          pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
+        self._context_handle = pywrap_tensorflow.TFE_NewContext(opts)
       finally:
         pywrap_tensorflow.TFE_DeleteContextOptions(opts)
       # Store list of devices
       self._context_devices = []
-      with errors.raise_exception_on_not_ok_status() as status:
-        device_list = pywrap_tensorflow.TFE_ContextListDevices(
-            self._context_handle, status)
+      device_list = pywrap_tensorflow.TFE_ContextListDevices(
+          self._context_handle)
       try:
         self._num_gpus = 0
         for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
@@ -412,9 +408,7 @@ class Context(object):
     if mode is None:
       mode = SYNC
     self._eager_context.execution_mode = mode
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle,
-                                                     mode == ASYNC, status)
+    pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle, mode == ASYNC)
 
   @tf_contextlib.contextmanager
   def execution_mode(self, mode):
@@ -428,8 +422,7 @@ class Context(object):
 
   def async_wait(self):
     """Waits for ops dispatched in ASYNC mode to finish."""
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAsyncWait(self._handle, status)
+    pywrap_tensorflow.TFE_ContextAsyncWait(self._handle)
 
   def async_clear_error(self):
     """Clears errors raised during ASYNC execution."""
@@ -449,11 +442,9 @@ class Context(object):
     Args:
       fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunction(
-          self._handle,  # pylint: disable=protected-access
-          fn,
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunction(
+        self._handle,  # pylint: disable=protected-access
+        fn)
 
   def add_function_def(self, fdef):
     """Add a function definition to the context.
@@ -465,12 +456,10 @@ class Context(object):
       fdef: A FunctionDef protocol buffer message.
     """
     fdef_string = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunctionDef(
-          self._handle,  # pylint: disable=protected-access
-          fdef_string,
-          len(fdef_string),
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunctionDef(
+        self._handle,  # pylint: disable=protected-access
+        fdef_string,
+        len(fdef_string))
 
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
@@ -545,9 +534,8 @@ class Context(object):
     if not self._context_handle:
       return None
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TFE_ContextExportRunMetadata(
-            self._context_handle, buffer_, status)
+      pywrap_tensorflow.TFE_ContextExportRunMetadata(
+          self._context_handle, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     run_metadata = config_pb2.RunMetadata()
     run_metadata.ParseFromString(compat.as_bytes(proto_data))
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 837cad974a..000152855d 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import errors
 
 
 VSpace = collections.namedtuple(
@@ -60,6 +59,5 @@ def imperative_grad(
      or if only non-differentiable functions of the source were used in the
      computation of target.
   """
-  with errors.raise_exception_on_not_ok_status() as status:
-    return pywrap_tensorflow.TFE_Py_TapeGradient(
-        tape._tape, vspace, target, sources, output_gradients, status)  # pylint: disable=protected-access
+  return pywrap_tensorflow.TFE_Py_TapeGradient(
+      tape._tape, vspace, target, sources, output_gradients)  # pylint: disable=protected-access
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index 4a083849bd..1748efdd13 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -51,9 +51,7 @@ class Item(object):
     self._BuildTFItem()
 
   def IdentifyImportantOps(self, sort_topologically=False):
-    with errors.raise_exception_on_not_ok_status() as status:
-      return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically,
-                                             status)
+    return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically)
 
   def GetOpProperties(self):
     ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 6fcf9c91d8..bf2d6f68b5 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -78,8 +78,7 @@ def tf_record_iterator(path, options=None):
   try:
     while True:
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          reader.GetNext(status)
+        reader.GetNext()
       except errors.OutOfRangeError:
         break
       yield reader.record()
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index dbefca2be9..478dd46f7e 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -229,3 +229,25 @@ _COPY_TYPEMAPS(unsigned int, mode_t);
 %define final %enddef
 %define override %enddef
 #endif
+
+// Typemaps to automatically raise a Python exception from bad output TF_Status.
+// TODO(b/77295559): expand this to all TF_Status* output params and deprecate
+// raise_exception_on_not_ok_status (currently it only affects the C API).
+%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
+  $1 = TF_NewStatus();
+}
+
+%typemap(freearg) (TF_Status* status) {
+ TF_DeleteStatus($1);
+}
+
+%typemap(argout) TF_Status* status {
+  TF_Code code = TF_GetCode($1);
+  if (code != TF_OK) {
+    PyObject* exc = tensorflow::PyExceptionRegistry::Lookup(code);
+    // Arguments to OpError.
+    PyObject* exc_args = Py_BuildValue("sss", nullptr, nullptr, TF_Message($1));
+    SWIG_SetErrorObj(exc, exc_args);
+    SWIG_fail;
+  }
+}
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 39fabb9c1b..7acb8eeb1a 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+%include "tensorflow/python/platform/base.i"
+
 %ignore "";
 
 %rename("%s") TFE_NewContext;
-- 
GitLab


From 63a3be25482121e34ad3ac399e1ece4b24656318 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 3 Apr 2018 10:06:32 -0700
Subject: [PATCH 0199/1262] Don't call context function either in the fastpath.

Also add a slowpath benchmark for identity.

PiperOrigin-RevId: 191457407
---
 tensorflow/python/eager/benchmarks_test.py     |  3 +++
 tensorflow/python/eager/python_eager_op_gen.cc | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 9ca5041c38..7ad37058fd 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -201,6 +201,9 @@ class MicroBenchmarks(test.Benchmark):
     m = self._m_2
     self._run(lambda: gen_array_ops.identity(m), 30000)
 
+  def benchmark_slowpath_tf_identity(self):
+    self._run(lambda: gen_array_ops.identity(1), 30000)
+
   def benchmark_tfe_py_execute_identity(self):
     m = self._m_2
     ctx_handle = context.context()._handle
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 15d20bdd1a..9afab0077b 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -366,8 +366,8 @@ string GenEagerPythonOp::Code() {
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   // Handle graph-mode case
   strings::StrAppend(&result_,
-                     "  _ctx = _context.context()\n"
-                     "  if not _ctx._eager_context.is_eager:\n",
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
                      function_setup,
                      "    _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
@@ -683,13 +683,14 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
     return true;
   }
 
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix), parameters);
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
   strings::StrAppend(
       &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
   strings::StrAppend(&result_, "  This is for function ", function_name_,
                      "\n  \"\"\"\n");
 
-  strings::StrAppend(&result_, "  _ctx = _context.context()\n");
+  strings::StrAppend(&result_, "  _ctx = ctx if ctx else _context.context()\n");
 
   string function_setup;
   if (!GetEagerFunctionSetup("  ", &function_setup)) {
@@ -755,6 +756,8 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   strings::StrAppend(&result_, "      ", "return _result\n");
 
   // Handle fallback.
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "ctx=_ctx");
   strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
   strings::StrAppend(
       &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
-- 
GitLab


From 1846881d8010d41e7c5a866bd2bb1b01ec220a13 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Tue, 3 Apr 2018 10:17:33 -0700
Subject: [PATCH 0200/1262] Use r1.7 version.

---
 tensorflow/docs_src/get_started/index.md | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index e09d2dcbde..b28cb9df75 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -18,12 +18,10 @@ TensorFlow provides many APIs. The remainder of this section focuses on the
 Estimator API which provide scalable, high-performance models.
 To get started with Estimators begin by reading one of the following documents:
 
-  * @{$get_started/eager} is for machine learning beginners and uses
-    @{$programmers_guide/eager}.
-  * @{$get_started/get_started_for_beginners} is also for machine learning
-    beginners and uses @{$programmers_guide/graphs}.
-  * @{$get_started/premade_estimators} assumes some machine learning background
-    and uses an @{tf.estimator.Estimator$Estimator}.
+  * @{$get_started/get_started_for_beginners}, which is aimed at readers
+    new to machine learning.
+  * @{$get_started/premade_estimators}, which is aimed at readers who have
+    experience in machine learning.
 
 Then, read the following documents, which demonstrate the key features
 in the high-level APIs:
-- 
GitLab


From f005999d571536b859229229e4487cf749d9a786 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 10:32:57 -0700
Subject: [PATCH 0201/1262] Add 8bit strided slice op to tflite

PiperOrigin-RevId: 191461696
---
 .../contrib/lite/kernels/strided_slice.cc     |   5 +
 .../lite/kernels/strided_slice_test.cc        | 113 ++++++++++--------
 2 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index eb374d9031..e6d5c300dc 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -228,6 +228,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, int64_t);
       }
       break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type is currently not supported "
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5c98c5f431..22d7b097cb 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -24,6 +24,8 @@ namespace {
 using ::int32;
 using ::testing::ElementsAreArray;
 
+template <typename input_type = float,
+          TensorType tensor_input_type = TensorType_FLOAT32>
 class StridedSliceOpModel : public SingleOpModel {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
@@ -32,11 +34,11 @@ class StridedSliceOpModel : public SingleOpModel {
                       std::initializer_list<int> strides_shape, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
-    input_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(tensor_input_type);
     begin_ = AddInput(TensorType_INT32);
     end_ = AddInput(TensorType_INT32);
     strides_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
         CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
@@ -45,8 +47,8 @@ class StridedSliceOpModel : public SingleOpModel {
     BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  void SetInput(std::initializer_list<input_type> data) {
+    PopulateTensor<input_type>(input_, data);
   }
   void SetBegin(std::initializer_list<int32> data) {
     PopulateTensor<int32>(begin_, data);
@@ -58,7 +60,9 @@ class StridedSliceOpModel : public SingleOpModel {
     PopulateTensor<int32>(strides_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<input_type> GetOutput() {
+    return ExtractVector<input_type>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  private:
@@ -71,19 +75,19 @@ class StridedSliceOpModel : public SingleOpModel {
 
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
-      StridedSliceOpModel({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
+      StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
       "StridedSlice op only supports 1D-4D input arrays.");
 }
 
 TEST(StridedSliceOpTest, UnssupportedArgs) {
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
                "ellipsis_mask is not implemented yet.");
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
 
 TEST(StridedSliceOpTest, In1D) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -94,7 +98,7 @@ TEST(StridedSliceOpTest, In1D) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutput) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({10});
   m.SetEnd({3});
@@ -104,7 +108,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutput) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({3});
@@ -115,7 +119,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-5});
   m.SetEnd({3});
@@ -126,7 +130,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({-2});
@@ -137,7 +141,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({5});
@@ -148,7 +152,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -159,7 +163,7 @@ TEST(StridedSliceOpTest, In1D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -170,7 +174,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({5});
   m.SetEnd({2});
@@ -181,7 +185,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({-4});
@@ -192,7 +196,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({-5});
@@ -203,7 +207,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EndMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -214,7 +218,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegStride) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({-1});
   m.SetEnd({-4});
@@ -225,7 +229,7 @@ TEST(StridedSliceOpTest, In1D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
-  StridedSliceOpModel m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2});
   m.SetBegin({0});
   m.SetEnd({2});
@@ -236,7 +240,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In1D_OddLenStride2) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({0});
   m.SetEnd({3});
@@ -247,7 +251,7 @@ TEST(StridedSliceOpTest, In1D_OddLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_Identity) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -258,7 +262,7 @@ TEST(StridedSliceOpTest, In2D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In2D) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -269,7 +273,7 @@ TEST(StridedSliceOpTest, In2D) {
 }
 
 TEST(StridedSliceOpTest, In2D_Stride2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -280,7 +284,7 @@ TEST(StridedSliceOpTest, In2D_Stride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStride) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -1});
   m.SetEnd({2, -4});
@@ -291,7 +295,7 @@ TEST(StridedSliceOpTest, In2D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In2D_BeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -302,7 +306,7 @@ TEST(StridedSliceOpTest, In2D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_EndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -313,7 +317,7 @@ TEST(StridedSliceOpTest, In2D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -4});
@@ -324,7 +328,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -3});
@@ -335,7 +339,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
 }
 
 TEST(StridedSliceOpTest, In3D_Identity) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -347,7 +351,7 @@ TEST(StridedSliceOpTest, In3D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In3D_NegStride) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({-1, -1, -1});
   m.SetEnd({-3, -4, -3});
@@ -359,7 +363,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In3D_Strided2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -370,7 +374,7 @@ TEST(StridedSliceOpTest, In3D_Strided2) {
 }
 
 TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -381,7 +385,7 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({1});
@@ -392,7 +396,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -403,7 +407,7 @@ TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -414,7 +418,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -425,7 +429,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -436,7 +440,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -447,7 +451,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -458,7 +462,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -469,7 +473,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -480,7 +484,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -491,7 +495,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -502,7 +506,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -513,7 +517,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -525,7 +529,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
 
 // This tests catches a very subtle bug that was fixed by cl/188403234.
 TEST(StridedSliceOpTest, RunTwice) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
 
   auto setup_inputs = [&m]() {
     m.SetInput({1, 2, 3, 4, 5, 6});
@@ -544,6 +548,17 @@ TEST(StridedSliceOpTest, RunTwice) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
 }
 
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
+  StridedSliceOpModel<uint8, TensorType_UINT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 895566e3af6e3e44eb22d0c2d20b4890e42982fc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 10:35:00 -0700
Subject: [PATCH 0202/1262] Three operators are added that can be used to
 decrease the number of transpose/reshape occurences. These operators are: -
 Swap elementwise - Swap reshape-transpose - Fuse transpose-reshape

Swap elementwise groups operations that operate on value, and move values. This
allows all movement operations (reshape, transpose, etc) to group at one
portion of the graph while value manipulations group on the other end.

Swap reshape/transpose finds cases where the reshape-transpose can be reorderd.
This allows the reshape-transpose-reshape case to be transformed to
transpose-reshape-reshape, which can then merge the reshape-reshape.

Finally the transpose-reshape merge allows cases where, when reshape maintains
the same dimensions and does not affect memory ordering, the transpose and
reshape can be merged into a single transpose operator.

PiperOrigin-RevId: 191462038
---
 tensorflow/contrib/lite/toco/BUILD            |   4 +-
 .../graph_transformations.h                   |   4 +-
 .../merge_reshape_into_preceding_transpose.cc | 190 ++++++++++++++
 .../reorder_activation_functions.cc           | 137 ----------
 .../reorder_elementwise_unary.cc              | 153 +++++++++++
 .../reorder_reshape_transpose.cc              | 248 ++++++++++++++++++
 .../resolve_tensorflow_matmul.cc              |  14 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   4 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |  29 ++
 tensorflow/contrib/lite/toco/tooling_util.h   |  13 +
 10 files changed, 649 insertions(+), 147 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
 delete mode 100644 tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 2dd689ad4c..8a35fb9034 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -234,6 +234,7 @@ cc_library(
         "graph_transformations/identify_relu1.cc",
         "graph_transformations/lstm_utils.cc",
         "graph_transformations/make_initial_dequantize_operator.cc",
+        "graph_transformations/merge_reshape_into_preceding_transpose.cc",
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
@@ -251,7 +252,8 @@ cc_library(
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
-        "graph_transformations/reorder_activation_functions.cc",
+        "graph_transformations/reorder_elementwise_unary.cc",
+        "graph_transformations/reorder_reshape_transpose.cc",
         "graph_transformations/resolve_batch_normalization.cc",
         "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 76ec02aa07..27c5044bb3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -128,6 +128,7 @@ DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
 DECLARE_GRAPH_TRANSFORMATION(SplitLstmCellInputs)
 DECLARE_GRAPH_TRANSFORMATION(MergeLstmCellInputs)
+DECLARE_GRAPH_TRANSFORMATION(MergeReshapeIntoPrecedingTranspose)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyPRelu)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyDilatedConv)
@@ -152,7 +153,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
-DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(ReorderElementwiseUnary)
+DECLARE_GRAPH_TRANSFORMATION(ReorderReshapeTranspose)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
new file mode 100644
index 0000000000..5065004093
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
@@ -0,0 +1,190 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    // Arrays are missing.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Returns whether the reshape could be a transpose.
+std::vector<int32> ReshapeToTranspose(const Model& model,
+                                      const TensorFlowReshapeOperator* op) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  std::vector<int> one_indices;
+  std::vector<int> not_one_indices;
+
+  // Separate into one indices and not one indices.
+  for (int i = 0; i < in_shape.size(); i++) {
+    if (in_shape[i] == 1) {
+      one_indices.push_back(i);
+    } else {
+      not_one_indices.push_back(i);
+    }
+  }
+
+  // Reorder the vertices.
+  std::vector<int> perm;
+  perm.reserve(in_shape.size());
+  int one_index = 0;
+  int not_one_index = 0;
+  for (const auto val : out_shape) {
+    if (val == 1) {
+      perm.push_back(one_indices[one_index]);
+      one_index++;
+    } else {
+      perm.push_back(not_one_indices[not_one_index]);
+      not_one_index++;
+    }
+  }
+
+  return perm;
+}
+
+}  // namespace
+
+// When a transpose is fed into a reshape, it is possible for the two operators
+// to be merged if the reshape does not affect memory ordering and does not
+// affects the number of dimensions. This only occurs when only unary dimensions
+// are shifting position.
+bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
+                                             std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* reshape_op = ConvertOperator<TensorFlowReshapeOperator*>(
+      it->get(), OperatorType::kTensorFlowReshape);
+
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  const string intermediate_name = reshape_op->inputs[0];
+  const string output_name = reshape_op->outputs[0];
+
+  // Guarantee the input is only consume by the reshape.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    return false;
+  }
+
+  // Check for the parent operator.
+  const auto& transpose_it = FindOpWithOutput(*model, intermediate_name);
+  if (transpose_it == model->operators.end()) {
+    return false;
+  }
+
+  // Find the parent operator and guarantee it is a transpose.
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    return false;
+  }
+
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      false /*allow_extra_unary_dimensions*/)) {
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot fuse %s and %s as it would invalidate the transpose "
+        "output array.",
+        LogName(*transpose_op), LogName(*reshape_op));
+    return false;
+  }
+
+  AddMessageF("Merging operations %s and %s", LogName(*transpose_op),
+              LogName(*reshape_op));
+
+  // const auto& intermediate_array = model->GetArray(intermediate_name);
+  // const auto& output_array = model->GetArray(output_name);
+
+  auto merged_perm = ReshapeToTranspose(*model, reshape_op);
+
+  // Combine the permutations.
+  const auto& transpose_perm = transpose_op->perm;
+  for (int i = 0; i < merged_perm.size(); i++) {
+    merged_perm[i] = transpose_perm[merged_perm[i]];
+  }
+
+  // Remove the reshape as passthrough operation.
+  if (!RemoveTrivialPassthroughOp(this, model, op_index)) {
+    return false;
+  }
+
+  // Update transpose_op's constant buffer to contain the new permutation.
+  model->GetArray(transpose_op->inputs[1])
+      .GetMutableBuffer<ArrayDataType::kInt32>()
+      .data = merged_perm;
+  transpose_op->perm = merged_perm;
+
+  // transpose_ops's shape will likely has changed.
+  model->GetArray(transpose_op->outputs[0]).clear_shape();
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
deleted file mode 100644
index 9852c86c21..0000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
-  const auto ac_it = model->operators.begin() + op_index;
-  std::unique_ptr<Operator>& ac_op = *ac_it;
-  DCHECK(ac_op);
-
-  if (ac_op->type != OperatorType::kRelu6 &&
-      ac_op->type != OperatorType::kRelu1 &&
-      ac_op->type != OperatorType::kRelu) {
-    return false;
-  }
-
-  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
-  if (exchange_it == model->operators.end()) return false;
-  // Find the op producing the array passed to this activation function
-  std::unique_ptr<Operator>& exchange_op = *exchange_it;
-  DCHECK(exchange_op);
-
-  // Allow activation functions to move up over any operator that does not
-  // change the values.
-  switch (exchange_op->type) {
-    case OperatorType::kExpandDims:
-    case OperatorType::kSqueeze:
-    case OperatorType::kTensorFlowReshape:
-    case OperatorType::kTranspose:
-      break;
-    default:
-      return false;
-  }
-
-  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
-  const auto exchange_op_input = exchange_op->inputs[0];
-  const auto intermediate_array = exchange_op->outputs[0];
-  const auto ac_op_output = ac_op->outputs[0];
-
-  int count_ops_consuming_output =
-      CountOpsWithInput(*model, intermediate_array);
-  DCHECK_GE(count_ops_consuming_output, 1);
-  if (count_ops_consuming_output > 1) {
-    AddMessageF(
-        "Not exchanging activation function with %s because it is consumed by "
-        "more than 1 other operator",
-        LogName(*exchange_op));
-    return false;
-  }
-
-  // If the ac_op was originally producing an output_array we can't trivially
-  // reorder as otherwise the output array name would change and break
-  // downstream assumptions. To work around that we perform some renaming below
-  // in that case at the cost of a bit more confusing array names in this rare
-  // case.
-  bool is_ac_op_output =
-      std::find(model->flags.output_arrays().begin(),
-                model->flags.output_arrays().end(),
-                ac_op_output) != model->flags.output_arrays().end();
-  if (is_ac_op_output) {
-    // To preserve the output array name of the activation function we need to
-    // create a temporary to use to pass between ac->ex.
-    //
-    // Original:
-    //  (a) -> EX -> (b) -> AC -> (c)
-    // Now:
-    //  (a) -> AC -> (c') -> EX -> (c)
-    AddMessageF(
-        "Exchanging activation function %s with %s but renaming to preserve "
-        "output array %s",
-        LogName(*ac_op), LogName(*exchange_op), ac_op->outputs[0]);
-
-    auto renamed_ac_op_output =
-        AvailableArrayName(*model, ac_op_output + "_exchange");
-    ac_op->inputs[0] = exchange_op_input;
-    ac_op->outputs[0] = renamed_ac_op_output;
-    model->EraseArray(exchange_op->outputs[0]);
-    exchange_op->inputs[0] = renamed_ac_op_output;
-    exchange_op->outputs[0] = ac_op_output;
-  } else {
-    // Simply swap the order and update consumers to use the exchange_op output
-    // array (b).
-    //
-    // Original:
-    //  (a) -> EX -> (b) -> AC -> (c)
-    // Now:
-    //  (a) -> AC -> (c) -> EX -> (b)
-    AddMessageF("Exchanging activation function %s with %s", LogName(*ac_op),
-                LogName(*exchange_op));
-
-    Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
-    while (consumer) {
-      for (int i = 0; i < consumer->inputs.size(); ++i) {
-        if (consumer->inputs[i] == ac_op_output) {
-          consumer->inputs[i] = intermediate_array;
-        }
-      }
-      consumer = GetFirstOpWithInput(*model, ac_op_output);
-    }
-    ac_op->inputs[0] = exchange_op_input;
-    exchange_op->inputs[0] = ac_op_output;
-  }
-
-  // Clear shapes; this will allow shape propagation to fix the sizes for us.
-  model->GetOrCreateArray(ac_op->outputs[0]).clear_shape();
-  model->GetOrCreateArray(exchange_op->outputs[0]).clear_shape();
-
-  // Finally, reorder operators.  Note that this only works when there are no
-  // other direct descendents of the exchange_op.
-  ac_op.swap(exchange_op);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
new file mode 100644
index 0000000000..9f5b7920cb
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsElementwiseOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kCast:
+    case OperatorType::kExp:
+    case OperatorType::kFloor:
+    case OperatorType::kNeg:
+    case OperatorType::kRelu:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu6:
+    case OperatorType::kTanh:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsMoveOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kDepthToSpace:
+    case OperatorType::kExpandDims:
+    case OperatorType::kSpaceToDepth:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
+// Swap elementwise operators such that all value operators occur before all
+// element move operators, e.g. negation then transpose.
+bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
+  const auto element_op_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& element_op = *element_op_it;
+  if (!IsElementwiseOperator(element_op->type)) {
+    return false;
+  }
+
+  const string intermediate_name = element_op->inputs[0];
+  auto it = FindOpWithOutput(*model, intermediate_name);
+  if (it == model->operators.end()) {
+    AddMessageF("No preceding operator");
+    return false;
+  }
+
+  std::unique_ptr<Operator>& move_op = *it;
+  if (!IsMoveOperator(move_op->type)) {
+    AddMessageF("Preceding operator is not a move operator");
+    return false;
+  }
+
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is discardable.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot swap elementwise as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // op->inputs may change so we need to keep a value by copy.
+  const string input_name = move_op->inputs[0];
+  const string output_name = element_op->outputs[0];
+
+  AddMessageF("Swapping around operators with %s and %s", LogName(*element_op),
+              LogName(*move_op));
+
+  // If the output array is an exit node for the graph then we need to retain
+  // the name as an output node. This makes the naming scheme a little confusing
+  // but is required in this rare case.
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, element_op->outputs[0] + "_reorder");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, output_name);
+
+    element_op->inputs[0] = input_name;
+    element_op->outputs[0] = new_intermediate_name;
+    model->EraseArray(intermediate_name);
+    move_op->inputs[0] = new_intermediate_name;
+    move_op->outputs[0] = output_name;
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    element_op->inputs[0] = input_name;
+    move_op->inputs[0] = output_name;
+  }
+
+  // Reset both arrays as shape, type, min/max, etc can all change because of
+  // the position swap.
+  model->EraseArray(element_op->outputs[0]);
+  model->EraseArray(move_op->outputs[0]);
+
+  // Reconstruct.
+  model->GetOrCreateArray(element_op->outputs[0]);
+  model->GetOrCreateArray(move_op->outputs[0]);
+
+  // Swap the order of the operators.
+  element_op.swap(move_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
new file mode 100644
index 0000000000..9e7fe1b1cc
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Utility function to filter out a value.
+void Filter(std::vector<int>* vec, int value) {
+  vec->erase(std::remove(vec->begin(), vec->end(), value), vec->end());
+}
+
+// Computes a new permutation used to swap a reshape-transpose to a
+// transpose-reshape. In this case the permutation operates on the intermediate
+// shape.
+std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
+                                std::vector<int> intermediate_dims,
+                                std::vector<int> perm) {
+  // These are the major axis of the input.
+  std::vector<int> input_indices;
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (input_dims[i] != 1) {
+      input_indices.push_back(i);
+    }
+  }
+
+  // This maps which indices of the input produced the intermediate indices for
+  // non-unary dimensions.
+  std::unordered_map<int, int> intermediate_to_input_indices_map;
+  for (int i = 0; i < intermediate_dims.size(); i++) {
+    if (intermediate_dims[i] != 1) {
+      intermediate_to_input_indices_map[i] =
+          input_indices[intermediate_to_input_indices_map.size()];
+    }
+  }
+
+  // Translate the transpose permutation to a new permutation starting with the
+  // major indices.
+  std::vector<int> new_perm;
+  new_perm.reserve(input_dims.size());
+  for (int i = 0; i < perm.size(); i++) {
+    if (intermediate_dims[perm[i]] == 1) continue;
+
+    new_perm.push_back(intermediate_to_input_indices_map[perm[i]]);
+  }
+
+  // Fill the rest of the transpose in with the ones.
+  for (int index = 0; index < input_dims.size(); index++) {
+    if (input_dims[index] == 1) {
+      new_perm.push_back(index);
+    }
+  }
+
+  CHECK_EQ(new_perm.size(), input_dims.size());
+  return new_perm;
+}
+
+}  // namespace
+
+// Swaps reshape-transpose to transpose-reshape whenever possible. This is
+// possible when the reshape does not affect memory ordering.
+bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
+  auto transpose_it = model->operators.begin() + op_index;
+
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    // Wait for values to propagate.
+    return false;
+  }
+
+  // Find the operator that produces the transpose op.
+  auto reshape_it = FindOpWithOutput(*model, transpose_op->inputs[0]);
+  if (reshape_it == model->operators.end()) {
+    return false;
+  }
+
+  TensorFlowReshapeOperator* reshape_op =
+      ConvertOperator<TensorFlowReshapeOperator*>(
+          reshape_it->get(), OperatorType::kTensorFlowReshape);
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  // Ignore if the reshape is uninitialized.
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  // Need to copy to keep static if permutated.
+  const string input_name = reshape_op->inputs[0];
+  const string intermediate_name = reshape_op->outputs[0];
+  const string output_name = transpose_op->outputs[0];
+
+  // Intermediate should not be consumed by any other operators.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot reorder reshape-transpose as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // Get the arrays.
+  const auto& input_array = model->GetArray(input_name);
+  const auto& intermediate_array = model->GetArray(intermediate_name);
+  const auto& output_array = model->GetArray(output_name);
+
+  // Get the shapes of each array.
+  Shape input_shape = input_array.shape();
+  Shape intermediate_shape = intermediate_array.shape();
+  Shape output_shape = output_array.shape();
+
+  // Assign ids to non-unary indices.
+  std::vector<int> input_dims = input_shape.dims();
+  std::vector<int> intermediate_dims = intermediate_shape.dims();
+  std::vector<int> output_dims = output_shape.dims();
+
+  // If the reshape is equivalent to a transpose with fewer/more unary
+  // dimensions then it can be moved between the transpose.
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      true /*allow_extra_unary_dims*/)) {
+    return false;
+  }
+
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, transpose_op->outputs[0] + "_exchange");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, transpose_op->outputs[0]);
+    transpose_op->inputs[0] = input_name;
+    transpose_op->outputs[0] = new_intermediate_name;
+    reshape_op->inputs[0] = new_intermediate_name;
+    reshape_op->outputs[0] = output_name;
+    model->EraseArray(intermediate_name);
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    transpose_op->inputs[0] = input_name;
+    reshape_op->inputs[0] = output_name;
+  }
+
+  // If transposes constant buffer is used elsewhere, make a new copy.
+  if (CountOpsWithInput(*model, transpose_op->inputs[1]) != 1) {
+    transpose_op->inputs[1] =
+        AvailableArrayName(*model, transpose_op->inputs[1] + "_copy");
+  }
+
+  // Make the new transpose permutation.
+  const std::vector<int> new_perm =
+      ComputeNewPerm(input_dims, intermediate_dims, transpose_op->perm);
+  CHECK_EQ(input_dims.size(), new_perm.size());
+
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
+  *(transpose_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(new_perm.size())};
+  transpose_op->perm = new_perm;
+
+  // If the reshape's constant buffer is reused, create a new one.
+  if (CountOpsWithInput(*model, reshape_op->inputs[1]) != 1) {
+    reshape_op->inputs[1] =
+        AvailableArrayName(*model, reshape_op->inputs[1] + "_copy");
+  }
+
+  // We need to modify the reshape input array to target the new output size.
+  auto& reshape_array = model->GetOrCreateArray(reshape_op->inputs[1]);
+  reshape_array.GetMutableBuffer<ArrayDataType::kInt32>().data = output_dims;
+  *(reshape_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(output_shape.dimensions_count())};
+  reshape_op->shape.clear();
+
+  AddMessageF("Swapping around operators between %s and %s", input_name,
+              output_name);
+
+  model->GetOrCreateArray(transpose_op->outputs[0]).clear_shape();
+  model->GetOrCreateArray(reshape_op->outputs[0]).clear_shape();
+
+  // Swap the order of the operators.
+  transpose_it->swap(*reshape_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index f38203c80f..2a236d3f98 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -60,6 +60,13 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   string input_lhs = matmul_op->inputs[0];
   string input_rhs = transpose_op->outputs[0];
 
+  // Construct the new FullyConnectedOperator.
+  auto* fc_op = new FullyConnectedOperator;
+  fc_op->outputs = matmul_op->outputs;
+
+  // Insert the newly constructed FullyConnectedOperator.
+  model->operators.emplace(matmul_it, fc_op) + 1;
+
   // Find the op producing the array passed to this MatMul
   auto previous_op_it = model->operators.begin();
   bool found = false;
@@ -76,13 +83,6 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   }
   Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
 
-  // Construct the new FullyConnectedOperator.
-  auto* fc_op = new FullyConnectedOperator;
-  fc_op->outputs = matmul_op->outputs;
-
-  // Insert the newly constructed FullyConnectedOperator.
-  model->operators.emplace(matmul_it, fc_op) + 1;
-
   // Refresh iterator.
   matmul_it = model->operators.begin();
   for (; matmul_it != model->operators.end(); ++matmul_it) {
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 0c52f50e39..76e9a27aef 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -74,7 +74,9 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
-  transformations->Add(new ReorderActivationFunctions);
+  transformations->Add(new MergeReshapeIntoPrecedingTranspose);
+  transformations->Add(new ReorderElementwiseUnary);
+  transformations->Add(new ReorderReshapeTranspose);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 060c52e9e3..c1eaba7f4c 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1921,6 +1921,35 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
   return true;
 }
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  // If the reshape changes the number of dimensions so it cannot be interpreted
+  // as a transpose.
+  if (!allow_extra_unary_dims && in_shape.size() != out_shape.size()) {
+    return false;
+  }
+
+  in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1),
+                 in_shape.end());
+  out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1),
+                  out_shape.end());
+  return in_shape == out_shape;
+}
+
 void CheckFinalDataTypesSatisfied(const Model& model) {
   for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index d3b7224fe3..259ee7fbd0 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -169,10 +169,23 @@ void GetQuantizationParamsFromMinMax(const MinMax& minmax,
       ::tflite::ChooseQuantizationParams<Integer>(rmin, rmax);
 }
 
+template <typename T>
+T ConvertOperator(Operator* o, OperatorType type) {
+  if (o != nullptr && o->type == type) {
+    return static_cast<T>(o);
+  }
+
+  return nullptr;
+}
+
 void CheckIsReadyForQuantization(const Model& model);
 void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
                                  double default_ranges_max);
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims);
+
 inline int Offset(const Shape& shape, const std::vector<int>& indices) {
   DCHECK_EQ(shape.dimensions_count(), indices.size());
   const int dims_count = shape.dimensions_count();
-- 
GitLab


From 5e93a0795b57a22c4c08424fb10fe3ca7e182707 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 3 Apr 2018 10:47:37 -0700
Subject: [PATCH 0203/1262] [TF:XLA] Add DT_HALF to the supported data types
 for the CPU and GPU devices.

Add a test case to compilation_passes_test for DT_HALF data type.

PiperOrigin-RevId: 191464288
---
 .../jit/mark_for_compilation_pass_test.cc     | 22 ++++++++++++++++++-
 tensorflow/compiler/jit/xla_cpu_device.cc     |  4 ++--
 tensorflow/compiler/jit/xla_gpu_device.cc     |  4 ++--
 tensorflow/compiler/tf2xla/xla_op_registry.h  |  8 +++----
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 381c0205fd..af1919278c 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -138,7 +138,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
-TEST(XlaCompilationTest, UnsupportedTypes) {
+TEST(XlaCompilationTest, Complex128Unsupported) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
   {
@@ -158,6 +158,26 @@ TEST(XlaCompilationTest, UnsupportedTypes) {
   EXPECT_TRUE(clusters.empty());
 }
 
+TEST(XlaCompilationTest, HalfSupported) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp(
+        "Const", builder.opts()
+                     .WithName("A")
+                     .WithAttr("dtype", DT_HALF)
+                     .WithAttr("value", Tensor(DT_HALF, TensorShape())));
+    Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
+    ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+  EXPECT_FALSE(clusters.empty());
+}
+
 TEST(XlaCompilationTest, ConcatWithConstArg) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index d2dfdeea68..bc07dbd7bd 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -62,8 +62,8 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaCpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 7> kAllXlaCpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_CPU, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 5a1db81774..84c0d5f51f 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -62,8 +62,8 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_GPU, XlaGpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaGpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 7> kAllXlaGpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_GPU, XlaLocalLaunchOp, kAllXlaGpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_GPU, kAllXlaGpuTypes);
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index ff7453194a..da2a6c3e28 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -51,12 +51,12 @@ constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 8> kCpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
+constexpr std::array<DataType, 9> kCpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BOOL}};
 
-constexpr std::array<DataType, 8> kGpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
+constexpr std::array<DataType, 9> kGpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BOOL}};
 
 // Class that manages registrations of operators and devices for the XLA JIT.
-- 
GitLab


From f654b0d15af364d6f43d22a179fa05d20650fe9a Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 3 Apr 2018 10:51:57 -0700
Subject: [PATCH 0204/1262] tf.map_fn: Improve error messaging when elems
 consists of scalars.

Fixes #17694
Prior to this change, when tf.map_fn was provided with scalars, the error would
be something like:

Traceback (most recent call last):
  File "/tensorflow/python/kernel_tests/functional_ops_test.py", line 165, in testMapOverScalarErrors
    functional_ops.map_fn(lambda x: x, [1, 2])
  File "/tensorflow/python/ops/functional_ops.py", line 368, in map_fn
    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
  File "/tensorflow/python/framework/tensor_shape.py", line 609, in __getitem__
    return self._dims[key]
IndexError: list index out of range
PiperOrigin-RevId: 191465183
---
 tensorflow/python/kernel_tests/functional_ops_test.py |  7 +++++++
 tensorflow/python/ops/functional_ops.py               | 10 +++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 10aea89173..34fb655035 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -160,6 +160,13 @@ class FunctionalOpsTest(test.TestCase):
                 values=constant_op.constant([0, 1, 2]),
                 dense_shape=[2, 2]))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testMapOverScalarErrors(self):
+    with self.assertRaisesRegexp(ValueError, "not scalars"):
+      functional_ops.map_fn(lambda x: x, [1, 2])
+    with self.assertRaisesRegexp(ValueError, "not a scalar"):
+      functional_ops.map_fn(lambda x: x, 1)
+
   def testMap_Scoped(self):
     with self.test_session() as sess:
 
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 4d95ca262c..161f6f3659 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -367,7 +367,15 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
     dtype_flat = output_flatten(dtype)
 
     # Convert elems to tensor array. n may be known statically.
-    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+    static_shape = elems_flat[0].shape
+    if static_shape.ndims is not None and static_shape.ndims < 1:
+      if len(elems_flat) == 1:
+        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
+      else:
+        raise ValueError(
+            "elements in elems must be 1+ dimensional Tensors, not scalars"
+        )
+    n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
-- 
GitLab


From 86235e48fe39f2b9318f01e963499a555ea88084 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 11:15:29 -0700
Subject: [PATCH 0205/1262] Turn no-op split/splitv operators into identity.

PiperOrigin-RevId: 191469655
---
 tensorflow/core/grappler/op_types.cc          |  2 +
 tensorflow/core/grappler/op_types.h           |  1 +
 .../grappler/optimizers/constant_folding.cc   | 10 +++
 .../optimizers/constant_folding_test.cc       | 76 +++++++++++++++++++
 4 files changed, 89 insertions(+)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 584008b0c1..a24d2dbd9f 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -365,6 +365,8 @@ bool IsTruncateDiv(const NodeDef& node) { return node.op() == "TruncateDiv"; }
 
 bool IsTruncateMod(const NodeDef& node) { return node.op() == "TruncateMod"; }
 
+bool IsUnpack(const NodeDef& node) { return node.op() == "Unpack"; }
+
 bool IsVariable(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index aa6750d5c3..8667f72c7e 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -140,6 +140,7 @@ bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsTruncateDiv(const NodeDef& node);
 bool IsTruncateMod(const NodeDef& node);
+bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 7de544de52..87052c7ba0 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1542,6 +1542,16 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
     NodeDef* node = optimized_graph->mutable_node(i);
 
+    if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(1, node, optimized_graph);
+      continue;
+    }
+
+    if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(0, node, optimized_graph);
+      continue;
+    }
+
     // Remove Shuffle or Reverse op over scalar values.
     if (use_shape_info &&
         !properties->GetInputProperties(node->name()).empty() &&
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 1db4fb9de7..7faa68a657 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1335,6 +1335,82 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   EXPECT_EQ(2, out_idx.flat<int32>()(0));
 }
 
+TEST_F(ConstantFoldingTest, SplitRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({4}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  ops::Split s1(scope.WithOpName("s1"), split_dim, in1, 1);
+  ops::Split s2(scope.WithOpName("s2"), split_dim, in2, 2);
+
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
+          &want);
+  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
+TEST_F(ConstantFoldingTest, SplitVRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({5}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  auto size_splits1 = ops::Const(scope.WithOpName("size_splits1"), {2}, {1});
+  auto size_splits2 = ops::Const(scope.WithOpName("size_splits2"), {2, 3}, {2});
+  ops::SplitV s1(scope.WithOpName("s1"), in1, size_splits1, split_dim, 1);
+  ops::SplitV s2(scope.WithOpName("s2"), in2, size_splits2, split_dim, 2);
+
+  LOG(INFO) << s1.output.size();
+  LOG(INFO) << s2.output.size();
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("size_splits1", "Const", {}, {}, &want);
+  AddNode("size_splits2", "Const", {}, {}, &want);
+  AddNode("s1", "Identity",
+          {"in1", AsControlDependency("size_splits1"),
+           AsControlDependency("split_dim")},
+          {}, &want);
+  AddNode("s2", "SplitV", {"in2", "size_splits2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
 TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
-- 
GitLab


From fe2d7600bd6ecfcf3a083b7c01b2c114ba7c63ff Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 3 Apr 2018 11:52:23 -0700
Subject: [PATCH 0206/1262] Add convert f32 to s64 test.

PiperOrigin-RevId: 191476199
---
 tensorflow/compiler/xla/tests/convert_test.cc | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 9a899b7914..8718fa5066 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -230,6 +230,42 @@ XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) {
   ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
 }
 
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) {
+  ComputationBuilder builder(client_, TestName());
+  // Test cases from compiler_rt library.
+  std::vector<float> arg{0.0f,
+                         0.5f,
+                         0.99f,
+                         1.0f,
+                         1.5f,
+                         1.99f,
+                         2.0f,
+                         2.01f,
+                         -0.5f,
+                         -0.99f,
+                         -1.0f,
+                         -1.5f,
+                         -1.99f,
+                         -2.0f,
+                         -2.01f,
+                         0x1.FFFFFEp+62F,
+                         0x1.FFFFFCp+62F,
+                         -0x1.FFFFFEp+62F,
+                         -0x1.FFFFFCp+62F};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<float>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, S64);
+
+  std::vector<int64> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<int64>(arg[i]);
+  }
+  ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
+}
+
 XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<uint8_t>({32, 64});
-- 
GitLab


From d5ebc299ece050d3349feadc20339d636a9d84a1 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 3 Apr 2018 12:38:54 -0700
Subject: [PATCH 0207/1262] Use PyLong_AsLongLong instead of PyInt_AsLong to
 guarantee 64-bit output.

A C long is 32 bits on some platforms, which can cause the
PyInt_AsLong call in PyInt64ListToVector to
overflow. large_concat_op_test exposes this bug on such platforms.

PiperOrigin-RevId: 191484167
---
 tensorflow/python/client/tf_session.i                  | 2 +-
 tensorflow/python/kernel_tests/large_concat_op_test.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 68768f2b4c..0c18d973a7 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -72,7 +72,7 @@ void PyInt64ListToVector(PyObject* py_int_seq, std::vector<int64_t>* vec) {
   int size = PySequence_Fast_GET_SIZE(py_int_seq);
   for (int i = 0; i < size; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(py_int_seq, i);
-    vec->push_back(PyInt_AsLong(item));
+    vec->push_back(PyLong_AsLongLong(item));
   }
 }
 
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 66afb6ec01..184d1dde2a 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -19,10 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class LargeConcatOpTest(test.TestCase):
   """Tests that belong in concat_op_test.py, but run over large tensors."""
 
-- 
GitLab


From 1c16317d2d63d2a61812961e96be0d840fa81186 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 12:40:51 -0700
Subject: [PATCH 0208/1262] Only create a min/max from extra arrays info if
 provided.

PiperOrigin-RevId: 191484470
---
 tensorflow/contrib/lite/toco/tooling_util.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index c1eaba7f4c..668cf51619 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -2006,9 +2006,9 @@ void UseArraysExtraInfo(Model* model) {
       continue;
     }
     auto& array = model->GetArray(entry.name());
-    auto& minmax = array.GetOrCreateMinMax();
     if (entry.has_min() || entry.has_max()) {
       CHECK_EQ(entry.has_min(), entry.has_max());
+      auto& minmax = array.GetOrCreateMinMax();
       minmax.min = entry.min();
       minmax.max = entry.max();
     }
-- 
GitLab


From b888bae170e38007be39dc047d0590457ad5c069 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 13:28:37 -0700
Subject: [PATCH 0209/1262] [XLA] Redesign: migrate slice_test.

PiperOrigin-RevId: 191491425
---
 tensorflow/compiler/xla/tests/BUILD         |  2 +-
 tensorflow/compiler/xla/tests/slice_test.cc | 30 ++++++++++-----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index aba61fbca4..5dcd02a1a4 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -937,8 +937,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index a14a365bd0..8d9a9c7b73 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -41,7 +41,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1});
 
@@ -54,7 +54,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x1x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1});
 
@@ -67,7 +67,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1});
 
@@ -77,7 +77,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
   builder.Slice(original, {0, 0}, {0, 0}, {1, 1});
 
@@ -85,7 +85,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
   builder.Slice(original, {0, 15}, {0, 20}, {1, 1});
 
@@ -93,7 +93,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
   builder.Slice(original, {1, 0}, {3, 0}, {1, 1});
 
@@ -108,7 +108,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {128, 128}, {256, 256}, {1, 1});
 
@@ -126,7 +126,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {
   Array2D<float> values(1, 4096);
   std::iota(values.data(), values.data() + 4096, 0.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});
 
@@ -147,7 +147,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
       }
     }
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -159,7 +159,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   values.FillRandom(3.14f);
   auto expected = ReferenceUtil::Slice4D(
       values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}, /*strides=*/{{1, 1, 1, 1}});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
@@ -172,7 +172,7 @@ XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) {
                                          /*strides=*/{{1, 1, 2, 1}});
   auto expected_literal = Literal::CreateR4FromArray4DWithLayout(
       *expected, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {0, 0, 0, 0}, {2, 4, 6, 8}, {1, 1, 2, 1});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, ErrorSpec(0.000001),
@@ -198,7 +198,7 @@ class SliceR1Test : public ClientLibraryTestBase,
     tensorflow::gtl::InlinedVector<NativeT, 1> input(spec.input_dim0);
     std::iota(input.begin(), input.end(), NativeT());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto original = builder.ConstantR1<NativeT>(input);
     builder.Slice(original, {spec.slice_start}, {spec.slice_limit},
                   {spec.slice_stride});
@@ -339,7 +339,7 @@ XLA_TEST_P(SliceR2Test, DoIt) {
   Array2D<int32> input(spec.input_dim0, spec.input_dim1);
   input.FillUnique();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2DWithLayout<int32>(
       input, LayoutUtil::MakeLayout(spec.layout));
   builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
@@ -429,7 +429,7 @@ class SliceR4Test : public ClientLibraryTestBase,
     values.FillRandom(3.14f);
     auto expected = ReferenceUtil::Slice4D(
         values, spec.slice_starts, spec.slice_limits, spec.slice_strides);
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto literal = Literal::CreateR4FromArray4DWithLayout(
         values, LayoutUtil::MakeLayout(spec.input_layout));
     auto parameter = builder.Parameter(0, literal->shape(), "p0");
-- 
GitLab


From bf1d69a69819c9435c7819f40f17adb64024cedd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 13:43:05 -0700
Subject: [PATCH 0210/1262] Use double for arrays extra info min/max to match
 toco type.

PiperOrigin-RevId: 191493450
---
 tensorflow/contrib/lite/toco/model_flags.proto | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 42e0f54826..835dea49eb 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -98,8 +98,8 @@ message ArraysExtraInfo {
   message Entry {
     // Next ID to use: 7.
     optional string name = 1;
-    optional float min = 2;
-    optional float max = 3;
+    optional double min = 2;
+    optional double max = 3;
     optional IODataType data_type = 4;
     optional InputArrayShape shape = 5;
     optional float constant_float_value = 6;
-- 
GitLab


From b9b90965de4e475ccff8a571de016026447ee1df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 13:53:00 -0700
Subject: [PATCH 0211/1262] Disabled some tests on Windows

PiperOrigin-RevId: 191494857
---
 tensorflow/contrib/BUILD                                  | 2 +-
 tensorflow/contrib/autograph/converters/BUILD             | 3 +++
 tensorflow/contrib/autograph/impl/BUILD                   | 2 ++
 tensorflow/contrib/autograph/pyct/BUILD                   | 1 +
 tensorflow/contrib/autograph/pyct/static_analysis/BUILD   | 2 ++
 tensorflow/contrib/autograph/utils/BUILD                  | 2 ++
 tensorflow/contrib/data/BUILD                             | 8 +++++++-
 tensorflow/contrib/lite/python/BUILD                      | 1 +
 tensorflow/contrib/stat_summarizer/BUILD                  | 1 +
 tensorflow/contrib/tensorboard/BUILD                      | 1 +
 tensorflow/contrib/tpu/BUILD                              | 1 +
 tensorflow/python/BUILD                                   | 2 ++
 tensorflow/python/debug/BUILD                             | 1 +
 tensorflow/python/kernel_tests/BUILD                      | 1 +
 .../tools/ci_build/windows/cpu/pip/build_tf_windows.sh    | 1 +
 15 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index bf69144ad8..6964cbd564 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -117,7 +117,7 @@ py_library(
         ],
         "//conditions:default": [],
     }) + if_not_windows([
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
+        # for :contrib_py
         "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
     ]),
 )
diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index 608bd82722..c5a0dc1095 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -61,6 +61,7 @@ py_test(
     name = "asserts_test",
     srcs = ["asserts_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -81,6 +82,7 @@ py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -92,6 +94,7 @@ py_test(
     size = "large",
     srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/contrib/autograph/impl",
diff --git a/tensorflow/contrib/autograph/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
index e468176da1..0de479741a 100644
--- a/tensorflow/contrib/autograph/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -38,6 +38,7 @@ py_test(
     name = "api_test",
     srcs = ["api_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/contrib/autograph/utils",
@@ -50,6 +51,7 @@ py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index edec5f7712..c483ff68c4 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -66,6 +66,7 @@ py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
index d192bc7aab..83f3bafc42 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
@@ -34,6 +34,7 @@ py_test(
     name = "activity_test",
     srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/autograph/pyct",
@@ -46,6 +47,7 @@ py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/autograph/pyct",
diff --git a/tensorflow/contrib/autograph/utils/BUILD b/tensorflow/contrib/autograph/utils/BUILD
index b53fbb5c18..d3a1b94688 100644
--- a/tensorflow/contrib/autograph/utils/BUILD
+++ b/tensorflow/contrib/autograph/utils/BUILD
@@ -44,6 +44,7 @@ py_test(
     name = "builtins_test",
     srcs = ["builtins_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
@@ -84,6 +85,7 @@ py_test(
     name = "py_func_test",
     srcs = ["py_func_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 35312f06b3..7bb0dc1c0f 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -8,6 +8,7 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
     "tf_gen_op_libs",
+    "if_not_windows",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -31,12 +32,17 @@ py_library(
     ],
 )
 
+cc_library(
+    name = "lib_proto_parsing_for_dataset_ops",
+    deps = if_not_windows(["//tensorflow/core:lib_proto_parsing"]),
+)
+
 tf_custom_op_library(
     name = "_dataset_ops.so",
     srcs = ["ops/dataset_ops.cc"],
     deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"] +
            if_static(
-               extra_deps = ["//tensorflow/core:lib_proto_parsing"],
+               extra_deps = [":lib_proto_parsing_for_dataset_ops"],
                otherwise = [],
            ),
 )
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index e70aa51298..e735062a7f 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -101,6 +101,7 @@ py_test(
     name = "convert_saved_model_test",
     srcs = ["convert_saved_model_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     visibility = ["//visibility:public"],
     deps = [
         ":convert_saved_model",
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index d4096751c4..30be14c10c 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,4 +31,5 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_windows"],
 )
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index c955b13244..2b6a2b2f3c 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -82,6 +82,7 @@ py_test(
     size = "small",
     srcs = ["plugins/trace/trace_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":trace",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index d4830b6bcf..4de09dd988 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -215,6 +215,7 @@ tf_py_test(
         ":datasets",
     ],
     grpc_enabled = True,
+    tags = ["no_windows"],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9d1e9bdc7e..b5f4387efd 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3782,6 +3782,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -4013,6 +4014,7 @@ py_test(
     srcs = ["training/saver_large_partitioned_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "noasan",  # http://b/30782289
         "notsan",  # http://b/30782289
     ],
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index b81aa3745c..250b4b1b6a 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -921,6 +921,7 @@ py_test(
     size = "small",
     srcs = ["cli/profile_analyzer_cli_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":debugger_cli_common",
         ":profile_analyzer_cli",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index d6f97fc4c3..a544e4fa6e 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2722,6 +2722,7 @@ cuda_py_test(
     ],
     data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 40189a6d1b..438c5d52f6 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -65,5 +65,6 @@ bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
+  --flaky_test_attempts=3 \
   //${PY_TEST_DIR}/tensorflow/python/... \
   //${PY_TEST_DIR}/tensorflow/contrib/...
-- 
GitLab


From 86b3f351dd98db9cdfc2fc68a2a4328e90b36035 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 14:42:31 -0700
Subject: [PATCH 0212/1262] [XLA] Redesign: implement and test dynamic slice.

PiperOrigin-RevId: 191502312
---
 .../xla/client/xla_client/xla_builder.cc      | 18 +++++++-
 tensorflow/compiler/xla/tests/BUILD           |  3 +-
 .../compiler/xla/tests/dynamic_ops_test.cc    | 41 +++++++++----------
 3 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index c2e661cb3d..fe8ae77683 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -489,7 +489,23 @@ XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
 
 XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                                tensorflow::gtl::ArraySlice<int64> slice_sizes) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shape, slice_sizes));
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice,
+                          {operand, start_indices});
+  });
 }
 
 XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 5dcd02a1a4..6f58c20f34 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -977,9 +977,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 4f354e6aef..c0a16ad288 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -112,10 +111,8 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   void TestR3Wrap() {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
     RunR3<IndexT, DataT>(
-      {{{1, 2}, {3, 4}, {5, 6}},
-       {{7, 8}, {9, 10}, {11, 12}}},
-      {0, 2, 1}, {2, 1, 2},
-      {{{6, 5}}, {{12, 11}}});
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {0, 2, 1},
+        {2, 1, 2}, {{{6, 5}}, {{12, 11}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -137,9 +134,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -163,9 +160,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -189,9 +186,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -359,9 +356,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -390,9 +387,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -421,9 +418,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -474,13 +471,13 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     }
 
     // Build dynamic slice computation.
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer input parameter.
-    ComputationDataHandle input;
+    XlaOp input;
     std::unique_ptr<GlobalData> input_data =
         CreateR3Parameter<T>(input_values, 0, "input_values", &builder, &input);
     // Initialize and transfer update parameter.
-    ComputationDataHandle update;
+    XlaOp update;
     std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
     auto starts = builder.ConstantR1<int32>({index, 0, 0});
@@ -672,7 +669,7 @@ void BM_DynamicSlice(int num_iters) {
       TransferManager::GetForPlatform(platform).ValueOrDie();
   int device_ordinal = client->default_device_ordinal();
 
-  ComputationBuilder builder(client, "DynamicSlice");
+  XlaBuilder builder("DynamicSlice");
 
   // Create input as a constant: shape [1, 2, 3, 4]
   auto input_literal = Literal::CreateR4(
-- 
GitLab


From 5dcbb54070e9c4d7d2bb5ce4b1f488ac1ef22333 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 15:01:09 -0700
Subject: [PATCH 0213/1262] Internal change

PiperOrigin-RevId: 191505262
---
 tensorflow/contrib/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 6964cbd564..bf69144ad8 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -117,7 +117,7 @@ py_library(
         ],
         "//conditions:default": [],
     }) + if_not_windows([
-        # for :contrib_py
+        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
     ]),
 )
-- 
GitLab


From a764216776465a5385596ca83af6edf3da72c504 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 15:03:44 -0700
Subject: [PATCH 0214/1262] Accept toco ModelFlags protos on the command line.

PiperOrigin-RevId: 191505886
---
 tensorflow/contrib/lite/toco/args.h           |  1 +
 .../contrib/lite/toco/model_cmdline_flags.cc  | 24 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 52c789293c..39e49bc347 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -211,6 +211,7 @@ struct ParsedModelFlags {
   Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
   Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
   Arg<string> arrays_extra_info_file;
+  Arg<string> model_flags_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 4264f21c76..245eb52444 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -160,6 +160,11 @@ bool ParseModelFlagsFromCommandLineFlags(
           "Path to an optional file containing a serialized ArraysExtraInfo "
           "proto allowing to pass extra information about arrays not specified "
           "in the input model file, such as extra MinMax information."),
+      Flag("model_flags_file", parsed_flags.model_flags_file.bind(),
+           parsed_flags.model_flags_file.default_value(),
+           "Path to an optional file containing a serialized ModelFlags proto. "
+           "Options specified on the command line will override the values in "
+           "the proto."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -182,7 +187,24 @@ void ReadModelFlagsFromCommandLineFlags(
     const ParsedModelFlags& parsed_model_flags, ModelFlags* model_flags) {
   toco::port::CheckInitGoogleIsDone("InitGoogle is not done yet");
 
-// "batch" flag only exists internally
+  // Load proto containing the initial model flags.
+  // Additional flags specified on the command line will overwrite the values.
+  if (parsed_model_flags.model_flags_file.specified()) {
+    string model_flags_file_contents;
+    QCHECK(port::file::GetContents(parsed_model_flags.model_flags_file.value(),
+                                   &model_flags_file_contents,
+                                   port::file::Defaults())
+               .ok())
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " was not found or could not be read";
+    QCHECK(ParseFromStringEitherTextOrBinary(model_flags_file_contents,
+                                             model_flags))
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " could not be parsed";
+  }
+
 #ifdef PLATFORM_GOOGLE
   CHECK(!((base::SpecifiedOnCommandLine("batch") &&
            parsed_model_flags.variable_batch.specified())))
-- 
GitLab


From 4681562607bf4001ecd61492f1e7567be9212c6f Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Tue, 3 Apr 2018 15:08:51 -0700
Subject: [PATCH 0215/1262] Implementing make_initializatable_iterator for the
 PrefetchToDeviceDataset.

PiperOrigin-RevId: 191506754
---
 .../kernel_tests/prefetching_ops_test.py      | 61 +++++++++++++++-
 .../data/python/ops/prefetching_ops.py        | 73 +++++++++++++++----
 2 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index f2c57f92e2..4b50260670 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-class StagingAreaOpsTest(test.TestCase):
+class PrefetchingKernelsOpsTest(test.TestCase):
 
   def setUp(self):
     self._event = threading.Event()
@@ -200,6 +200,9 @@ class StagingAreaOpsTest(test.TestCase):
 
       sess.run(destroy_op)
 
+
+class PrefetchToDeviceTest(test.TestCase):
+
   def testPrefetchToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -279,6 +282,62 @@ class StagingAreaOpsTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchToDeviceWithReInit(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_initializable_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpuWithReInit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 554bfaa2cf..77e23d0319 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -68,20 +68,48 @@ def function_buffering_resource_reset(function_buffer_resource, name=None):
 
 # pylint: disable=protected-access
 class _PrefetchToDeviceIterator(object):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device."""
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
 
-  def __init__(self, input_dataset, device, buffer_size):
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               device,
+               buffer_size,
+               shared_name=None):
     self._input_dataset = input_dataset
     self._get_next_call_count = 0
-    input_iterator = input_dataset.make_one_shot_iterator()
-    input_iterator_handle = input_iterator.string_handle()
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
       """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, input_iterator.output_types, input_iterator.output_shapes,
-          input_iterator.output_classes)
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
       ret = remote_iterator.get_next()
 
       # Convert any `SparseTensorValue`s to `SparseTensor`s.
@@ -101,9 +129,16 @@ class _PrefetchToDeviceIterator(object):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
           target_device=gen_dataset_ops.iterator_get_device(
-              input_iterator._iterator_resource),
+              self._input_iterator._iterator_resource),
           string_arg=input_iterator_handle,
-          buffer_size=buffer_size)
+          buffer_size=buffer_size,
+          shared_name=shared_name)
+
+    if not self._one_shot:
+      reset_op = function_buffering_resource_reset(self._buffering_resource)
+      with ops.control_dependencies([reset_op]):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
 
   def get_next(self, name=None):
     """See @{tf.data.Iterator.get_next}."""
@@ -127,6 +162,12 @@ class _PrefetchToDeviceIterator(object):
 
     return ret
 
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
   @property
   def output_classes(self):
     return self._input_dataset.output_classes
@@ -150,13 +191,19 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset):
     self._buffer_size = buffer_size if buffer_size is not None else 1
 
   def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(self._input_dataset, self._device,
-                                     self._buffer_size)
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=True,
+        device=self._device,
+        buffer_size=self._buffer_size)
 
   def make_initializable_iterator(self, shared_name=None):
-    raise NotImplementedError("`prefetch_to_device()` is not currently "
-                              "compatible with initializable iterators. Use "
-                              "`make_one_shot_iterator()` instead.")
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        device=self._device,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
 
   def _as_variant_tensor(self):
     # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
-- 
GitLab


From 97a7df890632b349d0b179f418a2d17ae812f8ce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 15:20:58 -0700
Subject: [PATCH 0216/1262] Remove unused #includes

PiperOrigin-RevId: 191508478
---
 tensorflow/core/platform/default/tracing_impl.h | 1 -
 tensorflow/core/platform/denormal.cc            | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/default/tracing_impl.h b/tensorflow/core/platform/default/tracing_impl.h
index e813e4a17a..7834548896 100644
--- a/tensorflow/core/platform/default/tracing_impl.h
+++ b/tensorflow/core/platform/default/tracing_impl.h
@@ -22,7 +22,6 @@ limitations under the License.
 // IWYU pragma: friend third_party/tensorflow/core/platform/tracing.h
 
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/tracing.h"
 
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index 3631d9ddf9..82cbc43b4f 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <tuple>
+
 #include "tensorflow/core/platform/denormal.h"
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
-- 
GitLab


From 1b20395c19199aaf124b2468da0e1d935c659d7b Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 3 Apr 2018 15:25:16 -0700
Subject: [PATCH 0217/1262] Add bfloat16 to GPU types.

PiperOrigin-RevId: 191509090
---
 tensorflow/compiler/jit/xla_gpu_device.cc    | 5 +++--
 tensorflow/compiler/tests/build_defs.bzl     | 3 +--
 tensorflow/compiler/tf2xla/xla_op_registry.h | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 84c0d5f51f..ac60423d95 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -62,8 +62,9 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_GPU, XlaGpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 7> kAllXlaGpuTypes = {
-    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 8> kAllXlaGpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL,
+     DT_BFLOAT16}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_GPU, XlaLocalLaunchOp, kAllXlaGpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_GPU, kAllXlaGpuTypes);
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 0528a5415d..a9db1c173d 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -56,7 +56,7 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     elif backend == "gpu":
       backend_args += [
           "--test_device=XLA_GPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
       ]
       backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
@@ -89,4 +89,3 @@ def generate_backend_suites(backends=[]):
     backends = all_backends()
   for backend in backends:
     native.test_suite(name="%s_tests" % backend, tags=["tf_xla_%s" % backend])
-
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index da2a6c3e28..e255b01dd7 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -55,9 +55,9 @@ constexpr std::array<DataType, 9> kCpuAllTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BOOL}};
 
-constexpr std::array<DataType, 9> kGpuAllTypes = {
+constexpr std::array<DataType, 10> kGpuAllTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64, DT_BOOL}};
+     DT_COMPLEX64, DT_BOOL, DT_BFLOAT16}};
 
 // Class that manages registrations of operators and devices for the XLA JIT.
 // Not thread-safe.
-- 
GitLab


From bfb63e0a6458ff66e67a79467beab854d7c9d69d Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 3 Apr 2018 15:28:27 -0700
Subject: [PATCH 0218/1262] Checkpointable: Utility to split a dependency for
 saving/loading

Useful when a single op produces Tensors which should each be saved under
different objects (or when Tensors saved with many different objects need to be
restored together as inputs to a single op).

I plan to use this for cuDNN RNN saving, which currently relies heavily on name
matching to allow cuDNN cells to be swapped out for CPU-compatible RNN
cells. This change has just the utility and some tests for it.

PiperOrigin-RevId: 191509664
---
 .../eager/python/checkpointable_utils.py      | 113 ++++++++++++++++++
 .../eager/python/checkpointable_utils_test.py |  82 +++++++++++++
 2 files changed, 195 insertions(+)

diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py
index 91a7aded11..34cb8d0e08 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import abc
 import collections
+import functools
 import weakref
 
 from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
@@ -867,3 +868,115 @@ class Checkpoint(core_checkpointable.Checkpointable):
     # initialization when executing eagerly.
     self._maybe_create_save_counter()
     return status
+
+
+class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+  """Wraps save and restore callbacks as a `SaveableObject`."""
+
+  def __init__(self, name, dtype, save_callback, restore_callback):
+    self._restore_callback = restore_callback
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=save_callback,
+        slice_spec="",
+        name=name,
+        dtype=dtype)
+    super(_CallbackSaveable, self).__init__(
+        save_callback, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return self._restore_callback(tensor)
+
+
+class _SplitDependency(core_checkpointable.CheckpointableBase):
+  """Looks like a regular variable while synchronizing save/restores."""
+
+  def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
+               fill_save_buffer_fn, consume_restore_buffer_fn):
+    self._save_buffer = save_buffer
+    self._restore_buffer = restore_buffer
+    self._name = name
+    self._dtype = dtype
+    self._num_components = num_components
+    self._fill_save_buffer_fn = fill_save_buffer_fn
+    self._consume_restore_buffer_fn = consume_restore_buffer_fn
+
+  def _save(self):
+    """Pull from the shared buffer, populating it if necessary."""
+    if self._name not in self._save_buffer:
+      if self._save_buffer:
+        raise AssertionError(
+            ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+             "be saved together.") % (self._name, self))
+      self._fill_save_buffer_fn(self._save_buffer)
+    return self._save_buffer.pop(self._name)
+
+  def _restore(self, tensor):
+    """Push into the shared buffer, flushing it if necessary."""
+    if self._name in self._restore_buffer:
+      raise AssertionError(
+          ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+           "be restored together.") % (self._name, self))
+    self._restore_buffer[self._name] = tensor
+    if len(self._restore_buffer) == self._num_components:
+      op = self._consume_restore_buffer_fn(self._restore_buffer)
+      self._restore_buffer.clear()
+      return op
+    else:
+      return control_flow_ops.no_op()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Looks to Checkpointable like a regular variable."""
+    return {
+        core_checkpointable.VARIABLE_VALUE_KEY:
+        functools.partial(_CallbackSaveable,
+                          dtype=self._dtype,
+                          save_callback=self._save,
+                          restore_callback=self._restore)
+    }
+
+
+def split_dependency(component_names, component_dtypes,
+                     fill_save_buffer_fn, consume_restore_buffer_fn):
+  """Creates multiple dependencies with a synchronized save/restore.
+
+  Useful when a single op produces `Tensor`s which should each be saved under
+  different objects, or when `Tensor`s saved with many different objects need to
+  be restored together as inputs to a single op (i.e. an object which uses a
+  single fused op may be swapped out for a subgraph of objects, and these two
+  programs are checkpoint compatible).
+
+  Args:
+    component_names: A sequence of names for the split
+      dependencies. `fill_save_buffer_fn` must add these keys to the dictionary
+      it is passed, and `consume_restore_buffer_fn` will receive a dictionary
+      with these keys.
+    component_dtypes: Data types for the `Tensor`s being saved and restored, a
+      sequence corresponding to `component_names`.
+    fill_save_buffer_fn: A function which takes an empty dictionary as an
+      argument and adds `Tensor`s with `component_names` as keys. These
+      `Tensor`s will be saved as if they were individual variables.
+    consume_restore_buffer_fn: A function which takes a dictionary with
+      `component_names` as keys mapping to restored individual `Tensor`s and
+      returns a restore op (or if executing eagerly, runs the restoration and
+      may return `None`).
+
+  Returns:
+    A dictionary mapping from names to Checkpointable objects. If one is
+    reachable from an object as a dependency, the others should be too; adding
+    dependencies on some but not all of the objects will result in errors.
+  """
+  save_buffer = {}
+  restore_buffer = {}
+  split_dependencies = {}
+  for name, dtype in zip(component_names, component_dtypes):
+    split_dependencies[name] = _SplitDependency(
+        save_buffer=save_buffer,
+        restore_buffer=restore_buffer,
+        name=name,
+        dtype=dtype,
+        num_components=len(component_names),
+        fill_save_buffer_fn=fill_save_buffer_fn,
+        consume_restore_buffer_fn=consume_restore_buffer_fn)
+  return split_dependencies
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 5e1b64728a..891c093a0f 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl.keras.engine import sequential
 from tensorflow.python.keras._impl.keras.engine import training
 from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -69,6 +70,87 @@ class MyModel(training.Model):
     return ret
 
 
+def _split_variable_closure(variable):
+  def _fill_save_buffer_fn(save_buffer):
+    save_buffer["first_half"] = variable[:2]
+    save_buffer["second_half"] = variable[2:]
+  return _fill_save_buffer_fn
+
+
+def _combine_variable_closure(variable):
+  def _consume_restore_buffer_fn(restore_buffer):
+    return variable.assign(
+        array_ops.concat([restore_buffer["first_half"],
+                          restore_buffer["second_half"]],
+                         axis=0))
+  return _consume_restore_buffer_fn
+
+
+class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
+
+  def __init__(self):
+    self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
+    split_dependencies = checkpointable_utils.split_dependency(
+        component_names=("first_half", "second_half"),
+        component_dtypes=(self.combined.dtype,) * 2,
+        fill_save_buffer_fn=_split_variable_closure(
+            self.combined),
+        consume_restore_buffer_fn=_combine_variable_closure(
+            self.combined))
+    for name, dep in split_dependencies.items():
+      self._track_checkpointable(dep, name=name)
+
+
+class HasRegularDeps(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+    self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class OnlyOneDep(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class SplitTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testSaveRestoreSplitDep(self):
+    save_checkpoint = checkpointable_utils.Checkpoint(
+        dep=SaveTensorSlicesAsDeps())
+    self.evaluate(save_checkpoint.dep.combined.assign([1., 2., 3., 4.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_checkpoint.save(checkpoint_prefix)
+
+    regular_deps = HasRegularDeps()
+    regular_restore_checkpoint = checkpointable_utils.Checkpoint(
+        dep=regular_deps)
+    regular_restore_checkpoint.restore(
+        save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(regular_deps.first_half))
+    self.assertAllEqual([3., 4.], self.evaluate(regular_deps.second_half))
+
+    one_dep = OnlyOneDep()
+    one_dep_restore_checkpoint = checkpointable_utils.Checkpoint(dep=one_dep)
+    status = one_dep_restore_checkpoint.restore(save_path)
+    with self.assertRaises(AssertionError):
+      # Missing the second dependency.
+      status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(one_dep.first_half))
+
+    restore_checkpoint = checkpointable_utils.Checkpoint()
+    status = restore_checkpoint.restore(save_path)
+    restore_checkpoint.dep = SaveTensorSlicesAsDeps()
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual(
+        [1., 2., 3., 4.],
+        self.evaluate(restore_checkpoint.dep.combined))
+
+
 class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-- 
GitLab


From 1948b74779e34e5ac608ef427b3409ca0a98c5f5 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 3 Apr 2018 15:39:02 -0700
Subject: [PATCH 0219/1262] Add ability to pass symbolic tensors as inputs and
 targets in calls to Model training and evaluation methods.

This also works for eager tensors, but due to a slicing behavior difference between eager tensors and Numpy arrays, we have to implement a workaround (with a performance cost).

PiperOrigin-RevId: 191511215
---
 .../python/keras/_impl/keras/backend.py       |  18 ++-
 .../keras/_impl/keras/engine/training.py      |   3 +
 .../_impl/keras/engine/training_eager.py      |  70 ++++++++---
 .../_impl/keras/engine/training_eager_test.py | 104 ++++++++++++++--
 .../keras/_impl/keras/engine/training_test.py | 115 ++++++++++++++++++
 .../_impl/keras/engine/training_utils.py      |  37 ++++--
 6 files changed, 312 insertions(+), 35 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 7baf27642a..3aac6a9065 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -2795,6 +2796,8 @@ class Function(object):
     else:
       feed_dict = {}
 
+    session = get_session()
+    data_tensors_to_feed = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
         continue
@@ -2803,9 +2806,20 @@ class Function(object):
         indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
-      feed_dict[tensor] = value
+      elif tensor_util.is_tensor(value):
+        data_tensors_to_feed.append((tensor, value))
+      else:
+        feed_dict[tensor] = value
+
+    if data_tensors_to_feed:
+      # This is a *temporary* workaround (i.e. hack) to feed a symbolic tensor
+      # to `feed_dict`. It is very inefficient. It will be removed as soon
+      # as it becomes possible to pass symbolic tensors to `feed_dict`.
+      data_tensor_values = session.run([x[1] for x in data_tensors_to_feed])
+      for i, v in enumerate(data_tensor_values):
+        feed_dict[data_tensors_to_feed[i][0]] = v
+
     fetches = self.outputs + [self.updates_op] + self.fetches
-    session = get_session()
     updated = session.run(
         fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
     return updated[:len(self.outputs)]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 971245c162..71de657da8 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -1181,6 +1181,9 @@ class Model(Network):
           batch_size=batch_size)
 
     elif validation_split and 0. < validation_split < 1.:
+      if training_utils.has_symbolic_tensors(x):
+        raise ValueError('If your data is in the form of symbolic tensors, '
+                         'you cannot use `validation_split`.')
       if hasattr(x[0], 'shape'):
         split_at = int(x[0].shape[0] * (1. - validation_split))
       else:
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 67858a578c..4cdb5f108a 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -31,9 +31,8 @@ from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras.engine import training_utils
-from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -173,6 +172,41 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
   return outs, total_loss, loss_metrics
 
 
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they folow  the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    converted_to_list = False
+    if not isinstance(arrays, list):
+      converted_to_list = True
+      arrays = [arrays]
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+    if converted_to_list:
+      slices = slices[0]
+    return slices
+  else:
+    return generic_utils.slice_arrays(arrays, indices)
+
+
 def _process_single_batch(model,
                           inputs,
                           targets,
@@ -270,9 +304,8 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  metric_names, metrics_results = _eager_metrics_fn(
+  _, metrics_results = _eager_metrics_fn(
       model, outs, targets)
-  model.metrics_names.append(metric_names)
   if not isinstance(loss, list):
     loss = [loss]
   return loss + loss_metrics + metrics_results
@@ -328,6 +361,12 @@ def fit_loop(
   Raises:
     ValueError: In case of invalid argument values.
   """
+  if not batch_size:
+    raise ValueError('With eager execution, `batch_size` should be specified.')
+  if steps_per_epoch or validation_steps:
+    raise ValueError('With eager execution, `steps_per_epoch` and '
+                     '`validation_steps` are not valid arguments '
+                     '(set `batch_size` instead).')
   # Required for Eager mode
   with backend.learning_phase_scope(1):
     do_validation = False
@@ -410,15 +449,18 @@ def fit_loop(
       elif shuffle:
         np.random.shuffle(index_array)
 
-      batches = make_batches(num_train_samples, batch_size)
+      batches = generic_utils.make_batches(num_train_samples, batch_size)
 
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
         try:
-          inputs_batch = slice_arrays(inputs, batch_ids)
-          targets_batch = slice_arrays(targets, batch_ids)
+          inputs_batch = slice_arrays(inputs, batch_ids,
+                                      contiguous=not shuffle)
+          targets_batch = slice_arrays(targets, batch_ids,
+                                       contiguous=not shuffle)
           if sample_weights:
-            sample_weights_batch = slice_arrays(sample_weights, batch_ids)
+            sample_weights_batch = slice_arrays(sample_weights, batch_ids,
+                                                contiguous=not shuffle)
           else:
             sample_weights_batch = None
         except TypeError:
@@ -539,8 +581,8 @@ def test_loop(model, inputs, targets,
         feed_data, batch_size=batch_size, steps=steps, steps_name='steps')
     outs = []
     if verbose == 1:
-      progbar = Progbar(target=num_samples)
-    batches = make_batches(num_samples, batch_size)
+      progbar = generic_utils.Progbar(target=num_samples)
+    batches = generic_utils.make_batches(num_samples, batch_size)
     index_array = np.arange(num_samples)
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
@@ -620,12 +662,12 @@ def predict_loop(model, inputs,
         inputs, batch_size, steps, 'steps')
     if verbose == 1:
       if steps is not None:
-        progbar = Progbar(target=steps)
+        progbar = generic_utils.Progbar(target=steps)
       else:
-        progbar = Progbar(target=num_samples)
+        progbar = generic_utils.Progbar(target=num_samples)
 
     outs = []
-    batches = make_batches(num_samples, batch_size)
+    batches = generic_utils.make_batches(num_samples, batch_size)
     index_array = np.arange(num_samples)
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index 8848b393d5..6cdb6b0753 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import numpy as np
 
 from tensorflow.python.framework import ops
@@ -308,6 +307,100 @@ class TrainingTest(test.TestCase):
       model.compile(loss=None,
                     optimizer='rms')
 
+  def test_model_methods_with_eager_tensors_multi_io(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a = keras.backend.zeros(shape=(10, 3))
+    input_b = keras.backend.zeros(shape=(10, 3))
+    target_d = keras.backend.zeros(shape=(10, 4))
+    target_e = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    # Test: no shuffle.
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    # Test: validation data.
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+    # Test: mix np and tensors.
+    input_b = np.zeros(shape=(10, 3)).astype('float32')
+    target_e = np.zeros(shape=(10, 4)).astype('float32')
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+  def test_model_methods_with_eager_tensors_single_io(self):
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
+    model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
+    model.fit(inputs, targets, epochs=1, batch_size=4, verbose=0,
+              validation_data=(inputs, targets))
+    model.evaluate(inputs, targets, batch_size=2, verbose=0)
+    model.predict(inputs, batch_size=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -533,14 +626,5 @@ class LossWeightingTest(test.TestCase):
 
 
 if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
-
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index fd91dbba52..08fd26dd18 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -1117,6 +1117,121 @@ class TestTrainingUtils(test.TestCase):
 
 class TestTrainingWithDataTensors(test.TestCase):
 
+  def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = keras.backend.zeros(shape=(10, 3))
+      targets = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
+  def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      loss_weights = [1., 0.5]
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+      input_a_tf = keras.backend.zeros(shape=(10, 3))
+      input_b_tf = keras.backend.zeros(shape=(10, 3))
+
+      output_d_tf = keras.backend.zeros(shape=(10, 4))
+      output_e_tf = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'should specify the `steps_per_epoch`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=1,
+            batch_size=5,
+            verbose=0)
+      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
+      # Test with dictionary inputs
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          validation_data=({'input_a': input_a_tf,
+                            'input_b': input_b_tf},
+                           {'dense': output_d_tf,
+                            'dropout': output_e_tf}),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      model.train_on_batch(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf})
+
+      # Test with validation data
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          validation_data=([input_a_tf, input_b_tf],
+                           [output_d_tf, output_e_tf]),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=2,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.2,
+            validation_steps=2)
+
+      # Test evaluation / prediction methods
+      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                     steps=2, verbose=0)
+      model.predict([input_a_tf, input_b_tf], steps=2)
+      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
   def test_model_with_input_feed_tensor(self):
     """We test building a model with a TF variable as input.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 105638ce10..76537b735f 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -22,6 +22,7 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import losses
@@ -64,15 +65,29 @@ def check_num_samples(ins,
     if batch_size is not None:
       raise ValueError(
           'If ' + steps_name + ' is set, the `batch_size` must be None.')
-  elif ins and hasattr(ins[0], 'shape'):
-    num_samples = ins[0].shape[0]
-  else:
+  if has_symbolic_tensors(ins) and steps is None:
+    raise ValueError('If your data is in the form of symbolic tensors, '
+                     'you should specify the `' + steps_name + '` argument '
+                     '(instead of the `batch_size` argument).')
+  if ins and hasattr(ins[0], 'shape'):
+    num_samples = int(ins[0].shape[0])
+  elif steps is None:
     raise ValueError(
         'Either the input data should have '
         'a defined shape, or ' + steps_name + ' should be specified.')
   return num_samples
 
 
+def standardize_single_array(x):
+  if x is None:
+    return None
+  elif tensor_util.is_tensor(x):
+    return x
+  elif x.ndim == 1:
+    x = np.expand_dims(x, 1)
+  return x
+
+
 def standardize_input_data(data,
                            names,
                            shapes=None,
@@ -130,9 +145,7 @@ def standardize_input_data(data,
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
-  data = [
-      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
-  ]
+  data = [standardize_single_array(x) for x in data]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
@@ -158,7 +171,7 @@ def standardize_input_data(data,
   # Check shapes compatibility.
   if shapes:
     for i in range(len(names)):
-      if shapes[i] is not None:
+      if shapes[i] is not None and not tensor_util.is_tensor(data[i]):
         data_shape = data[i].shape
         shape = shapes[i]
         if data[i].ndim != len(shape):
@@ -245,12 +258,13 @@ def check_array_lengths(inputs, targets, weights=None):
   """
 
   def set_of_lengths(x):
-    # return a set with the variation between
+    # Returns a set with the variation between
     # different shapes, with None => 0
     if x is None:
       return {}
     else:
-      return set([y.shape[0] for y in x if y is not None])
+      return set([y.shape[0] for y in x
+                  if y is not None and not tensor_util.is_tensor(y)])
 
   set_x = set_of_lengths(inputs)
   set_y = set_of_lengths(targets)
@@ -532,3 +546,8 @@ def standardize_weights(y,
     return weights
   else:
     return None
+
+
+def has_symbolic_tensors(ls):
+  return (any(tensor_util.is_tensor(v) for v in ls)
+          and not context.executing_eagerly())
-- 
GitLab


From fa047c0e5976bb92d91afedb5961a697cb3be13b Mon Sep 17 00:00:00 2001
From: Jon Shlens <shlens@google.com>
Date: Tue, 3 Apr 2018 15:55:59 -0700
Subject: [PATCH 0220/1262] Add Group Normalization to tf.contrib.layers.

# Example usage: NHWC
outputs = tf.contrib.layers.group_norm(inputs, groups=32, channels_axis=-1, reduction_axes=[-3, -2])
# Example usage: NCHW
outputs = tf.contrib.layers.group_norm(inputs, groups=32, channels_axis=-3, reduction_axes=[-2, -1])
PiperOrigin-RevId: 191513496
---
 tensorflow/contrib/layers/__init__.py         |   2 +
 .../layers/python/layers/normalization.py     | 195 +++++++++++++++
 .../python/layers/normalization_test.py       | 226 ++++++++++++++++++
 3 files changed, 423 insertions(+)

diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 337c9e06b8..00f03a111a 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -104,6 +104,7 @@ See the @{$python/contrib.layers} guide.
 @@infer_real_valued_columns
 @@sequence_input_from_feature_columns
 
+@@group_norm
 @@instance_norm
 """
 
@@ -122,6 +123,7 @@ _allowed_symbols = ['bias_add',
                     'conv3d',
                     'elu',
                     'feature_column',
+                    'group_norm',
                     'instance_norm',
                     'legacy_fully_connected',
                     'legacy_linear',
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index e7d4080ff7..c807ab0f2e 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -24,11 +24,13 @@ from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variable_scope
 
 
 __all__ = [
+    'group_norm',
     'instance_norm',
 ]
 
@@ -158,3 +160,196 @@ def instance_norm(inputs,
     if activation_fn is not None:
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
+@add_arg_scope
+def group_norm(inputs,
+               groups=32,
+               channels_axis=-1,
+               reduction_axes=(-3, -2),
+               center=True,
+               scale=True,
+               epsilon=1e-6,
+               activation_fn=None,
+               param_initializers=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               scope=None):
+  """Functional interface for the group normalization layer.
+
+  Reference: https://arxiv.org/abs/1803.08494.
+
+    "Group Normalization", Yuxin Wu, Kaiming He
+
+  Args:
+    inputs: A Tensor with at least 2 dimensions one which is channels. All
+     shape dimensions must be fully defined.
+    groups: Integer. Divide the channels into this number of groups over which
+      normalization statistics are computed. This number must be commensurate
+      with the number of channels in `inputs`.
+    channels_axis: An integer. Specifies index of channels axis which will be
+      broken into `groups`, each of which whose statistics will be computed
+      across. Must be mutually exclusive with `reduction_axes`. Preferred usage
+      is to specify negative integers to be agnostic as to whether a batch
+      dimension is included.
+    reduction_axes: Tuple of integers. Specifies dimensions over which
+       statistics will be accumulated. Must be mutually exclusive with
+       `channels_axis`. Statistics will not be accumulated across axes not
+       specified in `reduction_axes` nor `channel_axis`. Preferred usage is to
+       specify negative integers to be agnostic to whether a batch dimension is
+       included.
+
+      Some sample usage cases:
+        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
+        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]
+
+    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
+      is ignored.
+    scale: If True, multiply by `gamma`. If False, `gamma` is
+      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
+      disabled since the scaling can be done by the next layer.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    activation_fn: Activation function, default set to None to skip it and
+      maintain a linear activation.
+    param_initializers: Optional initializers for beta, gamma, moving mean and
+      moving variance.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional collections for the variables.
+    outputs_collections: Collections to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    scope: Optional scope for `variable_scope`.
+
+  Returns:
+    A `Tensor` representing the output of the operation.
+
+  Raises:
+    ValueError: If the rank of `inputs` is undefined.
+    ValueError: If rank or channels dimension of `inputs` is undefined.
+    ValueError: If number of groups is not commensurate with number of channels.
+    ValueError: If reduction_axes or channels_axis are out of bounds.
+    ValueError: If reduction_axes are not mutually exclusive with channels_axis.
+  """
+  # TODO(shlens): Support partially defined shapes for the inputs.
+  inputs = ops.convert_to_tensor(inputs)
+  original_shape = inputs.shape
+
+  if inputs.shape.ndims is None:
+    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+  if channels_axis > (inputs.shape.ndims - 1):
+    raise ValueError('Axis is out of bounds.')
+
+  # Standardize the channels_axis to be positive and identify # of channels.
+  if channels_axis < 0:
+    channels_axis = inputs.shape.ndims + channels_axis
+  channels = inputs.shape[channels_axis].value
+
+  if channels is None:
+    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
+        inputs.name, channels_axis))
+
+  # Standardize the reduction_axes to be positive.
+  reduction_axes = list(reduction_axes)
+  for i in range(len(reduction_axes)):
+    if reduction_axes[i] < 0:
+      reduction_axes[i] += inputs.shape.ndims
+
+  for a in reduction_axes:
+    if a > inputs.shape.ndims:
+      raise ValueError('Axis is out of bounds.')
+    if inputs.shape[a].value is None:
+      raise ValueError('Inputs %s has undefined dimensions %d.' % (
+          inputs.name, a))
+    if channels_axis == a:
+      raise ValueError('reduction_axis must be mutually exclusive '
+                       'with channels_axis')
+  if groups > channels:
+    raise ValueError('Invalid groups %d for %d channels.' % (groups, channels))
+  if channels % groups != 0:
+    raise ValueError('%d channels is not commensurate with %d groups.' %
+                     (channels, groups))
+
+  # Determine axes before channels. Some examples of common image formats:
+  #  'NCHW': before = [N], after = [HW]
+  #  'NHWC': before = [NHW], after = []
+  axes_before_channels = inputs.shape.as_list()[:channels_axis]
+  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+
+  # Manually broadcast the parameters to conform to the number of groups.
+  params_shape_broadcast = ([1] * len(axes_before_channels) +
+                            [groups, channels // groups] +
+                            [1] * len(axes_after_channels))
+
+  # Reshape the input by the group within the channel dimension.
+  inputs_shape = (axes_before_channels + [groups, channels // groups] +
+                  axes_after_channels)
+  inputs = array_ops.reshape(inputs, inputs_shape)
+
+  # Determine the dimensions across which moments are calculated.
+  moments_axes = [channels_axis + 1]
+  for a in reduction_axes:
+    if a > channels_axis:
+      moments_axes.append(a + 1)
+    else:
+      moments_axes.append(a)
+
+  with variable_scope.variable_scope(
+      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
+    # Note that the params_shape is the number of channels always.
+    params_shape = [channels]
+
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    dtype = inputs.dtype.base_dtype
+    if param_initializers is None:
+      param_initializers = {}
+    if center:
+      beta_collections = utils.get_variable_collections(
+          variables_collections, 'beta')
+      beta_initializer = param_initializers.get(
+          'beta', init_ops.zeros_initializer())
+      beta = variables.model_variable('beta',
+                                      shape=params_shape,
+                                      dtype=dtype,
+                                      initializer=beta_initializer,
+                                      collections=beta_collections,
+                                      trainable=trainable)
+      beta = array_ops.reshape(beta, params_shape_broadcast)
+
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma_initializer = param_initializers.get(
+          'gamma', init_ops.ones_initializer())
+      gamma = variables.model_variable('gamma',
+                                       shape=params_shape,
+                                       dtype=dtype,
+                                       initializer=gamma_initializer,
+                                       collections=gamma_collections,
+                                       trainable=trainable)
+      gamma = array_ops.reshape(gamma, params_shape_broadcast)
+
+    # Calculate the moments.
+    mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
+
+    # Compute normalization.
+    # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
+    # appropriately so that this operation may be faster.
+    gain = math_ops.rsqrt(variance + epsilon)
+    offset = -mean * gain
+    if gamma is not None:
+      gain *= gamma
+      offset *= gamma
+    if beta is not None:
+      offset += beta
+    outputs = inputs * gain + offset
+
+    # Collapse the groups into the channel dimension.
+    outputs = array_ops.reshape(outputs, original_shape)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index 5cff1bf0eb..b6e96350db 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -166,5 +166,231 @@ class InstanceNormTest(test.TestCase):
   def testOutputBigInput5DNCHW(self):
     self.doOutputTest((1, 100, 100, 1, 1), 'NCHW', tol=1e-3)
 
+
+class GroupNormTest(test.TestCase):
+
+  def testInvalidGroupSize(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 2, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Invalid groups 10 for 2 channels.'):
+      normalization.group_norm(inputs, groups=10,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testBadCommensurateGroup(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 4, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 '4 channels is not commensurate with '
+                                 '3 groups.'):
+      normalization.group_norm(inputs, groups=3,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testAxisIsBad(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 2, 4, 5))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, channels_axis=5)
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, reduction_axes=[1, 5])
+
+  def testNotMutuallyExclusiveAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(10, 32, 32, 32))
+    # Specify axis with negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[-2])
+    # Specify axis with positive values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=1, reduction_axes=[1, 3])
+    # Specify axis with mixed positive and negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[2])
+
+  def testUnknownShape(self):
+    inputs = array_ops.placeholder(dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'undefined rank'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedReductionAxes(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 32, None, 4))
+    with self.assertRaisesRegexp(ValueError, 'undefined dimensions'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedChannelsAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 3, 4, None))
+    with self.assertRaisesRegexp(ValueError, 'undefined channel dimension'):
+      normalization.group_norm(inputs, channels_axis=-1,
+                               reduction_axes=[-3, -2])
+
+  def testCreateOp(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
+    output = normalization.group_norm(images, groups=groups, channels_axis=-1,
+                                      reduction_axes=[-3, -2])
+    print('name: ', output.op.name)
+    self.assertListEqual([5, height, width, 2*groups], output.shape.as_list())
+
+  def testCreateOpFloat64(self):
+    height, width, groups = 3, 3, 5
+    images = random_ops.random_uniform(
+        (5, height, width, 4*groups), dtype=dtypes.float64, seed=1)
+    output = normalization.group_norm(images, groups=groups)
+    self.assertEqual(dtypes.float64, output.dtype)
+    self.assertListEqual([5, height, width, 4*groups], output.shape.as_list())
+
+  def testCreateOpNoScaleCenter(self):
+    height, width, groups = 3, 3, 7
+    images = random_ops.random_uniform(
+        (5, height, width, 3*groups), dtype=dtypes.float32, seed=1)
+    output = normalization.group_norm(images, groups=groups, center=False,
+                                      scale=False)
+    self.assertListEqual([5, height, width, 3*groups], output.shape.as_list())
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('beta')))
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('gamma')))
+
+  def testCreateVariables_NHWC(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 8), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-1, reduction_axes=(-3, -2),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testCreateVariables_NCHW(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, 2*groups, height, width), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-3, reduction_axes=(-2, -1),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testReuseVariables(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 4), seed=1)
+    normalization.group_norm(images, groups=2, scale=True, scope='IN')
+    normalization.group_norm(images, groups=2, scale=True, scope='IN',
+                             reuse=True)
+    beta = contrib_variables.get_variables_by_name('beta')
+    gamma = contrib_variables.get_variables_by_name('gamma')
+    self.assertEqual(1, len(beta))
+    self.assertEqual(1, len(gamma))
+
+  def testValueCorrectWithReuseVars(self):
+    height, width = 3, 3
+    image_shape = (10, height, width, 4)
+    images = random_ops.random_uniform(image_shape, seed=1)
+    output_train = normalization.group_norm(images, groups=2, scope='IN')
+    output_eval = normalization.group_norm(images, groups=2, scope='IN',
+                                           reuse=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      # output_train and output_eval should be the same.
+      train_np, eval_np = sess.run([output_train, output_eval])
+      self.assertAllClose(train_np, eval_np)
+
+  def doOutputTest(self, input_shape, channels_axis=None, reduction_axes=None,
+                   groups=2, tol=1e-2):
+    # Select the axis for the channel and the dimensions along which statistics
+    # are accumulated.
+    if channels_axis < 0:
+      channels_axis += len(input_shape)
+    reduced_axes = [channels_axis + 1]
+    for a in reduction_axes:
+      if a < 0:
+        a += len(input_shape)
+      if a < channels_axis:
+        reduced_axes.append(a)
+      else:
+        reduced_axes.append(a+1)
+    reduced_axes = tuple(reduced_axes)
+
+    # Calculate the final shape for the output Tensor.
+    axes_before_channels = input_shape[:channels_axis]
+    axes_after_channels = input_shape[channels_axis+1:]
+    channels = input_shape[channels_axis]
+    outputs_shape = (axes_before_channels + [groups, channels // groups] +
+                     axes_after_channels)
+
+    # Calculate the final shape for the output statistics.
+    reduced_shape = []
+    for i, a in enumerate(outputs_shape):
+      if i not in reduced_axes:
+        reduced_shape.append(a)
+
+    for mu in (0.0, 1e2):
+      for sigma in (1.0, 0.1):
+        # Determine shape of Tensor after normalization.
+        expected_mean = np.zeros(reduced_shape)
+        expected_var = np.ones(reduced_shape)
+
+        inputs = random_ops.random_uniform(input_shape, seed=0) * sigma + mu
+        output_op = normalization.group_norm(
+            inputs, groups=groups, center=False, scale=False,
+            channels_axis=channels_axis,
+            reduction_axes=reduction_axes)
+        with self.test_session() as sess:
+          sess.run(variables.global_variables_initializer())
+          outputs = sess.run(output_op)
+          # Make sure that there are no NaNs
+          self.assertFalse(np.isnan(outputs).any())
+
+          outputs = np.reshape(outputs, outputs_shape)
+          mean = np.mean(outputs, axis=reduced_axes)
+          var = np.var(outputs, axis=reduced_axes)
+          # The mean and variance of each example should be close to 0 and 1
+          # respectively.
+          self.assertAllClose(expected_mean, mean, rtol=tol, atol=tol)
+          self.assertAllClose(expected_var, var, rtol=tol, atol=tol)
+
+  def testOutputSmallInput4D_NHWC(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=3, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput3D_NHWC(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=2, reduction_axes=[0, 1])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput4D_NCHW(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=1, reduction_axes=[2, 3])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputSmallInput3D_NCHW(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=0, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputBigInput4D_NHWC(self):
+    self.doOutputTest([5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2],
+                      groups=1)
+
+  def testOutputBigInput4D_NCHW(self):
+    self.doOutputTest([1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3],
+                      groups=4)
+
+  def testOutputSmallInput2D_NC(self):
+    self.doOutputTest([10, 7*100], channels_axis=1, reduction_axes=[], groups=7)
+
+  def testOutputSmallInput5D_NCXXX(self):
+    self.doOutputTest([10, 10, 20, 40, 5],
+                      channels_axis=1,
+                      reduction_axes=[2, 3, 4],
+                      groups=5)
+
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 9a0f96d0ba8770e8089bce9916de10d117b082e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 15:58:06 -0700
Subject: [PATCH 0221/1262] Add forward_from_array to OpKernelContext::Params.

This is an optional reservation on the forwarding of a particular
input tensor to a particular output that will be used by future
optimizations.

PiperOrigin-RevId: 191513782
---
 tensorflow/core/common_runtime/executor.cc    | 34 ++++++---
 tensorflow/core/framework/op_kernel.cc        | 72 ++++++++++++++-----
 tensorflow/core/framework/op_kernel.h         | 42 +++++++++--
 tensorflow/core/kernels/assign_op.h           |  3 +-
 .../core/kernels/resource_variable_ops.cc     | 13 ++--
 5 files changed, 129 insertions(+), 35 deletions(-)

diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 195803fd7f..0c461a9ee9 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -258,6 +258,13 @@ struct NodeItem {
   // Return array of per-output allocator attributes.
   const AllocatorAttributes* output_attrs() const { return output_attr_base(); }
 
+  // Return array of expected input index from which each output should
+  // be forwarded:
+  // kNeverForward (-2) for DO NOT FORWARD (must allocate).
+  // kNoReservation (-1) for no expected forwarding.
+  // 0... for forward from that input.
+  const int* forward_from() const { return forward_from_base(); }
+
  private:
   friend class GraphView;
 
@@ -267,6 +274,7 @@ struct NodeItem {
   //   AllocatorAttributes output_attr[num_outputs];
   //   uint8               input_type[num_inputs];
   //   uint8               output_type[num_outputs];
+  //   int                 forward_from[num_outputs];
 
   // Return pointer to variable length section.
   char* var() const {
@@ -292,6 +300,13 @@ struct NodeItem {
         sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);
   }
 
+  int* forward_from_base() const {
+    return reinterpret_cast<int*>(var() + sizeof(EdgeInfo) * num_output_edges +
+                                  sizeof(AllocatorAttributes) * num_outputs +
+                                  sizeof(uint8) * num_inputs +
+                                  sizeof(uint8) * num_outputs);
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(NodeItem);
 };
 
@@ -466,7 +481,8 @@ size_t GraphView::NodeItemBytes(const Node* n) {
       + num_output_edges * sizeof(EdgeInfo)        // output_edges[...]
       + num_outputs * sizeof(AllocatorAttributes)  // output_attr[...]
       + num_inputs * sizeof(uint8)                 // input_type[num_inputs]
-      + num_outputs * sizeof(uint8);               // output_type[num_outputs]
+      + num_outputs * sizeof(uint8)                // output_type[num_outputs]
+      + num_outputs * sizeof(int);                 // forward_from[num_outputs]
   static constexpr size_t kItemAlignment = sizeof(NodeItem*);
   static_assert(kItemAlignment % alignof(NodeItem) == 0,
                 "NodeItem must be aligned with kItemAlignment");
@@ -737,8 +753,8 @@ Status InferAllocAttr(const Node* n, const Node* dst,
       VLOG(2) << "node " << n->name() << " is the sink of an RPC in";
     } else if ((local_dev_name.type == "CPU" || n->IsHostRecv()) &&
                parsed_src_name.type != "CPU") {
-      // Value is going to be the sink of a local DMA from GPU to CPU (or other
-      // types of accelerators).
+      // Value is going to be the sink of a local DMA from GPU to CPU (or
+      // other types of accelerators).
       attr->set_gpu_compatible(true);
       VLOG(2) << "node " << n->name() << " is the sink of a gpu->cpu copy";
     } else {
@@ -1091,7 +1107,8 @@ class ExecutorState {
     void ActivateLoopInvs(const GraphView* gview, int64 iter,
                           TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
-    // Add a new loop invariant and make it available to all active iterations.
+    // Add a new loop invariant and make it available to all active
+    // iterations.
     void AddLoopInv(const NodeItem* item, const Entry& value,
                     TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
@@ -1148,8 +1165,8 @@ class ExecutorState {
         if (front_index_ == ready_.size()) {
           ready_.clear();
         } else {
-          // Lots of unused entries at beginning of vector: move everything down
-          // to start of vector.
+          // Lots of unused entries at beginning of vector: move everything
+          // down to start of vector.
           ready_.erase(ready_.begin(), ready_.begin() + front_index_);
         }
         front_index_ = 0;
@@ -1597,6 +1614,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
       params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter);
       params.is_input_dead = is_input_dead;
       params.output_attr_array = item.output_attrs();
+      params.forward_from_array = nullptr;  // later: item.forward_from();
 
       if (item.kernel_is_async) {
         // Asynchronous computes.
@@ -2605,7 +2623,7 @@ void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) {
   (new ExecutorState(args, this))->RunAsync(std::move(done));
 }
 
-}  // end namespace
+}  // namespace
 
 Status NewLocalExecutor(const LocalExecutorParams& params,
                         std::unique_ptr<const Graph> graph,
@@ -2631,4 +2649,4 @@ Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
 
 void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 6ba196cc34..cfde1e8ea3 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -96,7 +96,7 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       output_memory_types_(context->output_memory_types().begin(),
                            context->output_memory_types().end()),
       graph_def_version_(context->graph_def_version()),
-      is_internal_(str_util::StartsWith(type_string(), "_")),
+      is_internal_(StringPiece(type_string()).starts_with("_")),
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
@@ -365,7 +365,7 @@ Status OpKernelContext::input_ref_mutex(StringPiece name, mutex** out_mutex) {
 
 const Tensor& OpKernelContext::input(int index) {
   DCHECK_GE(index, 0);
-  DCHECK_LT(index, num_inputs());
+  DCHECK_LT(index, num_inputs()) << " name: " << op_kernel().name();
   DCHECK(!input_is_ref(index));
   const Tensor& tensor = *((*params_->inputs)[index].tensor);
   record_tensor_reference(tensor);
@@ -420,8 +420,8 @@ bool OpKernelContext::forward_input_to_output_with_shape(
                                ? AllocatorAttributes()
                                : output_alloc_attr(output_index);
   std::unique_ptr<Tensor> new_tensor = forward_input(
-      input_index, expected_output_dtype(output_index), output_shape,
-      output_memory_type(output_index), output_attr);
+      input_index, output_index, expected_output_dtype(output_index),
+      output_shape, output_memory_type(output_index), output_attr);
   if (new_tensor != nullptr) {
     // Transfer ownership to the output slot in OpKernelContext.
     outputs_[output_index] = TensorValue(new_tensor.release());
@@ -461,35 +461,66 @@ Status OpKernelContext::forward_input_to_output_with_shape(
 }
 
 std::unique_ptr<Tensor> OpKernelContext::forward_input(
-    int input_index, DataType output_dtype, const TensorShape& output_shape,
-    MemoryType output_memory_type, const AllocatorAttributes& output_attr) {
+    int input_index, int output_index, DataType output_dtype,
+    const TensorShape& output_shape, MemoryType output_memory_type,
+    const AllocatorAttributes& output_attr) {
   DCHECK_GE(input_index, 0);
   DCHECK_LT(input_index, num_inputs());
   const TensorValue& input = (*params_->inputs)[input_index];
-  // Check that input tensor exists, is not a ref, and has no other consumers.
-  if (input.tensor == nullptr || input.is_ref() || !input->RefCountIsOne()) {
+  // Check whether at graph construction time this output was marked
+  // either for no forwarding or with a reservation for this input.
+  // If it's reserved for this input we'll skip the refcount and
+  // AllocatorAttribute checks.
+  // TODO(tucker): Maybe we should skip all of the checks?
+  bool never_forward =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == Params::kNeverForward);
+  if (never_forward) return nullptr;
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == input_index);
+  if (!forward_expected && params_->forward_from_array != nullptr) {
+    // Check for possibly conflicting forward.
+    for (int i = 0; i < num_outputs(); ++i) {
+      if (params_->forward_from_array[i] == input_index) {
+        // This input is reserved for output i.
+        return nullptr;
+      }
+    }
+  }
+  // Check that input tensor exists and is not a ref.
+  if (input.tensor == nullptr || input.is_ref()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input type matches.
   if (input_dtype(input_index) != output_dtype) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that the input and output sizes are compatible.
   if (input.tensor->shape().num_elements() != output_shape.num_elements()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input and output memory types match, i.e.
   // that they either both live in host or both live in device memory.
   if (input_memory_type(input_index) != output_memory_type) {
+    CHECK(!forward_expected);
     return nullptr;
   }
-  // Check that output allocator attributes are not more restrictive than
-  // input allocator attributes.
-  const auto input_attr = params_->input_alloc_attrs == nullptr
-                              ? AllocatorAttributes()
-                              : input_alloc_attr(input_index);
-  if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
-    return nullptr;
+  if (!forward_expected) {
+    if (!input->RefCountIsOne()) {
+      return nullptr;
+    }
+    // Check that output allocator attributes are not more restrictive than
+    // input allocator attributes.
+    const auto input_attr = params_->input_alloc_attrs == nullptr
+                                ? AllocatorAttributes()
+                                : input_alloc_attr(input_index);
+    if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
+      return nullptr;
+    }
   }
   // TODO(rmlarsen): Use MakeUnique here. There is already a copy in
   // tensorflow/compiler/xla/ptr_util.h. Perhaps this should be part of
@@ -505,7 +536,8 @@ Status OpKernelContext::forward_input_or_allocate_temp(
     Tensor* out_temp) {
   for (int input_index : candidate_input_indices) {
     std::unique_ptr<Tensor> new_tensor =
-        forward_input(input_index, type, shape, DEVICE_MEMORY, allocator_attr);
+        forward_input(input_index, Params::kNoReservation /*output_index*/,
+                      type, shape, DEVICE_MEMORY, allocator_attr);
     if (new_tensor != nullptr) {
       *out_temp = std::move(*new_tensor);
       return Status::OK();
@@ -595,6 +627,14 @@ Status OpKernelContext::allocate_output(int index, const TensorShape& shape,
                                         Tensor** output) {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, num_outputs());
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && index >= 0 &&
+       params_->forward_from_array[index] >= 0);
+  if (forward_expected) {
+    return errors::Internal(
+        "Explicit allocate_output call where input forwarding required.  Try "
+        "turning off the ScopedAllocator optimizer.");
+  }
   AllocatorAttributes attr = output_alloc_attr(index);
   return allocate_output(index, shape, output, attr);
 }
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 2d97160830..67943377b9 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -64,10 +64,11 @@ class AsyncOpKernel;
 class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
-class OpKernelContext;       // declared below
+class OpKernelContext;       // declared below,
 class OpRegistryInterface;
 class ResourceMgr;
 class ScopedStepContainer;
+class CollectiveExecutor;
 class StepStatsCollector;
 
 class OpKernel {
@@ -532,6 +533,10 @@ class OpKernelContext {
     // computations running on other devices.
     Rendezvous* rendezvous = nullptr;
 
+    // Mechanism for executing a collective op that needs to coordinate
+    // with parallel instances runing on other devices.
+    CollectiveExecutor* collective_executor = nullptr;
+
     // The session state for this op.
     SessionState* session_state = nullptr;
 
@@ -565,6 +570,12 @@ class OpKernelContext {
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
+
+    // Support for forwarding reservations (used by ScopedAllocator).
+    static const int kNeverForward = -2;
+    static const int kNoReservation = -1;
+    // Values in [0,...) represent reservations for the indexed output.
+    const int* forward_from_array = nullptr;
   };
 
   // params must outlive the OpKernelContext.
@@ -707,14 +718,31 @@ class OpKernelContext {
   //     input[input_index] are compatible with those given in dtype, shape,
   //     memory_type, and attr,
   //   * refcount on the underlying buffer is one.
+  //   * Either there is no forwarding reservation for either input_index
+  //     or output_index or the specified input is reserved for the specified
+  //     output. More precisely:
+  //
+  //     These cases mean neither input nor output has a reservation:
+  //        forward_from_array = nullptr
+  //     OR (input_index is not in forward_from_array AND
+  //         (output_index == kNoReservation OR
+  //          forward_from_array[output_index] == kNoReservation))
+  //
+  //     This case means that input_index is reserved for output_index:
+  //        forward_from_array[output_index] == input_index
+  //
+  //     This case means the output is reserved to always be allocated,
+  //     never assigned a forwarded input:
+  //        forward_from_array[output_index] == kNeverForward
+  //
   // Otherwise returns nullptr.
   // NOTE: For Cuda kernels that read inputs using the __ldg() intrinsic,
   // forwarding is only safe if there are no reads via __ldg() after writes
   // to the same address.
   std::unique_ptr<Tensor> forward_input(
-      int input_index, DataType dtype, const TensorShape& shape,
-      MemoryType memory_type,
-      const AllocatorAttributes& attr) TF_MUST_USE_RESULT;
+      int input_index, int output_index, DataType output_dtype,
+      const TensorShape& output_shape, MemoryType output_memory_type,
+      const AllocatorAttributes& output_attr) TF_MUST_USE_RESULT;
 
   // Tries to forward one of the inputs given in input_indices to
   // output[output_index]. If none of the given inputs can be forwarded, calls
@@ -934,6 +962,10 @@ class OpKernelContext {
   // Rendezvous Send() and Recv().
   Rendezvous* rendezvous() const { return params_->rendezvous; }
 
+  CollectiveExecutor* collective_executor() const {
+    return params_->collective_executor;
+  }
+
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
@@ -1102,7 +1134,7 @@ class OpKernelContext {
 
   Status status_;
   friend class CollectiveExecutor;  // for access to params_
-  Params* params_;    // not owned
+  Params* params_;                  // not owned
   mutable mutex mu_;  // mutable so const accessors can acquire the lock
   gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators_ GUARDED_BY(mu_);
   gtl::InlinedVector<TensorValue, 4> outputs_;
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index a312e8e8a4..2ed1628bf1 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -77,7 +77,8 @@ class AssignOp : public OpKernel {
 
       // 1. Try to reuse the rhs.
       std::unique_ptr<Tensor> input_alias = context->forward_input(
-          1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
+          1, OpKernelContext::Params::kNoReservation /*output_index*/,
+          old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
       if (input_alias != nullptr) {
         // Transfer ownership to the ref.
         context->replace_ref_input(0, *input_alias.release(),
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 082b57b8e2..5c54609ee6 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -250,9 +250,9 @@ class AssignVariableOp : public OpKernel {
 
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
-    std::unique_ptr<Tensor> input_alias =
-        context->forward_input(1, dtype_, value.shape(), DEVICE_MEMORY, attr);
-
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, dtype_,
+        value.shape(), DEVICE_MEMORY, attr);
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     if (input_alias) {
@@ -370,11 +370,14 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
     // Note that Variant objects themselves always reside on host.
-    std::unique_ptr<Tensor> input_alias =
-        context->forward_input(1, DT_VARIANT, value.shape(), HOST_MEMORY, attr);
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
+        value.shape(), HOST_MEMORY, attr);
 
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
+    *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+
     if (input_alias) {
       *variable->tensor() = *input_alias;
       return;
-- 
GitLab


From b3b96493e7ff08d7c026926f92f97c51ecaf9aa3 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 3 Apr 2018 16:18:19 -0700
Subject: [PATCH 0222/1262] [TF]Disable memory sanitizer for
 compilation_passes_test while the problem is being

PiperOrigin-RevId: 191516643
---
 tensorflow/compiler/jit/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 24aa203c00..e7d18e8351 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -347,6 +347,7 @@ tf_cc_test(
         "encapsulate_subgraphs_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
     ],
+    tags = ["nomsan"],  # TODO: b/77543571
     deps = [
         ":common",
         ":compilation_passes",
-- 
GitLab


From 10055506747ced8773863f4695c78661e6059e32 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 3 Apr 2018 16:20:15 -0700
Subject: [PATCH 0223/1262] Fold more constants

PiperOrigin-RevId: 191516861
---
 tensorflow/core/grappler/optimizers/constant_folding.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 87052c7ba0..dd522aa228 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -747,10 +747,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (op.find("Quantized") != string::npos || op.find("Sparse") == 0) {
     return false;
   }
-  if (node.attr().count("_XlaCompile") > 0 &&
-      node.attr().at("_XlaCompile").b()) {
-    return false;
-  }
 
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
-- 
GitLab


From fa2a9473923bb6f20454623810a1b23cbeb7de1c Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Tue, 3 Apr 2018 16:35:49 -0700
Subject: [PATCH 0224/1262] Minor touch-ups to the eager programming guide.

PiperOrigin-RevId: 191518771
---
 .../docs_src/programmers_guide/eager.md       | 43 +++++++++----------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index 8db65737dc..414653c280 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -1,35 +1,34 @@
 # Eager Execution
 
 TensorFlow's eager execution is an imperative programming environment that
-evaluates operations immediately, without an extra graph-building step.
-Operations return concrete values instead of constructing a computational graph
-to run later. This makes it easy to get started with TensorFlow, debug models,
-reduce boilerplate code, and is fun! To follow along with this guide, run the
-code samples below in an interactive `python` interpreter.
-
-Eager execution supports most TensorFlow operations and GPU acceleration.
-Automatic differentiation uses a dynamically-constructed tape instead of a static
-graph to compute gradients. Eager execution is a flexible machine learning
-platform for research and experimentation that provides:
-
-* *An intuitive interface* —Structure your code naturally and use Python data
+evaluates operations immediately, without building graphs: operations return
+concrete values instead of constructing a computational graph to run later. This
+makes it easy to get started with TensorFlow and debug models, and it
+reduces boilerplate as well. To follow along with this guide, run the code
+samples below in an interactive `python` interpreter.
+
+Eager execution is a flexible machine learning platform for research and
+experimentation, providing:
+
+* *An intuitive interface*—Structure your code naturally and use Python data
   structures. Quickly iterate on small models and small data.
-* *Easier debugging* —Call ops directly to inspect running models and test
+* *Easier debugging*—Call ops directly to inspect running models and test
   changes. Use standard Python debugging tools for immediate error reporting.
-* *Natural control flow* —Use Python control flow instead of graph control flow,
-  including support for dynamic models.
+* *Natural control flow*—Use Python control flow instead of graph control
+  flow, simplifying the specification of dynamic models.
 
-For a collection of examples running in eager execution, see:
+Eager execution supports most TensorFlow operations and GPU acceleration. For a
+collection of examples running in eager execution, see:
 [tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
 
-Note: Some models may experience increased overhead with eager execution enabled.
-Performance improvements are ongoing, but please
+Note: Some models may experience increased overhead with eager execution
+enabled. Performance improvements are ongoing, but please
 [file a bug](https://github.com/tensorflow/tensorflow/issues) if you find a
 problem and share your benchmarks.
 
 ## Setup and basic usage
 
-Upgrade to TensorFlow 1.7 to include updates for eager execution:
+Upgrade to the latest version of TensorFlow:
 
 ```
 $ pip install --upgrade tensorflow
@@ -110,7 +109,7 @@ environments and is useful for writing code to [work with graphs](#work_with_gra
 import tensorflow.contrib.eager as tfe
 ```
 
-## Eager training
+## Updating model parameters
 
 ### Automatic differentiation
 
@@ -124,7 +123,7 @@ operations for computing gradients later.
 not tracing. Since different operations can occur during each call, all
 forward-pass operations get recorded to a "tape". To compute the gradient, play
 the tape backwards and then discard. A particular `tfe.GradientTape` can only
-be computed once, subsequent calls throw a runtime error.
+compute one gradient; subsequent calls throw a runtime error.
 
 ```py
 w = tfe.Variable([[1.0]])
@@ -415,7 +414,7 @@ result = model(batch)
 This example uses the
 [dataset.py module](https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py)
 from the
-[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist),
+[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist);
 download this file to your local directory. Run the following to download the
 MNIST data files to your working directory and prepare a `tf.data.Dataset`
 for training:
-- 
GitLab


From 0520c62b0955af879c48abf87df56f07c4c37b54 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 16:46:29 -0700
Subject: [PATCH 0225/1262] This is a cosmetic change to logging, which makes
 it easier to understand this message when TPU nodes have a huge number of
 devices attached.

PiperOrigin-RevId: 191520116
---
 tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index eea57ed336..3ae350c7bb 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -120,7 +120,8 @@ def _query_tpu_system_metadata(master_address, run_config,
     logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
     logging.info('*** Num TPU Cores Per Worker: %d',
                  metadata.num_of_cores_per_host)
-    logging.info('*** Available Devices: %s', metadata.devices)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
   else:
     logging.info('Failed to find TPU: %s', metadata)
   return metadata
-- 
GitLab


From 41f3f5707b105bab91683edc3cdd8497a8e0c71d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 16:56:25 -0700
Subject: [PATCH 0226/1262] [TF:XLA] Add test for scalar dynamic-update-slice.

PiperOrigin-RevId: 191521429
---
 .../compiler/xla/tests/dynamic_ops_test.cc    | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index c0a16ad288..5f00c34002 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -278,6 +278,15 @@ XLA_TEST_F(DynamicSliceTest, Int32R3Pred) {
 
 class DynamicUpdateSliceTest : public ClientLibraryTestBase {
  protected:
+  template <typename IndexT, typename DataT>
+  void TestR0() {
+    // Disable algebraic simplifier, otherwise the op will be replaced by a
+    // constant.
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "algsimp");
+    RunR0<IndexT, DataT>(0, 123, {}, 123);
+  }
+
   template <typename IndexT, typename DataT>
   void TestR1() {
     // Slice at dimension start.
@@ -338,6 +347,35 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
         {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}});
   }
 
+  template <typename IndexT, typename DataT>
+  void RunR0(int input_value_int, int update_value_int,
+             const std::vector<IndexT> slice_starts, int expected_value_int) {
+    Literal input_value =
+        std::move(*Literal::CreateR0(input_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_value =
+        std::move(*Literal::CreateR0(update_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_value =
+        std::move(*Literal::CreateR0(expected_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
+    ComputationBuilder builder(client_, TestName());
+    // Initialize and transfer dynamic slice start indices parameter.
+    ComputationDataHandle starts;
+    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
+        slice_starts, 0, "slice_starts", &builder, &starts);
+    // Build dynamic slice computation.
+    auto input = builder.ConstantLiteral(input_value);
+    auto update = builder.ConstantLiteral(update_value);
+    builder.DynamicUpdateSlice(input, update, starts);
+    // Run computation and compare against expected values.
+    ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()});
+  }
+
   template <typename IndexT, typename DataT>
   void RunR1(tensorflow::gtl::ArraySlice<int> input_values_int,
              tensorflow::gtl::ArraySlice<int> update_values_int,
@@ -497,6 +535,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 };
 
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0BF16) { TestR0<int32, bfloat16>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0) { TestR0<int32, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0<int64, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0<uint64, float>(); }
+
 // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
 XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) {
   TestR1<int32, bfloat16>();
-- 
GitLab


From df2229540e9a1607193dcb8c83d5f3d7cf5d1a56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 17:08:19 -0700
Subject: [PATCH 0227/1262] [XLA] Redesign: implement and test Send, Recv.

PiperOrigin-RevId: 191523125
---
 .../xla/client/xla_client/xla_builder.cc      | 37 ++++++++++++++++++-
 .../xla/tests/client_library_test_base.cc     |  6 +++
 .../xla/tests/client_library_test_base.h      | 13 +++++++
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index fe8ae77683..2d587cc3b9 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -45,6 +45,7 @@ int64 GetUniqueId() {
 bool CanBeRoot(HloOpcode opcode) {
   switch (opcode) {
     case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
       return false;
@@ -1127,11 +1128,43 @@ XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
 }
 
 void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Send instruction produces a tuple of {aliased operand, U32 context}.
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(
+        XlaOp send,
+        AddInstruction(std::move(instr), HloOpcode::kSend, {operand}));
+
+    HloInstructionProto send_done_instr;
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeNil();
+    send_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
+                          {send});
+  });
 }
 
 XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Recv instruction produces a tuple of {receive buffer, U32 context}.
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(XlaOp recv,
+                        AddInstruction(std::move(instr), HloOpcode::kRecv, {}));
+
+    HloInstructionProto recv_done_instr;
+    *recv_done_instr.mutable_shape() = shape;
+    recv_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
+                          {recv});
+  });
 }
 
 StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 4a9faef1dc..17c6a83c1a 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -601,6 +601,12 @@ ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
       use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
 }
 
+XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
+                                                       XlaBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
 template void ClientLibraryTestBase::ComputeAndCompareLiteral(
     ComputationBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index be90f14c8e..52f31b0669 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -312,6 +312,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   // will be converted to BF16s.
   ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
                                                   ComputationBuilder* builder);
+  XlaOp CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder);
 
   // Creates a constant instruction with the given array. When the use_bfloat16
   // flag is set but the array has float elements, the elements will be
@@ -322,6 +323,12 @@ class ClientLibraryTestBase : public ::testing::Test {
     return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromArray(const Array<NativeT>& array,
+                                XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
   // Same as CreateConstantFromArray, but for scalars.
   template <typename NativeT>
   ComputationDataHandle CreateConstantFromScalar(NativeT value,
@@ -330,6 +337,12 @@ class ClientLibraryTestBase : public ::testing::Test {
                                      builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromScalar(NativeT value, XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
   // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
-- 
GitLab


From 467f195a2dd87257e3719576637774ebcf7a4590 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 17:08:57 -0700
Subject: [PATCH 0228/1262] Add max_constant_size_in_bytes parameter for
 ConstantFolding transform that sets the maximum size of each created
 constant.

PiperOrigin-RevId: 191523208
---
 .../graph_transforms/fold_constants_lib.cc    |  4 ++
 .../graph_transforms/fold_constants_test.cc   | 44 +++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 250f54e20f..85660f94a8 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -283,6 +283,10 @@ Status FoldConstants(const GraphDef& input_graph_def,
     };
   }
 
+  TF_RETURN_IF_ERROR(context.GetOneInt64Parameter(
+      "max_constant_size_in_bytes", cf_opts.max_constant_size_in_bytes,
+      &cf_opts.max_constant_size_in_bytes));
+
   // Constant folding.
   bool was_mutated;
   TF_RETURN_IF_ERROR(ConstantFold(cf_opts, nullptr, Env::Default(), nullptr,
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index 41106de008..6bfdfe43f5 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -370,6 +370,46 @@ class ConstantFoldingTest : public ::testing::Test {
     EXPECT_EQ(0, node_map.count("b"));
     EXPECT_EQ(1, node_map.count("c"));
   }
+
+  void TestMaxConstantSizeInBytes() {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    const int width = 100;
+
+    Tensor a_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&a_data, 1.0f);
+    Output a_const = ::tensorflow::ops::Const(
+        root.WithOpName("a_expect_remains"), Input::Initializer(a_data));
+
+    Tensor b_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&b_data, 1.0f);
+    Output b_const = ::tensorflow::ops::Const(
+        root.WithOpName("b_expect_remains"), Input::Initializer(b_data));
+
+    Output add = ::tensorflow::ops::Add(root.WithOpName("add_expect_remains"),
+                                        a_const, b_const);
+
+    Output placeholder = ::tensorflow::ops::Placeholder(
+        root.WithOpName("placeholder_expect_remains"), DT_FLOAT);
+
+    Output mul = ::tensorflow::ops::Mul(
+        root.WithOpName("output_expect_remains"), add, placeholder);
+
+    GraphDef graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+    Tensor placeholder_tensor(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&placeholder_tensor, 1.0f);
+
+    // Setting the maximum constant size to 10 bytes should stop the constant
+    // folding at add(a, b) that would have yielded a constant of
+    // 100*sizeof(float) bytes.
+    graph_transforms::TransformFuncContext context;
+    context.params["max_constant_size_in_bytes"] = {"10"};
+    TestConstantFolding(graph_def,
+                        {{"placeholder_expect_remains", placeholder_tensor}},
+                        {}, {"output_expect_remains"}, context);
+  }
 };
 
 TEST_F(ConstantFoldingTest, TestSimpleAdd) { TestSimpleAdd(); }
@@ -394,5 +434,9 @@ TEST_F(ConstantFoldingTest, TestRemoveUnusedNodesMultipleOutputs) {
   TestRemoveUnusedNodesMultipleOutputs();
 }
 
+TEST_F(ConstantFoldingTest, TestMaxConstantSizeInBytes) {
+  TestMaxConstantSizeInBytes();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
-- 
GitLab


From 4a20926c555a2c8da47f4138f436417bfe7db6d0 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne <reedwm@google.com>
Date: Tue, 3 Apr 2018 17:26:16 -0700
Subject: [PATCH 0229/1262] Fix Conv3D shape inference.

Before, the stride rows and columns were mixed up, causing shape inference to output the wrong shape.

PiperOrigin-RevId: 191525254
---
 tensorflow/core/framework/common_shape_fns.cc      | 4 ++--
 tensorflow/python/kernel_tests/conv_ops_3d_test.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 2fb17c2b02..72eeda7a43 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -504,8 +504,8 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
     input_shape =
         c->MakeShape({{dim('N'), dim('0'), dim('1'), dim('2'), dim('C')}});
     stride_planes = strides[2];
-    stride_cols = strides[3];
-    stride_rows = strides[4];
+    stride_rows = strides[3];
+    stride_cols = strides[4];
   } else {
     stride_planes = strides[1];
     stride_rows = strides[2];
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index ec8ac74163..f4616fd661 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -344,6 +345,8 @@ class Conv3DTest(test.TestCase):
         if data_format == "NCDHW":
           conv = test_util.NCHWToNHWC(conv)
 
+        self.assertEqual(conv.shape, tensor_shape.TensorShape(output_shape))
+
         if test_input:
           jacob_t, jacob_n = gradient_checker.compute_gradient(
               orig_input_tensor, input_shape, conv, output_shape)
-- 
GitLab


From 68a2716ba9c553b0442bb6ecca8742c0ad5d37f8 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 3 Apr 2018 17:35:54 -0700
Subject: [PATCH 0230/1262] Refactors the eager guide to be more
 researcher-friendly.

 * Shows how to build layers
 * Orders the topics as they'd be ordered in a normal model

PiperOrigin-RevId: 191526275
---
 .../docs_src/programmers_guide/eager.md       | 414 ++++++++++--------
 1 file changed, 232 insertions(+), 182 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index 414653c280..dc5b403428 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -109,9 +109,106 @@ environments and is useful for writing code to [work with graphs](#work_with_gra
 import tensorflow.contrib.eager as tfe
 ```
 
-## Updating model parameters
+## Dynamic control flow
 
-### Automatic differentiation
+A major benefit of eager execution is that all the functionality of the host
+language is available while your model is executing. So, for example,
+it is easy to write [fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz):
+
+```py
+def fizzbuzz(max_num):
+  counter = tf.constant(0)
+  for num in range(max_num):
+    num = tf.constant(num)
+    if num % 3 == 0 and num % 5 == 0:
+      print('FizzBuzz')
+    elif num % 3 == 0:
+      print('Fizz')
+    elif num % 5 == 0:
+      print('Buzz')
+    else:
+      print(num)
+    counter += 1
+  return counter
+```
+
+This has conditionals that depend on tensor values and it prints these values
+at runtime.
+
+## Build a model
+
+Many machine learning models are represented by composing layers. When
+using TensorFlow with eager execution you can either write your own layers or
+use a layer provided in the `tf.keras.layers` package.
+
+While you can use any Python object to represent a layer,
+TensorFlow has `tf.keras.layers.Layer` as a convenient base class. Inherit from
+it to implement your own layer:
+
+```py
+class MySimpleLayer(tf.keras.layers.Layer):
+  def __init__(self, output_units):
+    self.output_units = output_units
+
+  def build(self, input):
+    # The build method gets called the first time your layer is used.
+    # Creating variables on build() allows you to make their shape depend
+    # on the input shape and hence remove the need for the user to specify
+    # full shapes. It is possible to create variables during __init__() if
+    # you already know their full shapes.
+    self.kernel = self.add_variable(
+      "kernel", [input.shape[-1], self.output_units])
+
+  def call(self, input):
+    # Override call() instead of __call__ so we can perform some bookkeeping.
+    return tf.matmul(input, self.kernel)
+```
+
+Use `tf.keras.layers.Dense` layer instead  of `MySimpleLayer` above as it has
+a superset of its functionality (it can also add a bias).
+
+When composing layers into models you can use `tf.keras.Sequential` to represent
+models which are a linear stack of layers. It is easy to use for basic models:
+
+```py
+model = tf.keras.Sequential([
+  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
+  tf.keras.layers.Dense(10)
+])
+```
+
+Alternatively, organize models in classes by inheriting from `tf.keras.Model`.
+This is a container for layers that is a layer itself, allowing `tf.keras.Model`
+objects to contain other `tf.keras.Model` objects.
+
+```py
+class MNISTModel(tf.keras.Model):
+  def __init__(self):
+    super(MNISTModel, self).__init__()
+    self.dense1 = tf.keras.layers.Dense(units=10)
+    self.dense2 = tf.keras.layers.Dense(units=10)
+
+  def call(self, input):
+    """Run the model."""
+    result = self.dense1(input)
+    result = self.dense2(result)
+    result = self.dense2(result)  # reuse variables from dense2 layer
+    return result
+
+model = MNISTModel()
+```
+
+It's not required to set an input shape for the `tf.keras.Model` class since
+the parameters are set the first time input is passed to the layer.
+
+`tf.keras.layers` classes create and contain their own model variables that
+are tied to the lifetime of their layer objects. To share layer variables, share
+their objects.
+
+
+## Eager training
+
+### Computing gradients
 
 [Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
 is useful for implementing machine learning algorithms such as
@@ -215,189 +312,12 @@ for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
                             global_step=tf.train.get_or_create_global_step())
 ```
 
-#### Dynamic models
-
-`tfe.GradientTape` can also be used in dynamic models. This example for a
-[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
-algorithm looks like normal NumPy code, except there are gradients and is
-differentiable, despite the complex control flow:
-
-```py
-def line_search_step(fn, init_x, rate=1.0):
-  with tfe.GradientTape() as tape:
-    # Variables are automatically recorded, but manually watch a tensor
-    tape.watch(init_x)
-    value = fn(init_x)
-  grad, = tape.gradient(value, [init_x])
-  grad_norm = tf.reduce_sum(grad * grad)
-  init_value = value
-  while value > init_value - rate * grad_norm:
-    x = init_x - rate * grad
-    value = fn(x)
-    rate /= 2.0
-  return x, value
-```
-
-#### Additional functions to compute gradients
-
-`tfe.GradientTape` is a powerful interface for computing gradients, but there
-is another [Autograd](https://github.com/HIPS/autograd)-style API available for
-automatic differentiation. These functions are useful if writing math code with
-only tensors and gradient functions, and without `tfe.Variables`:
-
-* `tfe.gradients_function` —Returns a function that computes the derivatives
-  of its input function parameter with respect to its arguments. The input
-  function parameter must return a scalar value. When the returned function is
-  invoked, it returns a list of `tf.Tensor` objects: one element for each
-  argument of the input function. Since anything of interest must be passed as a
-  function parameter, this becomes unwieldy if there's a dependency on many
-  trainable parameters.
-* `tfe.value_and_gradients_function` —Similar to
-  `tfe.gradients_function`, but when the returned function is invoked, it
-  returns the value from the input function in addition to the list of
-  derivatives of the input function with respect to its arguments.
-
-In the following example, `tfe.gradients_function` takes the `square`
-function as an argument and returns a function that computes the partial
-derivatives of `square` with respect to its inputs. To calculate the derivative
-of `square` at `3`, `grad(3.0)` returns `6`.
-
-```py
-def square(x):
-  return tf.multiply(x, x)
-
-grad = tfe.gradients_function(square)
-
-square(3.)  # => 9.0
-grad(3.)    # => [6.0]
-
-# The second-order derivative of square:
-gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
-gradgrad(3.)  # => [2.0]
-
-# The third-order derivative is None:
-gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
-gradgradgrad(3.)  # => [None]
-
-
-# With flow control:
-def abs(x):
-  return x if x > 0. else -x
-
-grad = tfe.gradients_function(abs)
-
-grad(3.)   # => [1.0]
-grad(-3.)  # => [-1.0]
-```
-
-### Custom gradients
-
-Custom gradients are an easy way to override gradients in eager and graph
-execution. Within the forward function, define the gradient with respect to the
-inputs, outputs, or intermediate results. For example, here's an easy way to clip
-the norm of the gradients in the backward pass:
-
-```py
-@tf.custom_gradient
-def clip_gradient_by_norm(x, norm):
-  y = tf.identity(x)
-  def grad_fn(dresult):
-    return [tf.clip_by_norm(dresult, norm), None]
-  return y, grad_fn
-```
-
-Custom gradients are commonly used to provide a numerically stable gradient for a
-sequence of operations:
-
-```py
-def log1pexp(x):
-  return tf.log(1 + tf.exp(x))
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# The gradient computation works fine at x = 0.
-grad_log1pexp(0.)  # => [0.5]
-
-# However, x = 100 fails because of numerical instability.
-grad_log1pexp(100.)  # => [nan]
-```
-
-Here, the `log1pexp` function can be analytically simplified with a custom
-gradient. The implementation below reuses the value for `tf.exp(x)` that is
-computed during the forward pass—making it more efficient by eliminating
-redundant calculations:
-
-```py
-@tf.custom_gradient
-def log1pexp(x):
-  e = tf.exp(x)
-  def grad(dy):
-    return dy * (1 - 1 / (1 + e))
-  return tf.log(1 + e), grad
-
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# As before, the gradient computation works fine at x = 0.
-grad_log1pexp(0.)  # => [0.5]
-
-# And the gradient computation also works at x = 100.
-grad_log1pexp(100.)  # => [1.0]
-```
-
-
-## Build and train models
-
-There are many parameters to optimize when calculating derivatives. TensorFlow
-code is easier to read when structured into reusable classes and objects instead
-of a single top-level function. Eager execution encourages the use of the
-Keras-style layer classes in the `tf.keras.layers` module. Additionally, the
-`tf.train.Optimizer` classes provide sophisticated techniques to calculate
-parameter updates.
 
 The following example creates a multi-layer model that classifies the standard
 [MNIST handwritten digits](https://www.tensorflow.org/tutorials/layers). It
 demonstrates the optimizer and layer APIs to build trainable graphs in an eager
 execution environment.
 
-### Build a model
-
-The `tf.keras.Sequential` model is a linear stack of layers. It is easy to
-use for basic models:
-
-```py
-model = tf.keras.Sequential([
-  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
-  tf.keras.layers.Dense(10)
-])
-```
-
-Alternatively, organize models in classes by inheriting from `tf.keras.Model`.
-This is a container for layers that is a layer itself, allowing `tf.keras.Model`
-objects to contain other `tf.keras.Model` objects.
-
-```py
-class MNISTModel(tf.keras.Model):
-  def __init__(self):
-    super(MNISTModel, self).__init__()
-    self.dense1 = tf.keras.layers.Dense(units=10)
-    self.dense2 = tf.keras.layers.Dense(units=10)
-
-  def call(self, input):
-    """Run the model."""
-    result = self.dense1(input)
-    result = self.dense2(result)
-    result = self.dense2(result)  # reuse variables from dense2 layer
-    return result
-
-model = MNISTModel()
-```
-
-It's not required to set an input shape for the `tf.keras.Model` class since
-the parameters are set the first time input is passed to the layer.
-
-`tf.keras.layers` classes create and contain their own model variables that
-are tied to the lifetime of their layer objects. To share layer variables, share
-their objects.
-
 ### Train a model
 
 Even without training, call the model and inspect the output in eager execution:
@@ -661,11 +581,141 @@ for _ in range(iterations):
      ...
 ```
 
+## Advanced automatic differentiation topics
+
+### Dynamic models
+
+`tfe.GradientTape` can also be used in dynamic models. This example for a
+[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
+algorithm looks like normal NumPy code, except there are gradients and is
+differentiable, despite the complex control flow:
+
+```py
+def line_search_step(fn, init_x, rate=1.0):
+  with tfe.GradientTape() as tape:
+    # Variables are automatically recorded, but manually watch a tensor
+    tape.watch(init_x)
+    value = fn(init_x)
+  grad, = tape.gradient(value, [init_x])
+  grad_norm = tf.reduce_sum(grad * grad)
+  init_value = value
+  while value > init_value - rate * grad_norm:
+    x = init_x - rate * grad
+    value = fn(x)
+    rate /= 2.0
+  return x, value
+```
+
+### Additional functions to compute gradients
+
+`tfe.GradientTape` is a powerful interface for computing gradients, but there
+is another [Autograd](https://github.com/HIPS/autograd)-style API available for
+automatic differentiation. These functions are useful if writing math code with
+only tensors and gradient functions, and without `tfe.Variables`:
+
+* `tfe.gradients_function` —Returns a function that computes the derivatives
+  of its input function parameter with respect to its arguments. The input
+  function parameter must return a scalar value. When the returned function is
+  invoked, it returns a list of `tf.Tensor` objects: one element for each
+  argument of the input function. Since anything of interest must be passed as a
+  function parameter, this becomes unwieldy if there's a dependency on many
+  trainable parameters.
+* `tfe.value_and_gradients_function` —Similar to
+  `tfe.gradients_function`, but when the returned function is invoked, it
+  returns the value from the input function in addition to the list of
+  derivatives of the input function with respect to its arguments.
+
+In the following example, `tfe.gradients_function` takes the `square`
+function as an argument and returns a function that computes the partial
+derivatives of `square` with respect to its inputs. To calculate the derivative
+of `square` at `3`, `grad(3.0)` returns `6`.
+
+```py
+def square(x):
+  return tf.multiply(x, x)
+
+grad = tfe.gradients_function(square)
+
+square(3.)  # => 9.0
+grad(3.)    # => [6.0]
+
+# The second-order derivative of square:
+gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
+gradgrad(3.)  # => [2.0]
+
+# The third-order derivative is None:
+gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
+gradgradgrad(3.)  # => [None]
+
+
+# With flow control:
+def abs(x):
+  return x if x > 0. else -x
+
+grad = tfe.gradients_function(abs)
+
+grad(3.)   # => [1.0]
+grad(-3.)  # => [-1.0]
+```
+
+### Custom gradients
+
+Custom gradients are an easy way to override gradients in eager and graph
+execution. Within the forward function, define the gradient with respect to the
+inputs, outputs, or intermediate results. For example, here's an easy way to clip
+the norm of the gradients in the backward pass:
+
+```py
+@tf.custom_gradient
+def clip_gradient_by_norm(x, norm):
+  y = tf.identity(x)
+  def grad_fn(dresult):
+    return [tf.clip_by_norm(dresult, norm), None]
+  return y, grad_fn
+```
+
+Custom gradients are commonly used to provide a numerically stable gradient for a
+sequence of operations:
+
+```py
+def log1pexp(x):
+  return tf.log(1 + tf.exp(x))
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# The gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# However, x = 100 fails because of numerical instability.
+grad_log1pexp(100.)  # => [nan]
+```
+
+Here, the `log1pexp` function can be analytically simplified with a custom
+gradient. The implementation below reuses the value for `tf.exp(x)` that is
+computed during the forward pass—making it more efficient by eliminating
+redundant calculations:
+
+```py
+@tf.custom_gradient
+def log1pexp(x):
+  e = tf.exp(x)
+  def grad(dy):
+    return dy * (1 - 1 / (1 + e))
+  return tf.log(1 + e), grad
+
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# As before, the gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# And the gradient computation also works at x = 100.
+grad_log1pexp(100.)  # => [1.0]
+```
+
 ## Performance
 
-Computation is not automatically offloaded to GPUs during eager execution. To
-explicitly direct a computation to a GPU, enclose it in a
-`tf.device('/gpu:0')` block:
+Computation is automatically offloaded to GPUs during eager execution. If you
+want control over where a computation runs you can enclose it in a
+`tf.device('/gpu:0')` block (or the CPU equivalent):
 
 ```py
 import time
-- 
GitLab


From e61eb9b62a6b7b3a00cccd143499f91e7255d7fd Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 3 Apr 2018 17:36:10 -0700
Subject: [PATCH 0231/1262] Add testcase for convert.

PiperOrigin-RevId: 191526297
---
 tensorflow/compiler/xla/tests/convert_test.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 8718fa5066..0842a8918b 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -241,6 +241,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) {
                          1.99f,
                          2.0f,
                          2.01f,
+                         2147483648.f,
                          -0.5f,
                          -0.99f,
                          -1.0f,
-- 
GitLab


From 5b652f57709d30d883570a82ac500051d8bfe1e6 Mon Sep 17 00:00:00 2001
From: Tony Wang <tonywy@google.com>
Date: Tue, 3 Apr 2018 17:45:18 -0700
Subject: [PATCH 0232/1262] [TF:XLA] Add INTEL_MKL_ML MatMul method to XLA/CPU
 backend

The INTEL GEMM API provides 32-bit and 64-bit MatMul. With INTEL_MKL flag set,
XLA backend emits runtime call to INTEL GEMM MatMul instead of Eigen.

PiperOrigin-RevId: 191527251
---
 .../xla/legacy_flags/debug_options_flags.cc   |   7 +
 tensorflow/compiler/xla/service/cpu/BUILD     |  25 ++++
 .../compiler/xla/service/cpu/cpu_runtime.cc   |   8 ++
 .../compiler/xla/service/cpu/cpu_runtime.h    |   4 +
 .../xla/service/cpu/cpu_runtime_test.cc       |  82 ++++++++++-
 .../xla/service/cpu/dot_op_emitter.cc         |  23 ++--
 .../xla/service/cpu/runtime_matmul_mkl.cc     | 129 ++++++++++++++++++
 .../xla/service/cpu/runtime_matmul_mkl.h      |  80 +++++++++++
 .../xla/service/cpu/simple_orc_jit.cc         |   5 +
 tensorflow/compiler/xla/xla.proto             |   3 +
 10 files changed, 351 insertions(+), 15 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
 create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h

diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index c8ed3e3a2b..f037663e3f 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,6 +40,9 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
+#ifdef INTEL_MKL
+  flags->set_xla_cpu_use_mkl_dnn(true);
+#endif  // INTEL_MKL
 
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
@@ -288,6 +291,10 @@ void AllocateFlags() {
           flag_values->xla_gpu_use_cudnn_batchnorm(),
           "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
           "rather than expanding them to a soup of HLOs."),
+      tensorflow::Flag("xla_cpu_use_mkl_dnn",
+                       bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
+                       flag_values->xla_cpu_use_mkl_dnn(),
+                       "Generate calls to MKL-DNN in the CPU backend."),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 966e2d0fc5..d22c135249 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -18,6 +18,10 @@ load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -170,6 +174,7 @@ cc_library(
         ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
@@ -534,10 +539,28 @@ cc_library(
         ":runtime_matvec",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
 
+cc_library(
+    name = "runtime_matmul_mkl",
+    srcs = ["runtime_matmul_mkl.cc"],
+    hdrs = ["runtime_matmul_mkl.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
+    ]),
+)
+
 cc_library(
     name = "runtime_single_threaded_conv2d",
     srcs = [
@@ -584,10 +607,12 @@ cc_library(
 tf_cc_test(
     name = "cpu_runtime_test",
     srcs = ["cpu_runtime_test.cc"],
+    shard_count = 10,
     tags = ["optonly"],
     deps = [
         ":cpu_runtime",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_matmul",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 9a3bd68c80..872b0be1f8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -37,6 +37,14 @@ extern const char* const kEigenMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenMatMulF32";
 extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
+extern const char* const kMKLMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF32";
+extern const char* const kMKLMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF64";
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF32";
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF64";
 extern const char* const kEigenConvF16SymbolName =
     "__xla_cpu_runtime_EigenConvF16";
 extern const char* const kEigenConvF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index e61d6ea28b..e392e231b4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -44,6 +44,10 @@ namespace runtime {
 extern const char* const kEigenMatMulF16SymbolName;
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
+extern const char* const kMKLMatMulF32SymbolName;
+extern const char* const kMKLMatMulF64SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f385829cdf..9e04307295 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
@@ -130,21 +131,19 @@ MatMulShape MatMulShapes[] = {
 // * transpose_lhs
 // * transpose_rhs
 // * single_threaded
-using EigenMatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
+using MatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
 
-class EigenMatMulTest
-    : public CpuRuntimeTest,
-      public ::testing::WithParamInterface<EigenMatMulTestParam> {
+class EigenMatMulTest : public CpuRuntimeTest,
+                        public ::testing::WithParamInterface<MatMulTestParam> {
  public:
-  static string Name(
-      const ::testing::TestParamInfo<EigenMatMulTestParam>& info) {
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
     MatMulShape shape = std::get<0>(info.param);
     bool transpose_lhs = std::get<1>(info.param);
     bool transpose_rhs = std::get<2>(info.param);
     bool single_threaded = std::get<3>(info.param);
 
     return tensorflow::strings::Printf(
-        "MatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
         transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
         single_threaded ? "single" : "multi");
   }
@@ -169,5 +168,74 @@ INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
                                            ::testing::Bool()),
                         EigenMatMulTest::Name);
 
+#ifdef INTEL_MKL
+class MKLMatMulTest : public CpuRuntimeTest,
+                      public ::testing::WithParamInterface<MatMulTestParam> {
+ public:
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
+    MatMulShape shape = std::get<0>(info.param);
+    bool transpose_lhs = std::get<1>(info.param);
+    bool transpose_rhs = std::get<2>(info.param);
+    bool single_threaded = std::get<3>(info.param);
+
+    return tensorflow::strings::Printf(
+        "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
+        single_threaded ? "single" : "multi");
+  }
+};
+
+std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
+                                                  const Array2D<float>& b,
+                                                  bool transpose_lhs,
+                                                  bool transpose_rhs,
+                                                  bool single_threaded) {
+  CHECK_EQ(a.width(), b.height());
+  int64 m = a.height();
+  int64 n = b.width();
+  int64 k = a.width();
+
+  // The MKL matmul runtime function expects the matrix to be in column major
+  // order and array2d is in row-major order. Create transposes of a and b. The
+  // 'data' buffer in the transposed array is the original array in column major
+  // order.
+  auto a_transpose = MaybeTransposeArray2D(a, !transpose_lhs);
+  auto b_transpose = MaybeTransposeArray2D(b, !transpose_rhs);
+
+  // Since we're going to transpose c before returning it, swap the order of the
+  // dimension sizes to ensure the returned array is properly dimensioned.
+  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  if (single_threaded) {
+    __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+        nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
+        m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    __xla_cpu_runtime_MKLMatMulF32(nullptr, c_transpose->data(),
+                                   a_transpose->data(), b_transpose->data(), m,
+                                   n, k, transpose_lhs, transpose_rhs);
+  }
+  return MaybeTransposeArray2D(*c_transpose, true);
+}
+
+TEST_P(MKLMatMulTest, DoIt) {
+  MatMulShape shape = std::get<0>(GetParam());
+  bool transpose_lhs = std::get<1>(GetParam());
+  bool transpose_rhs = std::get<2>(GetParam());
+  bool single_threaded = std::get<3>(GetParam());
+
+  auto a = MakeLinspaceArray2D(0.0, 1.0, shape.m, shape.k);
+  auto b = MakeLinspaceArray2D(-2.0, 2.0, shape.k, shape.n);
+  auto c =
+      MKLMatrixMultiply(*a, *b, transpose_lhs, transpose_rhs, single_threaded);
+  CheckMatrixMultiply(*a, *b, *c);
+}
+
+INSTANTIATE_TEST_CASE_P(MKLMatMulTestInstantiaion, MKLMatMulTest,
+                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        MKLMatMulTest::Name);
+#endif  // INTEL_MKL
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 8b1e20d79e..29afd8ea5f 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -918,28 +918,35 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded_eigen =
+  bool multi_threaded =
       hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
     case F16:
-      fn_name = multi_threaded_eigen
+      fn_name = multi_threaded
                     ? runtime::kEigenMatMulF16SymbolName
                     : runtime::kEigenSingleThreadedMatMulF16SymbolName;
       float_type = ir_builder_->getHalfTy();
       break;
     case F32:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF32SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF32SymbolName
+                                   : runtime::kEigenMatMulF32SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF32SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF32SymbolName);
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF64SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF64SymbolName
+                                   : runtime::kEigenMatMulF64SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF64SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF64SymbolName);
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
new file mode 100644
index 0000000000..729a4e7f5b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+#include "third_party/intel_mkl_ml/include/mkl_service.h"
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+#define EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+namespace {
+// BLAS GEMM API for 32-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF32(const void* run_options_ptr, float* out, float* lhs, float* rhs,
+               int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For column-major matrices, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_sgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+// BLAS GEMM API for 64-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF64(const void* run_options_ptr, double* out, double* lhs,
+               double* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For a column-major matrix, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_dgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+}  // namespace
+
+void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out,
+                                    float* lhs, float* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+// BLAS GEMM API for 64-bit Matrix Multiplication
+void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out,
+                                    double* lhs, double* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr,
+                                                  float* out, float* lhs,
+                                                  float* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr,
+                                                  double* out, double* lhs,
+                                                  double* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+#endif  // INTEL_MKL
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
new file mode 100644
index 0000000000..9dbc506c08
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#ifdef INTEL_MKL
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
+#else
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
+                "INTEL_MKL. Add --config=mkl to build with MKL.";
+}
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
+                "INTEL_MKL. Add --config=mkl to build with MKL.";
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
+                "INTEL_MKL. Add --config=mkl to build with MKL.";
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
+                "INTEL_MKL. Add --config=mkl to build with MKL.";
+}
+
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 4198260a22..b7ce5bbe47 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
@@ -183,6 +184,10 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 5cb18113e5..f9943f71d3 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -189,6 +189,9 @@ message DebugOptions {
   // directory.
   string xla_dump_per_pass_hlo_proto_to = 96;
 
+  // Generate calls to MKL-DNN in the CPU backend.
+  bool xla_cpu_use_mkl_dnn = 97;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
-- 
GitLab


From bde05fdf247ea6311414677f55260f7e8085718f Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 3 Apr 2018 17:52:53 -0700
Subject: [PATCH 0233/1262] Fix a shape inference bug.

PiperOrigin-RevId: 191528009
---
 .../core/common_runtime/shape_refiner.cc      |  5 ++
 tensorflow/python/BUILD                       |  1 +
 .../python/grappler/tf_optimizer_test.py      | 47 ++++++++++++++++++-
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index cef50be3b1..1b7e3138ee 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -351,6 +351,11 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
         }
       }
     }
+    if (node_context->requested_input_tensor_as_partial_shape(dst_input)) {
+      // The input value may have changed. Since we have no way to know if
+      // that's indeed the case, err on the safe side.
+      *refined = true;
+    }
 
     // Also propagate handle shape and dtype of edges which are carrying
     // resource handles.
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index b5f4387efd..57b0b78c82 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4782,6 +4782,7 @@ py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        ":tf_item",
         ":tf_optimizer",
         "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index 3ee4d7807e..1c0f072dd3 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,12 +17,16 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.grappler import item as gitem
 from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -74,6 +78,47 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertEqual(a2.op.name, optimized_graph.node[3].name)
     self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
 
+  def testLoops(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      def _Cond(_, counter):
+        return counter < end
+
+      def _Body(buf, counter):
+        buf = array_ops.concat([buf, [counter]], 0)
+        counter += 1
+        return [buf, counter]
+
+      start = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      end = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      init_buf = array_ops.zeros(shape=[0], dtype=dtypes.int32)
+      loop_vars = [init_buf, start]
+      shape_inv = [
+          tensor_shape.TensorShape([None]),
+          tensor_shape.TensorShape([])
+      ]
+      buf, _ = control_flow_ops.while_loop(_Cond, _Body, loop_vars, shape_inv)
+
+      f = -array_ops.ones_like(buf, optimize=False)
+      buf_shape = array_ops.shape(buf)
+      f_shape = array_ops.shape(f)
+      ops.add_to_collection('train_op', buf_shape)
+      ops.add_to_collection('train_op', f_shape)
+
+    # Optimize the graph.
+    mg = meta_graph.create_meta_graph_def(graph=g)
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    mg.graph_def.CopyFrom(optimized_graph)
+
+    # Check that the nodes referenced in various collections have been preserved
+    item = gitem.Item(mg)
+    props = item.GetOpProperties()
+    buf_prop = props[buf.op.name]
+    f_prop = props[f.op.name]
+    self.assertEqual(buf_prop, f_prop)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From f5f7fdb28dd798c6e6b39ab5e55fd33a63ae8733 Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Tue, 3 Apr 2018 17:56:14 -0700
Subject: [PATCH 0234/1262] Replace trivial backend calls with calls to
 underlying TensorFlow functions - Part 1

PiperOrigin-RevId: 191528321
---
 .../python/keras/_impl/keras/activations.py   |  3 +-
 .../keras/applications/imagenet_utils.py      |  6 +-
 .../python/keras/_impl/keras/constraints.py   | 12 ++-
 .../keras/_impl/keras/engine/topology_test.py |  2 +-
 .../_impl/keras/engine/training_utils.py      |  5 +-
 .../keras/layers/advanced_activations.py      |  3 +-
 .../keras/layers/convolutional_recurrent.py   | 14 +--
 .../python/keras/_impl/keras/layers/core.py   | 12 ++-
 .../keras/_impl/keras/layers/embeddings.py    |  6 +-
 .../python/keras/_impl/keras/layers/merge.py  |  9 +-
 .../python/keras/_impl/keras/layers/noise.py  | 10 +-
 .../keras/_impl/keras/layers/recurrent.py     | 30 +++---
 .../_impl/keras/layers/recurrent_test.py      | 15 +--
 .../keras/_impl/keras/layers/wrappers.py      |  3 +-
 tensorflow/python/keras/_impl/keras/losses.py |  7 +-
 .../python/keras/_impl/keras/metrics.py       | 12 ++-
 .../python/keras/_impl/keras/metrics_test.py  | 18 ++--
 .../python/keras/_impl/keras/optimizers.py    | 99 ++++++++++---------
 .../python/keras/_impl/keras/regularizers.py  |  5 +-
 .../keras/_impl/keras/utils/layer_utils.py    |  5 +-
 20 files changed, 154 insertions(+), 122 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 236e17653e..74ec373ea5 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.layers.base import Layer
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -46,7 +47,7 @@ def softmax(x, axis=-1):
     return K.softmax(x)
   elif ndim > 2:
     e = K.exp(x - K.max(x, axis=axis, keepdims=True))
-    s = K.sum(e, axis=axis, keepdims=True)
+    s = math_ops.reduce_sum(e, axis=axis, keepdims=True)
     return e / s
   else:
     raise ValueError('Cannot apply softmax to a tensor that is 1D')
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index c26a28ed40..d928a7afdc 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -22,8 +22,10 @@ import json
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -151,11 +153,11 @@ def _preprocess_symbolic_input(x, data_format, mode):
     std = None
 
   if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = K.constant(-np.array(mean))
+    _IMAGENET_MEAN = constant_op.constant(-np.array(mean), dtype=K.floatx())
 
   # Zero-center by mean pixel
   if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
-    x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
+    x = K.bias_add(x, math_ops.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
     x = K.bias_add(x, _IMAGENET_MEAN, data_format)
   if std is not None:
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index 271fbbb63d..aac4d0f1e9 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -24,6 +24,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -65,7 +66,8 @@ class MaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(K.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
     return w * (desired / (K.epsilon() + norms))
 
@@ -79,7 +81,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    return w * K.cast(K.greater_equal(w, 0.), K.floatx())
+    return w * math_ops.cast(K.greater_equal(w, 0.), K.floatx())
 
 
 @tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
@@ -105,7 +107,8 @@ class UnitNorm(Constraint):
 
   def __call__(self, w):
     return w / (
-        K.epsilon() + K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True)))
+        K.epsilon() + K.sqrt(
+            math_ops.reduce_sum(K.square(w), axis=self.axis, keepdims=True)))
 
   def get_config(self):
     return {'axis': self.axis}
@@ -148,7 +151,8 @@ class MinMaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(K.square(w), axis=self.axis, keepdims=True))
     desired = (
         self.rate * K.clip(norms, self.min_value, self.max_value) +
         (1 - self.rate) * norms)
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index b50277c8ff..9ab4b6fdcf 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -783,7 +783,7 @@ class TopologyConstructionTest(test.TestCase):
   def test_activity_regularization_with_model_composition(self):
 
     def reg(x):
-      return keras.backend.sum(x)
+      return math_ops.reduce_sum(x)
 
     net_a_input = keras.Input((2,))
     net_a = net_a_input
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 76537b735f..58d2c78aad 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -26,6 +26,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.ops import math_ops
 
 
 def check_num_samples(ins,
@@ -436,7 +437,7 @@ def weighted_masked_objective(fn):
     score_array = fn(y_true, y_pred)
     if mask is not None:
       # Cast the mask to floatX to avoid float64 upcasting in theano
-      mask = K.cast(mask, K.floatx())
+      mask = math_ops.cast(mask, K.floatx())
       # mask should have the same shape as score_array
       score_array *= mask
       #  the loss per batch should be proportional
@@ -450,7 +451,7 @@ def weighted_masked_objective(fn):
       weight_ndim = K.ndim(weights)
       score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
       score_array *= weights
-      score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
+      score_array /= K.mean(math_ops.cast(K.not_equal(weights, 0), K.floatx()))
     return K.mean(score_array)
 
   return weighted
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index c40ee109aa..45b0c6c91a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -232,7 +233,7 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
+    return inputs * math_ops.cast(K.greater(inputs, self.theta), K.floatx())
 
   def get_config(self):
     config = {'theta': float(self.theta)}
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index d95a094245..b78962d66a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -29,6 +29,8 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -438,9 +440,9 @@ class ConvLSTM2D(ConvRecurrent2D):
 
   def get_initial_state(self, inputs):
     # (samples, timesteps, rows, cols, filters)
-    initial_state = K.zeros_like(inputs)
+    initial_state = array_ops.zeros_like(inputs)
     # (samples, rows, cols, filters)
-    initial_state = K.sum(initial_state, axis=1)
+    initial_state = math_ops.reduce_sum(initial_state, axis=1)
     shape = list(self.kernel_shape)
     shape[-1] = self.filters
     initial_state = self.input_conv(
@@ -483,8 +485,8 @@ class ConvLSTM2D(ConvRecurrent2D):
   def get_constants(self, inputs, training=None):
     constants = []
     if self.implementation == 0 and 0 < self.dropout < 1:
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
+      ones = array_ops.zeros_like(inputs)
+      ones = math_ops.reduce_sum(ones, axis=1)
       ones += 1
 
       def dropped_inputs():
@@ -501,8 +503,8 @@ class ConvLSTM2D(ConvRecurrent2D):
     if 0 < self.recurrent_dropout < 1:
       shape = list(self.kernel_shape)
       shape[-1] = self.filters
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
+      ones = array_ops.zeros_like(inputs)
+      ones = math_ops.reduce_sum(ones, axis=1)
       ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
       ones += 1.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 73e4f15f7e..a709a079fd 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -37,6 +37,8 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
 from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -80,7 +82,7 @@ class Masking(Layer):
   def call(self, inputs):
     boolean_mask = K.any(
         K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
-    return inputs * K.cast(boolean_mask, inputs.dtype)
+    return inputs * math_ops.cast(boolean_mask, inputs.dtype)
 
   def compute_output_shape(self, input_shape):
     return input_shape
@@ -170,7 +172,7 @@ class SpatialDropout1D(Dropout):
     self.input_spec = InputSpec(ndim=3)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     noise_shape = (input_shape[0], 1, input_shape[2])
     return noise_shape
 
@@ -222,7 +224,7 @@ class SpatialDropout2D(Dropout):
     self.input_spec = InputSpec(ndim=4)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1)
     elif self.data_format == 'channels_last':
@@ -275,7 +277,7 @@ class SpatialDropout3D(Dropout):
     self.input_spec = InputSpec(ndim=5)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1, 1)
     elif self.data_format == 'channels_last':
@@ -414,7 +416,7 @@ class Reshape(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.reshape(inputs, (K.shape(inputs)[0],) + self.target_shape)
+    return K.reshape(inputs, (array_ops.shape(inputs)[0],) + self.target_shape)
 
   def get_config(self):
     config = {'target_shape': self.target_shape}
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 006ecd3135..a0fd7a9637 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,6 +24,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -152,8 +154,8 @@ class Embedding(Layer):
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
-      inputs = K.cast(inputs, 'int32')
-    out = K.gather(self.embeddings, inputs)
+      inputs = math_ops.cast(inputs, 'int32')
+    out = array_ops.gather(self.embeddings, inputs)
     return out
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index c660cbd449..6290db29a7 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -137,7 +138,7 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           if x_ndim is None:
-            x_shape = K.shape(x)
+            x_shape = array_ops.shape(x)
             batch_size = x_shape[0]
             new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
             x_transposed = K.reshape(x,
@@ -159,8 +160,8 @@ class _Merge(Layer):
         if transposed:
           # If inputs have been transposed, we have to transpose the output too.
           if y_ndim is None:
-            y_shape = K.shape(y)
-            y_ndim = K.shape(y_shape)[0]
+            y_shape = array_ops.shape(y)
+            y_ndim = array_ops.shape(y_shape)[0]
             batch_size = y_shape[y_ndim - 1]
             new_shape = K.concatenate(
                 [K.expand_dims(batch_size), y_shape[:y_ndim - 1]])
@@ -418,7 +419,7 @@ class Concatenate(_Merge):
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
         # Input is unmasked. Append all 1s to masks,
-        masks.append(K.ones_like(input_i, dtype='bool'))
+        masks.append(array_ops.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
         masks.append(K.expand_dims(mask_i))
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index e309d160e5..4366b654f2 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -23,6 +23,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -58,7 +60,7 @@ class GaussianNoise(Layer):
 
     def noised():
       return inputs + K.random_normal(
-          shape=K.shape(inputs), mean=0., stddev=self.stddev)
+          shape=array_ops.shape(inputs), mean=0., stddev=self.stddev)
 
     return K.in_train_phase(noised, inputs, training=training)
 
@@ -104,7 +106,7 @@ class GaussianDropout(Layer):
       def noised():
         stddev = np.sqrt(self.rate / (1.0 - self.rate))
         return inputs * K.random_normal(
-            shape=K.shape(inputs), mean=1.0, stddev=stddev)
+            shape=array_ops.shape(inputs), mean=1.0, stddev=stddev)
 
       return K.in_train_phase(noised, inputs, training=training)
     return inputs
@@ -153,7 +155,7 @@ class AlphaDropout(Layer):
     self.supports_masking = True
 
   def _get_noise_shape(self, inputs):
-    return self.noise_shape if self.noise_shape else K.shape(inputs)
+    return self.noise_shape if self.noise_shape else array_ops.shape(inputs)
 
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
@@ -166,7 +168,7 @@ class AlphaDropout(Layer):
 
         kept_idx = K.greater_equal(
             K.random_uniform(noise_shape, seed=seed), rate)
-        kept_idx = K.cast(kept_idx, K.floatx())
+        kept_idx = math_ops.cast(kept_idx, K.floatx())
 
         # Get affine transformation params
         a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 791f9b3113..bd7c42e63e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -33,6 +33,9 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -503,8 +506,10 @@ class RNN(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
     initial_state = K.expand_dims(initial_state)  # (samples, 1)
     if hasattr(self.cell.state_size, '__len__'):
       return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
@@ -631,7 +636,7 @@ class RNN(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     if self.return_sequences:
@@ -907,8 +912,7 @@ class SimpleRNNCell(Layer):
     prev_output = states[0]
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training)
     if (0 < self.recurrent_dropout < 1 and
@@ -1309,8 +1313,7 @@ class GRUCell(Layer):
 
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=3)
@@ -1793,8 +1796,7 @@ class LSTMCell(Layer):
   def call(self, inputs, states, training=None):
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=4)
@@ -2176,7 +2178,7 @@ class LSTM(RNN):
 
 
 def _generate_dropout_ones(inputs, dims):
-  return K.ones((K.shape(inputs)[0], dims))
+  return K.ones((array_ops.shape(inputs)[0], dims))
 
 
 def _generate_dropout_mask(ones, rate, training=None, count=1):
@@ -2351,8 +2353,10 @@ class Recurrent(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
     initial_state = K.expand_dims(initial_state)  # (samples, 1)
     initial_state = K.tile(initial_state, [1,
                                            self.units])  # (samples, output_dim)
@@ -2456,7 +2460,7 @@ class Recurrent(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     # Properly set learning phase
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index de022153f6..fb743b617f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -24,6 +24,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -395,8 +398,8 @@ class RNNTest(test.TestCase):
 
     # Test `get_losses_for` and `losses`
     x = keras.Input((None, 1))
-    loss_1 = keras.backend.sum(x)
-    loss_2 = keras.backend.sum(cells[0].kernel)
+    loss_1 = math_ops.reduce_sum(x)
+    loss_2 = math_ops.reduce_sum(cells[0].kernel)
     cells[0].add_loss(loss_1, inputs=x)
     cells[0].add_loss(loss_2)
     self.assertEqual(len(layer.losses), 2)
@@ -410,10 +413,10 @@ class RNNTest(test.TestCase):
     layer.build((None, None, 1))
 
     x = keras.Input((None, 1))
-    update_1 = keras.backend.update_add(
-        cells[0].kernel, x[0, 0, 0] * cells[0].kernel)
-    update_2 = keras.backend.update_add(
-        cells[0].kernel, keras.backend.ones_like(cells[0].kernel))
+    update_1 = state_ops.assign_add(cells[0].kernel,
+                                    x[0, 0, 0] * cells[0].kernel)
+    update_2 = state_ops.assign_add(cells[0].kernel,
+                                    array_ops.ones_like(cells[0].kernel))
     cells[0].add_update(update_1, inputs=x)
     cells[0].add_update(update_2)
     self.assertEqual(len(layer.updates), 2)
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 76ddd9299d..12f33614e2 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -28,6 +28,7 @@ from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import utils as tf_layers_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -209,7 +210,7 @@ class TimeDistributed(Wrapper):
       # We can go with reshape-based implementation for performance.
       input_length = input_shape[1]
       if not input_length:
-        input_length = K.shape(inputs)[1]
+        input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
       input_uid = tf_layers_util.object_list_uid(inputs)
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 1576ed7b99..859bda0c9d 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -24,6 +24,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -66,7 +67,7 @@ def hinge(y_true, y_pred):
 
 @tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
-  pos = K.sum(y_true * y_pred, axis=-1)
+  pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
   neg = K.max((1. - y_true) * y_pred, axis=-1)
   return K.maximum(0., neg - pos + 1.)
 
@@ -117,7 +118,7 @@ def binary_crossentropy(y_true, y_pred):
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
-  return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
+  return math_ops.reduce_sum(y_true * K.log(y_true / y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.poisson', 'keras.losses.poisson')
@@ -129,7 +130,7 @@ def poisson(y_true, y_pred):
 def cosine_proximity(y_true, y_pred):
   y_true = K.l2_normalize(y_true, axis=-1)
   y_pred = K.l2_normalize(y_pred, axis=-1)
-  return -K.sum(y_true * y_pred, axis=-1)
+  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
 # Aliases.
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 82778a3dc4..24192cf5a1 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -37,6 +37,7 @@ from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crosse
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -47,15 +48,15 @@ def binary_accuracy(y_true, y_pred):
 
 @tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
-  return K.cast(
+  return math_ops.cast(
       K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
 
 
 def sparse_categorical_accuracy(y_true, y_pred):
-  return K.cast(
+  return math_ops.cast(
       K.equal(
-          K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1),
-                                         K.floatx())), K.floatx())
+          K.max(y_true, axis=-1),
+          math_ops.cast(K.argmax(y_pred, axis=-1), K.floatx())), K.floatx())
 
 
 @tf_export('keras.metrics.top_k_categorical_accuracy')
@@ -66,7 +67,8 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
 @tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(
-      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
+      K.in_top_k(y_pred, math_ops.cast(K.max(y_true, axis=-1), 'int32'), k),
+      axis=-1)
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py
index 44289ea02a..2b73e0c16f 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/_impl/keras/metrics_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -104,16 +106,16 @@ class KerasMetricsTest(test.TestCase):
             The total number of true positives seen this epoch at the
                 completion of the batch.
         """
-        y_true = keras.backend.cast(y_true, 'int32')
-        y_pred = keras.backend.cast(keras.backend.round(y_pred), 'int32')
-        correct_preds = keras.backend.cast(
+        y_true = math_ops.cast(y_true, 'int32')
+        y_pred = math_ops.cast(keras.backend.round(y_pred), 'int32')
+        correct_preds = math_ops.cast(
             keras.backend.equal(y_pred, y_true), 'int32')
-        true_pos = keras.backend.cast(
-            keras.backend.sum(correct_preds * y_true), 'int32')
+        true_pos = math_ops.cast(
+            math_ops.reduce_sum(correct_preds * y_true), 'int32')
         current_true_pos = self.true_positives * 1
-        self.add_update(keras.backend.update_add(self.true_positives,
-                                                 true_pos),
-                        inputs=[y_true, y_pred])
+        self.add_update(
+            state_ops.assign_add(self.true_positives, true_pos),
+            inputs=[y_true, y_pred])
         return current_true_pos + true_pos
 
     metric_fn = BinaryTruePositives()
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index acbb9091d3..dc0e472b88 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -31,6 +31,7 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
@@ -118,7 +119,7 @@ class Optimizer(object):
                        'Common ops without gradient: '
                        'K.argmax, K.round, K.eval.')
     if hasattr(self, 'clipnorm') and self.clipnorm > 0:
-      norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
+      norm = K.sqrt(sum([math_ops.reduce_sum(K.square(g)) for g in grads]))
       grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
     if hasattr(self, 'clipvalue') and self.clipvalue > 0:
       grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
@@ -204,20 +205,20 @@ class SGD(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
     self.weights = [self.iterations] + moments
     for p, g, m in zip(params, grads, moments):
       v = self.momentum * m - lr * g  # velocity
-      self.updates.append(K.update(m, v))
+      self.updates.append(state_ops.assign(m, v))
 
       if self.nesterov:
         new_p = p + self.momentum * v - lr * g
@@ -228,7 +229,7 @@ class SGD(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -277,25 +278,25 @@ class RMSprop(Optimizer):
     grads = self.get_gradients(loss, params)
     accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
       new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -339,24 +340,24 @@ class Adagrad(Optimizer):
     shapes = [K.int_shape(p) for p in params]
     accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       new_a = a + K.square(g)  # update accumulator
-      self.updates.append(K.update(a, new_a))
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -403,18 +404,18 @@ class Adadelta(Optimizer):
     accumulators = [K.zeros(shape) for shape in shapes]
     delta_accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators + delta_accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
       new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      self.updates.append(state_ops.assign(a, new_a))
 
       # use the new accumulator and the *old* delta_accumulator
       update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
@@ -424,11 +425,11 @@ class Adadelta(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
 
       # update delta_accumulator
       new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
-      self.updates.append(K.update(d_a, new_d_a))
+      self.updates.append(state_ops.assign(d_a, new_d_a))
     return self.updates
 
   def get_config(self):
@@ -483,15 +484,15 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
     lr_t = lr * (
         K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
 
@@ -509,19 +510,19 @@ class Adam(Optimizer):
       if self.amsgrad:
         vhat_t = K.maximum(vhat, v_t)
         p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
-        self.updates.append(K.update(vhat, vhat_t))
+        self.updates.append(state_ops.assign(vhat, vhat_t))
       else:
         p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -573,15 +574,15 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
     lr_t = lr / (1. - K.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
@@ -597,15 +598,15 @@ class Adamax(Optimizer):
       u_t = K.maximum(self.beta_2 * u, K.abs(g))
       p_t = p - lr_t * m_t / (u_t + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(u, u_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(u, u_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -659,9 +660,9 @@ class Nadam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
@@ -689,8 +690,8 @@ class Nadam(Optimizer):
       m_t_bar = (
           1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
 
       p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
       new_p = p_t
@@ -699,7 +700,7 @@ class Nadam(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -743,7 +744,7 @@ class TFOptimizer(Optimizer):
       global_step = training_util.get_global_step()
       opt_update = self.optimizer.apply_gradients(grads, global_step)
     else:
-      self.updates = [K.update_add(self.iterations, 1)]
+      self.updates = [state_ops.assign_add(self.iterations, 1)]
       if not params:
         return self.updates
 
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 2c30844647..fdb9d33810 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -55,9 +56,9 @@ class L1L2(Regularizer):
   def __call__(self, x):
     regularization = 0.
     if self.l1:
-      regularization += K.sum(self.l1 * K.abs(x))
+      regularization += math_ops.reduce_sum(self.l1 * K.abs(x))
     if self.l2:
-      regularization += K.sum(self.l2 * K.square(x))
+      regularization += math_ops.reduce_sum(self.l2 * K.square(x))
     return regularization
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 4c8009dfd8..902972ecbb 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -35,7 +35,7 @@ def count_params(weights):
   Returns:
       The total number of scalars composing the weights
   """
-  return int(np.sum([K.count_params(p) for p in set(weights)]))
+  return int(np.sum([np.prod(p.get_shape().as_list()) for p in set(weights)]))
 
 
 def print_summary(model, line_length=None, positions=None, print_fn=None):
@@ -193,8 +193,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   else:
     trainable_count = count_params(model.trainable_weights)
 
-  non_trainable_count = int(
-      np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
+  non_trainable_count = count_params(model.non_trainable_weights)
 
   print_fn('Total params: {:,}'.format(trainable_count + non_trainable_count))
   print_fn('Trainable params: {:,}'.format(trainable_count))
-- 
GitLab


From 6d05c781d71b09f10246b2d15039e1b1df899f62 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 3 Apr 2018 17:57:08 -0700
Subject: [PATCH 0235/1262] Fix bug where name generated by Graph::NewName can
 conflict with generated Send node name.

I fixed this very locally to avoid having to fix a bunch of tests
(this is currently blocking enabling the C API), but this should be
fixed more generally in the future.

PiperOrigin-RevId: 191528409
---
 tensorflow/core/graph/quantize_training.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index cb0fc8a154..3b6e8cc233 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -259,8 +259,14 @@ Status AddRestoreVariableSubgraphs(Graph* graph, Node* save_op,
   const string restore_op_name = strings::StrCat(name_prefix, "/RestoreV2");
   const string assign_op_name = strings::StrCat(name_prefix, "/Assign");
   for (Node* var : variables) {
-    string new_restore_op_name = graph->NewName(restore_op_name);
-    string new_assign_op_name = graph->NewName(assign_op_name);
+    // Add an extra prefix after calling graph->NewName because the "unique"
+    // name may conflict with names generated for Send nodes.
+    // TODO(b/77547936): fix this more generally and get rid of the extra prefix
+    // here.
+    string new_restore_op_name =
+        strings::StrCat(graph->NewName(restore_op_name), "_qt");
+    string new_assign_op_name =
+        strings::StrCat(graph->NewName(assign_op_name), "_qt");
     string tensor_names_op_name =
         strings::StrCat(new_restore_op_name, "/tensor_names");
     string shape_and_slices_op_name =
-- 
GitLab


From 2615b467def240e6a90f309aaa56311677104d63 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 3 Apr 2018 18:08:21 -0700
Subject: [PATCH 0236/1262] [XLA] Reshape mover should only transform a
 reshape/transpose instruction if all the users of the instruction can be
 transformed.

This is because if only part of the users of a reshape/transpose instruction are
reshape-move transformed, the original reshape/transpose instruction can't be
eliminated while a modified clone copy of the instruction is added to support
the transformation. As a result, the transformation increases the number of
reshape/transpose instructions in the kerenel and can potentially increase
memory consumption.

Add two test cases.

PiperOrigin-RevId: 191529681
---
 .../compiler/xla/service/reshape_mover.cc     | 306 ++++++++++--------
 .../xla/service/reshape_mover_test.cc         |  80 ++++-
 2 files changed, 255 insertions(+), 131 deletions(-)

diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index f15117f45c..49ec38eb62 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -53,16 +53,8 @@ bool IsReshapeOrTranspose(const HloInstruction* instruction) {
          instruction->opcode() == HloOpcode::kTranspose;
 }
 
-// Returns true if `a` is a broadcast instruction to target shape `shape` and
-// its operand is a scalar.
-bool IsBroadcastScalarToShape(const HloInstruction* a, const Shape& shape) {
-  return a->opcode() == HloOpcode::kBroadcast &&
-         ShapeUtil::SameDimensions(a->shape(), shape) &&
-         ShapeUtil::IsScalar(a->operand(0)->shape());
-}
-
-// Returns true iff `instruction` can change its shape simply by adjusting
-// metadata.
+// Returns true if `instruction` can change its shape simply by adjusting
+// metadata or if `instruction` is a broadcast of a scalar value.
 bool CanTriviallyChangeShape(const HloInstruction* instruction) {
   // NOTE: Technically a sequence of reshape(reshape(constant)) is also
   // trivially reshapable, so we might be tempted to simply recurse if
@@ -97,19 +89,30 @@ bool CanTriviallyChangeShape(const HloInstruction* instruction) {
     return true;
   }
 
+  // A broadcase of scalar can trivially change its shape.
+  if (instruction->opcode() == HloOpcode::kBroadcast &&
+      ShapeUtil::IsScalar(instruction->operand(0)->shape())) {
+    return true;
+  }
+
   return false;
 }
 
-// Finds the first non-scalar operand of an instruction that is a non-trivial
-// reshape or transpose. Returns the operand if it is found or nullptr if not
-// found.
+// Returns true iff `instruction` is a reshape/transpose instruction for which
+// a shape change is nontrivial.
+bool IsNontrivialReshape(const HloInstruction* instruction) {
+  return !ShapeUtil::IsScalar(instruction->shape()) &&
+         IsReshapeOrTranspose(instruction) &&
+         !CanTriviallyChangeShape(instruction->operand(0));
+}
+
+// Finds the first operand of an instruction that is a non-trivial reshape or
+// transpose. Returns such an operand or nullptr if not found.
 HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
     const HloInstruction* hlo) {
   for (HloInstruction* operand : hlo->operands()) {
-    if (!ShapeUtil::IsScalar(operand->shape()) &&
-        IsReshapeOrTranspose(operand) &&
-        !CanTriviallyChangeShape(operand->operand(0))) {
-      VLOG(5) << "Found first non-scalar and non-trivial reshape operand of "
+    if (IsNontrivialReshape(operand)) {
+      VLOG(5) << "Found first non-trivial reshape operand of "
               << hlo->ToString(HloPrintOptions().set_print_metadata(false))
               << ":\n\t"
               << operand->ToString(HloPrintOptions().set_print_metadata(false));
@@ -119,7 +122,7 @@ HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
   return nullptr;
 }
 
-// Returns whether `a` and `b` are equivalent for the purposes of this pass.
+// Returns whether `a` and `b` are equivalent reshapes/transposes.
 bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   if (a->opcode() != b->opcode() ||
       !ShapeUtil::SameDimensions(a->shape(), b->shape())) {
@@ -136,85 +139,14 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   }
 }
 
-// Returns true if all operands of `instruction` can easily change shape.
-// Operands can easily change shape if they are all reshapes/transposes to and
-// from the same shape. Additionally, operands like constant, rng, and any
-// scalar change shape with only an adjustment of metadata.
-bool AllOperandsHaveEasyShapeChanges(
-    const HloInstruction* instruction,
-    const HloInstruction* first_reshape_operand) {
-  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
-  VLOG(3) << "** Checking whether all operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  // Check whether all operands:
-  //    0. Have the same dimensions as the output -- if not, it may be
-  //       implicitly broadcast, which can confound the movement's
-  //       correctness.
-  //
-  // And one of the following:
-  //    1. Are reshapes or transposes that have the same input and
-  //       output shapes as all other reshaped or transposed operands.
-  //     or
-  //    2. Are one of kConstant, kRng, and scalars that can change shape
-  //    trivially,
-  //     or
-  //    3. Are broadcast with a scalar operand.
-  for (const HloInstruction* operand : instruction->operands()) {
-    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
-      VLOG(5) << "Operand shape differs from output shape; may be "
-                 "implicitly broadcast, so preventing "
-                 "movement\n\toperand: "
-              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
-              << instruction->ToString(print_no_metadata);
-      return false;
-    }
-
-    // Skip the rest checks if the current operand is first_reshape_operand
-    // itself.
-    if (first_reshape_operand == operand) {
-      continue;
-    }
-
-    if (AreEquivalentReshapes(first_reshape_operand, operand)) {
-      VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
-              << first_reshape_operand->ToString(print_no_metadata)
-              << "\n\toperand: " << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    if (CanTriviallyChangeShape(operand)) {
-      VLOG(5) << "Operand can trivially change shape: "
-              << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    if (IsBroadcastScalarToShape(operand, first_reshape_operand->shape())) {
-      VLOG(5) << "Broadcast scalar to shape: "
-              << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    // TODO(someone): Look into supporting general ops for the operands as
-    // well.
-    VLOG(5) << "Operand is neither equalivant to the first Reshape operand"
-               "nor can trivially change shape: "
-            << operand->ToString(print_no_metadata);
-    return false;
-  }
-
-  VLOG(3) << "All operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  return true;
-}
-
 // This function is called once we've decided to sink reshape/transpose operands
 // across an instruction. It returns an updated `operand` with a shape that
 // plays nicely with `new_operand_shape`; either it has the same shape (of the
 // correct type), or it is a scalar that may be implicitly broadcast.
-HloInstruction* UpdateOperand(HloComputation* computation,
-                              const HloInstruction* first_reshape_operand,
+HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
                               const Shape& new_operand_shape,
                               HloInstruction* operand) {
+  HloComputation* computation = operand->parent();
   const PrimitiveType element_type = operand->shape().element_type();
   const Shape new_shape =
       ShapeUtil::ChangeElementType(new_operand_shape, element_type);
@@ -245,42 +177,24 @@ HloInstruction* UpdateOperand(HloComputation* computation,
       VLOG(5) << "Using existing operand of kReshape or kTranspose";
       return operand->mutable_operand(0);
     }
-    case HloOpcode::kBroadcast:
-      CHECK(IsBroadcastScalarToShape(operand, first_reshape_operand->shape()));
-      VLOG(5) << "Changing broadcast";
-      return computation->AddInstruction(
+    case HloOpcode::kBroadcast: {
+      CHECK(ShapeUtil::IsScalar(operand->operand(0)->shape()));
+      HloInstruction* inst = computation->AddInstruction(
           operand->CloneWithNewOperands(new_shape, operand->operands()));
+      VLOG(5) << "Changing broadcast from " << operand->ToString() << " to "
+              << inst->ToString();
+      return inst;
+    }
 
     default:
       LOG(FATAL) << "Unexpected operand opcode during update: " << operand;
   }
 }
 
-// Try to sink any reshape or transpose operands of `instruction` across it. We
-// do so if `instruction` is elementwise and all operands are either equivalent
-// reshapes/transposes or are trivially reshapable.
-StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
-                                         HloInstruction* instruction) {
-  // Only perform sinks for live elementwise instructions with operands.
-  const bool is_dead = instruction->user_count() == 0 &&
-                       instruction != computation->root_instruction();
-  if (!instruction->IsElementwise() || instruction->operands().empty() ||
-      is_dead) {
-    return false;
-  }
-
-  // Only perform sinks if there are any nontrivial reshape/transpose operands.
-  const HloInstruction* first_reshape_operand =
-      FirstNonScalarAndNonTrivialReshapeOperand(instruction);
-  if (!first_reshape_operand) {
-    return false;
-  }
-
-  // Only perform sinks if all operands can easily change shape.
-  if (!AllOperandsHaveEasyShapeChanges(instruction, first_reshape_operand)) {
-    return false;
-  }
-
+// Actually performs the reshape-move transformation -- that is, sinks the
+// reshape or transpose operands of `instruction` across it.
+StatusOr<bool> PerformSinkReshapeOrTranspose(
+    HloInstruction* instruction, const HloInstruction* first_reshape_operand) {
   auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   // At this point we've decided to sink reshape/transpose operands.
   const Shape& new_operand_shape = first_reshape_operand->operand(0)->shape();
@@ -301,8 +215,8 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
     }
     VLOG(3) << "Updating operand #" << i << ": "
             << operands[i]->ToString(print_no_metadata);
-    operands[i] = UpdateOperand(computation, first_reshape_operand,
-                                new_operand_shape, operands[i]);
+    operands[i] =
+        UpdateOperand(first_reshape_operand, new_operand_shape, operands[i]);
   }
   if (HloOpcode::kFusion == instruction->opcode()) {
     // Here we already know `instruction` is elementwise, and no operand is
@@ -314,6 +228,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
       *shape->mutable_layout() = new_operand_shape.layout();
     }
   }
+  HloComputation* computation = instruction->parent();
   HloInstruction* new_elementwise =
       computation->AddInstruction(instruction->CloneWithNewOperands(
           // `instruction` may change the element type, e.g., from
@@ -348,6 +263,141 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
   return true;
 }
 
+// Returns true if the instruction is a reshape-move candidate.
+//
+// An instruction is a reshape-move candidate if the instruction is elementwise,
+// has at least one nontrivial reshape/transpose operand, and its operands are
+// either trivially reshapable or are equivalent nontrivial reshapes/transposes.
+bool IsReshapeMoveCandidate(HloInstruction* instruction) {
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
+  VLOG(5) << "** Checking instruction: "
+          << instruction->ToString(print_no_metadata);
+
+  // Only perform reshape-move for live elementwise instructions with operands.
+  const bool is_dead = instruction->user_count() == 0 &&
+                       instruction != instruction->parent()->root_instruction();
+  if (!instruction->IsElementwise() || instruction->operands().empty() ||
+      is_dead) {
+    return false;
+  }
+
+  // Check whether all operands:
+  //    0. Have the same dimensions as the output -- if not, they may be
+  //       implicitly broadcast, which can confound the movement's
+  //       correctness.
+  //
+  // And one of the following:
+  //    1. Are reshapes or transposes that have the same input and
+  //       output shapes as all other reshaped or transposed operands.
+  //     or
+  //    2. Are one of kConstant, kRng, broadcast of a scalar value, and scalars
+  //     that can change shape trivially.
+  const HloInstruction* first_reshape_operand = nullptr;
+  for (const HloInstruction* operand : instruction->operands()) {
+    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+      VLOG(5) << "Operand shape differs from output shape; may be "
+                 "implicitly broadcast, so preventing "
+                 "movement\n\toperand: "
+              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
+              << instruction->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (CanTriviallyChangeShape(operand)) {
+      VLOG(5) << "Operand can trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      continue;
+    }
+
+    if (!IsNontrivialReshape(operand)) {
+      VLOG(5) << "Operand can't trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (first_reshape_operand == nullptr) {
+      first_reshape_operand = operand;
+      VLOG(5) << "First reshape operand "
+              << operand->ToString(print_no_metadata);
+    } else if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+      VLOG(5)
+          << "Operand is an equivalent reshape of the first reshape operand "
+          << operand->ToString(print_no_metadata);
+    } else {
+      // TODO(someone): Look into supporting general ops for the operands as
+      // well.
+      VLOG(5) << "Operand is a reshape but is not equivalent to the first "
+                 "Reshape operand"
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+  }
+
+  if (first_reshape_operand) {
+    VLOG(5) << "All operands have easy shape changes: "
+            << instruction->ToString(print_no_metadata);
+  }
+
+  return first_reshape_operand != nullptr;
+}
+
+// Reshape-moves all qualifying instructions in reshape_candidates.  Returns
+// true if it makes changes.
+//
+// `reshape_candidates` is a set of HloInstructions with nontrivial reshape
+// operands, and a instruction in the set can be reshape-moved iff all the users
+// of its nontrivial reshape operands can also be reshaped-moved.
+//
+// The algorithm here iteratively finds the nontrivial operands with users that
+// are outside the set of `reshape_candidates`, and removes their users from
+// `reshape_candidates`, until either `reshape_candidates` becomes empty or none
+// of the remaining nontrivial operands have users outside `reshape_candidates`.
+// In the later case, all the remaining instructions in `reshape_candidates`
+// are reshape-moved and the routine returns true.
+StatusOr<bool> TryReshapeMoveOnCandidates(
+    HloInstructionSet* reshape_candidates) {
+  bool removed = true;
+  while (!reshape_candidates->empty() && removed) {
+    if (VLOG_IS_ON(5)) {
+      for (const HloInstruction* instruction : *reshape_candidates) {
+        VLOG(5) << "candidate " << instruction->ToString();
+      }
+    }
+    ConstHloInstructionSet nontrivial_operands;
+    for (const HloInstruction* instruction : *reshape_candidates) {
+      for (const auto* operand : instruction->operands()) {
+        if (IsNontrivialReshape(operand)) {
+          nontrivial_operands.insert(operand);
+        }
+      }
+    }
+
+    removed = false;
+    for (auto operand : nontrivial_operands) {
+      if (c_any_of(operand->users(), [&](HloInstruction* user) {
+            return !reshape_candidates->count(user);
+          })) {
+        for (auto* user : operand->users()) {
+          removed |= reshape_candidates->erase(user) > 0;
+        }
+      }
+    }
+  }
+
+  if (reshape_candidates->empty()) {
+    return false;
+  }
+  for (HloInstruction* instruction : *reshape_candidates) {
+    const HloInstruction* first_reshape_operand =
+        FirstNonScalarAndNonTrivialReshapeOperand(instruction);
+    TF_ASSIGN_OR_RETURN(
+        bool did_change,
+        PerformSinkReshapeOrTranspose(instruction, first_reshape_operand));
+    CHECK(did_change);
+  }
+  return true;
+}
+
 }  // namespace
 
 StatusOr<bool> ReshapeMover::Run(HloModule* module) {
@@ -355,11 +405,15 @@ StatusOr<bool> ReshapeMover::Run(HloModule* module) {
   VLOG(2) << "Pre ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
   for (auto* comp : module->MakeNonfusionComputations()) {
-    for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      TF_ASSIGN_OR_RETURN(bool did_change,
-                          TrySinkReshapeOrTranspose(comp, instruction));
-      changed |= did_change;
+    HloInstructionSet reshape_candidates;
+    for (HloInstruction* instruction : comp->instructions()) {
+      if (IsReshapeMoveCandidate(instruction)) {
+        reshape_candidates.insert(instruction);
+      }
     }
+    TF_ASSIGN_OR_RETURN(bool did_change,
+                        TryReshapeMoveOnCandidates(&reshape_candidates));
+    changed |= did_change;
   }
   VLOG(2) << "Post ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 4e0a0a8832..094f7319f4 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -564,15 +564,15 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
   const string hlo_string = R"(
     HloModule TransposeMulInversedTransposeModule
     ENTRY TransposeMulInversedTranspose {
-      src0 = f32[1,20,8,32]{3,2,1,0} parameter(0)
-      transpose0 = f32[1,8,20,32]{3,2,1,0} transpose(src0), dimensions={0,2,1,3}
+      src0 = f32[20,8]{1,0} parameter(0)
+      transpose0 = f32[8,20]{1,0} transpose(src0), dimensions={1,0}
       src1 = f32[] parameter(1)
-      broadcast0 = f32[1,8,20,32]{3,2,1,0} broadcast(src1), dimensions={}
-      ROOT multiply0 = f32[1,8,20,32]{3,2,1,0} multiply(transpose0, broadcast0)
+      broadcast0 = f32[8,20]{1,0} broadcast(src1), dimensions={}
+      ROOT multiply0 = f32[8,20]{1,0} multiply(transpose0, broadcast0)
     }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
   EXPECT_TRUE(changed);
 
@@ -580,5 +580,75 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
               op::Transpose(op::Multiply()));
 }
 
+TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) {
+  const string hlo_string = R"(
+    HloModule ReshapeWithUsersOutsideCandidates
+    ENTRY ReshapeWithMultipleUsers {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      param4 = f32[8,20]{1,0} parameter(4)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, param4)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates1
+    ENTRY ReshapeWithMultipleUsers1 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, reshape2)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Tuple(op::Reshape(), op::Reshape(), op::Reshape()));
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates2
+    ENTRY ReshapeWithMultipleUsers2 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      ROOT add0 = f32[8,20]{1,0} add(reshape0, reshape0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Reshape(op::Add()));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 763ebd29fec7d3f8b26ead26f8d7c571672d2fb4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 3 Apr 2018 18:16:54 -0700
Subject: [PATCH 0237/1262] Fix error message that was displaying wrong shape

PiperOrigin-RevId: 191530602
---
 tensorflow/core/kernels/sparse_cross_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 4b5df7aff0..4ebb7fbcc7 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -419,7 +419,7 @@ class SparseCrossOp : public OpKernel {
           context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()),
           errors::InvalidArgument(
               "Dense inputs should be a matrix but received shape ",
-              indices_list_in[i].shape().DebugString(), " at position ", i));
+              dense_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size,
                   errors::InvalidArgument("Expected batch size ", batch_size,
                                           " got ", dense_list_in[i].dim_size(0),
-- 
GitLab


From be6b0edf02bb691e7e1b1ff5bb05c18f4830f94c Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Tue, 3 Apr 2018 18:47:46 -0700
Subject: [PATCH 0238/1262] Extra logging to help debug shape inference logic
 in optimizers.

PiperOrigin-RevId: 191533338
---
 tensorflow/core/grappler/costs/graph_properties.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 5103098f27..8fe154dbf3 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -1011,6 +1011,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
     // Skip any information that comes from fed nodes.
     if (fed_ports.find(node->name()) != fed_ports.end()) {
+      VLOG(2) << "Skipping feed node shape: " << node->name();
       continue;
     }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
-- 
GitLab


From 62d547aa53dd0e7f53e8544b3c41d9274727d333 Mon Sep 17 00:00:00 2001
From: Rui Zhao <rzhao@google.com>
Date: Tue, 3 Apr 2018 18:49:53 -0700
Subject: [PATCH 0239/1262] Attach an id for each cost node in virtual_cluster.

PiperOrigin-RevId: 191533511
---
 tensorflow/core/grappler/clusters/virtual_cluster.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index ae70c98608..abfa7bc48e 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -66,6 +66,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
   }
 
   Costs node_costs;
+  int node_id = 0;
   do {
     OpContext op_context = scheduler.GetCurrNode();
     node_costs = node_estimator_->PredictCosts(op_context);
@@ -73,6 +74,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
       CostGraphDef::Node* cost_node =
           metadata->mutable_cost_graph()->add_node();
       const string& op_name = op_context.name;
+      cost_node->set_id(node_id++);
       cost_node->set_name(op_name);
       cost_node->set_device(op_context.device_name);
       cost_node->set_compute_cost(
-- 
GitLab


From ede6e1ff31531cae98844676af4981a821760188 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 3 Apr 2018 19:20:44 -0700
Subject: [PATCH 0240/1262] [TF:XLA] Add half precision support to test_utils.

PiperOrigin-RevId: 191535944
---
 tensorflow/compiler/xla/tests/test_utils.cc | 25 ++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0bc7df2a65..821432ef7d 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -23,14 +23,14 @@ namespace xla {
 
 namespace {
 
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal,
-                                         std::minstd_rand0* engine) {
+template <typename FloatT, typename GeneratorT>
+void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
+                                             std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   // Create uniform numbers between 1 and 1.125 to avoid creating denormal
   // numbers.
-  std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  std::uniform_real_distribution<GeneratorT> generator(1.0f, 1.125f);
   const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
@@ -52,10 +52,22 @@ void PopulateWithRandomFloatingPointData(Literal* literal,
         FloatT index_bias =
             static_cast<FloatT>(index_product % 113 - negative_bias) /
             static_cast<FloatT>(256.0f);
-        return (generator(*engine) - 1.0625) + index_bias;
+        return static_cast<FloatT>(generator(*engine) - 1.0625f) + index_bias;
       }));
 }
 
+template <typename FloatT>
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
+}
+
+template <>
+void PopulateWithRandomFloatingPointData<half>(Literal* literal,
+                                               std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
+}
+
 // The standard library does not have a case for bfloat16, unsurprisingly, so we
 // handle that one specially.
 template <>
@@ -100,6 +112,9 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
     case BF16:
       PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
       break;
+    case F16:
+      PopulateWithRandomFloatingPointData<half>(literal.get(), engine);
+      break;
     case F32:
       PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
       break;
-- 
GitLab


From 0f60a7d10102048625e45c07a59c71c0c4375529 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 3 Apr 2018 19:29:53 -0700
Subject: [PATCH 0241/1262] Enable calling the C API by default.

Enabling the C API makes TensorFlow's Python code to call the TensorFlow C API for low-level graph construction. This should cause no noticeable changes for the most part. One known difference is improved static shape inference, meaning some shape errors will be surfaced during graph construction instead of at runtime.

Note that this can be disabled by setting the environment variable TF_C_API_GRAPH_CONSTRUCTION=0 (for now, eventually this option will be removed).

PiperOrigin-RevId: 191536563
---
 tensorflow/python/framework/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index be0fe5ee44..0215501b56 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -63,7 +63,7 @@ from tensorflow.python.util.tf_export import tf_export
 # calls to the C API. Currently disabled by default but can be manually enabled
 # in code or via the environment variable. This will be removed once all
 # functionality is supported and there's no performance penalty with it enabled.
-_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "0") is not "0"
+_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "1") is not "0"
 _USE_C_SHAPES = os.getenv("TF_C_API_GRAPH_CONSTRUCTION_SHAPES", "0") is not "0"
 
 
-- 
GitLab


From 1e38be65609a3fcdbc764a8a6d26a3f13f0efbbc Mon Sep 17 00:00:00 2001
From: Olivia Nordquist <nolivia@google.com>
Date: Tue, 3 Apr 2018 20:28:58 -0700
Subject: [PATCH 0242/1262] removing an old TODO

PiperOrigin-RevId: 191540512
---
 tensorflow/python/ops/math_ops_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 9f85188b35..05bcee8801 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -155,9 +155,7 @@ class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testRounding(self):
-    x = [0.49, 0.7, -0.3, -0.8]
-    # TODO(nolivia): Remove this when RoundOp is forwards compatible
-    # x = np.arange(-5.0, 5.0, .25)
+    x = np.arange(-5.0, 5.0, .25)
     for dtype in [np.float32, np.double, np.int32]:
       x_np = np.array(x, dtype=dtype)
       with test_util.device(use_gpu=True):
-- 
GitLab


From 65631f2d1af62f05b56a8a59b3e223883daa4b80 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Tue, 3 Apr 2018 21:25:20 -0700
Subject: [PATCH 0243/1262] Help build_pip_package work without ./configure

If the ./configure tool isn't run this script won't exist. It should
be safe to ignore this file if it isn't present since the variable
expansion below will just default to whatever python is on the PATH.

PiperOrigin-RevId: 191543995
---
 tensorflow/tools/pip_package/build_pip_package.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index dc31e4c5f7..e2d212a0db 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -160,7 +160,9 @@ function main() {
 
   # Before we leave the top-level directory, make sure we know how to
   # call python.
-  source tools/python_bin_path.sh
+  if [[ -e tools/python_bin_path.sh ]]; then
+    source tools/python_bin_path.sh
+  fi
 
   pushd ${TMPDIR}
   rm -f MANIFEST
-- 
GitLab


From 437fe4535c5447e192430763cb6e94b699a6a635 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Tue, 3 Apr 2018 21:33:58 -0700
Subject: [PATCH 0244/1262] Fix formatting of MKL URLs

The github.com was missing in the mirror.bazel.build path.

PiperOrigin-RevId: 191544461
---
 tensorflow/workspace.bzl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 0bb297e72e..2510d369fc 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -48,7 +48,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_linux",
       urls = [
-          "https://mirror.bazel.build/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
           "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
       ],
       sha256 = "feacc3d82565c1231470359b42c696236fae873704e0b013436afba5fd4fd30f",
@@ -58,7 +58,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
           "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip"
       ],
       sha256 = "24bae8d7b22b431a654acadea43f2243c46ae6b1e5a73a4a936825f31d284ee4",
@@ -68,7 +68,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
           "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz"
       ],
       sha256 = "0e954ec6fd3dc5e37f64c4043f6b5613dd687558da3df1028b3b7c29ff5cf77f",
-- 
GitLab


From 1f5324ca69bc1017972eef8e418691cff9a86dd7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 00:33:14 -0700
Subject: [PATCH 0245/1262] Fix PR-AUC calculation, namely the incorrect use of
 linear interpolation for Precision - see Section 4 of Davis & Goadrich 2006
 (https://www.biostat.wisc.edu/~page/rocpr.pdf) Also, modify the name of the
 "trapezoidal" summation method to reflect the fact that the proper
 interpolation method in this case isn't quite the trapezoidal one.

PiperOrigin-RevId: 191555707
---
 .../python/kernel_tests/metrics_test.py       | 63 +++++++++++++--
 tensorflow/python/ops/metrics_impl.py         | 77 +++++++++++++++++--
 2 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index ad802f7e1f..55653489af 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -1124,40 +1124,91 @@ class AUCTest(test.TestCase):
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
-  def testAUCPRSpecialCase(self):
+  # Regarding the AUC-PR tests: note that the preferred method when
+  # calculating AUC-PR is summation_method='careful_interpolation'.
+  def testCorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.79726744594
+      expected = 1 - math.log(1.5) / 2
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testCorrectAnotherAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.61350593198
+      expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testThirdCorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.90410597584
+      expected = 1 - math.log(4./3) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testIncorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
-  def testAnotherAUCPRSpecialCase(self):
+  def testAnotherIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
-  def testThirdAUCPRSpecialCase(self):
+  def testThirdIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 9ec4954579..47eea6ef6b 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -626,10 +627,16 @@ def auc(labels,
     curve: Specifies the name of the curve to be computed, 'ROC' [default] or
       'PR' for the Precision-Recall-curve.
     name: An optional variable_scope name.
-    summation_method: Specifies the Riemann summation method used, 'trapezoidal'
-      [default] that applies the trapezoidal rule, 'minoring' that applies
-      left summation for increasing intervals and right summation for decreasing
-      intervals or 'majoring' that applies the opposite.
+    summation_method: Specifies the Riemann summation method used
+      (https://en.wikipedia.org/wiki/Riemann_sum): 'trapezoidal' [default] that
+      applies the trapezoidal rule; 'careful_interpolation', a variant of it
+      differing only by a more correct interpolation scheme for PR-AUC -
+      interpolating (true/false) positives but not the ratio that is precision;
+      'minoring' that applies left summation for increasing intervals and right
+      summation for decreasing intervals; 'majoring' that does the opposite.
+      Note that 'careful_interpolation' is strictly preferred to 'trapezoidal'
+      (to be deprecated soon) as it applies the same method for ROC, and a
+      better one (see Davis & Goadrich 2006 for details) for the PR curve.
 
   Returns:
     auc: A scalar `Tensor` representing the current area-under-curve.
@@ -664,8 +671,62 @@ def auc(labels,
     # Add epsilons to avoid dividing by 0.
     epsilon = 1.0e-6
 
+    def interpolate_pr_auc(tp, fp, fn):
+      """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.
+
+      Note here we derive & use a closed formula not present in the paper
+      - as follows:
+      Modeling all of TP (true positive weight),
+      FP (false positive weight) and their sum P = TP + FP (positive weight)
+      as varying linearly within each interval [A, B] between successive
+      thresholds, we get
+        Precision = (TP_A + slope * (P - P_A)) / P
+      with slope = dTP / dP = (TP_B - TP_A) / (P_B - P_A).
+      The area within the interval is thus (slope / total_pos_weight) times
+        int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
+        int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}
+      where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in
+        int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)
+      Bringing back the factor (slope / total_pos_weight) we'd put aside, we get
+         slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight
+      where dTP == TP_B - TP_A.
+      Note that when P_A == 0 the above calculation simplifies into
+        int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)
+      which is really equivalent to imputing constant precision throughout the
+      first bucket having >0 true positives.
+
+      Args:
+        tp: true positive counts
+        fp: false positive counts
+        fn: false negative counts
+      Returns:
+        pr_auc: an approximation of the area under the P-R curve.
+      """
+      dtp = tp[:num_thresholds - 1] - tp[1:]
+      p = tp + fp
+      prec_slope = _safe_div(dtp, p[:num_thresholds - 1] - p[1:], 'prec_slope')
+      intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
+      safe_p_ratio = array_ops.where(
+          math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
+          _safe_div(p[:num_thresholds - 1], p[1:], 'recall_relative_ratio'),
+          array_ops.ones_like(p[1:]))
+      return math_ops.reduce_sum(
+          _safe_div(
+              prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
+              tp[1:] + fn[1:],
+              name='pr_auc_increment'),
+          name='interpolate_pr_auc')
+
     def compute_auc(tp, fn, tn, fp, name):
       """Computes the roc-auc or pr-auc based on confusion counts."""
+      if curve == 'PR':
+        if summation_method == 'trapezoidal':
+          logging.warning(
+              'Trapezoidal rule is known to produce incorrect PR-AUCs; '
+              'please switch to "careful_interpolation" instead.')
+        elif summation_method == 'careful_interpolation':
+          # This one is a bit tricky and is handled separately.
+          return interpolate_pr_auc(tp, fp, fn)
       rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
       if curve == 'ROC':
         fp_rate = math_ops.div(fp, fp + tn + epsilon)
@@ -675,7 +736,9 @@ def auc(labels,
         prec = math_ops.div(tp + epsilon, tp + fp + epsilon)
         x = rec
         y = prec
-      if summation_method == 'trapezoidal':
+      if summation_method in ('trapezoidal', 'careful_interpolation'):
+        # Note that the case ('PR', 'careful_interpolation') has been handled
+        # above.
         return math_ops.reduce_sum(
             math_ops.multiply(x[:num_thresholds - 1] - x[1:],
                               (y[:num_thresholds - 1] + y[1:]) / 2.),
@@ -923,8 +986,8 @@ def mean_per_class_accuracy(labels,
         weights = array_ops.reshape(weights, [-1])
       weights = math_ops.to_float(weights)
 
-      is_correct = is_correct * weights
-      ones = ones * weights
+      is_correct *= weights
+      ones *= weights
 
     update_total_op = state_ops.scatter_add(total, labels, ones)
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
-- 
GitLab


From 71b19430e8484b136e0b872f6a543aff8a242587 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 05:31:48 -0700
Subject: [PATCH 0246/1262] Sync replicas distributed training example with two
 strategies: 1) Interleave covariance and inverse update ops with training op.
 2) Run the inverse and covariance ops on separate dedicated workers.

PiperOrigin-RevId: 191579634
---
 tensorflow/contrib/kfac/examples/BUILD        |  24 +-
 tensorflow/contrib/kfac/examples/convnet.py   | 315 +++++++++++++++---
 .../convnet_mnist_distributed_main.py         |  62 ++++
 .../convnet_mnist_multi_tower_main.py         |  48 +++
 ...t_main.py => convnet_mnist_single_main.py} |  32 +-
 .../kfac/examples/tests/convnet_test.py       |  17 +-
 6 files changed, 411 insertions(+), 87 deletions(-)
 create mode 100644 tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
 create mode 100644 tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
 rename tensorflow/contrib/kfac/examples/{convnet_mnist_main.py => convnet_mnist_single_main.py} (57%)

diff --git a/tensorflow/contrib/kfac/examples/BUILD b/tensorflow/contrib/kfac/examples/BUILD
index 7dd40c19c5..8186fa1c62 100644
--- a/tensorflow/contrib/kfac/examples/BUILD
+++ b/tensorflow/contrib/kfac/examples/BUILD
@@ -28,8 +28,28 @@ py_library(
 )
 
 py_binary(
-    name = "convnet_mnist_main",
-    srcs = ["convnet_mnist_main.py"],
+    name = "convnet_mnist_single_main",
+    srcs = ["convnet_mnist_single_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_multi_tower_main",
+    srcs = ["convnet_mnist_multi_tower_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_distributed_main",
+    srcs = ["convnet_mnist_distributed_main.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":convnet",
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index 39d80addaa..e8e3353091 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -37,6 +37,8 @@ import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import mlp
 from tensorflow.contrib.kfac.examples import mnist
+from tensorflow.contrib.kfac.python.ops import optimizer as opt
+
 
 lc = tf.contrib.kfac.layer_collection
 oq = tf.contrib.kfac.op_queue
@@ -48,12 +50,18 @@ __all__ = [
     "linear_layer",
     "build_model",
     "minimize_loss_single_machine",
-    "minimize_loss_distributed",
+    "distributed_grads_only_and_ops_chief_worker",
+    "distributed_grads_and_ops_dedicated_workers",
     "train_mnist_single_machine",
-    "train_mnist_distributed",
+    "train_mnist_distributed_sync_replicas",
+    "train_mnist_multitower"
 ]
 
 
+# Inverse update ops will be run every _INVERT_EVRY iterations.
+_INVERT_EVERY = 10
+
+
 def conv_layer(layer_id, inputs, kernel_size, out_channels):
   """Builds a convolutional layer with ReLU non-linearity.
 
@@ -161,8 +169,9 @@ def build_model(examples, labels, num_labels, layer_collection):
   accuracy = tf.reduce_mean(
       tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))
 
-  tf.summary.scalar("loss", loss)
-  tf.summary.scalar("accuracy", accuracy)
+  with tf.device("/cpu:0"):
+    tf.summary.scalar("loss", loss)
+    tf.summary.scalar("accuracy", accuracy)
 
   # Register parameters. K-FAC needs to know about the inputs, outputs, and
   # parameters of each conv/fully connected layer and the logits powering the
@@ -181,41 +190,59 @@ def build_model(examples, labels, num_labels, layer_collection):
 def minimize_loss_single_machine(loss,
                                  accuracy,
                                  layer_collection,
+                                 device="/gpu:0",
                                  session_config=None):
   """Minimize loss with K-FAC on a single machine.
 
-  A single Session is responsible for running all of K-FAC's ops.
+  A single Session is responsible for running all of K-FAC's ops. The covariance
+  and inverse update ops are placed on `device`. All model variables are on CPU.
 
   Args:
     loss: 0-D Tensor. Loss to be minimized.
     accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and invserse
+      update ops are run on this device.
     session_config: None or tf.ConfigProto. Configuration for tf.Session().
 
   Returns:
     final value for 'accuracy'.
   """
   # Train with K-FAC.
-  global_step = tf.train.get_or_create_global_step()
+  g_step = tf.train.get_or_create_global_step()
   optimizer = opt.KfacOptimizer(
       learning_rate=0.0001,
       cov_ema_decay=0.95,
       damping=0.001,
       layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=[device],
+      inv_devices=[device],
       momentum=0.9)
-  train_op = optimizer.minimize(loss, global_step=global_step)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+  with tf.device(device):
+    train_op = optimizer.minimize(loss, global_step=g_step)
+
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([train_op, cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
 
   tf.logging.info("Starting training.")
   with tf.train.MonitoredTrainingSession(config=session_config) as sess:
     while not sess.should_stop():
-      global_step_, loss_, accuracy_, _, _ = sess.run(
-          [global_step, loss, accuracy, train_op, optimizer.cov_update_op])
-
-      if global_step_ % 100 == 0:
-        sess.run(optimizer.inv_update_op)
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, inverse_op])
 
-      if global_step_ % 100 == 0:
+      if (global_step_ + 1) % _INVERT_EVERY == 0:
         tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                         global_step_, loss_, accuracy_)
 
@@ -250,16 +277,62 @@ def _num_gradient_tasks(num_tasks):
   return int(np.ceil(0.6 * num_tasks))
 
 
-def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
-                              checkpoint_dir, loss, accuracy, layer_collection):
-  """Minimize loss with an synchronous implementation of K-FAC.
+def _make_distributed_train_op(
+    task_id,
+    num_worker_tasks,
+    num_ps_tasks,
+    layer_collection
+):
+  """Creates optimizer and distributed training op.
 
-  Different tasks are responsible for different parts of K-FAC's Ops. The first
-  60% of tasks update weights; the next 20% accumulate covariance statistics;
-  the last 20% invert the matrices used to precondition gradients.
+  Constructs KFAC optimizer and wraps it in `sync_replicas` optimizer. Makes
+  the train op.
+
+  Args:
+   task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    sync_optimizer: `tf.train.SyncReplicasOptimizer` instance which wraps KFAC
+      optimizer.
+    optimizer: Instance of `opt.KfacOptimizer`.
+    global_step: `tensor`, Global step.
+  """
+  tf.logging.info("Task id : %d", task_id)
+  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
+    global_step = tf.train.get_or_create_global_step()
+    optimizer = opt.KfacOptimizer(
+        learning_rate=0.0001,
+        cov_ema_decay=0.95,
+        damping=0.001,
+        layer_collection=layer_collection,
+        momentum=0.9)
+    sync_optimizer = tf.train.SyncReplicasOptimizer(
+        opt=optimizer,
+        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks),
+        total_num_replicas=num_worker_tasks)
+    return sync_optimizer, optimizer, global_step
+
+
+def distributed_grads_only_and_ops_chief_worker(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection, invert_every=10):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  All workers perform gradient computation. Chief worker applies gradient after
+  averaging the gradients obtained from all the workers. All workers block
+  execution untill the update is applied. Chief worker runs covariance and
+  inverse update ops. Covariance and inverse matrices are placed on parameter
+  servers in a round robin manner. For further details on synchronous
+  distributed optimization check `tf.train.SyncReplicasOptimizer`.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables. If 0,
       parameter servers are not used.
@@ -271,6 +344,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       run with each step.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    invert_every: `int`, Number of steps between update the inverse.
 
   Returns:
     final value for 'accuracy'.
@@ -278,19 +352,80 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   Raises:
     ValueError: if task_id >= num_worker_tasks.
   """
-  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
-    global_step = tf.train.get_or_create_global_step()
-    optimizer = opt.KfacOptimizer(
-        learning_rate=0.0001,
-        cov_ema_decay=0.95,
-        damping=0.001,
-        layer_collection=layer_collection,
-        momentum=0.9)
-    inv_update_queue = oq.OpQueue(optimizer.inv_update_ops)
-    sync_optimizer = tf.train.SyncReplicasOptimizer(
-        opt=optimizer,
-        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks))
-    train_op = sync_optimizer.minimize(loss, global_step=global_step)
+
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+
+  tf.logging.info("Starting training.")
+  hooks = [sync_optimizer.make_session_run_hook(is_chief)]
+
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  if is_chief:
+    cov_update_op = make_update_op(cov_update_thunks)
+    with tf.control_dependencies([train_op, cov_update_op]):
+      update_op = tf.cond(
+          tf.equal(tf.mod(global_step + 1, invert_every), 0),
+          lambda: make_update_op(inv_update_thunks),
+          tf.no_op)
+  else:
+    update_op = train_op
+
+  with tf.train.MonitoredTrainingSession(
+      master=master,
+      is_chief=is_chief,
+      checkpoint_dir=checkpoint_dir,
+      hooks=hooks,
+      stop_grace_period_secs=0) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [global_step, loss, accuracy, update_op])
+      tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_,
+                      loss_, accuracy_)
+  return accuracy_
+
+
+def distributed_grads_and_ops_dedicated_workers(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  Different workers are responsible for different parts of K-FAC's Ops. The
+  first 60% of tasks compute gradients; the next 20% accumulate covariance
+  statistics; the last 20% invert the matrices used to precondition gradients.
+  The chief worker applies the gradient .
+
+  Args:
+    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    master: string. IP and port of TensorFlow runtime process. Set to empty
+      string to run locally.
+    checkpoint_dir: string or None. Path to store checkpoints under.
+    loss: 0-D Tensor. Loss to be minimized.
+    accuracy: dict mapping strings to 0-D Tensors. Additional accuracy to
+      run with each step.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    final value for 'accuracy'.
+
+  Raises:
+    ValueError: if task_id >= num_worker_tasks.
+  """
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  _, cov_update_op, inv_update_ops, _, _, _ = optimizer.make_ops_and_vars()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+  inv_update_queue = oq.OpQueue(inv_update_ops)
 
   tf.logging.info("Starting training.")
   is_chief = (task_id == 0)
@@ -306,7 +441,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       if _is_gradient_task(task_id, num_worker_tasks):
         learning_op = train_op
       elif _is_cov_update_task(task_id, num_worker_tasks):
-        learning_op = optimizer.cov_update_op
+        learning_op = cov_update_op
       elif _is_inv_update_task(task_id, num_worker_tasks):
         # TODO(duckworthd): Running this op before cov_update_op has been run a
         # few times can result in "InvalidArgumentError: Cholesky decomposition
@@ -324,13 +459,18 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   return accuracy_
 
 
-def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
+def train_mnist_single_machine(data_dir,
+                               num_epochs,
+                               use_fake_data=False,
+                               device="/gpu:0"):
   """Train a ConvNet on MNIST.
 
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
@@ -350,22 +490,38 @@ def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
       examples, labels, num_labels=10, layer_collection=layer_collection)
 
   # Fit model.
-  return minimize_loss_single_machine(loss, accuracy, layer_collection)
+  return minimize_loss_single_machine(
+      loss, accuracy, layer_collection, device=device)
 
 
 def train_mnist_multitower(data_dir, num_epochs, num_towers,
-                           use_fake_data=True):
+                           use_fake_data=True, devices=None):
   """Train a ConvNet on MNIST.
 
+  Training data is split equally among the towers. Each tower computes loss on
+  its own batch of data and the loss is aggregated on the CPU. The model
+  variables are placed on first tower. The covariance and inverse update ops
+  and variables are placed on GPUs in a round robin manner.
+
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     num_towers: int. Number of CPUs to split inference across.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    devices: string, Either list of CPU or GPU. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
   """
+  if devices:
+    device_count = {"GPU": num_towers}
+  else:
+    device_count = {"CPU": num_towers}
+
+  devices = devices or [
+      "/cpu:{}".format(tower_id) for tower_id in range(num_towers)
+  ]
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
   tower_batch_size = 128
@@ -388,7 +544,7 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   layer_collection = lc.LayerCollection()
   tower_results = []
   for tower_id in range(num_towers):
-    with tf.device("/cpu:%d" % tower_id):
+    with tf.device(devices[tower_id]):
       with tf.name_scope("tower%d" % tower_id):
         with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
           tf.logging.info("Building tower %d." % tower_id)
@@ -402,34 +558,79 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   accuracy = tf.reduce_mean(accuracies)
 
   # Fit model.
+
   session_config = tf.ConfigProto(
-      allow_soft_placement=False, device_count={
-          "CPU": num_towers
-      })
-  return minimize_loss_single_machine(
-      loss, accuracy, layer_collection, session_config=session_config)
+      allow_soft_placement=False,
+      device_count=device_count,
+  )
+
+  g_step = tf.train.get_or_create_global_step()
+  optimizer = opt.KfacOptimizer(
+      learning_rate=0.0001,
+      cov_ema_decay=0.95,
+      damping=0.001,
+      layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=devices,
+      inv_devices=devices,
+      momentum=0.9)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
 
+  train_op = optimizer.minimize(loss, global_step=g_step)
 
-def train_mnist_distributed(task_id,
-                            num_worker_tasks,
-                            num_ps_tasks,
-                            master,
-                            data_dir,
-                            num_epochs,
-                            use_fake_data=False):
-  """Train a ConvNet on MNIST.
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([train_op, cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
+
+  tf.logging.info("Starting training.")
+  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, inverse_op])
+
+      if (global_step_ + 1) % _INVERT_EVERY == 0:
+        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
+                        global_step_, loss_, accuracy_)
+
+
+def train_mnist_distributed_sync_replicas(task_id,
+                                          is_chief,
+                                          num_worker_tasks,
+                                          num_ps_tasks,
+                                          master,
+                                          data_dir,
+                                          num_epochs,
+                                          op_strategy,
+                                          use_fake_data=False):
+  """Train a ConvNet on MNIST using Sync replicas optimizer.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables.
     master: string. IP and port of TensorFlow runtime process.
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
+    op_strategy: `string`, Strategy to run the covariance and inverse
+      ops. If op_strategy == `chief_worker` then covaraiance and inverse
+      update ops are run on chief worker otherwise they are run on dedicated
+      workers.
+
     use_fake_data: bool. If True, generate a synthetic dataset.
 
   Returns:
     accuracy of model on the final minibatch of training data.
+
+  Raises:
+    ValueError: If `op_strategy` not in ["chief_worker", "dedicated_workers"].
   """
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
@@ -448,9 +649,17 @@ def train_mnist_distributed(task_id,
 
   # Fit model.
   checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
-  return minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks,
-                                   master, checkpoint_dir, loss, accuracy,
-                                   layer_collection)
+  if op_strategy == "chief_worker":
+    return distributed_grads_only_and_ops_chief_worker(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  elif op_strategy == "dedicated_workers":
+    return distributed_grads_and_ops_dedicated_workers(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  else:
+    raise ValueError("Only supported op strategies are : {}, {}".format(
+        "chief_worker", "dedicated_workers"))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
new file mode 100644
index 0000000000..b4c2d4a9e9
--- /dev/null
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Distributed training with sync replicas optimizer. See
+`convnet.train_mnist_distributed_sync_replicas` for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_integer("task", -1, "Task identifier")
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+flags.DEFINE_string(
+    "cov_inv_op_strategy", "chief_worker",
+    "In dist training mode run the cov, inv ops on chief or dedicated workers."
+)
+flags.DEFINE_string("master", "local", "Session master.")
+flags.DEFINE_integer("ps_tasks", 2,
+                     "Number of tasks in the parameter server job.")
+flags.DEFINE_integer("replicas_to_aggregate", 5,
+                     "Number of replicas to aggregate.")
+flags.DEFINE_integer("worker_replicas", 5, "Number of replicas in worker job.")
+flags.DEFINE_integer("num_epochs", None, "Number of epochs.")
+
+
+def _is_chief():
+  """Determines whether a job is the chief worker."""
+  if "chief_worker" in FLAGS.brain_jobs:
+    return FLAGS.brain_job_name == "chief_worker"
+  else:
+    return FLAGS.task == 0
+
+
+def main(unused_argv):
+  _ = unused_argv
+  convnet.train_mnist_distributed_sync_replicas(
+      FLAGS.task, _is_chief(), FLAGS.worker_replicas, FLAGS.ps_tasks,
+      FLAGS.master, FLAGS.data_dir, FLAGS.num_epochs, FLAGS.cov_inv_op_strategy)
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
new file mode 100644
index 0000000000..4249bf8a8d
--- /dev/null
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Multi tower training mode. See `convnet.train_mnist_multitower` for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/multitower_1/mnist", "local mnist dir")
+flags.DEFINE_integer("num_towers", 2,
+                     "Number of towers for multi tower training.")
+
+
+def main(unused_argv):
+  _ = unused_argv
+  assert FLAGS.num_towers > 1
+  devices = ["/gpu:{}".format(tower_id) for tower_id in range(FLAGS.num_towers)]
+  convnet.train_mnist_multitower(
+      FLAGS.data_dir,
+      num_epochs=200,
+      num_towers=FLAGS.num_towers,
+      devices=devices)
+
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
similarity index 57%
rename from tensorflow/contrib/kfac/examples/convnet_mnist_main.py
rename to tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
index b0c6fbde19..3aa52aff19 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
@@ -14,44 +14,26 @@
 # ==============================================================================
 r"""Train a ConvNet on MNIST using K-FAC.
 
-See convnet.py for details.
+Train on single machine. See `convnet.train_mnist_single_machine` for details.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-import sys
 
+from absl import flags
 import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import convnet
 
-FLAGS = None
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
 
 
-def main(argv):
-  _ = argv
-
-  if FLAGS.num_towers > 1:
-    convnet.train_mnist_multitower(
-        FLAGS.data_dir, num_epochs=200, num_towers=FLAGS.num_towers)
-  else:
-    convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
+def main(unused_argv):
+  convnet.train_mnist_single_gpu(FLAGS.data_dir, num_epochs=200)
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--data_dir",
-      type=str,
-      default="/tmp/mnist",
-      help="Directory to store dataset in.")
-  parser.add_argument(
-      "--num_towers",
-      type=int,
-      default=1,
-      help="Number of CPUs to split minibatch across.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
index 8d86c2bb51..6de775cc79 100644
--- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py
+++ b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
@@ -112,15 +112,16 @@ class ConvNetTest(tf.test.TestCase):
   def testMinimizeLossSingleMachine(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_single_machine(loss, accuracy,
-                                                       layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      accuracy_ = convnet.minimize_loss_single_machine(
+          loss, accuracy, layer_collection, device="/cpu:0")
+      self.assertLess(accuracy_, 2.0)
 
   def testMinimizeLossDistributed(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_distributed(
+      accuracy_ = convnet.distributed_grads_only_and_ops_chief_worker(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
@@ -128,7 +129,7 @@ class ConvNetTest(tf.test.TestCase):
           loss=loss,
           accuracy=accuracy,
           layer_collection=layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      self.assertLess(accuracy_, 2.0)
 
   def testTrainMnistSingleMachine(self):
     with tf.Graph().as_default():
@@ -138,7 +139,7 @@ class ConvNetTest(tf.test.TestCase):
       # but there are too few parameters for the model to effectively memorize
       # the training set the way an MLP can.
       convnet.train_mnist_single_machine(
-          data_dir=None, num_epochs=1, use_fake_data=True)
+          data_dir=None, num_epochs=1, use_fake_data=True, device="/cpu:0")
 
   def testTrainMnistMultitower(self):
     with tf.Graph().as_default():
@@ -149,13 +150,15 @@ class ConvNetTest(tf.test.TestCase):
   def testTrainMnistDistributed(self):
     with tf.Graph().as_default():
       # Ensure model training doesn't crash.
-      convnet.train_mnist_distributed(
+      convnet.train_mnist_distributed_sync_replicas(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
           data_dir=None,
           num_epochs=1,
+          op_strategy="chief_worker",
           use_fake_data=True)
 
 
-- 
GitLab


From 3fbec92fa4997fc834807b57b229fa9ada179f6c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 07:45:31 -0700
Subject: [PATCH 0247/1262] Turn Cast into a proper builtin operator.

PiperOrigin-RevId: 191590230
---
 tensorflow/contrib/lite/BUILD                 |  1 +
 tensorflow/contrib/lite/builtin_op_data.h     |  7 ++
 tensorflow/contrib/lite/kernels/cast.cc       |  9 +++
 tensorflow/contrib/lite/model.cc              | 70 ++++++++++++-------
 tensorflow/contrib/lite/schema/schema.fbs     |  2 +
 .../contrib/lite/schema/schema_generated.h    | 38 +++++++++-
 .../contrib/lite/toco/tflite/operator.cc      | 26 ++++---
 .../contrib/lite/toco/tflite/operator_test.cc |  2 +-
 8 files changed, 117 insertions(+), 38 deletions(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index ac269d540a..9c4533079c 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -89,6 +89,7 @@ cc_library(
     hdrs = [
         "builtin_op_data.h",
     ],
+    deps = [":context"],
 )
 
 cc_library(
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 5fc8954743..2b6c24768c 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <stdint.h>
 
+#include "tensorflow/contrib/lite/context.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
@@ -174,6 +176,11 @@ typedef struct {
   int block_size;
 } TfLiteSpaceToDepthParams;
 
+typedef struct {
+  TfLiteType in_data_type;
+  TfLiteType out_data_type;
+} TfLiteCastParams;
+
 typedef enum {
   kTfLiteCombinerTypeSum = 0,
   kTfLiteCombinerTypeMean = 1,
diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/contrib/lite/kernels/cast.cc
index 19942de7bc..17ef2c572e 100644
--- a/tensorflow/contrib/lite/kernels/cast.cc
+++ b/tensorflow/contrib/lite/kernels/cast.cc
@@ -34,6 +34,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): these two checks would make the new implementation
+  // incompatible with some existing models, where params is not specified. It
+  // is OK not to have them because toco would have set input and output types
+  // to match the parameters.
+  // auto* params = reinterpret_cast<TfLiteCastParams*>(node->builtin_data);
+  // TF_LITE_ENSURE_EQ(context, input->type, params->in_data_type);
+  // TF_LITE_ENSURE_EQ(context, output->type, params->out_data_type);
+
   return context->ResizeTensor(context, output,
                                TfLiteIntArrayCopy(input->dims));
 }
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 791d1378f3..606f4a5635 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -32,6 +32,32 @@ namespace tflite {
 
 const char* kEmptyTensorName = "";
 
+TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
+                               ErrorReporter* error_reporter) {
+  switch (tensor_type) {
+    case TensorType_FLOAT32:
+      *type = kTfLiteFloat32;
+      break;
+    case TensorType_INT32:
+      *type = kTfLiteInt32;
+      break;
+    case TensorType_UINT8:
+      *type = kTfLiteUInt8;
+      break;
+    case TensorType_INT64:
+      *type = kTfLiteInt64;
+      break;
+    case TensorType_STRING:
+      *type = kTfLiteString;
+      break;
+    default:
+      error_reporter->Report("Unimplemented data type %s (%d) in tensor\n",
+                             EnumNameTensorType(tensor_type), tensor_type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
 // Loads a model from `filename`. If `mmap_file` is true then use mmap,
 // otherwise make a copy of the model in a buffer.
 std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
@@ -307,10 +333,25 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_EXP:
     case BuiltinOperator_TOPK_V2:
     case BuiltinOperator_LOG_SOFTMAX:
-    case BuiltinOperator_CAST:
     case BuiltinOperator_DEQUANTIZE:
     case BuiltinOperator_PRELU:
       break;
+    case BuiltinOperator_CAST: {
+      TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
+      if (auto* schema_params = op->builtin_options_as_CastOptions()) {
+        auto in_status =
+            ConvertTensorType(schema_params->in_data_type(),
+                              &params->in_data_type, error_reporter);
+        auto out_status =
+            ConvertTensorType(schema_params->out_data_type(),
+                              &params->out_data_type, error_reporter);
+        if (in_status != kTfLiteOk || out_status != kTfLiteOk) {
+          break;
+        }
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_LSH_PROJECTION: {
       TfLiteLSHProjectionParams* params =
           MallocPOD<TfLiteLSHProjectionParams>();
@@ -707,29 +748,10 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
     }
 
     TfLiteType type;
-    switch (tensor->type()) {
-      case TensorType_FLOAT32:
-        type = kTfLiteFloat32;
-        break;
-      case TensorType_INT32:
-        type = kTfLiteInt32;
-        break;
-      case TensorType_UINT8:
-        type = kTfLiteUInt8;
-        break;
-      case TensorType_INT64:
-        type = kTfLiteInt64;
-        break;
-      case TensorType_STRING:
-        type = kTfLiteString;
-        break;
-      default:
-        // tensorType = ArrayType::NONE;
-        error_reporter_->Report("Unimplemented data type %s (%d) in tensor\n",
-                                EnumNameTensorType(tensor->type()),
-                                tensor->type());
-        status = kTfLiteError;
-        continue;
+    if (ConvertTensorType(tensor->type(), &type, error_reporter_) !=
+        kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
     auto get_readonly_data = [&](const char** buffer_data,
                                  size_t* buffer_size) {
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 7d2e00fe32..c63bfb28cc 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -381,6 +381,8 @@ table LogSoftmaxOptions {
 }
 
 table CastOptions {
+  in_data_type: TensorType;
+  out_data_type: TensorType;
 }
 
 table DequantizeOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 66a97a1460..0735be5c8f 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -3702,14 +3702,30 @@ flatbuffers::Offset<LogSoftmaxOptions> CreateLogSoftmaxOptions(flatbuffers::Flat
 
 struct CastOptionsT : public flatbuffers::NativeTable {
   typedef CastOptions TableType;
-  CastOptionsT() {
+  TensorType in_data_type;
+  TensorType out_data_type;
+  CastOptionsT()
+      : in_data_type(TensorType_FLOAT32),
+        out_data_type(TensorType_FLOAT32) {
   }
 };
 
 struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef CastOptionsT NativeTableType;
+  enum {
+    VT_IN_DATA_TYPE = 4,
+    VT_OUT_DATA_TYPE = 6
+  };
+  TensorType in_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IN_DATA_TYPE, 0));
+  }
+  TensorType out_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUT_DATA_TYPE, 0));
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IN_DATA_TYPE) &&
+           VerifyField<int8_t>(verifier, VT_OUT_DATA_TYPE) &&
            verifier.EndTable();
   }
   CastOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -3720,6 +3736,12 @@ struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct CastOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
+  void add_in_data_type(TensorType in_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_IN_DATA_TYPE, static_cast<int8_t>(in_data_type), 0);
+  }
+  void add_out_data_type(TensorType out_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_OUT_DATA_TYPE, static_cast<int8_t>(out_data_type), 0);
+  }
   explicit CastOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -3733,8 +3755,12 @@ struct CastOptionsBuilder {
 };
 
 inline flatbuffers::Offset<CastOptions> CreateCastOptions(
-    flatbuffers::FlatBufferBuilder &_fbb) {
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType in_data_type = TensorType_FLOAT32,
+    TensorType out_data_type = TensorType_FLOAT32) {
   CastOptionsBuilder builder_(_fbb);
+  builder_.add_out_data_type(out_data_type);
+  builder_.add_in_data_type(in_data_type);
   return builder_.Finish();
 }
 
@@ -5727,6 +5753,8 @@ inline CastOptionsT *CastOptions::UnPack(const flatbuffers::resolver_function_t
 inline void CastOptions::UnPackTo(CastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
+  { auto _e = in_data_type(); _o->in_data_type = _e; };
+  { auto _e = out_data_type(); _o->out_data_type = _e; };
 }
 
 inline flatbuffers::Offset<CastOptions> CastOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -5737,8 +5765,12 @@ inline flatbuffers::Offset<CastOptions> CreateCastOptions(flatbuffers::FlatBuffe
   (void)_rehasher;
   (void)_o;
   struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CastOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _in_data_type = _o->in_data_type;
+  auto _out_data_type = _o->out_data_type;
   return tflite::CreateCastOptions(
-      _fbb);
+      _fbb,
+      _in_data_type,
+      _out_data_type);
 }
 
 inline DequantizeOptionsT *DequantizeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 0cb348bda5..f991529569 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -204,17 +204,22 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 };
 
-class Cast : public CustomOperator<CastOperator> {
+class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
+                                    ::tflite::BuiltinOptions_CastOptions> {
  public:
-  using CustomOperator::CustomOperator;
-  void WriteOptions(const TocoOperator& op,
-                    flexbuffers::Builder* fbb) const override {
-    fbb->Int("src_data_type", DataType::Serialize(op.src_data_type));
-    fbb->Int("dst_data_type", DataType::Serialize(op.dst_data_type));
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateCastOptions(*builder,
+                                       DataType::Serialize(op.src_data_type),
+                                       DataType::Serialize(op.dst_data_type));
   }
-  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
-    op->src_data_type = DataType::Deserialize(m["src_data_type"].AsInt64());
-    op->dst_data_type = DataType::Deserialize(m["dst_data_type"].AsInt64());
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->src_data_type = DataType::Deserialize(options.in_data_type());
+    op->dst_data_type = DataType::Deserialize(options.out_data_type());
   }
 };
 
@@ -827,9 +832,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new TopK_V2(::tflite::BuiltinOperator_TOPK_V2, OperatorType::kTopK_V2));
   ops.emplace_back(
       new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
+  ops.emplace_back(
+      new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
 
   // Custom Operators.
-  ops.emplace_back(new Cast("CAST", OperatorType::kCast));
   ops.emplace_back(
       new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
   ops.emplace_back(new FakeQuant("FAKE_QUANT", OperatorType::kFakeQuant));
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index f7a213ecfc..4783843b7f 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -131,7 +131,7 @@ TEST_F(OperatorTest, BuiltinMean) {
   EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
 }
 
-TEST_F(OperatorTest, CustomCast) {
+TEST_F(OperatorTest, BuiltinCast) {
   CastOperator op;
   op.src_data_type = ArrayDataType::kFloat;
   op.dst_data_type = ArrayDataType::kUint8;
-- 
GitLab


From cb8cdf261f932186d34bdd43b86e887eec450213 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 09:31:25 -0700
Subject: [PATCH 0248/1262] Restructuring the HLO partitioner to fit host
 computation and handle kCall. Pre process the input module to reassign
 reserved devices (like the host compute one) to new sequentially increasing
 device numbers, and track those in the GlobalState. This avoids having many
 places where we need to spread the is-special-device logic, within the HLO
 partitioner and its related components. Added handling for kCall, which was
 missing from previous implementation.

PiperOrigin-RevId: 191601831
---
 .../compiler/xla/service/hlo_computation.cc    | 18 +++++++-----------
 .../compiler/xla/service/hlo_instruction.h     |  7 +++++++
 tensorflow/compiler/xla/service/hlo_sharding.h |  4 ++++
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 6f983d0b95..594413e88f 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -304,19 +304,15 @@ void ComputeComputationPostOrder(
     HloComputation* computation,
     tensorflow::gtl::FlatSet<HloComputation*>* visited,
     std::list<HloComputation*>* post_order) {
-  if (visited->count(computation) > 0) {
-    return;
-  }
-
-  for (auto* instruction : computation->instructions()) {
-    for (HloComputation* called_computation :
-         instruction->called_computations()) {
-      ComputeComputationPostOrder(called_computation, visited, post_order);
+  if (visited->insert(computation).second) {
+    for (auto* instruction : computation->instructions()) {
+      for (HloComputation* called_computation :
+           instruction->called_computations()) {
+        ComputeComputationPostOrder(called_computation, visited, post_order);
+      }
     }
+    post_order->push_back(computation);
   }
-
-  visited->insert(computation);
-  post_order->push_back(computation);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a94ba145df..80f8408244 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -928,6 +928,13 @@ class HloInstruction {
   const HloSharding& sharding_or_default(const HloSharding& default_) const {
     return sharding_ ? *sharding_ : default_;
   }
+  // Returns the sharding unique device, if any.
+  tensorflow::gtl::optional<int64> sharding_unique_device() const {
+    if (sharding_ == nullptr || !sharding_->HasUniqueDevice()) {
+      return tensorflow::gtl::optional<int64>();
+    }
+    return sharding_->UniqueDevice().ValueOrDie();
+  }
   // Sets the sharding of this operator. Should only be called by HloModule or
   // HloComputation methods.
   void set_sharding(const HloSharding& sharding) {
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 18d406f370..06204acbca 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -94,6 +94,10 @@ class HloSharding {
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
+  // Checks whether device is a reserved device number. A reserved device number
+  // has usually a special meaning, with dedicated handling logic.
+  static bool IsReservedDevice(int64 device) { return device < 0; }
+
   OpSharding ToProto() const;
   string ToString() const;
 
-- 
GitLab


From cfb9cdc7d90e349942062dfe075bd0b91c6e3352 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 4 Apr 2018 09:50:12 -0700
Subject: [PATCH 0249/1262] Add FullArgSpecs to test.

---
 tensorflow/tools/docs/parser_test.py | 47 ++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index ae86142f41..c6fbd2be0d 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -398,7 +398,6 @@ class ParserTest(googletest.TestCase):
     self.assertIn('<code>test_function', docs)
 
   def test_argspec_for_functools_partial(self):
-
     # pylint: disable=unused-argument
     def test_function_for_partial1(arg1, arg2, kwarg1=1, kwarg2=2):
       pass
@@ -409,45 +408,67 @@ class ParserTest(googletest.TestCase):
 
     # pylint: disable=protected-access
     # Make sure everything works for regular functions.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(1, 2),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
 
     # Make sure doing nothing works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(1, 2),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting args from the front works.
-    expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None,
-                                  (1, 2))
+    expected = tf_inspect.FullArgSpec(args=['arg2', 'kwarg1', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(1, 2),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(args=['kwarg2'],
+                                      varargs=None, varkw=None, defaults=(2,),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, 1, 2, 3)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting kwargs works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(2,),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg1=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1'],
+                                      varargs=None, varkw=None, defaults=(1,),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1'], None, None, ())
+    expected = tf_inspect.FullArgSpec(args=['arg1'],
+                                      varargs=None, varkw=None, defaults=(),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1,
                                 arg2=0, kwarg1=0, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure *args, *kwargs is accounted for.
-    expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
+    expected = tf_inspect.FullArgSpec(args=[],
+                                      varargs='my_args', varkw='my_kwargs',
+                                      defaults=(),
+                                      kwonlyargs=[], kwonlydefaults={},
+                                      annotations={})
     partial = functools.partial(test_function_for_partial2, 0, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
-
+    
     # pylint: enable=protected-access
 
   def testSaveReferenceResolver(self):
-- 
GitLab


From 590f23bb980a729abdf74e19e6a0b9141f492cfa Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 4 Apr 2018 09:51:56 -0700
Subject: [PATCH 0250/1262] Add missing FullArgSpec args for partial

---
 tensorflow/tools/docs/parser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 3ab790bcba..bf1c3eda6b 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -657,12 +657,12 @@ def _get_arg_spec(func):
           argspec_defaults.pop(i-first_default_arg)
         else:
           first_default_arg -= 1
-    # NOTE Some fields from FullArgSpec were removed here.
-    # Add them back if needed in the future.
     return tf_inspect.FullArgSpec(args=argspec_args,
                                   varargs=argspec.varargs,
                                   varkw=argspec.varkw,
-                                  defaults=tuple(argspec_defaults))
+                                  defaults=tuple(argspec_defaults),
+                                  kwonlyargs=[], kwonlydefaults={},
+                                  annotations={})
   else:  # Regular function or method, getargspec will work fine.
     return tf_inspect.getfullargspec(func)
 
-- 
GitLab


From 8badd11d875a826bd318ed439909d5c47a7fb811 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 09:51:55 -0700
Subject: [PATCH 0251/1262] Check arguments of ComputeConvSizes that should be
 positive.

PiperOrigin-RevId: 191604311
---
 .../graph_transformations/propagate_fixed_sizes.cc     | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index b96d698675..68d6f21cf8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -38,6 +38,16 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
   const int input_height = input_shape.dims(1);
   const int batch = input_shape.dims(0);
 
+  CHECK_GE(input_width, 1);
+  CHECK_GE(input_height, 1);
+  CHECK_GE(batch, 1);
+  CHECK_GE(kwidth, 1);
+  CHECK_GE(kheight, 1);
+  CHECK_GE(stride_width, 1);
+  CHECK_GE(stride_height, 1);
+  CHECK_GE(dilation_width_factor, 1);
+  CHECK_GE(dilation_height_factor, 1);
+
   int dilated_kwidth = dilation_width_factor * (kwidth - 1) + 1;
   int dilated_kheight = dilation_height_factor * (kheight - 1) + 1;
 
-- 
GitLab


From f4dcfcaae4e85bbb727eb1f5bfc14f6fa3a055ed Mon Sep 17 00:00:00 2001
From: Tony Wang <tonywy@google.com>
Date: Wed, 4 Apr 2018 10:00:39 -0700
Subject: [PATCH 0252/1262] Automated g4 rollback of changelist 191527251

PiperOrigin-RevId: 191605505
---
 .../xla/legacy_flags/debug_options_flags.cc   |   7 -
 tensorflow/compiler/xla/service/cpu/BUILD     |  25 ----
 .../compiler/xla/service/cpu/cpu_runtime.cc   |   8 --
 .../compiler/xla/service/cpu/cpu_runtime.h    |   4 -
 .../xla/service/cpu/cpu_runtime_test.cc       |  82 +----------
 .../xla/service/cpu/dot_op_emitter.cc         |  23 ++--
 .../xla/service/cpu/runtime_matmul_mkl.cc     | 129 ------------------
 .../xla/service/cpu/runtime_matmul_mkl.h      |  80 -----------
 .../xla/service/cpu/simple_orc_jit.cc         |   5 -
 tensorflow/compiler/xla/xla.proto             |   3 -
 10 files changed, 15 insertions(+), 351 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
 delete mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h

diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index f037663e3f..c8ed3e3a2b 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,9 +40,6 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
-#ifdef INTEL_MKL
-  flags->set_xla_cpu_use_mkl_dnn(true);
-#endif  // INTEL_MKL
 
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
@@ -291,10 +288,6 @@ void AllocateFlags() {
           flag_values->xla_gpu_use_cudnn_batchnorm(),
           "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
           "rather than expanding them to a soup of HLOs."),
-      tensorflow::Flag("xla_cpu_use_mkl_dnn",
-                       bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
-                       flag_values->xla_cpu_use_mkl_dnn(),
-                       "Generate calls to MKL-DNN in the CPU backend."),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index d22c135249..966e2d0fc5 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -18,10 +18,6 @@ load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
-load(
-    "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
-)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -174,7 +170,6 @@ cc_library(
         ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
-        ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
@@ -539,28 +534,10 @@ cc_library(
         ":runtime_matvec",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
 
-cc_library(
-    name = "runtime_matmul_mkl",
-    srcs = ["runtime_matmul_mkl.cc"],
-    hdrs = ["runtime_matmul_mkl.h"],
-    copts = runtime_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
-    ] + if_mkl([
-        "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ]),
-)
-
 cc_library(
     name = "runtime_single_threaded_conv2d",
     srcs = [
@@ -607,12 +584,10 @@ cc_library(
 tf_cc_test(
     name = "cpu_runtime_test",
     srcs = ["cpu_runtime_test.cc"],
-    shard_count = 10,
     tags = ["optonly"],
     deps = [
         ":cpu_runtime",
         ":runtime_matmul",
-        ":runtime_matmul_mkl",
         ":runtime_single_threaded_matmul",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 872b0be1f8..9a3bd68c80 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -37,14 +37,6 @@ extern const char* const kEigenMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenMatMulF32";
 extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
-extern const char* const kMKLMatMulF32SymbolName =
-    "__xla_cpu_runtime_MKLMatMulF32";
-extern const char* const kMKLMatMulF64SymbolName =
-    "__xla_cpu_runtime_MKLMatMulF64";
-extern const char* const kMKLSingleThreadedMatMulF32SymbolName =
-    "__xla_cpu_runtime_MKLSingleThreadedMatMulF32";
-extern const char* const kMKLSingleThreadedMatMulF64SymbolName =
-    "__xla_cpu_runtime_MKLSingleThreadedMatMulF64";
 extern const char* const kEigenConvF16SymbolName =
     "__xla_cpu_runtime_EigenConvF16";
 extern const char* const kEigenConvF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index e392e231b4..e61d6ea28b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -44,10 +44,6 @@ namespace runtime {
 extern const char* const kEigenMatMulF16SymbolName;
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
-extern const char* const kMKLMatMulF32SymbolName;
-extern const char* const kMKLMatMulF64SymbolName;
-extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
-extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index 9e04307295..f385829cdf 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
-#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
@@ -131,19 +130,21 @@ MatMulShape MatMulShapes[] = {
 // * transpose_lhs
 // * transpose_rhs
 // * single_threaded
-using MatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
+using EigenMatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
 
-class EigenMatMulTest : public CpuRuntimeTest,
-                        public ::testing::WithParamInterface<MatMulTestParam> {
+class EigenMatMulTest
+    : public CpuRuntimeTest,
+      public ::testing::WithParamInterface<EigenMatMulTestParam> {
  public:
-  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
+  static string Name(
+      const ::testing::TestParamInfo<EigenMatMulTestParam>& info) {
     MatMulShape shape = std::get<0>(info.param);
     bool transpose_lhs = std::get<1>(info.param);
     bool transpose_rhs = std::get<2>(info.param);
     bool single_threaded = std::get<3>(info.param);
 
     return tensorflow::strings::Printf(
-        "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        "MatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
         transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
         single_threaded ? "single" : "multi");
   }
@@ -168,74 +169,5 @@ INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
                                            ::testing::Bool()),
                         EigenMatMulTest::Name);
 
-#ifdef INTEL_MKL
-class MKLMatMulTest : public CpuRuntimeTest,
-                      public ::testing::WithParamInterface<MatMulTestParam> {
- public:
-  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
-    MatMulShape shape = std::get<0>(info.param);
-    bool transpose_lhs = std::get<1>(info.param);
-    bool transpose_rhs = std::get<2>(info.param);
-    bool single_threaded = std::get<3>(info.param);
-
-    return tensorflow::strings::Printf(
-        "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
-        transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
-        single_threaded ? "single" : "multi");
-  }
-};
-
-std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
-                                                  const Array2D<float>& b,
-                                                  bool transpose_lhs,
-                                                  bool transpose_rhs,
-                                                  bool single_threaded) {
-  CHECK_EQ(a.width(), b.height());
-  int64 m = a.height();
-  int64 n = b.width();
-  int64 k = a.width();
-
-  // The MKL matmul runtime function expects the matrix to be in column major
-  // order and array2d is in row-major order. Create transposes of a and b. The
-  // 'data' buffer in the transposed array is the original array in column major
-  // order.
-  auto a_transpose = MaybeTransposeArray2D(a, !transpose_lhs);
-  auto b_transpose = MaybeTransposeArray2D(b, !transpose_rhs);
-
-  // Since we're going to transpose c before returning it, swap the order of the
-  // dimension sizes to ensure the returned array is properly dimensioned.
-  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
-  if (single_threaded) {
-    __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
-        nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
-        m, n, k, transpose_lhs, transpose_rhs);
-  } else {
-    __xla_cpu_runtime_MKLMatMulF32(nullptr, c_transpose->data(),
-                                   a_transpose->data(), b_transpose->data(), m,
-                                   n, k, transpose_lhs, transpose_rhs);
-  }
-  return MaybeTransposeArray2D(*c_transpose, true);
-}
-
-TEST_P(MKLMatMulTest, DoIt) {
-  MatMulShape shape = std::get<0>(GetParam());
-  bool transpose_lhs = std::get<1>(GetParam());
-  bool transpose_rhs = std::get<2>(GetParam());
-  bool single_threaded = std::get<3>(GetParam());
-
-  auto a = MakeLinspaceArray2D(0.0, 1.0, shape.m, shape.k);
-  auto b = MakeLinspaceArray2D(-2.0, 2.0, shape.k, shape.n);
-  auto c =
-      MKLMatrixMultiply(*a, *b, transpose_lhs, transpose_rhs, single_threaded);
-  CheckMatrixMultiply(*a, *b, *c);
-}
-
-INSTANTIATE_TEST_CASE_P(MKLMatMulTestInstantiaion, MKLMatMulTest,
-                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
-                                           ::testing::Bool(), ::testing::Bool(),
-                                           ::testing::Bool()),
-                        MKLMatMulTest::Name);
-#endif  // INTEL_MKL
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 29afd8ea5f..8b1e20d79e 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -918,35 +918,28 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded =
+  bool multi_threaded_eigen =
       hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
-  bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
     case F16:
-      fn_name = multi_threaded
+      fn_name = multi_threaded_eigen
                     ? runtime::kEigenMatMulF16SymbolName
                     : runtime::kEigenSingleThreadedMatMulF16SymbolName;
       float_type = ir_builder_->getHalfTy();
       break;
     case F32:
-      fn_name = multi_threaded
-                    ? (use_mkl_dnn ? runtime::kMKLMatMulF32SymbolName
-                                   : runtime::kEigenMatMulF32SymbolName)
-                    : (use_mkl_dnn
-                           ? runtime::kMKLSingleThreadedMatMulF32SymbolName
-                           : runtime::kEigenSingleThreadedMatMulF32SymbolName);
+      fn_name = multi_threaded_eigen
+                    ? runtime::kEigenMatMulF32SymbolName
+                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded
-                    ? (use_mkl_dnn ? runtime::kMKLMatMulF64SymbolName
-                                   : runtime::kEigenMatMulF64SymbolName)
-                    : (use_mkl_dnn
-                           ? runtime::kMKLSingleThreadedMatMulF64SymbolName
-                           : runtime::kEigenSingleThreadedMatMulF64SymbolName);
+      fn_name = multi_threaded_eigen
+                    ? runtime::kEigenMatMulF64SymbolName
+                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
deleted file mode 100644
index 729a4e7f5b..0000000000
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
-#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
-#include "third_party/intel_mkl_ml/include/mkl_service.h"
-
-#include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-
-#define EIGEN_USE_THREADS
-#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
-
-using tensorflow::int32;
-using tensorflow::int64;
-
-namespace {
-// BLAS GEMM API for 32-bit Matrix Multiplication.
-
-// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
-// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
-// Matrix lhs, rhs and out are all colum-major.
-void MatMulF32(const void* run_options_ptr, float* out, float* lhs, float* rhs,
-               int64 m, int64 n, int64 k, int32 transpose_lhs,
-               int32 transpose_rhs) {
-  const float alpha = 1.0f, beta = 0.0f;
-  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
-  // respectively. For column-major matrices, the leading dimension is the
-  // stride between consecutive columns (which equals the number of rows). If
-  // the matrix is transposed, the leading dimension is the stride between
-  // consecutive rows (which equals the number of columns).
-  int lda = transpose_lhs ? k : m;
-  int ldb = transpose_rhs ? n : k;
-  int ldc = m;
-  cblas_sgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
-              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
-              lda, rhs, ldb, beta, out, ldc);
-}
-
-// BLAS GEMM API for 64-bit Matrix Multiplication.
-
-// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
-// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
-// Matrix lhs, rhs and out are all colum-major.
-void MatMulF64(const void* run_options_ptr, double* out, double* lhs,
-               double* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
-               int32 transpose_rhs) {
-  const float alpha = 1.0f, beta = 0.0f;
-  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
-  // respectively. For a column-major matrix, the leading dimension is the
-  // stride between consecutive columns (which equals the number of rows). If
-  // the matrix is transposed, the leading dimension is the stride between
-  // consecutive rows (which equals the number of columns).
-  int lda = transpose_lhs ? k : m;
-  int ldb = transpose_rhs ? n : k;
-  int ldc = m;
-  cblas_dgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
-              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
-              lda, rhs, ldb, beta, out, ldc);
-}
-
-}  // namespace
-
-void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out,
-                                    float* lhs, float* rhs, int64 m, int64 n,
-                                    int64 k, int32 transpose_lhs,
-                                    int32 transpose_rhs) {
-  const xla::ExecutableRunOptions* run_options =
-      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
-  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
-  // number specified in intra_op_thread_pool to MKL.
-  int prev_num_threads = mkl_set_num_threads_local(
-      run_options->intra_op_thread_pool()->numThreads());
-  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  // Set thread number back to the previous number.
-  mkl_set_num_threads_local(prev_num_threads);
-}
-// BLAS GEMM API for 64-bit Matrix Multiplication
-void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out,
-                                    double* lhs, double* rhs, int64 m, int64 n,
-                                    int64 k, int32 transpose_lhs,
-                                    int32 transpose_rhs) {
-  const xla::ExecutableRunOptions* run_options =
-      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
-  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
-  // number specified in intra_op_thread_pool to MKL.
-  int prev_num_threads = mkl_set_num_threads_local(
-      run_options->intra_op_thread_pool()->numThreads());
-  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  // Set thread number back to the previous number.
-  mkl_set_num_threads_local(prev_num_threads);
-}
-void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr,
-                                                  float* out, float* lhs,
-                                                  float* rhs, int64 m, int64 n,
-                                                  int64 k, int32 transpose_lhs,
-                                                  int32 transpose_rhs) {
-  // Set the thread number to 1 for single threaded excution.
-  int prev_num_threads = mkl_set_num_threads_local(1);
-  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  // Set thread number back to the previous number.
-  mkl_set_num_threads_local(prev_num_threads);
-}
-void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr,
-                                                  double* out, double* lhs,
-                                                  double* rhs, int64 m, int64 n,
-                                                  int64 k, int32 transpose_lhs,
-                                                  int32 transpose_rhs) {
-  // Set the thread number to 1 for single threaded excution.
-  int prev_num_threads = mkl_set_num_threads_local(1);
-  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
-  // Set thread number back to the previous number.
-  mkl_set_num_threads_local(prev_num_threads);
-}
-#endif  // INTEL_MKL
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
deleted file mode 100644
index 9dbc506c08..0000000000
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-#ifdef INTEL_MKL
-#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
-
-extern void __xla_cpu_runtime_MKLMatMulF32(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
-    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs);
-extern void __xla_cpu_runtime_MKLMatMulF64(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
-    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs);
-extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
-    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs);
-extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
-    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs);
-
-#else
-extern void __xla_cpu_runtime_MKLMatMulF32(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
-    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs) {
-  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
-                "INTEL_MKL. Add --config=mkl to build with MKL.";
-}
-extern void __xla_cpu_runtime_MKLMatMulF64(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
-    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs) {
-  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
-                "INTEL_MKL. Add --config=mkl to build with MKL.";
-}
-extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
-    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs) {
-  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
-                "INTEL_MKL. Add --config=mkl to build with MKL.";
-}
-extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
-    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
-    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
-    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
-    tensorflow::int32 transpose_rhs) {
-  LOG(FATAL) << "Attempt to call MKL MatMul runtime library without defining "
-                "INTEL_MKL. Add --config=mkl to build with MKL.";
-}
-
-#endif  // INTEL_MKL
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index b7ce5bbe47..4198260a22 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
-#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
@@ -184,10 +183,6 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
-  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32);
-  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64);
-  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF32);
-  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index f9943f71d3..5cb18113e5 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -189,9 +189,6 @@ message DebugOptions {
   // directory.
   string xla_dump_per_pass_hlo_proto_to = 96;
 
-  // Generate calls to MKL-DNN in the CPU backend.
-  bool xla_cpu_use_mkl_dnn = 97;
-
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
-- 
GitLab


From 0fef384fb94f83abd9c787b3c8ab7abc1f7ade95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 11:15:56 -0700
Subject: [PATCH 0253/1262] Guard against out-of-bounds dims accesses, even in
 optimized, no-asserts builds.

PiperOrigin-RevId: 191617948
---
 tensorflow/contrib/lite/toco/model.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 64269d369d..9bd72e7de1 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -1507,7 +1507,14 @@ class Shape {
 
   // We still have that one convenience accessor to avoid
   // the awkward double bracket issue:  shape.dims()[i].
-  int dims(int i) const { return dims_[i]; }
+  int dims(int i) const {
+    // Always check for out-of-bounds accesses, even in optimized builds where
+    // standard assertions are disabled. Out-of-bounds access here is a common
+    // occurence.
+    CHECK_GE(i, 0);
+    CHECK_GT(dims_.size(), i);
+    return dims_[i];
+  }
 
   bool operator==(const Shape& comp) const {
     return (this->dims_ == comp.dims());
-- 
GitLab


From 73b0540f81d6a5b88557b69df1bc3da25ca81a1b Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 4 Apr 2018 11:35:05 -0700
Subject: [PATCH 0254/1262] Also registers the gpu resourcegather kernel for
 half (no idea why it was missing)

PiperOrigin-RevId: 191621111
---
 tensorflow/core/kernels/resource_variable_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 5c54609ee6..f49a05c70a 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -605,7 +605,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 #if GOOGLE_CUDA
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GATHER_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 
 #endif  // GOOGLE_CUDA
 
-- 
GitLab


From 5c93045d14c83caaa7a590b3eb6b7e24fb892e31 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 4 Apr 2018 11:48:00 -0700
Subject: [PATCH 0255/1262] make default `kwonlydefaults=None` for py3

---
 tensorflow/python/util/tf_inspect.py |  2 +-
 tensorflow/tools/docs/parser.py      |  2 +-
 tensorflow/tools/docs/parser_test.py | 16 ++++++++--------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 5e74a97cd7..2c48c52caf 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -79,7 +79,7 @@ def getfullargspec(obj):  # pylint: disable=redefined-builtin
           varkw=argspecs.keywords,
           defaults=argspecs.defaults,
           kwonlyargs=[],
-          kwonlydefaults={},
+          kwonlydefaults=None,
           annotations={})
       return fullargspecs
   else:
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index bf1c3eda6b..cec23b1a36 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -661,7 +661,7 @@ def _get_arg_spec(func):
                                   varargs=argspec.varargs,
                                   varkw=argspec.varkw,
                                   defaults=tuple(argspec_defaults),
-                                  kwonlyargs=[], kwonlydefaults={},
+                                  kwonlyargs=[], kwonlydefaults=None,
                                   annotations={})
   else:  # Regular function or method, getargspec will work fine.
     return tf_inspect.getfullargspec(func)
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index c6fbd2be0d..d7757d78ed 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -410,14 +410,14 @@ class ParserTest(googletest.TestCase):
     # Make sure everything works for regular functions.
     expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
                                       varargs=None, varkw=None, defaults=(1, 2),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
 
     # Make sure doing nothing works.
     expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
                                       varargs=None, varkw=None, defaults=(1, 2),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
@@ -425,14 +425,14 @@ class ParserTest(googletest.TestCase):
     # Make sure setting args from the front works.
     expected = tf_inspect.FullArgSpec(args=['arg2', 'kwarg1', 'kwarg2'],
                                       varargs=None, varkw=None, defaults=(1, 2),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial1, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     expected = tf_inspect.FullArgSpec(args=['kwarg2'],
                                       varargs=None, varkw=None, defaults=(2,),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial1, 1, 2, 3)
     self.assertEqual(expected, parser._get_arg_spec(partial))
@@ -440,21 +440,21 @@ class ParserTest(googletest.TestCase):
     # Make sure setting kwargs works.
     expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg2'],
                                       varargs=None, varkw=None, defaults=(2,),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg1=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1'],
                                       varargs=None, varkw=None, defaults=(1,),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     expected = tf_inspect.FullArgSpec(args=['arg1'],
                                       varargs=None, varkw=None, defaults=(),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial1,
                                 arg2=0, kwarg1=0, kwarg2=0)
@@ -464,7 +464,7 @@ class ParserTest(googletest.TestCase):
     expected = tf_inspect.FullArgSpec(args=[],
                                       varargs='my_args', varkw='my_kwargs',
                                       defaults=(),
-                                      kwonlyargs=[], kwonlydefaults={},
+                                      kwonlyargs=[], kwonlydefaults=None,
                                       annotations={})
     partial = functools.partial(test_function_for_partial2, 0, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
-- 
GitLab


From bd540c8c45bf66a9c14af38a272840b47731b91a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 12:00:38 -0700
Subject: [PATCH 0256/1262] Warning when tweaking a minmax to contain 0.

PiperOrigin-RevId: 191625723
---
 .../toco/graph_transformations/read_fake_quant_min_max.cc  | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
index 11f8d4b6ee..bdcca5b7ca 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -72,6 +72,13 @@ bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
     minmax.min = min_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     // We always want [min, max] to contain 0.
+    if (minmax.min > 0 || minmax.max < 0) {
+      LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
+    }
     minmax.min = std::min(minmax.min, 0.);
     minmax.max = std::max(minmax.max, 0.);
 
-- 
GitLab


From 13e103b8f0dcc89673dd0d3d589b976c05c37a09 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 12:08:49 -0700
Subject: [PATCH 0257/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191627087
---
 tensorflow/core/lib/strings/numbers.cc        |  5 ++--
 .../core/lib/strings/ordered_code_test.cc     |  3 +-
 tensorflow/core/lib/strings/scanner.h         |  5 ++--
 tensorflow/core/platform/cloud/BUILD          |  2 ++
 .../core/platform/cloud/gcs_file_system.cc    |  9 +++---
 .../platform/cloud/gcs_file_system_test.cc    | 17 ++++++-----
 .../cloud/retrying_file_system_test.cc        | 29 ++++++++++---------
 .../platform/cloud/retrying_utils_test.cc     |  8 ++---
 tensorflow/core/util/command_line_flags.cc    | 20 +++++++++----
 .../core/util/device_name_utils_test.cc       |  2 +-
 tensorflow/core/util/equal_graph_def.cc       |  5 ++--
 tensorflow/core/util/memmapped_file_system.cc |  3 +-
 tensorflow/core/util/reporter_test.cc         |  2 +-
 .../core/util/tensor_slice_reader_test.cc     |  3 +-
 .../core/util/tensor_slice_writer_test.cc     |  9 +++---
 .../tools/graph_transforms/backports_test.cc  |  3 +-
 .../graph_transforms/fold_constants_test.cc   |  5 ++--
 .../freeze_requantization_ranges.cc           |  9 +++---
 .../tools/graph_transforms/insert_logging.cc  |  3 +-
 .../tools/graph_transforms/sparsify_gather.cc |  7 +++--
 .../graph_transforms/transform_graph_test.cc  |  8 ++---
 .../tools/graph_transforms/transform_utils.cc |  7 ++---
 22 files changed, 94 insertions(+), 70 deletions(-)

diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 516decc3c0..8f34baa7de 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -203,7 +204,7 @@ bool safe_strto64(StringPiece str, int64* value) {
 
   int64 vlimit = kint64max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different limit for positive and negative integers.
     vlimit = kint64min;
@@ -265,7 +266,7 @@ bool safe_strto32(StringPiece str, int32* value) {
 
   int64 vmax = kint32max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different max for positive and negative integers.
     ++vmax;
diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc
index fee8a6f93e..ede9f4d390 100644
--- a/tensorflow/core/lib/strings/ordered_code_test.cc
+++ b/tensorflow/core/lib/strings/ordered_code_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -128,7 +129,7 @@ void TestWriteAppends(T first, U second) {
   string encoded_first_only = encoded;
   OCWriteToString<U>(&encoded, second);
   EXPECT_NE(encoded, encoded_first_only);
-  EXPECT_TRUE(StringPiece(encoded).starts_with(encoded_first_only));
+  EXPECT_TRUE(str_util::StartsWith(encoded, encoded_first_only));
 }
 
 template <typename T>
diff --git a/tensorflow/core/lib/strings/scanner.h b/tensorflow/core/lib/strings/scanner.h
index d3b63357ee..c82e771368 100644
--- a/tensorflow/core/lib/strings/scanner.h
+++ b/tensorflow/core/lib/strings/scanner.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <string>
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
@@ -75,14 +76,14 @@ class Scanner {
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, this is a no-op.
   Scanner& ZeroOrOneLiteral(StringPiece s) {
-    cur_.Consume(s);
+    str_util::ConsumePrefix(&cur_, s);
     return *this;
   }
 
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, then GetResult will ultimately return false.
   Scanner& OneLiteral(StringPiece s) {
-    if (!cur_.Consume(s)) {
+    if (!str_util::ConsumePrefix(&cur_, s)) {
       error_ = true;
     }
     return *this;
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 3ee7be3c4e..be84316c48 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -85,6 +85,7 @@ cc_library(
         ":retrying_utils",
         ":time_util",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@jsoncpp_git//:jsoncpp",
     ],
@@ -263,6 +264,7 @@ tf_cc_test(
     deps = [
         ":gcs_file_system",
         ":http_request_fake",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 1691826483..3c0dc13d75 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -172,7 +172,7 @@ Status ParseGcsPath(StringPiece fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("GCS path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("GCS path doesn't contain an object name: ",
@@ -535,7 +535,8 @@ class GcsWritableFile : public WritableFile {
       *uploaded = 0;
     } else {
       StringPiece range_piece(received_range);
-      range_piece.Consume("bytes=");  // May or may not be present.
+      str_util::ConsumePrefix(&range_piece,
+                              "bytes=");  // May or may not be present.
       std::vector<int64> range_parts;
       if (!str_util::SplitAndParseAsInts(range_piece, '-', &range_parts) ||
           range_parts.size() != 2) {
@@ -1172,7 +1173,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         // 'object_prefix', which is part of 'dirname', should be removed from
         // the beginning of 'name'.
         StringPiece relative_path(name);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(strings::StrCat(
               "Unexpected response: the returned file name ", name,
               " doesn't match the prefix ", object_prefix));
@@ -1201,7 +1202,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         }
         const string& prefix_str = prefix.asString();
         StringPiece relative_path(prefix_str);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(
               "Unexpected response: the returned folder name ", prefix_str,
               " doesn't match the prefix ", object_prefix);
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 8516421614..2fbde9b6a7 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -584,8 +585,9 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
   TF_EXPECT_OK(file->Append("content2"));
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts failed. The last failure: "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "All 10 retry attempts failed. The last failure: "
                             "Unavailable: important HTTP error 503"))
       << status;
 }
@@ -641,13 +643,12 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::UNAVAILABLE, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains(
-              "Upload to gs://bucket/path/writeable.txt failed, caused by: "
-              "Not found: important HTTP error 410"))
+      str_util::StrContains(status.error_message(),
+                            "Upload to gs://bucket/path/writeable.txt failed, "
+                            "caused by: Not found: important HTTP error 410"))
       << status;
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("when uploading gs://bucket/path/writeable.txt"))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "when uploading gs://bucket/path/writeable.txt"))
       << status;
 }
 
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index d3f763bb3c..ee6886fef7 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -245,7 +246,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
   char scratch[10];
   const auto& status = random_access_file->Read(0, 10, &result, scratch);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -399,7 +400,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
   // Use it and check the results.
   const auto& status = writable_file->Sync();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -428,7 +429,7 @@ TEST(RetryingFileSystemTest, NewReadOnlyMemoryRegionFromFile_AllRetriesFailed) {
   const auto& status =
       fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -454,7 +455,7 @@ TEST(RetryingFileSystemTest, GetChildren_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.GetChildren("gs://path", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -481,7 +482,7 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.GetMatchingPaths("gs://path/dir", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -506,7 +507,7 @@ TEST(RetryingFileSystemTest, DeleteFile_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.DeleteFile("gs://path/file.txt");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -531,7 +532,7 @@ TEST(RetryingFileSystemTest, CreateDir_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.CreateDir("gs://path/newdir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -556,7 +557,7 @@ TEST(RetryingFileSystemTest, DeleteDir_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.DeleteDir("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -582,7 +583,7 @@ TEST(RetryingFileSystemTest, GetFileSize_AllRetriesFailed) {
   uint64 size;
   const auto& status = fs.GetFileSize("gs://path/file.txt", &size);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -605,7 +606,7 @@ TEST(RetryingFileSystemTest, RenameFile_AllRetriesFailed) {
 
   const auto& status = fs.RenameFile("old_name", "new_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -630,7 +631,7 @@ TEST(RetryingFileSystemTest, Stat_AllRetriesFailed) {
   FileStatistics stat;
   const auto& status = fs.Stat("file_name", &stat);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -642,7 +643,7 @@ TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
 
   const auto& status = fs.FileExists("file_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -677,7 +678,7 @@ TEST(RetryingFileSystemTest, IsDirectory_AllRetriesFailed) {
 
   const auto& status = fs.IsDirectory("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -706,7 +707,7 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
   const auto& status =
       fs.DeleteRecursively("gs://path/dir", &undeleted_files, &undeleted_dirs);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
diff --git a/tensorflow/core/platform/cloud/retrying_utils_test.cc b/tensorflow/core/platform/cloud/retrying_utils_test.cc
index 6eb340e094..1b6527618a 100644
--- a/tensorflow/core/platform/cloud/retrying_utils_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_utils.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -31,10 +32,9 @@ TEST(RetryingUtilsTest, CallWithRetries_RetryDelays) {
 
   const auto& status = RetryingUtils::CallWithRetries(f, 500000L, sleep);
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts "
-                            "failed. The last failure: "
-                            "Unavailable: Failed."))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "All 10 retry attempts failed. The last failure: Unavailable: Failed."))
       << status;
 
   EXPECT_EQ(10, requested_delays.size());
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 3efc703faf..480ce94fca 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -28,7 +29,9 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                      const std::function<bool(string)>& hook,
                      bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     *value_parsing_ok = hook(arg.ToString());
     return true;
   }
@@ -40,7 +43,9 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int32)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int32 parsed_int32;
     if (sscanf(arg.data(), "%d%c", &parsed_int32, &extra) != 1) {
@@ -60,7 +65,9 @@ bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int64)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int64 parsed_int64;
     if (sscanf(arg.data(), "%lld%c", &parsed_int64, &extra) != 1) {
@@ -80,7 +87,8 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    const std::function<bool(bool)>& hook,
                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag)) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *value_parsing_ok = hook(true);
       return true;
@@ -107,7 +115,9 @@ bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(float)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     float parsed_float;
     if (sscanf(arg.data(), "%f%c", &parsed_float, &extra) != 1) {
diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc
index c1bc0f3378..ff9c108f10 100644
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@@ -408,7 +408,7 @@ static void MergeDevNamesError(const string& name_a, const string& name_b,
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   Status s = DeviceNameUtils::MergeDevNames(&target_a, Name(name_b));
   EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(s.error_message()).contains(expected_error_substr))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), expected_error_substr))
       << s;
 }
 
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index f1ec497a67..b87dce0dff 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -144,7 +145,7 @@ bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff,
 
   int first_control_input = actual.input_size();
   for (int i = 0; i < actual.input_size(); ++i) {
-    if (StringPiece(actual.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(actual.input(i), "^")) {
       first_control_input = i;
       break;
     }
@@ -240,7 +241,7 @@ uint64 NodeDefHash(const NodeDef& ndef, const EqualGraphDefOptions& options) {
   // Normal inputs. Order important.
   int first_control_input = ndef.input_size();
   for (int i = 0; i < ndef.input_size(); ++i) {
-    if (StringPiece(ndef.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(ndef.input(i), "^")) {
       first_control_input = i;
       break;
     }
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index ea0a381f4f..1fa6b8bec0 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/util/memmapped_file_system.h"
 
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/memmapped_file_system.pb.h"
 
@@ -242,7 +243,7 @@ Status MemmappedFileSystem::InitializeFromFile(Env* env,
 }
 
 bool MemmappedFileSystem::IsMemmappedPackageFilename(const string& filename) {
-  return StringPiece(filename).starts_with(kMemmappedPackagePrefix);
+  return str_util::StartsWith(filename, kMemmappedPackagePrefix);
 }
 
 namespace {
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 575c27d4ef..90ea09876e 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,7 +29,7 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
diff --git a/tensorflow/core/util/tensor_slice_reader_test.cc b/tensorflow/core/util/tensor_slice_reader_test.cc
index 010cc36823..3c9590e488 100644
--- a/tensorflow/core/util/tensor_slice_reader_test.cc
+++ b/tensorflow/core/util/tensor_slice_reader_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -422,7 +423,7 @@ static void VersionTest(const VersionDef& versions, const string& error) {
   // Read it back in and verify that we get the expected error
   TensorSliceReader reader(path, OpenTableTensorSliceReader);
   EXPECT_TRUE(reader.status().code() == error::INVALID_ARGUMENT &&
-              StringPiece(reader.status().error_message()).starts_with(error))
+              str_util::StartsWith(reader.status().error_message(), error))
       << "Expected error starting with '" << errors::InvalidArgument(error)
       << "', got '" << reader.status() << "'";
 }
diff --git a/tensorflow/core/util/tensor_slice_writer_test.cc b/tensorflow/core/util/tensor_slice_writer_test.cc
index ff5bfd65ae..31397f11b6 100644
--- a/tensorflow/core/util/tensor_slice_writer_test.cc
+++ b/tensorflow/core/util/tensor_slice_writer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -333,8 +334,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<int8> data(300000000, -1);
     Status s = writer.Add("test1", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 
   // Add a large string tensor slice, which will fail.
@@ -344,8 +345,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<string> data(256 * 1024, std::string(8192, 'f'));
     Status s = writer.Add("test2", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 }
 
diff --git a/tensorflow/tools/graph_transforms/backports_test.cc b/tensorflow/tools/graph_transforms/backports_test.cc
index ab9a61afa7..80a954e062 100644
--- a/tensorflow/tools/graph_transforms/backports_test.cc
+++ b/tensorflow/tools/graph_transforms/backports_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -191,7 +192,7 @@ TEST(BackportTensorArrayV3Test, TestBackportTensorArrayV3Subtypes) {
     std::map<string, const NodeDef*> node_lookup;
     MapNamesToNodes(result, &node_lookup);
     ASSERT_EQ(1, node_lookup.count("v3_node"));
-    EXPECT_TRUE(StringPiece(node_lookup.at("v3_node")->op()).ends_with("V2"));
+    EXPECT_TRUE(str_util::EndsWith(node_lookup.at("v3_node")->op(), "V2"));
   }
 }
 
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index 6bfdfe43f5..a082399a87 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -209,10 +210,10 @@ class ConstantFoldingTest : public ::testing::Test {
     for (const NodeDef& node : graph_def.node()) {
       const StringPiece name(node.name());
       const int occurrence_count = folded_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(name, "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(name, "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
diff --git a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
index 2436c7e4a2..f401723808 100644
--- a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
+++ b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
@@ -40,8 +40,8 @@ Status ExtractMinMaxRecords(const string& log_file_name,
   for (const string& file_line : file_lines) {
     // We expect to find a line with components separated by semicolons, so to
     // start make sure that the basic structure is in place/
-    StringPiece line(file_line);
-    if (!line.contains(print_suffix + ";" + requant_prefix)) {
+    if (!str_util::StrContains(file_line,
+                               print_suffix + ";" + requant_prefix)) {
       continue;
     }
     std::vector<string> line_parts = str_util::Split(file_line, ';');
@@ -53,8 +53,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
     bool min_max_found = false;
     int min_max_index;
     for (int i = 1; i < line_parts.size(); ++i) {
-      StringPiece line_part(line_parts[i]);
-      if (line_part.starts_with(requant_prefix)) {
+      if (str_util::StartsWith(line_parts[i], requant_prefix)) {
         min_max_found = true;
         min_max_index = i;
       }
@@ -90,7 +89,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
       continue;
     }
     StringPiece name_string = line_parts[min_max_index - 1];
-    if (!name_string.ends_with(print_suffix)) {
+    if (!str_util::EndsWith(name_string, print_suffix)) {
       continue;
     }
     string name =
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index e1ee2b420b..377665448c 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -101,7 +102,7 @@ Status InsertLogging(const GraphDef& input_graph_def,
     const bool op_matches = (ops.count(node.op()) > 0);
     bool prefix_matches = false;
     for (const string& prefix : prefixes) {
-      if (StringPiece(node.name()).starts_with(prefix)) {
+      if (str_util::StartsWith(node.name(), prefix)) {
         prefix_matches = true;
       }
     }
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 701e350fc3..cc82100148 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -88,7 +89,7 @@ void CreateConstNode(const Tensor& tensor, const string& name,
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
   std::vector<string> names = Split(tensor_slice_name, "/");
-  if (StringPiece(names[names.size() - 1]).starts_with("part_")) {
+  if (str_util::StartsWith(names[names.size() - 1], "part_")) {
     CHECK_GE(names.size(), 2);
     names.pop_back();
   }
@@ -102,8 +103,8 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
   for (const auto& node : input_graph_def.node()) {
     std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
-        StringPiece(node_name_parts[0]).starts_with("save") &&
-        StringPiece(node_name_parts[1]).starts_with("Assign") &&
+        str_util::StartsWith(node_name_parts[0], "save") &&
+        str_util::StartsWith(node_name_parts[1], "Assign") &&
         node.input(0) == target_name) {
       restore_node_name = node.input(1);
       break;
diff --git a/tensorflow/tools/graph_transforms/transform_graph_test.cc b/tensorflow/tools/graph_transforms/transform_graph_test.cc
index bc2412fcbd..b276229aa4 100644
--- a/tensorflow/tools/graph_transforms/transform_graph_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -112,12 +113,11 @@ class TransformGraphTest : public ::testing::Test {
     graph_transforms::MapNamesToNodes(out_graph_def, &out_node_map);
 
     for (const NodeDef& node : out_graph_def.node()) {
-      const StringPiece name(node.name());
       const int occurrence_count = out_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(node.name(), "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(node.name(), "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -139,7 +139,7 @@ class TransformGraphTest : public ::testing::Test {
     Status no_such_status =
         TransformGraph({}, {}, {{"test_no_such_transform", {}}}, &graph_def);
     EXPECT_TRUE(
-        StringPiece(no_such_status.ToString()).contains("not recognized"));
+        str_util::StrContains(no_such_status.ToString(), "not recognized"));
   }
 
   void TestParseTransformParameters() {
diff --git a/tensorflow/tools/graph_transforms/transform_utils.cc b/tensorflow/tools/graph_transforms/transform_utils.cc
index 55f28a9e1d..367048965d 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.cc
+++ b/tensorflow/tools/graph_transforms/transform_utils.cc
@@ -88,7 +88,7 @@ void NodeNamePartsFromInput(const string& input_name, string* prefix,
     *suffix = ":" + input_parts[1];
   }
   StringPiece node_name_piece(input_parts[0]);
-  if (node_name_piece.Consume("^")) {
+  if (str_util::ConsumePrefix(&node_name_piece, "^")) {
     *prefix = "^";
   } else {
     *prefix = "";
@@ -200,8 +200,7 @@ Status SortByExecutionOrder(const GraphDef& input_graph_def,
       // for merge only wait for one non-control input.
       int32 num_control_edges = 0;
       for (int i = 0; i < node_def.input_size(); ++i) {
-        StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(node_def.input(i), "^")) {
           num_control_edges++;
         }
       }
@@ -504,7 +503,7 @@ Status RenameNodeInputs(const GraphDef& input_graph_def,
           const string& dest_name = input_to_rename.second;
           bool is_match;
           string match_name;
-          if (StringPiece(source_name).ends_with(":*")) {
+          if (str_util::EndsWith(source_name, ":*")) {
             is_match = true;
             string prefix;
             string unused_node_name;
-- 
GitLab


From 8539edf3d9d979b2238693a952cdc382e031bdb1 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 4 Apr 2018 13:00:32 -0700
Subject: [PATCH 0258/1262] fix lint

---
 tensorflow/python/util/tf_inspect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 2c48c52caf..286028b8bb 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -31,7 +31,7 @@ if six.PY3:
 else:
   FullArgSpec = namedtuple(
       'FullArgSpec', ['args', 'varargs', 'varkw', 'defaults',
-      'kwonlyargs', 'kwonlydefaults', 'annotations'])
+                      'kwonlyargs', 'kwonlydefaults', 'annotations'])
 
 
 def currentframe():
-- 
GitLab


From 2a82ae63091ba39f84f0dac6d0333e2e0600f067 Mon Sep 17 00:00:00 2001
From: Adria Puigdomenech <adriap@google.com>
Date: Wed, 4 Apr 2018 13:02:27 -0700
Subject: [PATCH 0259/1262] Expose scatter_add for resource variables.

PiperOrigin-RevId: 191634030
---
 .../python_api/api_def_ScatterAdd.pbtxt       |  4 ++
 .../resource_variable_ops_test.py             |  6 +++
 tensorflow/python/ops/state_ops.py            | 52 +++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt

diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000..4f5b6decf6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index c31d5a1f91..edc63264a3 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -802,6 +802,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_update(v, [1], [3.0])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  def testScatterAddStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="add")
+      state_ops.scatter_add(v, [1], [3])
+      self.assertAllEqual([1.0, 5.0], v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 01fc3182bc..f6a11ca625 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -423,3 +423,55 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
       ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
       use_locking, name)]):
     return ref.read_value()
+
+
+@tf_export("scatter_add")
+def scatter_add(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Adds sparse updates to the variable referenced by `resource`.
+
+  This operation computes
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] += updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] += updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the updated value.
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions add.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+  </div>
+
+  Args:
+    ref: A `Variable`.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to store in `ref`.
+    use_locking: An optional `bool`. Defaults to `True`.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    Same as `ref`.  Returned as a convenience for operations that want
+    to use the updated values after the update is done.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_add(ref, indices, updates,
+                                     use_locking=use_locking, name=name)
+  return ref._lazy_read(gen_resource_variable_ops.resource_scatter_add(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
-- 
GitLab


From ab4ba5e4ba85aa6cd0e5b3e430e80eef39174ffa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 13:24:06 -0700
Subject: [PATCH 0260/1262] Fix up output array min/max post-quantization if
 the range was overridden.

PiperOrigin-RevId: 191637143
---
 .../graph_transformations/hardcode_min_max.cc |  7 ++-
 .../toco/graph_transformations/quantize.cc    | 49 ++++++++++++++++++-
 2 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 7c97ef0d31..23c9e3246b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -223,8 +223,11 @@ bool PropagateMinMaxAmongArrays(Model* model,
     if (array.minmax) {
       CHECK(*array.minmax == *reference_minmax)
           << "Both the following arrays have minmax, and they disagree: "
-          << reference_array_name << " and " << array_name
-          << ". Expected that either only one of them would have minmax, or at "
+          << reference_array_name << " (" << reference_minmax->min << ","
+          << reference_minmax->max << ") and " << array_name << " ("
+          << array.minmax->min << "," << array.minmax->max
+          << "). Expected that either only one of them would have minmax, or "
+             "at "
              "least that they would agree.";
     } else {
       array.GetOrCreateMinMax() = *reference_minmax;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 9fcc95e1fe..7784558b22 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -472,6 +472,44 @@ bool ChooseQuantizationForOperatorOutput(
 
   return true;
 }
+
+// Fixes array minmax info to match the quantization parameters.
+// This is required for when quantization parameters change for an array during
+// quantization (such as ChooseQuantizationForOperatorOutput).
+void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
+                               const QuantizationParams& quantization_params,
+                               MinMax* minmax) {
+  double qmin, qmax;
+  switch (quantized_data_type) {
+    case ArrayDataType::kUint8:
+      qmin = 0;
+      qmax = 255;
+      break;
+    case ArrayDataType::kInt16:
+      qmin = -32768;
+      qmax = 32767;
+      break;
+    default:
+      // No update required.
+      return;
+  }
+
+  // Compute new minmax values.
+  double min =
+      (qmin - quantization_params.zero_point) * quantization_params.scale;
+  double max =
+      (qmax - quantization_params.zero_point) * quantization_params.scale;
+
+  // If we are close to the existing minmax values don't bother changing them.
+  // This prevents propagating small floating point precision errors.
+  constexpr double kMinMaxThreshold = 1e-5;
+  const double width = max - min;
+  if (std::abs(min - minmax->min) > kMinMaxThreshold * width ||
+      std::abs(max - minmax->max) > kMinMaxThreshold * width) {
+    minmax->min = min;
+    minmax->max = max;
+  }
+}
 }  // namespace
 
 bool Quantize::Run(Model* model, std::size_t op_index) {
@@ -618,12 +656,19 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
                                             &quantization_params)) {
       changed = true;
       const auto& output = op.outputs[output_index];
+      auto& output_array = model->GetArray(output);
+
+      // Fix up the min/max information on the output array to match the chosen
+      // quantization parameters.
+      auto& output_minmax = output_array.GetMinMax();
+      FixMinMaxPostQuantization(quantized_data_type, quantization_params,
+                                &output_minmax);
+
       QuantizeArray(this, model, output, quantized_data_type,
                     quantization_params);
+
       const auto& dequantized_output =
           AvailableArrayName(*model, output + "_dequantized");
-      const auto& output_array = model->GetArray(output);
-      const auto& output_minmax = output_array.GetMinMax();
       auto& dequantized_output_array =
           model->GetOrCreateArray(dequantized_output);
       dequantized_output_array.data_type = ArrayDataType::kFloat;
-- 
GitLab


From 712fc6252228748a72bbc015be55bd20ba811cbb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 13:38:20 -0700
Subject: [PATCH 0261/1262] Permit use of ArraysExtraInfo/constant_float_value
 when target model is quantized.

PiperOrigin-RevId: 191639289
---
 tensorflow/contrib/lite/toco/tooling_util.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 668cf51619..56fa8f4b69 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -2027,11 +2027,12 @@ void UseArraysExtraInfo(Model* model) {
     }
     if (entry.has_constant_float_value()) {
       CHECK(array.has_shape());
-      CHECK(array.data_type == ArrayDataType::kFloat);
-      auto& data = array.GetMutableBuffer<ArrayDataType::kFloat>().data;
-      data.resize(RequiredBufferSizeForShape(array.shape()));
-      for (float& f : data) {
-        f = entry.constant_float_value();
+      if (array.data_type == ArrayDataType::kFloat) {
+        auto& data = array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+        data.resize(RequiredBufferSizeForShape(array.shape()));
+        for (float& f : data) {
+          f = entry.constant_float_value();
+        }
       }
     }
   }
-- 
GitLab


From 86d06ff37f9f1cb2bb5cd0c2d594286bbf023491 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 14:28:22 -0700
Subject: [PATCH 0262/1262] run evaluate nodes on parts of arithmetic optimizer
 tests.

PiperOrigin-RevId: 191647386
---
 tensorflow/core/grappler/optimizers/BUILD     |   5 +
 .../optimizers/arithmetic_optimizer_test.cc   | 143 ++++++++++++++++--
 .../optimizers/constant_folding_test.cc       |  18 +--
 .../core/grappler/utils/grappler_test.h       |   9 ++
 4 files changed, 146 insertions(+), 29 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 4ce3e73911..0c6549d940 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -274,6 +274,11 @@ tf_cuda_cc_test(
         ":constant_folding",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index ef3ed35fa6..48f1dd5aa1 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -156,7 +156,7 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
 
-  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {});
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
@@ -164,7 +164,6 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   OptimizeTwice(&optimizer, &item, &output);
   NodeMap node_map(&output);
   EXPECT_EQ(2, output.node_size());
-
   const NodeDef* new_c1 = node_map.GetNode("c1");
   ASSERT_NE(new_c1, nullptr);
 
@@ -174,7 +173,7 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   EXPECT_EQ("c1", new_div->input(0));
   EXPECT_EQ("c1", new_div->input(1));
 
-  auto tensors = EvaluateNodes(output, item.fetch, {});
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
@@ -193,6 +192,11 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
+  Tensor bool_t(DT_BOOL, TensorShape({}));
+  bool_t.scalar<bool>().setConstant(true);
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -208,6 +212,10 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
   EXPECT_EQ("check1", new_div->input(1));
   EXPECT_EQ("^assert1", new_div->input(2));
   EXPECT_EQ("^assert1", new_div->input(3));
+
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
@@ -219,7 +227,9 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch = {"div"};
+  item.fetch = {"div1"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -241,6 +251,10 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   EXPECT_EQ(2, new_div1->input_size());
   EXPECT_EQ("mul1", new_div1->input(0));
   EXPECT_EQ("mul1", new_div1->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, MulToSquare) {
@@ -251,6 +265,9 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   Output id = ops::Identity(s.WithOpName("id"), mul);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -265,6 +282,10 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   EXPECT_EQ(2, output.node(4).input_size());
   EXPECT_EQ("c", output.node(4).input(0));
   EXPECT_EQ("^d", output.node(4).input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
@@ -277,6 +298,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   Output id = ops::Identity(s.WithOpName("id"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -287,6 +311,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   EXPECT_EQ("c", output.node(1).input(0));
   EXPECT_EQ("c", output.node(3).input(0));
   EXPECT_EQ("c", output.node(5).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
@@ -299,6 +327,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -312,6 +343,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   EXPECT_EQ(6, output.node_size());
   EXPECT_EQ("squeeze", output.node(5).input(0));
   EXPECT_EQ("c", output.node(2).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
@@ -326,6 +361,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -343,6 +382,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
       EXPECT_EQ(original.input(j), optimized.input(j));
     }
   }
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
@@ -354,6 +397,10 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   OptimizeTwice(&optimizer, &item, &output);
@@ -375,6 +422,10 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
   const NodeDef* new_id = node_map.GetNode("id");
   ASSERT_NE(new_id, nullptr);
   EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
@@ -387,6 +438,10 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   OptimizeTwice(&optimizer, &item, &output);
@@ -409,6 +464,10 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
   const NodeDef* new_id = node_map.GetNode("id");
   ASSERT_NE(new_id, nullptr);
   EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
@@ -424,6 +483,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   const std::vector<string> devices{
       "/device:CPU:0", "/device:GPU:0", "/device:CPU:0", "/device:GPU:1",
       "/device:CPU:0", "/device:CPU:0", "/device:CPU:0",
@@ -515,7 +575,8 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
       GrapplerItem item;
       item.fetch = {"id"};
       TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+      auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+      EXPECT_EQ(1, tensors_expected.size());
       ArithmeticOptimizer optimizer;
       EnableOnlyHoistCommonFactor(&optimizer);
 
@@ -554,21 +615,26 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
         EXPECT_EQ("id", id_node->name());
         EXPECT_EQ(HoistMulName("add"), id_node->input(0));
       }
+      auto tensors = EvaluateNodes(output, item.fetch);
+      EXPECT_EQ(1, tensors.size());
+      test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
     }
   }
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
   Output transp = ops::Transpose(s.WithOpName("trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  std::vector<string> fetch = {"trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
   ArithmeticOptimizer optimizer;
   GraphDef output;
   OptimizeTwice(&optimizer, &item, &output);
@@ -582,12 +648,16 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   EXPECT_EQ("ConjugateTranspose", trans_fused_node->op());
   EXPECT_EQ("z", trans_fused_node->input(0));
   EXPECT_EQ("perm", trans_fused_node->input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
@@ -595,6 +665,9 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
       ops::ConjugateTranspose(s.WithOpName("conjugate_trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conjugate_trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -608,18 +681,24 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   EXPECT_EQ("Transpose", conjugate_trans_fused_node->op());
   EXPECT_EQ("z", conjugate_trans_fused_node->input(0));
   EXPECT_EQ("perm", conjugate_trans_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output trans = ops::Transpose(s.WithOpName("trans"), z, perm);
   Output conj = ops::Conj(s.WithOpName("conj"), trans);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conj"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -633,6 +712,9 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   EXPECT_EQ("ConjugateTranspose", conj_fused_node->op());
   EXPECT_EQ("z", conj_fused_node->input(0));
   EXPECT_EQ("perm", conj_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
@@ -654,6 +736,9 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
     }
     GrapplerItem item;
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    std::vector<string> fetch = {"matmul"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
 
     ArithmeticOptimizer optimizer;
     GraphDef output;
@@ -674,6 +759,9 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
       EXPECT_TRUE(matmul_fused_node->attr().at("transpose_a").b());
       EXPECT_TRUE(matmul_fused_node->attr().at("transpose_b").b());
     }
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
   }
 }
 
@@ -695,6 +783,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   Output matmul = ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"matmul"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -707,6 +798,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   EXPECT_EQ("b", output.node(10).input(1));
   EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
   EXPECT_TRUE(output.node(10).attr().at("adj_y").b());
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<complex64>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
@@ -727,7 +821,10 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -735,6 +832,9 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
@@ -749,7 +849,10 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({8, 3, 28, 28}));
+  item.feed = {{"Placeholder", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -757,6 +860,9 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
@@ -769,7 +875,6 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -800,7 +905,10 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_INT8>(TensorShape({8, 3, 28, 28, 4}));
+  item.feed = {{"nchw_vect_c", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -808,6 +916,9 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 7faa68a657..8d146637a6 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -83,14 +83,6 @@ class ConstantFoldingTest : public GrapplerTest {
   }
 };
 
-template <DataType DTYPE>
-Tensor GetRandomTensor(const TensorShape& shape) {
-  typedef typename EnumToDataType<DTYPE>::Type T;
-  Tensor tensor(DTYPE, shape);
-  tensor.flat<T>() = tensor.flat<T>().random();
-  return tensor;
-}
-
 TEST_F(ConstantFoldingTest, SimpleFolding) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -380,11 +372,11 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(2, t.tensor_shape().dim(1).size());
       }
     }
-    auto a_t = GetRandomTensor<DT_FLOAT>(TensorShape({3, 2}));
-    auto b_t = GetRandomTensor<DT_FLOAT>(TensorShape({2, 3}));
-    auto x_t = GetRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-    auto y_t = GetRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-    auto bias_t = GetRandomTensor<DT_FLOAT>(TensorShape({2}));
+    auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 2}));
+    auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 3}));
+    auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto bias_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
 
     auto tensors_expected = EvaluateNodes(
         item.graph, item.fetch,
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index 3bc7bea454..e1394b9c35 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -57,6 +57,15 @@ class GrapplerTest : public ::testing::Test {
   // Count nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
 
+  // Get a random tansor with given shape.
+  template <DataType DTYPE>
+  Tensor GenerateRandomTensor(const TensorShape& shape) const {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    Tensor tensor(DTYPE, shape);
+    tensor.flat<T>() = tensor.flat<T>().random();
+    return tensor;
+  }
+
  private:
   SessionOptions options_;
 };
-- 
GitLab


From c7d947ebea70d48e5fe0b6d268f1ada0709c42fe Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Wed, 4 Apr 2018 14:29:03 -0700
Subject: [PATCH 0263/1262] Adding a _create_state method to FeatureColumn so
 that we can decouple variable (more generally, state) creation from getting
 dense tensors etc. This lays the groundwork to create a Layer type interface
 (which is more eager friendly) on top of feature columns where variable
 creation is separated from getting the tensors out.

PiperOrigin-RevId: 191647517
---
 .../python/feature_column/feature_column.py   | 149 ++++++++++++++----
 .../feature_column/feature_column_test.py     | 136 ++++++++++++++++
 2 files changed, 256 insertions(+), 29 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 92c6ff21c4..e116739bc0 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -1643,6 +1643,19 @@ class _FeatureColumn(object):
     """
     pass
 
+  def _create_state(self, weight_collections=None, creator=None):
+    """Returns an object that captures the state of the column.
+
+    Args:
+      weight_collections: Collections to add the variable to
+      creator: Variable creator method called, if provided.
+
+    Returns:
+      An object that encapsulates the state of the column. Can return None.
+    """
+    del weight_collections, creator  # Unused
+    return None
+
 
 class _DenseColumn(_FeatureColumn):
   """Represents a column which can be represented as `Tensor`.
@@ -1662,7 +1675,11 @@ class _DenseColumn(_FeatureColumn):
     pass
 
   @abc.abstractmethod
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     """Returns a `Tensor`.
 
     The output of this function will be used by model-builder-functions. For
@@ -1680,6 +1697,9 @@ class _DenseColumn(_FeatureColumn):
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
+      state: An object encapsulating the state of the column. Columns that
+        create state using the _create_state method would have that state
+        passed in to this method.
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -1687,13 +1707,13 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
-def _create_weighted_sum(
-    column,
-    builder,
-    units,
-    sparse_combiner,
-    weight_collections,
-    trainable):
+def _create_weighted_sum(column,
+                         builder,
+                         units,
+                         sparse_combiner,
+                         weight_collections,
+                         trainable,
+                         state=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
@@ -1709,16 +1729,28 @@ def _create_weighted_sum(
         builder=builder,
         units=units,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        state=state)
 
 
-def _create_dense_column_weighted_sum(
-    column, builder, units, weight_collections, trainable):
+def _create_dense_column_weighted_sum(column,
+                                      builder,
+                                      units,
+                                      weight_collections,
+                                      trainable,
+                                      state=None):
   """Create a weighted sum of a dense column for linear_model."""
-  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-      builder,
-      weight_collections=weight_collections,
-      trainable=trainable)
+  if state is not None:
+    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+        builder,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        state=state)
+  else:
+    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+        builder,
+        weight_collections=weight_collections,
+        trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
@@ -2195,8 +2227,33 @@ class _EmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor_internal(
-      self, inputs, weight_collections=None, trainable=None):
+  def _create_state(self, weight_collections=None, creator=None):
+    variables_map = {}
+    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+    if creator is not None:
+      embedding_weights = creator(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable)
+      ops.add_to_collections(weight_collections, embedding_weights)
+    else:
+      embedding_weights = variable_scope.get_variable(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable,
+          collections=weight_collections)
+    variables_map['embedding_weights'] = embedding_weights
+    return variables_map
+
+  def _get_dense_tensor_internal(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None,
+                                 state=None):
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -2204,14 +2261,10 @@ class _EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    embedding_weights = variable_scope.get_variable(
-        name='embedding_weights',
-        shape=embedding_shape,
-        dtype=dtypes.float32,
-        initializer=self.initializer,
-        trainable=self.trainable and trainable,
-        collections=weight_collections)
+    if state is None:
+      state = self._create_state(weight_collections)
+    embedding_weights = state['embedding_weights']
+
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
@@ -2229,7 +2282,11 @@ class _EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
           'In embedding_column: {}. '
@@ -2242,8 +2299,10 @@ class _EmbeddingColumn(
               self.name, type(self.categorical_column),
               self.categorical_column))
     return self._get_dense_tensor_internal(
-        inputs=inputs, weight_collections=weight_collections,
-        trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        state=state)
 
   def _get_sequence_dense_tensor(
       self, inputs, weight_collections=None, trainable=None):
@@ -2299,7 +2358,39 @@ class _SharedEmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _create_state(self, weight_collections=None, creator=None):
+    variables_map = {}
+    shared_embedding_collection = ops.get_collection(
+        self.shared_embedding_collection_name)
+    if not shared_embedding_collection:
+      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+      if creator is not None:
+        embedding_weights = creator(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable)
+        ops.add_to_collections(weight_collections, embedding_weights)
+      else:
+        embedding_weights = variable_scope.get_variable(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable,
+            collections=weight_collections)
+      ops.add_to_collection(self.shared_embedding_collection_name,
+                            embedding_weights)
+      variables_map['embedding_weights'] = embedding_weights
+
+    return variables_map
+
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     # This method is called from a variable_scope with name _var_scope_name,
     # which is shared among all shared embeddings. Open a name_scope here, so
     # that the ops for different columns have distinct names.
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 6f366e7722..4006a76bb4 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -3733,6 +3733,70 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
+  def test_get_dense_tensor_with_state(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Create embedding_weights variable.
+    weight_collections = [ops.GraphKeys.GLOBAL_VARIABLES,
+                          ops.GraphKeys.MODEL_VARIABLES]
+    state = embedding_column._create_state(weight_collections)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }),
+        state=state)
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -4453,6 +4517,78 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
       self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
+  def test_get_dense_tensor_with_state(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    input_features = {
+        'aaa': input_a,
+        'bbb': input_b
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Create state.
+    weight_collections = [ops.GraphKeys.GLOBAL_VARIABLES,
+                          ops.GraphKeys.MODEL_VARIABLES]
+    state = embedding_column_a._create_state(weight_collections)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features), state=state)
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features), state=state)
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
-- 
GitLab


From 733c7b38089120e6a7490c0c2d3c1006b9f91aa9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 14:31:29 -0700
Subject: [PATCH 0264/1262] Inline nested function calls.

PiperOrigin-RevId: 191647899
---
 .../grappler/optimizers/function_optimizer.cc | 240 +++++++++++-------
 .../optimizers/function_optimizer_test.cc     | 137 ++++++++--
 2 files changed, 273 insertions(+), 104 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 2a6b8a325f..f1da469a6c 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -32,16 +32,129 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+
+class FunctionInliningContext {
+ public:
+  explicit FunctionInliningContext(const GrapplerItem& item)
+      : library_(&item.graph.library()), functions_(InliningCandidates(item)) {}
+
+  const FunctionDefLibrary& Library() const { return *library_; }
+
+  bool HasInlinedFunctions() const { return !functions_.empty(); }
+
+  // Find inlining candidate by name. Return nullptr if not found.
+  const FunctionDef* FindInlinedFunction(const string& name) const {
+    auto it = functions_.find(name);
+    if (it != functions_.end()) {
+      return it->second;
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::unordered_map<string, const FunctionDef*> InliningCandidates(
+      const GrapplerItem& item) const {
+    std::unordered_map<string, const FunctionDef*> functions;
+    for (const FunctionDef& func : item.graph.library().function()) {
+      // Don't inline functions marked as noinline
+      if (func.attr().count("_noinline") != 0) {
+        continue;
+      }
+      // Don't touch anything marked XLA to prevent XLA failures further down
+      // the road.
+      if (func.attr().count("_XlaCompile") > 0 &&
+          func.attr().at("_XlaCompile").b()) {
+        continue;
+      }
+      // Can't create IdentityN nodes with no input or output: skip these
+      // functions for now.
+      if (func.signature().input_arg_size() == 0 ||
+          func.signature().output_arg_size() == 0) {
+        continue;
+      }
+      functions[func.signature().name()] = &func;
+    }
+    return functions;
+  }
+
+  const FunctionDefLibrary* library_;
+  std::unordered_map<string, const FunctionDef*> functions_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
+};
+
+// Copy input/output argument type to the type_list. Return error if argument
+// type is not explicitly defined, and not specified in function attributes.
+Status CopyArgType(const NodeDef& func_node,
+                   const std::unordered_map<string, AttrValue>& func_attr,
+                   const string& arg_kind, const OpDef::ArgDef& arg,
+                   AttrValue::ListValue* type_list) {
+  if (arg.type() != DT_INVALID) {
+    type_list->add_type(arg.type());
+  } else {
+    auto it = func_attr.find(arg.type_attr());
+    if (it == func_attr.end() || it->second.type() == DT_INVALID) {
+      return errors::InvalidArgument(
+          "Invalid ", arg_kind, " argument ", arg.name(), " for function ",
+          func_node.op(), " instantiated by ", func_node.name());
+    }
+    type_list->add_type(it->second.type());
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function inputs to: this ensures that
+// they're all evaluated before the evaluation of the function body starts.
+Status HookInlinedFunctionInputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr, NodeDef* inputs) {
+  inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
+  inputs->set_op("IdentityN");
+  inputs->set_device(func_node.device());
+  *inputs->mutable_input() = func_node.input();
+  AttrValue::ListValue* type_list =
+      (*inputs->mutable_attr())["T"].mutable_list();
+  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "input", arg, type_list));
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function outputs to: this ensures that the
+// function body is fully evaluated before its fanout gets scheduled.
+Status HookInlinedFunctionOutputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const gtl::ArraySlice<string> fetch, NodeDef* outputs) {
+  outputs->set_name(func_node.name());
+  outputs->set_op("IdentityN");
+  outputs->set_device(func_node.device());
+  AttrValue::ListValue* type_list =
+      (*outputs->mutable_attr())["T"].mutable_list();
+  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
+    const OpDef::ArgDef& arg = func.signature().output_arg(i);
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "output", arg, type_list));
+    // Use the fetch names since they take into account the output mapping.
+    outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i]));
+  }
+  return Status::OK();
+}
+
+Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
+                      const FunctionInliningContext& ctx,
+                      GraphDef* optimized_graph) {
+  const std::unordered_map<string, AttrValue> func_attr(
+      func_node.attr().begin(), func_node.attr().end());
 
-Status InlineFunction(const NodeDef& node, const FunctionDef& func,
-                      const FunctionDefLibrary& library, GraphDef* graph) {
-  const std::unordered_map<string, AttrValue> attr(node.attr().begin(),
-                                                   node.attr().end());
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, attr, library);
+      GrapplerItemFromFunctionDef(func, func_attr, ctx.Library());
   if (!item) {
-    return errors::InvalidArgument("Failed to inline function ", node.op(),
-                                   " instantiated by ", node.name());
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name());
   }
 
   std::unordered_map<string, int> input_nodes;
@@ -50,43 +163,25 @@ Status InlineFunction(const NodeDef& node, const FunctionDef& func,
     input_nodes[arg.name()] = i;
   }
 
-  // Add an IdentityN op to hook the function inputs to: this ensures that
-  // they're all evaluated before the evaluation of the function body starts.
-  NodeDef* func_inputs = graph->add_node();
-  func_inputs->set_name(strings::StrCat(node.name(), "/", "inlined_inputs"));
-  func_inputs->set_op("IdentityN");
-  func_inputs->set_device(node.device());
-  *func_inputs->mutable_input() = node.input();
-  AttrValue::ListValue* type_list =
-      (*func_inputs->mutable_attr())["T"].mutable_list();
-  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
-    if (arg.type() != DT_INVALID) {
-      type_list->add_type(arg.type());
-    } else {
-      auto it = attr.find(arg.type_attr());
-      if (it == attr.end()) {
-        return errors::InvalidArgument("Invalid input argument ", arg.name(),
-                                       " for function ", node.op(),
-                                       " instantiated by ", node.name());
-      }
-      type_list->add_type(it->second.type());
-    }
-  }
+  // Hook inlined function inputs to IdentityN node
+  NodeDef* func_inputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(
+      HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
 
   for (NodeDef& func_body_node : *item->graph.mutable_node()) {
     if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
+      CHECK_EQ(0, func_body_node.input_size());
       // Turn input placeholders into identity nodes
       if (IsPlaceholder(func_body_node)) {
         func_body_node.set_op("Identity");
       }
-      CHECK_EQ(0, func_body_node.input_size());
       int input_id = input_nodes[func_body_node.name()];
       func_body_node.add_input(
           strings::StrCat(func_inputs->name(), ":", input_id));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, node.name());
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
       }
       // If the node has no input, make hook it up to the func_inputs node to
       // ensure it runs in the same frame as the other nodes of the function
@@ -98,39 +193,29 @@ Status InlineFunction(const NodeDef& node, const FunctionDef& func,
 
     // Add the node name as a prefix to avoid collisions after inlining
     func_body_node.set_name(
-        strings::StrCat(node.name(), "/", func_body_node.name()));
+        strings::StrCat(func_node.name(), "/", func_body_node.name()));
 
     // Make sure the node is placed
-    func_body_node.set_device(node.device());
-
-    // Move the node to the main graph
-    graph->add_node()->Swap(&func_body_node);
-  }
-
-  // Add an IdentityN op to hook the function outputs to: this ensures that the
-  // function body is fully evaluated before its fanout gets scheduled.
-  NodeDef* func_outputs = graph->add_node();
-  func_outputs->set_name(node.name());
-  func_outputs->set_op("IdentityN");
-  func_outputs->set_device(node.device());
-  type_list = (*func_outputs->mutable_attr())["T"].mutable_list();
-  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
-    const OpDef::ArgDef& arg = func.signature().output_arg(i);
-    if (arg.type() != DT_INVALID) {
-      type_list->add_type(arg.type());
+    func_body_node.set_device(func_node.device());
+
+    // Check if a body node is itself a function
+    const FunctionDef* func_body_node_func =
+        ctx.FindInlinedFunction(func_body_node.op());
+    if (func_body_node_func != nullptr) {
+      // Recursively inline function calls
+      TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
+                                        ctx, optimized_graph));
     } else {
-      auto it = attr.find(arg.type_attr());
-      if (it == attr.end()) {
-        return errors::InvalidArgument("Invalid output argument ", arg.name(),
-                                       " for function ", node.op(),
-                                       " instantiated by ", node.name());
-      }
-      type_list->add_type(it->second.type());
+      // Move the node to the main graph
+      optimized_graph->add_node()->Swap(&func_body_node);
     }
-    // Use the fetch names since they take into account the output mapping.
-    func_outputs->add_input(strings::StrCat(node.name(), "/", item->fetch[i]));
   }
 
+  // Hook inlined function outputs to IdentityN node
+  NodeDef* func_outputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
+                                                item->fetch, func_outputs));
+
   return Status::OK();
 }
 
@@ -278,31 +363,14 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
   return Status::OK();
 }
 
+}  // namespace
+
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  std::unordered_map<string, const FunctionDef*> functions;
-  for (const FunctionDef& func : item.graph.library().function()) {
-    // Don't inline functions marked as noinline
-    if (func.attr().count("_noinline") != 0) {
-      continue;
-    }
-    // Don't touch anything marked XLA to prevent XLA failures further down the
-    // road.
-    if (func.attr().count("_XlaCompile") > 0 &&
-        func.attr().at("_XlaCompile").b()) {
-      continue;
-    }
-    // Can't create IdentityN nodes with no input or output: skip these
-    // functions for now.
-    if (func.signature().input_arg_size() == 0 ||
-        func.signature().output_arg_size() == 0) {
-      continue;
-    }
-    functions[func.signature().name()] = &func;
-  }
+  FunctionInliningContext function_inlining_ctx(item);
 
-  // Nothing to do.
-  if (functions.empty()) {
+  // Nothing to do here.
+  if (!function_inlining_ctx.HasInlinedFunctions()) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
@@ -315,12 +383,14 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
       continue;
     }
-    auto it = functions.find(node.op());
-    if (it == functions.end()) {
-      *optimized_graph->add_node() = node;
+
+    const FunctionDef* func =
+        function_inlining_ctx.FindInlinedFunction(node.op());
+    if (func != nullptr) {
+      TF_RETURN_IF_ERROR(
+          InlineFunction(node, *func, function_inlining_ctx, optimized_graph));
     } else {
-      TF_RETURN_IF_ERROR(InlineFunction(node, *it->second, item.graph.library(),
-                                        optimized_graph));
+      *optimized_graph->add_node() = node;
     }
   }
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index deb2fabded..c804d75756 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -26,7 +26,22 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class FunctionOptimizerTest : public GrapplerTest {};
+constexpr char kDevice[] = "/device:CPU:0";
+
+class FunctionOptimizerTest : public GrapplerTest {
+ protected:
+  Tensor MakeScalarTensor(float value) {
+    Tensor tensor(DT_FLOAT, {});
+    tensor.scalar<float>()() = value;
+    return tensor;
+  }
+
+  Tensor MakeScalarTensor(int value) {
+    Tensor tensor(DT_INT32, {});
+    tensor.scalar<int>()() = value;
+    return tensor;
+  }
+};
 
 TEST_F(FunctionOptimizerTest, SimpleFunction) {
   // Build a graph to compute y = XTimesTwo(x)
@@ -94,9 +109,8 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
   }
   EXPECT_EQ(7, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -183,9 +197,8 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
   }
   EXPECT_EQ(6, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -268,9 +281,8 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
   }
   EXPECT_EQ(6, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -325,18 +337,11 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
   TF_EXPECT_OK(status);
 
   item.fetch = {"z0", "z1", "z2"};
-  Tensor in(DT_FLOAT, {});
-  in.flat<float>()(0) = 3.14f;
-  item.feed.emplace_back("x0", in);
-  in.flat<float>()(0) = 2.7f;
-  item.feed.emplace_back("x1", in);
-  in.flat<float>()(0) = 1.0f;
-  item.feed.emplace_back("x2", in);
-  in.flat<float>()(0) = -1.0f;
-  item.feed.emplace_back("x4", in);
-  Tensor in_int(DT_INT32, {});
-  in_int.flat<int>()(0) = 1234;
-  item.feed.emplace_back("x3", in_int);
+  item.feed.emplace_back("x0", MakeScalarTensor(3.14f));
+  item.feed.emplace_back("x1", MakeScalarTensor(2.7f));
+  item.feed.emplace_back("x2", MakeScalarTensor(1.0f));
+  item.feed.emplace_back("x4", MakeScalarTensor(-1.0f));
+  item.feed.emplace_back("x3", MakeScalarTensor(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
@@ -379,6 +384,100 @@ TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
   EXPECT_EQ(item.graph.DebugString(), output.DebugString());
 }
 
+TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
+  // Define square via function library:
+  //   MySquare(x) = MyMul(x, x)
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {test::function::NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}},
+                            kDevice),
+       test::function::NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}},
+                            kDevice),
+       test::function::NDef("outputs", "Identity", {"square:0"},
+                            {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func});
+
+  GraphDef output;
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("a", node.input(0));
+    } else if (node.name() == "square/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/x", node.input(0));
+      EXPECT_EQ("square/x", node.input(1));
+    } else if (node.name() == "square/output/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/y" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:1", node.input(0));
+    } else if (node.name() == "square/output/output" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/output/x", node.input(0));
+      EXPECT_EQ("square/output/y", node.input(1));
+    } else if (node.name() == "square/output" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/output:0", node.input(0));
+    } else if (node.name() == "square" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output:0", node.input(0));
+    } else if (node.name() == "outputs" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(9, count);
+
+  item.fetch = {"outputs"};
+  item.feed.emplace_back("a", MakeScalarTensor(2.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
-- 
GitLab


From f338cf1fce7be5d5d0aad0a41415a03b1f92817b Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Wed, 4 Apr 2018 14:39:40 -0700
Subject: [PATCH 0265/1262] Add link to tensorflow.js

PiperOrigin-RevId: 191649295
---
 tensorflow/docs_src/extend/index.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index bdff60b39e..1ab0340ad9 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -16,9 +16,10 @@ TensorFlow:
     for your own file and record formats.
 
 Python is currently the only language supported by TensorFlow's API stability
-promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
+promises. However, TensorFlow also provides functionality in C++, Go, Java and
+[JavaScript](https://js.tensorflow.org),
 plus community support for [Haskell](https://github.com/tensorflow/haskell) and
-[Rust](https://github.com/tensorflow/rust).  If you'd like to create or
+[Rust](https://github.com/tensorflow/rust). If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
-- 
GitLab


From 2948be3af67b7ec124942654dc7f734eec346f55 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Wed, 4 Apr 2018 14:41:59 -0700
Subject: [PATCH 0266/1262] Check that the c_api module is not destroyed

PiperOrigin-RevId: 191649662
---
 tensorflow/python/framework/c_api_util.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 6c522de452..4356a534b4 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -33,7 +33,7 @@ class ScopedTFStatus(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteStatus is not None:
+    if c_api is not None and c_api.TF_DeleteStatus is not None:
       c_api.TF_DeleteStatus(self.status)
 
 
@@ -46,7 +46,7 @@ class ScopedTFGraph(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteGraph is not None:
+    if c_api is not None and c_api.TF_DeleteGraph is not None:
       c_api.TF_DeleteGraph(self.graph)
 
 
@@ -59,7 +59,7 @@ class ScopedTFImportGraphDefOptions(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteImportGraphDefOptions is not None:
+    if c_api is not None and c_api.TF_DeleteImportGraphDefOptions is not None:
       c_api.TF_DeleteImportGraphDefOptions(self.options)
 
 
-- 
GitLab


From e8882f768127b71e03efbf193a9c3152ab84802a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 14:45:05 -0700
Subject: [PATCH 0267/1262] GraphOptimizerStagePipeline to pass through
 multiple optimizer stages, skipping stages that return error.

PiperOrigin-RevId: 191650182
---
 .../optimizers/arithmetic_optimizer.cc        | 63 +++++++------------
 .../optimizers/graph_optimizer_stage.h        | 61 ++++++++++++++++++
 2 files changed, 83 insertions(+), 41 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 6e27259998..919f23fd98 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1667,34 +1667,24 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
                                   &frame_map_);
   const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
 
-  std::vector<std::unique_ptr<ArithmeticOptimizerStage>> stages;
-
-  if (options_.combine_add_to_addn) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new AddOpsRewriteStage(ctx, ctx_ext)));
-  }
-  if (options_.hoist_common_factor_out_of_aggregation) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new HoistCommonFactorOutOfAggregation(ctx, ctx_ext)));
-  }
-  if (options_.remove_identity_transpose) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveIdentityTranspose(ctx, ctx_ext)));
-  }
-  if (options_.remove_redundant_bitcast) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveRedundantBitcastStage(ctx, ctx_ext)));
-  }
-  if (options_.remove_redundant_cast) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveRedundantCastStage(ctx, ctx_ext)));
-  }
-  if (options_.remove_negation) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveNegationStage(ctx, ctx_ext)));
-  }
-
-  VLOG(1) << "Simplify arithmetic ops using " << stages.size()
+  // Stop pipeline after first stage returning non-empty simplified tensor name.
+  const auto stop = [](const string& result) { return !result.empty(); };
+  GraphOptimizerStagePipeline<string> pipeline(stop);
+
+  if (options_.combine_add_to_addn)
+    pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
+  if (options_.hoist_common_factor_out_of_aggregation)
+    pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
+  if (options_.remove_identity_transpose)
+    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
+  if (options_.remove_redundant_bitcast)
+    pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
+  if (options_.remove_redundant_cast)
+    pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
+  if (options_.remove_negation)
+    pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+
+  VLOG(1) << "Simplify arithmetic ops using " << pipeline.NumStages()
           << " arithmetic optimization stages";
 
   while (!nodes_to_simplify.Empty()) {
@@ -1707,22 +1697,13 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
     }
 
     // if it was not simplified try to run it through all configured stages
-    if (simplified_tensor.empty()) {
-      for (auto& stage : stages) {
-        if (stage->IsSupported(node)) {
-          TF_RETURN_IF_ERROR(stage->TrySimplify(node, &simplified_tensor));
-          if (!simplified_tensor.empty()) {
-            break;
-          }
-        }
+    if (!stop(simplified_tensor)) {
+      bool optimized = pipeline.PassThroughAllStages(node, &simplified_tensor);
+      if (!optimized) {
+        continue;
       }
     }
 
-    // if it's still empty go to the next Node
-    if (simplified_tensor.empty()) {
-      continue;
-    }
-
     // re-wire consumers of an old node to the new one
     if (NodeName(simplified_tensor) != node->name()) {
       // Always consider simplified_tensor for further optimizations.
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index be95c00d2d..8d3e965c57 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -117,6 +117,9 @@ class GraphOptimizerStage {
       : optimizer_name_(optimizer_name), stage_name_(stage_name), ctx_(ctx) {}
   virtual ~GraphOptimizerStage() = default;
 
+  const string& stage_name() const { return stage_name_; }
+  const string& optimizer_name() const { return optimizer_name_; }
+
   // Check if we should try to simplify node. Returning true doesn't
   // guarantee that node will be simplified.
   //
@@ -179,6 +182,64 @@ class GraphOptimizerStage {
   const GraphOptimizerContext ctx_;
 };
 
+template <typename Result>
+class GraphOptimizerStagePipeline {
+ public:
+  // Break predicate specifies if a pipeline should stop early, and not pass
+  // a node to the next registered optimizer stage, typically that should be the
+  // case when a stage successfully optimized a node, and it wants to yield
+  // control to the optimizer.
+  explicit GraphOptimizerStagePipeline(
+      const std::function<bool(const Result&)> break_predicate)
+      : break_predicate_(break_predicate) {}
+
+  // Add a stage to the pipeline. It should be called with the arguments for the
+  // stage constructor:
+  //
+  //   pipeline.AddStage<FooStage>(constructor_arg1, constructor_arg2);
+  //
+  // Returns a reference to the added stage.
+  template <typename T, typename... Args>
+  T& AddStage(Args&&... args) {
+    auto stage = new T(std::forward<Args>(args)...);
+    stages_.push_back(std::unique_ptr<T>(stage));
+    return *stage;
+  }
+
+  // Pass a node through all registered optimizer stages, until break predicate
+  // is true.
+  //
+  // Return true, if pipeline exited after a break predicate was evaluated as
+  // 'true', which typically means that a node was optimized by one of the
+  // registered stages.
+  //
+  // Return false, if node was not optimized by any of registered stages.
+  bool PassThroughAllStages(NodeDef* node, Result* result) {
+    for (auto& stage : stages_) {
+      if (stage->IsSupported(node)) {
+        const Status stage_status = stage->TrySimplify(node, result);
+        // Each stage must be "error safe" (just like exception safe). In
+        // case of any error it must leave optimized graph unmodified.
+        if (!stage_status.ok()) {
+          LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
+                       << ", stage " << stage->stage_name()
+                       << ". Error: " << stage_status.error_message();
+        }
+        if (break_predicate_(*result)) return true;
+      }
+    }
+    return false;
+  }
+
+  std::size_t NumStages() { return stages_.size(); }
+
+ private:
+  std::vector<std::unique_ptr<GraphOptimizerStage<Result>>> stages_;
+  std::function<bool(const Result&)> break_predicate_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GraphOptimizerStagePipeline);
+};
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-- 
GitLab


From a9cb0e19b9d96935c653f9cf89cebb6407564e5b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 14:55:16 -0700
Subject: [PATCH 0268/1262] Add quantized uint8 L2Normalization Kernel.

PiperOrigin-RevId: 191652174
---
 tensorflow/contrib/lite/kernels/l2norm.cc     | 22 ++++++++-
 .../contrib/lite/kernels/l2norm_test.cc       | 49 ++++++++++++++++---
 2 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
index ee8bfe56d9..e67f4e06f3 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -45,10 +45,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
-  // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(
+      context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
+  if (output->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
+  }
+
   // TODO(ahentz): For some reason our implementations don't support
   // activations.
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
@@ -75,6 +80,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_L2NORM(optimized_ops);
     }
 #undef TF_LITE_L2NORM
+  } else if (output->type == kTfLiteUInt8) {
+#define TF_LITE_L2NORM(type)                                               \
+  type::L2Normalization(GetTensorData<uint8>(input), GetTensorDims(input), \
+                        input->params.zero_point,                          \
+                        GetTensorData<uint8>(output), GetTensorDims(output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_L2NORM(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_L2NORM(optimized_ops);
+    }
+#undef TF_LITE_L2NORM
   } else {
     context->ReportError(context, "Inputs and outputs not all float types.");
     return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index 30e103f330..042314ccf5 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -25,10 +25,22 @@ using ::testing::ElementsAreArray;
 
 class L2NormOpModel : public SingleOpModel {
  public:
-  L2NormOpModel(std::initializer_list<int> input_shape,
-                ActivationFunctionType activation_type) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+  L2NormOpModel(const std::initializer_list<int> input_shape,
+                const TensorType tensor_type,
+                const ActivationFunctionType activation_type) {
+    TensorData data = TensorData{tensor_type};
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -2.0;
+      data.max = 2.0;
+      data.scale = 2.0;
+      data.zero_point = 128;
+    }
+    input_ = AddInput(data);
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -1.0;
+      data.max = 127.0 / 128.0;
+    }
+    output_ = AddOutput(data);
     SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
                  CreateL2NormOptions(builder_, activation_type).Union());
     BuildInterpreter({input_shape});
@@ -38,7 +50,17 @@ class L2NormOpModel : public SingleOpModel {
     PopulateTensor(input_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
+  int input() const { return input_; }
 
  private:
   int input_;
@@ -46,13 +68,26 @@ class L2NormOpModel : public SingleOpModel {
 };
 
 TEST(L2NormOpTest, SimpleTest) {
-  L2NormOpModel m({1, 1, 1, 6}, ActivationFunctionType_NONE);
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_FLOAT32,
+                  ActivationFunctionType_NONE);
   m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
+TEST(L2NormOpTest, SimpleUint8Test) {
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_UINT8, ActivationFunctionType_NONE);
+
+  m.QuantizeAndPopulate<uint8_t>(m.input(), {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({58, 166, 173, 205, 83, 134}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1)));
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 91bf5524560c5bc0783b43717156c7dbb6f798f5 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Wed, 4 Apr 2018 15:06:08 -0700
Subject: [PATCH 0269/1262] Rename `distribute` to `train_distribute` parameter
 in `RunConfig` to clarify that its purpose is only for training.

PiperOrigin-RevId: 191654161
---
 tensorflow/contrib/distribute/README.md        |  2 +-
 .../python/estimator_integration_test.py       |  2 +-
 .../examples/simple_estimator_example.py       |  2 +-
 .../python/examples/simple_tfkeras_example.py  |  2 +-
 .../python/learn/estimators/run_config.py      |  2 +-
 tensorflow/python/estimator/estimator.py       |  2 +-
 tensorflow/python/estimator/run_config.py      | 18 +++++++++---------
 .../tensorflow.estimator.-run-config.pbtxt     | 10 +++++-----
 8 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 4af51bec1a..28483f4c88 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -77,7 +77,7 @@ parameter of `Estimator`.
 
 ```python
 distribution = tf.contrib.distribute.MirroredStrategy()
-config = tf.estimator.RunConfig(distribute=distribution)
+config = tf.estimator.RunConfig(train_distribute=distribution)
 classifier = tf.estimator.Estimator(model_fn=model_fn, config=config)
 classifier.train(input_fn=input_fn)
 ```
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index 9be186a724..2b49b8f4ef 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -95,7 +95,7 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
         # TODO(isaprykin): Work around the colocate_with error.
         dnn_optimizer=adagrad.AdagradOptimizer(0.001),
         linear_optimizer=adagrad.AdagradOptimizer(0.001),
-        config=run_config.RunConfig(distribute=distribution))
+        config=run_config.RunConfig(train_distribute=distribution))
 
     num_steps = 10
     estimator.train(train_input_fn, steps=num_steps)
diff --git a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
index 5d6e02b4b9..00c25c7a24 100644
--- a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
+++ b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
@@ -59,7 +59,7 @@ def build_model_fn_optimizer():
 def main(_):
   distribution = tf.contrib.distribute.MirroredStrategy(
       ["/device:GPU:0", "/device:GPU:1"])
-  config = tf.estimator.RunConfig(distribute=distribution)
+  config = tf.estimator.RunConfig(train_distribute=distribution)
 
   def input_fn():
     features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
index e714255f69..b87224251c 100644
--- a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
+++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
@@ -41,7 +41,7 @@ def main(args):
 
   strategy = tf.contrib.distribute.MirroredStrategy(
       ['/device:GPU:0', '/device:GPU:1'])
-  config = tf.estimator.RunConfig(distribute=strategy)
+  config = tf.estimator.RunConfig(train_distribute=strategy)
   optimizer = tf.train.GradientDescentOptimizer(0.2)
 
   model = tf.keras.Sequential()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index f3500bf56f..8c85c431be 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -298,7 +298,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     #   core_run_config.RunConfig.__init__(self)
     # so instead of breaking compatibility with that assumption, we
     # just manually initialize this field:
-    self._distribute = None
+    self._train_distribute = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index ab69a093a2..4d3eff71ad 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -188,7 +188,7 @@ class Estimator(object):
       self._config = config
 
     # The distribute field contains an instance of DistributionStrategy.
-    self._distribution = self._config.distribute
+    self._distribution = self._config.train_distribute
 
     # Model directory.
     model_dir = compat_internal.path_to_str(model_dir)
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 41415b89e9..f62c9cece6 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -44,7 +44,7 @@ _DEFAULT_REPLACEABLE_LIST = [
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
     'log_step_count_steps',
-    'distribute'
+    'train_distribute'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -302,7 +302,7 @@ class RunConfig(object):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                log_step_count_steps=100,
-               distribute=None):
+               train_distribute=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -426,10 +426,10 @@ class RunConfig(object):
         the feature.
       log_step_count_steps: The frequency, in number of global steps, that the
         global step/sec and the loss will be logged during training.
-      distribute: an optional instance of
+      train_distribute: an optional instance of
         `tf.contrib.distribute.DistributionStrategy`. If specified,
-        then Estimator will distribute the user's model according to the policy
-        specified by that strategy.
+        then Estimator will distribute the user's model during training,
+        according to the policy specified by that strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -466,7 +466,7 @@ class RunConfig(object):
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
         log_step_count_steps=log_step_count_steps,
-        distribute=distribute)
+        train_distribute=train_distribute)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -678,10 +678,10 @@ class RunConfig(object):
     return self._service
 
   @property
-  def distribute(self):
+  def train_distribute(self):
     """Returns the optional `tf.contrib.distribute.DistributionStrategy` object.
     """
-    return self._distribute
+    return self._train_distribute
 
   def replace(self, **kwargs):
     """Returns a new instance of `RunConfig` replacing specified properties.
@@ -697,7 +697,7 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
-      - `distribute`.
+      - `train_distribute`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 759ff752b0..05e603efb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "cluster_spec"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "distribute"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
@@ -82,9 +78,13 @@ tf_class {
     name: "tf_random_seed"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "train_distribute"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
   }
   member_method {
     name: "replace"
-- 
GitLab


From bf8ad8277258bdf352ddd1df5200e61ba625f7a2 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Wed, 4 Apr 2018 15:14:19 -0700
Subject: [PATCH 0270/1262] Creates a LinearModel (inherits from
 keras.training.Model) that creates a linear model.

Had to modify the __call__ method in the base layer class so that it could work with feature style inputs in which case we lazily convert the inputs to tensors instead of providing tensors as inputs upfront.

PiperOrigin-RevId: 191655445
---
 tensorflow/python/feature_column/BUILD        |    1 +
 .../python/feature_column/feature_column.py   |  195 ++-
 .../feature_column/feature_column_test.py     | 1538 +++++++++++++++--
 tensorflow/python/layers/base.py              |    3 +-
 4 files changed, 1545 insertions(+), 192 deletions(-)

diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 219105d386..295d4ca094 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -43,6 +43,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/keras",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index e116739bc0..3a315e5c2e 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -139,6 +139,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -460,6 +462,154 @@ def linear_model(features,
     return predictions
 
 
+class _FCLinearWrapper(base.Layer):
+  """Wraps a _FeatureColumn in a layer for use in a linear model.
+
+  See `linear_model` above.
+  """
+
+  def __init__(self,
+               feature_column,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_FCLinearWrapper, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._feature_column = feature_column
+    self._units = units
+    self._sparse_combiner = sparse_combiner
+    self._weight_collections = weight_collections
+    self._state = {}
+
+  def build(self, _):
+    self._state = self._feature_column._create_state(  # pylint: disable=protected-access
+        self._weight_collections, self.add_variable)
+
+    if isinstance(self._feature_column, _CategoricalColumn):
+      weight = self.add_variable(
+          name='weights',
+          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    else:
+      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
+      weight = self.add_variable(
+          name='weights',
+          shape=[num_elements, self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    ops.add_to_collections(self._weight_collections, weight)
+    self._weight_var = weight
+    self.built = True
+
+  def call(self, builder):
+    weighted_sum = _create_weighted_sum(
+        column=self._feature_column,
+        builder=builder,
+        units=self._units,
+        sparse_combiner=self._sparse_combiner,
+        weight_collections=self._weight_collections,
+        trainable=self.trainable,
+        weight_var=self._weight_var,
+        state=self._state)
+    return weighted_sum
+
+
+class _BiasLayer(base.Layer):
+  """A layer for the bias term.
+  """
+
+  def __init__(self,
+               units=1,
+               trainable=True,
+               weight_collections=None,
+               name=None,
+               **kwargs):
+    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
+    self._units = units
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._bias_variable = self.add_variable(
+        'bias_weights',
+        shape=[self._units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
+    ops.add_to_collections(self._weight_collections, self._bias_variable)
+    self.built = True
+
+  def call(self, _):
+    return self._bias_variable
+
+
+class _LinearModel(training.Model):
+  """Creates a linear model using feature columns.
+  """
+
+  def __init__(self,
+               feature_columns,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_LinearModel, self).__init__(name=name, **kwargs)
+    self._feature_columns = _clean_feature_columns(feature_columns)
+    self._weight_collections = list(weight_collections or [])
+    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
+      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+    column_layers = {}
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
+        column_name = vs.name
+      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
+                                      self._weight_collections, trainable,
+                                      column_name, **kwargs)
+      column_layers[column_name] = column_layer
+    self._column_layers = self._add_layers(column_layers)
+    self._bias_layer = _BiasLayer(
+        units=units,
+        trainable=trainable,
+        weight_collections=self._weight_collections,
+        name='bias_layer',
+        **kwargs)
+
+  def call(self, features):
+    for column in self._feature_columns:
+      if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+        raise ValueError(
+            'Items of feature_columns must be either a '
+            '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+    weighted_sums = []
+    ordered_columns = []
+    builder = _LazyBuilder(features)
+    for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
+      ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+      weighted_sum = layer(builder)
+      weighted_sums.append(weighted_sum)
+
+    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+    predictions_no_bias = math_ops.add_n(
+        weighted_sums, name='weighted_sum_no_bias')
+    predictions = nn_ops.bias_add(
+        predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+    return predictions
+
+  def _add_layers(self, layers):
+    # "Magic" required for keras.Model classes to track all the variables in
+    # a list of layers.Layer objects.
+    # TODO(ashankar): Figure out API so user code doesn't have to do this.
+    for name, layer in layers.items():
+      setattr(self, 'layer-%s' % name, layer)
+    return layers
+
+
 def _transform_features(features, feature_columns):
   """Returns transformed features based on features columns passed in.
 
@@ -1713,6 +1863,7 @@ def _create_weighted_sum(column,
                          sparse_combiner,
                          weight_collections,
                          trainable,
+                         weight_var=None,
                          state=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
@@ -1722,7 +1873,8 @@ def _create_weighted_sum(column,
         units=units,
         sparse_combiner=sparse_combiner,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
         column=column,
@@ -1730,6 +1882,7 @@ def _create_weighted_sum(column,
         units=units,
         weight_collections=weight_collections,
         trainable=trainable,
+        weight_var=weight_var,
         state=state)
 
 
@@ -1738,6 +1891,7 @@ def _create_dense_column_weighted_sum(column,
                                       units,
                                       weight_collections,
                                       trainable,
+                                      weight_var=None,
                                       state=None):
   """Create a weighted sum of a dense column for linear_model."""
   if state is not None:
@@ -1754,12 +1908,15 @@ def _create_dense_column_weighted_sum(column,
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=[num_elements, units],
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=[num_elements, units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return math_ops.matmul(tensor, weight, name='weighted_sum')
 
 
@@ -1809,8 +1966,13 @@ class _CategoricalColumn(_FeatureColumn):
     pass
 
 
-def _create_categorical_column_weighted_sum(
-    column, builder, units, sparse_combiner, weight_collections, trainable):
+def _create_categorical_column_weighted_sum(column,
+                                            builder,
+                                            units,
+                                            sparse_combiner,
+                                            weight_collections,
+                                            trainable,
+                                            weight_var=None):
   """Create a weighted sum of a categorical column for linear_model."""
   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
       builder,
@@ -1824,12 +1986,15 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=(column._num_buckets, units),  # pylint: disable=protected-access
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=(column._num_buckets, units),  # pylint: disable=protected-access
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return _safe_embedding_lookup_sparse(
       weight,
       id_tensor,
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 4006a76bb4..07588af37e 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column import _LinearModel
 from tensorflow.python.feature_column.feature_column import _transform_features
 from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
@@ -339,6 +340,20 @@ class NumericColumnTest(test.TestCase):
         sess.run(price_var.assign([[10.]]))
         self.assertAllClose([[10.], [50.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], predictions.eval())
+
 
 class BucketizedColumnTest(test.TestCase):
 
@@ -561,6 +576,62 @@ class BucketizedColumnTest(test.TestCase):
         sess.run(bias.assign([1.]))
         self.assertAllClose([[81.], [141.]], predictions.eval())
 
+  def test_keras_linear_model_one_input_value(self):
+    """Tests _LinearModel for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_keras_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+
+  def test_keras_linear_model_two_input_values(self):
+    """Tests _LinearModel for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_keras_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], predictions.eval())
+
 
 class HashedCategoricalColumnTest(test.TestCase):
 
@@ -767,6 +838,28 @@ class HashedCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
         self.assertAllClose(((4.,), (6.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 3: wire_var[3] = 4
+        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
 
 class CrossedColumnTest(test.TestCase):
 
@@ -1028,60 +1121,800 @@ class CrossedColumnTest(test.TestCase):
       def _num_buckets(self):
         return 5
 
-      def _transform_feature(self, inputs):
-        return (inputs.get(self.name),
-                inputs.get('{}_weights'.format(self.name)))
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        fc.linear_model({
+            t.name: sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[0, 1, 2],
+                dense_shape=(2, 2)),
+            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[1., 10., 2.],
+                dense_shape=(2, 2)),
+            'c': sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=['cA', 'cB', 'cC'],
+                dense_shape=(2, 2)),
+        }, (crossed,))
+
+  def test_keras_linear_model(self):
+    """Tests _LinearModel.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_keras_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+  def test_keras_linear_model_with_weights(self):
+
+    class _TestColumnWithWeights(_CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
+
+      @property
+      def name(self):
+        return 'test_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
+
+      @property
+      def _num_buckets(self):
+        return 5
+
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        get_keras_linear_model_predictions({
+            t.name:
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[0, 1, 2],
+                    dense_shape=(2, 2)),
+            '{}_weights'.format(t.name):
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[1., 10., 2.],
+                    dense_shape=(2, 2)),
+            'c':
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=['cA', 'cB', 'cC'],
+                    dense_shape=(2, 2)),
+        }, (crossed,))
+
+
+def get_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    return variable_scope.get_variable('bias_weights')
+
+
+def get_linear_model_column_var(column):
+  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                            'linear_model/' + column.name)[0]
+
+
+def get_keras_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    with variable_scope.variable_scope('bias_layer', reuse=True):
+      return variable_scope.get_variable('bias_weights')
+
+
+def get_keras_linear_model_predictions(features,
+                                       feature_columns,
+                                       units=1,
+                                       sparse_combiner='sum',
+                                       weight_collections=None,
+                                       trainable=True):
+  keras_linear_model = _LinearModel(
+      feature_columns,
+      units,
+      sparse_combiner,
+      weight_collections,
+      trainable,
+      name='linear_model')
+  return keras_linear_model(features)  # pylint: disable=not-callable
+
+
+@test_util.with_c_api
+class LinearModelTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.linear_model(features={}, feature_columns=[])
+
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
+
+  def test_should_be_dense_or_categorical_column(self):
+
+    class NotSupportedColumn(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
+
+      def _transform_feature(self, cache):
+        pass
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.linear_model(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions.eval())
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [wire_cast, price])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(_DenseColumn, _CategoricalColumn):
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def _variable_shape(self):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self, inputs, weight_collections=None,
+                            trainable=None):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def _num_buckets(self):
+        return 4
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return _CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = fc.linear_model(features, [dense_and_sparse_column])
+      bias = get_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(dense_and_sparse_column_var.assign(
+            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            predictions.eval())
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast], units=3)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
+                1000., 1100., 1200.
+            ], [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            predictions.eval())
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            predictions.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          predictions = fc.linear_model(features, [price])
+      else:
+        predictions = fc.linear_model(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            predictions.eval()
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price1_var.eval())
+        self.assertAllClose([[0.]], price2_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+
+  def test_fills_cols_to_vars(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      cols_to_vars = {}
+      fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      self.assertAllEqual(cols_to_vars['bias'], [bias])
+      self.assertAllEqual(cols_to_vars[price1], [price1_var])
+      self.assertAllEqual(cols_to_vars[price2], [price2_var])
+
+  def test_fills_cols_to_vars_partitioned_variables(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2', shape=3)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [6., 7.]],
+          'price2': [[3., 4., 5.], [8., 9., 10.]]
+      }
+      cols_to_vars = {}
+      with variable_scope.variable_scope(
+          'linear',
+          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
+        fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+      with _initialized_session():
+        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
+        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
+        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
+        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
+        # a [1, 1] Variable.
+        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(
+          features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      fc.linear_model(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.linear_model(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  def test_with_numpy_input_fn(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([-1., 2., 13., 104.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.linear_model(features, [price_buckets, body_style])
+    # self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      bias = get_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_with_1d_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': constant_op.constant([-1., 12.,]),
+        'body-style': sparse_tensor.SparseTensor(
+            indices=((0,), (1,)),
+            values=('sedan', 'hardtop'),
+            dense_shape=(2,)),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+
+    net = fc.linear_model(features, [price_buckets, body_style])
+    with _initialized_session() as sess:
+      bias = get_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
 
-      def _get_sparse_tensors(self, inputs, weight_collections=None,
-                              trainable=None):
-        """Populates both id_tensor and weight_tensor."""
-        ids_and_weights = inputs.get(self)
-        return _CategoricalColumn.IdWeightPair(
-            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+    price_data = np.array([-1., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)),
+        values=('sedan', 'hardtop'),
+        dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
 
-    t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
-    with ops.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError,
-          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
-        fc.linear_model({
-            t.name: sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=[0, 1, 2],
-                dense_shape=(2, 2)),
-            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=[1., 10., 2.],
-                dense_shape=(2, 2)),
-            'c': sparse_tensor.SparseTensor(
-                indices=((0, 0), (1, 0), (1, 1)),
-                values=['cA', 'cB', 'cC'],
-                dense_shape=(2, 2)),
-        }, (crossed,))
+    net = fc.linear_model(features, [price_buckets, body_style, country])
+    bias = get_linear_model_bias()
+    price_buckets_var = get_linear_model_column_var(price_buckets)
+    body_style_var = get_linear_model_column_var(body_style)
+    with _initialized_session() as sess:
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
 
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
 
-def get_linear_model_bias():
-  with variable_scope.variable_scope('linear_model', reuse=True):
-    return variable_scope.get_variable('bias_weights')
+  def test_with_rank_0_feature(self):
+    price = fc.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
 
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      fc.linear_model(features, [price])
 
-def get_linear_model_column_var(column):
-  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                            'linear_model/' + column.name)[0]
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = fc.linear_model(features, [price])
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
 
 
 @test_util.with_c_api
-class LinearModelTest(test.TestCase):
+class _LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
-      fc.linear_model(features={}, feature_columns=[])
+      get_keras_linear_model_predictions(features={}, feature_columns=[])
 
   def test_should_be_feature_column(self):
     with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
-      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]}, feature_columns='NotSupported')
 
   def test_should_be_dense_or_categorical_column(self):
 
@@ -1100,7 +1933,7 @@ class LinearModelTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
 
   def test_does_not_support_dict_columns(self):
@@ -1112,7 +1945,7 @@ class LinearModelTest(test.TestCase):
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features={'a': [[0]]},
           feature_columns=[fc.numeric_column('a'),
                            fc.numeric_column('a')])
@@ -1121,8 +1954,8 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -1138,8 +1971,8 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast])
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
+      bias = get_keras_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -1157,8 +1990,9 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [wire_cast, price])
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [wire_cast, price])
+      bias = get_keras_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -1187,7 +2021,9 @@ class LinearModelTest(test.TestCase):
       def _variable_shape(self):
         raise ValueError('Should not use this method.')
 
-      def _get_dense_tensor(self, inputs, weight_collections=None,
+      def _get_dense_tensor(self,
+                            inputs,
+                            weight_collections=None,
                             trainable=None):
         raise ValueError('Should not use this method.')
 
@@ -1195,7 +2031,9 @@ class LinearModelTest(test.TestCase):
       def _num_buckets(self):
         return 4
 
-      def _get_sparse_tensors(self, inputs, weight_collections=None,
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
                               trainable=None):
         sp_tensor = sparse_tensor.SparseTensor(
             indices=[[0, 0], [1, 0], [1, 1]],
@@ -1210,13 +2048,15 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {dense_and_sparse_column.name: sp_tensor}
-      predictions = fc.linear_model(features, [dense_and_sparse_column])
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(
+          features, [dense_and_sparse_column])
+      bias = get_keras_linear_model_bias()
       dense_and_sparse_column_var = get_linear_model_column_var(
           dense_and_sparse_column)
       with _initialized_session() as sess:
-        sess.run(dense_and_sparse_column_var.assign(
-            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [10015.]], predictions.eval())
 
@@ -1224,8 +2064,9 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price], units=3)
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
+      bias = get_keras_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -1243,16 +2084,17 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast], units=3)
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(
+          features, [wire_cast], units=3)
+      bias = get_keras_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
         self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
         sess.run(
-            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
-                1000., 1100., 1200.
-            ], [10000., 11000., 12000.]]))
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100.,
+                                   1200.], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
                             predictions.eval())
@@ -1261,7 +2103,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = fc.linear_model(features, [price])
+      predictions = get_keras_linear_model_predictions(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([[0.], [0.]], price_var.eval())
@@ -1277,7 +2119,7 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
           dense_shape=[2, 2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(features, [wire_cast])
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
@@ -1297,9 +2139,9 @@ class LinearModelTest(test.TestCase):
           indices=[[0, 0], [1, 0], [1, 1]],
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
-      predictions = fc.linear_model(
+      predictions = get_keras_linear_model_predictions(
           features, [wire_cast], sparse_combiner='mean')
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
@@ -1310,8 +2152,9 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      predictions = fc.linear_model(features, [price], units=3)
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
+      bias = get_keras_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -1329,9 +2172,9 @@ class LinearModelTest(test.TestCase):
         with self.assertRaisesRegexp(
             Exception,
             r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-          predictions = fc.linear_model(features, [price])
+          predictions = get_keras_linear_model_predictions(features, [price])
       else:
-        predictions = fc.linear_model(features, [price])
+        predictions = get_keras_linear_model_predictions(features, [price])
         with _initialized_session():
           with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
             predictions.eval()
@@ -1340,8 +2183,8 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      predictions = fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -1354,12 +2197,10 @@ class LinearModelTest(test.TestCase):
     price1 = fc.numeric_column('price1', shape=2)
     price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
-      }
-      predictions = fc.linear_model(features, [price1, price2])
-      bias = get_linear_model_bias()
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      bias = get_keras_linear_model_bias()
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
@@ -1372,50 +2213,14 @@ class LinearModelTest(test.TestCase):
         sess.run(bias.assign([7.]))
         self.assertAllClose([[3217.], [4657.]], predictions.eval())
 
-  def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      cols_to_vars = {}
-      fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
-      bias = get_linear_model_bias()
-      price1_var = get_linear_model_column_var(price1)
-      price2_var = get_linear_model_column_var(price2)
-      self.assertAllEqual(cols_to_vars['bias'], [bias])
-      self.assertAllEqual(cols_to_vars[price1], [price1_var])
-      self.assertAllEqual(cols_to_vars[price2], [price2_var])
-
-  def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [6., 7.]],
-          'price2': [[3., 4., 5.], [8., 9., 10.]]
-      }
-      cols_to_vars = {}
-      with variable_scope.variable_scope(
-          'linear',
-          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
-        fc.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
-      with _initialized_session():
-        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
-        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
-        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
-        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
-        # a [1, 1] Variable.
-        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
-
   def test_dense_collection(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price], weight_collections=['my-vars'])
+      get_keras_linear_model_predictions(
+          features, [price], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       self.assertIn(bias, my_vars)
       self.assertIn(price_var, my_vars)
@@ -1426,10 +2231,10 @@ class LinearModelTest(test.TestCase):
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features, [wire_cast], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       self.assertIn(bias, my_vars)
       self.assertIn(wire_cast_var, my_vars)
@@ -1438,8 +2243,8 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price])
-      bias = get_linear_model_bias()
+      get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertIn(bias, trainable_vars)
@@ -1451,9 +2256,9 @@ class LinearModelTest(test.TestCase):
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      fc.linear_model(features, [wire_cast])
+      get_keras_linear_model_predictions(features, [wire_cast])
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       self.assertIn(bias, trainable_vars)
       self.assertIn(wire_cast_var, trainable_vars)
@@ -1462,7 +2267,7 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
-      fc.linear_model(features, [price], trainable=False)
+      get_keras_linear_model_predictions(features, [price], trainable=False)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
 
@@ -1472,7 +2277,7 @@ class LinearModelTest(test.TestCase):
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       features = {'wire_cast': wire_tensor}
-      fc.linear_model(features, [wire_cast], trainable=False)
+      get_keras_linear_model_predictions(features, [wire_cast], trainable=False)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
 
@@ -1488,7 +2293,7 @@ class LinearModelTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       }
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features, [price_a, wire_cast, price_b],
           weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
@@ -1504,7 +2309,7 @@ class LinearModelTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
       }
-      fc.linear_model(
+      get_keras_linear_model_predictions(
           features, [wire_cast, price_b, price_a],
           weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
@@ -1523,7 +2328,7 @@ class LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-      fc.linear_model(features, [price1, price2])
+      get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -1538,7 +2343,7 @@ class LinearModelTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.linear_model(features, [price1, price2, price3])
+        get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -1548,7 +2353,8 @@ class LinearModelTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      predictions = fc.linear_model(features, [price1, price2])
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'must have the same size and shape'):
@@ -1563,7 +2369,8 @@ class LinearModelTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
           'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
       }
-      predictions = fc.linear_model(features, [price1, price2])
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
       with _initialized_session() as sess:
         sess.run(
             predictions,
@@ -1574,7 +2381,12 @@ class LinearModelTest(test.TestCase):
 
   def test_with_numpy_input_fn(self):
     price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
@@ -1586,13 +2398,14 @@ class LinearModelTest(test.TestCase):
         batch_size=2,
         shuffle=False)
     features = input_fn()
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
     # self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
 
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
       body_style_var = get_linear_model_column_var(body_style)
 
@@ -1607,24 +2420,35 @@ class LinearModelTest(test.TestCase):
 
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price': constant_op.constant([-1., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
 
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
     with _initialized_session() as sess:
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
       body_style_var = get_linear_model_column_var(body_style)
 
@@ -1636,7 +2460,12 @@ class LinearModelTest(test.TestCase):
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     country = fc.categorical_column_with_vocabulary_list(
@@ -1653,13 +2482,12 @@ class LinearModelTest(test.TestCase):
 
     price_data = np.array([-1., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
     country_data = np.array(['US', 'CA'])
 
-    net = fc.linear_model(features, [price_buckets, body_style, country])
-    bias = get_linear_model_bias()
+    net = get_keras_linear_model_predictions(
+        features, [price_buckets, body_style, country])
+    bias = get_keras_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
     with _initialized_session() as sess:
@@ -1685,13 +2513,13 @@ class LinearModelTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      fc.linear_model(features, [price])
+      get_keras_linear_model_predictions(features, [price])
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = fc.linear_model(features, [price])
+    net = get_keras_linear_model_predictions(features, [price])
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
@@ -2715,6 +3543,32 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
@@ -3082,6 +3936,31 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
@@ -3285,17 +4164,39 @@ class IdentityCategoricalColumnTest(test.TestCase):
               input_shape: (2, 2),
           }))
 
-  def test_linear_model(self):
+  def test_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+  def test_keras_linear_model(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          column.name: sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2))
+      predictions = get_keras_linear_model_predictions({
+          column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2))
       }, (column,))
-      bias = get_linear_model_bias()
+      bias = get_keras_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -3537,6 +4438,25 @@ class IndicatorColumnTest(test.TestCase):
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
         self.assertAllClose([[2. + 3.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = get_keras_linear_model_predictions(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+      with _initialized_session():
+        # All should be zero-initialized.
+        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
+        self.assertAllClose([[0.]], predictions.eval())
+        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
+        self.assertAllClose([[2. + 3.]], predictions.eval())
+
   def test_input_layer(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -3727,8 +4647,8 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
@@ -3752,6 +4672,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -3774,20 +4695,21 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Create embedding_weights variable.
-    weight_collections = [ops.GraphKeys.GLOBAL_VARIABLES,
-                          ops.GraphKeys.MODEL_VARIABLES]
+    weight_collections = [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
+    ]
     state = embedding_column._create_state(weight_collections)
 
     # Provide sparse input and get dense result.
     embedding_lookup = embedding_column._get_dense_tensor(
         _LazyBuilder({
             'aaa': sparse_input
-        }),
-        state=state)
+        }), state=state)
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
@@ -4087,6 +5009,82 @@ class EmbeddingColumnTest(test.TestCase):
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
         self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # example 2, ids [], embedding[2] = [0, 0]
+        # example 3, ids [1], embedding[3] = [3, 5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+
   def test_input_layer(self):
     # Inputs.
     vocabulary_size = 3
@@ -4509,8 +5507,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
       self.assertAllEqual(embedding_values, embedding_var.eval())
@@ -4521,16 +5519,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
     # Inputs.
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
-    input_features = {
-        'aaa': input_a,
-        'bbb': input_b
-    }
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
 
     # Embedding variable.
     embedding_dimension = 2
@@ -4539,6 +5536,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4566,11 +5564,13 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='bbb', num_buckets=vocabulary_size)
     embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Create state.
-    weight_collections = [ops.GraphKeys.GLOBAL_VARIABLES,
-                          ops.GraphKeys.MODEL_VARIABLES]
+    weight_collections = [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
+    ]
     state = embedding_column_a._create_state(weight_collections)
 
     # Provide sparse input and get dense result.
@@ -4731,6 +5731,97 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
         self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights do not follow the column name. But this is a rare use
+      # case, and fixing it would add too much complexity to the code.
+      expected_var_names = (
+          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
   def _test_input_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
@@ -5016,6 +6107,101 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           weight_tensor.eval())
 
+  def test_keras_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(.5, 1., .1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  def test_keras_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError,
+                                   r'Dimensions.*are not compatible'):
+        get_keras_linear_model_predictions({
+            'ids':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=(0, 2, 1),
+                    dense_shape=(2, 2)),
+            'values':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                    values=(.5, 11., 1., .1),
+                    dense_shape=(2, 2))
+        }, (column,))
+
+  def test_keras_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,))
+      }, (column,))
+      with _initialized_session():
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          predictions.eval()
+
+  def test_keras_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 242cdff6f3..ec741d3265 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -694,7 +694,8 @@ class Layer(checkpointable.CheckpointableBase):
               self._dtype = input_list[0].dtype.base_dtype.name
             except AttributeError:
               pass
-          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+          if all(hasattr(x, 'get_shape') for x in input_list):
+            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
           self.build(input_shapes)
         try:
           # Note: not all sub-classes of Layer call Layer.__init__ (especially
-- 
GitLab


From 1093a54cb79be1dd606eee9ff27b718006ba9d63 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Wed, 4 Apr 2018 15:15:32 -0700
Subject: [PATCH 0271/1262] Iterate over a copy of dictionary keys when closing
 variable subscopes. Otherwise, we run into a "dictionary changed size during
 iteration" once in a while, as we are modifying the values in the dictionary
 during the iteration.

PiperOrigin-RevId: 191655599
---
 tensorflow/python/ops/variable_scope.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c35735ca65..e33085ba62 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1164,7 +1164,7 @@ class _VariableScopeStore(threading.local):
       self.variable_scopes_count[scope_name] = 1
 
   def close_variable_subscopes(self, scope_name):
-    for k in self.variable_scopes_count:
+    for k in list(self.variable_scopes_count.keys()):
       if not scope_name or k.startswith(scope_name + "/"):
         self.variable_scopes_count[k] = 0
 
-- 
GitLab


From b3081c19c7ae4966400cf5073043bfb65ed4f2b8 Mon Sep 17 00:00:00 2001
From: Daniel Zheng <danielzheng@google.com>
Date: Wed, 4 Apr 2018 15:28:01 -0700
Subject: [PATCH 0272/1262] Fix typo in `tf.reduce_mean` documentation.

The range of valid values for the `axis` argument should be `[-rank(input), rank(input))`, just like other reduction ops.

PiperOrigin-RevId: 191657767
---
 tensorflow/python/ops/math_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 39f40882db..b460ce5b95 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1521,7 +1521,7 @@ def reduce_mean(input_tensor,
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor)]`.
+      `[-rank(input_tensor), rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
-- 
GitLab


From e7ad6ec4267f1f79ee7d9f558c8a008746682959 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Wed, 4 Apr 2018 15:36:52 -0700
Subject: [PATCH 0273/1262] Remove dependency on absl to fix tf-nightly-gpu
 windows GPU build

PiperOrigin-RevId: 191659091
---
 tensorflow/stream_executor/BUILD                | 1 -
 tensorflow/stream_executor/cuda/cuda_dnn.cc     | 7 +++----
 tensorflow/stream_executor/cuda/cudnn_version.h | 7 ++++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 27cdb860fe..1913fc20ee 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -75,7 +75,6 @@ cc_library(
         ":stream_executor",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels:ops_util",
-        "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_cuda_is_configured([
         "//tensorflow/core:cuda",
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 1aea0485fd..f408c06f46 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
-#include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/env_var.h"
@@ -113,7 +112,7 @@ string ToString(libraryPropertyType type) {
     case PATCH_LEVEL:
       return "PATCH_LEVEL";
     default:
-      return absl::StrCat(
+      return port::StrCat(
           "<unknown libraryPropertyType: ", static_cast<int>(type), ">");
   }
 }
@@ -375,7 +374,7 @@ port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
   cudnnStatus_t status = cudnnGetProperty(type, value);
   if (status != CUDNN_STATUS_SUCCESS) {
     const string error =
-        absl::StrCat("cudnnGetProperty failed for type: ", ToString(type),
+        port::StrCat("cudnnGetProperty failed for type: ", ToString(type),
                      " with status: ", ToString(status));
     LOG(ERROR) << error;
     return port::Status{port::error::INTERNAL, error};
@@ -419,7 +418,7 @@ port::Status CudnnSupport::Init() {
     CudnnVersion loaded_version;
     TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&loaded_version));
     if (!IsSourceCompatibleWithCudnnLibrary(source_version, loaded_version)) {
-      const tensorflow::string error = absl::StrCat(
+      const tensorflow::string error = port::StrCat(
           "Loaded runtime CuDNN library: ", loaded_version.ToString(),
           " but source was compiled with: ", source_version.ToString(),
           ".  CuDNN library major and minor version needs to match or have "
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 058cc87bfa..2ed02e1700 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <string>
 
-#include "absl/strings/str_join.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace perftools {
 namespace gputools {
@@ -30,8 +30,9 @@ struct CudnnVersion {
   CudnnVersion(int major, int minor, int patch)
       : major_version(major), minor_version(minor), patch_level(patch) {}
 
-  std::string ToString() const {
-    return absl::StrJoin({major_version, minor_version, patch_level}, ".");
+  tensorflow::string ToString() const {
+    return tensorflow::strings::StrCat(major_version, ".", minor_version, ".",
+                                       patch_level);
   }
 
   int major_version;
-- 
GitLab


From 4cfb393b087dc50c150054531186ccb71882e2d0 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 4 Apr 2018 15:42:14 -0700
Subject: [PATCH 0274/1262] Adding Operation._control_outputs

PiperOrigin-RevId: 191659944
---
 tensorflow/python/client/tf_session.i         | 19 +++++++++++++++
 tensorflow/python/client/tf_session_helper.cc |  9 +++++++
 tensorflow/python/client/tf_session_helper.h  |  4 ++++
 tensorflow/python/framework/ops.py            | 24 +++++++++++++++++++
 tensorflow/python/framework/ops_test.py       |  2 ++
 5 files changed, 58 insertions(+)

diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 0c18d973a7..b82182d5d3 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -157,6 +157,25 @@ tensorflow::ImportNumpy();
   }
 }
 
+// We use TF_OperationGetControlOutputs_wrapper instead of
+// TF_OperationGetControlOutputs
+%ignore TF_OperationGetControlOutputs;
+%unignore TF_OperationGetControlOutputs_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetControlOutputs_wrapper;
+
+// Build a Python list of TF_Operation* and return it.
+%typemap(out) std::vector<TF_Operation*> tensorflow::TF_OperationGetControlOutputs_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation($1[i]));
+  }
+}
+
 %ignore TF_OperationOutputConsumers;
 %unignore TF_OperationOutputConsumers_wrapper;
 // See comment for "%noexception TF_SessionRun_wrapper;"
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index ca57abd712..b48d758e4a 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -550,6 +550,15 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper) {
+  std::vector<TF_Operation*> control_outputs(
+      TF_OperationNumControlOutputs(oper));
+  TF_OperationGetControlOutputs(oper, control_outputs.data(),
+                                control_outputs.size());
+  return control_outputs;
+}
+
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
     TF_Output oper_out) {
   int num_consumers = TF_OperationOutputNumConsumers(oper_out);
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 5416d41376..d2b4abc476 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -190,6 +190,10 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the control outputs of this operation.
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper);
+
 // Retrieves the op names of the consumers of `oper_out`. The returned strings
 // have the lifetime of the underlying TF_Graph.
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 0215501b56..2d55f98a1c 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2113,6 +2113,30 @@ class Operation(object):
     else:
       return self._control_inputs_val
 
+  @property
+  def _control_outputs(self):
+    """The `Operation` objects which have a control dependency on this op.
+
+    Before any of the ops in self._control_outputs can execute tensorflow will
+    ensure self has finished executing.
+
+    Returns:
+      A list of `Operation` objects.
+
+    """
+    if self._c_op:
+      control_c_ops = c_api.TF_OperationGetControlOutputs_wrapper(self._c_op)
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(
+              c_api.TF_OperationName(c_op)) for c_op in control_c_ops
+      ]
+      # pylint: enable=protected-access
+    else:
+      # TODO(apassos) this should be less inefficient.
+      return [o for o in self._graph.get_operations()
+              if self in o.control_inputs]
+
   @property
   def _control_inputs(self):
     logging.warning("Operation._control_inputs is private, use "
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index aa51391871..58bead91ed 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -473,6 +473,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x, x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, x, x, y, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testAddControlInputC(self):
     # The C API dedups redundant control edges, pure Python does not
@@ -487,6 +488,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testRemoveAllControlInputs(self):
     a = constant_op.constant(1)
-- 
GitLab


From c8e3d2b43e4cbf9a9e32567a2e59597916f5b0b9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 15:45:16 -0700
Subject: [PATCH 0275/1262] Adds commandline option (toco_compatible, bool)
 that makes the optimize_for_inference script only use TOCO friendly ops. In
 particular, FusedResizeAndPadConv2D is not supported by TOCO.

This change does not alter existing behavior (the boolean is set to false by default).

PiperOrigin-RevId: 191660378
---
 tensorflow/python/tools/optimize_for_inference.py    | 12 +++++++++++-
 .../python/tools/optimize_for_inference_lib.py       |  9 ++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/tools/optimize_for_inference.py b/tensorflow/python/tools/optimize_for_inference.py
index 902748d55e..dac6a06a89 100644
--- a/tensorflow/python/tools/optimize_for_inference.py
+++ b/tensorflow/python/tools/optimize_for_inference.py
@@ -87,7 +87,9 @@ def main(unused_args):
   output_graph_def = optimize_for_inference_lib.optimize_for_inference(
       input_graph_def,
       FLAGS.input_names.split(","),
-      FLAGS.output_names.split(","), FLAGS.placeholder_type_enum)
+      FLAGS.output_names.split(","),
+      FLAGS.placeholder_type_enum,
+      FLAGS.toco_compatible)
 
   if FLAGS.frozen_graph:
     f = gfile.FastGFile(FLAGS.output, "w")
@@ -138,6 +140,14 @@ def parse_args():
       type=int,
       default=dtypes.float32.as_datatype_enum,
       help="The AttrValue enum to use for placeholders.")
+  parser.add_argument(
+      "--toco_compatible",
+      type=bool,
+      default=False,
+      help="""\
+      If true, only use ops compatible with Tensorflow
+      Lite Optimizing Converter.\
+      """)
   return parser.parse_known_args()
 
 
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index 9c19271222..bb90d1cd6e 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -87,7 +87,7 @@ EPSILON_ATTR = {
 
 
 def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
-                           placeholder_type_enum):
+                           placeholder_type_enum, toco_compatible=False):
   """Applies a series of inference optimizations on the input graph.
 
   Args:
@@ -98,6 +98,8 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
       results.
     placeholder_type_enum: The AttrValue enum for the placeholder data type, or
         a list that specifies one value per input node name.
+    toco_compatible: Boolean, if True, only runs optimizations that result in
+      TOCO compatible graph operations (default=False).
 
   Returns:
     An optimized version of the input graph.
@@ -110,8 +112,9 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
   optimized_graph_def = graph_util.remove_training_nodes(
       optimized_graph_def, output_node_names)
   optimized_graph_def = fold_batch_norms(optimized_graph_def)
-  optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
-                                             output_node_names)
+  if not toco_compatible:
+    optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
+                                               output_node_names)
   ensure_graph_is_valid(optimized_graph_def)
   return optimized_graph_def
 
-- 
GitLab


From 1bba94af33aca56c3a2240302d80f44d65e6aa17 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 15:45:20 -0700
Subject: [PATCH 0276/1262] Compile TensorFlow with /arch:AVX on Windows

/arch:AVX is the correpsonding option in MSVC for gcc's --march=native

PiperOrigin-RevId: 191660389
---
 configure.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 0f52c0ec99..26eff5767e 100644
--- a/configure.py
+++ b/configure.py
@@ -484,6 +484,8 @@ def set_cc_opt_flags(environ_cp):
   if is_ppc64le():
     # gcc on ppc64le does not support -march, use mcpu instead
     default_cc_opt_flags = '-mcpu=native'
+  elif is_windows():
+    default_cc_opt_flags = '/arch:AVX'
   else:
     default_cc_opt_flags = '-march=native'
   question = ('Please specify optimization flags to use during compilation when'
@@ -494,7 +496,7 @@ def set_cc_opt_flags(environ_cp):
   for opt in cc_opt_flags.split():
     write_to_bazelrc('build:opt --copt=%s' % opt)
   # It should be safe on the same build host.
-  if not is_ppc64le():
+  if not is_ppc64le() and not is_windows():
     write_to_bazelrc('build:opt --host_copt=-march=native')
   write_to_bazelrc('build:opt --define with_default_optimizations=true')
   # TODO(mikecase): Remove these default defines once we are able to get
-- 
GitLab


From f034e6c457ab283dbe42b1bf561943c9fa5dffe7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 15:46:26 -0700
Subject: [PATCH 0277/1262] Internal change.

PiperOrigin-RevId: 191660588
---
 tensorflow/contrib/distributions/BUILD | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 514638ecbb..9799901483 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -502,12 +502,6 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 4,
-    tags = [
-        "manual",
-        "noasan",
-        "noguitar",
-        "optonly",
-    ],
 )
 
 cuda_py_test(
-- 
GitLab


From 7cee71e28e98bac613623feea19c4a51439e9a0a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 16:05:08 -0700
Subject: [PATCH 0278/1262] Enable constant propagation across Enter nodes, but
 only if is_constant is true. Don't propagate constants with control
 dependencies through Merge nodes.

PiperOrigin-RevId: 191663396
---
 .../grappler/optimizers/constant_folding.cc   | 35 +++++-----
 .../optimizers/constant_folding_test.cc       | 50 ++++++++++++--
 tensorflow/python/BUILD                       | 23 +++++++
 .../python/grappler/constant_folding_test.py  | 69 +++++++++++++++++++
 4 files changed, 153 insertions(+), 24 deletions(-)
 create mode 100644 tensorflow/python/grappler/constant_folding_test.py

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index dd522aa228..d941a0b3f9 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -773,7 +773,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   // the case of a merge node that propagate the first inputs that becomes
   // available, and therefore only requires a single constant input to be
   // foldable.
-  bool has_constant_input = false;
+  bool merge_has_constant_input = false;
   const bool is_merge = IsMerge(node);
   for (const auto& input : node.input()) {
     if (IsControlInput(input)) {
@@ -784,21 +784,20 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
       return false;
     }
     bool is_const = IsReallyConstant(*input_node);
-    if (!is_const && !is_merge) {
-      return false;
-    }
-    // Don't fold strings constants for now since this causes problems with
-    // checkpointing.
-    if (is_const && input_node->attr().at("dtype").type() == DT_STRING) {
+    if (is_const) {
+      // Don't fold strings constants for now since this causes problems with
+      // checkpointing.
+      if (input_node->attr().at("dtype").type() == DT_STRING) {
+        return false;
+      }
+      // Special case: If a Merge node has at least one constant input that
+      // does not depend on a control input, we can fold it.
+      merge_has_constant_input |= !HasControlInputs(*input_node);
+    } else if (!is_merge) {
       return false;
     }
-    has_constant_input |= is_const;
-  }
-  if (is_merge) {
-    return has_constant_input;
   }
-
-  return true;
+  return !is_merge || merge_has_constant_input;
 }
 
 namespace {
@@ -1714,9 +1713,11 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
     }
 
     // Move constants past Enter.
-    // TODO(rmlarsen): Reenable when we fix the root cause of b/76008022
-    if (opt_level_ == RewriterConfig::AGGRESSIVE && IsEnter(*node) &&
-        node->input_size() > 0) {
+    if (IsEnter(*node) && node->input_size() > 0) {
+      if (node->attr().count("is_constant") == 0 ||
+          !node->attr().at("is_constant").b()) {
+        continue;
+      }
       const string& node_name = node->name();
       const NodeDef* input = node_map_->GetNode(node->input(0));
       if (input != nullptr && IsReallyConstant(*input) &&
@@ -1745,7 +1746,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
           node_map_->AddOutput(node_name, new_node->name());
           for (NodeDef* consumer : consumers) {
             for (int i = 0; i < consumer->input_size(); ++i) {
-              if (consumer->input(i) == node_name) {
+              if (NodeName(consumer->input(i)) == node_name) {
                 node_map_->UpdateInput(consumer->name(), node_name,
                                        new_node->name());
                 consumer->set_input(i, new_node->name());
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 8d146637a6..71ee81dfde 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1256,6 +1256,10 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Merge m1(scope.WithOpName("m1"), {x, const1, const2});
   ops::Merge m2(scope.WithOpName("m2"), {const1, const3});
   ops::Merge m3(scope.WithOpName("m3"), {x, y});
+  // m4 is not foldable because the only constant input
+  // has a control input, so we cannot know if it will be
+  // triggered.
+  ops::Merge m4(scope.WithOpName("m4"), {x, const1});
 
   ops::Identity out1(scope.WithOpName("out1"), m1.output);
   ops::Identity idx1(scope.WithOpName("idx1"), m1.value_index);
@@ -1263,9 +1267,11 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Identity idx2(scope.WithOpName("idx2"), m2.value_index);
   ops::Identity out3(scope.WithOpName("out3"), m3.output);
   ops::Identity idx3(scope.WithOpName("idx3"), m3.value_index);
+  ops::Identity out4(scope.WithOpName("out4"), m4.output);
+  ops::Identity idx4(scope.WithOpName("idx4"), m4.value_index);
 
   GrapplerItem item;
-  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3"};
+  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3", "out4", "idx4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
   ConstantFolding optimizer(nullptr /* cpu_device */);
@@ -1273,6 +1279,7 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  EXPECT_EQ(19, output.node_size());
   int found_nodes = 0;
   for (const auto& node : output.node()) {
     if (node.name() == "out1") {
@@ -1309,10 +1316,18 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("m3:1", node.input(0));
       ++found_nodes;
+    } else if (node.name() == "out4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4:1", node.input(0));
+      ++found_nodes;
     }
   }
   // Make sure the graph contains all the nodes we're expecting.
-  EXPECT_EQ(6, found_nodes);
+  EXPECT_EQ(8, found_nodes);
 
   std::vector<string> fetch = {"out1", "idx1"};
   auto tensors = EvaluateNodes(output, fetch);
@@ -2320,6 +2335,10 @@ TEST_F(ConstantFoldingTest, Enter) {
   GrapplerItem item;
   AttrValue frame_name;
   frame_name.set_s("foo");
+  AttrValue is_constant_true;
+  is_constant_true.set_b(true);
+  AttrValue is_constant_false;
+  is_constant_false.set_b(false);
   AttrValue type;
   type.set_type(DT_FLOAT);
   AttrValue value;
@@ -2330,19 +2349,31 @@ TEST_F(ConstantFoldingTest, Enter) {
   GraphDef& graph = item.graph;
   AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
-  AddNode("enter1", "Enter", {"x"}, {{"T", type}, {"frame_name", frame_name}},
+  AddNode("enter1", "Enter", {"x"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
+          &graph);
+  AddNode("enter2", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
           &graph);
-  AddNode("enter2", "Enter", {"c1"}, {{"T", type}, {"frame_name", frame_name}},
+  AddNode("enter3", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_false}},
           &graph);
   AddNode("id1", "Identity", {"enter1"}, {{"T", type}}, &graph);
   AddNode("id2", "Identity", {"enter2"}, {{"T", type}}, &graph);
   AddNode("id3", "Identity", {"enter2"}, {{"T", type}}, &graph);
+  AddNode("id4", "Identity", {"enter3"}, {{"T", type}}, &graph);
   item.fetch.push_back("id1");
   item.fetch.push_back("id2");
   item.fetch.push_back("id3");
+  item.fetch.push_back("id4");
 
-  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
-                            nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -2351,7 +2382,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(7, output.node_size());
+  EXPECT_EQ(9, output.node_size());
   for (const NodeDef& node : output.node()) {
     if (node.name() == "id1") {
       EXPECT_EQ("Identity", node.op());
@@ -2363,6 +2394,11 @@ TEST_F(ConstantFoldingTest, Enter) {
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^enter2", node.input(0));
     }
+    if (node.name() == "id4") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("enter3", node.input(0));
+    }
   }
 }
 
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 57b0b78c82..a9363608b5 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4843,6 +4843,29 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "constant_folding_test",
+    size = "medium",
+    srcs = [
+        "grappler/constant_folding_test.py",
+    ],
+    additional_deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":functional_ops",
+        ":math_ops",
+        ":ops",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+    ],
+)
+
 cuda_py_test(
     name = "layout_optimizer_test",
     size = "medium",
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
new file mode 100644
index 0000000000..ab1d0ed25b
--- /dev/null
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Grappler Constant Folding."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ConstantFoldingTest(test.TestCase):
+
+  # See b/76008022.
+  def testScanInsideWhile(self):
+
+    def loop_cond(idx_step, *unused_args):
+      return idx_step < 1
+
+    def loop_body(idx_step, y):
+      x = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      x = functional_ops.scan(
+          math_ops.add,
+          x,
+          initializer=array_ops.zeros([20, 30], dtype=dtypes.float32),
+          back_prop=False,
+          parallel_iterations=1)
+
+      with ops.device('/cpu:0'):
+        y = array_ops.identity(x)
+
+        return idx_step + 1, y
+
+    if test.is_gpu_available(cuda_only=True):
+      init_y = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      _, y = control_flow_ops.while_loop(
+          loop_cond,
+          loop_body,
+          loop_vars=[0, init_y],
+          back_prop=False,
+          parallel_iterations=1)
+      with session.Session() as sess:
+        y_v = sess.run(y)
+        self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From e98c13c55e519cb70ede110cd8941f8cb75ab718 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 16:17:05 -0700
Subject: [PATCH 0279/1262] Running sparse_ops_test only in opt mode since the
 test is flaky (times out) in fastbuild mode.

PiperOrigin-RevId: 191665014
---
 tensorflow/python/kernel_tests/BUILD | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index a544e4fa6e..6c34ea1816 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2519,7 +2519,10 @@ cuda_py_test(
         "//tensorflow/python:sparse_ops",
     ],
     shard_count = 5,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",  # b/77589990
+    ],
 )
 
 cuda_py_test(
-- 
GitLab


From f8acfb01792886274778d9ad7a9d990cbef14141 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Wed, 4 Apr 2018 16:17:46 -0700
Subject: [PATCH 0280/1262] Fixed handling of control dependencies in the
 arithmethic optimizer

PiperOrigin-RevId: 191665098
---
 .../optimizers/arithmetic_optimizer.cc        | 110 ++++++++----------
 .../optimizers/arithmetic_optimizer.h         |  12 +-
 .../optimizers/arithmetic_optimizer_test.cc   |   9 +-
 .../optimizers/graph_optimizer_stage.h        |  10 +-
 .../optimizers/graph_optimizer_stage_test.cc  |  12 +-
 5 files changed, 64 insertions(+), 89 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 919f23fd98..59a5695af0 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
 #include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -290,21 +289,16 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 
   // TODO(ezhulenev): remove this method from ArithmeticOptimizer when all
   // optimizations will be migrated to stages
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep) {
-    const auto frame_it = ctx_.frame_map->find(old_node);
-    if (frame_it != ctx_.frame_map->end()) {
-      for (auto node : new_nodes) {
-        ctx_.frame_map->emplace(node, frame_it->second);
-      }
-      if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-        const string ctrl_dep = ConstantFolding::AddControlDependency(
-            source_for_ctrl_dep, ctx_.optimized_graph, ctx_.node_map);
-        for (auto node : sinks_for_control_dep) {
-          MaybeAddControlInput(ctrl_dep, node, ctx_.optimized_graph,
-                               ctx_.node_map);
+  void ForwardControlDependencies(
+      NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+    for (const auto& src : src_nodes) {
+      for (int i = src->input_size() - 1; i >= 0; --i) {
+        if (IsControlInput(src->input(i))) {
+          *target_node->add_input() = src->input(i);
+          ctx_.node_map->AddOutput(NodeName(src->input(i)),
+                                   target_node->name());
+        } else {
+          break;
         }
       }
     }
@@ -703,7 +697,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     CHECK(IsSupported(node));
 
     std::set<string> common_factors;
-    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors));
+    std::vector<string> ctrl_deps;
+    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors, &ctrl_deps));
 
     if (common_factors.size() == 1) {
       const string& common_factor = *common_factors.begin();
@@ -735,9 +730,11 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
           new_add_node->set_input(i, unique_factors[i]);
         }
 
-        // Add frame dependencies that the original node might have had.
-        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
-                            {new_add_node});
+        // Add control deps on add node
+        for (const string& ctrl_dep : ctrl_deps) {
+          *new_add_node->add_input() = ctrl_dep;
+          ctx_.node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
+        }
 
         // optimize new inner aggregation node
         AddToOptimizationQueue(new_add_node);
@@ -763,14 +760,16 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   }
 
   // Determine the set of common factors if the input nodes are all Mul nodes.
-  Status GetCommonFactors(const NodeDef* node,
-                          std::set<string>* common_factors) const {
+  Status GetCommonFactors(const NodeDef* node, std::set<string>* common_factors,
+                          std::vector<string>* ctrl_deps) const {
     CHECK(common_factors->empty());
 
     for (int i = 0; i < node->input_size(); ++i) {
       if (i > 0 && common_factors->empty()) break;
-      if (IsControlInput(node->input(i))) break;
-
+      if (IsControlInput(node->input(i))) {
+        ctrl_deps->push_back(node->input(i));
+        continue;
+      }
       NodeDef* input;
       TF_RETURN_IF_ERROR(GetInputNode(node->input(i), &input));
 
@@ -790,6 +789,9 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
             std::inserter(intersection, intersection.begin()));
         std::swap(*common_factors, intersection);
       }
+      for (int i = 2; i < input->input_size(); ++i) {
+        ctrl_deps->push_back(input->input(i));
+      }
     }
     return Status::OK();
   }
@@ -1275,20 +1277,15 @@ void ArithmeticOptimizer::DedupComputations() {
   }
 }
 
-void ArithmeticOptimizer::AddFrameControlDeps(
-    const NodeDef* old_node, const std::vector<NodeDef*>& new_nodes,
-    const string& source_for_ctrl_dep,
-    const std::vector<NodeDef*>& sinks_for_control_dep) {
-  const auto frame_it = frame_map_.find(old_node);
-  if (frame_it != frame_map_.end()) {
-    for (auto node : new_nodes) {
-      frame_map_.emplace(node, frame_it->second);
-    }
-    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-      const string ctrl_dep = ConstantFolding::AddControlDependency(
-          source_for_ctrl_dep, optimized_graph_, node_map_.get());
-      for (auto node : sinks_for_control_dep) {
-        MaybeAddControlInput(ctrl_dep, node, optimized_graph_, node_map_.get());
+void ArithmeticOptimizer::ForwardControlDependencies(
+    NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+  for (const auto& src : src_nodes) {
+    for (int i = src->input_size() - 1; i >= 0; --i) {
+      if (IsControlInput(src->input(i))) {
+        *target_node->add_input() = src->input(i);
+        node_map_->AddOutput(NodeName(src->input(i)), target_node->name());
+      } else {
+        break;
       }
     }
   }
@@ -1408,10 +1405,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
           node_map_->AddOutput(new_transpose->name(), new_cast->name());
 
           nodes_to_simplify->PushBack(new_transpose);
-          //  Add frame dependencies that the original node might have had.
-          AddFrameControlDeps(node, {new_transpose, new_cast},
-                              new_transpose->input(0), {new_transpose});
-
+          ForwardControlDependencies(new_transpose, {cast, node});
           return new_cast->name();
         }
       }
@@ -1485,7 +1479,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
             node_map_->AddOutput(weights->name(), scaled_weights->name());
             scaled_weights->add_input(mul->input(1));
             node_map_->AddOutput(scale->name(), scaled_weights->name());
-            AddFrameControlDeps(node, {scaled_weights}, "", {});
+            ForwardControlDependencies(scaled_weights, {source});
 
             // Update `conv`'s weights to `scaled_weights`.
             conv->set_input(1, scaled_weights->name());
@@ -1521,7 +1515,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-    // Discard aggregate nodes with a single input.
+    // Discard aggregate nodes with a single input and no control dependencies.
     if (node->input_size() == 1) {
       return node->input(0);
     }
@@ -1567,6 +1561,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         return "";
       }
       new_const_node->set_device(node->device());
+      MaybeAddControlInput(NodeName(node->input(0)), new_const_node,
+                           optimized_graph_, node_map_.get());
       nodes_to_simplify->PushBack(new_const_node);
 
       // 2. Replace the aggregate node with Mul(Const(N), x).
@@ -1579,9 +1575,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
       new_mul_node->add_input(node->input(0));
       node_map_->AddOutput(node->input(0), new_mul_node->name());
 
-      CopyControlInputs(*node, new_mul_node, optimized_graph_, node_map_.get());
-      AddFrameControlDeps(node, {new_const_node, new_mul_node}, node->input(0),
-                          {new_const_node});
+      ForwardControlDependencies(new_mul_node, {node});
       return new_mul_node->name();
     }
   }
@@ -1614,7 +1608,6 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_a, new_op);
         new_op->set_input(0, a->input(0));
         node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
-        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op});
       }
       if (b_is_foldable) {
         const string attr_b =
@@ -1622,10 +1615,15 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_b, new_op);
         new_op->set_input(1, b->input(0));
         node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
-        if (!a_is_foldable) {
-          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op});
-        }
       }
+      std::vector<const NodeDef*> deps_to_forward({node});
+      if (a_is_foldable) {
+        deps_to_forward.push_back(a);
+      }
+      if (b_is_foldable) {
+        deps_to_forward.push_back(b);
+      }
+      ForwardControlDependencies(new_op, deps_to_forward);
     }
   }
 
@@ -1647,7 +1645,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                                                        : "Transpose");
       new_op->set_input(0, input->input(0));
       node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
-      AddFrameControlDeps(node, {new_op}, "", {});
+      ForwardControlDependencies(new_op, {node, input});
       return new_op->name();
     }
   }
@@ -1663,8 +1661,7 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
   }
 
   const GraphOptimizerContext ctx(&nodes_to_preserve_, optimized_graph_,
-                                  graph_properties_.get(), node_map_.get(),
-                                  &frame_map_);
+                                  graph_properties_.get(), node_map_.get());
   const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
 
   // Stop pipeline after first stage returning non-empty simplified tensor name.
@@ -1764,11 +1761,6 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   graph_properties_.reset(new GraphProperties(item));
   TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
 
-  // Identify loop frames
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map_, &num_frames));
-
   // Perform the optimizations.
   TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 63a7b55893..7e81ed0a1f 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -100,13 +99,9 @@ class ArithmeticOptimizer : public GraphOptimizer {
   // Dedup redundant nodes in the graph.
   void DedupComputations();
 
-  // Fix frame dependencies by adding control dependencies from old_input to
-  // nodes in new_nodes_for_control_dep, and update frame_map for all nodes in
-  // new_nodes.
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep);
+  // Forward the control dependencies anchored on src_nodes to the target_nodes.
+  void ForwardControlDependencies(NodeDef* target_node,
+                                  const std::vector<const NodeDef*>& src_nodes);
 
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
@@ -135,7 +130,6 @@ class ArithmeticOptimizer : public GraphOptimizer {
   bool fetch_nodes_known_ = false;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
   std::unique_ptr<GraphProperties> graph_properties_;
   GraphDef* optimized_graph_ = nullptr;  // Not owned.
 };
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 48f1dd5aa1..e117341ba3 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -520,26 +520,23 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 
   const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6"));
   ASSERT_NE(add_6_node, nullptr);
-  EXPECT_EQ(3, add_6_node->input_size());
+  EXPECT_EQ(2, add_6_node->input_size());
   EXPECT_EQ(HoistAddName("Add_4"), add_6_node->input(0));
   EXPECT_EQ(HoistAddName("Add_5"), add_6_node->input(1));
-  EXPECT_EQ("^Placeholder", add_6_node->input(2));
 
   const NodeDef* add_4_node = node_map.GetNode(HoistAddName("Add_4"));
   ASSERT_NE(add_4_node, nullptr);
   EXPECT_EQ("Add", add_4_node->op());
-  EXPECT_EQ(3, add_4_node->input_size());
+  EXPECT_EQ(2, add_4_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_4_node->input(0));
   EXPECT_EQ(OptimizedName("Add_1_const"), add_4_node->input(1));
-  EXPECT_EQ("^Placeholder", add_4_node->input(2));
 
   const NodeDef* add_5_node = node_map.GetNode(HoistAddName("Add_5"));
   ASSERT_NE(add_5_node, nullptr);
   EXPECT_EQ("Add", add_5_node->op());
-  EXPECT_EQ(3, add_5_node->input_size());
+  EXPECT_EQ(2, add_5_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_5_node->input(0));
   EXPECT_EQ(OptimizedName("Add_1_const"), add_5_node->input(1));
-  EXPECT_EQ("^Placeholder", add_5_node->input(2));
 
   const NodeDef* add_const_node = node_map.GetNode(OptimizedName("Add_const"));
   ASSERT_NE(add_const_node, nullptr);
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 8d3e965c57..7ed0474861 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -45,21 +44,16 @@ const NodeScopeAndName ParseNodeScopeAndName(const string& node_name);
 struct GraphOptimizerContext {
   GraphOptimizerContext(const std::unordered_set<string>* nodes_to_preserve,
                         GraphDef* optimized_graph,
-                        GraphProperties* graph_properties, NodeMap* node_map,
-                        FrameMap* frame_map)
+                        GraphProperties* graph_properties, NodeMap* node_map)
       : nodes_to_preserve(nodes_to_preserve),
         optimized_graph(optimized_graph),
         graph_properties(graph_properties),
-        node_map(node_map),
-        frame_map(frame_map) {}
+        node_map(node_map) {}
 
   const std::unordered_set<string>* nodes_to_preserve;
   GraphDef* optimized_graph;
   GraphProperties* graph_properties;
   NodeMap* node_map;
-  // TODO(ezhulenev): it seems that frame_map is only relevant for loop
-  // optimizer? Move it to loop-optimizer specific context extension.
-  FrameMap* frame_map;
 };
 
 Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
index 416327e622..3f5ab87a5a 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
@@ -58,8 +58,8 @@ TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InScope) {
 TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ nullptr,
-                            /*graph_properties*/ nullptr, /*node_name*/ nullptr,
-                            /*frame_map*/ nullptr);
+                            /*graph_properties*/ nullptr,
+                            /*node_name*/ nullptr);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   const auto node = ParseNodeScopeAndName("a/b/c/Add");
@@ -94,8 +94,7 @@ TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
-                            /*node_name*/ &node_map,
-                            /*frame_map*/ nullptr);
+                            /*node_name*/ &node_map);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   NodeDef* add_node;
@@ -134,8 +133,7 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
-                            /*node_name*/ &node_map,
-                            /*frame_map*/ nullptr);
+                            /*node_name*/ &node_map);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   NodeDef* add_node;
@@ -165,4 +163,4 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
 
 }  // namespace
 }  // end namespace grappler
-}  // end namespace tensorflow
\ No newline at end of file
+}  // end namespace tensorflow
-- 
GitLab


From d107fee1e4a9a4462f01564798d345802acc2aef Mon Sep 17 00:00:00 2001
From: Frank Chen <frankchn@google.com>
Date: Wed, 4 Apr 2018 16:26:25 -0700
Subject: [PATCH 0281/1262] Check that n + kBlockTrailerSize does not overflow
 before reading a block

PiperOrigin-RevId: 191666300
---
 tensorflow/core/lib/io/format.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/core/lib/io/format.cc b/tensorflow/core/lib/io/format.cc
index 64852943ad..0c24c660a2 100644
--- a/tensorflow/core/lib/io/format.cc
+++ b/tensorflow/core/lib/io/format.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <limits>
+
 #include "tensorflow/core/lib/io/format.h"
 
 #include "tensorflow/core/lib/core/coding.h"
@@ -84,6 +86,11 @@ Status ReadBlock(RandomAccessFile* file, const BlockHandle& handle,
   // Read the block contents as well as the type/crc footer.
   // See table_builder.cc for the code that built this structure.
   size_t n = static_cast<size_t>(handle.size());
+
+  if (kBlockTrailerSize > std::numeric_limits<size_t>::max() - n) {
+    return errors::DataLoss("handle.size() too big");
+  }
+
   char* buf = new char[n + kBlockTrailerSize];
   StringPiece contents;
   Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
-- 
GitLab


From 2af88f3e2114ce28fddd2f512477db020d34407a Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Wed, 4 Apr 2018 16:34:54 -0700
Subject: [PATCH 0282/1262] Address bug in distributed strategies `Monitor` to
 allow running for >1 step.

PiperOrigin-RevId: 191667378
---
 tensorflow/contrib/distribute/python/monitor.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
index fe80bb4df5..7644acedc9 100644
--- a/tensorflow/contrib/distribute/python/monitor.py
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
 from tensorflow.python.ops import variables
 
 
@@ -55,7 +56,9 @@ class Monitor(object):
 
   def run_steps(self, num_steps=None):
     step = 0
-    done = False
-    while done is not None and (num_steps is None or step < num_steps):
-      done = self._run_step()
-      step += 1
+    while num_steps is None or step < num_steps:
+      try:
+        self._run_step()
+        step += 1
+      except errors.OutOfRangeError:
+        break
-- 
GitLab


From faeebb7daef6d1fdd0e4eb3a3e0afedcd2d3350d Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Wed, 4 Apr 2018 16:53:51 -0700
Subject: [PATCH 0283/1262] Replace trivial backend calls with calls to
 underlying TensorFlow functions - Part 2

PiperOrigin-RevId: 191669725
---
 .../python/keras/_impl/keras/activations.py   | 13 +++---
 .../python/keras/_impl/keras/constraints.py   |  9 ++--
 .../_impl/keras/engine/training_utils.py      |  3 +-
 .../keras/layers/advanced_activations.py      |  5 +-
 .../python/keras/_impl/keras/layers/core.py   |  9 ++--
 .../keras/_impl/keras/layers/core_test.py     |  4 +-
 .../keras/_impl/keras/layers/embeddings.py    |  2 +-
 .../python/keras/_impl/keras/layers/merge.py  | 46 +++++++++++--------
 .../python/keras/_impl/keras/layers/noise.py  |  2 +-
 .../keras/_impl/keras/layers/recurrent.py     |  6 ++-
 .../keras/_impl/keras/layers/wrappers.py      |  4 +-
 tensorflow/python/keras/_impl/keras/losses.py | 33 +++++++------
 .../python/keras/_impl/keras/metrics.py       | 22 +++++----
 .../python/keras/_impl/keras/metrics_test.py  |  5 +-
 .../python/keras/_impl/keras/optimizers.py    | 31 +++++++------
 .../python/keras/_impl/keras/regularizers.py  |  4 +-
 16 files changed, 111 insertions(+), 87 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 74ec373ea5..b518898ad8 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -24,6 +24,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.layers.base import Layer
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -44,9 +45,9 @@ def softmax(x, axis=-1):
   """
   ndim = K.ndim(x)
   if ndim == 2:
-    return K.softmax(x)
+    return nn.softmax(x)
   elif ndim > 2:
-    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
+    e = math_ops.exp(x - math_ops.reduce_max(x, axis=axis, keepdims=True))
     s = math_ops.reduce_sum(e, axis=axis, keepdims=True)
     return e / s
   else:
@@ -80,12 +81,12 @@ def selu(x):
 
 @tf_export('keras.activations.softplus')
 def softplus(x):
-  return K.softplus(x)
+  return nn.softplus(x)
 
 
 @tf_export('keras.activations.softsign')
 def softsign(x):
-  return K.softsign(x)
+  return nn.softsign(x)
 
 
 @tf_export('keras.activations.relu')
@@ -95,12 +96,12 @@ def relu(x, alpha=0., max_value=None):
 
 @tf_export('keras.activations.tanh')
 def tanh(x):
-  return K.tanh(x)
+  return nn.tanh(x)
 
 
 @tf_export('keras.activations.sigmoid')
 def sigmoid(x):
-  return K.sigmoid(x)
+  return nn.sigmoid(x)
 
 
 @tf_export('keras.activations.hard_sigmoid')
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index aac4d0f1e9..abe95d8e0c 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -67,7 +67,7 @@ class MaxNorm(Constraint):
 
   def __call__(self, w):
     norms = K.sqrt(
-        math_ops.reduce_sum(K.square(w), axis=self.axis, keepdims=True))
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
     return w * (desired / (K.epsilon() + norms))
 
@@ -81,7 +81,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    return w * math_ops.cast(K.greater_equal(w, 0.), K.floatx())
+    return w * math_ops.cast(math_ops.greater_equal(w, 0.), K.floatx())
 
 
 @tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
@@ -108,7 +108,8 @@ class UnitNorm(Constraint):
   def __call__(self, w):
     return w / (
         K.epsilon() + K.sqrt(
-            math_ops.reduce_sum(K.square(w), axis=self.axis, keepdims=True)))
+            math_ops.reduce_sum(
+                math_ops.square(w), axis=self.axis, keepdims=True)))
 
   def get_config(self):
     return {'axis': self.axis}
@@ -152,7 +153,7 @@ class MinMaxNorm(Constraint):
 
   def __call__(self, w):
     norms = K.sqrt(
-        math_ops.reduce_sum(K.square(w), axis=self.axis, keepdims=True))
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = (
         self.rate * K.clip(norms, self.min_value, self.max_value) +
         (1 - self.rate) * norms)
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 58d2c78aad..a3fc8ef2a0 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -451,7 +451,8 @@ def weighted_masked_objective(fn):
       weight_ndim = K.ndim(weights)
       score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
       score_array *= weights
-      score_array /= K.mean(math_ops.cast(K.not_equal(weights, 0), K.floatx()))
+      score_array /= K.mean(
+          math_ops.cast(math_ops.not_equal(weights, 0), K.floatx()))
     return K.mean(score_array)
 
   return weighted
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index 45b0c6c91a..11ca89d625 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -147,7 +147,7 @@ class PReLU(Layer):
     if K.backend() == 'theano':
       neg = (
           K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - K.abs(inputs)) * 0.5)
+          (inputs - math_ops.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
@@ -233,7 +233,8 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * math_ops.cast(K.greater(inputs, self.theta), K.floatx())
+    return inputs * math_ops.cast(
+        math_ops.greater(inputs, self.theta), K.floatx())
 
   def get_config(self):
     config = {'theta': float(self.theta)}
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index a709a079fd..c74fc1e4c0 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -77,11 +77,11 @@ class Masking(Layer):
     self.mask_value = mask_value
 
   def compute_mask(self, inputs, mask=None):
-    return K.any(K.not_equal(inputs, self.mask_value), axis=-1)
+    return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
 
   def call(self, inputs):
     boolean_mask = K.any(
-        K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
+        math_ops.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
     return inputs * math_ops.cast(boolean_mask, inputs.dtype)
 
   def compute_output_shape(self, input_shape):
@@ -416,7 +416,8 @@ class Reshape(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.reshape(inputs, (array_ops.shape(inputs)[0],) + self.target_shape)
+    return array_ops.reshape(inputs,
+                             (array_ops.shape(inputs)[0],) + self.target_shape)
 
   def get_config(self):
     config = {'target_shape': self.target_shape}
@@ -469,7 +470,7 @@ class Permute(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.permute_dimensions(inputs, (0,) + self.dims)
+    return array_ops.transpose(inputs, perm=(0,) + self.dims)
 
   def get_config(self):
     config = {'dims': self.dims}
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 2ca816adbd..551d1b1c3a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -159,7 +160,7 @@ class CoreLayersTest(test.TestCase):
 
     # test with lambda
     ld = keras.layers.Lambda(
-        lambda x: keras.backend.concatenate([keras.backend.square(x), x]))
+        lambda x: keras.backend.concatenate([math_ops.square(x), x]))
     config = ld.get_config()
     ld = keras.layers.Lambda.from_config(config)
 
@@ -235,4 +236,3 @@ class CoreLayersTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index a0fd7a9637..540e2d945c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -128,7 +128,7 @@ class Embedding(Layer):
     if not self.mask_zero:
       return None
     else:
-      return K.not_equal(inputs, 0)
+      return math_ops.not_equal(inputs, 0)
 
   @shape_type_conversion
   def compute_output_shape(self, input_shape):
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 6290db29a7..7c87e6c067 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -24,6 +24,8 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -128,7 +130,7 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           for _ in range(max_ndim - x_ndim):
-            x = K.expand_dims(x, 1)
+            x = array_ops.expand_dims(x, axis=1)
           reshaped_inputs.append(x)
         return self._merge_function(reshaped_inputs)
       else:
@@ -140,17 +142,20 @@ class _Merge(Layer):
           if x_ndim is None:
             x_shape = array_ops.shape(x)
             batch_size = x_shape[0]
-            new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
-            x_transposed = K.reshape(x,
-                                     K.stack([batch_size,
-                                              K.prod(x_shape[1:])]))
-            x_transposed = K.permute_dimensions(x_transposed, (1, 0))
-            x_transposed = K.reshape(x_transposed, new_shape)
+            new_shape = K.concatenate(
+                [x_shape[1:],
+                 array_ops.expand_dims(batch_size, axis=-1)])
+            x_transposed = array_ops.reshape(
+                x,
+                array_ops.stack(
+                    [batch_size, math_ops.reduce_prod(x_shape[1:])], axis=0))
+            x_transposed = array_ops.transpose(x_transposed, perm=(1, 0))
+            x_transposed = array_ops.reshape(x_transposed, new_shape)
             reshaped_inputs.append(x_transposed)
             transposed = True
           elif x_ndim > 1:
             dims = list(range(1, x_ndim)) + [0]
-            reshaped_inputs.append(K.permute_dimensions(x, dims))
+            reshaped_inputs.append(array_ops.transpose(x, perm=dims))
             transposed = True
           else:
             # We don't transpose inputs if they are 1D vectors or scalars.
@@ -163,14 +168,15 @@ class _Merge(Layer):
             y_shape = array_ops.shape(y)
             y_ndim = array_ops.shape(y_shape)[0]
             batch_size = y_shape[y_ndim - 1]
-            new_shape = K.concatenate(
-                [K.expand_dims(batch_size), y_shape[:y_ndim - 1]])
-            y = K.reshape(y, (-1, batch_size))
-            y = K.permute_dimensions(y, (1, 0))
-            y = K.reshape(y, new_shape)
+            new_shape = K.concatenate([
+                array_ops.expand_dims(batch_size, axis=-1), y_shape[:y_ndim - 1]
+            ])
+            y = array_ops.reshape(y, (-1, batch_size))
+            y = array_ops.transpose(y, perm=(1, 0))
+            y = array_ops.reshape(y, new_shape)
           elif y_ndim > 1:
             dims = [y_ndim - 1] + list(range(y_ndim - 1))
-            y = K.permute_dimensions(y, dims)
+            y = array_ops.transpose(y, perm=dims)
         return y
     else:
       return self._merge_function(inputs)
@@ -208,7 +214,7 @@ class _Merge(Layer):
                        'should have the same length.')
     if all([m is None for m in mask]):
       return None
-    masks = [K.expand_dims(m, 0) for m in mask if m is not None]
+    masks = [array_ops.expand_dims(m, axis=0) for m in mask if m is not None]
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
@@ -326,7 +332,7 @@ class Maximum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.maximum(output, inputs[i])
+      output = math_ops.maximum(output, inputs[i])
     return output
 
 
@@ -341,7 +347,7 @@ class Minimum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.minimum(output, inputs[i])
+      output = math_ops.minimum(output, inputs[i])
     return output
 
 
@@ -422,7 +428,7 @@ class Concatenate(_Merge):
         masks.append(array_ops.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
-        masks.append(K.expand_dims(mask_i))
+        masks.append(array_ops.expand_dims(mask_i, axis=-1))
       else:
         masks.append(mask_i)
     concatenated = K.concatenate(masks, axis=self.axis)
@@ -512,8 +518,8 @@ class Dot(_Merge):
         else:
           axes.append(self.axes[i])
     if self.normalize:
-      x1 = K.l2_normalize(x1, axis=axes[0])
-      x2 = K.l2_normalize(x2, axis=axes[1])
+      x1 = nn.l2_normalize(x1, axis=axes[0])
+      x2 = nn.l2_normalize(x2, axis=axes[1])
     output = K.batch_dot(x1, x2, axes)
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 4366b654f2..72dc7a1ff8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -166,7 +166,7 @@ class AlphaDropout(Layer):
         scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
 
-        kept_idx = K.greater_equal(
+        kept_idx = math_ops.greater_equal(
             K.random_uniform(noise_shape, seed=seed), rate)
         kept_idx = math_ops.cast(kept_idx, K.floatx())
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index bd7c42e63e..7f9f77c296 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -510,7 +510,8 @@ class RNN(Layer):
     # shape of initial_state = (samples, timesteps, input_dim)
     initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
     # shape of initial_state = (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     if hasattr(self.cell.state_size, '__len__'):
       return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
     else:
@@ -2357,7 +2358,8 @@ class Recurrent(Layer):
     # shape of initial_state = (samples, timesteps, input_dim)
     initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
     # shape of initial_state = (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     initial_state = K.tile(initial_state, [1,
                                            self.units])  # (samples, output_dim)
     initial_state = [initial_state for _ in range(len(self.states))]
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 12f33614e2..c510e464ae 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -214,7 +214,7 @@ class TimeDistributed(Wrapper):
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
       input_uid = tf_layers_util.object_list_uid(inputs)
-      inputs = K.reshape(inputs, (-1,) + input_shape[2:])
+      inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
       y = self.layer.call(inputs, **kwargs)
@@ -222,7 +222,7 @@ class TimeDistributed(Wrapper):
         uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
       output_shape = self.compute_output_shape(input_shape).as_list()
-      y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
+      y = array_ops.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
 
     # Apply activity regularizer if any:
     if (hasattr(self.layer, 'activity_regularizer') and
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 859bda0c9d..1d634d3801 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -25,51 +25,54 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.mean_squared_error',
            'keras.losses.mean_squared_error')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(K.square(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_error',
            'keras.losses.mean_absolute_error')
 def mean_absolute_error(y_true, y_pred):
-  return K.mean(K.abs(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_percentage_error',
            'keras.losses.mean_absolute_percentage_error')
 def mean_absolute_percentage_error(y_true, y_pred):
-  diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
+  diff = math_ops.abs(
+      (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
 @tf_export('keras.metrics.mean_squared_logarithmic_error',
            'keras.losses.mean_squared_logarithmic_error')
 def mean_squared_logarithmic_error(y_true, y_pred):
-  first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
-  second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(K.square(first_log - second_log), axis=-1)
+  first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
+  second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
+  return K.mean(math_ops.square(first_log - second_log), axis=-1)
 
 
 @tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
-  return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
+  return K.mean(
+      math_ops.square(math_ops.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
 @tf_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
-  return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
+  return K.mean(math_ops.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
 @tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
   pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
-  neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(0., neg - pos + 1.)
+  neg = math_ops.reduce_max((1. - y_true) * y_pred, axis=-1)
+  return math_ops.maximum(0., neg - pos + 1.)
 
 
 @tf_export('keras.losses.logcosh')
@@ -90,7 +93,7 @@ def logcosh(y_true, y_pred):
   """
 
   def _logcosh(x):
-    return x + K.softplus(-2. * x) - K.log(2.)
+    return x + nn.softplus(-2. * x) - math_ops.log(2.)
 
   return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
@@ -118,18 +121,18 @@ def binary_crossentropy(y_true, y_pred):
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
-  return math_ops.reduce_sum(y_true * K.log(y_true / y_pred), axis=-1)
+  return math_ops.reduce_sum(y_true * math_ops.log(y_true / y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
-  return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
+  return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
 @tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
 def cosine_proximity(y_true, y_pred):
-  y_true = K.l2_normalize(y_true, axis=-1)
-  y_pred = K.l2_normalize(y_pred, axis=-1)
+  y_true = nn.l2_normalize(y_true, axis=-1)
+  y_pred = nn.l2_normalize(y_pred, axis=-1)
   return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 24192cf5a1..747c3e6515 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -38,39 +38,45 @@ from tensorflow.python.keras._impl.keras.losses import squared_hinge
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred):
-  return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
+  return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
 
 
 @tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
   return math_ops.cast(
-      K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
+      math_ops.equal(
+          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
+      K.floatx())
 
 
 def sparse_categorical_accuracy(y_true, y_pred):
   return math_ops.cast(
-      K.equal(
-          K.max(y_true, axis=-1),
-          math_ops.cast(K.argmax(y_pred, axis=-1), K.floatx())), K.floatx())
+      math_ops.equal(
+          math_ops.reduce_max(y_true, axis=-1),
+          math_ops.cast(math_ops.argmax(y_pred, axis=-1), K.floatx())),
+      K.floatx())
 
 
 @tf_export('keras.metrics.top_k_categorical_accuracy')
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
+  return K.mean(
+      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
 
 
 @tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(
-      K.in_top_k(y_pred, math_ops.cast(K.max(y_true, axis=-1), 'int32'), k),
+      nn.in_top_k(y_pred,
+                  math_ops.cast(math_ops.reduce_max(y_true, axis=-1), 'int32'),
+                  k),
       axis=-1)
 
-
 # Aliases
 
 mse = MSE = mean_squared_error
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py
index 2b73e0c16f..9deaab0c05 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/_impl/keras/metrics_test.py
@@ -107,9 +107,8 @@ class KerasMetricsTest(test.TestCase):
                 completion of the batch.
         """
         y_true = math_ops.cast(y_true, 'int32')
-        y_pred = math_ops.cast(keras.backend.round(y_pred), 'int32')
-        correct_preds = math_ops.cast(
-            keras.backend.equal(y_pred, y_true), 'int32')
+        y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
+        correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
         true_pos = math_ops.cast(
             math_ops.reduce_sum(correct_preds * y_true), 'int32')
         current_true_pos = self.true_positives * 1
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index dc0e472b88..9f383deb72 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -119,7 +119,8 @@ class Optimizer(object):
                        'Common ops without gradient: '
                        'K.argmax, K.round, K.eval.')
     if hasattr(self, 'clipnorm') and self.clipnorm > 0:
-      norm = K.sqrt(sum([math_ops.reduce_sum(K.square(g)) for g in grads]))
+      norm = K.sqrt(
+          sum([math_ops.reduce_sum(math_ops.square(g)) for g in grads]))
       grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
     if hasattr(self, 'clipvalue') and self.clipvalue > 0:
       grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
@@ -288,7 +289,7 @@ class RMSprop(Optimizer):
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
       self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
@@ -349,7 +350,7 @@ class Adagrad(Optimizer):
                                                 K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
-      new_a = a + K.square(g)  # update accumulator
+      new_a = a + math_ops.square(g)  # update accumulator
       self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
@@ -414,7 +415,7 @@ class Adadelta(Optimizer):
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
       self.updates.append(state_ops.assign(a, new_a))
 
       # use the new accumulator and the *old* delta_accumulator
@@ -428,7 +429,7 @@ class Adadelta(Optimizer):
       self.updates.append(state_ops.assign(p, new_p))
 
       # update delta_accumulator
-      new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
+      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
       self.updates.append(state_ops.assign(d_a, new_d_a))
     return self.updates
 
@@ -494,7 +495,8 @@ class Adam(Optimizer):
 
     t = math_ops.cast(self.iterations, K.floatx()) + 1
     lr_t = lr * (
-        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
+        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
+        (1. - math_ops.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
@@ -506,9 +508,9 @@ class Adam(Optimizer):
 
     for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
       if self.amsgrad:
-        vhat_t = K.maximum(vhat, v_t)
+        vhat_t = math_ops.maximum(vhat, v_t)
         p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
         self.updates.append(state_ops.assign(vhat, vhat_t))
       else:
@@ -583,7 +585,7 @@ class Adamax(Optimizer):
                                                 K.dtype(self.decay))))
 
     t = math_ops.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr / (1. - K.pow(self.beta_1, t))
+    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
     # zero init of 1st moment
@@ -595,7 +597,7 @@ class Adamax(Optimizer):
     for p, g, m, u in zip(params, grads, ms, us):
 
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      u_t = K.maximum(self.beta_2 * u, K.abs(g))
+      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
       p_t = p - lr_t * m_t / (u_t + self.epsilon)
 
       self.updates.append(state_ops.assign(m, m_t))
@@ -666,10 +668,11 @@ class Nadam(Optimizer):
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
-        1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
+        1. - 0.5 *
+        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
     momentum_cache_t_1 = self.beta_1 * (
         1. - 0.5 *
-        (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
+        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
     m_schedule_new = self.m_schedule * momentum_cache_t
     m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
     self.updates.append((self.m_schedule, m_schedule_new))
@@ -685,8 +688,8 @@ class Nadam(Optimizer):
       g_prime = g / (1. - m_schedule_new)
       m_t = self.beta_1 * m + (1. - self.beta_1) * g
       m_t_prime = m_t / (1. - m_schedule_next)
-      v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
-      v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
+      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
+      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
       m_t_bar = (
           1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
 
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index fdb9d33810..74c37d370e 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -56,9 +56,9 @@ class L1L2(Regularizer):
   def __call__(self, x):
     regularization = 0.
     if self.l1:
-      regularization += math_ops.reduce_sum(self.l1 * K.abs(x))
+      regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
     if self.l2:
-      regularization += math_ops.reduce_sum(self.l2 * K.square(x))
+      regularization += math_ops.reduce_sum(self.l2 * math_ops.square(x))
     return regularization
 
   def get_config(self):
-- 
GitLab


From 1d0f3581f7350f5c666a1b9869637d6feac3d4df Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Wed, 4 Apr 2018 17:14:11 -0700
Subject: [PATCH 0284/1262] [XLA]: Enable Memory sanitizer for
 compilation_passes_test.

Initialize the buffer for a scalar tensor to avoid uninitialized
accesses.

PiperOrigin-RevId: 191672257
---
 tensorflow/compiler/jit/BUILD                         |  1 -
 .../compiler/jit/mark_for_compilation_pass_test.cc    | 11 ++++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index e7d18e8351..24aa203c00 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -347,7 +347,6 @@ tf_cc_test(
         "encapsulate_subgraphs_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
     ],
-    tags = ["nomsan"],  # TODO: b/77543571
     deps = [
         ":common",
         ":compilation_passes",
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index af1919278c..2e362e0a63 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -163,11 +163,12 @@ TEST(XlaCompilationTest, HalfSupported) {
   GraphDef graphdef;
   {
     GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-    Node* a = ops::SourceOp(
-        "Const", builder.opts()
-                     .WithName("A")
-                     .WithAttr("dtype", DT_HALF)
-                     .WithAttr("value", Tensor(DT_HALF, TensorShape())));
+    Tensor t(DT_HALF, TensorShape());
+    t.scalar<Eigen::half>()() = static_cast<Eigen::half>(0.0f);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_HALF)
+                                         .WithAttr("value", t));
     Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
-- 
GitLab


From 18862fb82f317a930a2d1cde51bca9c47924c882 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 17:18:43 -0700
Subject: [PATCH 0285/1262] Exclude some gcc options in Windows build

PiperOrigin-RevId: 191672761
---
 tensorflow/python/BUILD   | 1 +
 tensorflow/tensorflow.bzl | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a9363608b5..a8f1318509 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -427,6 +427,7 @@ tf_cc_shared_object(
             "-lm",
         ],
         "//tensorflow:darwin": [],
+        "//tensorflow:windows": [],
     }),
     deps = [
         "//tensorflow/core:framework_headers_lib",
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 098ae7e6e3..fd44b0eb3b 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -304,6 +304,7 @@ def tf_cc_shared_object(
           clean_dep("//tensorflow:darwin"): [
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
+          clean_dep("//tensorflow:windows"): [],
           "//conditions:default": [
               "-Wl,-soname," + name.split("/")[-1],
           ],
@@ -929,6 +930,7 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   if 'linkstatic' not in kwargs or kwargs['linkstatic'] != 1:
     enable_text_relocation_linkopt = select({
           clean_dep("//tensorflow:darwin"): [],
+          clean_dep("//tensorflow:windows"): [],
           "//conditions:default": ['-Wl,-z,notext'],})
     if 'linkopts' in kwargs:
       kwargs['linkopts'] += enable_text_relocation_linkopt
-- 
GitLab


From 4b563ed0008953519a0ad9ec09a3261f1d3759dd Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Wed, 4 Apr 2018 17:28:41 -0700
Subject: [PATCH 0286/1262] BUGFIX: Detect when broadcasting is required and
 raise NotImplementedError.

PiperOrigin-RevId: 191673876
---
 .../python/kernel_tests/batch_reshape_test.py |  37 ++++
 .../distributions/python/ops/batch_reshape.py | 192 +++++++++++++-----
 2 files changed, 174 insertions(+), 55 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
index 4d2f40e27f..c6c8d2cf6e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import batch_reshape as batch_reshape_lib
 from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_lib
+from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
 from tensorflow.contrib.distributions.python.ops import wishart as wishart_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
@@ -514,6 +515,42 @@ class _BatchReshapeTest(object):
               batch_shape=new_batch_shape_ph,
               validate_args=True).sample().eval()
 
+  def test_broadcasting_explicitly_unsupported(self):
+    old_batch_shape = [4]
+    new_batch_shape = [1, 4, 1]
+    rate_ = self.dtype([1, 10, 2, 20])
+
+    rate = array_ops.placeholder_with_default(
+        rate_,
+        shape=old_batch_shape if self.is_static_shape else None)
+    poisson_4 = poisson_lib.Poisson(rate)
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+    poisson_141_reshaped = batch_reshape_lib.BatchReshape(
+        poisson_4, new_batch_shape_ph, validate_args=True)
+
+    x_4 = self.dtype([2, 12, 3, 23])
+    x_114 = self.dtype([2, 12, 3, 23]).reshape(1, 1, 4)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "too few event dims"):
+        poisson_141_reshaped.log_prob(x_4)
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "unexpected batch and event shape"):
+        poisson_141_reshaped.log_prob(x_114)
+      return
+
+    with self.assertRaisesOpError("too few event dims"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_4).eval()
+
+    with self.assertRaisesOpError("unexpected batch and event shape"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_114).eval()
+
 
 class BatchReshapeStaticTest(_BatchReshapeTest, test.TestCase):
 
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
index c7ee9b2117..3e6c35e0d6 100644
--- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -115,7 +115,7 @@ class BatchReshape(distribution_lib.Distribution):
       self._batch_shape_static = tensor_util.constant_value(self._batch_shape_)
       if self._batch_shape_static is not None:
         self._batch_shape_static = np.int32(self._batch_shape_static)
-      self._runtime_assertions = make_runtime_assertions(
+      self._runtime_assertions = validate_init_args(
           self._distribution,
           self._batch_shape_,
           validate_args,
@@ -229,7 +229,8 @@ class BatchReshape(distribution_lib.Distribution):
 
   def _call_reshape_input_output(self, fn, x):
     """Calls `fn`, appropriately reshaping its input `x` and output."""
-    with ops.control_dependencies(self._runtime_assertions):
+    with ops.control_dependencies(
+        self._runtime_assertions + self._validate_sample_arg(x)):
       sample_shape, static_sample_shape = self._sample_shape(x)
       old_shape = array_ops.concat([
           sample_shape,
@@ -273,61 +274,142 @@ class BatchReshape(distribution_lib.Distribution):
         result.set_shape(static_shape)
       return result
 
-
-def make_runtime_assertions(
+  def _validate_sample_arg(self, x):
+    """Helper which validates sample arg, e.g., input to `log_prob`."""
+    with ops.name_scope(name="validate_sample_arg", values=[x]):
+      x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+      event_ndims = (array_ops.size(self.event_shape_tensor())
+                     if self.event_shape.ndims is None
+                     else self.event_shape.ndims)
+      batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                     if self.batch_shape.ndims is None
+                     else self.batch_shape.ndims)
+      expected_batch_event_ndims = batch_ndims + event_ndims
+
+      if (isinstance(x_ndims, int) and
+          isinstance(expected_batch_event_ndims, int)):
+        if x_ndims < expected_batch_event_ndims:
+          raise NotImplementedError(
+              "Broadcasting is not supported; too few event dims "
+              "(expected at least {}, saw {}).".format(
+                  expected_batch_event_ndims, x_ndims))
+        ndims_assertion = []
+      elif self.validate_args:
+        ndims_assertion = [
+            check_ops.assert_greater_equal(
+                x_ndims,
+                expected_batch_event_ndims,
+                message="Broadcasting is not supported; too few event dims.",
+                name="assert_batch_and_event_ndims_large_enough"),
+        ]
+
+      if (self.batch_shape.is_fully_defined() and
+          self.event_shape.is_fully_defined()):
+        expected_batch_event_shape = np.int32(self.batch_shape.concatenate(
+            self.event_shape).as_list())
+      else:
+        expected_batch_event_shape = array_ops.concat([
+            self.batch_shape_tensor(),
+            self.event_shape_tensor(),
+        ], axis=0)
+
+      sample_ndims = x_ndims - expected_batch_event_ndims
+      if isinstance(sample_ndims, int):
+        sample_ndims = max(sample_ndims, 0)
+      if (isinstance(sample_ndims, int) and
+          x.shape[sample_ndims:].is_fully_defined()):
+        actual_batch_event_shape = np.int32(x.shape[sample_ndims:].as_list())
+      else:
+        sample_ndims = math_ops.maximum(sample_ndims, 0)
+        actual_batch_event_shape = array_ops.shape(x)[sample_ndims:]
+
+      if (isinstance(expected_batch_event_shape, np.ndarray) and
+          isinstance(actual_batch_event_shape, np.ndarray)):
+        if any(expected_batch_event_shape != actual_batch_event_shape):
+          raise NotImplementedError("Broadcasting is not supported; "
+                                    "unexpected batch and event shape "
+                                    "(expected {}, saw {}).".format(
+                                        expected_batch_event_shape,
+                                        actual_batch_event_shape))
+        # We need to set the final runtime-assertions to `ndims_assertion` since
+        # its possible this assertion was created. We could add a condition to
+        # only do so if `self.validate_args == True`, however this is redundant
+        # as `ndims_assertion` already encodes this information.
+        runtime_assertions = ndims_assertion
+      elif self.validate_args:
+        # We need to make the `ndims_assertion` a control dep because otherwise
+        # TF itself might raise an exception owing to this assertion being
+        # ill-defined, ie, one cannot even compare different rank Tensors.
+        with ops.control_dependencies(ndims_assertion):
+          shape_assertion = check_ops.assert_equal(
+              expected_batch_event_shape,
+              actual_batch_event_shape,
+              message=("Broadcasting is not supported; "
+                       "unexpected batch and event shape."),
+              name="assert_batch_and_event_shape_same")
+        runtime_assertions = [shape_assertion]
+      else:
+        runtime_assertions = []
+
+      return runtime_assertions
+
+
+def validate_init_args(
     distribution,
     batch_shape,
     validate_args,
     batch_shape_static):
   """Helper to __init__ which makes or raises assertions."""
-  runtime_assertions = []
-
-  if batch_shape.shape.ndims is not None:
-    if batch_shape.shape.ndims != 1:
-      raise ValueError("`batch_shape` must be a vector "
-                       "(saw rank: {}).".format(
-                           batch_shape.shape.ndims))
-  elif validate_args:
-    runtime_assertions += [
-        check_ops.assert_rank(
-            batch_shape,
-            1,
-            message="`batch_shape` must be a vector.",
-            name="assert_batch_shape_is_vector"),
-    ]
-
-  batch_size_static = np.prod(batch_shape_static)
-  dist_batch_size_static = (
-      None if not distribution.batch_shape.is_fully_defined()
-      else np.prod(distribution.batch_shape).value)
-
-  if batch_size_static is not None and dist_batch_size_static is not None:
-    if batch_size_static != dist_batch_size_static:
-      raise ValueError("`batch_shape` size ({}) must match "
-                       "`distribution.batch_shape` size ({}).".format(
-                           batch_size_static,
-                           dist_batch_size_static))
-  elif validate_args:
-    runtime_assertions += [
-        check_ops.assert_equal(
-            math_ops.reduce_prod(batch_shape),
-            math_ops.reduce_prod(distribution.batch_shape_tensor()),
-            message=("`batch_shape` size must match "
-                     "`distributions.batch_shape` size."),
-            name="assert_batch_size"),
-    ]
-
-  if batch_shape_static is not None:
-    if np.any(batch_shape_static < 1):
-      raise ValueError("`batch_shape` elements must be positive "
-                       "(i.e., larger than zero).")
-  elif validate_args:
-    runtime_assertions += [
-        check_ops.assert_positive(
-            batch_shape,
-            message=("`batch_shape` elements must be positive "
-                     "(i.e., larger than zero)."),
-            name="assert_batch_shape_positive")
-    ]
-
-  return runtime_assertions
+  with ops.name_scope(name="validate_init_args",
+                      values=[batch_shape] + distribution._graph_parents):  # pylint: disable=protected-access
+    runtime_assertions = []
+
+    if batch_shape.shape.ndims is not None:
+      if batch_shape.shape.ndims != 1:
+        raise ValueError("`batch_shape` must be a vector "
+                         "(saw rank: {}).".format(
+                             batch_shape.shape.ndims))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_rank(
+              batch_shape,
+              1,
+              message="`batch_shape` must be a vector.",
+              name="assert_batch_shape_is_vector"),
+      ]
+
+    batch_size_static = np.prod(batch_shape_static)
+    dist_batch_size_static = (
+        None if not distribution.batch_shape.is_fully_defined()
+        else np.prod(distribution.batch_shape).value)
+
+    if batch_size_static is not None and dist_batch_size_static is not None:
+      if batch_size_static != dist_batch_size_static:
+        raise ValueError("`batch_shape` size ({}) must match "
+                         "`distribution.batch_shape` size ({}).".format(
+                             batch_size_static,
+                             dist_batch_size_static))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_equal(
+              math_ops.reduce_prod(batch_shape),
+              math_ops.reduce_prod(distribution.batch_shape_tensor()),
+              message=("`batch_shape` size must match "
+                       "`distributions.batch_shape` size."),
+              name="assert_batch_size"),
+      ]
+
+    if batch_shape_static is not None:
+      if np.any(batch_shape_static < 1):
+        raise ValueError("`batch_shape` elements must be positive "
+                         "(i.e., larger than zero).")
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_positive(
+              batch_shape,
+              message=("`batch_shape` elements must be positive "
+                       "(i.e., larger than zero)."),
+              name="assert_batch_shape_positive")
+      ]
+
+    return runtime_assertions
-- 
GitLab


From 8abde65d3c7813a36082acfc341d22b0c5e76e02 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 4 Apr 2018 18:00:27 -0700
Subject: [PATCH 0287/1262] Sort control inputs alphabetically in
 ToGraphDefSubRange.

PiperOrigin-RevId: 191677358
---
 tensorflow/core/graph/graph.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index a7af5e2312..fb8a6c39e6 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -567,6 +567,11 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
         inputs[edge->dst_input()] = edge;
       }
     }
+    // Sort the control inputs for more predictable serialization.
+    std::sort(inputs.begin() + node->num_inputs(), inputs.end(),
+              [](const Edge* a, const Edge* b) -> bool {
+                return a->src()->name() < b->src()->name();
+              });
     node_def->clear_input();
     node_def->mutable_input()->Reserve(inputs.size());
 
-- 
GitLab


From 2194f66f0a905940327d05f2d63c1c7137e47574 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 18:20:36 -0700
Subject: [PATCH 0288/1262] Add a helper function to re-assign colocation in a
 graph.

PiperOrigin-RevId: 191679495
---
 tensorflow/core/grappler/optimizers/BUILD     |   1 +
 .../grappler/optimizers/meta_optimizer.cc     |   4 +-
 tensorflow/core/grappler/utils/BUILD          |  25 +++
 tensorflow/core/grappler/utils/colocation.cc  | 122 ++++++++++++
 tensorflow/core/grappler/utils/colocation.h   |  39 ++++
 .../core/grappler/utils/colocation_test.cc    | 183 ++++++++++++++++++
 6 files changed, 373 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/core/grappler/utils/colocation.cc
 create mode 100644 tensorflow/core/grappler/utils/colocation.h
 create mode 100644 tensorflow/core/grappler/utils/colocation_test.cc

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 0c6549d940..122fd48584 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -508,6 +508,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index ce27d3d95c..5723e397ab 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -221,6 +222,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   if (already_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
+    ReassignColocation(optimized_graph);
     // Make sure that the optimizers preserved the graph version and library.
     DCHECK_GE(optimized_graph->library().function_size(),
               item.graph.library().function_size());
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index baf24c2505..7419c26dff 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -181,3 +181,28 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
+
+cc_library(
+    name = "colocation",
+    srcs = ["colocation.cc"],
+    hdrs = ["colocation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "colocation_test",
+    size = "small",
+    srcs = ["colocation_test.cc"],
+    deps = [
+        ":colocation",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/colocation.cc b/tensorflow/core/grappler/utils/colocation.cc
new file mode 100644
index 0000000000..0573e0a830
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include <cstring>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+// Find root node of the colocation group.
+// The map is mapping from one node name to its parent. node_name is the
+// starting node to search. By iteratively following the path from child to
+// parent, we can find the root node for the colocation group that node_name
+// belongs to.
+string GetColocationGroupRoot(std::unordered_map<string, string>* map,
+                              const string& node_name) {
+  if (map->find(node_name) == map->end()) {
+    // If node_name is not in the map, we create a new root node which points
+    // to itself.
+    map->insert({node_name, node_name});
+    return node_name;
+  }
+  string cur = node_name;
+  while ((*map)[cur] != cur) {
+    // Backtracing the map until we reach the root node.
+    cur = (*map)[cur];
+  }
+  return cur;
+}
+
+// Merge two colocation groups into one.
+// left and right is the root node of two colocation groups respectively.
+void MergeColocationGroup(std::unordered_map<string, string>* map,
+                          const string& left, const string& right) {
+  // Do nothing if left or right node is not in the map.
+  if (map->find(left) == map->end() || map->find(right) == map->end()) {
+    return;
+  }
+  if (left != right) {
+    // Make the right node a child of the left node, which merges the two
+    // groups.
+    map->at(right) = left;
+  }
+}
+}  // namespace
+
+// Use of disjoint set algorithm to build the colocation groups from the input
+// graph. The core data structure in use is a hash map from one node to its
+// parent node. Whenever we see two nodes colocate with each other, we merge
+// their colocation groups together. After we traverse all colocation pairs
+// in the graph, we will have several disjoint sets. Then we pick the root node
+// of each disjoint set as the representative node, and let all other nodes in
+// the group colocate with the representative node.
+void ReassignColocation(GraphDef* graph) {
+  constexpr char kClassAttr[] = "_class";
+  constexpr char kColocPrefix[] = "loc:@";
+
+  // A hashmap that maps from a node name to its parent node name.
+  std::unordered_map<string, string> coloc_groups;
+  NodeMap node_map(graph);
+  for (const auto& node : graph->node()) {
+    auto iter = node.attr().find(kClassAttr);
+    if (iter != node.attr().end() && iter->second.has_list()) {
+      for (const auto& str : iter->second.list().s()) {
+        size_t pos = str.find(kColocPrefix);
+        if (pos == 0) {
+          // After we find a colocation, update the colocation groups.
+          string colocate_node = str.substr(pos + strlen(kColocPrefix));
+          MergeColocationGroup(
+              &coloc_groups, GetColocationGroupRoot(&coloc_groups, node.name()),
+              GetColocationGroupRoot(&coloc_groups, colocate_node));
+        }
+      }
+    }
+  }
+
+  // We use the root node of each colocation groups as its representative
+  // node. For each node in one group, colocate with the representative node
+  // if the node is in the graph.
+  for (const auto& pair : coloc_groups) {
+    if (pair.first != pair.second) {
+      // This is a child node.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {
+        // Colocate this node with the root node.
+        AttrValue new_value;
+        new_value.mutable_list()->add_s(
+            kColocPrefix + GetColocationGroupRoot(&coloc_groups, pair.first));
+        node->mutable_attr()->erase(kClassAttr);
+        node->mutable_attr()->insert({kClassAttr, new_value});
+      }
+    } else {
+      // This is a root node. Clear the _class attribute.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {  // root node should always exist in the graph as guaranteed
+                   // by order of merging. Just put check here to ensure safety.
+        node->mutable_attr()->erase(kClassAttr);
+      }
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/colocation.h b/tensorflow/core/grappler/utils/colocation.h
new file mode 100644
index 0000000000..6062db6102
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+
+#include <unordered_map>
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Evaluates the colocation relation in the graph and rewrites the new
+// colocation relation in the graph. We scan the graph nodes sequentially, and
+// builds a disjoint-sets of nodes (within each disjoint-set the nodes are
+// colocated with each other). We then select the root node of each set as a
+// representative node, and then colocate each node within the set (should also
+// exist in graph) with the representative node.
+// Note that there is current one situation this function can't handle:
+// Node A colocates with X, node B colocates with Y, X colocates with Y but
+// X, Y are removed from graph. In this case we can't know A colocates with B.
+void ReassignColocation(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
diff --git a/tensorflow/core/grappler/utils/colocation_test.cc b/tensorflow/core/grappler/utils/colocation_test.cc
new file mode 100644
index 0000000000..6638364240
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ColocationTest : public ::testing::Test {};
+
+bool VerifyNodeHasColocation(const NodeDef& ndef, const string& coloc) {
+  if (ndef.attr().empty()) {
+    return false;
+  }
+  if (ndef.attr().find("_class") == ndef.attr().end()) {
+    return false;
+  }
+  return ndef.attr().at("_class").list().s(0) == coloc;
+}
+
+TEST(ColocationTest, ReassignColocation_SingleNode) {
+  // Node A colocates with B, but node B is not in the graph.
+  //   A
+  //   |
+  //   |
+  //  [B]
+
+  NodeDef ndef;
+  const Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@B"}).Finalize(&ndef);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef({ndef});
+
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(1, gdef.node(0).attr_size());
+
+  ReassignColocation(&gdef);
+
+  // Validates that node A's colocation info is cleared.
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(0, gdef.node(0).attr_size());
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_SingleGroup) {
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node X is not in the graph.
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  // After re-assign of colocation, A, B, C, D should colocate with E.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  GraphDef gdef =
+      test::function::GDef({ndef_a, ndef_b, ndef_c, ndef_d, ndef_e});
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_MultiGroup) {
+  // Before re-assign:
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node U, V colocates with W. Node X, W are not in the graph:
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  //
+  //  U       V
+  //  |       |
+  //  |       |
+  //  +--[W]--+
+  //
+  // After re-assign:
+  // A, B, C, D should colocate with E. U should colocate with V.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+  //
+  // U
+  // |
+  // |
+  // V
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("U", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_u);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("V", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_v);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef(
+      {ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v});
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@W"));  // U
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(6), "loc:@W"));  // V
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@V"));  // U
+  EXPECT_EQ(0, gdef.node(6).attr_size());                        // V
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
-- 
GitLab


From aa890776f062f3429bcedb0a080b712ebb97793b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 4 Apr 2018 22:37:35 -0700
Subject: [PATCH 0289/1262] Support arbitrary permutations for
 DataFormatDimMap.

PiperOrigin-RevId: 191696203
---
 tensorflow/core/kernels/data_format_ops.cc | 40 ++++++++++++++--------
 tensorflow/core/kernels/data_format_ops.h  | 18 +++++++---
 tensorflow/python/ops/nn_test.py           | 36 +++++++++++++++++++
 3 files changed, 76 insertions(+), 18 deletions(-)

diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index 39ef8ee3ac..4485152e96 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -37,25 +37,37 @@ class DataFormatDimMapOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context, src_format.size() == 4,
+                errors::InvalidArgument(strings::StrCat(
+                    "Source format must of length 4, received src_format = ",
+                    src_format)));
     OP_REQUIRES(
-        context, src_format == "NHWC",
+        context, dst_format.size() == 4,
         errors::InvalidArgument(strings::StrCat(
-            "Current implementation doesn't support source data format ",
-            src_format)));
-    OP_REQUIRES(context, dst_format == "NCHW",
-                errors::InvalidArgument(strings::StrCat(
-                    "Current implementation doesn't support dst data format ",
-                    dst_format)));
+            "Destination format must of length 4, received dst_format = ",
+            dst_format)));
+    dst_idx_ = Tensor(DT_INT32, {static_cast<int64>(src_format.size())});
+    for (int i = 0; i < src_format.size(); ++i) {
+      for (int j = 0; j < dst_format.size(); ++j) {
+        if (dst_format[j] == src_format[i]) {
+          dst_idx_.vec<int>()(i) = j;
+          break;
+        }
+      }
+    }
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    Tensor* output = nullptr;
+    Tensor* output;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
     functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
-                                           input.flat<T>(), output->flat<T>());
+                                           input.flat<T>(), output->flat<T>(),
+                                           dst_idx_.vec<int>());
   }
+
+  Tensor dst_idx_;
 };
 
 template <typename Device, typename T>
@@ -147,11 +159,11 @@ TF_CALL_int64(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                \
-  template <>                                              \
-  void DataFormatDimMap<GPUDevice, T>::operator()(         \
-      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
-      typename TTypes<T>::Flat y);                         \
+#define DECLARE_GPU_SPEC(T)                                    \
+  template <>                                                  \
+  void DataFormatDimMap<GPUDevice, T>::operator()(             \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x,     \
+      typename TTypes<T>::Flat y, const TTypes<int>::Vec dst); \
   extern template struct DataFormatDimMap<GPUDevice, T>;
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 TF_CALL_int32(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
index 2ccc919586..1ca144cb40 100644
--- a/tensorflow/core/kernels/data_format_ops.h
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -27,15 +27,25 @@ namespace functor {
 template <typename Device, typename T>
 struct DataFormatDimMap {
   void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
-                  typename TTypes<T>::Flat y) {
+                  typename TTypes<T>::Flat y, const TTypes<int>::Vec dst) {
     auto zero = x.constant(0);
     auto one = x.constant(1);
-    auto three = x.constant(3);
+    auto two = x.constant(2);
+
+    auto f_zero = x.constant(dst(0));
+    auto f_one = x.constant(dst(1));
+    auto f_two = x.constant(dst(2));
+    auto f_three = x.constant(dst(3));
+
     auto four = x.constant(4);
     auto x_mod = (x + four) % 4;
+
     auto is_zero = (x_mod == zero);
-    auto is_three = (x_mod == three);
-    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
+    auto is_one = (x_mod == one);
+    auto is_two = (x_mod == two);
+
+    y.device(d) = is_zero.select(
+        f_zero, is_one.select(f_one, is_two.select(f_two, f_three)));
   }
 };
 
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index da86d5f6ca..46a5f4fae6 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -1081,6 +1081,42 @@ class DataFormatDimMapTest(test_lib.TestCase):
     self._test([1, -3, -2], [2, 2, 3])
     self._test([[1, -3], [1, -1]], [[2, 2], [2, 1]])
 
+  def testNHWCtoNCHW(self):
+    x_val = [1, -3, -2]
+    y_val_expected = [2, 2, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoHWNC(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoWHCN(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testArbitraryASCII(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
 
 class DataFormatVectorPermuteTest(test_lib.TestCase):
 
-- 
GitLab


From 007e3e6c3d72ea5edca361eb908fb7aa66ac6d6d Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Wed, 4 Apr 2018 22:37:42 -0700
Subject: [PATCH 0290/1262] Upgrade zlib to latest version

PiperOrigin-RevId: 191696213
---
 tensorflow/workspace.bzl |  8 ++++----
 third_party/zlib.BUILD   | 16 ++--------------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2510d369fc..cd8b6f83a9 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -503,11 +503,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "zlib_archive",
       urls = [
-          "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
-          "http://zlib.net/fossils/zlib-1.2.8.tar.gz",
+          "https://mirror.bazel.build/zlib.net/zlib-1.2.11.tar.gz",
+          "https://zlib.net/zlib-1.2.11.tar.gz",
       ],
-      sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
-      strip_prefix = "zlib-1.2.8",
+      sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1",
+      strip_prefix = "zlib-1.2.11",
       build_file = clean_dep("//third_party:zlib.BUILD"),
   )
 
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index d164ee719c..e8048dd98a 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -2,18 +2,6 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # BSD/MIT-like license (for zlib)
 
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "zlib",
     srcs = [
@@ -45,8 +33,8 @@ cc_library(
     ],
     hdrs = ["zlib.h"],
     copts = select({
-        ":windows": [],
-        ":windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [],
+        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-Wno-shift-negative-value",
             "-DZ_HAVE_UNISTD_H",
-- 
GitLab


From dfa9921e6343727b05f42f8d4a918b19528ff994 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Thu, 5 Apr 2018 00:53:51 -0700
Subject: [PATCH 0291/1262] Upgrade Snappy to latest version

They added big endian preprocessor macros in recent releases. Hopefully
this should do the right thing on IBM mainframes.

PiperOrigin-RevId: 191705207
---
 tensorflow/workspace.bzl |  8 ++--
 third_party/snappy.BUILD | 96 +++++++++++++++++++++++++++++-----------
 2 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index cd8b6f83a9..23f6d3c1d9 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -524,11 +524,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "snappy",
       urls = [
-          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
-          "https://github.com/google/snappy/archive/1.1.4.tar.gz",
+          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.7.tar.gz",
+          "https://github.com/google/snappy/archive/1.1.7.tar.gz",
       ],
-      sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
-      strip_prefix = "snappy-1.1.4",
+      sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
+      strip_prefix = "snappy-1.1.7",
       build_file = clean_dep("//third_party:snappy.BUILD"),
   )
 
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index fd48ed8941..cc11f52d0e 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -4,25 +4,12 @@ licenses(["notice"])  # BSD 3-Clause
 
 exports_files(["COPYING"])
 
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "snappy",
     srcs = [
+        "config.h",
         "snappy.cc",
         "snappy.h",
-        "snappy-c.cc",
-        "snappy-c.h",
         "snappy-internal.h",
         "snappy-sinksource.cc",
         "snappy-sinksource.h",
@@ -32,30 +19,85 @@ cc_library(
     ],
     hdrs = ["snappy.h"],
     copts = select({
-        ":windows": [],
-        ":windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "/DHAVE_CONFIG_H",
+            "/EHsc",
+        ],
+        "@org_tensorflow//tensorflow:windows_msvc": [
+            "/DHAVE_CONFIG_H",
+            "/EHsc",
+        ],
         "//conditions:default": [
+            "-DHAVE_CONFIG_H",
+            "-fno-exceptions",
+            "-Wno-sign-compare",
             "-Wno-shift-negative-value",
             "-Wno-implicit-function-declaration",
         ],
     }),
 )
 
+genrule(
+    name = "config_h",
+    outs = ["config.h"],
+    cmd = "\n".join([
+        "cat <<'EOF' >$@",
+        "#define HAVE_STDDEF_H 1",
+        "#define HAVE_STDINT_H 1",
+        "",
+        "#ifdef __has_builtin",
+        "#  if !defined(HAVE_BUILTIN_EXPECT) && __has_builtin(__builtin_expect)",
+        "#    define HAVE_BUILTIN_EXPECT 1",
+        "#  endif",
+        "#  if !defined(HAVE_BUILTIN_CTZ) && __has_builtin(__builtin_ctzll)",
+        "#    define HAVE_BUILTIN_CTZ 1",
+        "#  endif",
+        "#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4)",
+        "#  ifndef HAVE_BUILTIN_EXPECT",
+        "#    define HAVE_BUILTIN_EXPECT 1",
+        "#  endif",
+        "#  ifndef HAVE_BUILTIN_CTZ",
+        "#    define HAVE_BUILTIN_CTZ 1",
+        "#  endif",
+        "#endif",
+        "",
+        "#ifdef __has_include",
+        "#  if !defined(HAVE_BYTESWAP_H) && __has_include(<byteswap.h>)",
+        "#    define HAVE_BYTESWAP_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_UNISTD_H) && __has_include(<unistd.h>)",
+        "#    define HAVE_UNISTD_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_ENDIAN_H) && __has_include(<sys/endian.h>)",
+        "#    define HAVE_SYS_ENDIAN_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_MMAN_H) && __has_include(<sys/mman.h>)",
+        "#    define HAVE_SYS_MMAN_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_UIO_H) && __has_include(<sys/uio.h>)",
+        "#    define HAVE_SYS_UIO_H 1",
+        "#  endif",
+        "#endif",
+        "",
+        "#ifndef SNAPPY_IS_BIG_ENDIAN",
+        "#  ifdef __s390x__",
+        "#    define SNAPPY_IS_BIG_ENDIAN 1",
+        "#  elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__",
+        "#    define SNAPPY_IS_BIG_ENDIAN 1",
+        "#  endif",
+        "#endif",
+        "EOF",
+    ]),
+)
+
 genrule(
     name = "snappy_stubs_public_h",
     srcs = ["snappy-stubs-public.h.in"],
     outs = ["snappy-stubs-public.h"],
     cmd = ("sed " +
-           "-e 's/@ac_cv_have_stdint_h@/1/g' " +
-           "-e 's/@ac_cv_have_stddef_h@/1/g' " +
-           "-e 's/@ac_cv_have_stdint_h@/1/g' " +
-           select({
-               "@org_tensorflow//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "@org_tensorflow//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "//conditions:default": "-e 's/@ac_cv_have_sys_uio_h@/1/g' ",
-           }) +
-           "-e 's/@SNAPPY_MAJOR@/1/g' " +
-           "-e 's/@SNAPPY_MINOR@/1/g' " +
-           "-e 's/@SNAPPY_PATCHLEVEL@/4/g' " +
+           "-e 's/$${\\(.*\\)_01}/\\1/g' " +
+           "-e 's/$${SNAPPY_MAJOR}/1/g' " +
+           "-e 's/$${SNAPPY_MINOR}/1/g' " +
+           "-e 's/$${SNAPPY_PATCHLEVEL}/4/g' " +
            "$< >$@"),
 )
-- 
GitLab


From 1fda7645d132b71b9084b01945795e97e582adcd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 03:09:27 -0700
Subject: [PATCH 0292/1262] Add support for NCCL2. The configure script asks
 for what version of NCCL to use. The default is still NCCL 1 from GitHub. If
 the user chooses NCCL 2, it asks for the install directory.

The nccl_configure.bzl generates two different BUILD files based on the chose NCCL version. For NCCL 1, it aliases to the existing 'nccl_archive' http_repo on GitHub. For NCCL 2, it creates a target containing the NCCL 2 library and headers from the chosen install directory.

PiperOrigin-RevId: 191718007
---
 configure.py                                  |  77 +++++++
 tensorflow/contrib/nccl/BUILD                 |   6 +-
 .../contrib/nccl/kernels/nccl_manager.h       |   2 +-
 tensorflow/contrib/nccl/kernels/nccl_ops.cc   |   2 +-
 tensorflow/tools/pip_package/BUILD            |   3 +-
 tensorflow/workspace.bzl                      |   4 +-
 third_party/nccl/LICENSE                      | 203 ++++++++++++++++++
 .../{nccl.BUILD => nccl/nccl_archive.BUILD}   |   2 +
 third_party/nccl/nccl_configure.bzl           | 172 +++++++++++++++
 9 files changed, 463 insertions(+), 8 deletions(-)
 create mode 100644 third_party/nccl/LICENSE
 rename third_party/{nccl.BUILD => nccl/nccl_archive.BUILD} (95%)
 create mode 100644 third_party/nccl/nccl_configure.bzl

diff --git a/configure.py b/configure.py
index 26eff5767e..da3f97ab30 100644
--- a/configure.py
+++ b/configure.py
@@ -35,6 +35,7 @@ except ImportError:
 
 _DEFAULT_CUDA_VERSION = '9.0'
 _DEFAULT_CUDNN_VERSION = '7'
+_DEFAULT_NCCL_VERSION = '1.3'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -1105,6 +1106,81 @@ def set_tf_tensorrt_install_path(environ_cp):
   write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
 
 
+def set_tf_nccl_install_path(environ_cp):
+  """Set NCCL_INSTALL_PATH and TF_NCCL_VERSION.
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently NCCL is only supported on Linux platforms.')
+
+  ask_nccl_version = (
+      'Please specify the NCCL version you want to use. '
+      '[Leave empty to default to NCCL %s]: ') % _DEFAULT_NCCL_VERSION
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    tf_nccl_version = get_from_env_or_user_or_default(
+        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, _DEFAULT_NCCL_VERSION)
+    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
+
+    if tf_nccl_version == '1':
+      break  # No need to get install path, NCCL 1 is a GitHub repo.
+
+    # TODO(csigg): Look with ldconfig first if we can find the library in paths
+    # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
+    # include directory. This is where the NCCL .deb packages install them.
+    # Then ask the user if we should use that. Instead of a single
+    # NCCL_INSTALL_PATH, pass separate NCCL_LIB_PATH and NCCL_HDR_PATH to
+    # nccl_configure.bzl
+    default_nccl_path = environ_cp.get('CUDA_TOOLKIT_PATH')
+    ask_nccl_path = (r'Please specify the location where NCCL %s library is '
+                     'installed. Refer to README.md for more details. [Default '
+                     'is %s]:') % (tf_nccl_version, default_nccl_path)
+    nccl_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'NCCL_INSTALL_PATH', ask_nccl_path, default_nccl_path)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    nccl_install_path = os.path.realpath(os.path.expanduser(nccl_install_path))
+    if is_windows() or is_cygwin():
+      nccl_install_path = cygpath(nccl_install_path)
+
+    if is_windows():
+      nccl_lib_path = 'lib/x64/nccl.lib'
+    elif is_linux():
+      nccl_lib_path = 'lib/libnccl.so.%s' % tf_nccl_version
+    elif is_macos():
+      nccl_lib_path = 'lib/libnccl.%s.dylib' % tf_nccl_version
+
+    nccl_lib_path = os.path.join(nccl_install_path, nccl_lib_path)
+    nccl_hdr_path = os.path.join(nccl_install_path, 'include/nccl.h')
+    if os.path.exists(nccl_lib_path) and os.path.exists(nccl_hdr_path):
+      # Set NCCL_INSTALL_PATH
+      environ_cp['NCCL_INSTALL_PATH'] = nccl_install_path
+      write_action_env_to_bazelrc('NCCL_INSTALL_PATH', nccl_install_path)
+      break
+
+    # Reset and Retry
+    print('Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
+          'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
+                                              nccl_hdr_path))
+
+    environ_cp['TF_NCCL_VERSION'] = ''
+  else:
+    raise UserInputError('Invalid TF_NCCL setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TF_NCCL_VERSION
+  environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
+  write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1441,6 +1517,7 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
+    set_tf_nccl_install_path(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index 6cbfd03881..334e70318d 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -31,7 +31,7 @@ tf_custom_op_library(
         "kernels/nccl_ops.cc",
     ],
     deps = if_cuda([
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
         "//tensorflow/core:gpu_headers_lib",
     ]),
 )
@@ -61,7 +61,7 @@ tf_cuda_cc_test(
             "//tensorflow/core:test",
             "//tensorflow/core:test_main",
             "//tensorflow/core:testlib",
-            "@nccl_archive//:nccl",
+            "@local_config_nccl//:nccl",
         ],
 )
 
@@ -80,7 +80,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:stream_executor",
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index bb219e0edc..6ff8cea84e 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 266d4f6f0d..c2b76caef3 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 4a70f666b6..376644718f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -141,6 +141,7 @@ filegroup(
         "@kafka//:LICENSE",
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
+        "@local_config_nccl//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
         "@grpc//third_party/address_sorting:LICENSE",
@@ -157,8 +158,6 @@ filegroup(
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-    ]) + if_not_windows([
-        "@nccl_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
 )
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 23f6d3c1d9..ace0d411b9 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -2,6 +2,7 @@
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
+load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
@@ -31,6 +32,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   cc_download_clang_toolchain(name="local_config_download_clang")
   cuda_configure(name="local_config_cuda")
   tensorrt_configure(name="local_config_tensorrt")
+  nccl_configure(name="local_config_nccl")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
@@ -540,7 +542,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
-      build_file = clean_dep("//third_party:nccl.BUILD"),
+      build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
   )
 
   tf_http_archive(
diff --git a/third_party/nccl/LICENSE b/third_party/nccl/LICENSE
new file mode 100644
index 0000000000..146d9b765c
--- /dev/null
+++ b/third_party/nccl/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018, The TensorFlow Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/third_party/nccl.BUILD b/third_party/nccl/nccl_archive.BUILD
similarity index 95%
rename from third_party/nccl.BUILD
rename to third_party/nccl/nccl_archive.BUILD
index b2b8e18824..a05899e38d 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl/nccl_archive.BUILD
@@ -43,6 +43,7 @@ cc_library(
         "-Iexternal/nccl_archive/src",
         "-O3",
     ] + cuda_default_copts(),
+    include_prefix = "third_party/nccl",
     linkopts = select({
         "@org_tensorflow//tensorflow:android": [
             "-pie",
@@ -61,6 +62,7 @@ cc_library(
             "-lrt",
         ],
     }),
+    strip_include_prefix = "src",
     visibility = ["//visibility:public"],
     deps = ["@local_config_cuda//cuda:cuda_headers"],
 )
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
new file mode 100644
index 0000000000..9dfcb18369
--- /dev/null
+++ b/third_party/nccl/nccl_configure.bzl
@@ -0,0 +1,172 @@
+# -*- Python -*-
+"""Repository rule for NCCL configuration.
+
+`nccl_configure` depends on the following environment variables:
+
+  * `TF_NCCL_VERSION`: The NCCL version.
+  * `NCCL_INSTALL_PATH`: The installation path of the NCCL library.
+"""
+
+load(
+    "//third_party/gpus:cuda_configure.bzl",
+    "auto_configure_fail",
+    "find_cuda_define",
+    "matches_version",
+)
+
+_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_TF_NCCL_VERSION = "TF_NCCL_VERSION"
+
+_DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
+_DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
+_DEFINE_NCCL_PATCH = "#define NCCL_PATCH"
+
+_NCCL_DUMMY_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  visibility = ["//visibility:public"],
+)
+
+cc_library(
+  name = "nccl",
+  visibility = ["//visibility:public"],
+)
+"""
+
+_NCCL_ARCHIVE_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  data = ["@nccl_archive//:LICENSE.txt"],
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl",
+  actual = "@nccl_archive//:nccl",
+  visibility = ["//visibility:public"],
+)
+"""
+
+_NCCL_LOCAL_BUILD_TEMPLATE = """
+filegroup(
+  name = "LICENSE",
+  data = ["nccl/NCCL-SLA.txt"],
+  visibility = ["//visibility:public"],
+)
+
+cc_library(
+  name = "nccl",
+  srcs = ["nccl/lib/libnccl.so.%s"],
+  hdrs = ["nccl/include/nccl.h"],
+  include_prefix = "third_party/nccl",
+  strip_include_prefix = "nccl/include",
+  deps = [
+      "@local_config_cuda//cuda:cuda_headers",
+  ],
+  visibility = ["//visibility:public"],
+)
+"""
+
+
+def _find_nccl_header(repository_ctx, nccl_install_path):
+  """Finds the NCCL header on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library install directory.
+
+  Returns:
+    The path to the NCCL header.
+  """
+  header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
+  if not header_path.exists:
+    auto_configure_fail("Cannot find %s" % str(header_path))
+  return header_path
+
+
+def _check_nccl_version(repository_ctx, nccl_install_path, nccl_version):
+  """Checks whether the header file matches the specified version of NCCL.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library install directory.
+    nccl_version: The expected NCCL version.
+
+  Returns:
+    A string containing the library version of NCCL.
+  """
+  header_path = _find_nccl_header(repository_ctx, nccl_install_path)
+  header_dir = str(header_path.realpath.dirname)
+  major_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_MAJOR)
+  minor_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_MINOR)
+  patch_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_PATCH)
+  header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+  if not matches_version(nccl_version, header_version):
+    auto_configure_fail(
+        ("NCCL library version detected from %s/nccl.h (%s) does not match " +
+         "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+        (header_dir, header_version, nccl_version))
+
+
+def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
+  """Finds the given NCCL library on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library installation directory.
+    nccl_version: The version of NCCL library files as returned
+      by _nccl_version.
+
+  Returns:
+    The path to the NCCL library.
+  """
+  lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
+                                                           nccl_version))
+  if not lib_path.exists:
+    auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
+  return lib_path
+
+
+def _nccl_configure_impl(repository_ctx):
+  """Implementation of the nccl_configure repository rule."""
+  if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+    # Add a dummy build file to make bazel query happy.
+    repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
+    return
+
+  nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+  if matches_version("1", nccl_version):
+    # Alias to GitHub target from @nccl_archive.
+    if not matches_version(nccl_version, "1.3"):
+      auto_configure_fail(
+          "NCCL from GitHub must use version 1.3 (got %s)" % nccl_version)
+    repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+  else:
+    # Create target for locally installed NCCL.
+    nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
+    _check_nccl_version(repository_ctx, nccl_install_path, nccl_version)
+    repository_ctx.symlink(nccl_install_path, "nccl")
+    repository_ctx.file("BUILD", _NCCL_LOCAL_BUILD_TEMPLATE % nccl_version)
+
+
+nccl_configure = repository_rule(
+    implementation=_nccl_configure_impl,
+    environ=[
+        _NCCL_INSTALL_PATH,
+        _TF_NCCL_VERSION,
+    ],
+)
+"""Detects and configures the NCCL configuration.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+nccl_configure(name = "local_config_nccl")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
-- 
GitLab


From 361a13cf0c2b65d26f6e2b5b68875adfcea98dd0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 05:39:21 -0700
Subject: [PATCH 0293/1262] Create a separate operators module that is to
 contain all Python constructs that we override: control flow, builtins,
 operators, etc.

PiperOrigin-RevId: 191729654
---
 tensorflow/contrib/autograph/impl/BUILD       |  1 +
 tensorflow/contrib/autograph/impl/config.py   | 17 ++++++++-----
 tensorflow/contrib/autograph/operators/BUILD  | 25 +++++++++++++++++++
 .../contrib/autograph/operators/__init__.py   | 24 ++++++++++++++++++
 4 files changed, 61 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/autograph/operators/BUILD
 create mode 100644 tensorflow/contrib/autograph/operators/__init__.py

diff --git a/tensorflow/contrib/autograph/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
index 0de479741a..54424e2647 100644
--- a/tensorflow/contrib/autograph/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -26,6 +26,7 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/contrib/autograph/converters",
+        "//tensorflow/contrib/autograph/operators",
         "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/contrib/autograph/pyct/static_analysis",
         "//tensorflow/contrib/autograph/utils",
diff --git a/tensorflow/contrib/autograph/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
index 543c1486e6..26326465e2 100644
--- a/tensorflow/contrib/autograph/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -41,10 +41,15 @@ DEFAULT_UNCOMPILED_MODULES = set((
 
 NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names (for testability).
+# TODO(mdan): Also allow controlling the generated names.
+# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
-    'from __future__ import print_function', 'import tensorflow as tf',
-    'from tensorflow.contrib.autograph.impl import api as '
-    'autograph_api',
-    'from tensorflow.contrib.autograph import utils as '
-    'autograph_utils')
+    'from __future__ import print_function',
+    'import tensorflow as tf',
+    'from tensorflow.contrib.autograph.impl import api'
+    ' as autograph_api',
+    'from tensorflow.contrib.autograph import utils'
+    ' as autograph_utils',
+    'from tensorflow.contrib.autograph import operators'
+    ' as __ops',
+)
diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
new file mode 100644
index 0000000000..7856c253bd
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "operators",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [],
+)
diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py
new file mode 100644
index 0000000000..c3f4cab69e
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This module implements operators that we overload.
+
+Note that "operator" is used loosely here, and includes control structures like
+conditionals and loops, implemented in functional form, using for example
+closures for the body.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
-- 
GitLab


From ed439fc7a70d6acf2f8eb59253b7ac073f23221b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 08:47:27 -0700
Subject: [PATCH 0294/1262] Fix docstring.

PiperOrigin-RevId: 191747417
---
 tensorflow/contrib/autograph/pyct/ast_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 4f76a69522..4a70bab440 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -28,7 +28,7 @@ from tensorflow.contrib.autograph.pyct import anno
 class CleanCopier(gast.NodeVisitor):
   """Copy AST nodes.
 
-  The copied nodes will ignore almost all fields that prefixed by '__'.
+  The copied nodes will ignore almost all fields that are prefixed by '__'.
   Exceptions make some annotations.
   """
 
-- 
GitLab


From 7162214b8acc23826f9b72fb6bb65fe4c4555c74 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 5 Apr 2018 08:47:47 -0700
Subject: [PATCH 0295/1262] Use constants in tf.zeros if the constant won't be
 too big.

Using fill saves on GraphDef size, but can slow down models since the
total number of ops is greater (fill + shape + constant op). This
change makes us only use fill for large shapes.

PiperOrigin-RevId: 191747456
---
 .../rnn/python/kernel_tests/rnn_cell_test.py  |  8 +++----
 tensorflow/python/ops/array_ops.py            | 23 +++++++++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 63fdd91d36..c7d85862f6 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -842,12 +842,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[6.450831e-04, 4.697885e-04], [9.862894e-05, 7.212213e-04],
-           [4.401947e-04, 9.143004e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[4.621217e-04, 3.365449e-04], [7.438179e-05, 5.439147e-04],
-           [3.347936e-04, 6.953785e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 68d446602e..fa26e07c85 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1566,6 +1566,16 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
 # pylint: enable=invalid-name
 
 
+def _constant_if_small(value, shape, dtype, name):
+  try:
+    if np.prod(shape) < 1000:
+      return constant(value, shape=shape, dtype=dtype, name=name)
+  except TypeError:
+    # Happens when shape is a Tensor, list with Tensor elements, etc.
+    pass
+  return None
+
+
 @tf_export("zeros")
 def zeros(shape, dtype=dtypes.float32, name=None):
   """Creates a tensor with all elements set to zero.
@@ -1596,8 +1606,15 @@ def zeros(shape, dtype=dtypes.float32, name=None):
       zero = ""
     else:
       zero = 0
+
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(zero, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
@@ -1729,6 +1746,12 @@ def ones(shape, dtype=dtypes.float32, name=None):
     one = True if dtype == dtypes.bool else 1
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(one, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
-- 
GitLab


From 435e3ba8d3898c94f114aaa99cbeb1e741985e58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebasti=C3=A1n=20Ram=C3=ADrez?= <tiangolo@gmail.com>
Date: Thu, 5 Apr 2018 20:11:52 +0400
Subject: [PATCH 0296/1262] Docker Jupyter: Update deprecated
 softmax_cross_entropy_with_logits (#17412)

* Docker Jupyter Notebooks: Update deprecated softmax_cross_entropy_with_logits

* Docker Jupyter Notebooks: Revert removing collapsed: false
---
 .../tools/docker/notebooks/3_mnist_from_scratch.ipynb       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 5585ebdcd3..824fe14560 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -1207,7 +1207,7 @@
    "source": [
     "# Training computation: logits + cross-entropy loss.\n",
     "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n",
+    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
     "  labels=train_labels_node, logits=logits))\n",
     "\n",
     "# L2 regularization for the fully connected parameters.\n",
@@ -2031,7 +2031,7 @@
    "views": {}
   },
   "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2049,5 +2049,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
-- 
GitLab


From e6225d9835f63729a9006f10ca9e50068381663d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 09:17:57 -0700
Subject: [PATCH 0297/1262] Fix typos in "Profile Model Float Operations"
 documentation.

PiperOrigin-RevId: 191751175
---
 .../g3doc/profile_model_architecture.md       | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
index 61bb66bd21..4ccd43ce68 100644
--- a/tensorflow/core/profiler/g3doc/profile_model_architecture.md
+++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
@@ -45,22 +45,22 @@ sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 
 For an operation to have float operation statistics:
 
-* It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
-use the definition to calculate float operations. Contributes are welcome.
-
-* It must have known "shape" information for RegisterStatistics('flops')
-to calculate the statistics. It is suggested to pass in `-run_meta_path` if
-shape is only known during runtime. tfprof can fill in the missing shape with
-the runtime shape information from RunMetadata.
-Hence, it is suggested to use `-account_displayed_op_only`
-option so that you know the statistics are only for the operations printed out.
-
-* If no RunMetadata provided, tfprof count float_ops of each graph node once,
-even if it is defined in tf.while_loop. This is because tfprof doesn't know
-how many times are run statically. If RunMetadata provided, tfprof calculate
-float_ops as float_ops * run_count.
-
-
+*   It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
+    uses the definition to calculate float operations. Contributions are
+    welcomed.
+
+*   It must have known "shape" information for RegisterStatistics('flops') to
+    calculate the statistics. It is suggested to pass in `-run_meta_path` if
+    shape is only known during runtime. tfprof can fill in the missing shape
+    with the runtime shape information from RunMetadata. Hence, it is suggested
+    to use `-account_displayed_op_only` option so that you know the statistics
+    are only for the operations printed out.
+
+*   If no RunMetadata is provided, tfprof counts float_ops of each graph node
+    once, even if it is defined in a tf.while_loop. This is because tfprof
+    doesn't know statically how many times each graph node is run. If
+    RunMetadata is provided, tfprof calculates float_ops as float_ops *
+    run_count.
 
 ```python
 # To profile float opertions in commandline, you need to pass --graph_path
-- 
GitLab


From 3fb89650a1e7f5cc4c04f091170fac504ba10021 Mon Sep 17 00:00:00 2001
From: Sherry Moore <sherrym@google.com>
Date: Thu, 5 Apr 2018 09:33:20 -0700
Subject: [PATCH 0298/1262] Added a call in
 CheckpointSaverHook.after_create_session to always save checkpoint before the
 first training step.

PiperOrigin-RevId: 191753026
---
 tensorflow/python/estimator/estimator_test.py |  2 +-
 .../estimator/replicate_model_fn_test.py      |  9 +++--
 .../training/basic_session_run_hooks.py       |  5 +++
 .../training/basic_session_run_hooks_test.py  | 37 ++++++++++++++++---
 4 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index f4255091bf..498f5294a4 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -680,7 +680,7 @@ class EstimatorTrainTest(test.TestCase):
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
     self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+        ['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
diff --git a/tensorflow/python/estimator/replicate_model_fn_test.py b/tensorflow/python/estimator/replicate_model_fn_test.py
index ad1f9c02b9..00035ef1fe 100644
--- a/tensorflow/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/python/estimator/replicate_model_fn_test.py
@@ -27,6 +27,7 @@ import six
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import replicate_model_fn
+from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator.canned import dnn
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.estimator.canned import prediction_keys
@@ -593,7 +594,8 @@ class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
         loss=loss,
         eval_metric_ops=metrics,
         predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
+        train_op=optimizer.minimize(
+            loss, global_step=training.get_global_step()))
 
   @property
   def params(self):
@@ -612,8 +614,9 @@ class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
       estimator = estimator_lib.Estimator(
           model_fn=self.model_fn,
           model_dir=tempfile.mkdtemp(),
-          params=self.params)
-      estimator.train(train_input_fn, steps=1)
+          params=self.params,
+          config=run_config.RunConfig(save_checkpoints_steps=1))
+      estimator.train(train_input_fn, steps=2)
 
       self.assertEqual(7.0, estimator.get_variable_value('c'))
 
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index aae757b99a..77d4f15d52 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -429,6 +429,11 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     for l in self._listeners:
       l.begin()
 
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    self._save(session, global_step)
+    self._timer.update_last_triggered_step(global_step)
+
   def before_run(self, run_context):  # pylint: disable=unused-argument
     if self._timer.last_triggered_step() is None:
       # We do write graph and saver_def at the first call of before_run.
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2547661e52..4bf4a599b4 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -466,8 +466,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener_counts)
 
@@ -490,8 +490,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener_counts)
 
@@ -523,8 +523,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 2,
-        'after_save': 2,
+        'before_save': 3,
+        'after_save': 3,
         'end': 1
     }, listener1_counts)
     self.assertEqual(listener1_counts, listener2_counts)
@@ -718,6 +718,31 @@ class CheckpointSaverHookTest(test.TestCase):
 
     fake_summary_writer.FakeSummaryWriter.uninstall()
 
+  def test_save_checkpoint_before_first_train_step(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir, save_steps=2, scaffold=self.scaffold)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        sess.run(self.scaffold.init_op)
+        hook.after_create_session(sess, None)
+        # Verifies that checkpoint is saved at step 0.
+        self.assertEqual(0,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+        # Verifies that no checkpoint is saved after one training step.
+        mon_sess.run(self.train_op)
+        self.assertEqual(0,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+        # Verifies that checkpoint is saved after save_steps.
+        mon_sess.run(self.train_op)
+        self.assertEqual(2,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
 
 class ResourceCheckpointSaverHookTest(test.TestCase):
 
-- 
GitLab


From 4a860885b92784c01d1e21e0b069fa5328696af0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 09:38:40 -0700
Subject: [PATCH 0299/1262] Document expected regular structure of the
 statistical testing library.

PiperOrigin-RevId: 191753693
---
 .../python/ops/statistical_testing.py         | 111 +++++++++++++++++-
 1 file changed, 109 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
index d66c34cc1a..5c52015e5f 100644
--- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -12,7 +12,114 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Statistical test assertions calibrated for their error rates."""
+"""Statistical test assertions calibrated for their error rates.
+
+Statistical tests have an inescapable probability of error: a correct
+sampler can still fail a test by chance, and an incorrect sampler can
+still pass a test by chance.  This library is about bounding both of
+those error rates.  This requires admitting a task-specific notion of
+"discrepancy": Correct code will fail rarely, code that misbehaves by
+more than the discrepancy will pass rarely, and nothing reliable can
+be said about code that misbehaves, but misbehaves by less than the
+discrepancy.
+
+# Example
+
+Consider testing that the mean of a scalar probability distribution P
+is some expected constant.  Suppose the support of P is the interval
+`[0, 1]`.  Then you might do this:
+
+```python
+tfd = tf.contrib.distributions
+
+expected_mean = ...
+num_samples = 5000
+samples = ... draw 5000 samples from P
+
+# Check that the mean looks right
+check1 = tfd.assert_true_mean_equal_by_dkwm(
+    samples, low=0., high=1., expected=expected_mean,
+    false_fail_rate=1e-6)
+
+# Check that the difference in means detectable with 5000 samples is
+# small enough
+check2 = tf.assert_less(
+    tfd.min_discrepancy_of_true_means_detectable_by_dkwm(
+        num_samples, low=0., high=1.0,
+        false_fail_rate=1e-6, false_pass_rate=1e-6),
+    0.01)
+
+# Be sure to execute both assertion ops
+sess.run([check1, check2])
+```
+
+The second assertion is an instance of experiment design.  It's a
+deterministic computation (independent of the code under test) that
+checks that `5000` samples is enough to reliably resolve mean
+differences of `0.01` or more.  Here "reliably" means that if the code
+under test is correct, the probability of drawing an unlucky sample
+that causes this test to fail is at most 1e-6; and if the code under
+test is incorrect enough that its true mean is 0.01 more or less than
+expected, then the probability of drawing a "lucky" sample that causes
+the test to false-pass is also at most 1e-6.
+
+# Overview
+
+Every function in this library can be characterized in terms of:
+
+- The property being tested, such as the full density of the
+  distribution under test, or just its true mean, or a single
+  Bernoulli probability, etc.
+
+- The relation being asserted, e.g., whether the mean is less, more,
+  or equal to the given expected value.
+
+- The stochastic bound being relied upon, such as the
+  [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
+  or the CDF of the binomial distribution (for assertions about
+  Bernoulli probabilities).
+
+- The number of sample sets in the statistical test.  For example,
+  testing equality of means has a one-sample variant, where the
+  expected mean is given exactly, and a two-sample variant, where the
+  expected mean is itself given by a set of samples (e.g., from an
+  alternative algorithm).
+
+- What operation(s) of the test are to be performed.  Each test has
+  three of these:
+
+  1. `assert` executes the test.  Specifically, it creates a TF op that
+     produces an error if it has enough evidence to prove that the
+     property under test is violated.  These functions depend on the
+     desired false failure rate, because that determines the sizes of
+     appropriate confidence intervals, etc.
+
+  2. `min_discrepancy` computes the smallest difference reliably
+     detectable by that test, given the sample count and error rates.
+     What it's a difference of is test-specific.  For example, a test
+     for equality of means would make detection guarantees about the
+     difference the true means.
+
+  3. `min_num_samples` computes the minimum number of samples needed
+     to reliably detect a given discrepancy with given error rates.
+
+  The latter two are for experimental design, and are meant to be
+  usable either interactively or inline in the overall test method.
+
+This library follows a naming convention, to make room for every
+combination of the above.  A name mentions the operation first, then
+the property, then the relation, then the bound, then, if the test
+takes more than one set of samples, a token indicating this.  For
+example, `assert_true_mean_equal_by_dkwm` (which is implicitly
+one-sample).  Each name is a grammatically sound noun phrase (or verb
+phrase, for the asserts).
+
+# Asymptotic properties
+
+The number of samples needed tends to scale as `O(1/discrepancy**2)` and
+as `O(log(1/error_rate))`.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -40,7 +147,7 @@ __all__ = [
 
 
 def _batch_sort_vector(x, ascending=True, name=None):
-  with ops.name_scope(name, "sort_each_row", [x]):
+  with ops.name_scope(name, "_batch_sort_vector", [x]):
     x = ops.convert_to_tensor(x, name="x")
     n = array_ops.shape(x)[-1]
     if ascending:
-- 
GitLab


From cb4cc1b3d7be8f2017bc81235afb6975210001ab Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Thu, 5 Apr 2018 09:41:31 -0700
Subject: [PATCH 0300/1262] Refine BatchReshape error messages.

PiperOrigin-RevId: 191754120
---
 .../distributions/python/kernel_tests/batch_reshape_test.py  | 4 ++--
 tensorflow/contrib/distributions/python/ops/batch_reshape.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
index c6c8d2cf6e..59d549b7b8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -536,14 +536,14 @@ class _BatchReshapeTest(object):
 
     if self.is_static_shape:
       with self.assertRaisesRegexp(NotImplementedError,
-                                   "too few event dims"):
+                                   "too few batch and event dims"):
         poisson_141_reshaped.log_prob(x_4)
       with self.assertRaisesRegexp(NotImplementedError,
                                    "unexpected batch and event shape"):
         poisson_141_reshaped.log_prob(x_114)
       return
 
-    with self.assertRaisesOpError("too few event dims"):
+    with self.assertRaisesOpError("too few batch and event dims"):
       with self.test_session():
         poisson_141_reshaped.log_prob(x_4).eval()
 
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
index 3e6c35e0d6..bf5590cd55 100644
--- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -290,7 +290,7 @@ class BatchReshape(distribution_lib.Distribution):
           isinstance(expected_batch_event_ndims, int)):
         if x_ndims < expected_batch_event_ndims:
           raise NotImplementedError(
-              "Broadcasting is not supported; too few event dims "
+              "Broadcasting is not supported; too few batch and event dims "
               "(expected at least {}, saw {}).".format(
                   expected_batch_event_ndims, x_ndims))
         ndims_assertion = []
@@ -299,7 +299,8 @@ class BatchReshape(distribution_lib.Distribution):
             check_ops.assert_greater_equal(
                 x_ndims,
                 expected_batch_event_ndims,
-                message="Broadcasting is not supported; too few event dims.",
+                message=("Broadcasting is not supported; too few "
+                         "batch and event dims."),
                 name="assert_batch_and_event_ndims_large_enough"),
         ]
 
-- 
GitLab


From 16b233c43fbfc366a3ca3cebb2c5a5e32354263e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 09:56:50 -0700
Subject: [PATCH 0301/1262] Include the operators module in the test framework
 as well.

PiperOrigin-RevId: 191756100
---
 tensorflow/contrib/autograph/converters/BUILD             | 1 +
 .../contrib/autograph/converters/converter_test_base.py   | 2 ++
 tensorflow/contrib/autograph/impl/api_test.py             | 8 ++++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index c5a0dc1095..92cca30df4 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -49,6 +49,7 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":converters",
+        "//tensorflow/contrib/autograph/operators",
         "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/contrib/autograph/pyct/static_analysis",
         "//tensorflow/contrib/autograph/utils",
diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 3ea2cfd668..984e72c70c 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import contextlib
 import imp
 
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.pyct import compiler
 from tensorflow.contrib.autograph.pyct import context
@@ -77,6 +78,7 @@ class TestCase(test.TestCase):
       result.tf = self.make_fake_mod('fake_tf', *symbols)
       result.autograph_utils = utils
       result.autograph_api = self.make_fake_mod('fake_api', converted_call)
+      result.__ops = operators  # pylint:disable=protected-access
       yield result
     except Exception:  # pylint:disable=broad-except
       if source is None:
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index ee2d301d75..f156a87a95 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -37,8 +37,12 @@ class ApiTest(test.TestCase):
   def setUp(self):
     config.COMPILED_IMPORT_STATEMENTS = (
         'from __future__ import print_function',
-        'from tensorflow.contrib.autograph import utils as '
-        'autograph_utils', 'tf = autograph_utils.fake_tf()')
+        'from tensorflow.contrib.autograph import utils'
+        ' as autograph_utils',
+        'from tensorflow.contrib.autograph import operators'
+        ' as __ops',
+        'tf = autograph_utils.fake_tf()',
+    )
 
   def test_decorator_recurses(self):
 
-- 
GitLab


From a0c80b9a54dc9669c0f5d151bee9f0b3a4fd71a0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 09:57:58 -0700
Subject: [PATCH 0302/1262] Expand activity analysis to the test nodes of if
 and while statements.

PiperOrigin-RevId: 191756234
---
 .../autograph/pyct/static_analysis/activity.py | 18 +++++++++++++++---
 .../pyct/static_analysis/activity_test.py      |  2 ++
 .../autograph/pyct/static_analysis/annos.py    |  1 +
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index da6a2f6f05..6dd53091fa 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -265,10 +265,10 @@ class ActivityAnalizer(transformer.Base):
       qn = QN(node.name)
       self.scope.mark_write(qn)
     current_scope = self.scope
-    fndef_scope = Scope(current_scope, isolated=True)
-    self.scope = fndef_scope
+    body_scope = Scope(current_scope, isolated=True)
+    self.scope = body_scope
     self.generic_visit(node)
-    anno.setanno(node, NodeAnno.BODY_SCOPE, fndef_scope)
+    anno.setanno(node, NodeAnno.BODY_SCOPE, body_scope)
     self.scope = current_scope
     return node
 
@@ -282,7 +282,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_If(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
@@ -297,7 +303,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_While(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index 37c28872bb..1e6c686b01 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -204,6 +204,8 @@ class ActivityAnalizerTest(test.TestCase):
     self.assertScopeIsRmc(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
         ('b', 'c'), ('a', 'b', 'c'))
+    self.assertScopeIsRmc(
+        anno.getanno(while_node, NodeAnno.COND_SCOPE), ('b',), (), ())
 
   def test_for(self):
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index 5254b83ca7..d6d9f7e1a6 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -43,6 +43,7 @@ class NodeAnno(NoValue):
   # Scopes
   # Scopes are represented by objects of type activity.Scope.
   ARGS_SCOPE = 'The scope for the argument list of a function call.'
+  COND_SCOPE = 'The scope for the test node of a conditional statement.'
   BODY_SCOPE = (
       'The scope for the main body of a statement (True branch for if '
       'statements, main body for loops).')
-- 
GitLab


From 14241b17aae754e2a64c8a350caf63e6572fe9cd Mon Sep 17 00:00:00 2001
From: Jayaram Bobba <jayaram.bobba@intel.com>
Date: Thu, 5 Apr 2018 10:28:59 -0700
Subject: [PATCH 0303/1262] [Intel MKL] Change inter op defaults when built
 with MKL  (#17931)

* Change inter op defaults when built with MKL to avoid thread oversubscription

* Bump up default mkl inter_op to be less conservative
---
 .../core/common_runtime/process_util.cc       | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index d5bd7f8b98..7ff360ee26 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 
+#ifdef INTEL_MKL
+#include <omp.h>
+#endif
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -47,10 +50,24 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
 }
 
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
-  const int32 t = options.config.inter_op_parallelism_threads();
-  if (t != 0) return t;
+  const int32 inter_op = options.config.inter_op_parallelism_threads();
+  if (inter_op != 0) return inter_op;
+#ifdef INTEL_MKL
+  // MKL library executes ops in parallel using OMP threads
+  // Set inter_op conservatively to avoid thread oversubscription that could 
+  // lead to severe perf degradations and OMP resource exhaustion
+  const int mkl_intra_op = omp_get_max_threads();
+  CHECK_GE(mkl_intra_op, 1);
+  const int32 mkl_inter_op = std::max(
+          (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+  VLOG(0) << "Creating new thread pool with default inter op setting: "
+          << mkl_inter_op
+          << ". Tune using inter_op_parallelism_threads for best performance.";
+  return mkl_inter_op;
+#else
   // Default to using the number of cores available in the process.
   return port::NumSchedulableCPUs();
+#endif
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
-- 
GitLab


From de61d322391a824c9dd97b5b4913b45f8a12539d Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Fri, 6 Apr 2018 01:29:32 +0800
Subject: [PATCH 0304/1262] Fix some rendering format in contrib doc strings
 (#18148)

* Fix some rendering format in contrib doc strings

* Fix line too long pylint error
---
 .../contrib/kernel_methods/python/losses.py   |  6 +-
 .../python/mappers/random_fourier_features.py | 16 ++--
 .../mappers/random_fourier_features_test.py   |  2 +-
 .../contrib/kfac/python/ops/fisher_blocks.py  | 82 +++++++++----------
 .../seq2seq/python/ops/attention_wrapper.py   |  4 +-
 tensorflow/contrib/sparsemax/__init__.py      |  2 +-
 .../contrib/sparsemax/python/ops/sparsemax.py |  2 +-
 7 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index f182fef067..4ef0a66a52 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
 
   This is a generalization of standard (binary) hinge loss. For a given instance
   with correct label c*, the loss is given by:
-    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+    $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
   or equivalently
-    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
-  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+    $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
+  where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
 
   Args:
     labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 9dc01124ab..091f0a1098 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -35,23 +35,23 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
   ```
-  exp(-||x-y||_2^2 / (2 * sigma^2))
+  $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
   ```
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
-  where `d` is the input dimension (number of dense input features) and `D` is
-  the output dimension (i.e., dimension of the feature space the input is mapped
-  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
-  distribution and each entry of `b` is sampled independently and uniformly from
-  [0, 2 * pi].
+  The mapping uses a matrix `\\(Omega \in R^{d x D}\\)` and a bias vector
+  `\\(b \in R^D\\)` where `d` is the input dimension (number of dense input
+  features) and `D` is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of `Omega` is sampled i.i.d. from a
+  (scaled) Gaussian distribution and each entry of `b` is sampled independently
+  and uniformly from [0, \\(2 * pi\\)].
 
   For a single input feature vector x in R^d, its RFFM is defined as:
   ```
-      sqrt(2/D) * cos(x * Omega + b)
+      $$sqrt(2/D) * cos(x * Omega + b)$$
   ```
   where `cos` is the element-wise cosine function and `x, b` are represented as
   row vectors. The aforementioned paper shows that the linear kernel of
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
index 6f4a264485..91929184a2 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@@ -34,7 +34,7 @@ def _inner_product(x, y):
   """Inner product between tensors x and y.
 
   The input tensors are assumed to be in ROW representation, that is, the method
-  returns x * y^T.
+  returns \\(x * y^T\\).
 
   Args:
     x: input tensor in row format
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index e0d9cb5ea9..00b3673a74 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior
 distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
 Fisher Information matrix is given by,
 
-  F(params) = E[ v(x, y, params) v(x, y, params)^T ]
+  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
 
 where,
 
-  v(x, y, params) = (d / d params) log p(y | x, params)
+  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
 
 and the expectation is taken with respect to the data's distribution for 'x' and
 the model's posterior distribution for 'y',
@@ -85,7 +85,7 @@ def normalize_damping(damping, num_replications):
 def compute_pi_tracenorm(left_cov, right_cov):
   """Computes the scalar constant pi for Tikhonov regularization/damping.
 
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
   See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
 
   Args:
@@ -462,14 +462,14 @@ class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider fully connected layer in this model with (unshared) weight matrix
   'w'. For an example 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( a (d loss / d s)^T )
+    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
 
   This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
   to the layer's parameters 'w'.
@@ -532,14 +532,14 @@ class ConvDiagonalFB(InputOutputMultiTower, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider a convoluational layer in this model with (unshared) filter matrix
   'w'. For an example image 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )
+    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
 
   where 'loc' is a single (x, y) location in an image.
 
@@ -805,12 +805,12 @@ class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
   'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
   this FisherBlock estimates,
 
-    F(w) = #locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])
+    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
+                                  E[flat(ds) flat(ds)^T])$$
 
   where
 
-    ds = (d / ds) log p(y | x, w)
+    $$ds = (d / ds) log p(y | x, w)$$
     #locations = number of (x, y) locations where 'w' is applied.
 
   where the expectation is taken over all examples and locations and flat()
@@ -1567,7 +1567,7 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
 
     if self._option == SeriesFBApproximation.option1:
 
-      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
       L_A, psi_A = self._input_factor.get_option1quants(
           self._input_damping_func)
       L_G, psi_G = self._output_factor.get_option1quants(
@@ -1581,33 +1581,33 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
         T = self._num_timesteps
         return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
 
-      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
       # Even though Y is Z-independent we are recomputing it from the psi's
       # each since Y depends on both A and G quantities, and it is relatively
       # cheap to compute.
       Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
 
-      # Z = L_G^T * Z * L_A
+      # \\(Z = L_G^T * Z * L_A\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = U_G^T * Z * U_A
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = U_G^T * Z * U_A\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
 
-      # Z = Z .* Y
+      # \\(Z = Z .* Y\\)
       Z *= Y
 
-      # Z = L_G * Z * L_A^T
+      # \\(Z = L_G * Z * L_A^T\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = U_G * Z * U_A^T
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = U_G * Z * U_A^T\\)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
 
     elif self._option == SeriesFBApproximation.option2:
 
-      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
-      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
+      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
       P_A, K_A, mu_A = self._input_factor.get_option2quants(
           self._input_damping_func)
       P_G, K_G, mu_G = self._output_factor.get_option2quants(
@@ -1616,26 +1616,26 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
       # Our approach differs superficially from the pseudo-code in the paper
       # in order to reduce the total number of matrix-matrix multiplies.
       # In particular, the first three computations in the pseudo code are
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = Z - hPsi_G^T * Z * hPsi_A
-      # Z = E_G^T * Z * E_A
-      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
-      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
+      # \\(Z = E_G^T * Z * E_A\\)
+      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
+      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
       # the entire computation can be written as
-      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
-      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
-      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
-      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
-      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
+      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
+      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
+      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
+      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
       # This final expression is computed by the following two lines:
-      # Z = Z - P_G * Z * P_A^T
+      # \\(Z = Z - P_G * Z * P_A^T\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # Z = K_G^T * Z * K_A
+      # \\(Z = K_G^T * Z * K_A\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
 
-      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
       # Be careful with the outer product.  We don't want to accidentally
       # make it an inner-product instead.
       tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@@ -1646,13 +1646,13 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
       # We now perform the transpose/reverse version of the operations
       # derived above, whose derivation from the original pseudo-code is
       # analgous.
-      # Z = K_G * Z * K_A^T
+      # \\(Z = K_G * Z * K_A^T\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
 
-      # Z = Z - P_G^T * Z * P_A
+      # \\(Z = Z - P_G^T * Z * P_A\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
 
-      # Z = normalize (1/E[T]) * Z
+      # \\(Z = normalize (1/E[T]) * Z\\)
       # Note that this normalization is done because we compute the statistics
       # by averaging, not summing, over time. (And the gradient is presumably
       # summed over time, not averaged, and thus their scales are different.)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 9e0d69593f..f0f143ddfc 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -610,8 +610,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
   addition, once an input sequence element is attended to at a given output
   timestep, elements occurring before it cannot be attended to at subsequent
   output timesteps.  This function generates attention distributions according
-  to these assumptions.  For more information, see ``Online and Linear-Time
-  Attention by Enforcing Monotonic Alignments''.
+  to these assumptions.  For more information, see `Online and Linear-Time
+  Attention by Enforcing Monotonic Alignments`.
 
   Args:
     p_choose_i: Probability of choosing input sequence/memory element i.  Should
diff --git a/tensorflow/contrib/sparsemax/__init__.py b/tensorflow/contrib/sparsemax/__init__.py
index 19d213fb3e..7bc726f4a8 100644
--- a/tensorflow/contrib/sparsemax/__init__.py
+++ b/tensorflow/contrib/sparsemax/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Module that implements sparsemax and sparsemax loss, see [1].
 
-[1] https://arxiv.org/abs/1602.02068
+[1]: https://arxiv.org/abs/1602.02068
 
 ## Sparsemax
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 890ca20f4c..e617af2ff1 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -31,7 +31,7 @@ def sparsemax(logits, name=None):
   """Computes sparsemax activations [1].
 
   For each batch `i` and class `j` we have
-    sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)
+    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
 
   [1]: https://arxiv.org/abs/1602.02068
 
-- 
GitLab


From 1450515b6d5b664c80cfa56648d0318bbefadfe8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 5 Apr 2018 10:29:15 -0700
Subject: [PATCH 0305/1262] Inline more functions

PiperOrigin-RevId: 191761109
---
 .../core/grappler/optimizers/function_optimizer.cc    | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index f1da469a6c..1dd75db30f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -62,12 +62,6 @@ class FunctionInliningContext {
       if (func.attr().count("_noinline") != 0) {
         continue;
       }
-      // Don't touch anything marked XLA to prevent XLA failures further down
-      // the road.
-      if (func.attr().count("_XlaCompile") > 0 &&
-          func.attr().at("_XlaCompile").b()) {
-        continue;
-      }
       // Can't create IdentityN nodes with no input or output: skip these
       // functions for now.
       if (func.signature().input_arg_size() == 0 ||
@@ -206,6 +200,11 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
       TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
                                         ctx, optimized_graph));
     } else {
+      // Annotate the node with the function attributes.
+      for (const auto& attr : func.attr()) {
+        func_body_node.mutable_attr()->insert(attr);
+      }
+
       // Move the node to the main graph
       optimized_graph->add_node()->Swap(&func_body_node);
     }
-- 
GitLab


From 3ccb596e67af00d1b11a8d38fe5a65970725f93c Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Thu, 5 Apr 2018 10:41:40 -0700
Subject: [PATCH 0306/1262] Sync only the convolutional_recurrent file to Keras
 2.1.5.

PiperOrigin-RevId: 191763101
---
 .../keras/layers/convolutional_recurrent.py   | 1222 ++++++++++++-----
 .../layers/convolutional_recurrent_test.py    |    1 +
 ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt |  114 +-
 3 files changed, 933 insertions(+), 404 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index b78962d66a..6b2a1d98fe 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
 """Convolutional-recurrent layers.
 """
 from __future__ import absolute_import
@@ -26,181 +27,456 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
+from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask
+from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-class ConvRecurrent2D(Recurrent):
-  """Abstract base class for convolutional recurrent layers.
-
-  Do not use in a model -- it's not a functional layer!
+class ConvRNN2D(RNN):
+  """Base class for convolutional-recurrent layers.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of n integers, specifying the
-          dimensions of the convolution window.
-      strides: An integer or tuple/list of n integers,
-          specifying the strides of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of n integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
+    cell: A RNN cell instance. A RNN cell is a class that has:
+        - a `call(input_at_t, states_at_t)` method, returning
+            `(output_at_t, states_at_t_plus_1)`. The call method of the
+            cell can also take the optional argument `constants`, see
+            section "Note on passing external constants" below.
+        - a `state_size` attribute. This can be a single integer
+            (single state) in which case it is
+            the number of channels of the recurrent state
+            (which should be the same as the number of channels of the cell
+            output). This can also be a list/tuple of integers
+            (one size per state). In this case, the first entry
+            (`state_size[0]`) should be the same as
+            the size of the cell output.
+    return_sequences: Boolean. Whether to return the last output.
+        in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+        in addition to the output.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards and return the
+        reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    input_shape: Use this argument to specify the shape of the
+        input when this layer is the first one in a model.
 
   Input shape:
-      5D tensor with shape `(num_samples, timesteps, channels, rows, cols)`.
+    5D tensor with shape:
+    `(samples, timesteps, channels, rows, cols)`
+    if data_format='channels_first' or 5D tensor with shape:
+    `(samples, timesteps, rows, cols, channels)`
+    if data_format='channels_last'.
 
   Output shape:
-      - if `return_sequences`: 5D tensor with shape
-          `(num_samples, timesteps, channels, rows, cols)`.
-      - else, 4D tensor with shape `(num_samples, channels, rows, cols)`.
-
-  # Masking
-      This layer supports masking for input data with a variable number
-      of timesteps. To introduce masks to your data,
-      use an `Embedding` layer with the `mask_zero` parameter
-      set to `True`.
-      **Note:** for the time being, masking is only supported with Theano.
-
-  # Note on using statefulness in RNNs
-      You can set RNN layers to be 'stateful', which means that the states
-      computed for the samples in one batch will be reused as initial states
-      for the samples in the next batch.
-      This assumes a one-to-one mapping between
-      samples in different successive batches.
-
-      To enable statefulness:
-          - specify `stateful=True` in the layer constructor.
-          - specify a fixed batch size for your model, by passing
-              a `batch_input_size=(...)` to the first layer in your model.
-              This is the expected shape of your inputs *including the batch
-              size*.
-              It should be a tuple of integers, e.g. `(32, 10, 100)`.
-
-      To reset the states of your model, call `.reset_states()` on either
-      a specific layer, or on your entire model.
+    - if `return_state`: a list of tensors. The first tensor is
+        the output. The remaining tensors are the last states,
+        each 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to padding.
+    - if `return_sequences`: 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+    - else, 4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+
+  Masking:
+    This layer supports masking for input data with a variable number
+    of timesteps. To introduce masks to your data,
+    use an Embedding layer with the `mask_zero` parameter
+    set to `True`.
+
+  Note on using statefulness in RNNs:
+    You can set RNN layers to be 'stateful', which means that the states
+    computed for the samples in one batch will be reused as initial states
+    for the samples in the next batch. This assumes a one-to-one mapping
+    between samples in different successive batches.
+    To enable statefulness:
+        - specify `stateful=True` in the layer constructor.
+        - specify a fixed batch size for your model, by passing
+             - if sequential model:
+                `batch_input_shape=(...)` to the first layer in your model.
+             - if functional model with 1 or more Input layers:
+                `batch_shape=(...)` to all the first layers in your model.
+                This is the expected shape of your inputs
+                *including the batch size*.
+                It should be a tuple of integers,
+                e.g. `(32, 10, 100, 100, 32)`.
+                Note that the number of rows and columns should be specified
+                too.
+        - specify `shuffle=False` when calling fit().
+    To reset the states of your model, call `.reset_states()` on either
+    a specific layer, or on your entire model.
+
+  Note on specifying the initial state of RNNs:
+    You can specify the initial state of RNN layers symbolically by
+    calling them with the keyword argument `initial_state`. The value of
+    `initial_state` should be a tensor or list of tensors representing
+    the initial state of the RNN layer.
+    You can specify the initial state of RNN layers numerically by
+    calling `reset_states` with the keyword argument `states`. The value of
+    `states` should be a numpy array or list of numpy arrays representing
+    the initial state of the RNN layer.
+
+  Note on passing external constants to RNNs:
+    You can pass "external" constants to the cell using the `constants`
+    keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
+    requires that the `cell.call` method accepts the same keyword argument
+    `constants`. Such constants can be used to condition the cell
+    transformation on additional static inputs (not changing over time),
+    a.k.a. an attention mechanism.
   """
 
   def __init__(self,
-               filters,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               data_format=None,
-               dilation_rate=(1, 1),
+               cell,
                return_sequences=False,
+               return_state=False,
                go_backwards=False,
                stateful=False,
+               unroll=False,
                **kwargs):
-    super(ConvRecurrent2D, self).__init__(**kwargs)
-    self.filters = filters
-    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
-    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
-                                                    'dilation_rate')
-    self.return_sequences = return_sequences
-    self.go_backwards = go_backwards
-    self.stateful = stateful
+    if unroll:
+      raise TypeError('Unrolling isn\'t possible with '
+                      'convolutional RNNs.')
+    if isinstance(cell, (list, tuple)):
+      # The StackedConvRNN2DCells isn't implemented yet.
+      raise TypeError('It is not possible at the moment to'
+                      'stack convolutional cells.')
+    super(ConvRNN2D, self).__init__(cell,
+                                    return_sequences,
+                                    return_state,
+                                    go_backwards,
+                                    stateful,
+                                    unroll,
+                                    **kwargs)
     self.input_spec = [InputSpec(ndim=5)]
-    self.state_spec = None
+    self.states = None
 
   @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    if self.data_format == 'channels_first':
+
+    cell = self.cell
+    if cell.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
-    elif self.data_format == 'channels_last':
+    elif cell.data_format == 'channels_last':
       rows = input_shape[2]
       cols = input_shape[3]
-    rows = conv_utils.conv_output_length(
-        rows,
-        self.kernel_size[0],
-        padding=self.padding,
-        stride=self.strides[0],
-        dilation=self.dilation_rate[0])
-    cols = conv_utils.conv_output_length(
-        cols,
-        self.kernel_size[1],
-        padding=self.padding,
-        stride=self.strides[1],
-        dilation=self.dilation_rate[1])
+    rows = conv_utils.conv_output_length(rows,
+                                         cell.kernel_size[0],
+                                         padding=cell.padding,
+                                         stride=cell.strides[0],
+                                         dilation=cell.dilation_rate[0])
+    cols = conv_utils.conv_output_length(cols,
+                                         cell.kernel_size[1],
+                                         padding=cell.padding,
+                                         stride=cell.strides[1],
+                                         dilation=cell.dilation_rate[1])
+
+    if cell.data_format == 'channels_first':
+      output_shape = input_shape[:2] + (cell.filters, rows, cols)
+    elif cell.data_format == 'channels_last':
+      output_shape = input_shape[:2] + (rows, cols, cell.filters)
+
+    if not self.return_sequences:
+      output_shape = output_shape[:1] + output_shape[2:]
+
+    if self.return_state:
+      output_shape = [output_shape]
+      if cell.data_format == 'channels_first':
+        output_shape += [(input_shape[0], cell.filters, rows, cols)
+                         for _ in range(2)]
+      elif cell.data_format == 'channels_last':
+        output_shape += [(input_shape[0], rows, cols, cell.filters)
+                         for _ in range(2)]
+    return output_shape
+
+  @shape_type_conversion
+  def build(self, input_shape):
+    # Note input_shape will be list of shapes of initial states and
+    # constants if these are passed in __call__.
+    if self._num_constants is not None:
+      constants_shape = input_shape[-self._num_constants:]
+    else:
+      constants_shape = None
+
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:5])
+
+    # allow cell (if layer) to build before we set or validate state_spec
+    if isinstance(self.cell, Layer):
+      step_input_shape = (input_shape[0],) + input_shape[2:]
+      if constants_shape is not None:
+        self.cell.build([step_input_shape] + constants_shape)
+      else:
+        self.cell.build(step_input_shape)
+
+    # set or validate state_spec
+    if hasattr(self.cell.state_size, '__len__'):
+      state_size = list(self.cell.state_size)
+    else:
+      state_size = [self.cell.state_size]
+
+    if self.state_spec is not None:
+      # initial_state was passed in call, check compatibility
+      if self.cell.data_format == 'channels_first':
+        ch_dim = 1
+      elif self.cell.data_format == 'channels_last':
+        ch_dim = 3
+      if [spec.shape[ch_dim] for spec in self.state_spec] != state_size:
+        raise ValueError(
+            'An initial_state was passed that is not compatible with '
+            '`cell.state_size`. Received `state_spec`={}; '
+            'However `cell.state_size` is '
+            '{}'.format([spec.shape for spec in self.state_spec],
+                        self.cell.state_size))
+    else:
+      if self.cell.data_format == 'channels_first':
+        self.state_spec = [InputSpec(shape=(None, dim, None, None))
+                           for dim in state_size]
+      elif self.cell.data_format == 'channels_last':
+        self.state_spec = [InputSpec(shape=(None, None, None, dim))
+                           for dim in state_size]
+    if self.stateful:
+      self.reset_states()
+    self.built = True
+
+  def get_initial_state(self, inputs):
+    # (samples, timesteps, rows, cols, filters)
+    initial_state = K.zeros_like(inputs)
+    # (samples, rows, cols, filters)
+    initial_state = K.sum(initial_state, axis=1)
+    shape = list(self.cell.kernel_shape)
+    shape[-1] = self.cell.filters
+    initial_state = self.cell.input_conv(initial_state,
+                                         K.zeros(tuple(shape)),
+                                         padding=self.cell.padding)
+
+    if hasattr(self.cell.state_size, '__len__'):
+      return [initial_state for _ in self.cell.state_size]
+    else:
+      return [initial_state]
+
+  def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
+    inputs, initial_state, constants = self._standardize_args(
+        inputs, initial_state, constants)
+
+    if initial_state is None and constants is None:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+    # If any of `initial_state` or `constants` are specified and are Keras
+    # tensors, then add them to the inputs and temporarily modify the
+    # input_spec to include them.
+
+    additional_inputs = []
+    additional_specs = []
+    if initial_state is not None:
+      kwargs['initial_state'] = initial_state
+      additional_inputs += initial_state
+      self.state_spec = []
+      for state in initial_state:
+        shape = K.int_shape(state)
+        self.state_spec.append(InputSpec(shape=shape))
+
+      additional_specs += self.state_spec
+    if constants is not None:
+      kwargs['constants'] = constants
+      additional_inputs += constants
+      self.constants_spec = [InputSpec(shape=K.int_shape(constant))
+                             for constant in constants]
+      self._num_constants = len(constants)
+      additional_specs += self.constants_spec
+    # at this point additional_inputs cannot be empty
+    for tensor in additional_inputs:
+      if K.is_keras_tensor(tensor) != K.is_keras_tensor(additional_inputs[0]):
+        raise ValueError('The initial state or constants of an RNN'
+                         ' layer cannot be specified with a mix of'
+                         ' Keras tensors and non-Keras tensors')
+
+    if K.is_keras_tensor(additional_inputs[0]):
+      # Compute the full input spec, including state and constants
+      full_input = [inputs] + additional_inputs
+      full_input_spec = self.input_spec + additional_specs
+      # Perform the call with temporarily replaced input_spec
+      original_input_spec = self.input_spec
+      self.input_spec = full_input_spec
+      output = super(ConvRNN2D, self).__call__(full_input, **kwargs)
+      self.input_spec = original_input_spec
+      return output
+    else:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           initial_state=None,
+           constants=None):
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' +
+                       str(len(initial_state)) +
+                       ' initial states.')
+    timesteps = K.int_shape(inputs)[1]
+
+    kwargs = {}
+    if generic_utils.has_arg(self.cell.call, 'training'):
+      kwargs['training'] = training
+
+    if constants:
+      if not generic_utils.has_arg(self.cell.call, 'constants'):
+        raise ValueError('RNN cell does not support constants')
+
+      def step(inputs, states):
+        constants = states[-self._num_constants:]
+        states = states[:-self._num_constants]
+        return self.cell.call(inputs, states, constants=constants,
+                              **kwargs)
+    else:
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+    last_output, outputs, states = K.rnn(step,
+                                         inputs,
+                                         initial_state,
+                                         constants=constants,
+                                         go_backwards=self.go_backwards,
+                                         mask=mask,
+                                         input_length=timesteps)
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(K.update(self.states[i], states[i]))
+      self.add_update(updates, inputs=True)
+
     if self.return_sequences:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
-                        cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], input_shape[1], rows, cols,
-                        self.filters)
+      output = outputs
     else:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], self.filters, rows, cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], rows, cols, self.filters)
+      output = last_output
+
+    # Properly set learning phase
+    if getattr(last_output, '_uses_learning_phase', False):
+      output._uses_learning_phase = True
 
     if self.return_state:
-      if self.data_format == 'channels_first':
-        output_shape = [output_shape] + [
-            (input_shape[0], self.filters, rows, cols) for _ in range(2)
-        ]
-      elif self.data_format == 'channels_last':
-        output_shape = [output_shape] + [
-            (input_shape[0], rows, cols, self.filters) for _ in range(2)
-        ]
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      else:
+        states = list(states)
+      return [output] + states
+    else:
+      return output
 
-    return output_shape
+  def reset_states(self, states=None):
+    if not self.stateful:
+      raise AttributeError('Layer must be stateful.')
+    input_shape = self.input_spec[0].shape
+    state_shape = self.compute_output_shape(input_shape)
+    if self.return_state:
+      state_shape = state_shape[0]
+    if self.return_sequences:
+      state_shape = state_shape[:1].concatenate(state_shape[2:])
+    if None in state_shape:
+      raise ValueError('If a RNN is stateful, it needs to know '
+                       'its batch size. Specify the batch size '
+                       'of your input tensors: \n'
+                       '- If using a Sequential model, '
+                       'specify the batch size by passing '
+                       'a `batch_input_shape` '
+                       'argument to your first layer.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a '
+                       '`batch_shape` argument to your Input layer.\n'
+                       'The same thing goes for the number of rows and '
+                       'columns.')
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'return_sequences': self.return_sequences,
-        'go_backwards': self.go_backwards,
-        'stateful': self.stateful
-    }
-    base_config = super(ConvRecurrent2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    # helper function
+    def get_tuple_shape(nb_channels):
+      result = list(state_shape)
+      if self.cell.data_format == 'channels_first':
+        result[1] = nb_channels
+      elif self.cell.data_format == 'channels_last':
+        result[3] = nb_channels
+      else:
+        raise KeyError
+      return tuple(result)
 
+    # initialize state if None
+    if self.states[0] is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        self.states = [K.zeros(get_tuple_shape(dim))
+                       for dim in self.cell.state_size]
+      else:
+        self.states = [K.zeros(get_tuple_shape(self.cell.state_size))]
+    elif states is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        for state, dim in zip(self.states, self.cell.state_size):
+          K.set_value(state, np.zeros(get_tuple_shape(dim)))
+      else:
+        K.set_value(self.states[0],
+                    np.zeros(get_tuple_shape(self.cell.state_size)))
+    else:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      if len(states) != len(self.states):
+        raise ValueError('Layer ' + self.name + ' expects ' +
+                         str(len(self.states)) + ' states, ' +
+                         'but it received ' + str(len(states)) +
+                         ' state values. Input received: ' + str(states))
+      for index, (value, state) in enumerate(zip(states, self.states)):
+        if hasattr(self.cell.state_size, '__len__'):
+          dim = self.cell.state_size[index]
+        else:
+          dim = self.cell.state_size
+        if value.shape != get_tuple_shape(dim):
+          raise ValueError('State ' + str(index) +
+                           ' is incompatible with layer ' +
+                           self.name + ': expected shape=' +
+                           str(get_tuple_shape(dim)) +
+                           ', found shape=' + str(value.shape))
+        # TODO(anjalisridhar): consider batch calls to `set_value`.
+        K.set_value(state, value)
 
-@tf_export('keras.layers.ConvLSTM2D')
-class ConvLSTM2D(ConvRecurrent2D):
-  """Convolutional LSTM.
 
-  It is similar to an LSTM layer, but the input transformations
-  and recurrent transformations are both convolutional.
+class ConvLSTM2DCell(Layer):
+  """Cell class for the ConvLSTM2D layer.
 
-  Arguments:
+  # Arguments
       filters: Integer, the dimensionality of the output space
           (i.e. the number of output filters in the convolution).
       kernel_size: An integer or tuple/list of n integers, specifying the
@@ -212,11 +488,6 @@ class ConvLSTM2D(ConvRecurrent2D):
       padding: One of `"valid"` or `"same"` (case-insensitive).
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -231,71 +502,32 @@ class ConvLSTM2D(ConvRecurrent2D):
           for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
+          used for the linear transformation of the recurrent state.
       bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Use in combination with `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et
-            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+          This is recommended in [Jozefowicz et al.]
+          (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
           the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
           the `recurrent_kernel` weights matrix.
       bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
           the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
           the `recurrent_kernel` weights matrix.
       bias_constraint: Constraint function applied to the bias vector.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-
-  Input shape:
-      - if data_format='channels_first'
-          5D tensor with shape:
-          `(samples,time, channels, rows, cols)`
-      - if data_format='channels_last'
-          5D tensor with shape:
-          `(samples,time, rows, cols, channels)`
-
-   Output shape:
-      - if `return_sequences`
-           - if data_format='channels_first'
-              5D tensor with shape:
-              `(samples, time, filters, output_row, output_col)`
-           - if data_format='channels_last'
-              5D tensor with shape:
-              `(samples, time, output_row, output_col, filters)`
-      - else
-          - if data_format ='channels_first'
-              4D tensor with shape:
-              `(samples, filters, output_row, output_col)`
-          - if data_format='channels_last'
-              4D tensor with shape:
-              `(samples, output_row, output_col, filters)`
-          where o_row and o_col depend on the shape of the filter and
-          the padding
-
-  Raises:
-      ValueError: in case of invalid constructor arguments.
-
   """
 
   def __init__(self,
@@ -315,27 +547,20 @@ class ConvLSTM2D(ConvRecurrent2D):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
-               return_sequences=False,
-               go_backwards=False,
-               stateful=False,
                dropout=0.,
                recurrent_dropout=0.,
                **kwargs):
-    super(ConvLSTM2D, self).__init__(
-        filters,
-        kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        return_sequences=return_sequences,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        **kwargs)
+    super(ConvLSTM2DCell, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
+                                                    'dilation_rate')
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
     self.use_bias = use_bias
@@ -348,7 +573,6 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -356,45 +580,29 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
+    self.state_size = (self.filters, self.filters)
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-    if self.stateful:
-      self.reset_states()
-    else:
-      # initial states: 2 all-zero tensor of shape (filters)
-      self.states = [None, None]
 
     if self.data_format == 'channels_first':
-      channel_axis = 2
+      channel_axis = 1
     else:
       channel_axis = -1
     if input_shape[channel_axis] is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = input_shape[channel_axis]
-    state_shape = [None] * 4
-    state_shape[channel_axis] = input_dim
-    state_shape = tuple(state_shape)
-    self.state_spec = [
-        InputSpec(shape=state_shape),
-        InputSpec(shape=state_shape)
-    ]
     kernel_shape = self.kernel_size + (input_dim, self.filters * 4)
     self.kernel_shape = kernel_shape
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
 
-    self.kernel = self.add_weight(
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+    self.kernel = self.add_weight(shape=kernel_shape,
+                                  initializer=self.kernel_initializer,
+                                  name='kernel',
+                                  regularizer=self.kernel_regularizer,
+                                  constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
         shape=recurrent_kernel_shape,
         initializer=self.recurrent_initializer,
@@ -402,25 +610,24 @@ class ConvLSTM2D(ConvRecurrent2D):
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.filters * 4,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      self.bias = self.add_weight(shape=(self.filters * 4,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
       if self.unit_forget_bias:
         bias_value = np.zeros((self.filters * 4,))
-        bias_value[self.filters:self.filters * 2] = 1.
+        bias_value[self.filters: self.filters * 2] = 1.
         K.set_value(self.bias, bias_value)
     else:
       self.bias = None
 
     self.kernel_i = self.kernel[:, :, :, :self.filters]
     self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters]
-    self.kernel_f = self.kernel[:, :, :, self.filters:self.filters * 2]
+    self.kernel_f = self.kernel[:, :, :, self.filters: self.filters * 2]
     self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters:
                                                     self.filters * 2]
-    self.kernel_c = self.kernel[:, :, :, self.filters * 2:self.filters * 3]
+    self.kernel_c = self.kernel[:, :, :, self.filters * 2: self.filters * 3]
     self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2:
                                                     self.filters * 3]
     self.kernel_o = self.kernel[:, :, :, self.filters * 3:]
@@ -428,8 +635,8 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     if self.use_bias:
       self.bias_i = self.bias[:self.filters]
-      self.bias_f = self.bias[self.filters:self.filters * 2]
-      self.bias_c = self.bias[self.filters * 2:self.filters * 3]
+      self.bias_f = self.bias[self.filters: self.filters * 2]
+      self.bias_c = self.bias[self.filters * 2: self.filters * 3]
       self.bias_o = self.bias[self.filters * 3:]
     else:
       self.bias_i = None
@@ -438,166 +645,419 @@ class ConvLSTM2D(ConvRecurrent2D):
       self.bias_o = None
     self.built = True
 
-  def get_initial_state(self, inputs):
-    # (samples, timesteps, rows, cols, filters)
-    initial_state = array_ops.zeros_like(inputs)
-    # (samples, rows, cols, filters)
-    initial_state = math_ops.reduce_sum(initial_state, axis=1)
-    shape = list(self.kernel_shape)
-    shape[-1] = self.filters
-    initial_state = self.input_conv(
-        initial_state, K.zeros(tuple(shape)), padding=self.padding)
-
-    initial_states = [initial_state for _ in range(2)]
-    return initial_states
+  def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          K.ones_like(inputs),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          K.ones_like(states[1]),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
 
-  def reset_states(self):
-    if not self.stateful:
-      raise RuntimeError('Layer must be stateful.')
-    input_shape = self.input_spec[0].shape
+    # dropout matrices for input units
+    dp_mask = self._dropout_mask
+    # dropout matrices for recurrent units
+    rec_dp_mask = self._recurrent_dropout_mask
 
-    if not input_shape[0]:
-      raise ValueError('If a RNN is stateful, a complete '
-                       'input_shape must be provided '
-                       '(including batch size). '
-                       'Got input shape: ' + str(input_shape))
+    h_tm1 = states[0]  # previous memory state
+    c_tm1 = states[1]  # previous carry state
 
-    if self.return_state:
-      output_shape = tuple(self.compute_output_shape(input_shape)[0].as_list())
-    else:
-      output_shape = tuple(self.compute_output_shape(input_shape).as_list())
-    if self.return_sequences:
-      output_shape = (input_shape[0],) + output_shape[2:]
+    if 0 < self.dropout < 1.:
+      inputs_i = inputs * dp_mask[0]
+      inputs_f = inputs * dp_mask[1]
+      inputs_c = inputs * dp_mask[2]
+      inputs_o = inputs * dp_mask[3]
     else:
-      output_shape = (input_shape[0],) + output_shape[1:]
+      inputs_i = inputs
+      inputs_f = inputs
+      inputs_c = inputs
+      inputs_o = inputs
 
-    if hasattr(self, 'states'):
-      K.set_value(self.states[0],
-                  np.zeros(output_shape))
-      K.set_value(self.states[1],
-                  np.zeros(output_shape))
+    if 0 < self.recurrent_dropout < 1.:
+      h_tm1_i = h_tm1 * rec_dp_mask[0]
+      h_tm1_f = h_tm1 * rec_dp_mask[1]
+      h_tm1_c = h_tm1 * rec_dp_mask[2]
+      h_tm1_o = h_tm1 * rec_dp_mask[3]
     else:
-      self.states = [
-          K.zeros(output_shape),
-          K.zeros(output_shape)
-      ]
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
-      ones = array_ops.zeros_like(inputs)
-      ones = math_ops.reduce_sum(ones, axis=1)
-      ones += 1
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(dp_mask)
-    else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-
-    if 0 < self.recurrent_dropout < 1:
-      shape = list(self.kernel_shape)
-      shape[-1] = self.filters
-      ones = array_ops.zeros_like(inputs)
-      ones = math_ops.reduce_sum(ones, axis=1)
-      ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
-      ones += 1.
-
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
-
-      rec_dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(rec_dp_mask)
-    else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-    return constants
+      h_tm1_i = h_tm1
+      h_tm1_f = h_tm1
+      h_tm1_c = h_tm1
+      h_tm1_o = h_tm1
+
+    x_i = self.input_conv(inputs_i, self.kernel_i, self.bias_i,
+                          padding=self.padding)
+    x_f = self.input_conv(inputs_f, self.kernel_f, self.bias_f,
+                          padding=self.padding)
+    x_c = self.input_conv(inputs_c, self.kernel_c, self.bias_c,
+                          padding=self.padding)
+    x_o = self.input_conv(inputs_o, self.kernel_o, self.bias_o,
+                          padding=self.padding)
+    h_i = self.recurrent_conv(h_tm1_i,
+                              self.recurrent_kernel_i)
+    h_f = self.recurrent_conv(h_tm1_f,
+                              self.recurrent_kernel_f)
+    h_c = self.recurrent_conv(h_tm1_c,
+                              self.recurrent_kernel_c)
+    h_o = self.recurrent_conv(h_tm1_o,
+                              self.recurrent_kernel_o)
+
+    i = self.recurrent_activation(x_i + h_i)
+    f = self.recurrent_activation(x_f + h_f)
+    c = f * c_tm1 + i * self.activation(x_c + h_c)
+    o = self.recurrent_activation(x_o + h_o)
+    h = o * self.activation(c)
+
+    if 0 < self.dropout + self.recurrent_dropout:
+      if training is None:
+        h._uses_learning_phase = True
+
+    return h, [h, c]
 
   def input_conv(self, x, w, b=None, padding='valid'):
-    conv_out = K.conv2d(
-        x,
-        w,
-        strides=self.strides,
-        padding=padding,
-        data_format=self.data_format,
-        dilation_rate=self.dilation_rate)
+    conv_out = K.conv2d(x, w, strides=self.strides,
+                        padding=padding,
+                        data_format=self.data_format,
+                        dilation_rate=self.dilation_rate)
     if b is not None:
-      conv_out = K.bias_add(conv_out, b, data_format=self.data_format)
+      conv_out = K.bias_add(conv_out, b,
+                            data_format=self.data_format)
     return conv_out
 
   def recurrent_conv(self, x, w):
-    conv_out = K.conv2d(
-        x, w, strides=(1, 1), padding='same', data_format=self.data_format)
+    conv_out = K.conv2d(x, w, strides=(1, 1),
+                        padding='same',
+                        data_format=self.data_format)
     return conv_out
 
-  def step(self, inputs, states):
-    assert len(states) == 4
-    h_tm1 = states[0]
-    c_tm1 = states[1]
-    dp_mask = states[2]
-    rec_dp_mask = states[3]
-
-    x_i = self.input_conv(
-        inputs * dp_mask[0], self.kernel_i, self.bias_i, padding=self.padding)
-    x_f = self.input_conv(
-        inputs * dp_mask[1], self.kernel_f, self.bias_f, padding=self.padding)
-    x_c = self.input_conv(
-        inputs * dp_mask[2], self.kernel_c, self.bias_c, padding=self.padding)
-    x_o = self.input_conv(
-        inputs * dp_mask[3], self.kernel_o, self.bias_o, padding=self.padding)
-    h_i = self.recurrent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
-    h_f = self.recurrent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
-    h_c = self.recurrent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
-    h_o = self.recurrent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
+  def get_config(self):
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
+    base_config = super(ConvLSTM2DCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
-    i = self.recurrent_activation(x_i + h_i)
-    f = self.recurrent_activation(x_f + h_f)
-    c = f * c_tm1 + i * self.activation(x_c + h_c)
-    o = self.recurrent_activation(x_o + h_o)
-    h = o * self.activation(c)
-    return h, [h, c]
+@tf_export('keras.layers.ConvLSTM2D')
+class ConvLSTM2D(ConvRNN2D):
+  """Convolutional LSTM.
+
+  It is similar to an LSTM layer, but the input transformations
+  and recurrent transformations are both convolutional.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space
+        (i.e. the number output of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+        dimensions of the convolution window.
+    strides: An integer or tuple/list of n integers,
+        specifying the strides of the convolution.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, time, ..., channels)`
+        while `channels_first` corresponds to
+        inputs with shape `(batch, time, channels, ...)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be "channels_last".
+    dilation_rate: An integer or tuple/list of n integers, specifying
+        the dilation rate to use for dilated convolution.
+        Currently, specifying any `dilation_rate` value != 1 is
+        incompatible with specifying any `strides` value != 1.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+        for the recurrent step.
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+        used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+        weights matrix,
+        used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+        If True, add 1 to the bias of the forget gate at initialization.
+        Use in combination with `bias_initializer="zeros"`.
+        This is recommended in [Jozefowicz et al.]
+        (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to
+        the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to.
+    kernel_constraint: Constraint function applied to
+        the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    return_sequences: Boolean. Whether to return the last output
+        in the output sequence, or the full sequence.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the recurrent state.
+
+  Input shape:
+    - if data_format='channels_first'
+        5D tensor with shape:
+        `(samples,time, channels, rows, cols)`
+    - if data_format='channels_last'
+        5D tensor with shape:
+        `(samples,time, rows, cols, channels)`
+
+  Output shape:
+    - if `return_sequences`
+         - if data_format='channels_first'
+            5D tensor with shape:
+            `(samples, time, filters, output_row, output_col)`
+         - if data_format='channels_last'
+            5D tensor with shape:
+            `(samples, time, output_row, output_col, filters)`
+    - else
+        - if data_format ='channels_first'
+            4D tensor with shape:
+            `(samples, filters, output_row, output_col)`
+        - if data_format='channels_last'
+            4D tensor with shape:
+            `(samples, output_row, output_col, filters)`
+        where o_row and o_col depend on the shape of the filter and
+        the padding
+
+  Raises:
+    ValueError: in case of invalid constructor arguments.
+
+  References:
+    - [Convolutional LSTM Network: A Machine Learning Approach for
+    Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
+    The current implementation does not include the feedback loop on the
+    cells output.
+
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    cell = ConvLSTM2DCell(filters=filters,
+                          kernel_size=kernel_size,
+                          strides=strides,
+                          padding=padding,
+                          data_format=data_format,
+                          dilation_rate=dilation_rate,
+                          activation=activation,
+                          recurrent_activation=recurrent_activation,
+                          use_bias=use_bias,
+                          kernel_initializer=kernel_initializer,
+                          recurrent_initializer=recurrent_initializer,
+                          bias_initializer=bias_initializer,
+                          unit_forget_bias=unit_forget_bias,
+                          kernel_regularizer=kernel_regularizer,
+                          recurrent_regularizer=recurrent_regularizer,
+                          bias_regularizer=bias_regularizer,
+                          kernel_constraint=kernel_constraint,
+                          recurrent_constraint=recurrent_constraint,
+                          bias_constraint=bias_constraint,
+                          dropout=dropout,
+                          recurrent_dropout=recurrent_dropout)
+    super(ConvLSTM2D, self).__init__(cell,
+                                     return_sequences=return_sequences,
+                                     go_backwards=go_backwards,
+                                     stateful=stateful,
+                                     **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    return super(ConvLSTM2D, self).call(inputs,
+                                        mask=mask,
+                                        training=training,
+                                        initial_state=initial_state)
+
+  @property
+  def filters(self):
+    return self.cell.filters
+
+  @property
+  def kernel_size(self):
+    return self.cell.kernel_size
+
+  @property
+  def strides(self):
+    return self.cell.strides
+
+  @property
+  def padding(self):
+    return self.cell.padding
+
+  @property
+  def data_format(self):
+    return self.cell.data_format
+
+  @property
+  def dilation_rate(self):
+    return self.cell.dilation_rate
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def recurrent_activation(self):
+    return self.cell.recurrent_activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def unit_forget_bias(self):
+    return self.cell.unit_forget_bias
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
 
   def get_config(self):
-    config = {
-        'activation':
-            activations.serialize(self.activation),
-        'recurrent_activation':
-            activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'recurrent_initializer':
-            initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'unit_forget_bias':
-            self.unit_forget_bias,
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'recurrent_regularizer':
-            regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'recurrent_constraint':
-            constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
-    }
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'activity_regularizer': regularizers.serialize(
+                  self.activity_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
     base_config = super(ConvLSTM2D, self).get_config()
+    del base_config['cell']
     return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
index 60137bdd72..9e768b4e95 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
@@ -64,6 +64,7 @@ class ConvLSTMTest(test.TestCase):
           self.assertEqual(len(states), 2)
           model = keras.models.Model(x, states[0])
           state = model.predict(inputs)
+
           self.assertAllClose(
               keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 6a7da1aef8..a535f18170 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -1,20 +1,52 @@
 path: "tensorflow.keras.layers.ConvLSTM2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvLSTM2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRecurrent2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "data_format"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dilation_rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "filters"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
@@ -35,6 +67,22 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_size"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -67,10 +115,42 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "padding"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_activation"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strides"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -79,10 +159,18 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "unit_forget_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -143,10 +231,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -187,28 +271,12 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_conv"
-    argspec: "args=[\'self\', \'x\', \'w\', \'b\', \'padding\'], varargs=None, keywords=None, defaults=[\'None\', \'valid\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "recurrent_conv"
-    argspec: "args=[\'self\', \'x\', \'w\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
-- 
GitLab


From 9a22614a35682436c854668cfcbe077279476989 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 11:17:58 -0700
Subject: [PATCH 0307/1262] Internal change

PiperOrigin-RevId: 191769724
---
 .../ci_build/windows/cpu/pip/build_tf_windows.sh     | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 438c5d52f6..5e9ae497e1 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,6 +42,14 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
+skip_test=0
+
+for ARG in "$@"; do
+  if [[ "$ARG" == --skip_test ]]; then
+    skip_test=1
+  fi
+done
+
 run_configure_for_cpu_build
 
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
@@ -49,6 +57,10 @@ run_configure_for_cpu_build
 BUILD_OPTS="--define=override_eigen_strong_inline=true"
 bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
+if [[ "$skip_test" == 1 ]]; then
+  exit 0
+fi
+
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
 create_python_test_dir "${PY_TEST_DIR}"
-- 
GitLab


From 931c8f341b4eda91caeaa9c8a24892b7fa6a9cb2 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer <shoyer@google.com>
Date: Thu, 5 Apr 2018 11:18:49 -0700
Subject: [PATCH 0308/1262] Expose odeint_fixed in tf.contrib.integrate

PiperOrigin-RevId: 191769890
---
 tensorflow/contrib/integrate/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/integrate/__init__.py b/tensorflow/contrib/integrate/__init__.py
index 68bf511099..694f0c14bd 100644
--- a/tensorflow/contrib/integrate/__init__.py
+++ b/tensorflow/contrib/integrate/__init__.py
@@ -18,6 +18,7 @@
 See the @{$python/contrib.integrate} guide.
 
 @@odeint
+@@odeint_fixed
 """
 
 from __future__ import absolute_import
-- 
GitLab


From b691c039c978a34ac4baa47e0e20b9c9d46aa6f7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 11:30:35 -0700
Subject: [PATCH 0309/1262] Automated g4 rollback of changelist 191761109

PiperOrigin-RevId: 191771969
---
 .../core/grappler/optimizers/function_optimizer.cc    | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 1dd75db30f..f1da469a6c 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -62,6 +62,12 @@ class FunctionInliningContext {
       if (func.attr().count("_noinline") != 0) {
         continue;
       }
+      // Don't touch anything marked XLA to prevent XLA failures further down
+      // the road.
+      if (func.attr().count("_XlaCompile") > 0 &&
+          func.attr().at("_XlaCompile").b()) {
+        continue;
+      }
       // Can't create IdentityN nodes with no input or output: skip these
       // functions for now.
       if (func.signature().input_arg_size() == 0 ||
@@ -200,11 +206,6 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
       TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
                                         ctx, optimized_graph));
     } else {
-      // Annotate the node with the function attributes.
-      for (const auto& attr : func.attr()) {
-        func_body_node.mutable_attr()->insert(attr);
-      }
-
       // Move the node to the main graph
       optimized_graph->add_node()->Swap(&func_body_node);
     }
-- 
GitLab


From ccad14e8281b244edffb09dc757b1997497dc27c Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Thu, 5 Apr 2018 11:36:43 -0700
Subject: [PATCH 0310/1262] Fix final eval bottleneck creation to work in cases
 where it isn't cached already.

Fixes #17423

PiperOrigin-RevId: 191773001
---
 tensorflow/examples/image_retraining/retrain.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 99a71206ac..fcc191250f 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -870,15 +870,16 @@ def run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
     resized_image_tensor: The input node of the recognition graph.
     bottleneck_tensor: The bottleneck output layer of the CNN graph.
   """
-  (sess, bottleneck_input, ground_truth_input, evaluation_step,
-   prediction) = build_eval_session(model_info, class_count)
-
   test_bottlenecks, test_ground_truth, test_filenames = (
       get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
                                     'testing', FLAGS.bottleneck_dir,
                                     FLAGS.image_dir, jpeg_data_tensor,
                                     decoded_image_tensor, resized_image_tensor,
                                     bottleneck_tensor, FLAGS.architecture))
+
+  (sess, bottleneck_input, ground_truth_input, evaluation_step,
+   prediction) = build_eval_session(model_info, class_count)
+
   test_accuracy, predictions = sess.run(
       [evaluation_step, prediction],
       feed_dict={
-- 
GitLab


From feb8d7b53953826e0d1b4bc68726392ac0ab310b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 12:18:34 -0700
Subject: [PATCH 0311/1262] Fix regression caused by cl/191020868: Re-use
 materialized shapes for other broadcast gradient shape nodes.

PiperOrigin-RevId: 191779263
---
 .../grappler/optimizers/constant_folding.cc   | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index d941a0b3f9..2f1b9e41d7 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -552,7 +552,6 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
 
   const DataType type = node.attr().at("T").type();
   NodeDef* out[2];
-  bool created_const = false;
   for (int j = 0; j < 2; ++j) {
     int reduction_indices = reduce_dims[j].size();
     Tensor value(type, TensorShape({reduction_indices}));
@@ -576,20 +575,17 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
           AddControlDependency(node.name(), graph_, node_map_.get());
       *out[j]->add_input() = ctrl_dep;
       node_map_->AddOutput(NodeName(ctrl_dep), const_name);
-      created_const = true;
     }
   }
 
-  if (created_const) {
-    const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
-    for (NodeDef* output : outputs) {
-      for (int k = 0; k < output->input_size(); ++k) {
-        int port;
-        string node_name = ParseNodeName(output->input(k), &port);
-        if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
-          *output->mutable_input(k) = out[port]->name();
-          node_map_->UpdateInput(output->name(), node_name, out[port]->name());
-        }
+  const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
+  for (NodeDef* output : outputs) {
+    for (int k = 0; k < output->input_size(); ++k) {
+      int port;
+      string node_name = ParseNodeName(output->input(k), &port);
+      if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
+        *output->mutable_input(k) = out[port]->name();
+        node_map_->UpdateInput(output->name(), node_name, out[port]->name());
       }
     }
   }
-- 
GitLab


From 3d8a0538c1d75142c381a5c169ef0696cb95b4ec Mon Sep 17 00:00:00 2001
From: Alan Yee <alyee@ucsd.edu>
Date: Thu, 5 Apr 2018 12:32:03 -0700
Subject: [PATCH 0312/1262] Update metrics_ops.py (#18155)

* Update metrics_ops.py

Add deprecation notes

* Update metrics_ops.py

Fix styling for linter
---
 .../contrib/metrics/python/ops/metric_ops.py  | 37 +++++++++++--------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 81f05e7ce5..088319a557 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -62,7 +62,8 @@ def _safe_div(numerator, denominator, name):
       0,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
                              labels,
                              weights=None,
@@ -106,7 +107,8 @@ def streaming_true_positives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_negatives(predictions,
                              labels,
                              weights=None,
@@ -150,7 +152,8 @@ def streaming_true_negatives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_positives(predictions,
                               labels,
                               weights=None,
@@ -194,7 +197,8 @@ def streaming_false_positives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_negatives(predictions,
                               labels,
                               weights=None,
@@ -237,7 +241,7 @@ def streaming_false_negatives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.mean')
 def streaming_mean(values,
                    weights=None,
                    metrics_collections=None,
@@ -286,7 +290,7 @@ def streaming_mean(values,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.mean_tensor')
 def streaming_mean_tensor(values,
                           weights=None,
                           metrics_collections=None,
@@ -340,9 +344,8 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(None,
-            'Please switch to tf.metrics.accuracy. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -399,7 +402,8 @@ def streaming_accuracy(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.precision. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_precision(predictions,
                         labels,
                         weights=None,
@@ -455,7 +459,8 @@ def streaming_precision(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_recall(predictions,
                      labels,
                      weights=None,
@@ -975,8 +980,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
+                  'the labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1797,9 +1802,9 @@ def streaming_sensitivity_at_specificity(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.precision_at_thresholds. Note that '
+            'the order of the labels and predictions arguments are switched.')
 def streaming_precision_at_thresholds(predictions,
                                       labels,
                                       thresholds,
-- 
GitLab


From 4ef9be236bdbf4cc50c71514503c0aa0fd41f72e Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Thu, 5 Apr 2018 12:57:49 -0700
Subject: [PATCH 0313/1262] Save the original from_proto method before calling
 it to avoid infinite loop.

PiperOrigin-RevId: 191784430
---
 tensorflow/python/training/distribute.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index c44627eadb..78bc024c0d 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -1226,13 +1226,16 @@ _default_tower_mode = _DefaultTowerThreadMode()
 # So here we catch any attempts to deserialize variables
 # when using distribution strategies.
 # pylint: disable=protected-access
+_original_from_proto = resource_variable_ops._from_proto_fn
+
+
 def _from_proto_fn(v, import_scope=None):
   if has_distribution_strategy():
     raise NotImplementedError(
         "Deserialization of variables is not yet supported when using"
         "distributed strategies.")
   else:
-    resource_variable_ops._from_proto_fn(v, import_scope=import_scope)
+    return _original_from_proto(v, import_scope=import_scope)
 
 resource_variable_ops._from_proto_fn = _from_proto_fn
 # pylint: enable=protected-access
-- 
GitLab


From 6cf501a34afc9401a92624ef3ad3a4c8b8e1b43b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 13:00:06 -0700
Subject: [PATCH 0314/1262] Automated g4 rollback of changelist 191753026

PiperOrigin-RevId: 191784709
---
 tensorflow/python/estimator/estimator_test.py |  2 +-
 .../estimator/replicate_model_fn_test.py      |  9 ++---
 .../training/basic_session_run_hooks.py       |  5 ---
 .../training/basic_session_run_hooks_test.py  | 37 +++----------------
 4 files changed, 10 insertions(+), 43 deletions(-)

diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 498f5294a4..f4255091bf 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -680,7 +680,7 @@ class EstimatorTrainTest(test.TestCase):
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
     self.assertAllEqual(
-        ['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
diff --git a/tensorflow/python/estimator/replicate_model_fn_test.py b/tensorflow/python/estimator/replicate_model_fn_test.py
index 00035ef1fe..ad1f9c02b9 100644
--- a/tensorflow/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/python/estimator/replicate_model_fn_test.py
@@ -27,7 +27,6 @@ import six
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import replicate_model_fn
-from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator.canned import dnn
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.estimator.canned import prediction_keys
@@ -594,8 +593,7 @@ class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
         loss=loss,
         eval_metric_ops=metrics,
         predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(
-            loss, global_step=training.get_global_step()))
+        train_op=optimizer.minimize(loss))
 
   @property
   def params(self):
@@ -614,9 +612,8 @@ class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
       estimator = estimator_lib.Estimator(
           model_fn=self.model_fn,
           model_dir=tempfile.mkdtemp(),
-          params=self.params,
-          config=run_config.RunConfig(save_checkpoints_steps=1))
-      estimator.train(train_input_fn, steps=2)
+          params=self.params)
+      estimator.train(train_input_fn, steps=1)
 
       self.assertEqual(7.0, estimator.get_variable_value('c'))
 
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 77d4f15d52..aae757b99a 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -429,11 +429,6 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     for l in self._listeners:
       l.begin()
 
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-    self._save(session, global_step)
-    self._timer.update_last_triggered_step(global_step)
-
   def before_run(self, run_context):  # pylint: disable=unused-argument
     if self._timer.last_triggered_step() is None:
       # We do write graph and saver_def at the first call of before_run.
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 4bf4a599b4..2547661e52 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -466,8 +466,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 3,
-        'after_save': 3,
+        'before_save': 2,
+        'after_save': 2,
         'end': 1
     }, listener_counts)
 
@@ -490,8 +490,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 3,
-        'after_save': 3,
+        'before_save': 2,
+        'after_save': 2,
         'end': 1
     }, listener_counts)
 
@@ -523,8 +523,8 @@ class CheckpointSaverHookTest(test.TestCase):
     self.assertEqual(2, global_step_val)
     self.assertEqual({
         'begin': 1,
-        'before_save': 3,
-        'after_save': 3,
+        'before_save': 2,
+        'after_save': 2,
         'end': 1
     }, listener1_counts)
     self.assertEqual(listener1_counts, listener2_counts)
@@ -718,31 +718,6 @@ class CheckpointSaverHookTest(test.TestCase):
 
     fake_summary_writer.FakeSummaryWriter.uninstall()
 
-  def test_save_checkpoint_before_first_train_step(self):
-    with self.graph.as_default():
-      hook = basic_session_run_hooks.CheckpointSaverHook(
-          self.model_dir, save_steps=2, scaffold=self.scaffold)
-      hook.begin()
-      self.scaffold.finalize()
-      with session_lib.Session() as sess:
-        mon_sess = monitored_session._HookedSession(sess, [hook])
-        sess.run(self.scaffold.init_op)
-        hook.after_create_session(sess, None)
-        # Verifies that checkpoint is saved at step 0.
-        self.assertEqual(0,
-                         checkpoint_utils.load_variable(self.model_dir,
-                                                        self.global_step.name))
-        # Verifies that no checkpoint is saved after one training step.
-        mon_sess.run(self.train_op)
-        self.assertEqual(0,
-                         checkpoint_utils.load_variable(self.model_dir,
-                                                        self.global_step.name))
-        # Verifies that checkpoint is saved after save_steps.
-        mon_sess.run(self.train_op)
-        self.assertEqual(2,
-                         checkpoint_utils.load_variable(self.model_dir,
-                                                        self.global_step.name))
-
 
 class ResourceCheckpointSaverHookTest(test.TestCase):
 
-- 
GitLab


From c1990c07018e56dfb40362f60e5c5698d425ff2f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 13:12:26 -0700
Subject: [PATCH 0315/1262] [XLA] Remove a dead function and a stale todo.

PiperOrigin-RevId: 191786563
---
 tensorflow/compiler/xla/service/service.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index e09d58bbe7..9fa72c1b8c 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -300,8 +300,6 @@ class Service : public ServiceInterface {
   Service(const ServiceOptions& options,
           std::unique_ptr<Backend> execute_backend);
 
-  static StatusOr<std::unique_ptr<Backend>> CreateComputeConstantBackend();
-
   // Resolves the given argument handles in the allocation tracker and returns
   // the corresponding allocations for every replica. The function also verifies
   // that each allocation matches the execution platform and device ordinal of
@@ -437,8 +435,6 @@ class Service : public ServiceInterface {
   CompilationCache compilation_cache_;
 
   // Backend to compile and execute computations on.
-  //
-  // TODO(b/28616830): Support multiple backends for execution.
   std::unique_ptr<Backend> execute_backend_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Service);
-- 
GitLab


From f3c677e84c6a7adf136397baad6ab93fdcc97ab4 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Thu, 5 Apr 2018 13:23:08 -0700
Subject: [PATCH 0316/1262] Enable branch prediction in TensorFlow

PiperOrigin-RevId: 191788253
---
 tensorflow/core/platform/macros.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 6119edfd5a..1b1faed703 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -67,11 +67,18 @@ limitations under the License.
 #define TF_EXPORT __attribute__((visibility("default")))
 #endif  // COMPILER_MSVC
 
-// GCC can be told that a certain branch is not likely to be taken (for
-// instance, a CHECK failure), and use that information in static analysis.
-// Giving it this information can help it optimize for the common case in
-// the absence of better information (ie. -fprofile-arcs).
-#if defined(COMPILER_GCC3)
+#ifdef __has_builtin
+#define TF_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TF_HAS_BUILTIN(x) 0
+#endif
+
+// Compilers can be told that a certain branch is not likely to be taken
+// (for instance, a CHECK failure), and use that information in static
+// analysis. Giving it this information can help it optimize for the
+// common case in the absence of better information (ie.
+// -fprofile-arcs).
+#if TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3)
 #define TF_PREDICT_FALSE(x) (__builtin_expect(x, 0))
 #define TF_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
 #else
-- 
GitLab


From 9e5657bc051c3bc8febc189c08bd2772cfedadc4 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 5 Apr 2018 13:30:55 -0700
Subject: [PATCH 0317/1262] Fix INT8 conversion bailing in case of unsupported
 TRT feature

---
 .../contrib/tensorrt/convert/convert_graph.cc | 10 ++-
 .../contrib/tensorrt/convert/convert_nodes.cc | 64 ++++++++++++++-----
 2 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index ff8cc6374d..b412b296e0 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -405,7 +405,13 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                          max_mem_per_engine, static_graph_properties,
                          &output_edge_map, precision_mode);
     if (precision_mode == INT8MODE) {
-      TF_RETURN_IF_ERROR(GetCalibNode(&p));
+      tensorflow::Status status = GetCalibNode(&p);
+      if (status != tensorflow::Status::OK()) {
+        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
+      }
     } else {
       tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
       if (status != tensorflow::Status::OK()) {
@@ -414,8 +420,8 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                      << "\" SKIPPING......( " << subgraph_node_names.size()
                      << " nodes)";
       }
-      count++;
     }
+    count++;
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index e920a797fe..ee1273ddff 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -2262,6 +2262,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
   auto ws = new tensorflow::tensorrt::TRTWeightStore();
   TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
   Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (const std::pair<int, int>& input : s.input_inds) {
@@ -2270,20 +2271,41 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     int output_idx = input.second;
     tensorflow::Node* node = s.graph.FindNodeId(node_id);
     auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
-    // TODO(jie): alternative :)
-    if (!s.graph_properties.HasOutputProperties(node_name))
+    // input_names should use the node name in the graph
+    // here it should be the input tensor name -> matching the binding
+    // insert original node name without port
+    auto tensor_name = node_name;
+    if (output_idx != 0) {
+      tensor_name = StrCat(tensor_name, ":", output_idx);
+    }
+
+    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
+            << " idx: " << output_idx;
+
+    auto shape_inference_node_name = node_name;
+    auto shape_inference_output_idx = output_idx;
+    // rewire the shape inference to original node in the graph
+    if (s.output_edge_map->count(tensor_name)) {
+      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
+      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
+    }
+    if (shape_inference_output_idx < 0) continue;
+    VLOG(2) << "shapeinference name: " << shape_inference_node_name
+            << " idx: " << shape_inference_output_idx;
+
+    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
       return tensorflow::errors::Internal("failed to find input node: " +
-                                          node_name);
+                                          shape_inference_node_name);
 
-    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
+    auto op_info_vec =
+        s.graph_properties.GetOutputProperties(shape_inference_node_name);
+    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
       return tensorflow::errors::Internal(
-          "accessing output index of: ", output_idx, ", at node: ", node_name,
-          "with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(output_idx);
+          "accessing output index of: ", shape_inference_output_idx,
+          ", at node: ", shape_inference_node_name,
+          " with output entry from shape_map: ", op_info_vec.size());
 
+    auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
@@ -2294,16 +2316,23 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
                    << "' failed";
       return type_status;
     }
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
     VLOG(2) << "accessing output index of: " << output_idx
             << ", at node: " << node_name
             << "with output entry from shape_map: " << op_info_vec.size();
-
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
+    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
+    //            update the code once TRT 4.0 comes out.
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
+
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
@@ -2312,8 +2341,11 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 
     // TODO(ben,jie): proper way to restore input tensor name?
     auto input_tensor_name = node_name;
-    if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx);
+    if (output_idx != 0) {
+      input_tensor_name = StrCat(node_name, ":", output_idx);
+    }
 
+    input_names.push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
 
@@ -2377,11 +2409,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     tensor->setType(trt_dtype);
   }
 
-  VLOG(2) << "finished output";
+  VLOG(2) << "Finished processing outputs";
 
   // Build the engine
   op_res->builder_->setMaxBatchSize(s.max_batch_size);
   op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
+  VLOG(0) << "Max batch size= " << s.max_batch_size
+          << " max workspace size= " << s.max_workspace_size_bytes;
 
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
@@ -2475,7 +2509,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input!!!!!";
+    VLOG(2) << "parsing input. Node id= " << input.first ;
     int node_id = input.first;
     int output_idx = input.second;
     tensorflow::Node* node = s.graph.FindNodeId(node_id);
-- 
GitLab


From 310249066320f1ddc7fe544b4c351aaf89ce3c9c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 13:55:45 -0700
Subject: [PATCH 0318/1262] Changes loss_reduction default to
 SUM_OVER_BATCH_SIZE for multi_class_head and binary_classification_head.

PiperOrigin-RevId: 191793392
---
 tensorflow/contrib/estimator/python/estimator/head.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 85ef3291ba..ae2fd8b490 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -41,11 +41,10 @@ from tensorflow.python.training import training_util
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
 
-# TODO(b/65403806): Switch loss_reduction default to SUM_OVER_BATCH_SIZE.
 def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM,
+                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
@@ -86,7 +85,8 @@ def multi_class_head(n_classes,
       have any value in `label_vocabulary`. Note that errors will be raised if
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -111,7 +111,7 @@ def binary_classification_head(
     weight_column=None,
     thresholds=None,
     label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     loss_fn=None,
     name=None):
   """Creates a `_Head` for single label binary classification.
@@ -155,7 +155,8 @@ def binary_classification_head(
       `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-- 
GitLab


From f7d00f3d67c47ffc3656c4f2868032b72cd2122b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 14:05:03 -0700
Subject: [PATCH 0319/1262] quantized LSTM support improvements

PiperOrigin-RevId: 191794956
---
 .../contrib/lite/toco/export_tensorflow.cc    |  4 +++
 .../make_initial_dequantize_operator.cc       | 14 +++++++---
 .../contrib/lite/toco/model_cmdline_flags.cc  |  7 ++---
 tensorflow/contrib/lite/toco/toco_tooling.cc  | 26 ++++++++++++-------
 tensorflow/contrib/lite/toco/tooling_util.cc  | 20 ++++++++++----
 tensorflow/contrib/lite/toco/tooling_util.h   |  2 +-
 6 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 5d51431005..4a77196aab 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -37,6 +37,7 @@ limitations under the License.
 
 using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT16;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
 using tensorflow::DT_UINT8;
@@ -1868,6 +1869,9 @@ void AddPlaceholder(const string& name, ArrayDataType type,
     case ArrayDataType::kInt64:
       (*placeholder->mutable_attr())["dtype"].set_type(DT_INT64);
       break;
+    case ArrayDataType::kInt16:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT16);
+      break;
     default:
       LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
index 935da9f966..183b3d3f2e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -78,15 +78,21 @@ bool AddDequantizeOperatorToInput(const string& input_name, const Operator* op,
   image_input_op->outputs = {dequantized_input_name};
   model->operators.emplace(model->operators.begin(), image_input_op);
 
-  CHECK(input_array.final_data_type == ArrayDataType::kUint8);
-  input_array.data_type = ArrayDataType::kUint8;
   dequantized_input_array.data_type = ArrayDataType::kFloat;
   const auto& input_minmax = input_array.GetMinMax();
   auto& dequantized_input_minmax = dequantized_input_array.GetOrCreateMinMax();
   dequantized_input_minmax = input_minmax;
   auto& input_qparams = input_array.GetOrCreateQuantizationParams();
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
-                                                         &input_qparams);
+  input_array.data_type = input_array.final_data_type;
+  if (input_array.data_type == ArrayDataType::kUint8) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
+                                                           &input_qparams);
+  } else if (input_array.data_type == ArrayDataType::kInt16) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(input_minmax,
+                                                           &input_qparams);
+  } else {
+    LOG(FATAL) << "unhandled data type";
+  }
 
   transformation->AddMessageF(
       "Created %s"
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 245eb52444..0fa6e8598f 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -402,9 +402,10 @@ void ReadModelFlagsFromCommandLineFlags(
 
   if (parsed_model_flags.arrays_extra_info_file.specified()) {
     string arrays_extra_info_file_contents;
-    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
-                            &arrays_extra_info_file_contents,
-                            port::file::Defaults());
+    CHECK(port::file::GetContents(
+              parsed_model_flags.arrays_extra_info_file.value(),
+              &arrays_extra_info_file_contents, port::file::Defaults())
+              .ok());
     ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
                                       model_flags->mutable_arrays_extra_info());
   }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 76e9a27aef..96c5ebd64f 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -130,20 +130,26 @@ bool SupportsPreallocatedWorkspace(FileFormat format) {
 }
 
 bool IsRealValued(toco::ArrayDataType type) {
+  // TODO(benoitjacob) - this is hardcoding that uint8 and int16 are only used
+  // for quantized real-number values, and no other integer type is ever used
+  // for that. This is dirty, should be resolved as part of a more general push
+  // to more explicitly distinguish between true-integers and
+  // integers used as quantized values representing real numbers.
   return static_cast<bool>(type == toco::ArrayDataType::kFloat ||
-                           type == toco::ArrayDataType::kUint8);
+                           type == toco::ArrayDataType::kUint8 ||
+                           type == toco::ArrayDataType::kInt16);
 }
 
 void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   ArrayDataType type;
-  if (toco_flags.has_inference_input_type()) {
+  if (!SupportsQuantization(output_format)) {
+    // Data type is implicitly float for non-quantized formats
+    type = ArrayDataType::kFloat;
+  } else if (toco_flags.has_inference_input_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_input_type());
   } else if (toco_flags.has_inference_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_type());
-  } else if (!SupportsQuantization(output_format)) {
-    // Data type is implicitly float for non-quantized formats
-    type = ArrayDataType::kFloat;
   } else {
     // Nothing to do. Data types stay as-is.
     return;
@@ -198,11 +204,6 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 }
 
 void Transform(const TocoFlags& toco_flags, Model* model) {
-  // Clean up after import.
-  SetFinalDataTypeOnInputs(toco_flags, model);
-  UseArraysExtraInfo(model);
-  FinishBuildingRNNStates(model);
-
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
@@ -215,6 +216,11 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         << "Quantized inference is not allowed with float inputs.";
   }
 
+  // Clean up after import.
+  SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model, quantize_output);
+  FinishBuildingRNNStates(model);
+
   // Remove unused ops before performing any other optimizations. This is to
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 56fa8f4b69..61d08fa13f 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1378,12 +1378,22 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     const float mean_value = input_array_proto.mean_value();
     const float std_value = input_array_proto.std_value();
     MinMax input_minmax;
-    input_minmax.min = (0.f - mean_value) / std_value;
-    input_minmax.max = (255.f - mean_value) / std_value;
+    float qmin = 0, qmax = 255;
+    if (input_array.data_type == ArrayDataType::kInt16) {
+      qmin = -32768;
+      qmax = 32767;
+    }
+    input_minmax.min = (qmin - mean_value) / std_value;
+    input_minmax.max = (qmax - mean_value) / std_value;
     if (input_array.minmax) {
       if (input_array_proto.has_mean_value() ||
           input_array_proto.has_std_value()) {
-        CHECK(input_minmax == *input_array.minmax)
+        const double width = input_minmax.max - input_minmax.min;
+        const double kMinMaxAllowedDiff = 1e-6 * width;
+        CHECK(std::abs(input_minmax.min - input_array.minmax->min) <
+                  kMinMaxAllowedDiff &&
+              std::abs(input_minmax.max - input_array.minmax->max) <
+                  kMinMaxAllowedDiff)
             << input_minmax.min << ", " << input_minmax.max
             << " != " << input_array.minmax->min << ", "
             << input_array.minmax->max;
@@ -2000,7 +2010,7 @@ void FinishBuildingRNNStates(Model* model) {
   }
 }
 
-void UseArraysExtraInfo(Model* model) {
+void UseArraysExtraInfo(Model* model, bool quantize_output) {
   for (const auto& entry : model->flags.arrays_extra_info().entries()) {
     if (!model->HasArray(entry.name())) {
       continue;
@@ -2012,7 +2022,7 @@ void UseArraysExtraInfo(Model* model) {
       minmax.min = entry.min();
       minmax.max = entry.max();
     }
-    if (entry.has_data_type()) {
+    if (entry.has_data_type() && quantize_output) {
       array.final_data_type =
           ConvertIODataTypeToArrayDataType(entry.data_type());
     }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 259ee7fbd0..dfd81173c3 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -285,7 +285,7 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 // already quantized, then case (a) should hold.
 void FinishBuildingRNNStates(Model* model);
 
-void UseArraysExtraInfo(Model* model);
+void UseArraysExtraInfo(Model* model, bool quantize_output);
 
 }  // namespace toco
 
-- 
GitLab


From 051dd6cf2f805f12f6e4efb5ad91d57de88481fc Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 5 Apr 2018 14:22:54 -0700
Subject: [PATCH 0320/1262] Fix TF_ImportGraphDefResults and TF_Function leaks
 in Python API.

PiperOrigin-RevId: 191797853
---
 tensorflow/python/eager/function.py       |  4 ++--
 tensorflow/python/eager/graph_callable.py |  2 +-
 tensorflow/python/framework/c_api_util.py | 26 +++++++++++++++++++++++
 tensorflow/python/framework/function.py   | 10 +++++----
 tensorflow/python/framework/importer.py   |  5 +++--
 tensorflow/python/framework/ops.py        |  8 ++++---
 6 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 711eddcec1..61859d6be3 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -294,7 +294,7 @@ class _EagerDefinedFunction(object):
     self.signature = function_def.signature
     self.grad_func_name = None
     self.python_grad_func = None
-    self._c_func = fn
+    self._c_func = c_api_util.ScopedTFFunction(fn)
     self._grad_func = None
 
 
@@ -661,7 +661,7 @@ def _defun_internal(name, func, args, kwds):
   if context.executing_eagerly():
     for f in tmp_graph._functions.values():  # pylint: disable=protected-access
       # TODO(ashankar): What about the gradient registry?
-      _register(f._c_func)  # pylint: disable=protected-access
+      _register(f._c_func.func)  # pylint: disable=protected-access
   return GraphModeFunction(
       fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
       func_outputs, output_shapes, variables)
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index ee5d87f083..d40ea982c7 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -325,7 +325,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register(f._c_func)  # pylint: disable=protected-access
+    function._register(f._c_func.func)  # pylint: disable=protected-access
   initializer_function = function.GraphModeFunction(
       initialization_name,
       placeholder_inputs,
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 4356a534b4..7bbe3183df 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -63,6 +63,32 @@ class ScopedTFImportGraphDefOptions(object):
       c_api.TF_DeleteImportGraphDefOptions(self.options)
 
 
+class ScopedTFImportGraphDefResults(object):
+  """Wrapper around TF_ImportGraphDefOptions that handles deletion."""
+
+  def __init__(self, results):
+    self.results = results
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteImportGraphDefResults is not None:
+      c_api.TF_DeleteImportGraphDefResults(self.results)
+
+
+class ScopedTFFunction(object):
+  """Wrapper around TF_Function that handles deletion."""
+
+  def __init__(self, func):
+    self.func = func
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteFunction is not None:
+      c_api.TF_DeleteFunction(self.func)
+
+
 @tf_contextlib.contextmanager
 def tf_buffer(data=None):
   """Context manager that creates and deletes TF_Buffer.
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index c5caf9ebc0..9570f009a5 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -274,7 +274,7 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     if self._c_func:
       with c_api_util.tf_buffer() as buf:
-        c_api.TF_FunctionToFunctionDef(self._c_func, buf)
+        c_api.TF_FunctionToFunctionDef(self._c_func.func, buf)
         fdef = function_pb2.FunctionDef()
         proto_data = c_api.TF_GetBuffer(buf)
         fdef.ParseFromString(compat.as_bytes(proto_data))
@@ -397,7 +397,7 @@ class _DefinedFunction(object):
                       if self._out_names else [])
       description = self._func.__doc__ or None
       # pylint: disable=protected-access
-      self._c_func = c_api.TF_GraphToFunction_wrapper(
+      c_func = c_api.TF_GraphToFunction_wrapper(
           temp_graph._c_graph,
           base_func_name,
           self._func_name is None,  # append_hash_to_fn_name
@@ -407,6 +407,7 @@ class _DefinedFunction(object):
           output_names,
           None,  # opts
           description)
+      self._c_func = c_api_util.ScopedTFFunction(c_func)
       # pylint: enable=protected-access
       self._set_c_attrs(kwargs_attr)
 
@@ -429,7 +430,7 @@ class _DefinedFunction(object):
       serialized = attr_value.SerializeToString()
       # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
       # It might be worth creating a convenient way to re-use the same status.
-      c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
+      c_api.TF_FunctionSetAttrValueProto(self._c_func.func, compat.as_str(name),
                                          serialized)
 
   def _create_hash_str(self, input_arg, output_arg, node_def):
@@ -825,7 +826,8 @@ def _from_definition(fdef, grad_func=None):
   # pylint: disable=protected-access
   if ops._USE_C_API:
     serialized = fdef.SerializeToString()
-    result._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+    c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+    result._c_func = c_api_util.ScopedTFFunction(c_func)
     result._extra_inputs = []
   else:
     result._definition = fdef
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 23f529b988..8beb74d2a0 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -487,6 +487,7 @@ def import_graph_def(graph_def,
         try:
           results = c_api.TF_GraphImportGraphDefWithResults(
               graph._c_graph, serialized, options)  # pylint: disable=protected-access
+          results = c_api_util.ScopedTFImportGraphDefResults(results)
         except errors.InvalidArgumentError as e:
           # Convert to ValueError for backwards compatibility.
           raise ValueError(str(e))
@@ -515,7 +516,7 @@ def import_graph_def(graph_def,
     # they are likely to be due to a typo.
     missing_unused_input_keys = (
         c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
-            results))
+            results.results))
     if missing_unused_input_keys:
       missing_unused_input_keys = [
           compat.as_str(s) for s in missing_unused_input_keys
@@ -527,7 +528,7 @@ def import_graph_def(graph_def,
     if return_elements is None:
       return None
     else:
-      return _GatherReturnElements(return_elements, graph, results)
+      return _GatherReturnElements(return_elements, graph, results.results)
 
   else:
     g = graph
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 2d55f98a1c..84366e20f5 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -3216,9 +3216,11 @@ class Graph(object):
       # as this will be unnecessary.
       if not function._c_func:
         serialized = function.definition.SerializeToString()
-        function._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
-      gradient = function._grad_func._c_func if function._grad_func else None
-      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient)
+        c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+        function._c_func = c_api_util.ScopedTFFunction(c_func)
+      gradient = (function._grad_func._c_func.func if function._grad_func
+                  else None)
+      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
     else:
       # If there is already a function with the same name, raise an error
       # if bodies are different. Else, do nothing. The C API version above
-- 
GitLab


From af0790fdc841092e14dd2ed0c753f088edb660b6 Mon Sep 17 00:00:00 2001
From: Yun Peng <pcloudy@google.com>
Date: Thu, 5 Apr 2018 23:32:05 +0200
Subject: [PATCH 0321/1262] Add win_def_file attribute for
 tensorflow/python:pywrap_tensorflow_internal

This attribute is somehow missing when pushing for internal to github.

This should fix the TensorFlow Bazel postsubmit for github.
---
 tensorflow/python/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 6ec8a1cdab..a8f1318509 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3343,6 +3343,10 @@ tf_py_wrap_cc(
         "util/transform_graph.i",
         "util/util.i",
     ],
+    win_def_file = select({
+        "//tensorflow:windows": ":pywrap_tensorflow_filtered_def_file",
+        "//conditions:default": None,
+    }),
     deps = [
         ":bfloat16_lib",
         ":cost_analyzer_lib",
-- 
GitLab


From b723cea002b3e612879291faaedb13eb702a5562 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Thu, 5 Apr 2018 14:54:36 -0700
Subject: [PATCH 0322/1262] [XLA] Better support for mul reductions in
 MakeFakeArguments()

Mul reductions want a 1 as their init value, not a 0 or a random value.

PiperOrigin-RevId: 191802819
---
 tensorflow/compiler/xla/tests/test_utils.cc | 68 ++++++++++++++-------
 1 file changed, 47 insertions(+), 21 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 821432ef7d..68f75d50cb 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -160,27 +160,38 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
   return std::move(literal);
 }
 
-// Matches binary addition computations.
-bool LooksLikeSum(const HloComputation& computation) {
+enum class ConstantType { kUnknown, kZero, kOne };
+
+// Return the constant type required by this computation, if known.
+ConstantType GetInitValue(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
-  return root->opcode() == HloOpcode::kAdd &&
-         computation.num_parameters() == 2 &&
-         root->operand(0)->opcode() == HloOpcode::kParameter &&
-         root->operand(1)->opcode() == HloOpcode::kParameter &&
-         root->operand(0) != root->operand(1);
+  if (computation.num_parameters() != 2 ||
+      root->operand(0)->opcode() != HloOpcode::kParameter ||
+      root->operand(1)->opcode() != HloOpcode::kParameter ||
+      root->operand(0) == root->operand(1)) {
+    return ConstantType::kUnknown;
+  }
+
+  switch (root->opcode()) {
+    case HloOpcode::kAdd:
+      return ConstantType::kZero;
+    case HloOpcode::kMultiply:
+      return ConstantType::kOne;
+    default:
+      return ConstantType::kUnknown;
+  }
 }
 
-// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
-// which requires an init_value of 0 rather than a random value.
-bool NeedsZeroInitValue(const HloUse& use) {
+// Reduce, ReduceWindow, and SelectAndScatter ops may need a non-random
+// initialization value.
+bool NeedsInitValue(const HloUse& use) {
   const HloInstruction* const instruction = use.instruction;
   const HloOpcode opcode = instruction->opcode();
   const int64 op_num = use.operand_number;
   return (
       ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
-       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
-      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
-       LooksLikeSum(*instruction->scatter())));
+       op_num == 1) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2));
 }
 
 // Generate random values that are constrained to the input_shape minus the
@@ -222,7 +233,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
         auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
         constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
                                 fused_uses.end());
-      } else if (NeedsZeroInitValue(use)) {
+      } else if (NeedsInitValue(use)) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kConvert ||
                  opcode == HloOpcode::kReducePrecision) {
@@ -243,7 +254,8 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
   HloInstruction* needs_index = nullptr;
-  HloInstruction* needs_zero = nullptr;
+  HloInstruction* needs_constant = nullptr;
+  ConstantType constant_type = ConstantType::kUnknown;
   for (HloInstruction* use : constrained_uses) {
     switch (use->opcode()) {
       case HloOpcode::kDynamicSlice:
@@ -258,8 +270,13 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
 
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
+        needs_constant = use;
+        constant_type = GetInitValue(*use->to_apply());
+        break;
+
       case HloOpcode::kSelectAndScatter:
-        needs_zero = use;
+        needs_constant = use;
+        constant_type = GetInitValue(*use->scatter());
         break;
 
       default:
@@ -268,17 +285,26 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
             use->ToString().c_str());
     }
   }
-  if (needs_index != nullptr && needs_zero != nullptr) {
+  if (needs_index != nullptr && needs_constant != nullptr) {
     return Unimplemented(
         "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
-        "zero: %s\n",
-        needs_index->ToString().c_str(), needs_zero->ToString().c_str());
+        "constant: %s\n",
+        needs_index->ToString().c_str(), needs_constant->ToString().c_str());
   }
   if (needs_index != nullptr) {
     return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
                                            needs_index->shape(), engine);
-  } else if (needs_zero != nullptr) {
-    return Literal::CreateFromShape(param.shape());
+  } else if (needs_constant != nullptr) {
+    switch (constant_type) {
+      case ConstantType::kZero:
+        return Literal::Zero(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kOne:
+        return Literal::One(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kUnknown:
+        // We want the identity element for the computation, but we don't really
+        // know what it is - so any value we generate will be just as wrong.
+        return MakeFakeLiteralInternal(param.shape(), engine);
+    }
   } else {
     return MakeFakeLiteralInternal(param.shape(), engine);
   }
-- 
GitLab


From 6630220b6e5f616b7c81bf1bebd281764d932b5a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 15:09:19 -0700
Subject: [PATCH 0323/1262] Disable tests that are currently failing with cuda
 9

PiperOrigin-RevId: 191805453
---
 tensorflow/cc/profiler/BUILD | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index 00799526fc..cf65fe1ab9 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -9,6 +9,9 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
+    tags = [
+        "noguitar",  # b/77649654
+    ],
     deps = [
         ":profiler",
         "//tensorflow/cc:cc_ops",
-- 
GitLab


From 6b5b2782d3b1ff26855df579d2a58f6b54833479 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 15:11:02 -0700
Subject: [PATCH 0324/1262] Make tf.contrib.estimator.add_metrics work with
 warm-starting.

PiperOrigin-RevId: 191805682
---
 tensorflow/contrib/estimator/python/estimator/extenders.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 266ae93305..201699ed77 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -97,7 +97,10 @@ def add_metrics(estimator, metric_fn):
   return estimator_lib.Estimator(
       model_fn=new_model_fn,
       model_dir=estimator.model_dir,
-      config=estimator.config)
+      config=estimator.config,
+      # pylint: disable=protected-access
+      warm_start_from=estimator._warm_start_settings)
+      # pylint: enable=protected-access
 
 
 def clip_gradients_by_norm(optimizer, clip_norm):
-- 
GitLab


From 47fcad59ccd32e38bc133bff25e0645838e3e9df Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Thu, 5 Apr 2018 15:25:00 -0700
Subject: [PATCH 0325/1262] Add Raspberry Pi section and link to github build
 instructions.

PiperOrigin-RevId: 191807862
---
 tensorflow/docs_src/mobile/tflite/devguide.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
index 96392a3c9b..4133bc172a 100644
--- a/tensorflow/docs_src/mobile/tflite/devguide.md
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -190,7 +190,7 @@ graph visualization.
 
 ## 3. Use the TensorFlow Lite model for inference in a mobile app
 
-After completing the prior steps, you should now have a .tflite model file.
+After completing the prior steps, you should now have a `.tflite` model file.
 
 ### Android
 
@@ -222,3 +222,10 @@ trained Tensorflow models to the
 [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
 devices. To use the converter, refer to the
 [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+
+### Raspberry Pi
+
+Compile Tensorflow Lite for a Raspberry Pi by following the
+[RPi build instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/rpi.md)
+This compiles a static library file (`.a`) used to build your app. There are
+plans for Python bindings and a demo app.
-- 
GitLab


From ea887d2d13a686990145b65e11701deae676b28b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 15:25:32 -0700
Subject: [PATCH 0326/1262] Add for and while loops to the list of operators.
 Do not use them yet.

PiperOrigin-RevId: 191807973
---
 tensorflow/contrib/autograph/operators/BUILD  |  17 +-
 .../contrib/autograph/operators/__init__.py   |   5 +
 .../autograph/operators/control_flow.py       | 179 ++++++++++++++++++
 .../autograph/operators/control_flow_test.py  |  82 ++++++++
 4 files changed, 282 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/autograph/operators/control_flow.py
 create mode 100644 tensorflow/contrib/autograph/operators/control_flow_test.py

diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
index 7856c253bd..4c62468575 100644
--- a/tensorflow/contrib/autograph/operators/BUILD
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -2,6 +2,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -18,8 +20,21 @@ py_library(
     name = "operators",
     srcs = [
         "__init__.py",
+        "control_flow.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
-    deps = [],
+    deps = [
+        "//tensorflow/contrib/autograph/utils",
+    ],
+)
+
+py_test(
+    name = "control_flow_test",
+    srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
 )
diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py
index c3f4cab69e..04b4734551 100644
--- a/tensorflow/contrib/autograph/operators/__init__.py
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -22,3 +22,8 @@ closures for the body.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+# TODO(mdan): Add a container for implementation-specific toggles (throughout).
+
+from tensorflow.contrib.autograph.operators.control_flow import for_loop
+from tensorflow.contrib.autograph.operators.control_flow import while_loop
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
new file mode 100644
index 0000000000..5b8cb2d63c
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -0,0 +1,179 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Control flow statements: loops, conditionals, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+
+
+def for_loop(iterated, extra_cond, loop_body, init_state):
+  """Functional form of a for statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations, excluding the iterate. In what follows we
+  refer to state as either a tuple of entities that represent an actual state,
+  or a list of arguments of the corresponding types.
+
+  Args:
+    iterated: The entity being iterated over.
+    extra_cond: Callable with the state as arguments, and boolean return type.
+        An additionnal loop condition.
+    loop_body: Callable with the iterate and the state as arguments, and
+        state as return type. The actual loop body.
+    init_state: Tuple containing the initial state.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  if tensor_util.is_tensor(iterated):
+    return _known_len_for_loop(iterated, extra_cond, loop_body, init_state)
+  elif isinstance(iterated, dataset_ops.Dataset):
+    return _dataset_for_loop(iterated, extra_cond, loop_body, init_state)
+  else:
+    return _py_for_loop(iterated, extra_cond, loop_body, init_state)
+
+
+def _py_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that executes a Python for loop."""
+  state = init_state
+  for iterate in iterated:
+    if not extra_cond(*state):
+      break
+    state = loop_body(iterate, *state)
+
+  # TODO(mdan): Remove this special case.
+  if len(state) == 1:
+    return state[0]
+  return state
+
+
+def _known_len_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over objects that define a length."""
+  n = builtins.dynamic_len(iterated)
+
+  def while_body(iterate_index, *state):
+    iterate = iterated[iterate_index]
+    new_state = loop_body(iterate, *state)
+    return (iterate_index + 1,) + new_state
+
+  def while_cond(iterate_index, *state):
+    return gen_math_ops.logical_and(iterate_index < n, extra_cond(*state))
+
+  results = while_loop(
+      while_cond,
+      while_body,
+      init_state=(0,) + init_state,
+      extra_deps=(iterated,))
+  # Dropping the iteration index because it's not syntactically visible.
+  results = results[1:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def _dataset_for_loop(ds, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over TF Datasets."""
+  # Because Datsets only expose get_next, in the style of Python iterators,
+  # we are forced to unpack the loop as:
+  #
+  # epoch_number, iterate = ds.get_next()
+  # while epoch_number < 2:
+  #   <body>
+  #   epoch_number, iterate = ds.get_next()
+  epoch_numbers = dataset_ops.Dataset.range(2)
+  def tag_with(ds, tag):
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
+  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
+
+  iterator = ds_with_epoch.make_initializable_iterator()
+  with ops.control_dependencies((iterator.initializer,)):
+    epoch_number, iterate = iterator.get_next()
+
+    def while_body(epoch_number, iterate, *state):
+      new_state = loop_body(iterate, *state)
+      epoch_number, iterate = iterator.get_next()
+      return (epoch_number, iterate) + new_state
+
+    def while_cond(epoch_number, iterate, *state):
+      del iterate
+      return gen_math_ops.logical_and(epoch_number < 1, extra_cond(*state))
+
+    results = while_loop(
+        while_cond,
+        while_body,
+        init_state=(epoch_number, iterate) + init_state,
+        extra_deps=())
+  # Dropping the epoch number and iterate because they are not not syntactically
+  # visible.
+  results = results[2:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def while_loop(loop_cond, loop_body, init_state, extra_deps):
+  """Functional form of a while statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations. In what follows we refer to state as either
+  a tuple of entities that represent an actual state, or a list of arguments
+  of the corresponding types.
+
+  Args:
+    loop_cond: Callable with the state as arguments, and boolean return type.
+        The loop condition.
+    loop_body: Callable with the state as arguments, and state as return type.
+        The actual loop body.
+    init_state: Tuple containing the initial state.
+    extra_deps: Tuple containing additional entities on which the loop may
+        depend, such as loop invariants referenced by loop_cond. Used
+        exclusively for dispatch control.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
+  # That could be somethins as simple as a collection of dispatch rules, with
+  # some prioritization.
+  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+    return _tf_while_loop(loop_cond, loop_body, init_state)
+  else:
+    return _py_while_loop(loop_cond, loop_body, init_state)
+
+
+def _tf_while_loop(loop_cond, loop_body, init_state):
+  """Overload of while_loop that stages a TF while_loop."""
+  return control_flow_ops.while_loop(loop_cond, loop_body, init_state)
+
+
+def _py_while_loop(loop_cond, loop_body, init_state):
+  """Overload of while_loop that executes a Python while loop."""
+  state = init_state
+  while loop_cond(*state):
+    state = loop_body(*state)
+  return state
diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py
new file mode 100644
index 0000000000..9112b1627f
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow_test.py
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for control_flow module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph import operators
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ForLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    s = operators.for_loop(
+        constant_op.constant([1, 2, 3, 4]),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+  def test_python(self):
+    s = operators.for_loop(
+        range(5),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    self.assertEqual(10, s)
+
+  def test_dataset(self):
+    to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
+    s = operators.for_loop(
+        dataset_ops.Dataset.range(5).map(to_int32),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+
+class WhileLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    n = constant_op.constant(5)
+    results = operators.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i,),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    with self.test_session() as sess:
+      self.assertEqual((5, 10), sess.run(results))
+
+  def test_python(self):
+    n = 5
+    results = operators.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    self.assertEqual((5, 10), results)
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From cc7bfaa28dc002be2ed6cc8ce4ef678c2efd7983 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 15:32:27 -0700
Subject: [PATCH 0327/1262] [TF:XLA] No need to set return value in the while
 loop's condition.

PiperOrigin-RevId: 191809110
---
 tensorflow/compiler/tf2xla/lib/while_loop.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
index 86c02ac2e6..495d9c6078 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -54,7 +54,6 @@ xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaWhileLoop(
         auto result,
         condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
                            cond_builder.get()));
-    TF_RETURN_IF_ERROR(cond_builder->SetReturnValue(result));
   }
   TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
 
-- 
GitLab


From e7e1e9f63de7fde4e82c2edf0173968443811f2d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 15:37:50 -0700
Subject: [PATCH 0328/1262] Add functions to extract the basic symbols on which
 a composite name relies. This in turn allows to statically obtain a block's
 syntactic closure.

PiperOrigin-RevId: 191809965
---
 .../contrib/autograph/pyct/qual_names.py      | 23 +++++++++++++++++++
 .../contrib/autograph/pyct/qual_names_test.py | 15 ++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/tensorflow/contrib/autograph/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py
index 4d5764a974..583cf7ecd7 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names.py
@@ -112,6 +112,29 @@ class QN(object):
       raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
     return self._parent
 
+  @property
+  def support_set(self):
+    """Returns the set of simple symbols that this QN relies on.
+
+    This would be the smallest set of symbols necessary for the QN to
+    statically resolve (assuming properties and index ranges are verified
+    at runtime).
+
+    Examples:
+      'a.b' has only one support symbol, 'a'
+      'a[i]' has two roots, 'a' and 'i'
+    """
+    # TODO(mdan): This might be the set of Name nodes in the AST. Track those?
+    roots = set()
+    if self.has_attr():
+      roots.update(self.parent.support_set)
+    elif self.has_subscript():
+      roots.update(self.parent.support_set)
+      roots.update(self.qn[1].support_set)
+    else:
+      roots.add(self)
+    return roots
+
   def __hash__(self):
     return hash(self.qn + (self._has_attr, self._has_subscript))
 
diff --git a/tensorflow/contrib/autograph/pyct/qual_names_test.py b/tensorflow/contrib/autograph/pyct/qual_names_test.py
index 103bd25aa3..264afd508c 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names_test.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names_test.py
@@ -154,6 +154,21 @@ class QNTest(test.TestCase):
     a_sub_three = QN(a, subscript=QN(qual_names.NumberLiteral(3)))
     self.assertEqual(a_sub_three.ast().slice.value.n, 3)
 
+  def test_support_set(self):
+    a = QN('a')
+    b = QN('b')
+    c = QN('c')
+    a_sub_b = QN(a, subscript=b)
+    a_dot_b = QN(a, attr='b')
+    a_dot_b_dot_c = QN(a_dot_b, attr='c')
+    a_dot_b_sub_c = QN(a_dot_b, subscript=c)
+
+    self.assertSetEqual(a.support_set, set((a,)))
+    self.assertSetEqual(a_sub_b.support_set, set((a, b)))
+    self.assertSetEqual(a_dot_b.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_dot_c.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_sub_c.support_set, set((a, c)))
+
 
 class QNResolverTest(test.TestCase):
 
-- 
GitLab


From a87544a250d5904ab9e488fd2199d8aaf6014108 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 5 Apr 2018 15:48:27 -0700
Subject: [PATCH 0329/1262] Add link for index file in performance tab.

PiperOrigin-RevId: 191811610
---
 tensorflow/docs_src/performance/leftnav_files | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index d11a7e5d07..1f894c39fe 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,3 +1,4 @@
+index.md
 performance_guide.md
 datasets_performance.md
 performance_models.md
-- 
GitLab


From 2d3cd7815e55b65dc88ea219400d0bb7b63a1e57 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 5 Apr 2018 16:37:36 -0700
Subject: [PATCH 0330/1262] Ignore control edges as inputs

---
 tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index ee1273ddff..567b4af88d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -443,7 +443,9 @@ class Converter {
        * 2) Control dependency inputs contain caret at the beginning and we
        *    remove this and annotate the edge as a control dependency.
        ************************************************************************/
-      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
+      // skip control nodes
+      if (input_name[0] == '^' ) continue;
+      string name =  input_name;
       auto first = name.find_first_of(':');
       if (first != string::npos && first + 2 == name.size() &&
           name[first + 1] == '0')
-- 
GitLab


From 6a5005cc69dec5882ef7e3e07d60ac72fd87f103 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 5 Apr 2018 16:46:12 -0700
Subject: [PATCH 0331/1262] Added an option to inline all functions in
 aggressive mode.

PiperOrigin-RevId: 191819577
---
 tensorflow/compiler/tests/jit_test.py         | 36 +++++++++++++------
 .../grappler/optimizers/function_optimizer.cc | 24 +++++++------
 .../grappler/optimizers/function_optimizer.h  |  5 ++-
 .../optimizers/function_optimizer_test.cc     |  8 ++---
 4 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index f9d87c2d1c..1f7da659e5 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.contrib.compiler import jit
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -38,6 +39,18 @@ from tensorflow.python.platform import test
 jit_scope = jit.experimental_jit_scope
 
 
+# Disable rewrites to make sure we don't end up having to update this test
+# whenever we implement new ones.
+def NoRewriteSessionConfig():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      function_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
 def CompiledKernel(fn, *inputs, **kwargs):
   """Execute 'fn' as a compiled XLA kernel, with 'inputs'."""
   name = kwargs.pop("name", None)
@@ -81,7 +94,7 @@ class JitLaunchTest(test.TestCase):
   # actually ran. However, it is sometimes possible for _XlaLaunch ops to be
   # constant-folded away, so the check is optional.
   def _compare(self, fn, args, require_kernel_launch=True, noinline=None):
-    with session_lib.Session() as sess:
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
       placeholders = []
       feeds = {}
       for arg in args:
@@ -258,7 +271,7 @@ class XlaCompilationTest(test.TestCase):
   def testReshape(self):
     """Tests an operator with compile-time constant and non-constant inputs."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -282,7 +295,7 @@ class XlaCompilationTest(test.TestCase):
   def testIgnoredArguments(self):
     """Tests that JIT computations can ignore formal parameters."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.int32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -306,7 +319,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoops(self):
     """Tests that compilation accepts computations containing loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         c = lambda i, _: math_ops.less(i, 5)
@@ -324,7 +337,7 @@ class XlaCompilationTest(test.TestCase):
   def testCond(self):
     """Tests that compilation handles switch operators."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.float32)
       c = array_ops.placeholder(dtypes.bool)
@@ -365,7 +378,8 @@ class XlaCompilationTest(test.TestCase):
       inp = array_ops.placeholder(dtypes.float32)
       out = Entry(inp)
 
-    with self.test_session(graph=g, use_gpu=True) as sess:
+    with self.test_session(
+        config=NoRewriteSessionConfig(), graph=g, use_gpu=True) as sess:
       run_metadata = config_pb2.RunMetadata()
       val = sess.run(out,
                      feed_dict={inp: [2., 10.]},
@@ -377,7 +391,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoopDeadlock(self):
     """Regression test for bug that caused deadlocks in graphs with loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         y = x + 1.0
@@ -404,10 +418,10 @@ class XlaCompilationTest(test.TestCase):
         y = Forward(x)
         dx, = gradients_impl.gradients(y, [x], 1.0)
 
-      cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-          optimizer_options=config_pb2.OptimizerOptions(
-              opt_level=config_pb2.OptimizerOptions.L1,
-              do_function_inlining=True)))
+      cfg = NoRewriteSessionConfig()
+      cfg.graph_options.optimizer_options.opt_level = (
+          config_pb2.OptimizerOptions.L1)
+      cfg.graph_options.optimizer_options.do_function_inlining = True
       with session_lib.Session(graph=g, config=cfg) as sess:
         run_metadata = config_pb2.RunMetadata()
         dx_val = sess.run(dx,
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index f1da469a6c..343c89a9da 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -36,8 +36,11 @@ namespace {
 
 class FunctionInliningContext {
  public:
-  explicit FunctionInliningContext(const GrapplerItem& item)
-      : library_(&item.graph.library()), functions_(InliningCandidates(item)) {}
+  explicit FunctionInliningContext(const GrapplerItem& item,
+                                   RewriterConfig::Toggle opt_level)
+      : library_(&item.graph.library()),
+        opt_level_(opt_level),
+        functions_(InliningCandidates(item)) {}
 
   const FunctionDefLibrary& Library() const { return *library_; }
 
@@ -59,13 +62,9 @@ class FunctionInliningContext {
     std::unordered_map<string, const FunctionDef*> functions;
     for (const FunctionDef& func : item.graph.library().function()) {
       // Don't inline functions marked as noinline
-      if (func.attr().count("_noinline") != 0) {
-        continue;
-      }
-      // Don't touch anything marked XLA to prevent XLA failures further down
-      // the road.
-      if (func.attr().count("_XlaCompile") > 0 &&
-          func.attr().at("_XlaCompile").b()) {
+      if (func.attr().count("_noinline") != 0 &&
+          func.attr().at("_noinline").b() &&
+          opt_level_ != RewriterConfig::AGGRESSIVE) {
         continue;
       }
       // Can't create IdentityN nodes with no input or output: skip these
@@ -80,6 +79,7 @@ class FunctionInliningContext {
   }
 
   const FunctionDefLibrary* library_;
+  RewriterConfig::Toggle opt_level_;
   std::unordered_map<string, const FunctionDef*> functions_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
@@ -206,6 +206,10 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
       TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
                                         ctx, optimized_graph));
     } else {
+      // Annotate the node with the function attributes.
+      for (const auto& attr : func.attr()) {
+        func_body_node.mutable_attr()->insert(attr);
+      }
       // Move the node to the main graph
       optimized_graph->add_node()->Swap(&func_body_node);
     }
@@ -367,7 +371,7 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  FunctionInliningContext function_inlining_ctx(item);
+  FunctionInliningContext function_inlining_ctx(item, opt_level_);
 
   // Nothing to do here.
   if (!function_inlining_ctx.HasInlinedFunctions()) {
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 41444e4673..b124efe01d 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,7 +26,7 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) {}
+  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
   ~FunctionOptimizer() override {}
 
   string name() const override { return "function_optimizer"; };
@@ -36,6 +36,9 @@ class FunctionOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
+
+ private:
+  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index c804d75756..fe26a56fc2 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -412,7 +412,7 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
       {mul_func, square_func});
 
   GraphDef output;
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
@@ -508,7 +508,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -550,7 +550,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -613,7 +613,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   // The optimizer should succeed but the graphs should be the same.
-- 
GitLab


From ffe24e657bc9dc365f98de17e7118d94d88c3705 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 5 Apr 2018 16:49:06 -0700
Subject: [PATCH 0332/1262] Fix warning in rnn_cell.py (#18164)

This fix fixes the warning in rnn_cell.py caused by l2_normalize with dim:
```
rnn_cell.py:2894: calling l2_normalize (from tensorflow.python.ops.nn_impl) with dim is deprecated and will be removed in a future version.
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 2f6ae9f367..b12e2cd5ed 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2891,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
     output_size = weight.get_shape().as_list()[1]
     g = vs.get_variable(name, [output_size], dtype=weight.dtype)
-    return nn_impl.l2_normalize(weight, dim=0) * g
+    return nn_impl.l2_normalize(weight, axis=0) * g
 
   def _linear(self,
               args,
-- 
GitLab


From d5fc41a8ca69d37696c7c324e19a3a724b70192b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 5 Apr 2018 16:49:29 -0700
Subject: [PATCH 0333/1262] Fix `TypeError: 'dict_keys'` in contrib.distribute
 with python 3 (#18212)

This fix tries to fix the issue raised in 18205 where
```
TypeError: 'dict_keys' object does not support indexing
```
was thrown when using contrib.distribute in python 3.

The issue is that DistributedValues.devices returned `self._index.keys()`
which is a `dict_keys` and is not a list in python 3.

This fix converts the `dict_keys` to list for python 3
to fix the issue.

This fix fixes 18205.
This fix als fixes 18188.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/distribute/python/values.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 9acb6a9db9..87bf059038 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -73,7 +73,7 @@ class DistributedValues(object):
 
   @property
   def devices(self):
-    return self._index.keys()
+    return list(self._index.keys())
 
   def __str__(self):
     return "%s:%s" % (self.__class__.__name__, self._index)
-- 
GitLab


From 73f25fc34c69878c83ee2eeb8f030cb79a76472f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 5 Apr 2018 16:52:07 -0700
Subject: [PATCH 0334/1262] Fix an issue caused by (#18183)

While trying to run on my machine (Ubuntu 16.04 Python 2.7)
```
bazel test -s --config=opt --cache_test_results=no //tensorflow/tools/api/tests:api_compatibility_test
```

The following error was encountered:
```
  ......
  File "/home/ubuntu/.cache/bazel/_bazel_ubuntu/ad1e09741bb4109fbc70ef8216b59ee2/execroot/org_tensorflow/bazel-out/host/bin/tensorflow/tools/api/generator/create_python_api.runfiles/org_tensorflow/tensorflow/tools/api/generator/create_python_api.py", line 125, in get_api_imports
    if not module or 'tensorflow.' not in module.__name__:
  File "/usr/lib/python2.7/dist-packages/py/_apipkg.py", line 171, in __getattribute__
    return getattr(getmod(), name)
  File "/usr/lib/python2.7/dist-packages/py/_error.py", line 43, in __getattr__
    raise AttributeError(name)
AttributeError: __name__
```

The issue is that `<AliasModule 'py.error' for 'py._error.error'>` does not
have a `__name__` attribute (See similiar issue in https://github.com/pytest-dev/py/issues/73).

This fix tries to address the issue by adding an `hasattr()` check so
that AttributeError is not thrown.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/tools/api/generator/create_python_api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 183c4731b8..70f9776b08 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -122,7 +122,8 @@ def get_api_imports():
   # we want to traverse over TensorFlow Python modules.
   for module in sys.modules.values():
     # Only look at tensorflow modules.
-    if not module or 'tensorflow.' not in module.__name__:
+    if (not module or not hasattr(module, "__name__") or
+        'tensorflow.' not in module.__name__):
       continue
     # Do not generate __init__.py files for contrib modules for now.
     if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
-- 
GitLab


From 8b9509c2ae41e2fc1c925ec14903b09a14b84803 Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Fri, 6 Apr 2018 07:54:33 +0800
Subject: [PATCH 0335/1262] Fix minor typos for programer guide documents
 (#17486)

* Fix minor typo fix for proggramer guide doccuments

* revert typo in dataset.md

* revert wrong typo fix

* revert wrong typo fix
---
 tensorflow/docs_src/get_started/custom_estimators.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 941c3e1690..275cda12bc 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -546,7 +546,7 @@ In brief, here's what the three graphs tell you:
 
 * accuracy: The accuracy is recorded by the following two lines:
 
-    * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+    * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation.
     * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
 
 These tensorboard graphs are one of the main reasons it's important to pass a
-- 
GitLab


From 261222e69e4e0ecab044c2fd74531b574a66d812 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Deltheil?=
 <355031+deltheil@users.noreply.github.com>
Date: Fri, 6 Apr 2018 01:56:20 +0200
Subject: [PATCH 0336/1262] contrib/lite: avoid building benchmark_model in
 build_ios_universal_lib.sh (#17796)

It is not needed part of this script. In addition, since f0633ec benchmark_model
now depends on //tensorflow/core library which is not taken into account by the
Makefile from contrib/lite, and thus causes failure:

$ ./build_ios_universal_lib.sh
...
In file included from tensorflow/contrib/lite/tools/benchmark_model.cc:29:
In file included from ./tensorflow/core/platform/env.h:24:
In file included from ./tensorflow/core/lib/core/errors.h:21:
./tensorflow/core/lib/core/status.h:23:10: fatal error: 'tensorflow/core/lib/core/error_codes.pb.h' file not found
#include "tensorflow/core/lib/core/error_codes.pb.h"
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
---
 .../contrib/lite/build_ios_universal_lib.sh       | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index 4a9023ff33..9f398f4a9f 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -19,11 +19,16 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../.."
 
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a
 
 lipo \
 tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
-- 
GitLab


From cfeadf0986ad60b0ae1eb18b3802539803c63b94 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 17:08:50 -0700
Subject: [PATCH 0337/1262] Make concat handler support mixed range input

PiperOrigin-RevId: 191822664
---
 .../contrib/lite/kernels/concatenation.cc     | 22 +++---
 .../lite/kernels/concatenation_test.cc        | 68 +++++++++++++++++++
 .../internal/optimized/optimized_ops.h        | 56 +++++++++++++++
 .../internal/reference/reference_ops.h        | 55 +++++++++++++++
 .../contrib/lite/kernels/internal/tensor.h    | 23 +++++++
 5 files changed, 213 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index a619ada86a..45ea8d0049 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -67,10 +67,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
     TF_LITE_ENSURE_EQ(context, t->dims->size, t0->dims->size);
     TF_LITE_ENSURE_EQ(context, t->type, input_type);
-    if (input_type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, t->params.zero_point, t0->params.zero_point);
-      TF_LITE_ENSURE_EQ(context, t->params.scale, t0->params.scale);
-    }
     for (int d = 0; d < t0->dims->size; ++d) {
       if (d == axis) {
         sum_axis += t->dims->data[axis];
@@ -87,11 +83,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
-  if (input_type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point,
-                      t0->params.zero_point);
-    TF_LITE_ENSURE_EQ(context, output->params.scale, t0->params.scale);
-  }
 
   return context->ResizeTensor(context, output, output_size);
 }
@@ -115,6 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
       GetTensorDims(output))
 
+#define TF_LITE_CONCATENATION_QUANTIZED(type)                                  \
+  VectorOfQuantizedTensors all_inputs(*context, *node->inputs);                \
+  type::Concatenation(                                                         \
+      RemapDim(NumDimensions(output), axis), all_inputs.data(),                \
+      all_inputs.dims(), all_inputs.zero_point(), all_inputs.scale(),          \
+      node->inputs->size, GetTensorData<uint8>(output), GetTensorDims(output), \
+      output->params.zero_point, output->params.scale)
+
   switch (output->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
@@ -125,9 +124,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(reference_ops);
       } else {
-        TF_LITE_CONCATENATION(optimized_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
       }
       break;
     default:
@@ -136,6 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 
+#undef TF_LITE_CONCATENATION_QUANTIZED
 #undef TF_LITE_CONCATENATION
 
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index ba1ffc5f84..467ff6f7e1 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -28,6 +28,7 @@ class BaseConcatenationOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): Also test different activation types, axis, input
   // dimensions.
+  BaseConcatenationOpModel() {}
   BaseConcatenationOpModel(const TensorData& input_template, int axis,
                            int num_inputs) {
     std::vector<std::vector<int>> all_input_shapes;
@@ -60,6 +61,23 @@ class ConcatenationOpModel : public BaseConcatenationOpModel {
 class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
  public:
   using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  QuantizedConcatenationOpModel(const std::vector<TensorData>& input_template,
+                                int axis, int num_inputs,
+                                const TensorData& output_template) {
+    std::vector<std::vector<int>> all_input_shapes;
+    CHECK_EQ(input_template.size(), num_inputs);
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template[i].shape);
+      AddInput(input_template[i]);
+    }
+    output_ = AddOutput({output_template.type, /*shape=*/{},
+                         output_template.min, output_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
   void SetInput(int index, std::initializer_list<float> data) {
     QuantizeAndPopulate<uint8_t>(index, data);
   }
@@ -168,6 +186,56 @@ TEST(ConcatenationOpTest, FourInputsQuantized) {
                               }));
 }
 
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRangeClampingLogic) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -1., 1.});
+
+  m0.SetInput(0, {1.0f, -3.0f, -4.0f, -7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, -3.2f, -4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f,   //
+                      -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,  //
+                  },
+                  4e-3)));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  255, 0, 255, 255, 255, 0, 255, 255,  //
+                                  0, 0, 255, 255, 0, 255, 255, 255,    //
+                              }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 3642da311c..9a274612ad 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2732,6 +2732,62 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the reference implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 3575974ae9..31e190e248 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1566,6 +1566,61 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the optimized implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int64_t outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 62e38e0d4c..4bce2ffaaf 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -126,6 +126,29 @@ class VectorOfTensors {
   std::vector<Dims<4>*> all_dims_ptr_;
 };
 
+// A list of quantized tensors in a format that can be used by kernels like
+// split and concatenation.
+class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfQuantizedTensors(const TfLiteContext& context,
+                           const TfLiteIntArray& tensor_list)
+      : VectorOfTensors<uint8>(context, tensor_list) {
+    for (int i = 0; i < tensor_list.size; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      zero_point_.push_back(t->params.zero_point);
+      scale_.push_back(t->params.scale);
+    }
+  }
+
+  const float* scale() const { return scale_.data(); }
+  const int32* zero_point() const { return zero_point_.data(); }
+
+ private:
+  std::vector<int32> zero_point_;
+  std::vector<float> scale_;
+};
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
-- 
GitLab


From cde06a39592a849a2bc0ba022e858e6366c87cc5 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <b@lamberta.org>
Date: Thu, 5 Apr 2018 17:14:29 -0700
Subject: [PATCH 0338/1262] Docs: Fix formatting in programmers_guide/debugger
 (#18281)

Fix formating for code block in debugger.md
Fixes #17946
---
 .../docs_src/programmers_guide/debugger.md    | 61 +++++++++----------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index d1cd7e7c06..f5a0eb0a20 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -4,29 +4,28 @@
 
 [TOC]
 
-TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It
-lets you view the internal structure and states of running TensorFlow graphs
-during training and inference, which is difficult to debug with general-purpose
-debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
-
-> NOTE: TensorFlow debugger uses a
-> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
-> text user interface. On Mac OS X, the `ncurses` library is required and can
-> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
-> isn't as well supported, so a
-> [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
-> be used with tfdbg by installing `pyreadline` with pip.
-> If you use Anaconda3, you can install it with a command
-> such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
-> Unofficial Windows curses packages can be downloaded
-> [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
-> installed using `pip install <your_version>.whl`, however curses on Windows
-> may not work as reliably as curses on Linux or Mac.
-
-> NOTE: This guide focuses on the command-line interface (CLI) of tfdbg. For
-> guide on how to use the graphical user interface (GUI) of tfdbg, i.e., the
-> **TensorBoard Debugger Plugin**, please visit
-> [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal
+structure and states of running TensorFlow graphs during training and inference,
+which is difficult to debug with general-purpose debuggers such as Python's `pdb`
+due to TensorFlow's computation-graph paradigm.
+
+This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on
+how to use the graphical user interface (GUI) of tfdbg, i.e., the
+**TensorBoard Debugger Plugin**, please visit
+[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+
+Note: The TensorFlow debugger uses a
+[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
+user interface. On Mac OS X, the `ncurses` library is required and can be
+installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
+well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
+interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
+use Anaconda3, you can install it with a command such as
+`"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial
+Windows curses packages can be downloaded
+[here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
+installed using `pip install <your_version>.whl`, however curses on Windows may
+not work as reliably as curses on Linux or Mac.
 
 This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance
 of [`nan`s](https://en.wikipedia.org/wiki/NaN)
@@ -748,16 +747,16 @@ There are three possible workarounds or solutions:
    to which tfdbg dumps the debug data. You can use it to let tfdbg dump the
    debug data on a disk with larger free space. For example:
 
-   ``` python
-   # For LocalCLIDebugWrapperSession
-   sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
-
-   # For LocalCLIDebugHook
-   hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
-   ```
+```python
+# For LocalCLIDebugWrapperSession
+sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
 
+# For LocalCLIDebugHook
+hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
+```
    Make sure that the directory pointed to by dump_root is empty or nonexistent.
-   tfdbg cleans up the dump directories before exiting.
+   `tfdbg` cleans up the dump directories before exiting.
+
 *  Reduce the batch size used during the runs.
 *  Use the filtering options of tfdbg's `run` command to watch only specific
    nodes in the graph. For example:
-- 
GitLab


From a1fbdf8268d8043190fe7fa0b3c5796a455ea529 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 5 Apr 2018 17:17:25 -0700
Subject: [PATCH 0339/1262] Validate the shape of TakeDataset and SkipDataset
 (#18267)

* Validate the shape of TakeDataset and SkipDataset

The `count` inputs of the TakeDataset and SkipDataset require
scalar. That was not validated before though.

This fix validates the shape of count for TakeDataset and SkipDataset.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for invalide `count` of TakeDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Validate shape of count for SkipDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for invalid count shape for SkipDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/sequence_dataset_op_test.py  | 10 ++++++++++
 tensorflow/core/ops/dataset_ops.cc                   | 12 ++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index b044ff1775..b13ad9ba4e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest(
     # Skip nothing
     self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
 
+  def testInvalidSkip(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0)
+
   def _build_take_dataset(self, count):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(count)
@@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest(
     # Take nothing
     self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
 
+  def testInvalidTake(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0)
+
   def _build_repeat_dataset(self, count, take_count=3):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 2852c49e19..b25abbcc67 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -117,7 +117,11 @@ REGISTER_OP("TakeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SkipDataset")
     .Input("input_dataset: variant")
@@ -125,7 +129,11 @@ REGISTER_OP("SkipDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
-- 
GitLab


From 165a87a0f938aef649f7e817193716349e42072a Mon Sep 17 00:00:00 2001
From: Tony Wang <tonywy@google.com>
Date: Thu, 5 Apr 2018 17:24:33 -0700
Subject: [PATCH 0340/1262] Automated g4 rollback of changelist 191605505

PiperOrigin-RevId: 191824447
---
 .../xla/legacy_flags/debug_options_flags.cc   |   7 +
 tensorflow/compiler/xla/service/cpu/BUILD     |  23 ++++
 .../compiler/xla/service/cpu/cpu_runtime.cc   |   8 ++
 .../compiler/xla/service/cpu/cpu_runtime.h    |   4 +
 .../xla/service/cpu/cpu_runtime_test.cc       |  84 ++++++++++--
 .../xla/service/cpu/dot_op_emitter.cc         |  23 ++--
 .../xla/service/cpu/runtime_matmul_mkl.cc     | 128 ++++++++++++++++++
 .../xla/service/cpu/runtime_matmul_mkl.h      |  84 ++++++++++++
 .../xla/service/cpu/simple_orc_jit.cc         |   5 +
 tensorflow/compiler/xla/xla.proto             |   3 +
 10 files changed, 353 insertions(+), 16 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
 create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h

diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index c8ed3e3a2b..f037663e3f 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,6 +40,9 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
+#ifdef INTEL_MKL
+  flags->set_xla_cpu_use_mkl_dnn(true);
+#endif  // INTEL_MKL
 
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
@@ -288,6 +291,10 @@ void AllocateFlags() {
           flag_values->xla_gpu_use_cudnn_batchnorm(),
           "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
           "rather than expanding them to a soup of HLOs."),
+      tensorflow::Flag("xla_cpu_use_mkl_dnn",
+                       bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
+                       flag_values->xla_cpu_use_mkl_dnn(),
+                       "Generate calls to MKL-DNN in the CPU backend."),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 966e2d0fc5..246b802861 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -18,6 +18,10 @@ load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -170,6 +174,7 @@ cc_library(
         ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
@@ -538,6 +543,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_matmul_mkl",
+    srcs = ["runtime_matmul_mkl.cc"],
+    hdrs = ["runtime_matmul_mkl.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:framework_lite",
+        "//third_party/eigen3",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
+    ]),
+)
+
 cc_library(
     name = "runtime_single_threaded_conv2d",
     srcs = [
@@ -584,10 +605,12 @@ cc_library(
 tf_cc_test(
     name = "cpu_runtime_test",
     srcs = ["cpu_runtime_test.cc"],
+    shard_count = 10,
     tags = ["optonly"],
     deps = [
         ":cpu_runtime",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_matmul",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 9a3bd68c80..872b0be1f8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -37,6 +37,14 @@ extern const char* const kEigenMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenMatMulF32";
 extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
+extern const char* const kMKLMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF32";
+extern const char* const kMKLMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF64";
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF32";
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF64";
 extern const char* const kEigenConvF16SymbolName =
     "__xla_cpu_runtime_EigenConvF16";
 extern const char* const kEigenConvF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index e61d6ea28b..e392e231b4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -44,6 +44,10 @@ namespace runtime {
 extern const char* const kEigenMatMulF16SymbolName;
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
+extern const char* const kMKLMatMulF32SymbolName;
+extern const char* const kMKLMatMulF64SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f385829cdf..2ac950e6d9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
@@ -130,25 +131,23 @@ MatMulShape MatMulShapes[] = {
 // * transpose_lhs
 // * transpose_rhs
 // * single_threaded
-using EigenMatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
+using MatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
 
-class EigenMatMulTest
-    : public CpuRuntimeTest,
-      public ::testing::WithParamInterface<EigenMatMulTestParam> {
+class EigenMatMulTest : public CpuRuntimeTest,
+                        public ::testing::WithParamInterface<MatMulTestParam> {
  public:
-  static string Name(
-      const ::testing::TestParamInfo<EigenMatMulTestParam>& info) {
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
     MatMulShape shape = std::get<0>(info.param);
     bool transpose_lhs = std::get<1>(info.param);
     bool transpose_rhs = std::get<2>(info.param);
     bool single_threaded = std::get<3>(info.param);
 
     return tensorflow::strings::Printf(
-        "MatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
         transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
         single_threaded ? "single" : "multi");
   }
-};  // namespace xla
+};
 
 TEST_P(EigenMatMulTest, DoIt) {
   MatMulShape shape = std::get<0>(GetParam());
@@ -169,5 +168,74 @@ INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
                                            ::testing::Bool()),
                         EigenMatMulTest::Name);
 
+#ifdef INTEL_MKL
+class MKLMatMulTest : public CpuRuntimeTest,
+                      public ::testing::WithParamInterface<MatMulTestParam> {
+ public:
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
+    MatMulShape shape = std::get<0>(info.param);
+    bool transpose_lhs = std::get<1>(info.param);
+    bool transpose_rhs = std::get<2>(info.param);
+    bool single_threaded = std::get<3>(info.param);
+
+    return tensorflow::strings::Printf(
+        "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
+        single_threaded ? "single" : "multi");
+  }
+};
+
+std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
+                                                  const Array2D<float>& b,
+                                                  bool transpose_lhs,
+                                                  bool transpose_rhs,
+                                                  bool single_threaded) {
+  CHECK_EQ(a.width(), b.height());
+  int64 m = a.height();
+  int64 n = b.width();
+  int64 k = a.width();
+
+  // The MKL matmul runtime function expects the matrix to be in column major
+  // order and array2d is in row-major order. Create transposes of a and b. The
+  // 'data' buffer in the transposed array is the original array in column major
+  // order.
+  auto a_transpose = MaybeTransposeArray2D(a, !transpose_lhs);
+  auto b_transpose = MaybeTransposeArray2D(b, !transpose_rhs);
+
+  // Since we're going to transpose c before returning it, swap the order of the
+  // dimension sizes to ensure the returned array is properly dimensioned.
+  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  if (single_threaded) {
+    __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+        nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
+        m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    __xla_cpu_runtime_MKLMatMulF32(nullptr, c_transpose->data(),
+                                   a_transpose->data(), b_transpose->data(), m,
+                                   n, k, transpose_lhs, transpose_rhs);
+  }
+  return MaybeTransposeArray2D(*c_transpose, true);
+}
+
+TEST_P(MKLMatMulTest, DoIt) {
+  MatMulShape shape = std::get<0>(GetParam());
+  bool transpose_lhs = std::get<1>(GetParam());
+  bool transpose_rhs = std::get<2>(GetParam());
+  bool single_threaded = std::get<3>(GetParam());
+
+  auto a = MakeLinspaceArray2D(0.0, 1.0, shape.m, shape.k);
+  auto b = MakeLinspaceArray2D(-2.0, 2.0, shape.k, shape.n);
+  auto c =
+      MKLMatrixMultiply(*a, *b, transpose_lhs, transpose_rhs, single_threaded);
+  CheckMatrixMultiply(*a, *b, *c);
+}
+
+INSTANTIATE_TEST_CASE_P(MKLMatMulTestInstantiaion, MKLMatMulTest,
+                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        MKLMatMulTest::Name);
+#endif  // INTEL_MKL
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 8b1e20d79e..29afd8ea5f 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -918,28 +918,35 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded_eigen =
+  bool multi_threaded =
       hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
     case F16:
-      fn_name = multi_threaded_eigen
+      fn_name = multi_threaded
                     ? runtime::kEigenMatMulF16SymbolName
                     : runtime::kEigenSingleThreadedMatMulF16SymbolName;
       float_type = ir_builder_->getHalfTy();
       break;
     case F32:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF32SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF32SymbolName
+                                   : runtime::kEigenMatMulF32SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF32SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF32SymbolName);
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF64SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF64SymbolName
+                                   : runtime::kEigenMatMulF64SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF64SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF64SymbolName);
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
new file mode 100644
index 0000000000..92da5f71c2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+#include "third_party/intel_mkl_ml/include/mkl_service.h"
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/core/platform/types.h"
+
+#define EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+namespace {
+// BLAS GEMM API for 32-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF32(const void* run_options_ptr, float* out, float* lhs, float* rhs,
+               int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For column-major matrices, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_sgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+// BLAS GEMM API for 64-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF64(const void* run_options_ptr, double* out, double* lhs,
+               double* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For a column-major matrix, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_dgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+}  // namespace
+
+void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out,
+                                    float* lhs, float* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+// BLAS GEMM API for 64-bit Matrix Multiplication
+void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out,
+                                    double* lhs, double* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr,
+                                                  float* out, float* lhs,
+                                                  float* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr,
+                                                  double* out, double* lhs,
+                                                  double* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+#endif  // INTEL_MKL
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
new file mode 100644
index 0000000000..831b796efb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+
+#include <iostream>
+#include "tensorflow/core/platform/types.h"
+#ifdef INTEL_MKL
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
+#else
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 4198260a22..b7ce5bbe47 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
@@ -183,6 +184,10 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 5cb18113e5..f9943f71d3 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -189,6 +189,9 @@ message DebugOptions {
   // directory.
   string xla_dump_per_pass_hlo_proto_to = 96;
 
+  // Generate calls to MKL-DNN in the CPU backend.
+  bool xla_cpu_use_mkl_dnn = 97;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
-- 
GitLab


From 0b87efcbae8cde976a302415c6df2189958e7a8b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 17:35:00 -0700
Subject: [PATCH 0341/1262] Add a command line parameter to toco to change the
 way toco rescales input and output tensors.

PiperOrigin-RevId: 191825756
---
 tensorflow/contrib/lite/toco/args.h           |  1 +
 .../graph_transformations/hardcode_min_max.cc | 45 +++++++++++--------
 .../toco/graph_transformations/quantize.cc    |  3 +-
 .../contrib/lite/toco/model_cmdline_flags.cc  |  7 +++
 .../contrib/lite/toco/model_flags.proto       |  6 ++-
 tensorflow/contrib/lite/toco/tooling_util.cc  |  3 +-
 6 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 39e49bc347..7a7059e357 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -202,6 +202,7 @@ struct ParsedModelFlags {
   Arg<toco::IntList> input_shape;
   Arg<toco::StringMapList> rnn_states;
   Arg<toco::StringMapList> model_checks;
+  Arg<bool> change_concat_input_ranges = Arg<bool>(true);
   // Debugging output options.
   // TODO(benoitjacob): these shouldn't be ModelFlags.
   Arg<string> graphviz_first_array;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 23c9e3246b..437e30a918 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -95,30 +95,37 @@ bool HardcodeMinMaxForConcatenation(Model* model, Operator* op) {
   overall_minmax.min = overall_min;
   overall_minmax.max = overall_max;
   bool changed = false;
-  for (const auto& input : op->inputs) {
-    auto& array = model->GetArray(input);
-    if (!array.minmax) {
-      changed = true;
-    } else if (!(overall_minmax == array.GetMinMax())) {
-      changed = true;
-      LOG(WARNING)
-          << "Tweaking the MinMax of array " << input << ", which is "
-          << "an input to " << LogName(*op) << ", because we want all inputs "
-          << "and outputs of a Concatenation operator to have the same MinMax "
-          << "so that it can be implemented as a pure byte-copy, no "
-             "arithmetic.";
+  if (model->flags.change_concat_input_ranges()) {
+    for (const auto& input : op->inputs) {
+      auto& array = model->GetArray(input);
+      if (!array.minmax) {
+        changed = true;
+      } else if (!(overall_minmax == array.GetMinMax())) {
+        changed = true;
+        LOG(WARNING)
+            << "Tweaking the MinMax of array " << input << ", which is "
+            << "an input to " << LogName(*op) << ", because we want all inputs "
+            << "and outputs of a Concatenation operator to have the same "
+            << "MinMax so that it can be implemented as a pure byte-copy, no "
+               "arithmetic.";
+      }
+      array.GetOrCreateMinMax() = overall_minmax;
     }
-    array.GetOrCreateMinMax() = overall_minmax;
   }
   if (!output.minmax) {
     changed = true;
   } else if (!(overall_minmax == output.GetMinMax())) {
-    changed = true;
-    LOG(WARNING)
-        << "Tweaking the MinMax of the output array of " << LogName(*op)
-        << ", because we want all inputs "
-        << "and outputs of a Concatenation operator to have the same MinMax "
-        << "so that it can be implemented as a pure byte-copy, no arithmetic.";
+    if (model->flags.change_concat_input_ranges()) {
+      changed = true;
+      LOG(WARNING)
+          << "Tweaking the MinMax of the output array of " << LogName(*op)
+          << ", because we want all inputs "
+          << "and outputs of a Concatenation operator to have the same MinMax "
+          << "so that it can be implemented as a pure byte-copy, no "
+          << "arithmetic.";
+    } else {
+      return false;
+    }
   }
   output.GetOrCreateMinMax() = overall_minmax;
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 7784558b22..5b1268f9a9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -431,7 +431,8 @@ bool ChooseQuantizationForOperatorOutput(
       (op.type == OperatorType::kSpaceToDepth) ||
       (op.type == OperatorType::kTensorFlowReshape) ||
       (op.type == OperatorType::kTensorFlowSplit) ||
-      (op.type == OperatorType::kConcatenation)) {
+      (op.type == OperatorType::kConcatenation &&
+       model->flags.change_concat_input_ranges())) {
     int data_input_index = 0;
     if (op.type == OperatorType::kTensorFlowSplit) {
       data_input_index = 1;
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 0fa6e8598f..7bbeab7c9d 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -165,6 +165,11 @@ bool ParseModelFlagsFromCommandLineFlags(
            "Path to an optional file containing a serialized ModelFlags proto. "
            "Options specified on the command line will override the values in "
            "the proto."),
+      Flag("change_concat_input_ranges",
+           parsed_flags.change_concat_input_ranges.bind(),
+           parsed_flags.change_concat_input_ranges.default_value(),
+           "Boolean to change the behavior of min/max ranges for inputs and"
+           " output of the concat operators."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -399,6 +404,8 @@ void ReadModelFlagsFromCommandLineFlags(
       parsed_model_flags.allow_nonascii_arrays.value());
   model_flags->set_allow_nonexistent_arrays(
       parsed_model_flags.allow_nonexistent_arrays.value());
+  model_flags->set_change_concat_input_ranges(
+      parsed_model_flags.change_concat_input_ranges.value());
 
   if (parsed_model_flags.arrays_extra_info_file.specified()) {
     string arrays_extra_info_file_contents;
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 835dea49eb..d23e80c464 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -128,7 +128,7 @@ message ArraysExtraInfo {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 19.
+// Next ID to USE: 20.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -175,4 +175,8 @@ message ModelFlags {
   // If set, this ArraysExtraInfo allows to pass extra information about arrays
   // not specified in the input model file, such as extra MinMax information.
   optional ArraysExtraInfo arrays_extra_info = 18;
+
+  // When set to false, toco will not change the input ranges and the output
+  // ranges of concat operator to the overlap of all input ranges.
+  optional bool change_concat_input_ranges = 19 [default = true];
 }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 61d08fa13f..b72f5fa2a7 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1413,7 +1413,8 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       CHECK(input_array.shape().dims_size());
     }
   }
-
+  model->flags.set_change_concat_input_ranges(
+      model_flags.change_concat_input_ranges());
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
   model->flags.set_allow_nonexistent_arrays(
       model_flags.allow_nonexistent_arrays());
-- 
GitLab


From c5aa11eb33542422889398c71fc61cf01a3cc5ca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 17:43:43 -0700
Subject: [PATCH 0342/1262]   refactor and add proto field required by POD
 support.

PiperOrigin-RevId: 191826636
---
 .../tpu/profiler/capture_tpu_profile.cc       | 68 ++++++++++++-----
 .../contrib/tpu/profiler/dump_tpu_profile.cc  | 75 +++++++------------
 .../contrib/tpu/profiler/dump_tpu_profile.h   |  1 +
 .../contrib/tpu/profiler/tpu_profiler.proto   | 22 +++++-
 4 files changed, 99 insertions(+), 67 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index f2003e04dd..6b198dbc16 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -64,9 +64,11 @@ Status ValidateHostPortPair(const string& host_port) {
   return Status::OK();
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms,
-                        const string& repository_root, const string& session_id,
-                        const ProfileOptions& opts) {
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+             const string& repository_root, const string& session_id,
+             const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
@@ -94,7 +96,31 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
           channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-  return response;
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
+        logdir, session_id, "", response, &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  return response.encoded_trace().empty();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+bool NewSession(const string& service_addr,
+                const std::vector<tensorflow::string>& hostnames,
+                int duration_ms, const string& repository_root,
+                const string& session_id, const ProfileOptions& opts) {
+  return true;
 }
 
 }  // namespace
@@ -104,12 +130,16 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
+  tensorflow::string FLAGS_workers_list;
   int FLAGS_duration_ms = 2000;
   int FLAGS_num_tracing_attempts = 3;
   bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
+      tensorflow::Flag("workers_list", &FLAGS_workers_list,
+                       "The list of worker TPUs that we are about to profile "
+                       "in the current session."),
       tensorflow::Flag("logdir", &FLAGS_logdir,
                        "Path of TensorBoard log directory e.g. /tmp/tb_log, "
                        "gs://tb_bucket"),
@@ -153,18 +183,30 @@ int main(int argc, char** argv) {
   constexpr char kProfilePluginDirectory[] = "plugins/profile/";
   tensorflow::string repository_root =
       ::tensorflow::io::JoinPath(FLAGS_logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(FLAGS_workers_list, ",");
+
+  bool empty_trace = false;
   while (true) {
     std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
               << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms,
-                                        repository_root, session_id, opts);
-    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    if (hostnames.empty()) {
+      empty_trace = tensorflow::tpu::Profile(FLAGS_service_addr, FLAGS_logdir,
+                                             duration_ms, repository_root,
+                                             session_id, opts);
+    } else {
+      tensorflow::string tpu_master = FLAGS_service_addr;
+      empty_trace =
+          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
+                                      repository_root, session_id, opts);
+    }
+    if (remaining_attempts <= 0 || !empty_trace) break;
     std::cout << "No trace event is collected. Automatically retrying."
               << std::endl
               << std::endl;
   }
 
-  if (response.encoded_trace().empty()) {
+  if (empty_trace) {
     std::cout << "No trace event is collected after "
               << FLAGS_num_tracing_attempts << " attempt(s). "
               << "Perhaps, you want to try again (with more attempts?)."
@@ -175,13 +217,5 @@ int main(int argc, char** argv) {
     return 0;
   }
 
-  TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-      FLAGS_logdir, session_id, response, &std::cout));
-  // Print this at the end so that it's not buried in irrelevant LOG messages.
-  std::cout
-      << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl
-      << "Set an appropriate duration (with --duration_ms) if you "
-         "don't see a full step in your trace or the captured trace is too "
-         "large."
-      << std::endl;
+  return 0;
 }
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ebd6185faa..ae508583f8 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -41,6 +41,7 @@ namespace {
 using ::tensorflow::io::JoinPath;
 using ::tensorflow::protobuf::util::JsonOptions;
 using ::tensorflow::protobuf::util::MessageToJsonString;
+using ::tensorflow::strings::StrCat;
 
 constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
 constexpr char kJsonOpProfileFileName[] = "op_profile.json";
@@ -61,28 +62,33 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
   return Status::OK();
 }
 
-Status DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace,
-                               std::ostream* os) {
+Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
+                               const string& encoded_trace, std::ostream* os) {
   string proto_path = JoinPath(run_dir, kProtoTraceFileName);
   TF_RETURN_IF_ERROR(
       WriteStringToFile(Env::Default(), proto_path, encoded_trace));
   LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
 
-  string json_path = JoinPath(run_dir, kJsonTraceFileName);
+  string json_path = JoinPath(run_dir, StrCat(host_prefix, kJsonTraceFileName));
   Trace trace;
   trace.ParseFromString(encoded_trace);
-  *os << "Trace contains " << trace.trace_events_size() << " events."
-      << std::endl;
+  if (os) {
+    *os << "Trace contains " << trace.trace_events_size() << " events."
+        << std::endl;
+  }
   TF_RETURN_IF_ERROR(
       WriteGzippedDataToFile(json_path, TraceEventsToJson(trace)));
-  *os << "Dumped JSON trace data to " << json_path << std::endl;
+  if (os) {
+    *os << "Dumped JSON trace data to " << json_path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
+                                   const string& host_prefix,
                                    const tpu::op_profile::Profile& profile,
                                    std::ostream* os) {
-  string path = JoinPath(run_dir, kJsonOpProfileFileName);
+  string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
   JsonOptions options;
   options.always_print_primitive_fields = true;
@@ -93,49 +99,20 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
         string(status.error_message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
-  *os << "Dumped json op profile data to " << path << std::endl;
+  if (os) {
+    *os << "Dumped json op profile data to " << path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
+                                  const string& host_prefix,
                                   const tensorflow::ProfileToolData& tool,
                                   std::ostream* os) {
-  string path = JoinPath(run_dir, tool.name());
+  string path = JoinPath(run_dir, StrCat(host_prefix, tool.name()));
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, tool.data()));
-  *os << "Dumped tool data for " << tool.name() << " to " << path << std::endl;
-  return Status::OK();
-}
-
-Status DumpGraphEvents(const string& logdir, const string& run,
-                       const ProfileResponse& response, std::ostream* os) {
-  int num_graphs = response.computation_graph_size();
-  if (response.computation_graph_size() == 0) return Status::OK();
-  // The server might generates multiple graphs for one program; we simply
-  // pick the first one.
-  if (num_graphs > 1) {
-    *os << num_graphs
-        << " TPU program variants observed over the profiling period. "
-        << "One computation graph will be chosen arbitrarily." << std::endl;
-  }
-  // The graph plugin expects the graph in <logdir>/<run>/<event.file>.
-  string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run));
-  TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir));
-  EventsWriter event_writer(JoinPath(run_dir, "events"));
-  Event event;
-  // Add the computation graph.
-  event.set_graph_def(response.computation_graph(0).SerializeAsString());
-  event_writer.WriteEvent(event);
-  *os << "Wrote a HLO graph to " << event_writer.FileName() << std::endl;
-
-  if (response.has_hlo_metadata()) {
-    tensorflow::TaggedRunMetadata tagged_run_metadata;
-    tagged_run_metadata.set_tag(run);
-    tagged_run_metadata.set_run_metadata(
-        response.hlo_metadata().SerializeAsString());
-    tensorflow::Event meta_event;
-    *meta_event.mutable_tagged_run_metadata() = tagged_run_metadata;
-    event_writer.WriteEvent(meta_event);
-    *os << "Wrote HLO ops run metadata to " << event_writer.FileName()
+  if (os) {
+    *os << "Dumped tool data for " << tool.name() << " to " << path
         << std::endl;
   }
   return Status::OK();
@@ -144,27 +121,29 @@ Status DumpGraphEvents(const string& logdir, const string& run,
 }  // namespace
 
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os) {
   // Dumps profile data to <logdir>/plugins/profile/<run>/.
+  string host_prefix = host.empty() ? "" : StrCat(host, ".");
   string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
   if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
-    TF_RETURN_IF_ERROR(
-        DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
+    TF_RETURN_IF_ERROR(DumpTraceToLogDirectory(profile_run_dir, host_prefix,
+                                               response.encoded_trace(), os));
   }
   if (response.has_op_profile() &&
       (response.op_profile().has_by_program_structure() ||
        response.op_profile().has_by_category())) {
-    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
+    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, host_prefix,
                                                    response.op_profile(), os));
   }
   for (const auto& tool_data : response.tool_data()) {
-    TF_RETURN_IF_ERROR(
-        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
+    TF_RETURN_IF_ERROR(DumpToolDataToLogDirectory(profile_run_dir, host_prefix,
+                                                  tool_data, os));
   }
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 29ef977bac..ecf21b1de2 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -32,6 +32,7 @@ namespace tpu {
 // Note: this function creates a directory even when all fields in
 // ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index cddc3cd1b4..8505c4bc69 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -21,6 +21,17 @@ message ProfileOptions {
   // next-field: 2
 }
 
+message ToolRequestOptions {
+  // Required formats for the tool, it should be one of "json", "proto", "raw"
+  // etc. If not specified (backward compatible), use default format, i.e. most
+  // tools use json format.
+  string output_formats = 2;
+
+  // Whether save the result directly to repository or pass it back to caller.
+  // Default to false for backward compatibilities.
+  bool save_to_repo = 3;
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -30,9 +41,12 @@ message ProfileRequest {
   // events.
   uint64 max_events = 2;
 
-  // required profiling tools name such as "input_pipeline_analyzer" etc
+  // Required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Specifies the requirement for each tools.
+  map<string, ToolRequestOptions> tool_options = 8;
+
   // Optional profiling options that control how a TF session will be profiled.
   ProfileOptions opts = 4;
 
@@ -43,10 +57,14 @@ message ProfileRequest {
   // The user provided profile session identifier.
   string session_id = 6;
 
+  // The hostname of system where the profile should happen.
+  // We use it as identifier in part of our output filename.
+  string host_name = 7;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 7
+  // next-field: 9
 }
 
 message ProfileToolData {
-- 
GitLab


From 9fc9f19428e497f3a297538059804f69996a612e Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 5 Apr 2018 18:21:54 -0700
Subject: [PATCH 0343/1262] Lazily evaluate shapes with the C API enabled.

This change makes it so shapes are computed only when requested with
_USE_C_API = True. Note that the C API will still raise a shape error
if necessary when the op is created.

In addition, it cleans up the logic for _USE_C_SHAPES = True. In this
case, we lazily fetch and cache shapes directly from the C API. We no
longer need set_shapes_for_outputs at all in this case.

PiperOrigin-RevId: 191830565
---
 tensorflow/python/client/tf_session_helper.cc |   9 -
 tensorflow/python/client/tf_session_helper.h  |   7 -
 tensorflow/python/framework/importer.py       |   9 +-
 tensorflow/python/framework/ops.py            | 174 ++++++++++++------
 tensorflow/python/framework/tensor_util.py    |   3 +-
 .../python/ops/resource_variable_ops.py       |   7 +-
 6 files changed, 125 insertions(+), 84 deletions(-)

diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index b48d758e4a..b6481e7e29 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -629,15 +629,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
   TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
 }
 
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status) {
-  std::vector<int64_t> dims(num_dims);
-  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
-  return dims;
-}
-
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results) {
   int num_missing_unused_input_mappings;
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index d2b4abc476..cfd27c2bee 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -229,13 +229,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
                                     const std::vector<int64_t>& dims,
                                     bool unknown_shape, TF_Status* status);
 
-// Return the shape of output. `num_dims` should be the output of
-// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status);
-
 // Returns the string representations of the missing unused input mappings.
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results);
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 8beb74d2a0..3f8a8c4bef 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -685,11 +685,10 @@ def import_graph_def(graph_def,
                      ', '.join(x.name for x in op._input_types))))
         # pylint: enable=protected-access
 
-        if not g._is_function(op.type):  # pylint: disable=protected-access
-          # Execute shape inference for this op.
-          # NOTE(mrry): If the graph contains a cycle, the full shape
-          # information may not be available for this op's inputs.
-          ops.set_shapes_for_outputs(op)
+        # Execute shape inference for this op.
+        # NOTE(mrry): If the graph contains a cycle, the full shape
+        # information may not be available for this op's inputs.
+        ops.set_shape_and_handle_data_for_outputs(op)
         # For nodes with _output_shapes set, set the output shapes.
         if '_output_shapes' in op.node_def.attr:
           for i, output in enumerate(op.outputs):
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 84366e20f5..2574fa57a4 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -289,15 +289,26 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-    self._shape_val = tensor_shape.unknown_shape()
+
+    if _USE_C_API:
+      # This will be set by set_shape_and_handle_data_for_outputs.
+      self._shape_val = None
+    else:
+      # The Python code requires all tensors start with a shape to support shape
+      # inference on imported while loops. This isn't necessary with the C API
+      # enabled because the C API provides the shapes for imported nodes.
+      # TODO(skyewm): remove when _USE_C_API is removed.
+      self._shape_val = tensor_shape.unknown_shape()
+
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
 
-    # Attributes used for C++ shape inference. Not inspected, only forwarded.
-    # If set, will be a HandleData object from cpp_shape_inference.proto.
-    # TODO(b/74620627): remove when _USE_C_SHAPES is removed
-    self._handle_data = None
+    if not _USE_C_SHAPES:
+      # Attributes used for C++ shape inference. Not inspected, only forwarded.
+      # If set, will be a HandleData object from cpp_shape_inference.proto.
+      self._handle_data = None
+
     self._id = uid()
 
   @property
@@ -371,18 +382,45 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
-    graph = self._op._graph._c_graph # pylint: disable=protected-access
-    if graph and _USE_C_SHAPES:
-      num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output())
-      if num_dims == -1:
-        dim_list = None
+    if self._shape_val is None:
+      if _USE_C_SHAPES:
+        self._shape_val = self._c_api_shape()
       else:
-        dim_list = c_api.TF_GraphGetTensorShape_wrapper(
-            graph, self._as_tf_output(), num_dims)
-        dim_list = [None if i == -1 else i for i in dim_list]
-      return tensor_shape.TensorShape(dim_list)
+        assert _USE_C_API
+        # Call set_shape_and_handle_data_for_outputs in topological order on all
+        # ops that are needed to compute self.op's shape. We do this instead of
+        # having set_shape_and_handle_data_for_outputs recursively call
+        # Operation.shape on self.op.inputs to overflowing the call stack.
+        need_shapes = self._get_input_ops_without_shapes(self.op)
+        need_shapes.sort(key=lambda op: op._id)
+        for op in need_shapes:
+          set_shape_and_handle_data_for_outputs(op)
     return self._shape_val
 
+  def _get_input_ops_without_shapes(self, target_op):
+    """Returns ops needing shape inference to compute target_op's shape."""
+    result = []
+    stack = [self._op]
+    visited = set()
+    while stack:
+      op = stack.pop()
+      if op in visited: continue
+      result.append(op)
+      stack.extend(t.op for t in op.inputs if t._shape_val is None)
+      visited.add(op)
+    return result
+
+  def _c_api_shape(self):
+    """Returns the TensorShape of this tensor according to the C API."""
+    c_graph = self._op._graph._c_graph  # pylint: disable=protected-access
+    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+        c_graph, self._as_tf_output())
+    if unknown_shape:
+      return tensor_shape.unknown_shape()
+    else:
+      shape_vector = [None if d == -1 else d for d in shape_vector]
+      return tensor_shape.TensorShape(shape_vector)
+
   @property
   def _shape(self):
     logging.warning("Tensor._shape is private, use Tensor.shape "
@@ -466,8 +504,11 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if not _USE_C_SHAPES:  # pylint: disable=protected-access
-      self._shape_val = self._shape_val.merge_with(shape)
+    if _USE_C_SHAPES:  # pylint: disable=protected-access
+      # Reset cached shape.
+      self._shape_val = None
+    else:
+      self._shape_val = self.shape.merge_with(shape)
 
     if not self._op._graph._c_graph: return
 
@@ -579,6 +620,16 @@ class Tensor(_TensorLike):
     # Necessary to support Python's collection membership operators
     return id(self) == id(other)
 
+  def __copy__(self):
+    # Make sure _shape_val is computed before we copy.
+    # TODO(b/77597810): get rid of Tensor copies.
+    if self._shape_val is None:
+      set_shape_and_handle_data_for_outputs(self.op)
+    cls = self.__class__
+    result = cls.__new__(cls)
+    result.__dict__.update(self.__dict__)
+    return result
+
   # NOTE(mrry): This enables the Tensor's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Tensor class higher priority than an ndarray, or a
@@ -1932,6 +1983,13 @@ class Operation(object):
     if not isinstance(tensor, Tensor):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
+
+    # Make sure output shapes are already computed for this op in case we create
+    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
+    # lazily upon request.
+    if not _USE_C_SHAPES:
+      set_shape_and_handle_data_for_outputs(self)
+
     if self._c_op:
       # Reset cached inputs.
       self._inputs_val = None
@@ -2474,35 +2532,41 @@ class RegisterShape(object):
     return f
 
 
-def _set_shapes_for_outputs_c_api(op):
-  """set_shapes_for_outputs implementation when C API is enabled."""
-  # The C API computes the shapes when the TF_Operation is created. Fetch the
-  # output shapes from the C object.
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def _set_shape_and_handle_data_for_outputs_c_api(op):
+  """Set shapes and resource handle data using info from the C API."""
+  assert not _USE_C_SHAPES
   for output in op.outputs:
-    # pylint: disable=protected-access
-    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+    output._shape_val = output._c_api_shape()
+    # Set the resource handle data for compatibility with the Python shape
+    # inference code.
+    serialized = c_api.ResourceHandleShapeAndType(
         op._graph._c_graph, output._as_tf_output())
-    # pylint: enable=protected-access
-    if unknown_shape:
-      output.set_shape(tensor_shape.unknown_shape())
-    elif not shape_vector:
-      output.set_shape(tensor_shape.scalar())
-    else:
-      shape_vector = [None if d == -1 else d for d in shape_vector]
-      output.set_shape(tensor_shape.TensorShape(shape_vector))
-
-    serialized = c_api.ResourceHandleShapeAndType(op._graph._c_graph,
-                                                  output._as_tf_output())
     if serialized:
       output._handle_data = (
-          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
-              compat.as_bytes(serialized)))
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
+          .FromString(compat.as_bytes(serialized)))
     else:
       output._handle_data = None
 
-# TODO(skyewm): remove this when _USE_C_API flag is removed.
-def _set_shapes_for_outputs(op):
-  """set_shapes_for_outputs implementation when C API is disabled."""
+
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def set_shape_and_handle_data_for_outputs(op):
+  """Set the shapes and resource handle data for op's outputs.
+
+  When _USE_C_API = True, this is lazily called when a tensor's shape is first
+  requested. Usually this should work automatically, but some edge cases may
+  require manaully calling this first to make sure Tensor._shape_val and
+  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
+  Tensor).
+  """
+  if _USE_C_SHAPES: return
+
+  if op.graph._is_function(op.type):
+    for output in op.outputs:
+      output._shape_val = tensor_shape.unknown_shape()
+    return
+
   try:
     shape_func = _shape_registry.lookup(op.type)
   except LookupError:
@@ -2521,8 +2585,10 @@ def _set_shapes_for_outputs(op):
     shapes = shapes_dict["shapes"]
     handle_datas = shapes_dict["handle_data"]
     for output, handle_data in zip(op.outputs, handle_datas):
+      # Don't override any existing handle data that may have been manually set.
       # pylint: disable=protected-access
-      output._handle_data = handle_data
+      if output._handle_data is None:
+        output._handle_data = handle_data
       # pylint: enable=protected-access
 
   if len(op.outputs) != len(shapes):
@@ -2530,15 +2596,8 @@ def _set_shapes_for_outputs(op):
         "Shape function for op %s returned %d shapes but expected %d %s %s" %
         (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
   for output, s in zip(op.outputs, shapes):
-    output.set_shape(s)
-
-
-def set_shapes_for_outputs(op):
-  """Set the shapes for op's outputs."""
-  if op._c_op and _USE_C_SHAPES:  # pylint: disable=protected-access
-    return _set_shapes_for_outputs_c_api(op)
-  else:
-    return _set_shapes_for_outputs(op)
+    output._shape_val = tensor_shape.unknown_shape()
+    output._shape_val = output._shape_val.merge_with(s)
 
 
 class OpStats(object):
@@ -3331,18 +3390,14 @@ class Graph(object):
           original_op=self._default_original_op,
           op_def=op_def)
 
-      # TODO(vrv): Instead of eagerly filling in shape property for every op,
-      # only populate the shape when requested.
+      # Note: shapes are lazily computed with the C API enabled.
       #
       # TODO(skyewm): unlike in the original Python implementation, the C API
       # always computes shape information (even for function calls, which the
       # original Python shape inference code doesn't handle). Deprecate the
       # compute_shapes argument.
-      #
-      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
-      # is removed
-      if (ret._c_op and _USE_C_SHAPES) or compute_shapes:  # pylint: disable=protected-access
-        set_shapes_for_outputs(ret)
+      if not _USE_C_API and compute_shapes:
+        set_shape_and_handle_data_for_outputs(ret)
 
       self._create_op_helper(ret, compute_shapes=compute_shapes,
                              compute_device=compute_device)
@@ -3484,18 +3539,17 @@ class Graph(object):
         for c_op in c_api_util.new_tf_operations(self)
     ]
 
+    # pylint: disable=protected-access
     for op in new_ops:
       # Operations created by the C API always retrieve shapes from the C API so
       # we preserve the shapes of ops created in import_graph_def (from the
       # "_output_shapes" attr of the imported NodeDef).
-      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
-      # is removed.
-      _set_shapes_for_outputs_c_api(op)
+      if not _USE_C_SHAPES:
+        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
-      # pylint: disable=protected-access
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
-      # pylint: enable=protected-access
+    # pylint: enable=protected-access
 
     return new_ops
 
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 984bcecdfe..64b0fa6c00 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,7 +22,6 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -828,7 +827,7 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   Returns:
     A `TensorShape` based on the constant value of the given `tensor`.
   """
-  if context.executing_eagerly():
+  if isinstance(tensor, ops.EagerTensor):
     return tensor_shape.as_shape(
         [dim if dim != -1 else None for dim in tensor.numpy()])
 
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 07e25e540c..508ba9bfee 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -72,7 +72,12 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # know the shape and dtype of the variable pointed to by a handle. Since
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
-    handle._handle_data = h._handle_data  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    if h._handle_data is None:
+      ops.set_shape_and_handle_data_for_outputs(h.op)
+    handle._handle_data = h._handle_data
+    # pylint: enable=protected-access
+
   # Clean up our reference cycles to avoid making the garbage collector run.
   # pylint: disable=protected-access
   # OrderedDict, constructed on Graph creation, makes a simple reference loop
-- 
GitLab


From 2248a3488c53f8b858e2a0b8be93d62c3056df36 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 5 Apr 2018 18:23:32 -0700
Subject: [PATCH 0344/1262] [XLA] Don't call Literal::Get in HloEvaluator's
 convolution loop.

This speeds up the implementation of conv because Literal::Get calls
Literal::Piece::data, which is relatively slow.

Instead, we call Literal::Data() once and cache the result.

Before: ConvolutionTest/0.StridedFilter (59094 ms)
After:  ConvolutionTest/0.StridedFilter (41812 ms)

Speedup: 59/42 = 1.4x
PiperOrigin-RevId: 191830741
---
 tensorflow/compiler/xla/service/hlo_evaluator.cc | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 9d7251b6ae..4bec953bf7 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1003,6 +1003,9 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector rhs_index(rhs_rank);
     DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
 
+    auto lhs_literal_data = lhs_literal.data<ReturnT>();
+    auto rhs_literal_data = rhs_literal.data<ReturnT>();
+
     auto func = [&](ArraySlice<int64> out_index) {
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
 
@@ -1062,9 +1065,13 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                     : rhs_spatial_index[ki];
           }
 
-          result_val +=
-              static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
-              static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+          auto lhs_elem = static_cast<ElementwiseT>(
+              lhs_literal_data[IndexUtil::MultidimensionalIndexToLinearIndex(
+                  lhs_shape, lhs_index)]);
+          auto rhs_elem = static_cast<ElementwiseT>(
+              rhs_literal_data[IndexUtil::MultidimensionalIndexToLinearIndex(
+                  rhs_shape, rhs_index)]);
+          result_val += lhs_elem * rhs_elem;
         }
       cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
-- 
GitLab


From 96a72a9a836ad1c1f46e74d30eda4deb78740efe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 18:37:19 -0700
Subject: [PATCH 0345/1262] Added `drop_final_batch` argument to
 make_batched_features_dataset. This allows the batch_and_drop_remainder
 function to be used instead of the default batch function.

PiperOrigin-RevId: 191831842
---
 .../kernel_tests/reader_dataset_ops_test.py   | 20 +++++++++++++++++--
 tensorflow/contrib/data/python/ops/readers.py | 11 ++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 6ee1b572f1..f3e9302409 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -271,7 +271,8 @@ class ReadBatchFeaturesTest(test.TestCase):
                            reader_num_threads=1,
                            parser_num_threads=1,
                            shuffle=False,
-                           shuffle_seed=None):
+                           shuffle_seed=None,
+                           drop_final_batch=False):
     self.filenames = filenames
     self.num_epochs = num_epochs
     self.batch_size = batch_size
@@ -289,7 +290,8 @@ class ReadBatchFeaturesTest(test.TestCase):
         shuffle=shuffle,
         shuffle_seed=shuffle_seed,
         reader_num_threads=reader_num_threads,
-        parser_num_threads=parser_num_threads).make_one_shot_iterator(
+        parser_num_threads=parser_num_threads,
+        drop_final_batch=drop_final_batch).make_one_shot_iterator(
         ).get_next()
 
   def _record(self, f, r):
@@ -559,6 +561,20 @@ class ReadBatchFeaturesTest(test.TestCase):
               with self.assertRaises(errors.OutOfRangeError):
                 self._next_actual_batch(sess)
 
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default():
+          # Basic test: read from file 0.
+          self.outputs = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              drop_final_batch=True)
+          for _, tensor in self.outputs.items():
+            if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
+              self.assertEqual(tensor.shape[0], batch_size)
+
 
 class MakeCsvDatasetTest(test.TestCase):
 
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 9a48aa02fb..b8eb09978e 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -370,7 +370,8 @@ def make_batched_features_dataset(file_pattern,
                                   prefetch_buffer_size=1,
                                   reader_num_threads=1,
                                   parser_num_threads=2,
-                                  sloppy_ordering=False):
+                                  sloppy_ordering=False,
+                                  drop_final_batch=False):
   """Returns a `Dataset` of feature dictionaries from `Example` protos.
 
   Example:
@@ -443,6 +444,9 @@ def make_batched_features_dataset(file_pattern,
       produced is deterministic prior to shuffling (elements are still
       randomized if `shuffle=True`. Note that if the seed is set, then order
       of elements after shuffling is deterministic). Defaults to `False`.
+    drop_final_batch: If `True`, and the batch size does not evenly divide the
+      input dataset size, the final smaller batch will be dropped. Defaults to
+      `False`.
 
   Returns:
     A dataset of `dict` elements. Each `dict` maps feature keys to
@@ -481,7 +485,10 @@ def make_batched_features_dataset(file_pattern,
   elif shuffle:
     dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
 
-  dataset = dataset.batch(batch_size)
+  if drop_final_batch:
+    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
+  else:
+    dataset = dataset.batch(batch_size)
 
   # Parse `Example` tensors to a dictionary of `Feature` tensors.
   dataset = dataset.map(
-- 
GitLab


From 538cab2870fd02b4d89e0534ad52573e67f16606 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 18:50:35 -0700
Subject: [PATCH 0346/1262] Add RunMetadata logging to tf.train.ProfilerHook
 for Tensorboard Memeory/CPU usage visualization

PiperOrigin-RevId: 191832832
---
 .../testing/python/framework/fake_summary_writer.py |  6 ++++++
 .../python/training/basic_session_run_hooks.py      |  3 +++
 .../python/training/basic_session_run_hooks_test.py | 13 +++++++++++++
 3 files changed, 22 insertions(+)

diff --git a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
index 15a415df30..eac34afc4a 100644
--- a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
+++ b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
@@ -52,6 +52,7 @@ class FakeSummaryWriter(object):
     self._added_graphs = []
     self._added_meta_graphs = []
     self._added_session_logs = []
+    self._added_run_metadata = {}
 
   @property
   def summaries(self):
@@ -127,6 +128,11 @@ class FakeSummaryWriter(object):
     # pylint: disable=unused-argument
     self._added_session_logs.append(session_log)
 
+  def add_run_metadata(self, run_metadata, tag, global_step=None):
+    if (global_step is not None) and (global_step < 0):
+      raise ValueError('Invalid global_step %s.' % global_step)
+    self._added_run_metadata[tag] = run_metadata
+
   def flush(self):
     pass
 
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index aae757b99a..094a9e886b 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -859,6 +859,7 @@ class ProfilerHook(session_run_hook.SessionRunHook):
           showing the sizes and lifetimes of tensors.
     """
     self._output_file = os.path.join(output_dir, "timeline-{}.json")
+    self._file_writer = SummaryWriterCache.get(output_dir)
     self._show_dataflow = show_dataflow
     self._show_memory = show_memory
     self._timer = SecondOrStepTimer(
@@ -889,6 +890,8 @@ class ProfilerHook(session_run_hook.SessionRunHook):
       self._save(global_step,
                  self._output_file.format(global_step),
                  run_values.run_metadata.step_stats)
+      self._file_writer.add_run_metadata(run_values.run_metadata,
+                                         "step_%d" % global_step)
 
     self._next_step = global_step + 1
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2547661e52..f39a5261a9 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -1274,6 +1274,19 @@ class ProfilerHookTest(test.TestCase):
         sess.run(self.train_op)  # Saved.
         self.assertEqual(3, self._count_timeline_files())
 
+  def test_run_metadata_saves_in_first_step(self):
+    writer_cache.FileWriterCache.clear()
+    fake_summary_writer.FakeSummaryWriter.install()
+    fake_writer = writer_cache.FileWriterCache.get(self.output_dir)
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.ProfilerHook(
+          save_secs=2, output_dir=self.output_dir)
+      with monitored_session.SingularMonitoredSession(hooks=[hook]) as sess:
+        sess.run(self.train_op)  # Saved.
+        self.assertEqual(
+            list(fake_writer._added_run_metadata.keys()), ['step_1'])
+    fake_summary_writer.FakeSummaryWriter.uninstall()
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 05f665543adecddfbfc44f7bcd4e74d983c2c442 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 5 Apr 2018 19:30:10 -0700
Subject: [PATCH 0347/1262] [XLA] Don't call MultidimensionalIndexToLinearIndex
 in HloEvaluator's convolution routine.

Before: ConvolutionTest/0.StridedFilter (41812 ms)
After:  ConvolutionTest/0.StridedFilter (28054 ms)

Speedup: 42 / 28 = 1.5x
PiperOrigin-RevId: 191835735
---
 .../compiler/xla/service/hlo_evaluator.cc     | 74 ++++++++++++-------
 1 file changed, 47 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 4bec953bf7..53ad8909c5 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -202,6 +202,25 @@ void IterateThroughWindow(
   } while (IndexUtil::BumpIndices(window_shape, &window_index));
 }
 
+// Creates a vector of multipliers which can be used to create a linear index
+// into shape.
+//
+// Given the multidimensional index {i1, ..., iN} and
+// M = MakeDimMultipliers(shape), the corresponding linear index LI is simply
+//
+//   LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N].
+//
+// This lets you calculate LI given the multidimensional indices in any order.
+DimensionVector MakeDimMultipliers(const Shape& shape) {
+  DimensionVector v(ShapeUtil::Rank(shape));
+  int64 scale = 1;
+  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
+    v[dim] = scale;
+    scale *= shape.dimensions(dim);
+  }
+  return v;
+}
+
 }  // namespace
 
 template <typename ReturnT, typename ElementwiseT>
@@ -999,8 +1018,9 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Shape& window_shape =
         ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes);
 
-    DimensionVector lhs_index(lhs_rank);
-    DimensionVector rhs_index(rhs_rank);
+    DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
+    DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
+
     DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
 
     auto lhs_literal_data = lhs_literal.data<ReturnT>();
@@ -1008,19 +1028,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     auto func = [&](ArraySlice<int64> out_index) {
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
-
-      std::fill(lhs_index.begin(), lhs_index.end(), 0);
-      std::fill(rhs_index.begin(), rhs_index.end(), 0);
       std::fill(rhs_spatial_index.begin(), rhs_spatial_index.end(), 0);
 
-      lhs_index[input_batch_dim] = out_index[output_batch_dim];
-      rhs_index[kernel_output_z_dim] = out_index[output_z_dim];
-
       // Convolve input feature with kernel.
       do {
         for (int64 iz = 0; iz < z_size; ++iz) {
-          lhs_index[input_z_dim] = iz;
-          rhs_index[kernel_input_z_dim] = iz;
+          int64 lhs_linear_index = 0;
+          lhs_linear_index += out_index[output_batch_dim] *
+                              lhs_dim_multipliers[input_batch_dim];
+          lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
+
+          int64 rhs_linear_index = 0;
+          rhs_linear_index += out_index[output_z_dim] *
+                              rhs_dim_multipliers[kernel_output_z_dim];
+          rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim];
 
           // Find corresponding spatial dimension index for input (lhs).
           for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
@@ -1045,33 +1066,32 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
             // Calculate the actual lhs (input) index after dilation.  As an
             // optimization, skip this integer divide if there's no dilation.
+            int64 lhs_spatial_index;
             if (window_dim.base_dilation() > 1) {
-              lhs_index[input_spatial_dim] =
-                  undilated_index / window_dim.base_dilation();
+              lhs_spatial_index = undilated_index / window_dim.base_dilation();
             } else {
-              lhs_index[input_spatial_dim] = undilated_index;
+              lhs_spatial_index = undilated_index;
             }
+            lhs_linear_index +=
+                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
 
-            // Skip if input index is not in bound.
-            if (!(lhs_index[input_spatial_dim] >= 0 &&
-                  lhs_index[input_spatial_dim] <
+            // Skip if input index is not in bounds.
+            if (!(lhs_spatial_index >= 0 &&
+                  lhs_spatial_index <
                       lhs_shape.dimensions(input_spatial_dim))) {
               goto cnt;
             }
 
-            rhs_index[dnums.kernel_spatial_dimensions(ki)] =
-                window_dim.window_reversal()
-                    ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                    : rhs_spatial_index[ki];
+            rhs_linear_index +=
+                (window_dim.window_reversal()
+                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                     : rhs_spatial_index[ki]) *
+                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
           }
 
-          auto lhs_elem = static_cast<ElementwiseT>(
-              lhs_literal_data[IndexUtil::MultidimensionalIndexToLinearIndex(
-                  lhs_shape, lhs_index)]);
-          auto rhs_elem = static_cast<ElementwiseT>(
-              rhs_literal_data[IndexUtil::MultidimensionalIndexToLinearIndex(
-                  rhs_shape, rhs_index)]);
-          result_val += lhs_elem * rhs_elem;
+          result_val +=
+              static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
+              static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
         }
       cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
-- 
GitLab


From fbb5a655f985b9040a99e8e2acd9631fd70abe01 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 19:33:58 -0700
Subject: [PATCH 0348/1262] Expose the adaptive sampling option for SDCA and
 shuffle the data when adaptive sampling is off.

PiperOrigin-RevId: 191836004
---
 .../python/kernel_tests/sdca_ops_test.py      | 54 +++++++++++++++++++
 .../linear_optimizer/python/ops/sdca_ops.py   |  7 ++-
 .../linear_optimizer/python/sdca_optimizer.py |  9 +++-
 tensorflow/core/kernels/sdca_internal.cc      |  5 ++
 tensorflow/core/kernels/sdca_internal.h       |  7 ++-
 tensorflow/core/kernels/sdca_ops.cc           |  6 +--
 6 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index cfe62fac43..ac50699f59 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import random
 import threading
 
 from tensorflow.contrib.linear_optimizer.python.ops.sdca_ops import SdcaModel
@@ -102,6 +103,33 @@ def make_example_dict(example_protos, example_weights):
       example_ids=['%d' % i for i in range(0, len(example_protos))])
 
 
+def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero):
+  random.seed(1)
+  sparse_features = [
+      SparseFeatureColumn(
+          [int(i / num_non_zero) for i in range(num_examples * num_non_zero)],
+          [int(random.random() * dim) for _ in range(
+              num_examples * num_non_zero)],
+          [num_non_zero**(-0.5) for _ in range(num_examples * num_non_zero)])
+  ]
+  examples_dict = dict(
+      sparse_features=sparse_features,
+      dense_features=[],
+      example_weights=[random.random() for _ in range(num_examples)],
+      example_labels=[
+          1. if random.random() > 0.5 else 0. for _ in range(num_examples)
+      ],
+      example_ids=[str(i) for i in range(num_examples)])
+
+  weights = variables_lib.Variable(
+      array_ops.zeros([dim], dtype=dtypes.float32))
+  variables_dict = dict(
+      sparse_features_weights=[weights],
+      dense_features_weights=[])
+
+  return examples_dict, variables_dict
+
+
 def make_variable_dict(max_age, max_gender):
   # TODO(sibyl-toe9oF2e):  Figure out how to derive max_age & max_gender from
   # examples_dict.
@@ -235,6 +263,32 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         self.assertAllClose(
             0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2)
 
+  def testSparseRandom(self):
+    dim = 20
+    num_examples = 1000
+    # Number of non-zero features per example.
+    non_zeros = 10
+    # Setup test data.
+    with self._single_threaded_test_session():
+      examples, variables = make_random_examples_and_variables_dicts(
+          num_examples, dim, non_zeros)
+      options = dict(
+          symmetric_l2_regularization=.1,
+          symmetric_l1_regularization=0,
+          num_table_shards=1,
+          adaptive=False,
+          loss_type='logistic_loss')
+
+      lr = SdcaModel(examples, variables, options)
+      variables_lib.global_variables_initializer().run()
+      train_op = lr.minimize()
+      for _ in range(4):
+        train_op.run()
+      lr.update_weights(train_op).run()
+      # Duality gap is 1.4e-5.
+      # It would be 0.01 without shuffling and 0.02 with adaptive sampling.
+      self.assertNear(0.0, lr.approximate_duality_gap().eval(), err=1e-3)
+
   def testDistributedSimple(self):
     # Setup test data
     example_protos = [
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 3f5fdc18bb..f980746a19 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -168,6 +168,10 @@ class SdcaModel(object):
     # of workers
     return self._options.get('num_loss_partitions', 1)
 
+  def _adaptive(self):
+    # Perform adaptive sampling.
+    return self._options.get('adaptive', True)
+
   def _num_table_shards(self):
     # Number of hash table shards.
     # Return 1 if not specified or if the value is 'None'
@@ -344,7 +348,8 @@ class SdcaModel(object):
           l1=self._options['symmetric_l1_regularization'],
           l2=self._symmetric_l2_regularization(),
           num_loss_partitions=self._num_loss_partitions(),
-          num_inner_iterations=1)
+          num_inner_iterations=1,
+          adaptative=self._adaptive())
       # pylint: enable=protected-access
 
       with ops.control_dependencies([esu]):
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 92d022f2a3..dffdddacfb 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -71,12 +71,14 @@ class SDCAOptimizer(object):
                num_loss_partitions=1,
                num_table_shards=None,
                symmetric_l1_regularization=0.0,
-               symmetric_l2_regularization=1.0):
+               symmetric_l2_regularization=1.0,
+               adaptive=True):
     self._example_id_column = example_id_column
     self._num_loss_partitions = num_loss_partitions
     self._num_table_shards = num_table_shards
     self._symmetric_l1_regularization = symmetric_l1_regularization
     self._symmetric_l2_regularization = symmetric_l2_regularization
+    self._adaptive = adaptive
 
   def get_name(self):
     return 'SDCAOptimizer'
@@ -101,6 +103,10 @@ class SDCAOptimizer(object):
   def symmetric_l2_regularization(self):
     return self._symmetric_l2_regularization
 
+  @property
+  def adaptive(self):
+    return self._adaptive
+
   def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
                      features, targets, global_step):
     """Returns the training operation of an SdcaModel optimizer."""
@@ -228,6 +234,7 @@ class SDCAOptimizer(object):
         options=dict(
             symmetric_l1_regularization=self._symmetric_l1_regularization,
             symmetric_l2_regularization=self._symmetric_l2_regularization,
+            adaptive=self._adaptive,
             num_loss_partitions=self._num_loss_partitions,
             num_table_shards=self._num_table_shards,
             loss_type=loss_type))
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 5a389a6548..623de2a482 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -302,6 +302,11 @@ Status Examples::SampleAdaptiveProbabilities(
   return Status::OK();
 }
 
+void Examples::RandomShuffle() {
+  std::iota(sampled_index_.begin(), sampled_index_.end(), 0);
+  std::random_shuffle(sampled_index_.begin(), sampled_index_.end());
+}
+
 // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
 Status Examples::Initialize(OpKernelContext* const context,
                             const ModelWeights& weights,
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 1665b1210e..bfdb3febdc 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -322,10 +322,7 @@ class Examples {
     return examples_.at(example_index);
   }
 
-  int sampled_index(const int id, const bool adaptive) const {
-    if (adaptive) return sampled_index_[id];
-    return id;
-  }
+  int sampled_index(const int id) const { return sampled_index_[id]; }
 
   // Adaptive SDCA in the current implementation only works for
   // binary classification, where the input argument for num_weight_vectors
@@ -337,6 +334,8 @@ class Examples {
       const std::unique_ptr<DualLossUpdater>& loss_updater,
       const int num_weight_vectors);
 
+  void RandomShuffle();
+
   int num_examples() const { return examples_.size(); }
 
   int num_features() const { return num_features_; }
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 5b63057f3f..55e68b348b 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -153,8 +153,9 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
                        options.num_loss_partitions, options.regularizations,
                        model_weights, example_state_data, options.loss_updater,
                        /*num_weight_vectors =*/1));
+  } else {
+    examples.RandomShuffle();
   }
-
   mutex mu;
   Status train_step_status GUARDED_BY(mu);
   std::atomic<std::int64_t> atomic_index(-1);
@@ -162,8 +163,7 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
     for (int id = static_cast<int>(begin); id < end; ++id) {
-      const int64 example_index =
-          examples.sampled_index(++atomic_index, options.adaptive);
+      const int64 example_index = examples.sampled_index(++atomic_index);
       const Example& example = examples.example(example_index);
       const float dual = example_state_data(example_index, 0);
       const float example_weight = example.example_weight();
-- 
GitLab


From d82b2f71b60d5fff48884c20c7b85e517330e91f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 6 Apr 2018 10:51:12 +0800
Subject: [PATCH 0349/1262] add assert_element_shape method for tf.contrib.data
 (#17480)

* ENH: add assert_element_shape method

* CLN: add indentation

* ENH: raise exception when wrong shape is given

* CLN: fix too long line
---
 tensorflow/contrib/data/__init__.py           |  2 +
 .../contrib/data/python/kernel_tests/BUILD    |  1 +
 .../kernel_tests/batch_dataset_op_test.py     | 70 +++++++++++++++++++
 tensorflow/contrib/data/python/ops/BUILD      |  1 +
 .../contrib/data/python/ops/batching.py       | 40 +++++++++++
 5 files changed, 114 insertions(+)

diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 17048314a4..125260b4c1 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -25,6 +25,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@Counter
 @@SqlDataset
 
+@@assert_element_shape
 @@batch_and_drop_remainder
 @@bucket_by_sequence_length
 @@dense_to_sparse_batch
@@ -55,6 +56,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 
+from tensorflow.contrib.data.python.ops.batching import assert_element_shape
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import map_and_batch
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c8699e0d5a..7270d533c6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -22,6 +22,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 75482f67da..413d873797 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -28,8 +28,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -579,5 +581,73 @@ class PaddedBatchDatasetSerializationTest(
                         lambda: build_dataset(seq_lens2), 8)
 
 
+class RestructuredDatasetTest(test.TestCase):
+
+  def test_assert_element_shape(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 236792bb98..a1a5c9ed05 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -119,6 +119,7 @@ py_library(
     deps = [
         ":contrib_op_loader",
         ":gen_dataset_ops",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index a212adf6cf..1eba010b56 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@@ -345,6 +346,45 @@ class _RestructuredDataset(dataset_ops.Dataset):
     return self._output_shapes
 
 
+def assert_element_shape(expected_shapes):
+  """Assert the shape of this `Dataset`.
+
+  ```python
+  shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
+  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+  print(result.output_shapes)  # ==> "((16, 256), <unknown>)"
+  ```
+
+  If dataset shapes and expected_shape, are fully defined, assert they match.
+  Otherwise, add assert op that will validate the shapes when tensors are
+  evaluated, and set shapes on tensors, respectively.
+
+  Args:
+    expected_shapes: A nested structure of `tf.TensorShape` objects.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}
+  """
+
+  def _check_shape(*elements):
+    flatten_tensors = nest.flatten(elements)
+    flatten_shapes = nest.flatten(expected_shapes)
+    checked_tensors = [with_shape(shape, tensor)
+                       for shape, tensor in zip(flatten_shapes,
+                                                flatten_tensors)]
+    return nest.pack_sequence_as(elements, checked_tensors)
+
+  def _apply_fn(dataset):
+    return _RestructuredDataset(
+        dataset.map(_check_shape),
+        dataset.output_types,
+        output_shapes=expected_shapes,
+        output_classes=dataset.output_classes)
+
+  return _apply_fn
+
+
 class _MapAndBatchDataset(dataset_ops.MapDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
-- 
GitLab


From 60ed80200e443e1dfe35c87c001c3a984d7ee7aa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 5 Apr 2018 20:14:42 -0700
Subject: [PATCH 0350/1262] Swap in the new implementation of while and for
 loops.

PiperOrigin-RevId: 191838806
---
 tensorflow/contrib/autograph/converters/BUILD | 11 ---
 .../autograph/converters/break_statements.py  |  7 +-
 .../autograph/converters/control_flow.py      | 76 +++++++++++++--
 .../autograph/converters/control_flow_test.py | 72 +++++++++++++++
 .../converters/converter_test_base.py         |  2 +-
 .../contrib/autograph/converters/for_loops.py | 92 -------------------
 .../autograph/converters/for_loops_test.py    | 70 --------------
 tensorflow/contrib/autograph/impl/api_test.py |  3 +-
 .../contrib/autograph/impl/conversion.py      |  3 -
 .../contrib/autograph/utils/__init__.py       |  3 -
 .../contrib/autograph/utils/builtins.py       | 68 --------------
 .../autograph/utils/multiple_dispatch.py      | 41 ---------
 .../autograph/utils/multiple_dispatch_test.py | 23 -----
 13 files changed, 146 insertions(+), 325 deletions(-)
 delete mode 100644 tensorflow/contrib/autograph/converters/for_loops.py
 delete mode 100644 tensorflow/contrib/autograph/converters/for_loops_test.py

diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index 92cca30df4..8f9bffa55e 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -24,7 +24,6 @@ py_library(
         "continue_statements.py",
         "control_flow.py",
         "decorators.py",
-        "for_loops.py",
         "ifexp.py",
         "list_comprehension.py",
         "lists.py",
@@ -133,16 +132,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "for_loops_test",
-    srcs = ["for_loops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":test_lib",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
 py_test(
     name = "name_scopes_test",
     srcs = ["name_scopes_test.py"],
diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 48026bccab..62115d4005 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -32,6 +32,7 @@ class BreakCanonicalizationTransformer(transformer.Base):
   def __init__(self, context):
     super(BreakCanonicalizationTransformer, self).__init__(context)
     # This is a stack structure, to correctly process nested loops.
+    # Each item is a list [break_used, break_variable_name]
     self.break_uses = []
 
   def _create_break_check(self):
@@ -99,9 +100,9 @@ class BreakCanonicalizationTransformer(transformer.Base):
     self.break_uses.append([False, break_var])
     node.body = self._manual_visit_list(node.body)
     if self.break_uses[-1][0]:
-      anno.setanno(node, 'extra_cond',
-                   gast.UnaryOp(gast.Not(),
-                                gast.Name(break_var, gast.Load(), None)))
+      extra_cond = templates.replace_as_expression(
+          'not var_name', var_name=break_var)
+      anno.setanno(node, 'extra_cond', extra_cond)
       final_nodes = [self._create_break_init(), node]
     else:
       final_nodes = node
diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
index 49d932026f..55a28e8ac3 100644
--- a/tensorflow/contrib/autograph/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -22,6 +22,7 @@ import gast
 
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
@@ -49,11 +50,6 @@ class ControlFlowTransformer(transformer.Base):
   def __init__(self, context):
     super(ControlFlowTransformer, self).__init__(context)
 
-  # pylint:disable=invalid-name
-
-  def visit_For(self, node):
-    assert False, 'for statement should have been canonicalized at this point'
-
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
     if aliased_orig_names:
@@ -170,6 +166,13 @@ class ControlFlowTransformer(transformer.Base):
     body_closure = body_scope.modified - body_scope.created
     all_referenced = body_scope.referenced
 
+    cond_scope = anno.getanno(node, NodeAnno.COND_SCOPE)
+    cond_closure = set()
+    for s in cond_scope.referenced:
+      for root in s.support_set:
+        if root not in body_scope.created:
+          cond_closure.add(root)
+
     state = list(body_closure)
     if not state:
       # TODO(mdan): Implement this properly.
@@ -204,7 +207,8 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = autograph_utils.run_while(test_name, body_name, [state])
+      state_ast_tuple = __ops.while_loop(
+          test_name, body_name, (state,), (extra_deps,))
     """
     node = templates.replace(
         template,
@@ -216,11 +220,67 @@ class ControlFlowTransformer(transformer.Base):
         test=test,
         body_name=self.context.namer.new_symbol('loop_body',
                                                 body_scope.referenced),
-        body=node_body)
+        body=node_body,
+        extra_deps=tuple(s.ast() for s in cond_closure),
+    )
 
     return node
 
-  # pylint:enable=invalid-name
+  def visit_For(self, node):
+    self.generic_visit(node)
+
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    body_closure = body_scope.modified - body_scope.created
+    all_referenced = body_scope.referenced
+
+    state = list(body_closure)
+
+    state_ssf = [
+        self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
+    ]
+    ssf_map = {
+        name: ssf
+        for name, ssf in zip(state, state_ssf)
+        if str(name) != ssf
+    }
+
+    if len(state) == 1:
+      state = state[0]
+      state_ssf = state_ssf[0]
+      state_ast_tuple = state
+    else:
+      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    if anno.hasanno(node, 'extra_cond'):
+      extra_cond = anno.getanno(node, 'extra_cond')
+      extra_cond = ast_util.rename_symbols(extra_cond, ssf_map)
+    else:
+      extra_cond = parser.parse_expression('True')
+
+    template = """
+      def extra_cond_name(state_ssf):
+        return extra_cond_expr
+      def body_name(iterate, state_ssf):
+        body
+        return state_ssf,
+      state_ast_tuple = __ops.for_loop(
+          iterated, extra_cond_name, body_name, (state,))
+    """
+    node = templates.replace(
+        template,
+        state=state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iterated=node.iter,
+        iterate=node.target,
+        extra_cond_name=self.context.namer.new_symbol('extra_cond',
+                                                      all_referenced),
+        extra_cond_expr=extra_cond,
+        body_name=self.context.namer.new_symbol('loop_body', all_referenced),
+        body=node_body)
+
+    return node
 
 
 def transform(node, context):
diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
index 86fed51f27..c5610b16b4 100644
--- a/tensorflow/contrib/autograph/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
@@ -94,6 +95,77 @@ class ControlFlowTest(converter_test_base.TestCase):
       with self.test_session() as sess:
         self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
+  def test_simple_for(self):
+
+    def test_fn(l):
+      s1 = 0
+      s2 = 0
+      for e in l:
+        s1 += e
+        s2 += e * e
+      return s1, s2
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_single_var(self):
+
+    def test_fn(l):
+      s = 0
+      for e in l:
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_with_iterated_expression(self):
+
+    eval_count = [0]
+
+    def count_evals(x):
+      eval_count[0] += 1
+      return x
+
+    def test_fn(n):
+      s = 0
+      for e in count_evals(range(n)):
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      result.count_evals = count_evals
+      self.assertEqual(test_fn(5), result.test_fn(5))
+      # count_evals ran twice, once for test_fn and another for result.test_fn
+      self.assertEqual(eval_count[0], 2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 984e72c70c..6f75e9a529 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -78,7 +78,7 @@ class TestCase(test.TestCase):
       result.tf = self.make_fake_mod('fake_tf', *symbols)
       result.autograph_utils = utils
       result.autograph_api = self.make_fake_mod('fake_api', converted_call)
-      result.__ops = operators  # pylint:disable=protected-access
+      result.__dict__['__ops'] = operators
       yield result
     except Exception:  # pylint:disable=broad-except
       if source is None:
diff --git a/tensorflow/contrib/autograph/converters/for_loops.py b/tensorflow/contrib/autograph/converters/for_loops.py
deleted file mode 100644
index 4999c47bdc..0000000000
--- a/tensorflow/contrib/autograph/converters/for_loops.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Canonicalizes for loops into while loops.
-
-This canonicalizer uses the len function on its argument. That should be
-converted to a tf.shape separately.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.autograph.pyct import anno
-from tensorflow.contrib.autograph.pyct import templates
-from tensorflow.contrib.autograph.pyct import transformer
-from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
-
-
-class ForLoopCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes for loops (e.g. into while loops)."""
-
-  def __init__(self, context):
-    super(ForLoopCanonicalizationTransformer, self).__init__(context)
-
-  def visit_For(self, node):
-    self.generic_visit(node)
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    i_var = self.context.namer.new_symbol('i', body_scope.referenced)
-    smart_loop_iter_var = self.context.namer.new_symbol('smart_loop_iter',
-                                                        body_scope.referenced)
-    cont_var = self.context.namer.new_symbol('cont', body_scope.referenced)
-    # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
-    if anno.hasanno(node, 'extra_cond'):
-      template = """
-        i = 0
-        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
-        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-        while cont and extra_cond:
-          body
-          i += 1
-          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-      """
-      return templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=i_var,
-          smart_loop_iter=smart_loop_iter_var,
-          cont=cont_var,
-          extra_cond=anno.getanno(node, 'extra_cond'))
-    else:
-      template = """
-        i = 0
-        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
-        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-        while cont:
-          body
-          i += 1
-          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-      """
-      repl = templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=i_var,
-          smart_loop_iter=smart_loop_iter_var,
-          cont=cont_var)
-      return repl
-
-  def visit_Continue(self, node):
-    assert False, 'continue statement should be desugared at this point'
-
-  def visit_Break(self, node):
-    assert False, 'break statement should be desugared at this point'
-
-
-def transform(node, context):
-  return ForLoopCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/for_loops_test.py b/tensorflow/contrib/autograph/converters/for_loops_test.py
deleted file mode 100644
index 943f52de55..0000000000
--- a/tensorflow/contrib/autograph/converters/for_loops_test.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for for_loops module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.autograph.converters import converter_test_base
-from tensorflow.contrib.autograph.converters import for_loops
-from tensorflow.python.platform import test
-
-
-class ControlFlowTest(converter_test_base.TestCase):
-
-  def test_basic_for(self):
-
-    def test_fn(l):
-      s = 0
-      for e in l:
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      l = [1, 2, 3]
-      self.assertEqual(test_fn(l), result.test_fn(l))
-      l = []
-      self.assertEqual(test_fn(l), result.test_fn(l))
-
-  def test_for_with_iterated_expression(self):
-
-    eval_count = [0]
-
-    def count_evals(x):
-      eval_count[0] += 1
-      return x
-
-    def test_fn(n):
-      s = 0
-      for e in count_evals(range(n)):
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      result.count_evals = count_evals
-      self.assertEqual(test_fn(5), result.test_fn(5))
-      # count_evals ran twice, once for test_fn and another for result.test_fn
-      self.assertEqual(eval_count[0], 2)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index f156a87a95..f9db07778a 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -201,8 +201,7 @@ class ApiTest(test.TestCase):
 
     compiled_code = api.to_code(test_fn)
 
-    # Just check for some key words and that it is parseable Python code.
-    self.assertRegexpMatches(compiled_code, 'autograph_utils\\.run_while')
+    # Just check that it is parseable Python code.
     self.assertIsNotNone(parser.parse_str(compiled_code))
 
 
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 62a49cd92d..3bacc94300 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -28,7 +28,6 @@ from tensorflow.contrib.autograph.converters import call_trees
 from tensorflow.contrib.autograph.converters import continue_statements
 from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import decorators
-from tensorflow.contrib.autograph.converters import for_loops
 from tensorflow.contrib.autograph.converters import ifexp
 from tensorflow.contrib.autograph.converters import lists
 from tensorflow.contrib.autograph.converters import logical_expressions
@@ -324,8 +323,6 @@ def node_to_graph(node, ctx, nocompile_decorators):
 
   node = _static_analysis_pass(node, ctx)
   node = lists.transform(node, ctx)
-  node = for_loops.transform(node, ctx)
-  # for_loops may insert new global references.
   node = builtin_functions.transform(node, ctx)
 
   node = _static_analysis_pass(node, ctx)
diff --git a/tensorflow/contrib/autograph/utils/__init__.py b/tensorflow/contrib/autograph/utils/__init__.py
index 22898b17e9..817d4126d1 100644
--- a/tensorflow/contrib/autograph/utils/__init__.py
+++ b/tensorflow/contrib/autograph/utils/__init__.py
@@ -19,8 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.autograph.utils.builtins import dynamic_builtin
-from tensorflow.contrib.autograph.utils.builtins import dynamic_dataset
-from tensorflow.contrib.autograph.utils.builtins import dynamic_for_cond
 from tensorflow.contrib.autograph.utils.builtins import dynamic_print
 from tensorflow.contrib.autograph.utils.builtins import dynamic_range
 from tensorflow.contrib.autograph.utils.context_managers import control_dependency_on_returns
@@ -28,7 +26,6 @@ from tensorflow.contrib.autograph.utils.misc import alias_tensors
 from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is
 from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is_not
 from tensorflow.contrib.autograph.utils.multiple_dispatch import run_cond
-from tensorflow.contrib.autograph.utils.multiple_dispatch import run_while
 from tensorflow.contrib.autograph.utils.py_func import wrap_py_func
 from tensorflow.contrib.autograph.utils.tensor_list import dynamic_list_append
 from tensorflow.contrib.autograph.utils.testing import fake_tf
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index c6af0e4d13..7fbb7c09d8 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -24,10 +24,8 @@ import six
 
 from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.contrib.autograph.utils import type_check
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_inspect
@@ -106,69 +104,3 @@ def dynamic_print(*values):
 
   return py_func.wrap_py_func(
       flushed_print, None, values, use_dummy_return=True)
-
-
-def dynamic_dataset(iterated):
-  """Implementartion of smart tf.data.Dataset epoch wrapping.
-
-  The function checks if the input is a tf.data.Dataset and if so then wraps it
-  so that for each element it returns it also returns the current epoch the
-  dataset iteration is in, for two epochs.  If the input is not a
-  tf.data.Dataset then it just returns the input.
-
-  Args:
-    iterated: The iterable or tf.data.Dataset that is being iterated over.
-  Returns:
-    Either just the untouched input, or in the case of input being a
-    tf.data.Dataset then it returns a wrapped  tf.data.Dataset where for each
-    element it returns it also returns the current epoch the dataset iteration
-    is in.
-  """
-  if not isinstance(iterated, dataset_ops.Dataset):
-    return iterated
-
-  def epoch_dataset_number_helper(i):
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(i).repeat(), iterated))
-
-  epoch_numbers = dataset_ops.Dataset.range(2)
-  return epoch_numbers.flat_map(epoch_dataset_number_helper)
-
-
-def dynamic_for_cond(iteration, iterated):
-  """Implementartion of smart while-loop condition using dynamic dispatch.
-
-  The function checks if it is iterating over a tf.data.Dataset or not, and in
-  the case it is not then it simply returns if we are still in range of the
-  iterated and the next element.  If it is iterating over a dataset then it only
-  iterates for a single epoch.
-
-  Args:
-    iteration: The current iteration of the loop.
-    iterated: The iterable or tf.data.Dataset that is being iterated over.
-  Returns:
-    A tuple of a bool that indicates whether the loop should continue, and the
-    next element in iterated.
-  """
-  # TODO(znado): Clean up.
-  # TODO(znado): This won't work for unpacked iterates. Fix.
-  if isinstance(iterated, dataset_ops.Dataset):
-    curr_epoch, next_elem = iterated.make_one_shot_iterator().get_next()
-    return math_ops.less(curr_epoch, 1), next_elem
-  elif tensor_util.is_tensor(iterated):
-    if iterated.shape.ndims > 1:
-      elem_shape = array_ops.shape(iterated)[1:]
-    else:
-      elem_shape = ()
-    if iterated.shape.ndims == 0 or iterated.shape[0] == 0:
-      return False, array_ops.zeros(elem_shape, iterated.dtype)
-    return control_flow_ops.cond(
-        math_ops.less(iteration, dynamic_len(iterated)),
-        lambda: (True, iterated[iteration]),
-        lambda: (False, array_ops.zeros(elem_shape, iterated.dtype)))
-  elif hasattr(iterated, '__len__'):
-    if iteration < len(iterated):
-      return True, iterated[iteration]
-    return False, None
-  else:
-    raise NotImplementedError('Python iterators not yet supported.')
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch.py b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
index 47049255f3..70eef5676f 100644
--- a/tensorflow/contrib/autograph/utils/multiple_dispatch.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
 from tensorflow.contrib.autograph.utils.type_check import is_tensor
 from tensorflow.python.ops import control_flow_ops
 
@@ -66,42 +64,3 @@ def py_cond(condition, true_fn, false_fn):
   if len(results) == 1:
     return results[0]
   return results
-
-
-def run_while(cond_fn, body_fn, init_args):
-  """Type-dependent functional while loop.
-
-  Args:
-    cond_fn: A Python callable implementing the stop conditions of the loop.
-    body_fn: A Python callable implementing the body of the loop.
-    init_args: The initial values of the arguments that will be passed to both
-      cond_fn and body_fn.
-
-  Returns:
-    result: A list of values with the same shape and type as init_args. If any
-    of the init_args, or any variables closed-over in cond_fn are Tensors,
-    tf.while_loop will be used, otherwise a Python while loop will be ran.
-
-  Raises:
-    ValueError: if init_args is not a tuple or list with one or more elements.
-  """
-  if not isinstance(init_args, (tuple, list)) or not init_args:
-    raise ValueError(
-        'init_args must be a non-empty list or tuple, found %s' % init_args)
-
-  # TODO(alexbw): statically determine all active variables in cond_fn,
-  # and pass them directly
-  closure_vars = tuple(
-      [c.cell_contents for c in six.get_function_closure(cond_fn) or []])
-  possibly_tensors = tuple(init_args) + closure_vars
-  if is_tensor(*possibly_tensors):
-    return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
-  else:
-    return py_while_loop(cond_fn, body_fn, init_args)
-
-
-def py_while_loop(cond_fn, body_fn, init_args):
-  state = init_args
-  while cond_fn(*state):
-    state = body_fn(*state)
-  return state
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
index e6a41bb416..f72f8e94a0 100644
--- a/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
@@ -70,29 +70,6 @@ class MultipleDispatchTest(test.TestCase):
       out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
       self.assertEqual(sess.run(out), 3)
 
-  def test_run_while_python(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 1.0, 0.5])
-    self.assertEqual(x, 0.75)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 4.0, 0.5])
-    self.assertEqual(x, 3.0)
-
-  def test_run_while_tf(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    with Session() as sess:
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 1.0, 0.5])
-      self.assertEqual(sess.run(x), 0.75)
-
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 4.0, 0.5])
-      self.assertEqual(sess.run(x), 3.0)
-
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 1b4f2c51b668dbc1952cabdaf61773b7cff2a0c3 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Thu, 5 Apr 2018 20:43:47 -0700
Subject: [PATCH 0351/1262] Upgrade libpng

PiperOrigin-RevId: 191840652
---
 tensorflow/workspace.bzl |  8 ++++----
 third_party/png.BUILD    | 12 ++++++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index ace0d411b9..c72aa3e649 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -221,11 +221,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "png_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
-          "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
+          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
+          "https://github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
       ],
-      sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
-      strip_prefix = "libpng-1.2.53",
+      sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
+      strip_prefix = "libpng-1.6.34",
       build_file = clean_dep("//third_party:png.BUILD"),
   )
 
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index 6a7ad719aa..76ab32d69c 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -9,15 +9,20 @@ cc_library(
     name = "png",
     srcs = [
         "png.c",
+        "pngdebug.h",
         "pngerror.c",
         "pngget.c",
+        "pnginfo.h",
+        "pnglibconf.h",
         "pngmem.c",
         "pngpread.c",
+        "pngpriv.h",
         "pngread.c",
         "pngrio.c",
         "pngrtran.c",
         "pngrutil.c",
         "pngset.c",
+        "pngstruct.h",
         "pngtrans.c",
         "pngwio.c",
         "pngwrite.c",
@@ -33,3 +38,10 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = ["@zlib_archive//:zlib"],
 )
+
+genrule(
+    name = "snappy_stubs_public_h",
+    srcs = ["scripts/pnglibconf.h.prebuilt"],
+    outs = ["pnglibconf.h"],
+    cmd = "sed -e 's/PNG_ZLIB_VERNUM 0/PNG_ZLIB_VERNUM 0x12b0/' $< >$@",
+)
-- 
GitLab


From c2d6faafc48b251faa24a342dc063d9fa624421e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 5 Apr 2018 22:37:49 -0700
Subject: [PATCH 0352/1262] Fix StringPiece use-after-free in
 MasterSession::ReffedClientGraph.

Use the owned ClientGraph as the source for the node_to_name_ map, rather than the borrowed GraphExecutionState (which can be deleted while the ReffedClientGraph is in use).

PiperOrigin-RevId: 191847023
---
 .../distributed_runtime/master_session.cc     | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 01da54fcb3..64adf35c5e 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -66,8 +66,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     std::unique_ptr<ClientGraph> cg,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
-                    GraphExecutionState* execution_state, bool is_partial,
-                    WorkerCacheInterface* worker_cache, bool should_deregister)
+                    bool is_partial, WorkerCacheInterface* worker_cache,
+                    bool should_deregister)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
@@ -80,8 +80,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
-    // Initialize a name to node map for testing that fetches are reachable.
-    for (Node* n : execution_state->full_graph()->nodes()) {
+    // Initialize a name to node map for processing device stats.
+    for (Node* n : client_graph_->graph.nodes()) {
       name_to_node_.insert({n->name(), n});
     }
   }
@@ -829,8 +829,6 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
 // TODO(suharsh,mrry): Build a map from fetch target to set of feeds it depends
 // on once at setup time to prevent us from computing the dependencies
 // everytime.
-// TODO(suharshs,mrry): Consider removing the need for execution_state to reduce
-// contention.
 Status MasterSession::ReffedClientGraph::CheckFetches(
     const RunStepRequestWrapper& req, const RunState* run_state,
     GraphExecutionState* execution_state) {
@@ -840,8 +838,8 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
     // Skip if already fed.
     if (input.second) continue;
     TensorId id(ParseTensorName(input.first));
-    const auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Feed ", input.first, ": not found");
     }
     pending_feeds.insert(id);
@@ -856,11 +854,11 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
   for (size_t i = 0; i < req.num_fetches(); ++i) {
     const string& fetch = req.fetch_name(i);
     const TensorId id(ParseTensorName(fetch));
-    auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Fetch ", fetch, ": not found");
     }
-    stack.push_back(it->second);
+    stack.push_back(n);
   }
 
   // Any tensor needed for fetches can't be in pending_feeds.
@@ -1293,8 +1291,8 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       WorkerCacheInterface* worker_cache = get_worker_cache();
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
-          stats_publisher_factory_, execution_state_.get(), is_partial,
-          worker_cache, !should_delete_worker_sessions_);
+          stats_publisher_factory_, is_partial, worker_cache,
+          !should_delete_worker_sessions_);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
-- 
GitLab


From 4eefd3a5e4a7f5432be7fd3981071dc6b727349f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 01:46:37 -0700
Subject: [PATCH 0353/1262] Add a test to check graceful handling of
 out-of-memory conditions.

PiperOrigin-RevId: 191860462
---
 tensorflow/compiler/tests/BUILD       | 20 +++++++++
 tensorflow/compiler/tests/oom_test.py | 61 +++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 tensorflow/compiler/tests/oom_test.py

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index edabdc218a..e345c1266a 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -191,6 +191,26 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "oom_test",
+    size = "medium",
+    srcs = ["oom_test.py"],
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:array_ops_gen",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradient_checker",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "conv2d_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
new file mode 100644
index 0000000000..1434e965e3
--- /dev/null
+++ b/tensorflow/compiler/tests/oom_test.py
@@ -0,0 +1,61 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for out-of-memory conditions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class OutOfMemoryTest(xla_test.XLATestCase):
+
+  def testOutputOutOfMemory(self):
+    """Allocates tensors until out of memory.
+
+    Generates a large rank-1 tensor. The tensor is an output of an XLA
+    computation, not constant.
+
+    Check that a ResourceExhaustedError is raised and can be caught.
+
+    We spin in a loop generating larger and larger tensors until an OOM event
+    happens. We may be running sandboxed, so have a small host memory limit, so
+    any hardcoded value is unlikely to land in the sweet spot between device
+    memory size and host memory size with stability.
+    """
+
+    def test_loop():
+      size = 2e8
+      while True:
+        with self.test_session():
+          # Force the compiled code to not be constant by feeding in an addend.
+          p = array_ops.placeholder(dtypes.float32, shape=[])
+          with self.test_scope():
+            # Create a large R1 tensor.
+            c = array_ops.zeros([size, 1]) + p
+
+            c.eval(feed_dict={p: 1.0})
+            size *= 2
+
+    self.assertRaises(errors.ResourceExhaustedError, test_loop)
+
+
+if __name__ == "__main__":
+  googletest.main()
-- 
GitLab


From 58df8c97a7dc2ed2159e8137312fa29c0d7bcf67 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 03:23:54 -0700
Subject: [PATCH 0354/1262] internal change

PiperOrigin-RevId: 191869400
---
 tensorflow/compiler/jit/BUILD                 |  4 +--
 .../compiler/jit/kernels/xla_launch_op.cc     |  5 +++
 tensorflow/compiler/jit/xla_device_context.cc | 34 +++++++++++++++----
 tensorflow/compiler/jit/xla_device_context.h  |  7 ++++
 tensorflow/compiler/jit/xla_launch_util.cc    | 34 ++++++++++++++-----
 .../compiler/xla/executable_run_options.cc    |  7 ++++
 .../compiler/xla/executable_run_options.h     |  4 +++
 7 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 24aa203c00..a492fc6b9b 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -204,14 +204,14 @@ cc_library(
         ":common",
         ":xla_compilation_cache",
         ":xla_tensor",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 2d6511a45b..f48941fce3 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -155,6 +155,9 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
   options.device_allocator = xla_allocator;
+  // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
+  // is restricted to Variables, but we need something like this to apply to
+  // normal Tensors too.
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -179,8 +182,10 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   run_options.set_stream(stream);
   run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
+
   auto run_result = executable->Run(launch_context.arguments(), run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 6a57831cde..43eb164012 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
 
@@ -53,8 +54,33 @@ XlaTransferManager::XlaTransferManager(se::Stream* stream,
                                        bool transfer_as_literal)
     : stream_(stream),
       client_(client),
+      transfer_manager_(client->backend().transfer_manager()),
       transfer_as_literal_(transfer_as_literal) {}
 
+Status XlaTransferManager::TransferLiteralToDevice(
+    const Tensor& host_tensor, Tensor* device_tensor) const {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(HostTensorToLiteral(host_tensor, &literal));
+  VLOG(1) << "Transfer to device as literal: " << literal.ToString();
+
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+  return transfer_manager_->TransferLiteralToDevice(stream_->parent(), literal,
+                                                    shaped_buffer);
+}
+
+Status XlaTransferManager::TransferLiteralFromDevice(
+    Tensor* host_tensor, const Tensor& device_tensor) const {
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
+                      transfer_manager_->TransferLiteralFromDevice(
+                          stream_->parent(), shaped_buffer));
+  VLOG(1) << "Transfer from device as literal: " << literal->ToString();
+  return LiteralToHostTensor(*literal, host_tensor->dtype(), host_tensor);
+}
+
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
                                                Tensor* device_tensor,
@@ -86,9 +112,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
         XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     Status status;
     if (transfer_as_literal_) {
-      status = xla::Unimplemented(
-          "XlaTransferManager::CopyCPUTensorToDevice not implemented for "
-          "literals");
+      status = TransferLiteralToDevice(*cpu_tensor, device_tensor);
     } else {
       stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
@@ -129,9 +153,7 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
 
     Status status;
     if (transfer_as_literal_) {
-      status = xla::Unimplemented(
-          "XlaTransferManager::CopyDeviceTensorToCPU not implemented for "
-          "literals");
+      status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
     } else {
       stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index a8ad511fbd..ad914a1c23 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -57,11 +57,18 @@ class XlaTransferManager {
   perftools::gputools::Stream* stream() const { return stream_; }
 
  private:
+  Status TransferLiteralToDevice(const Tensor& host_tensor,
+                                 Tensor* device_tensor) const;
+  Status TransferLiteralFromDevice(Tensor* host_tensor,
+                                   const Tensor& device_tensor) const;
+
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
   perftools::gputools::Stream* stream_;
   // For the underlying memory allocator and XLA's TransferManager.
   xla::LocalClient* client_;
+  // Transfer manager, for marshalling data to and from the device.
+  xla::TransferManager* transfer_manager_;
   // True if we must use XLA's TransferManager for correct device transfers.
   bool transfer_as_literal_;
 };
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 354be1e1b5..50b0061d69 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -16,12 +16,14 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
@@ -165,6 +167,8 @@ void XlaComputationLaunchContext::PopulateOutputs(
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
+    VLOG(2) << "Result tuple shape (on device): "
+            << output->on_device_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
@@ -179,6 +183,10 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const size_t total_bytes = const_tensor.TotalBytes();
       if (stream && total_bytes > 0) {
         // Copy host -> device. (Empty tensors don't have backing buffers.)
+        // Manually allocate memory using an XlaTensorBuffer so we can allocate
+        // as much memory as the device requires (as given by
+        // GetByteSizeRequirement). This avoids XlaTransferManager having to
+        // reallocate the device buffer later.
         VLOG(1) << "Constant output tensor on device";
 
         OP_REQUIRES_OK(
@@ -189,15 +197,23 @@ void XlaComputationLaunchContext::PopulateOutputs(
                                   client_, stream->parent()->device_ordinal()));
         }
 
-        const void* src_ptr = DMAHelper::base(&const_tensor);
-        gpu::DeviceMemoryBase dst_ptr =
-            XlaTensor::DeviceMemoryFromTensor(*output_tensor);
-        // Memcpying asynchronously is safe for the GPU, but the CPU uses a
-        // shared allocator so hold a reference to the copied-to buffer until
-        // complete.
-        TensorReference ref(*output_tensor);
-        stream->ThenMemcpy(&dst_ptr, src_ptr, total_bytes);
-        stream->ThenDoHostCallback([ref] { ref.Unref(); });
+        Device* device = dynamic_cast<Device*>(ctx->device());
+        OP_REQUIRES(ctx, device != nullptr,
+                    errors::Internal("DeviceBase was not a Device."));
+        ctx->op_device_context()->CopyCPUTensorToDevice(
+            &const_tensor, device, output_tensor,
+            [&](Status status) { TF_CHECK_OK(status); });
+
+        if (device->device_type() == DEVICE_GPU) {
+          // The GPUDeviceContext enqueues the host->device transfer in a
+          // separate stream from the main compute stream. We must ensure the
+          // compute stream is synchronized with the host->device transfer
+          // stream now otherwise we will create a race condition.
+          auto* gpu_device_context =
+              static_cast<GPUDeviceContext*>(ctx->op_device_context());
+          gpu_device_context->stream()->ThenWaitFor(
+              gpu_device_context->host_to_device_stream());
+        }
       } else {
         // No copy required.
         ctx->set_output(i, const_tensor);
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 392ad9010a..1700c97718 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -87,4 +87,11 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
+ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) {
+  rng_seed_ = rng_seed;
+  return *this;
+}
+
+int ExecutableRunOptions::rng_seed() const { return rng_seed_; }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index d4fcbf0493..2c1d9ffff1 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -84,6 +84,9 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
+  ExecutableRunOptions& set_rng_seed(int rng_seed);
+  int rng_seed() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -92,6 +95,7 @@ class ExecutableRunOptions {
   tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
+  int rng_seed_ = 0;
 };
 
 }  // namespace xla
-- 
GitLab


From 55b7dc3125d47466ecfa38c7c20e3aa3ffef6345 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Fri, 6 Apr 2018 06:30:45 -0700
Subject: [PATCH 0355/1262] Fix typos in XlaCompilationCache

PiperOrigin-RevId: 191881135
---
 tensorflow/compiler/jit/xla_compilation_cache.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 5c0c79b880..be1043d8c3 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -52,13 +52,14 @@ class XlaCompilationCache : public ResourceBase {
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `constant_args` is a maps of tensorflow argument number to constant value.
+  // `constant_args` is a map of tensorflow argument number to its constant
+  //  value.
   // `variable_args` is a snapshot of the current values of the
   // resource variable arguments to `function`; uninitialized variables are
   // represented by an absent OptionalTensor.
   // The result of compilation is written to `*compilation_result`, which must
   // be non-null. If `executable` is non-null, also builds an
-  // xla::LocalExecutable and sets `executable to point to it. The resulting
+  // xla::LocalExecutable and sets `executable` to point to it. The resulting
   // executable pointer may be null if the computation has no non-constant
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
@@ -96,6 +97,7 @@ class XlaCompilationCache : public ResourceBase {
                      xla::LocalExecutable** executable,
                      const XlaCompiler::CompileOptions* compile_options,
                      bool compile_single_op);
+
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
   // XLA computation already, and generates an XLA LocalExecutable `executable`.
   Status BuildExecutable(const XlaCompiler::Options& options,
-- 
GitLab


From d5bda3a264d0d3d5e7122a956bfdf0b19b8fd880 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 07:13:49 -0700
Subject: [PATCH 0356/1262] Define PRNG seeding style for new code in
 Distributions and TF Probability, with rationales.

Implement lightweight PRNG for seed generation in that style.

Enables incremental refactoring of existing code into this style.

PiperOrigin-RevId: 191884573
---
 tensorflow/contrib/distributions/BUILD        |  10 +
 tensorflow/contrib/distributions/__init__.py  |   2 +
 .../python/kernel_tests/seed_stream_test.py   |  70 ++++++
 .../distributions/python/ops/seed_stream.py   | 228 ++++++++++++++++++
 4 files changed, 310 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/seed_stream.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 9799901483..fec6eafd4a 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -490,6 +490,16 @@ cuda_py_test(
     tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
+cuda_py_test(
+    name = "seed_stream_test",
+    size = "small",
+    srcs = ["python/kernel_tests/seed_stream_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 cuda_py_test(
     name = "statistical_testing_test",
     size = "medium",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 4d4489468d..ddf59891e6 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -59,6 +59,7 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
+from tensorflow.contrib.distributions.python.ops.seed_stream import *
 from tensorflow.contrib.distributions.python.ops.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.test_util import *
 from tensorflow.contrib.distributions.python.ops.vector_diffeomixture import *
@@ -126,6 +127,7 @@ _allowed_symbols = [
     'NormalWithSoftplusScale',
     'Poisson',
     'PoissonLogNormalQuadratureCompound',
+    'SeedStream',
     'SinhArcsinh',
     'StudentT',
     'StudentTWithAbsDfSoftplusScale',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
new file mode 100644
index 0000000000..9680573317
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the SeedStream class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import seed_stream
+from tensorflow.python.platform import test
+
+
+class SeedStreamTest(test.TestCase):
+
+  def assertAllUnique(self, items):
+    self.assertEqual(len(items), len(set(items)))
+
+  def testNonRepetition(self):
+    # The probability of repetitions in a short stream from a correct
+    # PRNG is negligible; this test catches bugs that prevent state
+    # updates.
+    strm = seed_stream.SeedStream(seed=4, salt="salt")
+    output = [strm() for _ in range(50)]
+    self.assertEqual(sorted(output), sorted(list(set(output))))
+
+  def testReproducibility(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm3 = seed_stream.SeedStream(seed=4, salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertEqual(outputs, [strm2() for _ in range(50)])
+    self.assertEqual(outputs, [strm3() for _ in range(50)])
+
+  def testSeededDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=5, salt="salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testSaltedDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="another salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testNestingRobustness(self):
+    # SeedStreams started from generated seeds should not collide with
+    # the master or with each other, even if the salts are the same.
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(strm1(), salt="salt")
+    strm3 = seed_stream.SeedStream(strm1(), salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertAllUnique(
+        outputs + [strm2() for _ in range(50)] + [strm3() for _ in range(50)])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/seed_stream.py b/tensorflow/contrib/distributions/python/ops/seed_stream.py
new file mode 100644
index 0000000000..056d349688
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/seed_stream.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Local PRNG for amplifying seed entropy into seeds for base operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import hashlib
+
+
+class SeedStream(object):
+  """Local PRNG for amplifying seed entropy into seeds for base operations.
+
+  Writing sampling code which correctly sets the pseudo-random number
+  generator (PRNG) seed is surprisingly difficult.  This class serves as
+  a helper for the TensorFlow Probability coding pattern designed to
+  avoid common mistakes.
+
+  # Motivating Example
+
+  A common first-cut implementation of a sampler for the beta
+  distribution is to compute the ratio of a gamma with itself plus
+  another gamma.  This code snippet tries to do that, but contains a
+  surprisingly common error:
+
+  ```python
+  def broken_beta(shape, alpha, beta, seed):
+    x = tf.random_gamma(shape, alpha, seed=seed)
+    y = tf.random_gamma(shape, beta, seed=seed)
+    return x / (x + y)
+  ```
+
+  The mistake is that the two gamma draws are seeded with the same
+  seed.  This causes them to always produce the same results, which,
+  in turn, leads this code snippet to always return `0.5`.  Because it
+  can happen across abstraction boundaries, this kind of error is
+  surprisingly easy to make when handling immutable seeds.
+
+  # Goals
+
+  TensorFlow Probability adopts a code style designed to eliminate the
+  above class of error, without exacerbating others.  The goals of
+  this code style are:
+
+  - Support reproducibility of results (by encouraging seeding of all
+    pseudo-random operations).
+
+  - Avoid shared-write global state (by not relying on a global PRNG).
+
+  - Prevent accidental seed reuse by TF Probability implementers.  This
+    goal is served with the local pseudo-random seed generator provided
+    in this module.
+
+  - Mitigate potential accidental seed reuse by TF Probability clients
+    (with a salting scheme).
+
+  - Prevent accidental resonances with downstream PRNGs (by hashing the
+    output).
+
+  ## Non-goals
+
+  - Implementing a high-performance PRNG for generating large amounts of
+    entropy.  That's the job of the underlying TensorFlow PRNG we are
+    seeding.
+
+  - Avoiding random seed collisions, aka "birthday attacks".
+
+  # Code pattern
+
+  ```python
+  def random_beta(shape, alpha, beta, seed):        # (a)
+    seed = SeedStream(seed, salt="random_beta")     # (b)
+    x = tf.random_gamma(shape, alpha, seed=seed())  # (c)
+    y = tf.random_gamma(shape, beta, seed=seed())   # (c)
+    return x / (x + y)
+  ```
+
+  The elements of this pattern are:
+
+  - Accept an explicit seed (line a) as an argument in all public
+    functions, and write the function to be deterministic (up to any
+    numerical issues) for fixed seed.
+
+    - Rationale: This provides the client with the ability to reproduce
+      results.  Accepting an immutable seed rather than a mutable PRNG
+      object reduces code coupling, permitting different sections to be
+      reproducible independently.
+
+  - Use that seed only to initialize a local `SeedStream` instance (line b).
+
+    - Rationale: Avoids accidental seed reuse.
+
+  - Supply the name of the function being implemented as a salt to the
+    `SeedStream` instance (line b).  This serves to keep the salts
+    unique; unique salts ensure that clients of TF Probability will see
+    different functions always produce independent results even if
+    called with the same seeds.
+
+  - Seed each callee operation with the output of a unique call to the
+    `SeedStream` instance (lines c).  This ensures reproducibility of
+    results while preventing seed reuse across callee invocations.
+
+  # Why salt?
+
+  Salting the `SeedStream` instances (with unique salts) is defensive
+  programming against a client accidentally committing a mistake
+  similar to our motivating example.  Consider the following situation
+  that might arise without salting:
+
+  ```python
+  def tfp_foo(seed):
+    seed = SeedStream(seed, salt="")
+    foo_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def tfp_bar(seed):
+    seed = SeedStream(seed, salt="")
+    bar_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def client_baz(seed):
+    foo = tfp_foo(seed=seed)
+    bar = tfp_bar(seed=seed)
+    ...
+  ```
+
+  The client should have used different seeds as inputs to `foo` and
+  `bar`.  However, because they didn't, *and because `foo` and `bar`
+  both sample a Gaussian internally as their first action*, the
+  internal `foo_stuff` and `bar_stuff` will be the same, and the
+  returned `foo` and `bar` will not be independent, leading to subtly
+  incorrect answers from the client's simulation.  This kind of bug is
+  particularly insidious for the client, because it depends on a
+  Distributions implementation detail, namely the order in which `foo`
+  and `bar` invoke the samplers they depend on.  In particular, a
+  Bayesflow team member can introduce such a bug in previously
+  (accidentally) correct client code by performing an internal
+  refactoring that causes this operation order alignment.
+
+  A salting discipline eliminates this problem by making sure that the
+  seeds seen by `foo`'s callees will differ from those seen by `bar`'s
+  callees, even if `foo` and `bar` are invoked with the same input
+  seed.
+  """
+
+  def __init__(self, seed, salt):
+    """Initializes a `SeedStream`.
+
+    Args:
+      seed: Any Python object convertible to string, supplying the
+        initial entropy.  If `None`, operations seeded with seeds
+        drawn from this `SeedStream` will follow TensorFlow semantics
+        for not being seeded.
+      salt: Any Python object convertible to string, supplying
+        auxiliary entropy.  Must be unique across the Distributions
+        and TensorFlow Probability code base.  See class docstring for
+        rationale.
+    """
+    self._seed = seed
+    self._salt = salt
+    self._counter = 0
+
+  def __call__(self):
+    """Returns a fresh integer usable as a seed in downstream operations.
+
+    If this `SeedStream` was initialized with `seed=None`, returns
+    `None`.  This has the effect that downstream operations (both
+    `SeedStream`s and primitive TensorFlow ops) will behave as though
+    they were unseeded.
+
+    The returned integer is non-negative, and uniformly distributed in
+    the half-open interval `[0, 2**512)`.  This is consistent with
+    TensorFlow, as TensorFlow operations internally use the residue of
+    the given seed modulo `2**31 - 1` (see
+    `tensorflow/python/framework/random_seed.py`).
+
+    Returns:
+      seed: A fresh integer usable as a seed in downstream operations,
+        or `None`.
+    """
+    self._counter += 1
+    if self._seed is None:
+      return None
+    composite = str((self._seed, self._counter, self._salt)).encode("utf-8")
+    return int(hashlib.sha512(composite).hexdigest(), 16)
+
+  @property
+  def original_seed(self):
+    return self._seed
+
+  @property
+  def salt(self):
+    return self._salt
+
+# Design rationales for the SeedStream class
+#
+# - Salts are accepted for the reason given above to supply them.
+#
+# - A `None` seed propagates to downstream seeds, so they exhibit
+#   their "unseeded" behavior.
+#
+# - The return value is a Python int so it can be passed directly to
+#   TensorFlow operations as a seed.  It is large to avoid losing seed
+#   space needlessly (TF will internally read only the last 31 bits).
+#
+# - The output is hashed with a crypto-grade hash function as a form
+#   of defensive programming: this reliably prevents all possible
+#   accidental resonances with all possible downstream PRNGs.  The
+#   specific function used is not important; SHA512 was ready to hand.
+#
+# - The internal state update is a simple counter because (a) given
+#   that the output is hashed anyway, this is enough, and (b) letting
+#   it be this predictable permits a future "generate many seeds in
+#   parallel" operation whose results would agree with running
+#   sequentially.
-- 
GitLab


From 4e6c808efce4d4eae138cd8fbbc65a663064bfa7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 07:50:09 -0700
Subject: [PATCH 0357/1262] Avoid marking clusters containing only Identity ops
 for compilation.

This would produce clusters where XLA cannot optimize anything.

PiperOrigin-RevId: 191887414
---
 .../compiler/jit/mark_for_compilation_pass.cc | 19 +++++++----
 .../jit/mark_for_compilation_pass_test.cc     | 34 +++++++++++++++++++
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index f651768a67..f32c0f4ba8 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -728,11 +728,15 @@ Status MarkForCompilationPass::RunImpl(
     }
   }
 
-  // Count the number of elements in each cluster.
-  std::vector<int> cluster_sizes(graph->num_node_ids());
+  // Count the number of non-trivial elements in each cluster.
+  std::vector<int> effective_cluster_sizes(graph->num_node_ids());
   for (const Node* n : compilation_candidates) {
     int cluster = clusters[n->id()].Get().representative;
-    cluster_sizes[cluster]++;
+    // Identity nodes will be removed if the node gets marked for compilation.
+    // Therefore we don't want to count them towards the effective cluster size.
+    if (n->def().op() != "Identity") {
+      effective_cluster_sizes[cluster]++;
+    }
   }
 
   // Names for each cluster.
@@ -765,9 +769,12 @@ Status MarkForCompilationPass::RunImpl(
     const XlaOpRegistry::DeviceRegistration* registration;
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
-    // Or compile if this is a cluster of >= min_cluster_size compilable
-    // operators.
-    if (cluster_sizes[cluster] >= min_cluster_size || marked_for_compilation ||
+    // Compile if this is a cluster of >= min_cluster_size compilable operators.
+    // Also, always compile if the operator is placed on a device that requires
+    // compilation, or if it contains at least one op that is marked for
+    // compilation that is not an Identity op.
+    if (effective_cluster_sizes[cluster] >= min_cluster_size ||
+        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) ||
         registration->requires_compilation) {
       string& name = cluster_names[cluster];
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 2e362e0a63..80edaf28b8 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 
 #include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -575,5 +577,37 @@ TEST(XlaCompilationTest, Retval) {
   EXPECT_EQ(clusters["A"], clusters["B"]);
 }
 
+TEST(XlaCompilationTest, DontCountIdentityOps) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    auto c = ops::Identity(root.WithOpName("C"), b);
+    auto r = ops::_Retval(root.WithOpName("R"), c, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    b.node()->AddAttr(kXlaCompileAttr, true);
+    auto r = ops::_Retval(root.WithOpName("R"), b, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From 7eeb54aa745ac45c15e886385ec33372d5966b23 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 08:11:24 -0700
Subject: [PATCH 0358/1262] Add description to the LPIRC 2018 competition
 benchmarker.

PiperOrigin-RevId: 191889484
---
 tensorflow/contrib/lite/java/BUILD            | 39 +++++++++
 tensorflow/contrib/lite/java/ovic/README.md   | 83 +++++++++++++++++++
 .../tensorflow/ovic/OvicClassifierTest.java   | 35 ++++----
 3 files changed, 140 insertions(+), 17 deletions(-)
 create mode 100644 tensorflow/contrib/lite/java/ovic/README.md

diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 7f7a2632dd..b14230acd7 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -46,6 +46,23 @@ android_library(
     ],
 )
 
+java_library(
+    name = "ovicbenchmarkerlib",
+    srcs = [
+        "ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+    ],
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlite_java",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
 java_library(
     name = "tensorflowlitelib",
     srcs = glob(
@@ -147,6 +164,28 @@ java_test(
     ],
 )
 
+java_test(
+    name = "OvicClassifierTest",
+    size = "medium",
+    srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+    data = [
+        "ovic/src/testdata/float_model.lite",
+        "ovic/src/testdata/labels.txt",
+        "ovic/src/testdata/low_res_model.lite",
+        "ovic/src/testdata/quantized_model.lite",
+        "ovic/src/testdata/test_image_128.jpg",
+        "ovic/src/testdata/test_image_224.jpg",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.ovic.OvicClassifierTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":ovicbenchmarkerlib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 filegroup(
     name = "libtensorflowlite_jni",
     srcs = select({
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
new file mode 100644
index 0000000000..76c33838bf
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -0,0 +1,83 @@
+# Benchmarker for LPIRC Workshop at CVPR 2018
+
+This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
+
+## Pre-requesits
+
+Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
+
+## To test the benchmarker:
+
+The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
+
+Note: for now the tests only provides correctness checks, i.e. classifier predicts the correct category on the test image, but no on-device latency measurements. To test the latency measurement functionality, the tests will print the latency running on a desktop computer, which is not indicative of the on-device run-time.
+We are releasing an benchmarker Apk that would allow developers to measure latency on their own devices.
+
+### Obtain the sample models
+
+The test data (models and images) should be downloaded automatically for you by Bazel. In case they are not, you can manually install them as below.
+
+Note: all commands should be called from your tensorflow installation folder (under this folder you should find `tensorflow/contrib/lite`).
+
+
+* Download the [testdata package](https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip):
+
+```sh
+curl -L https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip -o /tmp/ovic.zip
+```
+
+* Unzip the package into the testdata folder:
+
+```sh
+unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+### Run tests
+
+You can run test with Bazel as below. This helps to ensure that the installation is correct.
+
+```sh
+bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java:OvicClassifierTest --test_output=all
+```
+
+### Test your submissions
+
+Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it as below.
+
+* Move your submission to the testdata folder:
+
+Let say the submission file is located at `/tmp/my_model.lite`, then
+
+```sh
+cp /tmp/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+* Resize the test image to the resolutions that are expected by your submission:
+
+The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
+
+* Add your model and test image to the BUILD rule:
+
+```JSON
+java_test(
+  name = "OvicClassifierTest",
+  size = "medium",
+  srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+  data = [
+      "ovic/src/testdata/float_model.lite",
+      "ovic/src/testdata/labels.txt",
+      "ovic/src/testdata/low_res_model.lite",
+      "ovic/src/testdata/quantized_model.lite",
+      "ovic/src/testdata/test_image_128.jpg",
+      "ovic/src/testdata/test_image_224.jpg",
+      "ovic/src/testdata/my_model.lite",        # <--- Your submission.
+      "ovic/src/testdata/my_test_image.jpg",    # <--- Your test image.
+  ],
+      ...
+```
+
+* Modify `OvicClassifierTest.java` to test your model.
+
+Change `TEST_IMAGE_PATH` to `testdata/my_test_image.jpg`. If your model runs inference in floating point, change `FLOAT_MODEL_PATH` to `testdata/my_model.lite`. If your model runs [quantized inference](https://www.tensorflow.org/performance/quantization), change `QUANTIZED_MODEL_PATH` to `testdata/my_model.lite`.
+
+Now you can run the bazel tests to catch any runtime issues with the submission.
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
index 4fd23a99d2..098ed8ceba 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -26,7 +26,6 @@ import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
-import java.nio.file.Paths;
 import javax.imageio.ImageIO;
 import org.junit.Before;
 import org.junit.Test;
@@ -45,27 +44,33 @@ public final class OvicClassifierTest {
   private ByteBuffer testImage = null;
   private ByteBuffer lowResTestImage = null;
   private OvicSingleImageResult testResult = null;
-  private static final String LABELS_PATH = "testdata/labels.txt";
-  private static final String QUANTIZED_MODEL_PATH = "testdata/quantized_model.lite";
-  private static final String LOW_RES_MODEL_PATH = "testdata/low_res_model.lite";
-  private static final String FLOAT_MODEL_PATH = "testdata/float_model.lite";
-  private static final String TEST_IMAGE_PATH = "testdata/test_image_224.jpg";
-  private static final String TEST_LOW_RES_IMAGE_PATH = "testdata/test_image_128.jpg";
+  private static final String LABELS_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+  private static final String QUANTIZED_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/quantized_model.lite";
+  private static final String LOW_RES_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/low_res_model.lite";
+  private static final String FLOAT_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/float_model.lite";
+  private static final String TEST_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_224.jpg";
+  private static final String TEST_LOW_RES_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_128.jpg";
   private static final int TEST_IMAGE_GROUNDTRUTH = 653; // "military uniform"
 
   @Before
   public void setUp() {
     try {
-      File labelsfile = new File(getTestDir(LABELS_PATH));
+      File labelsfile = new File(LABELS_PATH);
       labelsInputStream = new FileInputStream(labelsfile);
-      quantizedModel = loadModelFile(getTestDir(QUANTIZED_MODEL_PATH));
-      floatModel = loadModelFile(getTestDir(FLOAT_MODEL_PATH));
-      lowResModel = loadModelFile(getTestDir(LOW_RES_MODEL_PATH));
-      File imageFile = new File(getTestDir(TEST_IMAGE_PATH));
+      quantizedModel = loadModelFile(QUANTIZED_MODEL_PATH);
+      floatModel = loadModelFile(FLOAT_MODEL_PATH);
+      lowResModel = loadModelFile(LOW_RES_MODEL_PATH);
+      File imageFile = new File(TEST_IMAGE_PATH);
       BufferedImage img = ImageIO.read(imageFile);
       testImage = toByteBuffer(img);
       // Low res image and models.
-      imageFile = new File(getTestDir(TEST_LOW_RES_IMAGE_PATH));
+      imageFile = new File(TEST_LOW_RES_IMAGE_PATH);
       img = ImageIO.read(imageFile);
       lowResTestImage = toByteBuffer(img);
     } catch (IOException e) {
@@ -74,10 +79,6 @@ public final class OvicClassifierTest {
     System.out.println("Successful setup");
   }
 
-  private static String getTestDir(String testfile) throws IOException {
-    return Paths.get("third_party/tensorflow/contrib/lite/java/ovic/src/", testfile).toString();
-  }
-
   @Test
   public void ovicClassifier_quantizedModelCreateSuccess() throws Exception {
     classifier = new OvicClassifier(labelsInputStream, quantizedModel);
-- 
GitLab


From afc21e7149a0d146bd8db3145fe825b1f316c0a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 08:48:16 -0700
Subject: [PATCH 0359/1262] The training model need not be built when the kfac
 optimizer is initialized so the self._variables will be empty list.  So pass
 a function which returns list of trainable variables to estimator.

PiperOrigin-RevId: 191893084
---
 tensorflow/contrib/kfac/python/ops/estimator.py | 11 +++++++----
 tensorflow/contrib/kfac/python/ops/optimizer.py | 10 +++-------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index ced1110676..d11c9c8288 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -85,9 +85,9 @@ class FisherEstimator(object):
     """Create a FisherEstimator object.
 
     Args:
-      variables: A list of the variables for which to estimate the Fisher. This
-          must match the variables registered in layer_collection (if it is not
-          None).
+      variables: A `list` of variables or `callable` which returns the variables
+          for which to estimate the Fisher. This must match the variables
+          registered in layer_collection (if it is not None).
       cov_ema_decay: The decay factor used when calculating the covariance
           estimate moving averages.
       damping: float. The damping factor used to stabilize training due to
@@ -147,7 +147,10 @@ class FisherEstimator(object):
 
   @property
   def variables(self):
-    return self._variables
+    if callable(self._variables):
+      return self._variables()
+    else:
+      return self._variables
 
   @property
   def damping(self):
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 843aeef7d8..f01c5a8322 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -108,13 +108,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       ValueError: If momentum is non-zero and momentum_type is not 'regular'
           or 'adam'.
     """
-
-    variables = var_list
-    if variables is None:
-      variables = tf_variables.trainable_variables()
-
     # Parameters to be passed to the Fisher estimator:
-    self._variables = variables
+    self._variables = var_list or tf_variables.trainable_variables
     self._cov_ema_decay = cov_ema_decay
     self._layers = layer_collection
     self._estimation_mode = estimation_mode
@@ -235,7 +230,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
 
   @property
   def variables(self):
-    return self._variables
+    return self._fisher_est.variables
 
   @property
   def damping(self):
@@ -373,6 +368,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     else:
       kwargs["var_list"] = kwargs.get("var_list") or self.variables
       var_list = kwargs["var_list"]
+
     if set(var_list) != set(self.variables):
       raise ValueError("var_list doesn't match with set of Fisher-estimating "
                        "variables.")
-- 
GitLab


From 3306d2127513facc617a14da10e9669392f7d217 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 08:58:14 -0700
Subject: [PATCH 0360/1262] Fix up the support for the case where a given array
 name occurs multiple times in the inputs/outputs list of an op. The
 (non-essential) computation of the optimal workspace size had not been
 updated for that case, causing it to fail on a simple test case. Moreover,
 the initial implementation had some redundant usage of std::find that this CL
 moves to a shared helper function.

PiperOrigin-RevId: 191894081
---
 .../lite/toco/allocate_transient_arrays.cc    | 36 +++++++++----------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 621fbcb98d..1f3ea2e1c7 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -200,6 +200,12 @@ void DeallocateTransientArray(const Model& model, const string& array_name,
   allocator->Deallocate(*array->alloc);
 }
 
+void PushBackIfNotFound(const string& s, std::vector<string>* v) {
+  if (std::find(v->begin(), v->end(), s) == v->end()) {
+    v->push_back(s);
+  }
+}
+
 }  // namespace
 
 void AllocateTransientArrays(Model* model,
@@ -251,18 +257,12 @@ void AllocateTransientArrays(Model* model,
     std::vector<string> arrays_to_allocate;
     for (const auto& input : op->inputs) {
       if (StartsAt(array_lifespans[input], op_index)) {
-        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
-                      input) == arrays_to_allocate.end()) {
-          arrays_to_allocate.push_back(input);
-        }
+        PushBackIfNotFound(input, &arrays_to_allocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (StartsAt(array_lifespans[output], op_index)) {
-        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
-                      output) == arrays_to_allocate.end()) {
-          arrays_to_allocate.push_back(output);
-        }
+        PushBackIfNotFound(output, &arrays_to_allocate);
       }
     }
     for (const string& array : arrays_to_allocate) {
@@ -274,18 +274,12 @@ void AllocateTransientArrays(Model* model,
     std::vector<string> arrays_to_deallocate;
     for (const auto& input : op->inputs) {
       if (EndsAt(array_lifespans[input], op_index)) {
-        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
-                      input) == arrays_to_deallocate.end()) {
-          arrays_to_deallocate.push_back(input);
-        }
+        PushBackIfNotFound(input, &arrays_to_deallocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (EndsAt(array_lifespans[output], op_index)) {
-        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
-                      output) == arrays_to_deallocate.end()) {
-          arrays_to_deallocate.push_back(output);
-        }
+        PushBackIfNotFound(output, &arrays_to_deallocate);
       }
     }
     for (const string& array : arrays_to_deallocate) {
@@ -310,17 +304,21 @@ void AllocateTransientArrays(Model* model,
     // for each operator, compute the sum of the sizes of the array that must
     // be live during the execution of this operator, plus the size of
     // persistent arrays that must be live at all times.
-    std::size_t size = persistent_alloc_size;
+    std::vector<string> non_persistent_edges;
     for (const auto& input : op->inputs) {
       if (!array_lifespans[input].persistent) {
-        size += TransientArraySize(*model, input, transient_data_alignment);
+        PushBackIfNotFound(input, &non_persistent_edges);
       }
     }
     for (const auto& output : op->outputs) {
       if (!array_lifespans[output].persistent) {
-        size += TransientArraySize(*model, output, transient_data_alignment);
+        PushBackIfNotFound(output, &non_persistent_edges);
       }
     }
+    std::size_t size = persistent_alloc_size;
+    for (const string& edge : non_persistent_edges) {
+      size += TransientArraySize(*model, edge, transient_data_alignment);
+    }
     // The optimal total size is the maximum of all operator-specific sizes.
     optimal_transient_alloc_size = std::max(optimal_transient_alloc_size, size);
   }
-- 
GitLab


From 9169f0db74635bb83d631338221ec2786da8dc99 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 09:11:32 -0700
Subject: [PATCH 0361/1262] Support override of device filters for gRPC, by
 overriding the requests with default session config.

PiperOrigin-RevId: 191895856
---
 .../rpc/grpc_master_service.cc                | 26 +++++++++++--------
 .../rpc/grpc_master_service.h                 |  7 ++---
 .../rpc/grpc_server_lib.cc                    |  3 +--
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index b4d18d8607..63745e8ebd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -47,11 +47,11 @@ namespace tensorflow {
 
 class GrpcMasterService : public AsyncServiceInterface {
  public:
-  GrpcMasterService(Master* master, int64 default_timeout_in_ms,
+  GrpcMasterService(Master* master, const ConfigProto& default_session_config,
                     ::grpc::ServerBuilder* builder)
       : master_impl_(master),
-        default_timeout_in_ms_(default_timeout_in_ms),
-        is_shutdown_(false) {
+        is_shutdown_(false),
+        default_session_config_(default_session_config) {
     builder->RegisterService(&master_service_);
     cq_ = builder->AddCompletionQueue();
   }
@@ -129,12 +129,12 @@ class GrpcMasterService : public AsyncServiceInterface {
 
  private:
   Master* master_impl_ = nullptr;  // Not owned.
-  const int64 default_timeout_in_ms_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   grpc::MasterService::AsyncService master_service_;
 
   mutex mu_;
   bool is_shutdown_ GUARDED_BY(mu_);
+  const ConfigProto default_session_config_;
   ::grpc::Alarm* shutdown_alarm_ = nullptr;
 
   template <class RequestMessage, class ResponseMessage>
@@ -144,9 +144,13 @@ class GrpcMasterService : public AsyncServiceInterface {
   // RPC handler for creating a session.
   void CreateSessionHandler(
       MasterCall<CreateSessionRequest, CreateSessionResponse>* call) {
-    master_impl_->CreateSession(&call->request, &call->response,
-                                [call](const Status& status) {
+    CreateSessionRequest* rewritten_req = new CreateSessionRequest;
+    rewritten_req->mutable_config()->MergeFrom(default_session_config_);
+    rewritten_req->MergeFrom(call->request);
+    master_impl_->CreateSession(rewritten_req, &call->response,
+                                [call, rewritten_req](const Status& status) {
                                   call->SendResponse(ToGrpcStatus(status));
+                                  delete rewritten_req;
                                 });
     ENQUEUE_REQUEST(CreateSession, true);
   }
@@ -178,7 +182,7 @@ class GrpcMasterService : public AsyncServiceInterface {
     if (call->request.options().timeout_in_ms() > 0) {
       call_opts->SetTimeout(call->request.options().timeout_in_ms());
     } else {
-      call_opts->SetTimeout(default_timeout_in_ms_);
+      call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
     }
     RunStepRequestWrapper* wrapped_request =
         new ProtoRunStepRequest(&call->request);
@@ -249,10 +253,10 @@ class GrpcMasterService : public AsyncServiceInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
 };
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder) {
-  return new GrpcMasterService(master, default_timeout_in_ms, builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder) {
+  return new GrpcMasterService(master, default_session_config, builder);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 473604f257..f0fe5b0c4e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/master.pb.h"
 
 namespace grpc {
 class ServerBuilder;
@@ -28,9 +29,9 @@ namespace tensorflow {
 class AsyncServiceInterface;
 class Master;
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index a6f4be3eaf..be19103582 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -183,8 +183,7 @@ Status GrpcServer::Init(
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   master_impl_ = CreateMaster(&master_env_);
-  master_service_ = NewGrpcMasterService(
-      master_impl_.get(), config.operation_timeout_in_ms(), &builder);
+  master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
   worker_impl_ =
       worker_func ? worker_func(&worker_env_) : NewGrpcWorker(&worker_env_);
   worker_service_ =
-- 
GitLab


From c5a16fa1c91a0d1cf3d5b432d70b4e8fe47b88cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 09:21:00 -0700
Subject: [PATCH 0362/1262] Tweaked docstrings in LayerCollection.

PiperOrigin-RevId: 191897098
---
 .../kfac/python/ops/layer_collection.py       | 123 +++++++++---------
 1 file changed, 61 insertions(+), 62 deletions(-)

diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 19608aca47..411da033c3 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -84,7 +84,7 @@ _EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES = {
     APPROX_KRONECKER_INDEP_NAME: fb.EmbeddingKFACMultiIndepFB
 }
 
-# Possible value for 'reuse' keyword argument. Sets 'reuse' to
+# Possible value for `reuse` keyword argument. Sets `reuse` to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
@@ -294,8 +294,8 @@ class LayerCollection(object):
       layer_key: A variable or tuple of variables. The key to check for in
           existing registrations and to register if valid.
       fisher_block: The associated `FisherBlock`.
-      reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
-        or 'VARIABLE_SCOPE'.
+      reuse: Method to use for inserting new `FisherBlock's. One of True, False,
+        or `VARIABLE_SCOPE`.
 
     Raises:
       ValueError: If `layer_key` was already registered and reuse is `False`,
@@ -359,15 +359,14 @@ class LayerCollection(object):
         is None.
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
+      reuse: (OPTIONAL) bool or str.  If True, adds `loss` as an additional
+        tower for the existing loss function.
 
     Raises:
       ValueError: If reuse == True and name == None.
       ValueError: If reuse == True and seed != None.
-      KeyError: If reuse == True and no existing LossFunction with 'name' found.
-      KeyError: If reuse == False and existing LossFunction with 'name' found.
+      KeyError: If reuse == True and no existing LossFunction with `name` found.
+      KeyError: If reuse == False and existing LossFunction with `name` found.
     """
 
     name = name or self._graph.unique_name(base_name)
@@ -491,24 +490,24 @@ class LayerCollection(object):
     """
     params = frozenset(utils.ensure_sequence(params))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.fisher_blocks.keys()'.
+    # Check if any of the variables in `params` is already in
+    # 'self.fisher_blocks.keys()`.
     for registered_params, fisher_block in self.fisher_blocks.items():
       registered_params_set = set(utils.ensure_sequence(registered_params))
       for variable in params:
         if (variable in registered_params_set and
             params != registered_params_set):
           raise ValueError(
-              "Can't link parameters {}, variable {} was already registered in "
+              "Can`t link parameters {}, variable {} was already registered in "
               "group {} with layer {}".format(params, variable,
                                               registered_params, fisher_block))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.linked_parameters'.
+    # Check if any of the variables in `params` is already in
+    # 'self.linked_parameters`.
     for variable in params:
       for other_linked_params in self.linked_parameters:
         if variable in other_linked_params:
-          raise ValueError("Can't link parameters {}, variable {} was already "
+          raise ValueError("Can`t link parameters {}, variable {} was already "
                            "linked in group {}.".format(params, variable,
                                                         other_linked_params))
     self._linked_parameters[params] = approximation
@@ -576,15 +575,15 @@ class LayerCollection(object):
         produced by layer.
       approx: str or None. If not None must be "kron".  The Fisher
         approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -618,15 +617,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
@@ -669,15 +668,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
@@ -686,7 +685,7 @@ class LayerCollection(object):
         _CONV2D_APPROX_TO_BLOCK_TYPES)
 
     # It feels bad to pass in configuration that has to do with the internal
-    # implementation.  And then we can't use the same constructor for both
+    # implementation.  And then we can`t use the same constructor for both
     # anymore and are thus forced to use this ugly if-statement.
     # TODO(b/74793309): Clean this up?
     if approx == APPROX_KRONECKER_NAME:
@@ -749,15 +748,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     # TODO(b/74793309): Have this use _get_block_type like the other
@@ -804,15 +803,15 @@ class LayerCollection(object):
       data_format: str or None. Format of data.
       approx: str or None. If not None must "diagonal".  The Fisher
         approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     # TODO(b/74793309): Have this use _get_block_type like the other
@@ -872,15 +871,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     self.register_depthwise_conv2d(
@@ -917,14 +916,14 @@ class LayerCollection(object):
       approx: str or None. It not None, must be one of "full" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str. If True, this adds 'batch_size' to the total
+      reuse: bool or str. If True, this adds `batch_size` to the total
         mini-batch size use when estimating the Fisher block for this layer
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -954,10 +953,10 @@ class LayerCollection(object):
         correspond to a "time-step" in an RNN). OR, can be single Tensor, of
         shape [num_uses * batch_size , input_size], which is a reshaped version
         of a Tensor of shape [num_uses, batch_size, input_size].
-      outputs: A list of Tensors, the same length as 'inputs', each of shape
+      outputs: A list of Tensors, the same length as `inputs`, each of shape
         [batch_size, output_size]. Outputs produced by layer. The list indexes
         each use in the graph (which might correspond to a "time-step" in an
-        RNN). Needs to correspond with the order used in 'inputs'.  OR, can be
+        RNN). Needs to correspond with the order used in `inputs`.  OR, can be
         a single Tensor of shape [num_uses * batch_size, output_size], which is
         a reshaped version of a Tensor of shape [num_uses, batch_size,
         output_size].
@@ -967,16 +966,16 @@ class LayerCollection(object):
       approx: str or None. If not None, must be of "kron_indep", "kron_series_1"
         or "kron_series_2". The Fisher approximation to use. If None the default
         value is used. (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
+      ValueError: For improper value to `approx`.
     """
     block_type, approx = self._get_block_type(
         params, approx, self.default_fully_connected_multi_approximation,
@@ -1025,7 +1024,7 @@ class LayerCollection(object):
       outputs: A list of Tensors, each of shape [batch_size, height, width,
         out_channels]. Output produced by layer. The list indexes each use
         in the graph (which might correspond to a "time-step" in an RNN).
-        Needs to correspond with the order used in 'inputs'.  OR, can be a
+        Needs to correspond with the order used in `inputs`.  OR, can be a
         single Tensor, of shape [num_uses * batch_size, height, width,
         out_channels], which is a reshaped version of a Tensor of shape
         [num_uses, batch_size, height, width, out_channels].
@@ -1037,17 +1036,17 @@ class LayerCollection(object):
       approx: str or None. If not None must by "kron_indep". The Fisher
         approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -1098,7 +1097,7 @@ class LayerCollection(object):
       outputs: A list of Tensors, each of shape [batch_size, embedding_size].
         Outputs produced by layer. The list indexes each use in the graph
         (which might correspond to a "time-step" in an RNN). Needs to
-        correspond with the order used in 'inputs'. OR, can be a
+        correspond with the order used in `inputs`. OR, can be a
         single Tensor, of shape [num_uses * batch_size, embedding_size], which
         is a reshaped version of a Tensor of shape [num_uses, batch_size,
         embedding_size].
@@ -1108,17 +1107,17 @@ class LayerCollection(object):
       approx: str or None. If not None must by "kron_indep". The Fisher
         approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -1156,7 +1155,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'logits' as an additional
+      reuse: bool or str.  If True, this adds `logits` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1190,7 +1189,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'mean' and 'var' as an additional
+      reuse: bool or str.  If True, this adds `mean` and `var` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1219,7 +1218,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'logits' as an additional
+      reuse: bool or str.  If True, this adds `logits` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1231,18 +1230,18 @@ class LayerCollection(object):
                                 name=name, reuse=reuse)
 
   def make_or_get_factor(self, cls, args):
-    """Insert 'cls(args)' into 'self.fisher_factors' if not already present.
+    """Insert `cls(args)` into 'self.fisher_factors` if not already present.
 
-    Wraps constructor in 'tf.variable_scope()' to ensure variables constructed
-    in 'cls.__init__' are placed under this LayerCollection's scope.
+    Wraps constructor in `tf.variable_scope()` to ensure variables constructed
+    in `cls.__init__` are placed under this LayerCollection's scope.
 
     Args:
       cls: Class that implements FisherFactor.
-      args: Tuple of arguments to pass into 'cls's constructor. Must be
+      args: Tuple of arguments to pass into `cls's constructor. Must be
         hashable.
 
     Returns:
-      Instance of 'cls' found in self.fisher_factors.
+      Instance of `cls` found in self.fisher_factors.
     """
     try:
       hash(args)
-- 
GitLab


From 218647db25d1e754ad85fd1fa8a0960b82ae83bf Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Fri, 6 Apr 2018 09:26:08 -0700
Subject: [PATCH 0363/1262] [TPUClusterResolver] Start a TFServer when running
 in GKE

This change allows advanced input pipelines (e.g. StreamingFilesDataset, or split-pipelines that use py_func's) to run in GKE- and GKE-like enviornments.

PiperOrigin-RevId: 191897639
---
 .../python/training/tpu_cluster_resolver.py   | 75 ++++++++++++-------
 .../training/tpu_cluster_resolver_test.py     |  8 +-
 2 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 300b19733e..a520a06bd7 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -61,11 +61,13 @@ class TPUClusterResolver(ClusterResolver):
       return False
     return True
 
-  def _inGke(self):
+  @staticmethod
+  def _inGke():
     """When running in GKE, the environment variable will be set."""
     return _GKE_ENV_VARIABLE in os.environ
 
-  def _gkeMaster(self):
+  @staticmethod
+  def _gkeMaster():
     return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
 
   def __init__(self,
@@ -119,8 +121,9 @@ class TPUClusterResolver(ClusterResolver):
             'Using multiple TPUs in a single session is not yet implemented')
       tpu = tpu[0]
 
+    in_gke = self._inGke()
     # When using GKE with Cloud TPUs, the env variable will be set.
-    if tpu is None and self._inGke():
+    if tpu is None and in_gke:
       tpu = self._gkeMaster()
 
     self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
@@ -158,7 +161,8 @@ class TPUClusterResolver(ClusterResolver):
       self._service = service
 
     self._coordinator_name = coordinator_name
-    if coordinator_name and not coordinator_address and should_resolve:
+    if coordinator_name and not coordinator_address and (should_resolve or
+                                                         in_gke):
       self._start_local_server()
     else:
       self._coordinator_address = coordinator_address
@@ -204,31 +208,50 @@ class TPUClusterResolver(ClusterResolver):
     Raises:
       RuntimeError: If the provided TPU is not healthy.
     """
-    if not self._shouldResolve():
-      return server_lib.ClusterSpec({})
-
-    full_name = 'projects/%s/locations/%s/nodes/%s' % (
-        self._project, self._zone, compat.as_text(self._tpu))
-    request = self._service.projects().locations().nodes().get(name=full_name)
-    response = request.execute()
-
-    if 'health' in response and response['health'] != 'HEALTHY':
-      raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
-                                                          response['health']))
-
-    if 'networkEndpoints' in response:
-      worker_list = [
-          '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
-          for endpoint in response['networkEndpoints']
-      ]
+    ############################################################################
+    # There are 5 potential cases this code must handle:
+    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
+    #      a. Create a ClusterSpec that includes the coordinator job
+    #      b. Create a ClusterSpec without the coordinator job.
+    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
+    #     tasks and
+    #      a. Create a ClusterSpec with the coordinator
+    #      b. Create a ClusterSpec without the coordinator
+    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
+    ############################################################################
+
+    if self._shouldResolve():
+      # Case 1.
+      full_name = 'projects/%s/locations/%s/nodes/%s' % (
+          self._project, self._zone, compat.as_text(self._tpu))
+      request = self._service.projects().locations().nodes().get(name=full_name)
+      response = request.execute()
+
+      if 'health' in response and response['health'] != 'HEALTHY':
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
+                                                            response['health']))
+
+      if 'networkEndpoints' in response:
+        worker_list = [
+            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
+            for endpoint in response['networkEndpoints']
+        ]
+      else:
+        # Fall back to the deprecated response format
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list = [instance_url]
+
+      cluster_spec = {self._job_name: worker_list}
     else:
-      # Fall back to the deprecated response format
-      instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-      worker_list = [instance_url]
-
-    cluster_spec = {self._job_name: worker_list}
+      if not self._tpu.startswith(compat.as_bytes('grpc://')):
+        # Case 3.
+        return server_lib.ClusterSpec({})
+      # Case 2.
+      cluster_spec = {self._job_name: [self._tpu[len(
+          compat.as_bytes('grpc://')):]]}
 
     if self._coordinator_address:
+      # {1, 2}.a
       cluster_spec[self._coordinator_name] = [self._coordinator_address]
 
     return server_lib.ClusterSpec(cluster_spec)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 48c3f6bb4f..cfddca1063 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -358,14 +358,10 @@ class TPUClusterResolverTest(test.TestCase):
   def testGkeEnvironment(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
     self.assertTrue('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' in os.environ)
-    tpu_cluster_resolver = TPUClusterResolver()
-    self.assertTrue(tpu_cluster_resolver._inGke())
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver._gkeMaster()))
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.get_master()))
+        compat.as_bytes(TPUClusterResolver._gkeMaster()))
     del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
 
 
-- 
GitLab


From eef7771ddb9fc0de5e8c9aeabcaa186ff78ec105 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 6 Apr 2018 09:26:26 -0700
Subject: [PATCH 0364/1262] [tf.data] Enable using
 `tf.contrib.data.prefetch_to_device()` in eager mode.

The added functionality is a substitute for the implicit prefetching in
`tfe.Iterator`, and the two paths will converge in a future change.

Fixes #18260.

PiperOrigin-RevId: 191897666
---
 .../data/python/ops/prefetching_ops.py        | 114 +++++++++++++++---
 tensorflow/contrib/eager/python/datasets.py   |   7 ++
 .../contrib/eager/python/datasets_test.py     |  13 ++
 3 files changed, 115 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 77e23d0319..89c04dc89a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -25,10 +25,11 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -111,19 +112,7 @@ class _PrefetchToDeviceIterator(object):
           self._input_iterator.output_shapes,
           self._input_iterator.output_classes)
       ret = remote_iterator.get_next()
-
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
-      ret = nest.pack_sequence_as(ret, [
-          sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
-      ])
-
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
-      return nest.flatten(ret)
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
@@ -179,6 +168,68 @@ class _PrefetchToDeviceIterator(object):
   @property
   def output_types(self):
     return self._input_dataset.output_types
+
+
+class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               device,
+               buffer_size):
+    with ops.device("/device:CPU:0"):
+      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
+      input_iterator_handle = core_gen_dataset_ops.iterator_to_string_handle(
+          self._resource)
+
+    self._device = device
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self.output_types, self.output_shapes, self.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    _prefetch_fn.add_to_graph(None)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=gen_dataset_ops.iterator_get_device(self._resource),
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=iterator_ops._generate_shared_name(
+              "function_buffer_resource"))
+
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        ret = gen_dataset_ops.function_buffering_resource_get_next(
+            function_buffer_resource=self._buffering_resource,
+            output_types=self._flat_output_types)
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
 # pylint: enable=protected-access
 
 
@@ -190,12 +241,37 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset):
     self._device = device
     self._buffer_size = buffer_size if buffer_size is not None else 1
 
+  # The static analysis cannot tell that the eager iterator's superclass has
+  # a `next()` method.
+  # pylint: disable=non-iterator-returned
+  def __iter__(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    The returned iterator implements the Python iterator protocol and therefore
+    can only be used in eager mode.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      raise RuntimeError("dataset.__iter__() is only supported when eager "
+                         "execution is enabled.")
+  # pylint: enable=non-iterator-returned
+
   def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=True,
-        device=self._device,
-        buffer_size=self._buffer_size)
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
+                                       device=self._device,
+                                       buffer_size=self._buffer_size)
 
   def make_initializable_iterator(self, shared_name=None):
     return _PrefetchToDeviceIterator(
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 99b1e098d5..0783d1b5d7 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -71,8 +71,15 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
       dataset: A `tf.data.Dataset` object.
 
     Raises:
+      TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
+    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
+      raise TypeError(
+          "`tf.contrib.data.prefetch_to_device()` is not compatible with "
+          "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
+          "over the dataset instead.")
+
     super(Iterator, self).__init__(dataset)
     if not context.context().device_spec.device_type:
       is_remote_device = False
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index c658505de4..f76a896d3d 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -24,6 +24,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
 from tensorflow.contrib.eager.python import checkpointable_utils
@@ -192,6 +193,18 @@ class IteratorTest(test.TestCase):
       x = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], x.numpy())
 
+  def testTensorsExplicitPrefetchToDevice(self):
+    ds = Dataset.from_tensor_slices([0., 1.])
+    ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
+
+    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
+      datasets.Iterator(ds)
+
+    for i, x in enumerate(ds):
+      with ops.device(test.gpu_device_name()):
+        x = math_ops.add(x, x)
+        self.assertEqual(float(i) + float(i), x.numpy())
+
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
-- 
GitLab


From 53868bfd9705da3fc15b59ab02db39b652686b13 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 6 Apr 2018 09:45:01 -0700
Subject: [PATCH 0365/1262] Materialize tensor array sizes whenever possible

PiperOrigin-RevId: 191900015
---
 .../grappler/optimizers/constant_folding.cc   | 33 ++++++++++++++-
 .../optimizers/constant_folding_test.cc       | 42 +++++++++++++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 2f1b9e41d7..b2a1ce6ab6 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -298,7 +298,8 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
   for (int node_idx = 0; node_idx < node_count; ++node_idx) {
     NodeDef* node = graph_->mutable_node(node_idx);
     const string op = node->op();
-    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
+    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN" &&
+        op != "TensorArraySizeV3") {
       continue;
     }
 
@@ -349,6 +350,36 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
       continue;
     }
 
+    if (op == "TensorArraySizeV3") {
+      const NodeDef* array = node_map_->GetNode(node->input(0));
+      if (array->attr().count("dynamic_size") != 0 &&
+          array->attr().at("dynamic_size").b()) {
+        continue;
+      }
+      const NodeDef* array_size = node_map_->GetNode(array->input(0));
+      if (IsReallyConstant(*array_size)) {
+        // Don't materialize 0 sizes to avoid triggering incorrect static
+        // checks. A 0 sized array that can't grow isn't useful anyway.
+        const TensorProto& raw_val = array_size->attr().at("value").tensor();
+        if (raw_val.dtype() != DT_INT32) {
+          continue;
+        }
+        Tensor value(raw_val.dtype(), raw_val.tensor_shape());
+        if (!value.FromProto(raw_val)) {
+          continue;
+        }
+        if (value.flat<int32>()(0) == 0) {
+          continue;
+        }
+        node->set_op("Const");
+        *node->mutable_attr() = array_size->attr();
+        node->set_input(0, AsControlDependency(NodeName(node->input(0))));
+        node->set_input(1, AddControlDependency(NodeName(node->input(1)),
+                                                graph_, node_map_.get()));
+      }
+      continue;
+    }
+
     // Handle ShapeN materialization case.
     // It's possible that not all input tensors have known shapes.
     CHECK_EQ(op, "ShapeN");
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 71ee81dfde..08c92687e3 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2402,6 +2402,48 @@ TEST_F(ConstantFoldingTest, Enter) {
   }
 }
 
+TEST_F(ConstantFoldingTest, TensorArraySize) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output size = ops::Const(scope.WithOpName("size"), 5, TensorShape({}));
+  auto dynamic_array =
+      ops::TensorArray(scope.WithOpName("dynamic"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(true));
+  auto static_array =
+      ops::TensorArray(scope.WithOpName("static"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(false));
+  auto dynamic_sz = ops::TensorArraySize(
+      scope.WithOpName("dynamic_sz"), dynamic_array.handle, dynamic_array.flow);
+  auto static_sz = ops::TensorArraySize(scope.WithOpName("static_sz"),
+                                        static_array.handle, static_array.flow);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  auto tensors_expected =
+      EvaluateNodes(item.graph, {"dynamic_sz", "static_sz"});
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ("dynamic_sz", output.node(3).name());
+  EXPECT_EQ("TensorArraySizeV3", output.node(3).op());
+  EXPECT_EQ("static_sz", output.node(4).name());
+  EXPECT_EQ("Const", output.node(4).op());
+
+  auto tensors_actual = EvaluateNodes(output, {"dynamic_sz", "static_sz"});
+  EXPECT_EQ(2, tensors_expected.size());
+  EXPECT_EQ(2, tensors_actual.size());
+  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_actual[0]);
+  test::ExpectTensorEqual<int32>(tensors_expected[1], tensors_actual[1]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From 7535f6beb7ba95bf54e1513b0c2c51b844a7a49f Mon Sep 17 00:00:00 2001
From: Thomas Bastiani <thom@codehawks.eu>
Date: Fri, 6 Apr 2018 17:56:07 +0100
Subject: [PATCH 0366/1262] Bounds-check node ID before getting it's name
 (#18090)

When the edge is either a frame enter or exit edge then
DescribeCycle() would segfault.
---
 tensorflow/compiler/jit/mark_for_compilation_pass.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index f651768a67..3b631d6f4e 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/version.h"
 
@@ -432,6 +433,9 @@ string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src,
   }
 
   auto node_name = [&cycles, &graph](int node_id) {
+    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
+      return string("(null)");
+    }
     auto* node = graph.FindNodeId(node_id);
     if (node == nullptr) {
       return string("(null)");
-- 
GitLab


From 665e44f612c72d39717b0a5163dca82a07e1c174 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 6 Apr 2018 10:26:40 -0700
Subject: [PATCH 0367/1262] Object-based checkpointing support for
 unidirectional cuDNN LSTM cells

Once checked in, this will be the only way I know of to save canonical weights
when executing eagerly. Eager's name-based saving support will only do the
opaque parameter buffer.

I'm not going to try converting everything in one go, but it's a start at
least. And everything else should raise a NotImplementedError rather than
silently not saving correctly.

Single-layer cuDNN cells can be swapped for un-wrapped cuDNN compatible cells or
single cells wrapped in MultiRNNCells. Multi-layer cells need MultiRNNCell
wrapping.

PiperOrigin-RevId: 191905703
---
 tensorflow/contrib/cudnn_rnn/BUILD            |   1 +
 .../python/kernel_tests/cudnn_rnn_test.py     | 151 +++++++++++++++++-
 .../cudnn_rnn/python/layers/cudnn_rnn.py      |  20 +++
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |  75 ++++++++-
 4 files changed, 237 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index 8b5d13f725..d68015ae15 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -25,6 +25,7 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 9897c31a98..9cc6ca09ad 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import argparse
 import collections
+import functools
 import itertools
 import os
 import sys
@@ -28,13 +29,14 @@ import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework.test_util import TensorFlowTestCase
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -265,7 +267,7 @@ def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
     return outputs, (output_state_fw, output_state_bw)
 
 
-class CudnnRNNTestBasic(TensorFlowTestCase):
+class CudnnRNNTestBasic(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -467,7 +469,7 @@ class CudnnRNNTestBasic(TensorFlowTestCase):
 
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
+class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
 
   def _CompareWeights(self, lhs, rhs):
     self.assertEqual(len(lhs), len(rhs))
@@ -701,9 +703,146 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
+class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+
+  def _VerifyCheckpoint(
+      self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
+      num_layers, input_size, expected_variable_values, num_applications=3):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with ops.device("gpu:0"):
+      cudnn_layer = cudnn_cell_fn()
+      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      status = cudnn_checkpoint.restore(checkpoint_path)
+      inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
+                                   dtype=dtypes.float32)
+      cudnn_output, _ = cudnn_layer(inputs)
+      status.assert_consumed().run_restore_ops()
+    second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
+    restore_layer = compatible_cell_fn()
+    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+        cell=restore_layer)
+    status = restore_layer_checkpoint.restore(second_save_path)
+    current_state = restore_layer.zero_state(1, dtypes.float32)
+    for _ in range(num_applications):
+      restore_layer_output, current_state = restore_layer(
+          inputs=3. * array_ops.ones([1, input_size]),
+          state=current_state)
+    status.assert_consumed().run_restore_ops()
+    self.assertTrue(restore_layer.variables)
+    for variable, expected_value in zip(
+        restore_layer.variables, expected_variable_values):
+      self.assertAllClose(expected_value, self.evaluate(variable))
+    self.assertAllClose(self.evaluate(restore_layer_output),
+                        self.evaluate(cudnn_output)[-1, -1:, ...])
+
+  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn):
+    # Single-layer cuDNN cells with object-based checkpointing should be
+    # checkpoint compatible with either single CudnnCompatible cells or
+    # MultiRnnCells with one cell.
+    input_size = 3
+    save_cell_layer = single_cell_fn()
+    save_cell_layer(
+        inputs=array_ops.ones([1, input_size]),
+        state=save_cell_layer.zero_state(1, dtypes.float32))
+    self.assertTrue(save_cell_layer.variables)
+    expected_values = []
+    np.random.seed(10)
+    for variable in save_cell_layer.variables:
+      value = np.random.normal(size=variable.shape)
+      expected_values.append(value)
+      self.evaluate(variable.assign(value))
+    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=
+        lambda: rnn_cell_impl.MultiRNNCell([single_cell_fn()]),
+        cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=1,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testLSTMCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=1, num_units=num_units,
+            direction=direction, name="awesome_lstm"))
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testGRUCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    with self.assertRaises(NotImplementedError):
+      # TODO(allenl): Implement object-based saving for GRUs and other cells.
+      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+          single_cell_fn=functools.partial(
+              cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
+          cudnn_cell_fn=functools.partial(
+              cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
+              direction=direction, name="awesome_gru"))
+
+  def _CheckpointableMultiLayerTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn, num_layers):
+
+    def _MultiCellFn():
+      return rnn_cell_impl.MultiRNNCell(
+          [single_cell_fn() for _ in range(num_layers)])
+    input_size = 3
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      save_layer = _MultiCellFn()
+      save_layer(inputs=array_ops.ones([1, input_size]),
+                 state=save_layer.zero_state(1, dtypes.float32))
+      self.assertTrue(save_layer.variables)
+      expected_values = []
+      np.random.seed(10)
+      for variable in save_layer.variables:
+        value = np.random.normal(size=variable.shape)
+        expected_values.append(value)
+        self.evaluate(variable.assign(value))
+      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=_MultiCellFn, cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=num_layers,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testCudnnCompatibleLSTMCheckpointablMultiLayer(self):
+    num_units = 2
+    num_layers = 3
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableMultiLayerTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=num_layers, num_units=num_units,
+            direction=direction, name="awesome_lstm"),
+        num_layers=num_layers)
+
+
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
+class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -884,7 +1023,7 @@ class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
                               rtol=2e-5)
 
 
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
+class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
   def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size,
                             dtype, direction):
@@ -931,7 +1070,7 @@ class CudnnRNNTestParamsSize(TensorFlowTestCase):
                                    dtype, direction)
 
 
-class CudnnRNNTestTraining(TensorFlowTestCase):
+class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
   def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1):
     """Compute the numeric gradient of y wrt to x.
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 36fba917a8..00d9544602 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -142,6 +142,9 @@ class _CudnnRNN(base_layer.Layer):
   """
   # pylint:enable=line-too-long
 
+  # TODO(allenl): Document object-based saving and checkpoint compatibility once
+  # it's implemented for more cuDNN Layers.
+
   # The following are constants defined by subclasses.
   # Type of RNN cell.
   _rnn_mode = None
@@ -363,6 +366,11 @@ class _CudnnRNN(base_layer.Layer):
       self._create_saveable()
     self.built = True
 
+  def _gather_saveables_for_checkpoint(self):
+    raise NotImplementedError(
+        "This cell does not yet support object-based saving. File a feature "
+        "request if this limitation bothers you.")
+
   def call(self, inputs, initial_state=None, training=True):
     """Runs the forward step for the RNN model.
 
@@ -499,6 +507,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
+    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
+        checkpointable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
@@ -521,6 +531,16 @@ class CudnnLSTM(_CudnnRNN):
     return ([self.num_layers * self.num_dirs, batch_size, self.num_units],
             [self.num_layers * self.num_dirs, batch_size, self.num_units])
 
+  @property
+  def _gather_saveables_for_checkpoint(self):
+    if self._direction == CUDNN_RNN_UNIDIRECTION:
+      # Skip one inheritance level to avoid NotImplementedError.
+      return super(_CudnnRNN, self)._gather_saveables_for_checkpoint
+    else:
+      raise NotImplementedError(
+          "Object-based saving does not currently support bidirectional LSTM "
+          "cells. File a feature request if this limitation bothers you.")
+
 
 class _CudnnRNNNoInputC(_CudnnRNN):
   """Abstract simple CudnnRNN layer without input_c."""
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 622241a177..588a5e705d 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
@@ -31,6 +32,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import saver
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
@@ -262,13 +264,16 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     # instead of having the master pull all slices and then save them.
     slice_spec = ""
     params = weights + biases
-    param_names = weight_names + bias_names
+    self._weight_names = weight_names
+    self._bias_names = bias_names
+    self._param_names = weight_names + bias_names
+    prefixed_param_names = weight_names + bias_names
     if self._scope:
-      param_names = ["%s/%s" % (self._scope, pn) for pn in param_names]
-
+      prefixed_param_names = [
+          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names]
     specs = [
         saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
-        for param, param_name in zip(params, param_names)
+        for param, param_name in zip(params, prefixed_param_names)
     ]
     super(CudnnOpaqueParamsSaveable, self).__init__(
         array_ops.identity(self._variables), specs, name)
@@ -281,6 +286,45 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
+  def _checkpointable_save(self, save_buffer):
+    weights, biases = self._OpaqueParamsToCanonical()
+    with ops.device("gpu:0"):
+      (weights, _), (biases, _) = self._TransformCanonical(
+          weights, biases)
+    for name, tensor in zip(self._param_names, weights + biases):
+      save_buffer[name] = array_ops.identity(tensor)
+
+  def _checkpointable_restore(self, restore_buffer):
+    tensors = [array_ops.identity(restore_buffer[name])
+               for name in self._param_names]
+    return self.restore(
+        restored_tensors=tensors,
+        restored_shapes=None  # Unused
+    )
+
+  def _add_checkpointable_dependencies(self, checkpointable, dtype):
+    """Add canonical weight dependencies to `checkpointable`.
+
+    When saving or restoring, converts to or from the opaque buffer
+    format. Weights are saved and loaded in the configuration expected by
+    cuDNN-compatible cells.
+
+    Args:
+      checkpointable: An object inheriting from `CheckpointableBase` to add
+        dependencies too (typically the cuDNN `Layer`).
+      dtype: The dtype for the canonical parameter Tensors.
+    """
+    split_dependencies = checkpointable_utils.split_dependency(
+        component_names=self._param_names,
+        component_dtypes=(dtype,) * len(self._param_names),
+        fill_save_buffer_fn=self._checkpointable_save,
+        consume_restore_buffer_fn=self._checkpointable_restore)
+    self._checkpointable_track_params(checkpointable, split_dependencies)
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Tracks parameters in a canonical configuration."""
+    return  # NotImplementedError raised by the Layer.
+
   def _TFCanonicalNamePrefix(self, layer, is_fwd=True):
     if self._direction == CUDNN_RNN_UNIDIRECTION:
       return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
@@ -570,6 +614,29 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_biases.append(b)
     tf_bias_names.append(prefix + "/bias")
 
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
+    biases = []
+    weights = []
+    for name in self._weight_names:
+      weights.append(params[name])
+    for name in self._bias_names:
+      biases.append(params[name])
+    assert len(params) == len(weights) + len(biases)
+    if len(weights) == 1 and len(biases) == 1:
+      # For single-layer cells, allow substituting a cell with no MultiRNNCell
+      # wrapping.
+      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
+      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
+      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
+      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+    assert len(biases) == len(weights)
+    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
+      cell = checkpointable_lib.Checkpointable()
+      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell.bias = bias
+      cell.kernel = kernel
+
 
 class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   """SaveableObject implementation handling Cudnn GRU opaque params."""
-- 
GitLab


From 5f8f0dc7998db78188c083b3c6945191145497a7 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 6 Apr 2018 10:28:10 -0700
Subject: [PATCH 0368/1262] Allow TFE_NewContext to fail more reasonably when
 SWIG is checking status.

Before:
TFE_Context would check nullptr, and the function would fail straight away.

Now:
TFE_Context is nullptr, so it skips down to checking the status, and an error
is raised.

I'm not able to find in SWIG documentation how to order typemaps in the
generated code - ideally, I'd order it to check the status typemap first. This
code makes it not dependent on this ordering either way.

PiperOrigin-RevId: 191905893
---
 tensorflow/python/pywrap_tfe.i | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 7acb8eeb1a..5ee55301df 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -120,9 +120,9 @@ limitations under the License.
 
 }
 %typemap(out) (TFE_Context*) {
-  if ($1 == nullptr) {
-    SWIG_fail;
-  } else {
+  // When the TFE_Context* returned is a nullptr, we expect the status is not
+  // OK. This will raise an error (happens in another typemap).
+  if ($1 != nullptr) {
     $result = PyCapsule_New($1, nullptr, TFE_DeleteContextCapsule);
   }
 }
-- 
GitLab


From 711e2f503039bd8a277928ef8a2b3740ae2bfa4b Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Fri, 6 Apr 2018 10:28:18 -0700
Subject: [PATCH 0369/1262] Change GetInstructionCallContext to take an opcode
 instead of an HloInstruction. This enables use of the function without an
 actual instruction (eg, if you just have an HloProto).

PiperOrigin-RevId: 191905914
---
 tensorflow/compiler/xla/service/call_graph.cc         | 6 +++---
 tensorflow/compiler/xla/service/call_graph.h          | 2 +-
 tensorflow/compiler/xla/service/flatten_call_graph.cc | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 13eb02ca01..a8053d15e1 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -51,8 +51,8 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
   return out;
 }
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction) {
-  switch (instruction->opcode()) {
+CallContext GetInstructionCallContext(HloOpcode opcode) {
+  switch (opcode) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
@@ -101,7 +101,7 @@ void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
 
 void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
   CHECK_EQ(instruction->parent(), computation());
-  const CallContext context = GetInstructionCallContext(instruction);
+  const CallContext context = GetInstructionCallContext(instruction->opcode());
   if (!instruction->called_computations().empty()) {
     CHECK(context == CallContext::kSequential ||
           context == CallContext::kParallel);
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 688c4085df..97d3811508 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -53,7 +53,7 @@ enum class CallContext {
 string CallContextToString(CallContext context);
 std::ostream& operator<<(std::ostream& out, const CallContext& context);
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction);
+CallContext GetInstructionCallContext(HloOpcode opcode);
 
 // Represents an HLO instruction which calls one or more computations.
 class CallSite {
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index 2b6caa1494..85409b330b 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -93,7 +93,7 @@ Status FlattenNode(const CallGraphNode& node) {
       auto current = worklist.back();
       worklist.pop_back();
       for (auto* instruction : current->instructions()) {
-        if (GetInstructionCallContext(instruction) !=
+        if (GetInstructionCallContext(instruction->opcode()) !=
             CallContext::kSequential) {
           continue;
         }
-- 
GitLab


From 991e23c8cb4b9f3371393e322697b203b64d2326 Mon Sep 17 00:00:00 2001
From: nio1814 <niioaddy@live.com>
Date: Fri, 6 Apr 2018 10:52:04 -0700
Subject: [PATCH 0370/1262] Maxpoolwithargmax cpu (#18145)

* added cpu launch for maxpoolwithargmax

* added gradient op

* op builders

* op builders

added CPU test

code formatting

* undid unwanted changes

* one more unwanted change undid

* addressed review comments
---
 tensorflow/core/kernels/maxpooling_op.cc      | 92 +++++++++++++++----
 .../python/kernel_tests/pooling_ops_test.py   |  6 --
 2 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 9be7408012..23176b8577 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -56,7 +57,7 @@ template <typename Device, typename T>
 static void SpatialMaxPoolWithArgMaxHelper(
     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
-    const PoolParameters& params, const Padding& padding) {
+    const PoolParameters& params) {
   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
       ConstEigenMatrixMap;
   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -151,7 +152,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
       }
     }
 
-    {
+    if (input_backprop != nullptr) {
       auto input_backprop_flat = input_backprop->flat<T>();
       auto out_arg_max_flat = output_arg_max->flat<int64>();
       auto out_backprop_flat = out_backprop.flat<T>();
@@ -173,9 +174,8 @@ static void SpatialMaxPoolWithArgMaxHelper(
         // Although this check is in the inner loop, it is worth its value
         // so we don't end up with memory corruptions. Our benchmark shows that
         // the performance impact is quite small
-        CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
-            << "Invalid input backprop index: " << input_backprop_index << ", "
-            << in_start << ", " << in_end;
+        //CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
+        FastBoundsCheck(input_backprop_index - in_start, in_end - in_start);
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
@@ -293,7 +293,7 @@ class MaxPoolingGradOp : public OpKernel {
 
     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
-        out_backprop, params, padding_);
+        out_backprop, params);
   }
 
  private:
@@ -869,6 +869,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& input, Tensor* output, Tensor* argmax,
+                     bool propogate_nans) {
+    Tensor unused;
+    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
+        context, output, argmax, nullptr, input, unused, params);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingWithArgmaxOp : public OpKernel {
  public:
@@ -921,6 +932,53 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingGradWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
+  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+      EigenMatrixMap;
+
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& grad_in, const Tensor& argmax,
+                     Tensor* grad_out) {
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *(context->device()->tensorflow_cpu_worker_threads());
+
+    auto shard = [&grad_in, &argmax, &grad_out](int64 start, int64 limit) {
+      const int64 batch_size =
+          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+      const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
+      const int64 input_size_per_batch = grad_in.NumElements() / batch_size;
+
+      {
+        auto grad_out_flat = grad_out->flat<T>();
+        auto argmax_flat = argmax.flat<int64>();
+        auto grad_in_flat = grad_in.flat<T>();
+
+        const int64 output_start = start * output_size_per_batch;
+        const int64 output_end = limit * output_size_per_batch;
+        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
+                                  output_end - output_start);
+        inputShard.setConstant(T(0));
+
+        const int input_start = start * input_size_per_batch;
+        const int input_end = limit * input_size_per_batch;
+        for (int64 index = input_start; index < input_end; index++) {
+          const int64 grad_out_index = argmax_flat(index);
+          CHECK(grad_out_index >= output_start && grad_out_index < output_end)
+              << "Invalid output gradient index: " << grad_out_index << ", "
+              << output_start << ", " << output_end;
+          grad_out_flat(grad_out_index) += grad_in_flat(index);
+        }
+      }
+    };
+
+    const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+    const int64 shard_cost = grad_out->NumElements() / batch_size;
+    Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
+          shard_cost, shard);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingGradWithArgmaxOp : public OpKernel {
  public:
@@ -1309,7 +1367,17 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
                               .HostMemory("ksize")                       \
                               .HostMemory("strides")                     \
                               .TypeConstraint<T>("T"),                   \
-                          MaxPoolingGradGradOp<D##Device, T>);
+                          MaxPoolingGradGradOp<D##Device, T>)            \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<int64>("Targmax")          \
+                              .TypeConstraint<T>("T"),                   \
+                          MaxPoolingWithArgmaxOp<D##Device, T>);         \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .TypeConstraint<int64>("Targmax"),         \
+                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
 
 // Below kernels implemented only for CPU device.
 #define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
@@ -1374,16 +1442,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
                               .HostMemory("strides")                 \
                               .TypeConstraint<T>("T"),               \
                           MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<int64>("Targmax")      \
-                              .TypeConstraint<T>("T"),               \
-                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .TypeConstraint<int64>("Targmax"),     \
-                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
                               .Device(DEVICE_GPU)                    \
                               .TypeConstraint<T>("T")                \
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index ed44a1a4d1..a0c372db7d 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -817,9 +817,6 @@ class PoolingTest(test.TestCase):
           cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def testMaxPoolingWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     with self.test_session(use_gpu=True) as sess:
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
@@ -836,9 +833,6 @@ class PoolingTest(test.TestCase):
       self.assertAllEqual(argmax.ravel(), [0, 1, 3, 5])
 
   def testMaxPoolingGradWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-- 
GitLab


From 6e8008294b6ed502123feadca93a2968f76b94a8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 11:16:17 -0700
Subject: [PATCH 0371/1262] TPU Cost Estimator has been modified to also
 account for the memory cost in the execution time. Until more sophisticated
 methods are added, we resort to the roofline model to calculate such cost.

PiperOrigin-RevId: 191913626
---
 .../grappler/costs/op_level_cost_estimator.cc     | 15 ++++++++++-----
 .../core/grappler/costs/op_level_cost_estimator.h |  5 +++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 14e46ecdd9..79735e6cc2 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -459,11 +459,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   Costs costs;
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
-  if (compute_memory_overlap_) {
-    costs.execution_time = std::max(compute_cost, memory_cost);
-  } else {
-    costs.execution_time = compute_cost + memory_cost;
-  }
+  CombineCostsAndUpdateExecutionTime(&costs);
   return costs;
 }
 
@@ -1375,5 +1371,14 @@ Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
   return costs;
 }
 
+void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
+    Costs* costs) const {
+  if (compute_memory_overlap_) {
+    costs->execution_time = std::max(costs->compute_time, costs->memory_time);
+  } else {
+    costs->execution_time = costs->compute_time + costs->memory_time;
+  }
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index fcbecbb6dc..7080264698 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -173,6 +173,11 @@ class OpLevelCostEstimator {
       const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
+  // This method calculates the execution time depending on whether IO can
+  // overlap with computation. It assumes the memory and the compute times have
+  // already been calculated.
+  void CombineCostsAndUpdateExecutionTime(Costs* costs) const;
+
  protected:
   std::map<string, int> elementwise_ops_;
   typedef std::function<Costs(const OpContext& op_context)> CostImpl;
-- 
GitLab


From 516f687678290f9748d866a74080fadbf76de09b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 11:17:41 -0700
Subject: [PATCH 0372/1262] Properly handle callable objects.

PiperOrigin-RevId: 191913834
---
 tensorflow/contrib/autograph/pyct/inspect_utils.py      | 6 ++++++
 tensorflow/contrib/autograph/pyct/inspect_utils_test.py | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index d19c6ed75e..30a5961821 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -74,6 +74,12 @@ def getmethodclass(m):
     ValueError: if the class could not be resolved for any unexpected reason.
   """
 
+  # Callable objects: return their own class.
+  if (not hasattr(m, '__name__') and hasattr(m, '__class__') and
+      hasattr(m, '__call__')):
+    if isinstance(m.__class__, six.class_types):
+      return m.__class__
+
   # Instance method and class methods: should be bound to a non-null "self".
   # If self is a class, then it's a class method.
   if hasattr(m, '__self__'):
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index ddca6f963b..eda3fc13fd 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -225,6 +225,15 @@ class InspectUtilsTest(test.TestCase):
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
         LocalClass)
 
+  def test_getmethodclass_callables(self):
+    class TestCallable(object):
+
+      def __call__(self):
+        pass
+
+    c = TestCallable()
+    self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 2997ab5727acf62e98d19ff16b4302400d060c5a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 11:21:24 -0700
Subject: [PATCH 0373/1262] Minor doc clarification for reduce_sum return type

PiperOrigin-RevId: 191914398
---
 tensorflow/python/ops/math_ops.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index b460ce5b95..01d670ea2d 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1402,10 +1402,11 @@ def reduce_sum(input_tensor,
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
-    The reduced tensor.
+    The reduced tensor, of the same dtype as the input_tensor.
 
   @compatibility(numpy)
-  Equivalent to np.sum
+  Equivalent to np.sum appart the fact that numpy upcast uint8 and int32 to
+  int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
-- 
GitLab


From dc19f610c2db98cebb0274d9e2cb49c6d05d2f8f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 11:21:41 -0700
Subject: [PATCH 0374/1262] Added headers only version of
 tensorflow/core/kernels:cwise_lib, cwise_lib_hdrs. This is for clients that
 want to use the cwise_ops machinery when making their own custom ops,
 including cwise_lib directly causes multiple definition linker errors.

PiperOrigin-RevId: 191914445
---
 tensorflow/core/kernels/BUILD | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index d2a2cdd13d..a196fc54af 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -6180,3 +6180,12 @@ cc_library(
         "@gemmlowp",
     ],
 )
+
+# Header-only version of cwise_lib for clients that want to use the cwise_ops
+# functionality in their own custom ops.
+cc_header_only_library(
+    name = "cwise_lib_hdrs",
+    deps = [
+        ":cwise_lib",
+    ],
+)
-- 
GitLab


From beda9ebd36bbf6964459c7ee2209975d62cb01e6 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 6 Apr 2018 11:23:40 -0700
Subject: [PATCH 0375/1262] [TF:XLA] Create Despecializing Pass Pipeline

When comparing backends, it is useful to take an HLO optimized for one backend and perform transformations in order to match numerics.  This can be thought of as finding a lowest common denominator.

Move this grouping of passes into its own HloPassPipeline that can be reused in a few different places.

PiperOrigin-RevId: 191914799
---
 tensorflow/compiler/xla/service/BUILD         | 15 +++++++
 .../compiler/xla/service/despecializer.cc     | 35 +++++++++++++++
 .../compiler/xla/service/despecializer.h      | 45 +++++++++++++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 tensorflow/compiler/xla/service/despecializer.cc
 create mode 100644 tensorflow/compiler/xla/service/despecializer.h

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 3a99d84bea..db91e80407 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2639,6 +2639,21 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "despecializer",
+    srcs = ["despecializer.cc"],
+    hdrs = ["despecializer.h"],
+    deps = [
+        ":bfloat16_normalization",
+        ":defuser",
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:statusor",
+    ],
+)
+
 cc_library(
     name = "source_map_util",
     srcs = ["source_map_util.cc"],
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
new file mode 100644
index 0000000000..d938f3a2c4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/despecializer.h"
+
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/defuser.h"
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+namespace xla {
+
+Despecializer::Despecializer() : pipeline_("despecializer") {
+  // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<Defuser>();
+  pipeline_.AddPass<ImplicitBroadcastRemover>();
+  pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
+}
+
+StatusOr<bool> Despecializer::Run(HloModule* module) {
+  return pipeline_.Run(module);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
new file mode 100644
index 0000000000..af48f4ab6e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Creates an HloPassPipeline containing multiple HloPasses that can
+// despecialize an optimized HloModule. This is useful to run an HloModule
+// optimized for one specfic platform on a different platform (undoing platform
+// specific passes) with matching numerics for comparison.
+//
+// Current despecialization passes are Defuser, ImplicitBroadcastRemover,
+// and BFloat16MixedPrecisionRemoval.
+class Despecializer : public HloPassInterface {
+ public:
+  Despecializer();
+  tensorflow::StringPiece name() const override { return "despecializer"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  HloPassPipeline pipeline_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
-- 
GitLab


From 98b8b786036172d33c85b6b5f81347440d0594df Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Fri, 6 Apr 2018 11:24:20 -0700
Subject: [PATCH 0376/1262] Update tf.keras to keras 2.1.5 version.

PiperOrigin-RevId: 191914904
---
 .../_impl/keras/applications/mobilenet.py     | 222 +-----------------
 .../_impl/keras/applications/resnet50.py      |   5 +-
 .../keras/_impl/keras/layers/convolutional.py | 195 +++++++++++++++
 .../_impl/keras/layers/convolutional_test.py  |  38 +++
 .../keras/_impl/keras/layers/recurrent.py     | 137 ++++++++---
 .../_impl/keras/layers/recurrent_test.py      |  16 +-
 tensorflow/python/keras/layers/__init__.py    |   1 +
 ...flow.keras.layers.-depthwise-conv2-d.pbtxt | 187 +++++++++++++++
 .../tensorflow.keras.layers.-g-r-u-cell.pbtxt |   2 +-
 .../tensorflow.keras.layers.-g-r-u.pbtxt      |   6 +-
 .../api/golden/tensorflow.keras.layers.pbtxt  |   4 +
 11 files changed, 562 insertions(+), 251 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt

diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index ad96b53a45..12775fccec 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -84,11 +84,13 @@ from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import DepthwiseConv2D
 from tensorflow.python.keras._impl.keras.layers import Dropout
 from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import Reshape
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -116,195 +118,6 @@ def preprocess_input(x):
   return imagenet_utils.preprocess_input(x, mode='tf')
 
 
-class DepthwiseConv2D(Conv2D):
-  """Depthwise separable 2D convolution.
-
-  Depthwise Separable convolutions consists in performing
-  just the first step in a depthwise spatial convolution
-  (which acts on each input channel separately).
-  The `depth_multiplier` argument controls how many
-  output channels are generated per input channel in the depthwise step.
-
-  Arguments:
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          width and height of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the width and height.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `'valid'` or `'same'` (case-insensitive).
-      depth_multiplier: The number of depthwise convolution output channels
-          for each input channel.
-          The total number of depthwise convolution output
-          channels will be equal to `filters_in * depth_multiplier`.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be 'channels_last'.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. 'linear' activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix.
-      bias_initializer: Initializer for the bias vector.
-      depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its 'activation')..
-      depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-
-  Input shape:
-      4D tensor with shape:
-      `[batch, channels, rows, cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, rows, cols, channels]` if data_format='channels_last'.
-
-  Output shape:
-      4D tensor with shape:
-      `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
-  """
-
-  def __init__(self,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               depth_multiplier=1,
-               data_format=None,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               depthwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super(DepthwiseConv2D, self).__init__(
-        filters=None,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = initializers.get(depthwise_initializer)
-    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
-    self.depthwise_constraint = constraints.get(depthwise_constraint)
-    self.bias_initializer = initializers.get(bias_initializer)
-
-  @shape_type_conversion
-  def build(self, input_shape):
-    if len(input_shape) < 4:
-      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
-                       'Received input shape:', str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = 3
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs to '
-                       '`DepthwiseConv2D` '
-                       'should be defined. Found `None`.')
-    input_dim = int(input_shape[channel_axis])
-    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
-                              input_dim, self.depth_multiplier)
-
-    self.depthwise_kernel = self.add_weight(
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        name='depthwise_kernel',
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint)
-
-    if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(input_dim * self.depth_multiplier,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs, training=None):
-    outputs = K.depthwise_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        strides=self.strides,
-        padding=self.padding,
-        dilation_rate=self.dilation_rate,
-        data_format=self.data_format)
-
-    if self.bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-
-    return outputs
-
-  @shape_type_conversion
-  def compute_output_shape(self, input_shape):
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-      out_filters = input_shape[1] * self.depth_multiplier
-    elif self.data_format == 'channels_last':
-      rows = input_shape[1]
-      cols = input_shape[2]
-      out_filters = input_shape[3] * self.depth_multiplier
-
-    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
-                                         self.padding, self.strides[0])
-    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
-                                         self.padding, self.strides[1])
-
-    if self.data_format == 'channels_first':
-      return (input_shape[0], out_filters, rows, cols)
-    elif self.data_format == 'channels_last':
-      return (input_shape[0], rows, cols, out_filters)
-
-  def get_config(self):
-    config = super(DepthwiseConv2D, self).get_config()
-    config.pop('filters')
-    config.pop('kernel_initializer')
-    config.pop('kernel_regularizer')
-    config.pop('kernel_constraint')
-    config['depth_multiplier'] = self.depth_multiplier
-    config['depthwise_initializer'] = initializers.serialize(
-        self.depthwise_initializer)
-    config['depthwise_regularizer'] = regularizers.serialize(
-        self.depthwise_regularizer)
-    config['depthwise_constraint'] = constraints.serialize(
-        self.depthwise_constraint)
-    return config
-
-
 @tf_export('keras.applications.MobileNet',
            'keras.applications.mobilenet.MobileNet')
 def MobileNet(input_shape=None,
@@ -318,18 +131,11 @@ def MobileNet(input_shape=None,
               classes=1000):
   """Instantiates the MobileNet architecture.
 
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
   To load a MobileNet model via `load_model`, import the custom
-  objects `relu6` and `DepthwiseConv2D` and pass them to the
-  `custom_objects` parameter.
+  objects `relu6` and pass them to the `custom_objects` parameter.
   E.g.
   model = load_model('mobilenet.h5', custom_objects={
-                     'relu6': mobilenet.relu6,
-                     'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+                     'relu6': mobilenet.relu6})
 
   Arguments:
       input_shape: optional shape tuple, only to be specified
@@ -383,11 +189,6 @@ def MobileNet(input_shape=None,
           backend that does not support separable convolutions.
   """
 
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('Only TensorFlow backend is currently supported, '
-                       'as other backends do not support '
-                       'depthwise convolution.')
-
   if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
                      '`None` (random initialization), `imagenet` '
@@ -522,7 +323,7 @@ def MobileNet(input_shape=None,
   # load weights
   if weights == 'imagenet':
     if K.image_data_format() == 'channels_first':
-      raise ValueError('Weights for "channels_last" format '
+      raise ValueError('Weights for "channels_first" format '
                        'are not available.')
     if alpha == 1.0:
       alpha_text = '1_0'
@@ -598,14 +399,14 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   filters = int(filters * alpha)
+  x = ZeroPadding2D(padding=(1, 1), name='conv1_pad')(inputs)
   x = Conv2D(
       filters,
       kernel,
-      padding='same',
+      padding='valid',
       use_bias=False,
       strides=strides,
-      name='conv1')(
-          inputs)
+      name='conv1')(x)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -665,15 +466,14 @@ def _depthwise_conv_block(inputs,
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   pointwise_conv_filters = int(pointwise_conv_filters * alpha)
-
+  x = ZeroPadding2D(padding=(1, 1), name='conv_pad_%d' % block_id)(inputs)
   x = DepthwiseConv2D(  # pylint: disable=not-callable
       (3, 3),
-      padding='same',
+      padding='valid',
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(
-          inputs)
+      name='conv_dw_%d' % block_id)(x)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 46c0e63557..f8c6aff4f2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -45,6 +45,7 @@ from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -236,9 +237,9 @@ def ResNet50(include_top=True,
   else:
     bn_axis = 1
 
+  x = ZeroPadding2D(padding=(3, 3), name='conv1_pad')(img_input)
   x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
-          img_input)
+      64, (7, 7), strides=(2, 2), padding='valid', name='conv1')(x)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 162ae6c28f..7cdebc6aa4 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -27,6 +27,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
@@ -1024,6 +1025,200 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.DepthwiseConv2D')
+class DepthwiseConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  Depthwise Separable convolutions consists in performing
+  just the first step in a depthwise spatial convolution
+  (which acts on each input channel separately).
+  The `depth_multiplier` argument controls how many
+  output channels are generated per input channel in the depthwise step.
+
+  Arguments:
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+        width and height of the 2D convolution window.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+        specifying the strides of the convolution along the width and height.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: one of `'valid'` or `'same'` (case-insensitive).
+    depth_multiplier: The number of depthwise convolution output channels
+        for each input channel.
+        The total number of depthwise convolution output
+        channels will be equal to `filters_in * depth_multiplier`.
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, height, width, channels)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, channels, height, width)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be 'channels_last'.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. 'linear' activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    depthwise_initializer: Initializer for the depthwise kernel matrix.
+    bias_initializer: Initializer for the bias vector.
+    depthwise_regularizer: Regularizer function applied to
+        the depthwise kernel matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+        the output of the layer (its 'activation').
+    depthwise_constraint: Constraint function applied to
+        the depthwise kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+    4D tensor with shape:
+    `[batch, channels, rows, cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+  Output shape:
+    4D tensor with shape:
+    `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               depth_multiplier=1,
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(DepthwiseConv2D, self).__init__(
+        filters=None,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  def build(self, input_shape):
+    if len(input_shape) < 4:
+      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+                       'Received input shape:', str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = 3
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs to '
+                       '`DepthwiseConv2D` '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    depthwise_kernel_shape = (self.kernel_size[0],
+                              self.kernel_size[1],
+                              input_dim,
+                              self.depth_multiplier)
+
+    self.depthwise_kernel = self.add_weight(
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        name='depthwise_kernel',
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs, training=None):
+    outputs = K.depthwise_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        strides=self.strides,
+        padding=self.padding,
+        dilation_rate=self.dilation_rate,
+        data_format=self.data_format)
+
+    if self.bias:
+      outputs = K.bias_add(
+          outputs,
+          self.bias,
+          data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+
+    return outputs
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+      out_filters = input_shape[1] * self.depth_multiplier
+    elif self.data_format == 'channels_last':
+      rows = input_shape[1]
+      cols = input_shape[2]
+      out_filters = input_shape[3] * self.depth_multiplier
+
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return (input_shape[0], out_filters, rows, cols)
+    elif self.data_format == 'channels_last':
+      return (input_shape[0], rows, cols, out_filters)
+
+  def get_config(self):
+    config = super(DepthwiseConv2D, self).get_config()
+    config.pop('filters')
+    config.pop('kernel_initializer')
+    config.pop('kernel_regularizer')
+    config.pop('kernel_constraint')
+    config['depth_multiplier'] = self.depth_multiplier
+    config['depthwise_initializer'] = initializers.serialize(
+        self.depthwise_initializer)
+    config['depthwise_regularizer'] = regularizers.serialize(
+        self.depthwise_regularizer)
+    config['depthwise_constraint'] = constraints.serialize(
+        self.depthwise_constraint)
+    return config
+
+
 @tf_export('keras.layers.UpSampling1D')
 class UpSampling1D(Layer):
   """Upsampling layer for 1D inputs.
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index f4a134b96c..12b4267675 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -961,5 +961,43 @@ class CroppingTest(test.TestCase):
       keras.layers.Cropping3D(cropping=None)
 
 
+class DepthwiseConv2DTest(test.TestCase):
+
+  def _run_test(self, kwargs, arg, values):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    test_kwargs = copy.copy(kwargs)
+    for value in values:
+      test_kwargs[arg] = value
+      with self.test_session(use_gpu=True):
+        testing_utils.layer_test(
+            keras.layers.DepthwiseConv2D,
+            kwargs=test_kwargs,
+            input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_depthwise_conv2d(self):
+    kwargs = {'kernel_size': (3, 3)}
+
+    self._run_test(kwargs, 'padding', ['valid', 'same'])
+    self._run_test(kwargs, 'strides', [(2, 2)])
+    if test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs, 'data_format', ['channels_first'])
+    self._run_test(kwargs, 'depth_multiplier', [1, 2])
+
+    kwargs = {'kernel_size': 3,
+              'padding': 'valid',
+              'data_format': 'channels_first',
+              'activation': None,
+              'depthwise_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'activity_regularizer': 'l2',
+              'depthwise_constraint': 'unit_norm',
+              'strides': (2, 2),
+             }
+    self._run_test(kwargs, 'depth_multiplier', [1])
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 7f9f77c296..f53db987ff 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -251,7 +251,7 @@ class RNN(Layer):
           It is also possible for `cell` to be a list of RNN cell instances,
           in which cases the cells get stacked on after the other in the RNN,
           implementing an efficient stacked RNN.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -797,10 +797,10 @@ class RNN(Layer):
 
   @property
   def losses(self):
-    losses = []
+    layer_losses = super(RNN, self).losses
     if isinstance(self.cell, Layer):
-      losses += self.cell.losses
-    return losses + self._losses
+      return self.cell.losses + layer_losses
+    return layer_losses
 
   @property
   def updates(self):
@@ -1017,7 +1017,7 @@ class SimpleRNN(RNN):
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1237,6 +1237,9 @@ class GRUCell(Layer):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
   """
 
   def __init__(self,
@@ -1256,6 +1259,7 @@ class GRUCell(Layer):
                dropout=0.,
                recurrent_dropout=0.,
                implementation=1,
+               reset_after=False,
                **kwargs):
     super(GRUCell, self).__init__(**kwargs)
     self.units = units
@@ -1278,6 +1282,7 @@ class GRUCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
+    self.reset_after = reset_after
     self.state_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
@@ -1299,12 +1304,25 @@ class GRUCell(Layer):
         constraint=self.recurrent_constraint)
 
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.units * 3,),
-          name='bias',
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      if not self.reset_after:
+        bias_shape = (3 * self.units,)
+      else:
+        # separate biases for input and recurrent kernels
+        # Note: the shape is intentionally different from CuDNNGRU biases
+        # `(2 * 3 * self.units,)`, so that we can distinguish the classes
+        # when loading and converting saved weights.
+        bias_shape = (2, 3 * self.units)
+      self.bias = self.add_weight(shape=bias_shape,
+                                  name='bias',
+                                  initializer=self.bias_initializer,
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+      if not self.reset_after:
+        self.input_bias, self.recurrent_bias = self.bias, None
+      else:
+        self.input_bias = K.flatten(self.bias[0])
+        self.recurrent_bias = K.flatten(self.bias[1])
+
     else:
       self.bias = None
     self.built = True
@@ -1340,13 +1358,15 @@ class GRUCell(Layer):
         inputs_z = inputs
         inputs_r = inputs
         inputs_h = inputs
+
       x_z = K.dot(inputs_z, self.kernel[:, :self.units])
       x_r = K.dot(inputs_r, self.kernel[:, self.units:self.units * 2])
       x_h = K.dot(inputs_h, self.kernel[:, self.units * 2:])
+
       if self.use_bias:
-        x_z = K.bias_add(x_z, self.bias[:self.units])
-        x_r = K.bias_add(x_r, self.bias[self.units:self.units * 2])
-        x_h = K.bias_add(x_h, self.bias[self.units * 2:])
+        x_z = K.bias_add(x_z, self.input_bias[:self.units])
+        x_r = K.bias_add(x_r, self.input_bias[self.units: self.units * 2])
+        x_h = K.bias_add(x_h, self.input_bias[self.units * 2:])
 
       if 0. < self.recurrent_dropout < 1.:
         h_tm1_z = h_tm1 * rec_dp_mask[0]
@@ -1356,42 +1376,70 @@ class GRUCell(Layer):
         h_tm1_z = h_tm1
         h_tm1_r = h_tm1
         h_tm1_h = h_tm1
-      z = self.recurrent_activation(
-          x_z + K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units]))
-      r = self.recurrent_activation(
-          x_r + K.dot(h_tm1_r, self.recurrent_kernel[:, self.units:
-                                                     self.units * 2]))
-
-      hh = self.activation(x_h + K.dot(r * h_tm1_h,
-                                       self.recurrent_kernel[:,
-                                                             self.units * 2:]))
+
+      recurrent_z = K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units])
+      recurrent_r = K.dot(h_tm1_r,
+                          self.recurrent_kernel[:, self.units:self.units * 2])
+      if self.reset_after and self.use_bias:
+        recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias[:self.units])
+        recurrent_r = K.bias_add(recurrent_r,
+                                 self.recurrent_bias[self.units:
+                                                     self.units * 2])
+
+      z = self.recurrent_activation(x_z + recurrent_z)
+      r = self.recurrent_activation(x_r + recurrent_r)
+
+      # reset gate applied after/before matrix multiplication
+      if self.reset_after:
+        recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
+        if self.use_bias:
+          recurrent_h = K.bias_add(recurrent_h,
+                                   self.recurrent_bias[self.units * 2:])
+        recurrent_h = r * recurrent_h
+      else:
+        recurrent_h = K.dot(r * h_tm1_h,
+                            self.recurrent_kernel[:, self.units * 2:])
+
+      hh = self.activation(x_h + recurrent_h)
     else:
       if 0. < self.dropout < 1.:
         inputs *= dp_mask[0]
+
+      # inputs projected by all gate matrices at once
       matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
-        matrix_x = K.bias_add(matrix_x, self.bias)
+        # biases: bias_z_i, bias_r_i, bias_h_i
+        matrix_x = K.bias_add(matrix_x, self.input_bias)
+
+      x_z = matrix_x[:, :self.units]
+      x_r = matrix_x[:, self.units: 2 * self.units]
+      x_h = matrix_x[:, 2 * self.units:]
+
       if 0. < self.recurrent_dropout < 1.:
         h_tm1 *= rec_dp_mask[0]
       matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
 
-      x_z = matrix_x[:, :self.units]
-      x_r = matrix_x[:, self.units:2 * self.units]
       recurrent_z = matrix_inner[:, :self.units]
       recurrent_r = matrix_inner[:, self.units:2 * self.units]
 
       z = self.recurrent_activation(x_z + recurrent_z)
       r = self.recurrent_activation(x_r + recurrent_r)
 
-      x_h = matrix_x[:, 2 * self.units:]
-      recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
+      if self.reset_after:
+        recurrent_h = r * matrix_inner[:, 2 * self.units:]
+      else:
+        recurrent_h = K.dot(r * h_tm1,
+                            self.recurrent_kernel[:, 2 * self.units:])
+
       hh = self.activation(x_h + recurrent_h)
+    # previous and candidate state mixed by update gate
     h = z * h_tm1 + (1 - z) * hh
     if 0 < self.dropout + self.recurrent_dropout:
       if training is None and not context.executing_eagerly():
         # This would be harmless to set in eager mode, but eager tensors
         # disallow setting arbitrary attributes.
         h._uses_learning_phase = True
+
     return h, [h]
 
   def get_config(self):
@@ -1415,7 +1463,8 @@ class GRUCell(Layer):
         'bias_constraint': constraints.serialize(self.bias_constraint),
         'dropout': self.dropout,
         'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'implementation': self.implementation,
+        'reset_after': self.reset_after
     }
     base_config = super(GRUCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1423,9 +1472,16 @@ class GRUCell(Layer):
 
 @tf_export('keras.layers.GRU')
 class GRU(RNN):
-  """Gated Recurrent Unit - Cho et al.
+  """Gated Recurrent Unit - Cho et al. 2014.
 
-  2014.
+  There are two variants. The default one is based on 1406.1078v3 and
+  has reset gate applied to hidden state before matrix multiplication. The
+  other one is based on original 1406.1078v1 and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. Use `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
@@ -1469,7 +1525,7 @@ class GRU(RNN):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1485,6 +1541,9 @@ class GRU(RNN):
           Unrolling can speed-up a RNN,
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
 
   """
 
@@ -1511,6 +1570,7 @@ class GRU(RNN):
                go_backwards=False,
                stateful=False,
                unroll=False,
+               reset_after=False,
                **kwargs):
     if implementation == 0:
       logging.warning('`implementation=0` has been deprecated, '
@@ -1532,7 +1592,8 @@ class GRU(RNN):
         bias_constraint=bias_constraint,
         dropout=dropout,
         recurrent_dropout=recurrent_dropout,
-        implementation=implementation)
+        implementation=implementation,
+        reset_after=reset_after)
     super(GRU, self).__init__(
         cell,
         return_sequences=return_sequences,
@@ -1613,6 +1674,10 @@ class GRU(RNN):
   def implementation(self):
     return self.cell.implementation
 
+  @property
+  def reset_after(self):
+    return self.cell.reset_after
+
   def get_config(self):
     config = {
         'units':
@@ -1648,7 +1713,9 @@ class GRU(RNN):
         'recurrent_dropout':
             self.recurrent_dropout,
         'implementation':
-            self.implementation
+            self.implementation,
+        'reset_after':
+            self.reset_after
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1929,7 +1996,7 @@ class LSTMCell(Layer):
 
 @tf_export('keras.layers.LSTM')
 class LSTM(RNN):
-  """Long-Short Term Memory layer - Hochreiter 1997.
+  """Long Short-Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index fb743b617f..641b563a25 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -232,6 +232,7 @@ class RNNTest(test.TestCase):
       cell = RNNCellWithConstants(32)
       layer = keras.layers.RNN(cell)
       y = layer(x, constants=c)
+
       model = keras.models.Model([x, c], y)
       model.compile(optimizer='rmsprop', loss='mse')
       model.train_on_batch(
@@ -279,6 +280,20 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
+    with self.test_session():
+      # Test GRUCell reset_after property.
+      x = keras.Input((None, 5))
+      c = keras.Input((3,))
+      cells = [keras.layers.recurrent.GRUCell(32, reset_after=True)]
+      layer = keras.layers.recurrent.RNN(cells)
+      y = layer(x, constants=c)
+      model = keras.models.Model([x, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+          np.zeros((6, 32))
+      )
+
     with self.test_session():
       # Test stacked RNN serialization
       x_np = np.random.random((6, 5, 5))
@@ -541,6 +556,5 @@ class RNNTest(test.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 84ee5040dc..b45cafed31 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -49,6 +49,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
+from tensorflow.python.keras._impl.keras.layers.convolutional import DepthwiseConv2D
 
 # Image processing layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
new file mode 100644
index 0000000000..b38716aa2c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -0,0 +1,187 @@
+path: "tensorflow.keras.layers.DepthwiseConv2D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'kernel_size\', \'strides\', \'padding\', \'depth_multiplier\', \'data_format\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'1\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 1fd3febad2..4274b8d425 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -91,7 +91,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index f5f41d879d..8d9f06083c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -122,6 +122,10 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "reset_after"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
@@ -160,7 +164,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index 088c8e88e2..affc9bd09b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -116,6 +116,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DepthwiseConv2D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Dot"
     mtype: "<type \'type\'>"
-- 
GitLab


From 76719deb3c12d84c4945a444e92493956530e165 Mon Sep 17 00:00:00 2001
From: Richard Wei <rxwei@google.com>
Date: Fri, 6 Apr 2018 11:47:44 -0700
Subject: [PATCH 0377/1262] Remove `TF_InitializeTPU` and `TF_ShutdownTPU` from
 experimental C API as they are no longer needed. Also remove a duplicate
 function declaration.

PiperOrigin-RevId: 191918408
---
 tensorflow/c/c_api_experimental.cc | 51 ------------------------------
 tensorflow/c/c_api_experimental.h  | 21 ------------
 2 files changed, 72 deletions(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index bea9378571..e82a546092 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -56,57 +56,6 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
   }
 }
 
-void TF_InitializeTPU(TF_Session* session, TF_Status* status) {
-  VLOG(1) << "Initializing TPU";
-  TF_Operation* config_op =
-      TF_GraphOperationByName(session->graph, "ConfigureDistributedTPU");
-  if (config_op == nullptr) {
-    status->status = tensorflow::errors::Internal(
-        "Unable to find node ConfigureDistributedTPU in the TF graph.");
-    return;
-  }
-
-  TF_Output config_node{config_op, 0};
-
-  TF_Tensor* dummy_output;
-  TF_SessionRun(session, /*run_options*/ nullptr,
-                // input related parameters
-                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
-                // output related parameters
-                /*outputs*/ &config_node, /*output_values*/ &dummy_output,
-                /*noutputs*/ 1,
-                /*targets*/ nullptr, /*ntargets*/ 0,
-                /*run_metadata*/ nullptr, status);
-  if (status->status.ok()) {
-    TF_DeleteTensor(dummy_output);
-  }
-}
-
-void TF_ShutdownTPU(TF_Session* session, TF_Status* status) {
-  {
-    tensorflow::mutex_lock c(session->graph->mu);
-    VLOG(1) << "Shutting down TPU, with input graph: "
-            << session->graph->graph.ToGraphDefDebug().DebugString();
-  }
-
-  TF_Operation* shutdown_op =
-      TF_GraphOperationByName(session->graph, "ShutdownDistributedTPU");
-  if (shutdown_op == nullptr) {
-    status->status = tensorflow::errors::Internal(
-        "Unable to find node ShutdownDistributedTPU in the TF graph.");
-    return;
-  }
-
-  TF_SessionRun(session, /*run_options*/ nullptr,
-                // input related parameters
-                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
-                // output related parameters
-                /*outputs*/ nullptr, /*output_values*/ nullptr,
-                /*noutputs*/ 0,
-                /*targets*/ &shutdown_op, /*ntargets*/ 1,
-                /*run_metadata*/ nullptr, status);
-}
-
 const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   tensorflow::mutex_lock c(graph->mu);
   const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString();
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index ebcec8176b..666342974e 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -60,27 +60,6 @@ extern "C" {
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
-// Initializes TPU system. Must be called exactly once before TF_SessionRun() is
-// called on a TPU graph.
-//
-// The session graph must contain a node named ConfigureDistributedTPU.
-// TODO(b/74774824): Improve the API on initializing TPU system.
-TF_CAPI_EXPORT extern void TF_InitializeTPU(TF_Session* session,
-                                            TF_Status* status);
-
-// Shuts down TPU system. For any `session` where TF_InitializeTPU() has
-// been successfully called, this call must be made exactly once before the
-// session is closed.
-// The session graph must contain a node named ShutdownDistributedTPU.
-TF_CAPI_EXPORT extern void TF_ShutdownTPU(TF_Session* session,
-                                          TF_Status* status);
-
-// Returns the graph content in a human-readable format, with length set in
-// `len`. The format is subject to change in the future.
-// The returned string is heap-allocated, and caller should call free() on it.
-TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
-                                                      size_t* len);
-
 // Returns the graph content in a human-readable format, with length set in
 // `len`. The format is subject to change in the future.
 // The returned string is heap-allocated, and caller should call free() on it.
-- 
GitLab


From f15c117c4f4d51a6660bf14b6d6cf73c52692cfb Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 6 Apr 2018 11:55:19 -0700
Subject: [PATCH 0378/1262] Fix small performance regression in
 microbenchmarks.

PiperOrigin-RevId: 191919464
---
 tensorflow/python/eager/benchmarks_test.py | 23 ++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 7ad37058fd..3aad4a114a 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -217,10 +217,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_identity(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
+          30000)
 
   def benchmark_tf_gradient_forward_identity(self):
     with backprop.GradientTape() as tape:
@@ -236,10 +237,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_no_op(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(lambda x: x, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(lambda x: x, [0])(m),
+          30000)
 
   def _benchmark_np_matmul(self, m, transpose_b, num_iters):
     a = m.cpu().numpy()
@@ -271,11 +273,12 @@ class MicroBenchmarks(test.Benchmark):
     # pylint: disable=protected-access
     ctx_handle = context.context()._handle
     # pylint: enable=protected-access
+    device = context.context().device_name
     attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
              m.dtype.as_datatype_enum)
     def func():
-      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "MatMul", inputs,
-                                       attrs, 1)
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul",
+                                       inputs, attrs, 1)
 
     self._run(func, num_iters)
 
-- 
GitLab


From 4f7943f7358fc69af62dc280c6f6ba549ebe2167 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Fri, 6 Apr 2018 11:56:08 -0700
Subject: [PATCH 0379/1262] Support RNN profiling in StreamExecutor for CUDA
 GPUs.

This change hasn't applied autotune on TF Cudnn kernels, only provides lower level support.

PiperOrigin-RevId: 191919566
---
 tensorflow/core/kernels/cudnn_rnn_ops.cc      |  46 ++--
 tensorflow/stream_executor/cuda/cuda_blas.cc  |  18 --
 tensorflow/stream_executor/cuda/cuda_dnn.cc   | 216 +++++++++++++++---
 tensorflow/stream_executor/cuda/cuda_dnn.h    |  32 ++-
 tensorflow/stream_executor/cuda/cuda_timer.h  |   7 +
 tensorflow/stream_executor/dnn.cc             |   4 +
 tensorflow/stream_executor/dnn.h              |  22 +-
 tensorflow/stream_executor/stream.cc          |  36 ++-
 tensorflow/stream_executor/stream.h           |  18 +-
 .../stream_executor/stream_executor_pimpl.cc  |  14 +-
 .../stream_executor/stream_executor_pimpl.h   |  11 +-
 11 files changed, 314 insertions(+), 110 deletions(-)

diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index ba9686e94e..07dc786d9b 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -104,6 +104,7 @@ namespace {
 using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
+using perftools::gputools::dnn::AlgorithmConfig;
 using perftools::gputools::dnn::RnnDirectionMode;
 using perftools::gputools::dnn::RnnInputMode;
 using perftools::gputools::dnn::RnnMode;
@@ -544,9 +545,10 @@ class CudnnRNNKernelCommon : public OpKernel {
     auto* stream = context->op_device_context()->stream();
     // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
     // random number generator, therefore set state_allocator to nullptr.
+    const AlgorithmConfig algo_config;
     auto rnn_desc_s = stream->parent()->createRnnDescriptor(
         num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
+        rnn_mode(), ToDataType<T>::value, algo_config, dropout(), seed(),
         nullptr /* state_allocator */);
     if (!rnn_desc_s.ok()) {
       return FromExecutorStatus(rnn_desc_s);
@@ -891,22 +893,24 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
         CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
             new CudnnRNNPersistentSpaceAllocator(context);
         rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        const AlgorithmConfig algo_config;
         auto rnn_desc_s = executor->createRnnDescriptor(
             model_shapes.num_layers, model_shapes.num_units,
             model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+            rnn_mode(), data_type, algo_config, dropout(), seed(),
+            dropout_state_allocator);
         OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
         rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
       }
       launch_status =
           stream
-              ->ThenRnnForward(*rnn_state.rnn_desc, *input_desc, input_data,
-                               *hidden_state_desc, input_h_data,
-                               *hidden_state_desc, input_c_data, params_data,
-                               *output_desc, &output_data, *hidden_state_desc,
-                               &output_h_data, *hidden_state_desc,
-                               &output_c_data, is_training_,
-                               &reserve_space_allocator, &workspace_allocator)
+              ->ThenRnnForward(
+                  *rnn_state.rnn_desc, *input_desc, input_data,
+                  *hidden_state_desc, input_h_data, *hidden_state_desc,
+                  input_c_data, params_data, *output_desc, &output_data,
+                  *hidden_state_desc, &output_h_data, *hidden_state_desc,
+                  &output_c_data, is_training_, &reserve_space_allocator,
+                  &workspace_allocator, /* output_result_profile */ nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -1095,25 +1099,27 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
         CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
             new CudnnRNNPersistentSpaceAllocator(context);
         rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        const AlgorithmConfig algo_config;
         auto rnn_desc_s = executor->createRnnDescriptor(
             model_shapes.num_layers, model_shapes.num_units,
             model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+            rnn_mode(), data_type, algo_config, dropout(), seed(),
+            dropout_state_allocator);
         OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
         rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
       }
       launch_status =
           stream
-              ->ThenRnnBackward(*rnn_state.rnn_desc, *input_desc, input_data,
-                                *hidden_state_desc, input_h_data,
-                                *hidden_state_desc, input_c_data, params_data,
-                                *output_desc, output_data, *hidden_state_desc,
-                                output_h_data, *hidden_state_desc,
-                                output_c_data, output_backprop_data,
-                                output_h_backprop_data, output_c_backprop_data,
-                                &input_backprop_data, &input_h_backprop_data,
-                                &input_c_backprop_data, &params_backprop_data,
-                                &reserve_space_uint8, &workspace_allocator)
+              ->ThenRnnBackward(
+                  *rnn_state.rnn_desc, *input_desc, input_data,
+                  *hidden_state_desc, input_h_data, *hidden_state_desc,
+                  input_c_data, params_data, *output_desc, output_data,
+                  *hidden_state_desc, output_h_data, *hidden_state_desc,
+                  output_c_data, output_backprop_data, output_h_backprop_data,
+                  output_c_backprop_data, &input_backprop_data,
+                  &input_h_backprop_data, &input_c_backprop_data,
+                  &params_backprop_data, &reserve_space_uint8,
+                  &workspace_allocator, /* output_result_profile */ nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index c563f8f931..1c550dbb13 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -2076,12 +2076,6 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2114,12 +2108,6 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2188,12 +2176,6 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index f408c06f46..3fd9275289 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -297,6 +297,9 @@ CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 namespace {
 
+// Forward declaration.
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
+
 cudnnHandle_t ToHandle(void* opaque_handle) {
   return static_cast<cudnnHandle_t>(opaque_handle);
 }
@@ -381,6 +384,23 @@ port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
   }
   return port::Status::OK();
 }
+
+cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
+  if (algorithm.is_default()) {
+    return CUDNN_RNN_ALGO_STANDARD;
+  } else {
+    cudnnRNNAlgo_t algo = static_cast<cudnnRNNAlgo_t>(algorithm.algo_id());
+    switch (algo) {
+      case CUDNN_RNN_ALGO_STANDARD:
+      case CUDNN_RNN_ALGO_PERSIST_STATIC:
+      case CUDNN_RNN_ALGO_PERSIST_DYNAMIC:
+        return algo;
+      default:
+        LOG(FATAL) << "Unsupported Cudnn RNN algorithm: "
+                   << algorithm.algo_id();
+    }
+  }
+}
 #endif
 
 port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
@@ -1124,6 +1144,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
                      cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
+                     cudnnDataType_t compute_type,
+                     const dnn::AlgorithmConfig& algorithm_config,
                      float dropout, uint64 seed,
                      ScratchAllocator* state_allocator)
       : parent_(parent),
@@ -1134,7 +1156,9 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
-        data_type_(data_type) {
+        data_type_(data_type),
+        compute_type_(compute_type),
+        algorithm_config_(algorithm_config) {
     // Create the dropout handle.
     cudnn_dropout_desc_.reset(new CudnnDropoutDescriptor(
         parent, cudnn_handle, dropout, seed, state_allocator));
@@ -1148,18 +1172,20 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
 #if CUDNN_VERSION >= 6000
     // TODO: allow the user to choose an algorithm.
-    cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
+    cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config_.algorithm());
     status = wrap::cudnnSetRNNDescriptor_v6(
         parent, cudnn_handle, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, rnn_algo /*algo*/, data_type /*dataType*/);
+        rnn_mode /*mode*/, rnn_algo /*algo*/, compute_type /*dataType*/);
 #else
+    CHECK(algorithm_config_.is_default())
+        << "Non-default algorithm not supported for CUDA version < 6.0";
     status = wrap::cudnnSetRNNDescriptor(
         parent, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, data_type /*dataType*/);
+        rnn_mode /*mode*/, compute_type /*dataType*/);
 #endif
     CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
 
@@ -1170,9 +1196,7 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
       SetFailure(cudnn_params_desc_->Status());
       return;
     }
-    if (data_type == CUDNN_DATA_HALF) {
-      set_use_tensor_op_math(true);
-    }
+    set_use_tensor_op_math(algorithm_config_.algorithm().tensor_ops_enabled());
   }
   ~CudnnRnnDescriptor() override {
     if (rnn_desc_) {
@@ -1206,6 +1230,10 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   cudnnDirectionMode_t direction_mode() const { return direction_mode_; }
   cudnnRNNMode_t rnn_mode() const { return rnn_mode_; }
   cudnnDataType_t data_type() const { return data_type_; }
+  cudnnDataType_t compute_type() const { return compute_type_; }
+  const dnn::AlgorithmConfig& algorithm_config() const {
+    return algorithm_config_;
+  }
   int64 ParamsSizeInBytes() const override {
     return cudnn_params_desc_->params_size_in_bytes();
   }
@@ -1236,6 +1264,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
   cudnnDataType_t data_type_;
+  cudnnDataType_t compute_type_;
+  dnn::AlgorithmConfig algorithm_config_;
   std::unique_ptr<CudnnDropoutDescriptor> cudnn_dropout_desc_;
   std::unique_ptr<CudnnRnnParamsDescriptor> cudnn_params_desc_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
@@ -1608,7 +1638,8 @@ bool CudnnSupport::DoRnnForwardImpl(
     const CudnnRnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<T>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1665,9 +1696,24 @@ bool CudnnSupport::DoRnnForwardImpl(
     }
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the forward call
+  cudnnStatus_t status;
   if (!is_training) {
-    cudnnStatus_t status = wrap::cudnnRNNForwardInference(
+    status = wrap::cudnnRNNForwardInference(
         parent_, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
@@ -1679,13 +1725,8 @@ bool CudnnSupport::DoRnnForwardImpl(
         output_c_desc.handle() /*cyDesc*/, output_c_data->opaque() /*cy*/,
         workspace.opaque() /*workspace*/,
         workspace.size() /*workSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardInference: "
-                 << ToString(status);
-      return false;
-    }
   } else {
-    cudnnStatus_t status = wrap::cudnnRNNForwardTraining(
+    status = wrap::cudnnRNNForwardTraining(
         parent_, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
@@ -1699,8 +1740,24 @@ bool CudnnSupport::DoRnnForwardImpl(
         workspace.size() /*workSpaceSizeInBytes*/,
         reserve_space.opaque() /*reserveSpace*/,
         reserve_space.size() /*reserveSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardTraining"
+  }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    if (status == CUDNN_STATUS_SUCCESS) {
+      auto algo_desc = rnn_desc.algorithm_config().algorithm();
+      output_profile_result->set_algorithm(algo_desc);
+      output_profile_result->set_elapsed_time_in_ms(
+          timer->GetElapsedMilliseconds());
+    }
+  }
+  if (status != CUDNN_STATUS_SUCCESS) {
+    // Silently return when we are profiling.
+    if (!is_profiling) {
+      LOG(ERROR) << "Failed to call "
+                 << (is_training ? "cudnnRNNForwardTraining "
+                                 : "cudnnRNNForwardInference ")
                  << ToString(status);
       return false;
     }
@@ -1732,7 +1789,8 @@ bool CudnnSupport::DoRnnBackwardImpl(
     DeviceMemory<T>* input_c_backprop_data,
     DeviceMemory<T>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1761,6 +1819,20 @@ bool CudnnSupport::DoRnnBackwardImpl(
     return false;
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the backward data call
   cudnnStatus_t status = wrap::cudnnRNNBackwardData(
       parent_, ToHandle(dnn_handle_) /*handle*/, rnn_desc.handle() /*rnnDesc*/,
@@ -1781,7 +1853,11 @@ bool CudnnSupport::DoRnnBackwardImpl(
       workspace.size() /*workSpaceSizeInBytes*/,
       reserve_space_data->opaque() /*reserveSpace*/,
       reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+
   if (status != CUDNN_STATUS_SUCCESS) {
+    if (is_profiling) {
+      timer->Stop(AsCUDAStream(stream));
+    }
     LOG(ERROR) << "Failed to call cudnnRNNBackwardData: " << ToString(status);
     return false;
   }
@@ -1803,11 +1879,23 @@ bool CudnnSupport::DoRnnBackwardImpl(
         reserve_space_data->opaque() /*reserveSpace*/,
         reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
     if (status != CUDNN_STATUS_SUCCESS) {
+      if (is_profiling) {
+        timer->Stop(AsCUDAStream(stream));
+      }
       LOG(ERROR) << "Failed to call cudnnRNNBackwardWeights: "
                  << ToString(status);
       return false;
     }
   }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+  }
 
   return true;
 }
@@ -1819,15 +1907,17 @@ CudnnSupport::createRnnDescriptor(int num_layers, int hidden_size,
                                   int input_size, dnn::RnnInputMode input_mode,
                                   dnn::RnnDirectionMode direction_mode,
                                   dnn::RnnMode rnn_mode,
-                                  dnn::DataType data_type, float dropout,
-                                  uint64 seed,
+                                  dnn::DataType data_type,
+                                  const dnn::AlgorithmConfig& algorithm_config,
+                                  float dropout, uint64 seed,
                                   ScratchAllocator* state_allocator) {
 #if CUDNN_VERSION >= 5000
   mutex_lock lock{dnn_handle_mutex_};
   std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
       parent_, ToHandle(dnn_handle_), num_layers, hidden_size, input_size,
       ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
-      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type), dropout, seed,
+      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type),
+      GetRnnComputeType(data_type), algorithm_config, dropout, seed,
       state_allocator));
   if (!rnn_desc->ok()) {
     return rnn_desc->Status();
@@ -1904,7 +1994,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1925,7 +2016,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1946,7 +2038,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<float>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1967,7 +2060,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1989,7 +2083,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<double>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2010,7 +2105,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2039,7 +2135,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<Eigen::half>* input_c_backprop_data,
     DeviceMemory<Eigen::half>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2063,7 +2160,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2091,7 +2188,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<float>* input_c_backprop_data,
     DeviceMemory<float>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2115,7 +2213,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2144,7 +2242,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<double>* input_c_backprop_data,
     DeviceMemory<double>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2168,7 +2267,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2363,6 +2462,33 @@ cudnnDataType_t GetConvComputeType<double>() {
   return CUDNN_DATA_DOUBLE;
 }
 
+// A helper struct to decide whether to use FP32 as the internal compute type
+// for rnn when the input data type is FP16. By default it is turned on,
+// users can explicitly disable them (choose to use FP16 as the internal compute
+// type) through an env-var "TF_FP16_RNN_USE_FP32_COMPUTE=0".
+struct RnnDoFP32ComputationFP16Input {
+  static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
+  static constexpr bool kDefaultFlag = true;
+};
+
+// A helper function to return the internal compute type for
+// RNNs in cudnn.
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+      return CUDNN_DATA_FLOAT;
+    case dnn::DataType::kDouble:
+      return CUDNN_DATA_DOUBLE;
+    case dnn::DataType::kHalf:
+      if (CudnnEnvVar<RnnDoFP32ComputationFP16Input>::IsEnabled()) {
+        return CUDNN_DATA_FLOAT;
+      } else {
+        return CUDNN_DATA_HALF;
+      }
+    default:
+      LOG(FATAL) << "Invalid RNN data type: " << static_cast<int>(data_type);
+  }
+}
 }  // namespace
 
 template <class T>
@@ -2742,6 +2868,30 @@ bool CudnnSupport::GetConvolveAlgorithms(
   return true;
 }
 
+bool CudnnSupport::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
+  // clang-format off
+#if CUDNN_VERSION >= 6000
+    CUDNN_RNN_ALGO_STANDARD,
+    CUDNN_RNN_ALGO_PERSIST_STATIC,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
+#endif
+    // clang-format on
+  };
+
+  out_algorithms->clear();
+  for (auto i : algo_types) {
+    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
+#if CUDNN_VERSION >= 7100
+    if (RnnTensorOpMathEnabled()) {
+      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
+    }
+#endif
+  }
+  return true;
+}
+
 bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 48d56f71e3..e40ba9b012 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -50,8 +50,9 @@ class CudnnSupport : public dnn::DnnSupport {
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator* state_allocator) override;
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
   createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
@@ -77,7 +78,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -94,7 +96,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<float>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -111,7 +114,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<double>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -135,7 +139,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* input_c_backprop_data,
                      DeviceMemory<Eigen::half>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -159,7 +164,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<float>* input_c_backprop_data,
                      DeviceMemory<float>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -183,12 +189,16 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<double>* input_c_backprop_data,
                      DeviceMemory<double>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool GetConvolveAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
 
+  bool GetRnnAlgorithms(
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
@@ -746,7 +756,8 @@ class CudnnSupport : public dnn::DnnSupport {
                         const CudnnRnnStateTensorDescriptor& output_c_desc,
                         DeviceMemory<T>* output_c_data, bool is_training,
                         ScratchAllocator* reserve_space_allocator,
-                        ScratchAllocator* workspace_allocator);
+                        ScratchAllocator* workspace_allocator,
+                        dnn::ProfileResult* output_profile_result);
 
   template <class T>
   bool DoRnnBackwardImpl(Stream* stream, const CudnnRnnDescriptor& rnn_desc,
@@ -771,7 +782,8 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemory<T>* input_c_backprop_data,
                          DeviceMemory<T>* params_backprop_data,
                          DeviceMemory<uint8>* reserve_space_data,
-                         ScratchAllocator* workspace_allocator);
+                         ScratchAllocator* workspace_allocator,
+                         dnn::ProfileResult* output_profile_result);
 
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 4a2714dc1f..2abc55ec94 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -77,6 +77,13 @@ class CUDATimer : public internal::TimerInterface {
                          // executing in a stream.
 };
 
+struct TimerDeleter {
+  void operator()(CUDATimer *t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
 }  // namespace cuda
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 44144a0613..0a3c4bcf50 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -28,6 +28,10 @@ bool DnnSupport::GetConvolveAlgorithms(
   return false;
 }
 
+bool DnnSupport::GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms) {
+  return false;
+}
+
 bool DnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index b41536e638..43cfd313c1 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1195,6 +1195,9 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  // Returns a list of supported rnn algorithms.
+  virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
+
   // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
   // coefficient_scales specifies the scaling of each column of coefficients:
   // original float coefficient[row * num_columns + column] =
@@ -2001,6 +2004,7 @@ class DnnSupport {
                       dnn::RnnInputMode input_mode,
                       dnn::RnnDirectionMode direction_mode,
                       dnn::RnnMode rnn_mode, dnn::DataType data_type,
+                      const dnn::AlgorithmConfig& algorithm_config,
                       float dropout, uint64 seed,
                       ScratchAllocator* state_allocator) {
     return port::Status{port::error::UNIMPLEMENTED,
@@ -2076,7 +2080,8 @@ class DnnSupport {
                             DeviceMemory<Eigen::half>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2096,7 +2101,8 @@ class DnnSupport {
                             DeviceMemory<float>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2116,7 +2122,8 @@ class DnnSupport {
                             DeviceMemory<double>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
   // Enqueue a backward operation of the RNN model onto the stream.
@@ -2183,7 +2190,8 @@ class DnnSupport {
       DeviceMemory<Eigen::half>* input_c_backprop_data,
       DeviceMemory<Eigen::half>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2210,7 +2218,8 @@ class DnnSupport {
       DeviceMemory<float>* input_c_backprop_data,
       DeviceMemory<float>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2237,7 +2246,8 @@ class DnnSupport {
       DeviceMemory<double>* input_c_backprop_data,
       DeviceMemory<double>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 1e3afde268..fe498507a8 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -4795,7 +4795,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<Eigen::half> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4803,7 +4804,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4827,7 +4829,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<float> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4835,7 +4838,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4860,7 +4864,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<double> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4868,7 +4873,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4900,7 +4906,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<Eigen::half> *input_c_backprop_data,
     DeviceMemory<Eigen::half> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4910,7 +4917,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4941,7 +4949,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<float> *input_c_backprop_data,
     DeviceMemory<float> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4951,7 +4960,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4983,7 +4993,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<double> *input_c_backprop_data,
     DeviceMemory<double> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4993,7 +5004,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index d7d1131569..4af426001f 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -1802,7 +1802,8 @@ class Stream {
                          DeviceMemory<Eigen::half> *output_c_data,
                          bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1819,7 +1820,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<float> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1836,7 +1838,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<double> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   // Enqueue a backward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnBackward for more details.
@@ -1863,7 +1866,8 @@ class Stream {
       DeviceMemory<Eigen::half> *input_c_backprop_data,
       DeviceMemory<Eigen::half> *params_backprop_data,
       DeviceMemory<uint8> *reserve_space_data,
-      ScratchAllocator *workspace_allocator);
+      ScratchAllocator *workspace_allocator,
+      dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1887,7 +1891,8 @@ class Stream {
                           DeviceMemory<float> *input_c_backprop_data,
                           DeviceMemory<float> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1911,7 +1916,8 @@ class Stream {
                           DeviceMemory<double> *input_c_backprop_data,
                           DeviceMemory<double> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   // Enqueue onto the stream a operation that transforms a tensor.
   // See DnnSupport::DoTransformTensor for more details.
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index afca1c2e59..f55fa68402 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -305,6 +305,15 @@ bool StreamExecutor::GetConvolveAlgorithms(
                                             cc_minor, out_algorithms);
 }
 
+bool StreamExecutor::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc> *out_algorithms) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return false;
+  }
+  return dnn_support->GetRnnAlgorithms(out_algorithms);
+}
+
 bool StreamExecutor::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused,
     std::vector<dnn::AlgorithmDesc> *out_algorithms) {
@@ -344,7 +353,8 @@ port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
 StreamExecutor::createRnnDescriptor(
     int num_layers, int hidden_size, int input_size,
     dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-    dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout, uint64 seed,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
     ScratchAllocator *state_allocator) {
   dnn::DnnSupport *dnn_support = AsDnn();
   if (!dnn_support) {
@@ -353,7 +363,7 @@ StreamExecutor::createRnnDescriptor(
   }
   return dnn_support->createRnnDescriptor(
       num_layers, hidden_size, input_size, input_mode, direction_mode, rnn_mode,
-      data_type, dropout, seed, state_allocator);
+      data_type, algorithm_config, dropout, seed, state_allocator);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index a2a77218cb..69d0374d73 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -349,10 +349,14 @@ class StreamExecutor {
   // platform that underlies this interface.
   bool SupportsDnn() const;
 
-  // Get the list of supported algorithms for the forward convolution opeartion.
+  // Returns the list of supported algorithms for the forward convolution
+  // operation.
   bool GetConvolveAlgorithms(bool with_winograd_nonfused,
                              std::vector<dnn::AlgorithmDesc> *out_algorithms);
 
+  // Returns the list of supported algorithms for rnn operation.
+  bool GetRnnAlgorithms(std::vector<dnn::AlgorithmDesc> *out_algorithms);
+
   // Get the list of supported algorithms for the backward convolution on data.
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused,
@@ -372,8 +376,9 @@ class StreamExecutor {
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator *state_allocator);
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator *state_allocator);
 
   // Create a RNN sequence descriptor that specifies either the input or output
   // sequence. The caller retains the ownership of the returned descriptor.
-- 
GitLab


From a056c115e83e6f07fd3dbb5d6439658828025024 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Fri, 6 Apr 2018 11:59:17 -0700
Subject: [PATCH 0380/1262] Validate errorReporter and improve the
 documentation on it.

PiperOrigin-RevId: 191920009
---
 tensorflow/contrib/lite/model.cc | 26 +++++++++++++++++++-------
 tensorflow/contrib/lite/model.h  | 26 +++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 606f4a5635..3448de68e8 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,6 +30,13 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+// Ensure that ErrorReporter is non-null.
+ErrorReporter* ValidateErrorReporter(ErrorReporter* e) {
+  return e ? e : DefaultErrorReporter();
+}
+}  // namespace
+
 const char* kEmptyTensorName = "";
 
 TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
@@ -78,6 +85,8 @@ std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
     const char* filename, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
@@ -89,6 +98,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
 std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
     const char* filename, TfLiteVerifier* verifier,
     ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
@@ -104,6 +115,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
     const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   Allocation* allocation =
       new MemoryAllocation(buffer, buffer_size, error_reporter);
@@ -114,6 +127,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
     const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   model.reset(new FlatBufferModel(model_spec, error_reporter));
   if (!model->initialized()) model.reset();
@@ -133,15 +148,13 @@ bool FlatBufferModel::CheckModelIdentifier() const {
 
 FlatBufferModel::FlatBufferModel(const Model* model,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
   model_ = model;
 }
 
 FlatBufferModel::FlatBufferModel(Allocation* allocation,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
   allocation_ = allocation;
   if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
@@ -154,7 +167,7 @@ InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
                                        const OpResolver& op_resolver)
     : model_(model.GetModel()),
       op_resolver_(op_resolver),
-      error_reporter_(model.error_reporter()),
+      error_reporter_(ValidateErrorReporter(model.error_reporter())),
       allocation_(model.allocation()) {}
 
 InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
@@ -162,8 +175,7 @@ InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
                                        ErrorReporter* error_reporter)
     : model_(model),
       op_resolver_(op_resolver),
-      error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {}
+      error_reporter_(ValidateErrorReporter(error_reporter)) {}
 
 TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
   TfLiteStatus status = kTfLiteOk;
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 036dc46e03..5a55b031a8 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -56,27 +56,37 @@ class TfLiteVerifier {
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
  public:
-  // Builds a model based on a file. Returns a nullptr in case of failure.
+  // Builds a model based on a file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromFile(
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Verifies whether the content of the file is legit, then builds a model
-  // based on the file. Returns a nullptr in case of failure.
+  // based on the file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
       const char* filename, TfLiteVerifier* verifier = nullptr,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model directly from a flatbuffer pointer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromModel(
       const tflite::Model* model_spec,
       ErrorReporter* error_reporter = DefaultErrorReporter());
@@ -100,7 +110,10 @@ class FlatBufferModel {
 
  private:
   // Loads a model from a given allocation. FlatBufferModel will take over the
-  // ownership of `allocation`, and delete it in desctructor.
+  // ownership of `allocation`, and delete it in destructor. The ownership of
+  // `error_reporter`remains with the caller and must have lifetime at least
+  // as much as FlatBufferModel. This is to allow multiple models to use the
+  // same ErrorReporter instance.
   FlatBufferModel(Allocation* allocation,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
@@ -111,7 +124,10 @@ class FlatBufferModel {
   // Flatbuffer traverser pointer. (Model* is a pointer that is within the
   // allocated memory of the data allocated by allocation's internals.
   const tflite::Model* model_ = nullptr;
+  // The error reporter to use for model errors and subsequent errors when
+  // the interpreter is created
   ErrorReporter* error_reporter_;
+  // The allocator used for holding memory of the model.
   Allocation* allocation_ = nullptr;
 };
 
-- 
GitLab


From 4d90c62824a2e4e445efab58d2c5829774a884ea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 12:18:04 -0700
Subject: [PATCH 0381/1262] Fix a few bugs in ArithmeticOptimizer and make it
 robust to failures of shape inference.

PiperOrigin-RevId: 191922788
---
 .../optimizers/arithmetic_optimizer.cc        | 48 +++++++++++--------
 .../optimizers/arithmetic_optimizer.h         |  2 +-
 .../optimizers/graph_optimizer_stage.cc       |  4 ++
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 59a5695af0..7bf264ba30 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -237,17 +237,16 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
     return false;
   }
 
-  // Now, src_shape and dst_shape have at most one dimension with unknown
-  // sizes, and are compatible. Therefore, the reshape is a no-op when
-  //
-  // 1. at least one of them is fully-defined, or
-  // 2. both are partially defined and the -1 appears on the same dimension,
-  //    i.e., IsIdenticalTo returns true.
-  if (src_num_unknown_dim_sizes == 1 && dst_num_unknown_dim_sizes == 1) {
-    return dst_shape.IsIdenticalTo(src_shape);
+  // If dst_num_unknown_dim_sizes != src_num_unknown_dim_sizes we would weaken
+  // shape inference in subsequent passes if we removed this reshape.
+  if (src_num_unknown_dim_sizes != dst_num_unknown_dim_sizes) {
+    return false;
   }
 
-  return true;
+  // Remove the reshape if both are fully defined or partially defined and the
+  // unknown or symbolic shape appears on the same dimension, i.e., if
+  // IsIdenticalTo returns true.
+  return dst_shape.IsIdenticalTo(src_shape);
 }
 
 NodeDef* GetTailOfValuePreservingChain(
@@ -727,7 +726,9 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
-          new_add_node->set_input(i, unique_factors[i]);
+          const string& unique_factor_i = unique_factors[i];
+          new_add_node->set_input(i, unique_factor_i);
+          ctx_.node_map->AddOutput(unique_factor_i, new_add_node->name());
         }
 
         // Add control deps on add node
@@ -859,13 +860,18 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
     NodeDef* node_perm;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
+    if (!IsConstant(*node_perm)) {
+      return Status::OK();
+    }
     std::vector<int64> node_perm_values;
     TF_RETURN_IF_ERROR(GetPermutation(*node_perm, &node_perm_values));
-
     if (input->op() == node->op()) {
       // Remove pairs of transposes that cancel each other.
       NodeDef* input_perm;
       TF_RETURN_IF_ERROR(GetInputNode(input->input(1), &input_perm));
+      if (!IsConstant(*input_perm)) {
+        return Status::OK();
+      }
       std::vector<int64> input_perm_values;
       TF_RETURN_IF_ERROR(GetPermutation(*input_perm, &input_perm_values));
       if (AreInversePermutations(node_perm_values, input_perm_values)) {
@@ -1337,9 +1343,9 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     //      ^      |
     //      |      |
     //    input ---+
-    NodeDef* reshape = node_map_->GetNode(node->name());
+    NodeDef* reshape = const_cast<NodeDef*>(node);
     int output_pos = 0;
-    string input_node_name = ParseNodeName(node->input(0), &output_pos);
+    string input_node_name = ParseNodeName(reshape->input(0), &output_pos);
     const NodeDef* input = node_map_->GetNode(input_node_name);
     if (input->op() == "Reshape" && !HasControlInputs(*input)) {
       reshape->set_input(0, input->input(0));
@@ -1653,7 +1659,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   return "";
 }
 
-Status ArithmeticOptimizer::SimplifyArithmeticOps() {
+Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   SetVector<NodeDef*> nodes_to_simplify;
   nodes_to_simplify.Reserve(optimized_graph_->node_size());
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
@@ -1668,11 +1674,11 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
   const auto stop = [](const string& result) { return !result.empty(); };
   GraphOptimizerStagePipeline<string> pipeline(stop);
 
-  if (options_.combine_add_to_addn)
+  if (options_.combine_add_to_addn && can_use_shapes)
     pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
-  if (options_.hoist_common_factor_out_of_aggregation)
+  if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
     pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
-  if (options_.remove_identity_transpose)
+  if (options_.remove_identity_transpose && can_use_shapes)
     pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
   if (options_.remove_redundant_bitcast)
     pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
@@ -1759,10 +1765,14 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
 
   // Shapes are only needed in aggressive mode.
   graph_properties_.reset(new GraphProperties(item));
-  TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
+  const Status status = graph_properties_->InferStatically(false);
+  const bool can_use_shapes = status.ok();
+  if (!can_use_shapes) {
+    VLOG(1) << "Shape inference failed." << status.error_message();
+  }
 
   // Perform the optimizations.
-  TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
+  TF_RETURN_IF_ERROR(SimplifyArithmeticOps(can_use_shapes));
 
   optimized_graph->Swap(optimized_graph_);
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 7e81ed0a1f..39b89dedba 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -105,7 +105,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
 
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
-  Status SimplifyArithmeticOps();
+  Status SimplifyArithmeticOps(bool can_use_shapes);
   // Tries to simplify the expression that roots at `node` and replaces the uses
   // of `node` to the simplified expression. Returns the name of the simplified
   // tensor (e.g. "split:1") or an emtpy string if no simplification is
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
index 7044705ade..1ea57f7b4f 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -42,6 +42,10 @@ Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
 Status GetTensorProperties(const GraphOptimizerContext& ctx,
                            const string& tensor,
                            OpInfo::TensorProperties* properties) {
+  if (ctx.graph_properties == nullptr) {
+    return errors::InvalidArgument("Graph properties are unknown.");
+  }
+
   int port;
   string tensor_node_name = ParseNodeName(tensor, &port);
   if (port < 0) {
-- 
GitLab


From 94749d892e38abb54e320d9f916a40ff9b6ad4b3 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 6 Apr 2018 12:22:17 -0700
Subject: [PATCH 0382/1262] Update the rewriter options with the optimizer
 options

PiperOrigin-RevId: 191923287
---
 tensorflow/python/framework/function_test.py | 32 ++++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 83d256fab6..c05396b06e 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -58,12 +58,32 @@ def _OptimizerOptions():
   for cse in [False, True]:
     for inline in [False, True]:
       for cfold in [False, True]:
-        yield config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-            optimizer_options=config_pb2.OptimizerOptions(
-                opt_level=config_pb2.OptimizerOptions.L0,
-                do_common_subexpression_elimination=cse,
-                do_function_inlining=inline,
-                do_constant_folding=cfold)))
+        cfg = config_pb2.ConfigProto(
+            graph_options=config_pb2.GraphOptions(
+                optimizer_options=config_pb2.OptimizerOptions(
+                    opt_level=config_pb2.OptimizerOptions.L0,
+                    do_common_subexpression_elimination=cse,
+                    do_function_inlining=inline,
+                    do_constant_folding=cfold)))
+        if cse:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if inline:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if cfold:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        yield cfg
 
 
 @test_util.with_c_api
-- 
GitLab


From aeb23e74a2613e573a4cebf3a8314b8e2d1a3b34 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Fri, 6 Apr 2018 12:36:20 -0700
Subject: [PATCH 0383/1262] Pull changes from prefetching_ops to support dicts
 in prefetching_ops_v2 in distribute, and update estimator test to use
 prefetching. Also update readme to reflect the support of dictionaries.

PiperOrigin-RevId: 191924990
---
 .../kernel_tests/prefetching_ops_test.py      | 38 +++++++++++++++++++
 tensorflow/contrib/distribute/README.md       |  6 +--
 .../python/estimator_integration_test.py      |  2 +-
 .../distribute/python/prefetching_ops_v2.py   |  4 +-
 4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 4b50260670..b08132cd72 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
@@ -265,6 +266,43 @@ class PrefetchToDeviceTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchSparseTensorsToDevice(self):
+    def make_tensor(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])
+    host_dataset = dataset_ops.Dataset.range(10).map(make_tensor)
+
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        actual = sess.run(next_element)
+        self.assertAllEqual([i], actual.values)
+        self.assertAllEqual([[0, 0]], actual.indices)
+        self.assertAllEqual([2, 2], actual.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 28483f4c88..14de1e8f49 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -117,7 +117,7 @@ in the input function gives a solid boost in performance. When using
 This feature is in early stages and there are a lot of improvements forthcoming:
 
 * Metrics are not yet supported during distributed training.
-* Summaries are currently computed in every tower.
+* Summaries are only computed in the first tower in `MirroredStrategy`.
 * Evaluation is not yet distributed.
 * Eager support is in the works; performance can be more challenging with eager
 execution.
@@ -129,10 +129,6 @@ effective batch size will be `num_gpus * batch_size`. Therefore, consider
 adjusting your learning rate or batch size according to the number of GPUs.
 We are working on addressing this limitation by splitting each batch across GPUs
 instead.
-* Dictionaries inside dataset in the input are not supported when prefetching
-on GPUs is turned on. (If you need to use dictionaries in the dataset, turn off
-prefetching on GPUs by passing param `prefetch_on_device=False` to
-`MirroredStrategy`)
 * PartitionedVariables are not supported yet.
 
 ## What's next?
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index 2b49b8f4ef..c5a520ab5a 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -61,7 +61,7 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           mode=['graph'],
           distribution=[
               combinations.one_device_strategy,
-              combinations.mirrored_strategy_without_prefetch
+              combinations.mirrored_strategy_with_gpu_and_cpu
           ]))
   def test_complete_flow_with_mode(self, distribution):
     label_dimension = 2
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
index e1ddf3cece..dfcbb8568f 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -45,10 +45,12 @@ class _PrefetchToDeviceIterator(object):
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
           handle, input_iterator.output_types, input_iterator.output_shapes,
           input_iterator.output_classes)
-      return remote_iterator.get_next()
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     target_device = gen_dataset_ops.iterator_get_device(
         input_iterator._iterator_resource)
-- 
GitLab


From e8dedc2c0f00dd28a3398cceb5e3293faaabbb9f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 12:37:17 -0700
Subject: [PATCH 0384/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191925087
---
 tensorflow/core/kernels/BUILD                 |  2 ++
 .../core/kernels/crop_and_resize_op_test.cc   |  9 ++++---
 tensorflow/core/kernels/decode_image_op.cc    |  6 ++---
 .../core/kernels/dynamic_partition_op_test.cc |  5 ++--
 .../core/kernels/dynamic_stitch_op_test.cc    | 27 ++++++++++---------
 tensorflow/core/kernels/gather_op_test.cc     |  3 ++-
 .../kernels/non_max_suppression_op_test.cc    |  9 ++++---
 .../quantize_and_dequantize_op_test.cc        |  9 ++++---
 ...ote_fused_graph_rewriter_transform_test.cc |  3 ++-
 .../core/kernels/resize_bicubic_op_test.cc    |  6 ++---
 .../core/kernels/resize_bilinear_op_test.cc   | 18 ++++++-------
 tensorflow/core/kernels/roll_op_test.cc       | 18 +++++++------
 tensorflow/core/kernels/scatter_nd_op_test.cc | 27 +++++++++----------
 tensorflow/core/kernels/scatter_op_test.cc    | 18 ++++++++-----
 tensorflow/core/kernels/shape_op_test.cc      |  5 ++--
 tensorflow/core/kernels/softmax_op.cc         |  3 ++-
 tensorflow/core/kernels/softmax_op_gpu.cu.cc  |  3 ++-
 .../sparse_dense_binary_op_shared_test.cc     |  3 ++-
 tensorflow/core/kernels/summary_op_test.cc    | 13 ++++-----
 19 files changed, 102 insertions(+), 85 deletions(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index a196fc54af..1857d8d655 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2265,6 +2265,7 @@ tf_cc_tests(
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -5905,6 +5906,7 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index a35e1b0788..709082e799 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -242,7 +243,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input image must be 4-D"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "input image must be 4-D"))
       << s;
 }
 
@@ -255,7 +256,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
+      str_util::StrContains(s.ToString(), "box_index has incompatible shape"))
       << s;
 }
 
@@ -267,8 +268,8 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   AddInputFromArray<int32>(TensorShape({2}), {3, 3});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_index has values outside [0, batch_size)"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "box_index has values outside [0, batch_size)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 912d04c153..2cafa44f37 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -41,9 +41,9 @@ enum FileFormat {
 // Classify the contents of a file based on starting bytes (the magic number).
 FileFormat ClassifyFileFormat(StringPiece data) {
   // The 4th byte of JPEG is '\xe0' or '\xe1', so check just the first three
-  if (data.starts_with("\xff\xd8\xff")) return kJpgFormat;
-  if (data.starts_with("\x89PNG\r\n\x1a\n")) return kPngFormat;
-  if (data.starts_with("\x47\x49\x46\x38")) return kGifFormat;
+  if (str_util::StartsWith(data, "\xff\xd8\xff")) return kJpgFormat;
+  if (str_util::StartsWith(data, "\x89PNG\r\n\x1a\n")) return kPngFormat;
+  if (str_util::StartsWith(data, "\x47\x49\x46\x38")) return kGifFormat;
   return kUnknownFormat;
 }
 
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
index 9a7ed0af21..17eb4e24b7 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -153,8 +154,8 @@ TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "partitions[2] = 99 is not in [0, 4)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
index 6775893ce6..7fa6e320f5 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -88,9 +89,9 @@ TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [5] does not start with "
-                            "indices[1].shape = [1,5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [5] does not start with indices[1].shape = [1,5]"))
       << s;
 }
 
@@ -103,9 +104,9 @@ TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [1,5] does not start with "
-                            "indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [1,5] does not start with indices[1].shape = [5]"))
       << s;
 }
 
@@ -119,9 +120,10 @@ TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) {
   AddInputFromArray<float>(TensorShape({4, 2}),
                            {10, 11, 60, 61, 20, 21, 30, 31});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Need data[0].shape[1:] = data[1].shape[1:], "
-                            "got data[0].shape = [3,1], data[1].shape = [4,2]"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Need data[0].shape[1:] = data[1].shape[1:], got "
+                            "data[0].shape = [3,1], data[1].shape = [4,2]"))
       << s;
 }
 
@@ -134,10 +136,9 @@ TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "data[1].shape = [4] does not start with indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [4] does not start with indices[1].shape = [5]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
index 3edcb34bca..0409cadb67 100644
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -171,7 +172,7 @@ TEST_F(GatherOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 67d9217b95..9387fb13bc 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -147,7 +148,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -160,7 +161,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
@@ -308,7 +309,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -322,7 +323,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index 5ffcc7d65d..e41df12d91 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
@@ -379,8 +380,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
@@ -401,8 +402,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) {
   AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
index d5b37b1ce1..9217c25978 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -181,7 +182,7 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
     int cluster_count = 0;
     for (const NodeDef& node_def : output_graph_def_.node()) {
       const string& name = node_def.name();
-      if (StringPiece(name).starts_with(REMOTE_FUSED_GRAPH_NODE_NAME)) {
+      if (str_util::StartsWith(name, REMOTE_FUSED_GRAPH_NODE_NAME)) {
         ++cluster_count;
         RemoteFusedGraphExecuteInfo info;
         string serialized_proto;
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 25a37d5e1a..c23570d885 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -218,9 +219,8 @@ TEST_F(ResizeBicubicOpTest, TestBicubic2x2To0x0) {
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index a920e60281..6d57892828 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -457,9 +458,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidOutputSize) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
@@ -467,8 +467,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) {
   AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: input must be 4-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: input must be 4-dimensional"))
       << s;
 }
 
@@ -476,8 +476,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must be 1-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must be 1-dimensional"))
       << s;
 }
 
@@ -485,8 +485,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must have two elements"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must have two elements"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
index 90b6f8d0f3..e431226aa6 100644
--- a/tensorflow/core/kernels/roll_op_test.cc
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -372,7 +373,8 @@ TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(), "input must be 1-D or higher"))
       << s;
 }
 
@@ -384,8 +386,8 @@ TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("axis must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "axis must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -397,8 +399,8 @@ TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -410,8 +412,8 @@ TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
   AddInputFromArray<int32>(TensorShape({1}), {1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift and axis must have the same size"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift and axis must have the same size"))
       << s;
 }
 
@@ -423,7 +425,7 @@ TEST_F(RollOpTest, Error_AxisOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "is out of range")) << s;
 }
 
 // isd - (inner shift dimension) The inner most dimension to be shifted.
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index ae81efa31d..c134a8dd5b 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -183,9 +184,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid indices: [2,0] = [99] does not index into [5,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid indices: [2,0] = [99] does not index into [5,3]"))
       << s;
 }
 
@@ -198,10 +198,10 @@ TEST_F(ScatterNdUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("The outermost dimension of updates and indices "
-                            "must match. Got indices.shape [1,3,1], "
-                            "updates.shape [3,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match. Got "
+      "indices.shape [1,3,1], updates.shape [3,3]"))
       << s;
 }
 
@@ -216,10 +216,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Must have updates.shape = indices.shape[:batch_dim]"))
-
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Must have updates.shape = indices.shape[:batch_dim]"))
       << s;
 }
 
@@ -233,10 +231,9 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "The outermost dimension of updates and indices must match."))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match."))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
index 5b3537b94c..2ec8c42233 100644
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -170,7 +171,7 @@ TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) {
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
@@ -183,8 +184,9 @@ TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
@@ -200,8 +202,9 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
 
       << s;
@@ -217,8 +220,9 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
index a545fb146c..9cd590ae61 100644
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ b/tensorflow/core/kernels/shape_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -62,8 +63,8 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
 REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE",
                                       GetShapeFromKnownVecSize);
 
-static void ExpectHasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+static void ExpectHasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index e1712ac239..e72608945b 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -55,7 +56,7 @@ template <typename Device, typename T>
 class SoftmaxOp : public OpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
index 130d693dbd..b63dcbb163 100644
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -128,7 +129,7 @@ template <typename T>
 class SoftmaxOpGPU : public OpKernel {
  public:
   explicit SoftmaxOpGPU(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index fe198af7e6..29577ebb4e 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -32,7 +33,7 @@ namespace tensorflow {
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
index 3c46abb8ab..9dcabcc584 100644
--- a/tensorflow/core/kernels/summary_op_test.cc
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -122,7 +123,7 @@ TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({3}), {1.0f, -0.73f, 10000.0f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not the same shape")) << s;
 }
 
 TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
@@ -133,7 +134,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -145,7 +146,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -256,7 +257,7 @@ TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
   AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
@@ -266,7 +267,7 @@ TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 // --------------------------------------------------------------------------
@@ -365,7 +366,7 @@ TEST_F(SummaryMergeOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}),
                             {s1.SerializeAsString(), s2.SerializeAsString()});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "Duplicate tag")) << s;
 }
 
 }  // namespace
-- 
GitLab


From ed6a2cd368cea48433236563953f7c603ba78115 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 6 Apr 2018 13:49:42 -0700
Subject: [PATCH 0385/1262] A single measurement is enough when using
 simulation

PiperOrigin-RevId: 191934781
---
 tensorflow/python/grappler/cluster.i | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 067c8213d4..6816e20407 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -320,7 +320,8 @@ static PyObject* TF_MeasureCosts(
   tensorflow::OpPerformanceList op_performance_data;
   tensorflow::StepStats step_stats;
 
-  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), 10, 0);
+  const int num_measurements = cluster->type() == "virtual" ? 1 : 10;
+  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), num_measurements, 0);
 
   tensorflow::grappler::Costs costs;
   tensorflow::Status status = _GetOpPerformanceDataAndRunTime(
-- 
GitLab


From 8413fb51307c0274ae4db31181c531de046eb309 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 6 Apr 2018 13:58:40 -0700
Subject: [PATCH 0386/1262] Switch the eager PTB example to object-based
 saving.

Should be the last eager example using name-based saving. This works now that cuDNN LSTM cells are Checkpointable.

The CPU and GPU checkpoints for this example (still) won't be compatible; I may follow up with a CL which makes them equivalent. Right now I don't think they're even implementing the same architecture.

PiperOrigin-RevId: 191935995
---
 .../eager/python/examples/rnn_ptb/rnn_ptb.py  | 57 ++++++++++---------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index a90048d813..be5d60449d 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -315,32 +315,37 @@ def main(_):
   have_gpu = tfe.num_gpus() > 0
   use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu
 
-  with tfe.restore_variables_on_create(
-      tf.train.latest_checkpoint(FLAGS.logdir)):
-    with tf.device("/device:GPU:0" if have_gpu else None):
-      # Make learning_rate a Variable so it can be included in the checkpoint
-      # and we can resume training with the last saved learning_rate.
-      learning_rate = tfe.Variable(20.0, name="learning_rate")
-      sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
-      model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
-                       FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
-                       use_cudnn_rnn)
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-
-      best_loss = None
-      for _ in range(FLAGS.epoch):
-        train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
-        eval_loss = evaluate(model, eval_data)
-        if not best_loss or eval_loss < best_loss:
-          if FLAGS.logdir:
-            tfe.Saver(model.trainable_weights + [learning_rate]).save(
-                os.path.join(FLAGS.logdir, "ckpt"))
-          best_loss = eval_loss
-        else:
-          learning_rate.assign(learning_rate / 4.0)
-          sys.stderr.write("eval_loss did not reduce in this epoch, "
-                           "changing learning rate to %f for the next epoch\n" %
-                           learning_rate.numpy())
+  with tf.device("/device:GPU:0" if have_gpu else None):
+    # Make learning_rate a Variable so it can be included in the checkpoint
+    # and we can resume training with the last saved learning_rate.
+    learning_rate = tfe.Variable(20.0, name="learning_rate")
+    model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
+                     FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
+                     use_cudnn_rnn)
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+    checkpoint = tfe.Checkpoint(
+        learning_rate=learning_rate, model=model,
+        # GradientDescentOptimizer has no state to checkpoint, but noting it
+        # here lets us swap in an optimizer that does.
+        optimizer=optimizer)
+    # Restore existing variables now (learning_rate), and restore new variables
+    # on creation if a checkpoint exists.
+    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir))
+    sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
+
+    best_loss = None
+    for _ in range(FLAGS.epoch):
+      train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
+      eval_loss = evaluate(model, eval_data)
+      if not best_loss or eval_loss < best_loss:
+        if FLAGS.logdir:
+          checkpoint.save(os.path.join(FLAGS.logdir, "ckpt"))
+        best_loss = eval_loss
+      else:
+        learning_rate.assign(learning_rate / 4.0)
+        sys.stderr.write("eval_loss did not reduce in this epoch, "
+                         "changing learning rate to %f for the next epoch\n" %
+                         learning_rate.numpy())
 
 
 if __name__ == "__main__":
-- 
GitLab


From 3745f2582daeae7a49a129e250cf0cc2d573924a Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Fri, 6 Apr 2018 14:13:49 -0700
Subject: [PATCH 0387/1262] Pad support for quantized zero.

PiperOrigin-RevId: 191938267
---
 .../internal/optimized/optimized_ops.h        |  28 ++--
 .../internal/reference/reference_ops.h        |  13 +-
 tensorflow/contrib/lite/kernels/pad.cc        |  27 ++--
 tensorflow/contrib/lite/kernels/pad_test.cc   | 129 +++++++++++++++---
 4 files changed, 158 insertions(+), 39 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 9a274612ad..7a383fba18 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5067,7 +5067,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
@@ -5087,27 +5087,27 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, 0,
+    memset(output_data, pad_value,
            left_b_padding * output_height * output_width * output_depth *
                sizeof(T));
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
              left_h_padding * output_width * output_depth * sizeof(T));
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
                left_w_padding * output_depth * sizeof(T));
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
-                 left_d_padding * sizeof(T));
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+                 pad_value, left_d_padding * sizeof(T));
         }
 
         T* out = output_data +
@@ -5121,20 +5121,21 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
           memset(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              0, right_d_padding * sizeof(T));
+              pad_value, right_d_padding * sizeof(T));
         }
       }
       if (right_w_padding != 0) {
         memset(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            0, right_w_padding * output_depth * sizeof(T));
+            pad_value, right_w_padding * output_depth * sizeof(T));
       }
     }
     if (right_h_padding != 0) {
       memset(output_data + Offset(output_dims, 0, 0,
                                   output_height - right_h_padding, out_b),
-             0, right_h_padding * output_width * output_depth * sizeof(T));
+             pad_value,
+             right_h_padding * output_width * output_depth * sizeof(T));
     }
   }
   if (right_b_padding != 0) {
@@ -5146,6 +5147,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 31e190e248..3245bf615e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3086,7 +3086,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -3116,7 +3116,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = 0;
+            *out_ptr++ = static_cast<T>(pad_value);
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -3126,6 +3126,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 inline bool LoopCondition(int index, int stop, int stride) {
   return stride > 0 ? index < stop : index > stop;
 }
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index c29da3862e..4f9449a225 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -119,39 +119,46 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar)                                           \
+#define TF_LITE_PAD(type, scalar, pad_value)                                \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
             GetTensorDims(op_context.input), before_padding, after_padding, \
             GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output))
+            GetTensorDims(op_context.output), pad_value)
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float);
+        TF_LITE_PAD(reference_ops, float, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float);
+        TF_LITE_PAD(optimized_ops, float, 0);
       }
       break;
     case kTfLiteUInt8:
+      // Quantized Pad requires that 0 is represented in the quantized range.
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                  std::numeric_limits<uint8_t>::min());
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t);
+        TF_LITE_PAD(reference_ops, uint8_t,
+                    op_context.output->params.zero_point);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t);
+        TF_LITE_PAD(optimized_ops, uint8_t,
+                    op_context.output->params.zero_point);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t);
+        TF_LITE_PAD(reference_ops, int32_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t);
+        TF_LITE_PAD(optimized_ops, int32_t, 0);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t);
+        TF_LITE_PAD(reference_ops, int64_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t);
+        TF_LITE_PAD(optimized_ops, int64_t, 0);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index 28834ad071..c06237e572 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::Matcher;
 
 class PadOpModel : public SingleOpModel {
  public:
@@ -29,6 +30,10 @@ class PadOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
@@ -36,6 +41,11 @@ class PadOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
  protected:
   int input_;
   int output_;
@@ -50,16 +60,17 @@ class PadOpModel : public SingleOpModel {
 //    m.Invoke();
 class PadOpConstModel : public PadOpModel {
  public:
-  PadOpConstModel(std::initializer_list<int> input_shape,
+  PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
-                  std::initializer_list<int> paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
+                  std::initializer_list<int> paddings,
+                  const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape});
+    BuildInterpreter({input.shape});
   }
 };
 
@@ -72,40 +83,45 @@ class PadOpConstModel : public PadOpModel {
 //    m.Invoke();
 class PadOpDynamicModel : public PadOpModel {
  public:
-  PadOpDynamicModel(std::initializer_list<int> input_shape,
-                    std::initializer_list<int> paddings_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  PadOpDynamicModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape, paddings_shape});
+    BuildInterpreter({input.shape, paddings_shape});
   }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
-                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9},
+                      {TensorType_FLOAT32}),
       "dims != 4");
 }
 
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+  EXPECT_DEATH(PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
+                               {1, 1, 2, 2, 3, 3}, {TensorType_FLOAT32}),
                "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                      {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -114,7 +130,8 @@ TEST(PadOpTest, SimpleConstTest) {
 }
 
 TEST(PadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -124,7 +141,8 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -134,7 +152,8 @@ TEST(PadOpTest, AdvancedConstTest) {
 }
 
 TEST(PadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -144,6 +163,80 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+class QuantizedPadOpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
+                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                                 {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 8b5212011c7b67b7f8c2ea1b641aa0a7151c82d0 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 6 Apr 2018 14:24:42 -0700
Subject: [PATCH 0388/1262] Branch 191925087 (#18299)

* Fix docstring.

PiperOrigin-RevId: 191747417

* Use constants in tf.zeros if the constant won't be too big.

Using fill saves on GraphDef size, but can slow down models since the
total number of ops is greater (fill + shape + constant op). This
change makes us only use fill for large shapes.

PiperOrigin-RevId: 191747456

* Fix typos in "Profile Model Float Operations" documentation.

PiperOrigin-RevId: 191751175

* Added a call in CheckpointSaverHook.after_create_session to always save
checkpoint before the first training step.

PiperOrigin-RevId: 191753026

* Document expected regular structure of the statistical testing library.

PiperOrigin-RevId: 191753693

* Refine BatchReshape error messages.

PiperOrigin-RevId: 191754120

* Include the operators module in the test framework as well.

PiperOrigin-RevId: 191756100

* Expand activity analysis to the test nodes of if and while statements.

PiperOrigin-RevId: 191756234

* Inline more functions

PiperOrigin-RevId: 191761109

* Sync only the convolutional_recurrent file to Keras 2.1.5.

PiperOrigin-RevId: 191763101

* Internal change

PiperOrigin-RevId: 191769724

* Expose odeint_fixed in tf.contrib.integrate

PiperOrigin-RevId: 191769890

* Automated g4 rollback of changelist 191761109

PiperOrigin-RevId: 191771969

* Fix final eval bottleneck creation to work in cases where it isn't cached already.

Fixes #17423

PiperOrigin-RevId: 191773001

* Fix regression caused by cl/191020868: Re-use materialized shapes for other broadcast gradient shape nodes.

PiperOrigin-RevId: 191779263

* Save the original from_proto method before calling it to avoid infinite loop.

PiperOrigin-RevId: 191784430

* Automated g4 rollback of changelist 191753026

PiperOrigin-RevId: 191784709

* [XLA] Remove a dead function and a stale todo.

PiperOrigin-RevId: 191786563

* Enable branch prediction in TensorFlow

PiperOrigin-RevId: 191788253

* Changes loss_reduction default to SUM_OVER_BATCH_SIZE for multi_class_head and binary_classification_head.

PiperOrigin-RevId: 191793392

* quantized LSTM support improvements

PiperOrigin-RevId: 191794956

* Fix TF_ImportGraphDefResults and TF_Function leaks in Python API.

PiperOrigin-RevId: 191797853

* [XLA] Better support for mul reductions in MakeFakeArguments()

Mul reductions want a 1 as their init value, not a 0 or a random value.

PiperOrigin-RevId: 191802819

* Disable tests that are currently failing with cuda 9

PiperOrigin-RevId: 191805453

* Make tf.contrib.estimator.add_metrics work with warm-starting.

PiperOrigin-RevId: 191805682

* Add Raspberry Pi section and link to github build instructions.

PiperOrigin-RevId: 191807862

* Add for and while loops to the list of operators. Do not use them yet.

PiperOrigin-RevId: 191807973

* [TF:XLA] No need to set return value in the while loop's condition.

PiperOrigin-RevId: 191809110

* Add functions to extract the basic symbols on which a composite name relies. This in turn allows to statically obtain a block's syntactic closure.

PiperOrigin-RevId: 191809965

* Add link for index file in performance tab.

PiperOrigin-RevId: 191811610

* Added an option to inline all functions in aggressive mode.

PiperOrigin-RevId: 191819577

* Make concat handler support mixed range input

PiperOrigin-RevId: 191822664

* Automated g4 rollback of changelist 191605505

PiperOrigin-RevId: 191824447

* Add a command line parameter to toco to change the way toco rescales input and output tensors.

PiperOrigin-RevId: 191825756

*   refactor and add proto field required by POD support.

PiperOrigin-RevId: 191826636

* Lazily evaluate shapes with the C API enabled.

This change makes it so shapes are computed only when requested with
_USE_C_API = True. Note that the C API will still raise a shape error
if necessary when the op is created.

In addition, it cleans up the logic for _USE_C_SHAPES = True. In this
case, we lazily fetch and cache shapes directly from the C API. We no
longer need set_shapes_for_outputs at all in this case.

PiperOrigin-RevId: 191830565

* [XLA] Don't call Literal::Get in HloEvaluator's convolution loop.

This speeds up the implementation of conv because Literal::Get calls
Literal::Piece::data, which is relatively slow.

Instead, we call Literal::Data() once and cache the result.

Before: ConvolutionTest/0.StridedFilter (59094 ms)
After:  ConvolutionTest/0.StridedFilter (41812 ms)

Speedup: 59/42 = 1.4x
PiperOrigin-RevId: 191830741

* Added `drop_final_batch` argument to make_batched_features_dataset. This allows the batch_and_drop_remainder function to be used instead of the default batch function.

PiperOrigin-RevId: 191831842

* Add RunMetadata logging to tf.train.ProfilerHook for Tensorboard Memeory/CPU usage visualization

PiperOrigin-RevId: 191832832

* [XLA] Don't call MultidimensionalIndexToLinearIndex in HloEvaluator's convolution routine.

Before: ConvolutionTest/0.StridedFilter (41812 ms)
After:  ConvolutionTest/0.StridedFilter (28054 ms)

Speedup: 42 / 28 = 1.5x
PiperOrigin-RevId: 191835735

* Expose the adaptive sampling option for SDCA and shuffle the data when adaptive sampling is off.

PiperOrigin-RevId: 191836004

* Swap in the new implementation of while and for loops.

PiperOrigin-RevId: 191838806

* Upgrade libpng

PiperOrigin-RevId: 191840652

* Fix StringPiece use-after-free in MasterSession::ReffedClientGraph.

Use the owned ClientGraph as the source for the node_to_name_ map, rather than the borrowed GraphExecutionState (which can be deleted while the ReffedClientGraph is in use).

PiperOrigin-RevId: 191847023

* Add a test to check graceful handling of out-of-memory conditions.

PiperOrigin-RevId: 191860462

* internal change

PiperOrigin-RevId: 191869400

* Fix typos in XlaCompilationCache

PiperOrigin-RevId: 191881135

* Define PRNG seeding style for new code in Distributions and TF Probability, with rationales.

Implement lightweight PRNG for seed generation in that style.

Enables incremental refactoring of existing code into this style.

PiperOrigin-RevId: 191884573

* Avoid marking clusters containing only Identity ops for compilation.

This would produce clusters where XLA cannot optimize anything.

PiperOrigin-RevId: 191887414

* Add description to the LPIRC 2018 competition benchmarker.

PiperOrigin-RevId: 191889484

* The training model need not be built when the kfac optimizer is initialized so the
self._variables will be empty list.  So pass a function which returns list of trainable variables to estimator.

PiperOrigin-RevId: 191893084

* Fix up the support for the case where a given array name occurs multiple
times in the inputs/outputs list of an op. The (non-essential) computation
of the optimal workspace size had not been updated for that case, causing it
to fail on a simple test case. Moreover, the initial implementation had some
redundant usage of std::find that this CL moves to a shared helper function.

PiperOrigin-RevId: 191894081

* Support override of device filters for gRPC, by overriding the requests with default session config.

PiperOrigin-RevId: 191895856

* Tweaked docstrings in LayerCollection.

PiperOrigin-RevId: 191897098

* [TPUClusterResolver] Start a TFServer when running in GKE

This change allows advanced input pipelines (e.g. StreamingFilesDataset, or split-pipelines that use py_func's) to run in GKE- and GKE-like enviornments.

PiperOrigin-RevId: 191897639

* [tf.data] Enable using `tf.contrib.data.prefetch_to_device()` in eager mode.

The added functionality is a substitute for the implicit prefetching in
`tfe.Iterator`, and the two paths will converge in a future change.

Fixes #18260.

PiperOrigin-RevId: 191897666

* Materialize tensor array sizes whenever possible

PiperOrigin-RevId: 191900015

* Object-based checkpointing support for unidirectional cuDNN LSTM cells

Once checked in, this will be the only way I know of to save canonical weights
when executing eagerly. Eager's name-based saving support will only do the
opaque parameter buffer.

I'm not going to try converting everything in one go, but it's a start at
least. And everything else should raise a NotImplementedError rather than
silently not saving correctly.

Single-layer cuDNN cells can be swapped for un-wrapped cuDNN compatible cells or
single cells wrapped in MultiRNNCells. Multi-layer cells need MultiRNNCell
wrapping.

PiperOrigin-RevId: 191905703

* Allow TFE_NewContext to fail more reasonably when SWIG is checking status.

Before:
TFE_Context would check nullptr, and the function would fail straight away.

Now:
TFE_Context is nullptr, so it skips down to checking the status, and an error
is raised.

I'm not able to find in SWIG documentation how to order typemaps in the
generated code - ideally, I'd order it to check the status typemap first. This
code makes it not dependent on this ordering either way.

PiperOrigin-RevId: 191905893

* Change GetInstructionCallContext to take an opcode instead of an
HloInstruction.
This enables use of the function without an actual instruction (eg, if you just
have an HloProto).

PiperOrigin-RevId: 191905914

* TPU Cost Estimator has been modified to also account for the memory cost in the execution time. Until more sophisticated methods are added, we resort to the roofline model to calculate such cost.

PiperOrigin-RevId: 191913626

* Properly handle callable objects.

PiperOrigin-RevId: 191913834

* Minor doc clarification for reduce_sum return type

PiperOrigin-RevId: 191914398

* Added headers only version of tensorflow/core/kernels:cwise_lib, cwise_lib_hdrs. This is for clients that want to use the cwise_ops machinery when making their own custom ops, including cwise_lib directly causes multiple definition linker errors.

PiperOrigin-RevId: 191914445

* [TF:XLA] Create Despecializing Pass Pipeline

When comparing backends, it is useful to take an HLO optimized for one backend and perform transformations in order to match numerics.  This can be thought of as finding a lowest common denominator.

Move this grouping of passes into its own HloPassPipeline that can be reused in a few different places.

PiperOrigin-RevId: 191914799

* Update tf.keras to keras 2.1.5 version.

PiperOrigin-RevId: 191914904

* Remove `TF_InitializeTPU` and `TF_ShutdownTPU` from experimental C API as they are no longer needed. Also remove a duplicate function declaration.

PiperOrigin-RevId: 191918408

* Fix small performance regression in microbenchmarks.

PiperOrigin-RevId: 191919464

* Support RNN profiling in StreamExecutor for CUDA GPUs.

This change hasn't applied autotune on TF Cudnn kernels, only provides lower level support.

PiperOrigin-RevId: 191919566

* Validate errorReporter and improve the documentation on it.

PiperOrigin-RevId: 191920009

* Fix a few bugs in ArithmeticOptimizer and make it robust to failures of shape inference.

PiperOrigin-RevId: 191922788

* Update the rewriter options with the optimizer options

PiperOrigin-RevId: 191923287

* Pull changes from prefetching_ops to support dicts in prefetching_ops_v2 in distribute, and update estimator test to use prefetching.
Also update readme to reflect the support of dictionaries.

PiperOrigin-RevId: 191924990

* Replaced calls to deprecated tensorflow::StringPiece methods with their
tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191925087
---
 tensorflow/c/c_api_experimental.cc            |   51 -
 tensorflow/c/c_api_experimental.h             |   21 -
 tensorflow/cc/profiler/BUILD                  |    3 +
 tensorflow/compiler/jit/BUILD                 |    4 +-
 .../compiler/jit/kernels/xla_launch_op.cc     |    5 +
 .../compiler/jit/mark_for_compilation_pass.cc |   19 +-
 .../jit/mark_for_compilation_pass_test.cc     |   34 +
 .../compiler/jit/xla_compilation_cache.h      |    6 +-
 tensorflow/compiler/jit/xla_device_context.cc |   34 +-
 tensorflow/compiler/jit/xla_device_context.h  |    7 +
 tensorflow/compiler/jit/xla_launch_util.cc    |   34 +-
 tensorflow/compiler/tests/BUILD               |   20 +
 tensorflow/compiler/tests/jit_test.py         |   36 +-
 tensorflow/compiler/tests/oom_test.py         |   61 +
 tensorflow/compiler/tf2xla/lib/while_loop.cc  |    1 -
 .../compiler/xla/executable_run_options.cc    |    7 +
 .../compiler/xla/executable_run_options.h     |    4 +
 .../xla/legacy_flags/debug_options_flags.cc   |    7 +
 tensorflow/compiler/xla/service/BUILD         |   15 +
 tensorflow/compiler/xla/service/call_graph.cc |    6 +-
 tensorflow/compiler/xla/service/call_graph.h  |    2 +-
 tensorflow/compiler/xla/service/cpu/BUILD     |   23 +
 .../compiler/xla/service/cpu/cpu_runtime.cc   |    8 +
 .../compiler/xla/service/cpu/cpu_runtime.h    |    4 +
 .../xla/service/cpu/cpu_runtime_test.cc       |   84 +-
 .../xla/service/cpu/dot_op_emitter.cc         |   23 +-
 .../xla/service/cpu/runtime_matmul_mkl.cc     |  128 ++
 .../xla/service/cpu/runtime_matmul_mkl.h      |   84 ++
 .../xla/service/cpu/simple_orc_jit.cc         |    5 +
 .../compiler/xla/service/despecializer.cc     |   35 +
 .../compiler/xla/service/despecializer.h      |   45 +
 .../xla/service/flatten_call_graph.cc         |    2 +-
 .../compiler/xla/service/hlo_evaluator.cc     |   71 +-
 tensorflow/compiler/xla/service/service.h     |    4 -
 tensorflow/compiler/xla/tests/test_utils.cc   |   68 +-
 tensorflow/compiler/xla/xla.proto             |    3 +
 tensorflow/contrib/autograph/converters/BUILD |   12 +-
 .../autograph/converters/break_statements.py  |    7 +-
 .../autograph/converters/control_flow.py      |   76 +-
 .../autograph/converters/control_flow_test.py |   72 +
 .../converters/converter_test_base.py         |    2 +
 .../contrib/autograph/converters/for_loops.py |   92 --
 .../autograph/converters/for_loops_test.py    |   70 -
 tensorflow/contrib/autograph/impl/api_test.py |   11 +-
 .../contrib/autograph/impl/conversion.py      |    3 -
 tensorflow/contrib/autograph/operators/BUILD  |   17 +-
 .../contrib/autograph/operators/__init__.py   |    5 +
 .../autograph/operators/control_flow.py       |  179 +++
 .../autograph/operators/control_flow_test.py  |   82 ++
 tensorflow/contrib/autograph/pyct/ast_util.py |    2 +-
 .../contrib/autograph/pyct/inspect_utils.py   |    6 +
 .../autograph/pyct/inspect_utils_test.py      |    9 +
 .../contrib/autograph/pyct/qual_names.py      |   23 +
 .../contrib/autograph/pyct/qual_names_test.py |   15 +
 .../pyct/static_analysis/activity.py          |   18 +-
 .../pyct/static_analysis/activity_test.py     |    2 +
 .../autograph/pyct/static_analysis/annos.py   |    1 +
 .../contrib/autograph/utils/__init__.py       |    3 -
 .../contrib/autograph/utils/builtins.py       |   68 -
 .../autograph/utils/multiple_dispatch.py      |   41 -
 .../autograph/utils/multiple_dispatch_test.py |   23 -
 .../python/training/tpu_cluster_resolver.py   |   75 +-
 .../training/tpu_cluster_resolver_test.py     |    8 +-
 tensorflow/contrib/cudnn_rnn/BUILD            |    1 +
 .../python/kernel_tests/cudnn_rnn_test.py     |  151 +-
 .../cudnn_rnn/python/layers/cudnn_rnn.py      |   20 +
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |   75 +-
 .../kernel_tests/prefetching_ops_test.py      |   38 +
 .../kernel_tests/reader_dataset_ops_test.py   |   20 +-
 .../data/python/ops/prefetching_ops.py        |  114 +-
 tensorflow/contrib/data/python/ops/readers.py |   11 +-
 tensorflow/contrib/distribute/README.md       |    6 +-
 .../python/estimator_integration_test.py      |    2 +-
 .../distribute/python/prefetching_ops_v2.py   |    4 +-
 tensorflow/contrib/distributions/BUILD        |   10 +
 tensorflow/contrib/distributions/__init__.py  |    2 +
 .../python/kernel_tests/batch_reshape_test.py |    4 +-
 .../python/kernel_tests/seed_stream_test.py   |   70 +
 .../distributions/python/ops/batch_reshape.py |    5 +-
 .../distributions/python/ops/seed_stream.py   |  228 +++
 .../python/ops/statistical_testing.py         |  111 +-
 tensorflow/contrib/eager/python/datasets.py   |    7 +
 .../contrib/eager/python/datasets_test.py     |   13 +
 .../estimator/python/estimator/extenders.py   |    5 +-
 .../estimator/python/estimator/head.py        |   11 +-
 tensorflow/contrib/integrate/__init__.py      |    1 +
 .../contrib/kfac/python/ops/estimator.py      |   11 +-
 .../kfac/python/ops/layer_collection.py       |  123 +-
 .../contrib/kfac/python/ops/optimizer.py      |   10 +-
 .../python/kernel_tests/sdca_ops_test.py      |   54 +
 .../linear_optimizer/python/ops/sdca_ops.py   |    7 +-
 .../linear_optimizer/python/sdca_optimizer.py |    9 +-
 tensorflow/contrib/lite/java/BUILD            |   39 +
 tensorflow/contrib/lite/java/ovic/README.md   |   83 ++
 .../tensorflow/ovic/OvicClassifierTest.java   |   35 +-
 .../contrib/lite/kernels/concatenation.cc     |   22 +-
 .../lite/kernels/concatenation_test.cc        |   68 +
 .../internal/optimized/optimized_ops.h        |   56 +
 .../internal/reference/reference_ops.h        |   55 +
 .../contrib/lite/kernels/internal/tensor.h    |   23 +
 tensorflow/contrib/lite/model.cc              |   26 +-
 tensorflow/contrib/lite/model.h               |   26 +-
 .../lite/toco/allocate_transient_arrays.cc    |   36 +-
 tensorflow/contrib/lite/toco/args.h           |    1 +
 .../contrib/lite/toco/export_tensorflow.cc    |    4 +
 .../graph_transformations/hardcode_min_max.cc |   45 +-
 .../make_initial_dequantize_operator.cc       |   14 +-
 .../toco/graph_transformations/quantize.cc    |    3 +-
 .../contrib/lite/toco/model_cmdline_flags.cc  |   14 +-
 .../contrib/lite/toco/model_flags.proto       |    6 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   26 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |   23 +-
 tensorflow/contrib/lite/toco/tooling_util.h   |    2 +-
 .../rnn/python/kernel_tests/rnn_cell_test.py  |    8 +-
 .../python/framework/fake_summary_writer.py   |    6 +
 .../tpu/profiler/capture_tpu_profile.cc       |   68 +-
 .../contrib/tpu/profiler/dump_tpu_profile.cc  |   75 +-
 .../contrib/tpu/profiler/dump_tpu_profile.h   |    1 +
 .../contrib/tpu/profiler/tpu_profiler.proto   |   22 +-
 .../distributed_runtime/master_session.cc     |   24 +-
 .../rpc/grpc_master_service.cc                |   26 +-
 .../rpc/grpc_master_service.h                 |    7 +-
 .../rpc/grpc_server_lib.cc                    |    3 +-
 .../grappler/costs/op_level_cost_estimator.cc |   15 +-
 .../grappler/costs/op_level_cost_estimator.h  |    5 +
 .../optimizers/arithmetic_optimizer.cc        |   48 +-
 .../optimizers/arithmetic_optimizer.h         |    2 +-
 .../grappler/optimizers/constant_folding.cc   |   53 +-
 .../optimizers/constant_folding_test.cc       |   42 +
 .../grappler/optimizers/function_optimizer.cc |   24 +-
 .../grappler/optimizers/function_optimizer.h  |    5 +-
 .../optimizers/function_optimizer_test.cc     |    8 +-
 .../optimizers/graph_optimizer_stage.cc       |    4 +
 tensorflow/core/kernels/BUILD                 |   11 +
 .../core/kernels/crop_and_resize_op_test.cc   |    9 +-
 tensorflow/core/kernels/cudnn_rnn_ops.cc      |   46 +-
 tensorflow/core/kernels/decode_image_op.cc    |    6 +-
 .../core/kernels/dynamic_partition_op_test.cc |    5 +-
 .../core/kernels/dynamic_stitch_op_test.cc    |   27 +-
 tensorflow/core/kernels/gather_op_test.cc     |    3 +-
 .../kernels/non_max_suppression_op_test.cc    |    9 +-
 .../quantize_and_dequantize_op_test.cc        |    9 +-
 ...ote_fused_graph_rewriter_transform_test.cc |    3 +-
 .../core/kernels/resize_bicubic_op_test.cc    |    6 +-
 .../core/kernels/resize_bilinear_op_test.cc   |   18 +-
 tensorflow/core/kernels/roll_op_test.cc       |   18 +-
 tensorflow/core/kernels/scatter_nd_op_test.cc |   27 +-
 tensorflow/core/kernels/scatter_op_test.cc    |   18 +-
 tensorflow/core/kernels/sdca_internal.cc      |    5 +
 tensorflow/core/kernels/sdca_internal.h       |    7 +-
 tensorflow/core/kernels/sdca_ops.cc           |    6 +-
 tensorflow/core/kernels/shape_op_test.cc      |    5 +-
 tensorflow/core/kernels/softmax_op.cc         |    3 +-
 tensorflow/core/kernels/softmax_op_gpu.cu.cc  |    3 +-
 .../sparse_dense_binary_op_shared_test.cc     |    3 +-
 tensorflow/core/kernels/summary_op_test.cc    |   13 +-
 tensorflow/core/platform/macros.h             |   17 +-
 .../g3doc/profile_model_architecture.md       |   32 +-
 tensorflow/docs_src/mobile/tflite/devguide.md |    9 +-
 tensorflow/docs_src/performance/leftnav_files |    1 +
 .../examples/image_retraining/retrain.py      |    7 +-
 tensorflow/python/client/tf_session_helper.cc |    9 -
 tensorflow/python/client/tf_session_helper.h  |    7 -
 tensorflow/python/eager/benchmarks_test.py    |   23 +-
 tensorflow/python/eager/function.py           |    4 +-
 tensorflow/python/eager/graph_callable.py     |    2 +-
 tensorflow/python/framework/c_api_util.py     |   26 +
 tensorflow/python/framework/function.py       |   10 +-
 tensorflow/python/framework/function_test.py  |   32 +-
 tensorflow/python/framework/importer.py       |   14 +-
 tensorflow/python/framework/ops.py            |  182 ++-
 tensorflow/python/framework/tensor_util.py    |    3 +-
 .../_impl/keras/applications/mobilenet.py     |  222 +--
 .../_impl/keras/applications/resnet50.py      |    5 +-
 .../keras/_impl/keras/layers/convolutional.py |  195 +++
 .../keras/layers/convolutional_recurrent.py   | 1222 ++++++++++++-----
 .../layers/convolutional_recurrent_test.py    |    1 +
 .../_impl/keras/layers/convolutional_test.py  |   38 +
 .../keras/_impl/keras/layers/recurrent.py     |  137 +-
 .../_impl/keras/layers/recurrent_test.py      |   16 +-
 tensorflow/python/keras/layers/__init__.py    |    1 +
 tensorflow/python/ops/array_ops.py            |   23 +
 tensorflow/python/ops/math_ops.py             |    5 +-
 .../python/ops/resource_variable_ops.py       |    7 +-
 tensorflow/python/pywrap_tfe.i                |    6 +-
 .../training/basic_session_run_hooks.py       |    3 +
 .../training/basic_session_run_hooks_test.py  |   13 +
 tensorflow/python/training/distribute.py      |    5 +-
 tensorflow/stream_executor/cuda/cuda_blas.cc  |   18 -
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  216 ++-
 tensorflow/stream_executor/cuda/cuda_dnn.h    |   32 +-
 tensorflow/stream_executor/cuda/cuda_timer.h  |    7 +
 tensorflow/stream_executor/dnn.cc             |    4 +
 tensorflow/stream_executor/dnn.h              |   22 +-
 tensorflow/stream_executor/stream.cc          |   36 +-
 tensorflow/stream_executor/stream.h           |   18 +-
 .../stream_executor/stream_executor_pimpl.cc  |   14 +-
 .../stream_executor/stream_executor_pimpl.h   |   11 +-
 ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt |  114 +-
 ...flow.keras.layers.-depthwise-conv2-d.pbtxt |  187 +++
 .../tensorflow.keras.layers.-g-r-u-cell.pbtxt |    2 +-
 .../tensorflow.keras.layers.-g-r-u.pbtxt      |    6 +-
 .../api/golden/tensorflow.keras.layers.pbtxt  |    4 +
 .../windows/cpu/pip/build_tf_windows.sh       |   12 +
 tensorflow/workspace.bzl                      |    8 +-
 third_party/png.BUILD                         |   12 +
 206 files changed, 5394 insertions(+), 1928 deletions(-)
 create mode 100644 tensorflow/compiler/tests/oom_test.py
 create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
 create mode 100644 tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
 create mode 100644 tensorflow/compiler/xla/service/despecializer.cc
 create mode 100644 tensorflow/compiler/xla/service/despecializer.h
 delete mode 100644 tensorflow/contrib/autograph/converters/for_loops.py
 delete mode 100644 tensorflow/contrib/autograph/converters/for_loops_test.py
 create mode 100644 tensorflow/contrib/autograph/operators/control_flow.py
 create mode 100644 tensorflow/contrib/autograph/operators/control_flow_test.py
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/seed_stream.py
 create mode 100644 tensorflow/contrib/lite/java/ovic/README.md
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index bea9378571..e82a546092 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -56,57 +56,6 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
   }
 }
 
-void TF_InitializeTPU(TF_Session* session, TF_Status* status) {
-  VLOG(1) << "Initializing TPU";
-  TF_Operation* config_op =
-      TF_GraphOperationByName(session->graph, "ConfigureDistributedTPU");
-  if (config_op == nullptr) {
-    status->status = tensorflow::errors::Internal(
-        "Unable to find node ConfigureDistributedTPU in the TF graph.");
-    return;
-  }
-
-  TF_Output config_node{config_op, 0};
-
-  TF_Tensor* dummy_output;
-  TF_SessionRun(session, /*run_options*/ nullptr,
-                // input related parameters
-                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
-                // output related parameters
-                /*outputs*/ &config_node, /*output_values*/ &dummy_output,
-                /*noutputs*/ 1,
-                /*targets*/ nullptr, /*ntargets*/ 0,
-                /*run_metadata*/ nullptr, status);
-  if (status->status.ok()) {
-    TF_DeleteTensor(dummy_output);
-  }
-}
-
-void TF_ShutdownTPU(TF_Session* session, TF_Status* status) {
-  {
-    tensorflow::mutex_lock c(session->graph->mu);
-    VLOG(1) << "Shutting down TPU, with input graph: "
-            << session->graph->graph.ToGraphDefDebug().DebugString();
-  }
-
-  TF_Operation* shutdown_op =
-      TF_GraphOperationByName(session->graph, "ShutdownDistributedTPU");
-  if (shutdown_op == nullptr) {
-    status->status = tensorflow::errors::Internal(
-        "Unable to find node ShutdownDistributedTPU in the TF graph.");
-    return;
-  }
-
-  TF_SessionRun(session, /*run_options*/ nullptr,
-                // input related parameters
-                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
-                // output related parameters
-                /*outputs*/ nullptr, /*output_values*/ nullptr,
-                /*noutputs*/ 0,
-                /*targets*/ &shutdown_op, /*ntargets*/ 1,
-                /*run_metadata*/ nullptr, status);
-}
-
 const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   tensorflow::mutex_lock c(graph->mu);
   const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString();
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index ebcec8176b..666342974e 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -60,27 +60,6 @@ extern "C" {
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
-// Initializes TPU system. Must be called exactly once before TF_SessionRun() is
-// called on a TPU graph.
-//
-// The session graph must contain a node named ConfigureDistributedTPU.
-// TODO(b/74774824): Improve the API on initializing TPU system.
-TF_CAPI_EXPORT extern void TF_InitializeTPU(TF_Session* session,
-                                            TF_Status* status);
-
-// Shuts down TPU system. For any `session` where TF_InitializeTPU() has
-// been successfully called, this call must be made exactly once before the
-// session is closed.
-// The session graph must contain a node named ShutdownDistributedTPU.
-TF_CAPI_EXPORT extern void TF_ShutdownTPU(TF_Session* session,
-                                          TF_Status* status);
-
-// Returns the graph content in a human-readable format, with length set in
-// `len`. The format is subject to change in the future.
-// The returned string is heap-allocated, and caller should call free() on it.
-TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
-                                                      size_t* len);
-
 // Returns the graph content in a human-readable format, with length set in
 // `len`. The format is subject to change in the future.
 // The returned string is heap-allocated, and caller should call free() on it.
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index 00799526fc..cf65fe1ab9 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -9,6 +9,9 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
+    tags = [
+        "noguitar",  # b/77649654
+    ],
     deps = [
         ":profiler",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 24aa203c00..a492fc6b9b 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -204,14 +204,14 @@ cc_library(
         ":common",
         ":xla_compilation_cache",
         ":xla_tensor",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 2d6511a45b..f48941fce3 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -155,6 +155,9 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
   options.device_allocator = xla_allocator;
+  // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
+  // is restricted to Variables, but we need something like this to apply to
+  // normal Tensors too.
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -179,8 +182,10 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   run_options.set_stream(stream);
   run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
+
   auto run_result = executable->Run(launch_context.arguments(), run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 3b631d6f4e..386240ff8d 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -732,11 +732,15 @@ Status MarkForCompilationPass::RunImpl(
     }
   }
 
-  // Count the number of elements in each cluster.
-  std::vector<int> cluster_sizes(graph->num_node_ids());
+  // Count the number of non-trivial elements in each cluster.
+  std::vector<int> effective_cluster_sizes(graph->num_node_ids());
   for (const Node* n : compilation_candidates) {
     int cluster = clusters[n->id()].Get().representative;
-    cluster_sizes[cluster]++;
+    // Identity nodes will be removed if the node gets marked for compilation.
+    // Therefore we don't want to count them towards the effective cluster size.
+    if (n->def().op() != "Identity") {
+      effective_cluster_sizes[cluster]++;
+    }
   }
 
   // Names for each cluster.
@@ -769,9 +773,12 @@ Status MarkForCompilationPass::RunImpl(
     const XlaOpRegistry::DeviceRegistration* registration;
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
-    // Or compile if this is a cluster of >= min_cluster_size compilable
-    // operators.
-    if (cluster_sizes[cluster] >= min_cluster_size || marked_for_compilation ||
+    // Compile if this is a cluster of >= min_cluster_size compilable operators.
+    // Also, always compile if the operator is placed on a device that requires
+    // compilation, or if it contains at least one op that is marked for
+    // compilation that is not an Identity op.
+    if (effective_cluster_sizes[cluster] >= min_cluster_size ||
+        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) ||
         registration->requires_compilation) {
       string& name = cluster_names[cluster];
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 2e362e0a63..80edaf28b8 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 
 #include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -575,5 +577,37 @@ TEST(XlaCompilationTest, Retval) {
   EXPECT_EQ(clusters["A"], clusters["B"]);
 }
 
+TEST(XlaCompilationTest, DontCountIdentityOps) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    auto c = ops::Identity(root.WithOpName("C"), b);
+    auto r = ops::_Retval(root.WithOpName("R"), c, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    b.node()->AddAttr(kXlaCompileAttr, true);
+    auto r = ops::_Retval(root.WithOpName("R"), b, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 5c0c79b880..be1043d8c3 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -52,13 +52,14 @@ class XlaCompilationCache : public ResourceBase {
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `constant_args` is a maps of tensorflow argument number to constant value.
+  // `constant_args` is a map of tensorflow argument number to its constant
+  //  value.
   // `variable_args` is a snapshot of the current values of the
   // resource variable arguments to `function`; uninitialized variables are
   // represented by an absent OptionalTensor.
   // The result of compilation is written to `*compilation_result`, which must
   // be non-null. If `executable` is non-null, also builds an
-  // xla::LocalExecutable and sets `executable to point to it. The resulting
+  // xla::LocalExecutable and sets `executable` to point to it. The resulting
   // executable pointer may be null if the computation has no non-constant
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
@@ -96,6 +97,7 @@ class XlaCompilationCache : public ResourceBase {
                      xla::LocalExecutable** executable,
                      const XlaCompiler::CompileOptions* compile_options,
                      bool compile_single_op);
+
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
   // XLA computation already, and generates an XLA LocalExecutable `executable`.
   Status BuildExecutable(const XlaCompiler::Options& options,
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 6a57831cde..43eb164012 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
 
@@ -53,8 +54,33 @@ XlaTransferManager::XlaTransferManager(se::Stream* stream,
                                        bool transfer_as_literal)
     : stream_(stream),
       client_(client),
+      transfer_manager_(client->backend().transfer_manager()),
       transfer_as_literal_(transfer_as_literal) {}
 
+Status XlaTransferManager::TransferLiteralToDevice(
+    const Tensor& host_tensor, Tensor* device_tensor) const {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(HostTensorToLiteral(host_tensor, &literal));
+  VLOG(1) << "Transfer to device as literal: " << literal.ToString();
+
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+  return transfer_manager_->TransferLiteralToDevice(stream_->parent(), literal,
+                                                    shaped_buffer);
+}
+
+Status XlaTransferManager::TransferLiteralFromDevice(
+    Tensor* host_tensor, const Tensor& device_tensor) const {
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
+                      transfer_manager_->TransferLiteralFromDevice(
+                          stream_->parent(), shaped_buffer));
+  VLOG(1) << "Transfer from device as literal: " << literal->ToString();
+  return LiteralToHostTensor(*literal, host_tensor->dtype(), host_tensor);
+}
+
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
                                                Tensor* device_tensor,
@@ -86,9 +112,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
         XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     Status status;
     if (transfer_as_literal_) {
-      status = xla::Unimplemented(
-          "XlaTransferManager::CopyCPUTensorToDevice not implemented for "
-          "literals");
+      status = TransferLiteralToDevice(*cpu_tensor, device_tensor);
     } else {
       stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
@@ -129,9 +153,7 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
 
     Status status;
     if (transfer_as_literal_) {
-      status = xla::Unimplemented(
-          "XlaTransferManager::CopyDeviceTensorToCPU not implemented for "
-          "literals");
+      status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
     } else {
       stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index a8ad511fbd..ad914a1c23 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -57,11 +57,18 @@ class XlaTransferManager {
   perftools::gputools::Stream* stream() const { return stream_; }
 
  private:
+  Status TransferLiteralToDevice(const Tensor& host_tensor,
+                                 Tensor* device_tensor) const;
+  Status TransferLiteralFromDevice(Tensor* host_tensor,
+                                   const Tensor& device_tensor) const;
+
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
   perftools::gputools::Stream* stream_;
   // For the underlying memory allocator and XLA's TransferManager.
   xla::LocalClient* client_;
+  // Transfer manager, for marshalling data to and from the device.
+  xla::TransferManager* transfer_manager_;
   // True if we must use XLA's TransferManager for correct device transfers.
   bool transfer_as_literal_;
 };
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 354be1e1b5..50b0061d69 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -16,12 +16,14 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
@@ -165,6 +167,8 @@ void XlaComputationLaunchContext::PopulateOutputs(
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
+    VLOG(2) << "Result tuple shape (on device): "
+            << output->on_device_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
@@ -179,6 +183,10 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const size_t total_bytes = const_tensor.TotalBytes();
       if (stream && total_bytes > 0) {
         // Copy host -> device. (Empty tensors don't have backing buffers.)
+        // Manually allocate memory using an XlaTensorBuffer so we can allocate
+        // as much memory as the device requires (as given by
+        // GetByteSizeRequirement). This avoids XlaTransferManager having to
+        // reallocate the device buffer later.
         VLOG(1) << "Constant output tensor on device";
 
         OP_REQUIRES_OK(
@@ -189,15 +197,23 @@ void XlaComputationLaunchContext::PopulateOutputs(
                                   client_, stream->parent()->device_ordinal()));
         }
 
-        const void* src_ptr = DMAHelper::base(&const_tensor);
-        gpu::DeviceMemoryBase dst_ptr =
-            XlaTensor::DeviceMemoryFromTensor(*output_tensor);
-        // Memcpying asynchronously is safe for the GPU, but the CPU uses a
-        // shared allocator so hold a reference to the copied-to buffer until
-        // complete.
-        TensorReference ref(*output_tensor);
-        stream->ThenMemcpy(&dst_ptr, src_ptr, total_bytes);
-        stream->ThenDoHostCallback([ref] { ref.Unref(); });
+        Device* device = dynamic_cast<Device*>(ctx->device());
+        OP_REQUIRES(ctx, device != nullptr,
+                    errors::Internal("DeviceBase was not a Device."));
+        ctx->op_device_context()->CopyCPUTensorToDevice(
+            &const_tensor, device, output_tensor,
+            [&](Status status) { TF_CHECK_OK(status); });
+
+        if (device->device_type() == DEVICE_GPU) {
+          // The GPUDeviceContext enqueues the host->device transfer in a
+          // separate stream from the main compute stream. We must ensure the
+          // compute stream is synchronized with the host->device transfer
+          // stream now otherwise we will create a race condition.
+          auto* gpu_device_context =
+              static_cast<GPUDeviceContext*>(ctx->op_device_context());
+          gpu_device_context->stream()->ThenWaitFor(
+              gpu_device_context->host_to_device_stream());
+        }
       } else {
         // No copy required.
         ctx->set_output(i, const_tensor);
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index edabdc218a..e345c1266a 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -191,6 +191,26 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "oom_test",
+    size = "medium",
+    srcs = ["oom_test.py"],
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:array_ops_gen",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradient_checker",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "conv2d_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index f9d87c2d1c..1f7da659e5 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.contrib.compiler import jit
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -38,6 +39,18 @@ from tensorflow.python.platform import test
 jit_scope = jit.experimental_jit_scope
 
 
+# Disable rewrites to make sure we don't end up having to update this test
+# whenever we implement new ones.
+def NoRewriteSessionConfig():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      function_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
 def CompiledKernel(fn, *inputs, **kwargs):
   """Execute 'fn' as a compiled XLA kernel, with 'inputs'."""
   name = kwargs.pop("name", None)
@@ -81,7 +94,7 @@ class JitLaunchTest(test.TestCase):
   # actually ran. However, it is sometimes possible for _XlaLaunch ops to be
   # constant-folded away, so the check is optional.
   def _compare(self, fn, args, require_kernel_launch=True, noinline=None):
-    with session_lib.Session() as sess:
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
       placeholders = []
       feeds = {}
       for arg in args:
@@ -258,7 +271,7 @@ class XlaCompilationTest(test.TestCase):
   def testReshape(self):
     """Tests an operator with compile-time constant and non-constant inputs."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -282,7 +295,7 @@ class XlaCompilationTest(test.TestCase):
   def testIgnoredArguments(self):
     """Tests that JIT computations can ignore formal parameters."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.int32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -306,7 +319,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoops(self):
     """Tests that compilation accepts computations containing loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         c = lambda i, _: math_ops.less(i, 5)
@@ -324,7 +337,7 @@ class XlaCompilationTest(test.TestCase):
   def testCond(self):
     """Tests that compilation handles switch operators."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.float32)
       c = array_ops.placeholder(dtypes.bool)
@@ -365,7 +378,8 @@ class XlaCompilationTest(test.TestCase):
       inp = array_ops.placeholder(dtypes.float32)
       out = Entry(inp)
 
-    with self.test_session(graph=g, use_gpu=True) as sess:
+    with self.test_session(
+        config=NoRewriteSessionConfig(), graph=g, use_gpu=True) as sess:
       run_metadata = config_pb2.RunMetadata()
       val = sess.run(out,
                      feed_dict={inp: [2., 10.]},
@@ -377,7 +391,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoopDeadlock(self):
     """Regression test for bug that caused deadlocks in graphs with loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         y = x + 1.0
@@ -404,10 +418,10 @@ class XlaCompilationTest(test.TestCase):
         y = Forward(x)
         dx, = gradients_impl.gradients(y, [x], 1.0)
 
-      cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-          optimizer_options=config_pb2.OptimizerOptions(
-              opt_level=config_pb2.OptimizerOptions.L1,
-              do_function_inlining=True)))
+      cfg = NoRewriteSessionConfig()
+      cfg.graph_options.optimizer_options.opt_level = (
+          config_pb2.OptimizerOptions.L1)
+      cfg.graph_options.optimizer_options.do_function_inlining = True
       with session_lib.Session(graph=g, config=cfg) as sess:
         run_metadata = config_pb2.RunMetadata()
         dx_val = sess.run(dx,
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
new file mode 100644
index 0000000000..1434e965e3
--- /dev/null
+++ b/tensorflow/compiler/tests/oom_test.py
@@ -0,0 +1,61 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for out-of-memory conditions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class OutOfMemoryTest(xla_test.XLATestCase):
+
+  def testOutputOutOfMemory(self):
+    """Allocates tensors until out of memory.
+
+    Generates a large rank-1 tensor. The tensor is an output of an XLA
+    computation, not constant.
+
+    Check that a ResourceExhaustedError is raised and can be caught.
+
+    We spin in a loop generating larger and larger tensors until an OOM event
+    happens. We may be running sandboxed, so have a small host memory limit, so
+    any hardcoded value is unlikely to land in the sweet spot between device
+    memory size and host memory size with stability.
+    """
+
+    def test_loop():
+      size = 2e8
+      while True:
+        with self.test_session():
+          # Force the compiled code to not be constant by feeding in an addend.
+          p = array_ops.placeholder(dtypes.float32, shape=[])
+          with self.test_scope():
+            # Create a large R1 tensor.
+            c = array_ops.zeros([size, 1]) + p
+
+            c.eval(feed_dict={p: 1.0})
+            size *= 2
+
+    self.assertRaises(errors.ResourceExhaustedError, test_loop)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
index 86c02ac2e6..495d9c6078 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -54,7 +54,6 @@ xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaWhileLoop(
         auto result,
         condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
                            cond_builder.get()));
-    TF_RETURN_IF_ERROR(cond_builder->SetReturnValue(result));
   }
   TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
 
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 392ad9010a..1700c97718 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -87,4 +87,11 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
+ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) {
+  rng_seed_ = rng_seed;
+  return *this;
+}
+
+int ExecutableRunOptions::rng_seed() const { return rng_seed_; }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index d4fcbf0493..2c1d9ffff1 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -84,6 +84,9 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
+  ExecutableRunOptions& set_rng_seed(int rng_seed);
+  int rng_seed() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -92,6 +95,7 @@ class ExecutableRunOptions {
   tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
+  int rng_seed_ = 0;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index c8ed3e3a2b..f037663e3f 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,6 +40,9 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
+#ifdef INTEL_MKL
+  flags->set_xla_cpu_use_mkl_dnn(true);
+#endif  // INTEL_MKL
 
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
@@ -288,6 +291,10 @@ void AllocateFlags() {
           flag_values->xla_gpu_use_cudnn_batchnorm(),
           "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
           "rather than expanding them to a soup of HLOs."),
+      tensorflow::Flag("xla_cpu_use_mkl_dnn",
+                       bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
+                       flag_values->xla_cpu_use_mkl_dnn(),
+                       "Generate calls to MKL-DNN in the CPU backend."),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 3a99d84bea..db91e80407 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2639,6 +2639,21 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "despecializer",
+    srcs = ["despecializer.cc"],
+    hdrs = ["despecializer.h"],
+    deps = [
+        ":bfloat16_normalization",
+        ":defuser",
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:statusor",
+    ],
+)
+
 cc_library(
     name = "source_map_util",
     srcs = ["source_map_util.cc"],
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 13eb02ca01..a8053d15e1 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -51,8 +51,8 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
   return out;
 }
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction) {
-  switch (instruction->opcode()) {
+CallContext GetInstructionCallContext(HloOpcode opcode) {
+  switch (opcode) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
@@ -101,7 +101,7 @@ void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
 
 void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
   CHECK_EQ(instruction->parent(), computation());
-  const CallContext context = GetInstructionCallContext(instruction);
+  const CallContext context = GetInstructionCallContext(instruction->opcode());
   if (!instruction->called_computations().empty()) {
     CHECK(context == CallContext::kSequential ||
           context == CallContext::kParallel);
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 688c4085df..97d3811508 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -53,7 +53,7 @@ enum class CallContext {
 string CallContextToString(CallContext context);
 std::ostream& operator<<(std::ostream& out, const CallContext& context);
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction);
+CallContext GetInstructionCallContext(HloOpcode opcode);
 
 // Represents an HLO instruction which calls one or more computations.
 class CallSite {
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 966e2d0fc5..246b802861 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -18,6 +18,10 @@ load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -170,6 +174,7 @@ cc_library(
         ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
@@ -538,6 +543,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_matmul_mkl",
+    srcs = ["runtime_matmul_mkl.cc"],
+    hdrs = ["runtime_matmul_mkl.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:framework_lite",
+        "//third_party/eigen3",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
+    ]),
+)
+
 cc_library(
     name = "runtime_single_threaded_conv2d",
     srcs = [
@@ -584,10 +605,12 @@ cc_library(
 tf_cc_test(
     name = "cpu_runtime_test",
     srcs = ["cpu_runtime_test.cc"],
+    shard_count = 10,
     tags = ["optonly"],
     deps = [
         ":cpu_runtime",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_matmul",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 9a3bd68c80..872b0be1f8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -37,6 +37,14 @@ extern const char* const kEigenMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenMatMulF32";
 extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
+extern const char* const kMKLMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF32";
+extern const char* const kMKLMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF64";
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF32";
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF64";
 extern const char* const kEigenConvF16SymbolName =
     "__xla_cpu_runtime_EigenConvF16";
 extern const char* const kEigenConvF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index e61d6ea28b..e392e231b4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -44,6 +44,10 @@ namespace runtime {
 extern const char* const kEigenMatMulF16SymbolName;
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
+extern const char* const kMKLMatMulF32SymbolName;
+extern const char* const kMKLMatMulF64SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f385829cdf..2ac950e6d9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
@@ -130,25 +131,23 @@ MatMulShape MatMulShapes[] = {
 // * transpose_lhs
 // * transpose_rhs
 // * single_threaded
-using EigenMatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
+using MatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
 
-class EigenMatMulTest
-    : public CpuRuntimeTest,
-      public ::testing::WithParamInterface<EigenMatMulTestParam> {
+class EigenMatMulTest : public CpuRuntimeTest,
+                        public ::testing::WithParamInterface<MatMulTestParam> {
  public:
-  static string Name(
-      const ::testing::TestParamInfo<EigenMatMulTestParam>& info) {
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
     MatMulShape shape = std::get<0>(info.param);
     bool transpose_lhs = std::get<1>(info.param);
     bool transpose_rhs = std::get<2>(info.param);
     bool single_threaded = std::get<3>(info.param);
 
     return tensorflow::strings::Printf(
-        "MatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
         transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
         single_threaded ? "single" : "multi");
   }
-};  // namespace xla
+};
 
 TEST_P(EigenMatMulTest, DoIt) {
   MatMulShape shape = std::get<0>(GetParam());
@@ -169,5 +168,74 @@ INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
                                            ::testing::Bool()),
                         EigenMatMulTest::Name);
 
+#ifdef INTEL_MKL
+class MKLMatMulTest : public CpuRuntimeTest,
+                      public ::testing::WithParamInterface<MatMulTestParam> {
+ public:
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
+    MatMulShape shape = std::get<0>(info.param);
+    bool transpose_lhs = std::get<1>(info.param);
+    bool transpose_rhs = std::get<2>(info.param);
+    bool single_threaded = std::get<3>(info.param);
+
+    return tensorflow::strings::Printf(
+        "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
+        single_threaded ? "single" : "multi");
+  }
+};
+
+std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
+                                                  const Array2D<float>& b,
+                                                  bool transpose_lhs,
+                                                  bool transpose_rhs,
+                                                  bool single_threaded) {
+  CHECK_EQ(a.width(), b.height());
+  int64 m = a.height();
+  int64 n = b.width();
+  int64 k = a.width();
+
+  // The MKL matmul runtime function expects the matrix to be in column major
+  // order and array2d is in row-major order. Create transposes of a and b. The
+  // 'data' buffer in the transposed array is the original array in column major
+  // order.
+  auto a_transpose = MaybeTransposeArray2D(a, !transpose_lhs);
+  auto b_transpose = MaybeTransposeArray2D(b, !transpose_rhs);
+
+  // Since we're going to transpose c before returning it, swap the order of the
+  // dimension sizes to ensure the returned array is properly dimensioned.
+  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  if (single_threaded) {
+    __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+        nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
+        m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    __xla_cpu_runtime_MKLMatMulF32(nullptr, c_transpose->data(),
+                                   a_transpose->data(), b_transpose->data(), m,
+                                   n, k, transpose_lhs, transpose_rhs);
+  }
+  return MaybeTransposeArray2D(*c_transpose, true);
+}
+
+TEST_P(MKLMatMulTest, DoIt) {
+  MatMulShape shape = std::get<0>(GetParam());
+  bool transpose_lhs = std::get<1>(GetParam());
+  bool transpose_rhs = std::get<2>(GetParam());
+  bool single_threaded = std::get<3>(GetParam());
+
+  auto a = MakeLinspaceArray2D(0.0, 1.0, shape.m, shape.k);
+  auto b = MakeLinspaceArray2D(-2.0, 2.0, shape.k, shape.n);
+  auto c =
+      MKLMatrixMultiply(*a, *b, transpose_lhs, transpose_rhs, single_threaded);
+  CheckMatrixMultiply(*a, *b, *c);
+}
+
+INSTANTIATE_TEST_CASE_P(MKLMatMulTestInstantiaion, MKLMatMulTest,
+                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        MKLMatMulTest::Name);
+#endif  // INTEL_MKL
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 8b1e20d79e..29afd8ea5f 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -918,28 +918,35 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded_eigen =
+  bool multi_threaded =
       hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
     case F16:
-      fn_name = multi_threaded_eigen
+      fn_name = multi_threaded
                     ? runtime::kEigenMatMulF16SymbolName
                     : runtime::kEigenSingleThreadedMatMulF16SymbolName;
       float_type = ir_builder_->getHalfTy();
       break;
     case F32:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF32SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF32SymbolName
+                                   : runtime::kEigenMatMulF32SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF32SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF32SymbolName);
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF64SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF64SymbolName
+                                   : runtime::kEigenMatMulF64SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF64SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF64SymbolName);
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
new file mode 100644
index 0000000000..92da5f71c2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+#include "third_party/intel_mkl_ml/include/mkl_service.h"
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/core/platform/types.h"
+
+#define EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+namespace {
+// BLAS GEMM API for 32-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF32(const void* run_options_ptr, float* out, float* lhs, float* rhs,
+               int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For column-major matrices, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_sgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+// BLAS GEMM API for 64-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF64(const void* run_options_ptr, double* out, double* lhs,
+               double* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For a column-major matrix, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_dgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+}  // namespace
+
+void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out,
+                                    float* lhs, float* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+// BLAS GEMM API for 64-bit Matrix Multiplication
+void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out,
+                                    double* lhs, double* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr,
+                                                  float* out, float* lhs,
+                                                  float* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr,
+                                                  double* out, double* lhs,
+                                                  double* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+#endif  // INTEL_MKL
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
new file mode 100644
index 0000000000..831b796efb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+
+#include <iostream>
+#include "tensorflow/core/platform/types.h"
+#ifdef INTEL_MKL
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
+#else
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 4198260a22..b7ce5bbe47 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
@@ -183,6 +184,10 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
new file mode 100644
index 0000000000..d938f3a2c4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/despecializer.h"
+
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/defuser.h"
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+namespace xla {
+
+Despecializer::Despecializer() : pipeline_("despecializer") {
+  // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<Defuser>();
+  pipeline_.AddPass<ImplicitBroadcastRemover>();
+  pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
+}
+
+StatusOr<bool> Despecializer::Run(HloModule* module) {
+  return pipeline_.Run(module);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
new file mode 100644
index 0000000000..af48f4ab6e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Creates an HloPassPipeline containing multiple HloPasses that can
+// despecialize an optimized HloModule. This is useful to run an HloModule
+// optimized for one specfic platform on a different platform (undoing platform
+// specific passes) with matching numerics for comparison.
+//
+// Current despecialization passes are Defuser, ImplicitBroadcastRemover,
+// and BFloat16MixedPrecisionRemoval.
+class Despecializer : public HloPassInterface {
+ public:
+  Despecializer();
+  tensorflow::StringPiece name() const override { return "despecializer"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  HloPassPipeline pipeline_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index 2b6caa1494..85409b330b 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -93,7 +93,7 @@ Status FlattenNode(const CallGraphNode& node) {
       auto current = worklist.back();
       worklist.pop_back();
       for (auto* instruction : current->instructions()) {
-        if (GetInstructionCallContext(instruction) !=
+        if (GetInstructionCallContext(instruction->opcode()) !=
             CallContext::kSequential) {
           continue;
         }
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 9d7251b6ae..53ad8909c5 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -202,6 +202,25 @@ void IterateThroughWindow(
   } while (IndexUtil::BumpIndices(window_shape, &window_index));
 }
 
+// Creates a vector of multipliers which can be used to create a linear index
+// into shape.
+//
+// Given the multidimensional index {i1, ..., iN} and
+// M = MakeDimMultipliers(shape), the corresponding linear index LI is simply
+//
+//   LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N].
+//
+// This lets you calculate LI given the multidimensional indices in any order.
+DimensionVector MakeDimMultipliers(const Shape& shape) {
+  DimensionVector v(ShapeUtil::Rank(shape));
+  int64 scale = 1;
+  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
+    v[dim] = scale;
+    scale *= shape.dimensions(dim);
+  }
+  return v;
+}
+
 }  // namespace
 
 template <typename ReturnT, typename ElementwiseT>
@@ -999,25 +1018,30 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Shape& window_shape =
         ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes);
 
-    DimensionVector lhs_index(lhs_rank);
-    DimensionVector rhs_index(rhs_rank);
+    DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
+    DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
+
     DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
 
+    auto lhs_literal_data = lhs_literal.data<ReturnT>();
+    auto rhs_literal_data = rhs_literal.data<ReturnT>();
+
     auto func = [&](ArraySlice<int64> out_index) {
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
-
-      std::fill(lhs_index.begin(), lhs_index.end(), 0);
-      std::fill(rhs_index.begin(), rhs_index.end(), 0);
       std::fill(rhs_spatial_index.begin(), rhs_spatial_index.end(), 0);
 
-      lhs_index[input_batch_dim] = out_index[output_batch_dim];
-      rhs_index[kernel_output_z_dim] = out_index[output_z_dim];
-
       // Convolve input feature with kernel.
       do {
         for (int64 iz = 0; iz < z_size; ++iz) {
-          lhs_index[input_z_dim] = iz;
-          rhs_index[kernel_input_z_dim] = iz;
+          int64 lhs_linear_index = 0;
+          lhs_linear_index += out_index[output_batch_dim] *
+                              lhs_dim_multipliers[input_batch_dim];
+          lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
+
+          int64 rhs_linear_index = 0;
+          rhs_linear_index += out_index[output_z_dim] *
+                              rhs_dim_multipliers[kernel_output_z_dim];
+          rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim];
 
           // Find corresponding spatial dimension index for input (lhs).
           for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
@@ -1042,29 +1066,32 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
             // Calculate the actual lhs (input) index after dilation.  As an
             // optimization, skip this integer divide if there's no dilation.
+            int64 lhs_spatial_index;
             if (window_dim.base_dilation() > 1) {
-              lhs_index[input_spatial_dim] =
-                  undilated_index / window_dim.base_dilation();
+              lhs_spatial_index = undilated_index / window_dim.base_dilation();
             } else {
-              lhs_index[input_spatial_dim] = undilated_index;
+              lhs_spatial_index = undilated_index;
             }
+            lhs_linear_index +=
+                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
 
-            // Skip if input index is not in bound.
-            if (!(lhs_index[input_spatial_dim] >= 0 &&
-                  lhs_index[input_spatial_dim] <
+            // Skip if input index is not in bounds.
+            if (!(lhs_spatial_index >= 0 &&
+                  lhs_spatial_index <
                       lhs_shape.dimensions(input_spatial_dim))) {
               goto cnt;
             }
 
-            rhs_index[dnums.kernel_spatial_dimensions(ki)] =
-                window_dim.window_reversal()
-                    ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                    : rhs_spatial_index[ki];
+            rhs_linear_index +=
+                (window_dim.window_reversal()
+                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                     : rhs_spatial_index[ki]) *
+                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
           }
 
           result_val +=
-              static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
-              static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+              static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
+              static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
         }
       cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index e09d58bbe7..9fa72c1b8c 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -300,8 +300,6 @@ class Service : public ServiceInterface {
   Service(const ServiceOptions& options,
           std::unique_ptr<Backend> execute_backend);
 
-  static StatusOr<std::unique_ptr<Backend>> CreateComputeConstantBackend();
-
   // Resolves the given argument handles in the allocation tracker and returns
   // the corresponding allocations for every replica. The function also verifies
   // that each allocation matches the execution platform and device ordinal of
@@ -437,8 +435,6 @@ class Service : public ServiceInterface {
   CompilationCache compilation_cache_;
 
   // Backend to compile and execute computations on.
-  //
-  // TODO(b/28616830): Support multiple backends for execution.
   std::unique_ptr<Backend> execute_backend_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Service);
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 821432ef7d..68f75d50cb 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -160,27 +160,38 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
   return std::move(literal);
 }
 
-// Matches binary addition computations.
-bool LooksLikeSum(const HloComputation& computation) {
+enum class ConstantType { kUnknown, kZero, kOne };
+
+// Return the constant type required by this computation, if known.
+ConstantType GetInitValue(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
-  return root->opcode() == HloOpcode::kAdd &&
-         computation.num_parameters() == 2 &&
-         root->operand(0)->opcode() == HloOpcode::kParameter &&
-         root->operand(1)->opcode() == HloOpcode::kParameter &&
-         root->operand(0) != root->operand(1);
+  if (computation.num_parameters() != 2 ||
+      root->operand(0)->opcode() != HloOpcode::kParameter ||
+      root->operand(1)->opcode() != HloOpcode::kParameter ||
+      root->operand(0) == root->operand(1)) {
+    return ConstantType::kUnknown;
+  }
+
+  switch (root->opcode()) {
+    case HloOpcode::kAdd:
+      return ConstantType::kZero;
+    case HloOpcode::kMultiply:
+      return ConstantType::kOne;
+    default:
+      return ConstantType::kUnknown;
+  }
 }
 
-// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
-// which requires an init_value of 0 rather than a random value.
-bool NeedsZeroInitValue(const HloUse& use) {
+// Reduce, ReduceWindow, and SelectAndScatter ops may need a non-random
+// initialization value.
+bool NeedsInitValue(const HloUse& use) {
   const HloInstruction* const instruction = use.instruction;
   const HloOpcode opcode = instruction->opcode();
   const int64 op_num = use.operand_number;
   return (
       ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
-       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
-      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
-       LooksLikeSum(*instruction->scatter())));
+       op_num == 1) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2));
 }
 
 // Generate random values that are constrained to the input_shape minus the
@@ -222,7 +233,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
         auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
         constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
                                 fused_uses.end());
-      } else if (NeedsZeroInitValue(use)) {
+      } else if (NeedsInitValue(use)) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kConvert ||
                  opcode == HloOpcode::kReducePrecision) {
@@ -243,7 +254,8 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
   HloInstruction* needs_index = nullptr;
-  HloInstruction* needs_zero = nullptr;
+  HloInstruction* needs_constant = nullptr;
+  ConstantType constant_type = ConstantType::kUnknown;
   for (HloInstruction* use : constrained_uses) {
     switch (use->opcode()) {
       case HloOpcode::kDynamicSlice:
@@ -258,8 +270,13 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
 
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
+        needs_constant = use;
+        constant_type = GetInitValue(*use->to_apply());
+        break;
+
       case HloOpcode::kSelectAndScatter:
-        needs_zero = use;
+        needs_constant = use;
+        constant_type = GetInitValue(*use->scatter());
         break;
 
       default:
@@ -268,17 +285,26 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
             use->ToString().c_str());
     }
   }
-  if (needs_index != nullptr && needs_zero != nullptr) {
+  if (needs_index != nullptr && needs_constant != nullptr) {
     return Unimplemented(
         "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
-        "zero: %s\n",
-        needs_index->ToString().c_str(), needs_zero->ToString().c_str());
+        "constant: %s\n",
+        needs_index->ToString().c_str(), needs_constant->ToString().c_str());
   }
   if (needs_index != nullptr) {
     return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
                                            needs_index->shape(), engine);
-  } else if (needs_zero != nullptr) {
-    return Literal::CreateFromShape(param.shape());
+  } else if (needs_constant != nullptr) {
+    switch (constant_type) {
+      case ConstantType::kZero:
+        return Literal::Zero(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kOne:
+        return Literal::One(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kUnknown:
+        // We want the identity element for the computation, but we don't really
+        // know what it is - so any value we generate will be just as wrong.
+        return MakeFakeLiteralInternal(param.shape(), engine);
+    }
   } else {
     return MakeFakeLiteralInternal(param.shape(), engine);
   }
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 5cb18113e5..f9943f71d3 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -189,6 +189,9 @@ message DebugOptions {
   // directory.
   string xla_dump_per_pass_hlo_proto_to = 96;
 
+  // Generate calls to MKL-DNN in the CPU backend.
+  bool xla_cpu_use_mkl_dnn = 97;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index c5a0dc1095..8f9bffa55e 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -24,7 +24,6 @@ py_library(
         "continue_statements.py",
         "control_flow.py",
         "decorators.py",
-        "for_loops.py",
         "ifexp.py",
         "list_comprehension.py",
         "lists.py",
@@ -49,6 +48,7 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":converters",
+        "//tensorflow/contrib/autograph/operators",
         "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/contrib/autograph/pyct/static_analysis",
         "//tensorflow/contrib/autograph/utils",
@@ -132,16 +132,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "for_loops_test",
-    srcs = ["for_loops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":test_lib",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
 py_test(
     name = "name_scopes_test",
     srcs = ["name_scopes_test.py"],
diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 48026bccab..62115d4005 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -32,6 +32,7 @@ class BreakCanonicalizationTransformer(transformer.Base):
   def __init__(self, context):
     super(BreakCanonicalizationTransformer, self).__init__(context)
     # This is a stack structure, to correctly process nested loops.
+    # Each item is a list [break_used, break_variable_name]
     self.break_uses = []
 
   def _create_break_check(self):
@@ -99,9 +100,9 @@ class BreakCanonicalizationTransformer(transformer.Base):
     self.break_uses.append([False, break_var])
     node.body = self._manual_visit_list(node.body)
     if self.break_uses[-1][0]:
-      anno.setanno(node, 'extra_cond',
-                   gast.UnaryOp(gast.Not(),
-                                gast.Name(break_var, gast.Load(), None)))
+      extra_cond = templates.replace_as_expression(
+          'not var_name', var_name=break_var)
+      anno.setanno(node, 'extra_cond', extra_cond)
       final_nodes = [self._create_break_init(), node]
     else:
       final_nodes = node
diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
index 49d932026f..55a28e8ac3 100644
--- a/tensorflow/contrib/autograph/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -22,6 +22,7 @@ import gast
 
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
@@ -49,11 +50,6 @@ class ControlFlowTransformer(transformer.Base):
   def __init__(self, context):
     super(ControlFlowTransformer, self).__init__(context)
 
-  # pylint:disable=invalid-name
-
-  def visit_For(self, node):
-    assert False, 'for statement should have been canonicalized at this point'
-
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
     if aliased_orig_names:
@@ -170,6 +166,13 @@ class ControlFlowTransformer(transformer.Base):
     body_closure = body_scope.modified - body_scope.created
     all_referenced = body_scope.referenced
 
+    cond_scope = anno.getanno(node, NodeAnno.COND_SCOPE)
+    cond_closure = set()
+    for s in cond_scope.referenced:
+      for root in s.support_set:
+        if root not in body_scope.created:
+          cond_closure.add(root)
+
     state = list(body_closure)
     if not state:
       # TODO(mdan): Implement this properly.
@@ -204,7 +207,8 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = autograph_utils.run_while(test_name, body_name, [state])
+      state_ast_tuple = __ops.while_loop(
+          test_name, body_name, (state,), (extra_deps,))
     """
     node = templates.replace(
         template,
@@ -216,11 +220,67 @@ class ControlFlowTransformer(transformer.Base):
         test=test,
         body_name=self.context.namer.new_symbol('loop_body',
                                                 body_scope.referenced),
-        body=node_body)
+        body=node_body,
+        extra_deps=tuple(s.ast() for s in cond_closure),
+    )
 
     return node
 
-  # pylint:enable=invalid-name
+  def visit_For(self, node):
+    self.generic_visit(node)
+
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    body_closure = body_scope.modified - body_scope.created
+    all_referenced = body_scope.referenced
+
+    state = list(body_closure)
+
+    state_ssf = [
+        self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
+    ]
+    ssf_map = {
+        name: ssf
+        for name, ssf in zip(state, state_ssf)
+        if str(name) != ssf
+    }
+
+    if len(state) == 1:
+      state = state[0]
+      state_ssf = state_ssf[0]
+      state_ast_tuple = state
+    else:
+      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    if anno.hasanno(node, 'extra_cond'):
+      extra_cond = anno.getanno(node, 'extra_cond')
+      extra_cond = ast_util.rename_symbols(extra_cond, ssf_map)
+    else:
+      extra_cond = parser.parse_expression('True')
+
+    template = """
+      def extra_cond_name(state_ssf):
+        return extra_cond_expr
+      def body_name(iterate, state_ssf):
+        body
+        return state_ssf,
+      state_ast_tuple = __ops.for_loop(
+          iterated, extra_cond_name, body_name, (state,))
+    """
+    node = templates.replace(
+        template,
+        state=state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iterated=node.iter,
+        iterate=node.target,
+        extra_cond_name=self.context.namer.new_symbol('extra_cond',
+                                                      all_referenced),
+        extra_cond_expr=extra_cond,
+        body_name=self.context.namer.new_symbol('loop_body', all_referenced),
+        body=node_body)
+
+    return node
 
 
 def transform(node, context):
diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
index 86fed51f27..c5610b16b4 100644
--- a/tensorflow/contrib/autograph/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
@@ -94,6 +95,77 @@ class ControlFlowTest(converter_test_base.TestCase):
       with self.test_session() as sess:
         self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
+  def test_simple_for(self):
+
+    def test_fn(l):
+      s1 = 0
+      s2 = 0
+      for e in l:
+        s1 += e
+        s2 += e * e
+      return s1, s2
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_single_var(self):
+
+    def test_fn(l):
+      s = 0
+      for e in l:
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_with_iterated_expression(self):
+
+    eval_count = [0]
+
+    def count_evals(x):
+      eval_count[0] += 1
+      return x
+
+    def test_fn(n):
+      s = 0
+      for e in count_evals(range(n)):
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      result.count_evals = count_evals
+      self.assertEqual(test_fn(5), result.test_fn(5))
+      # count_evals ran twice, once for test_fn and another for result.test_fn
+      self.assertEqual(eval_count[0], 2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 3ea2cfd668..6f75e9a529 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import contextlib
 import imp
 
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.pyct import compiler
 from tensorflow.contrib.autograph.pyct import context
@@ -77,6 +78,7 @@ class TestCase(test.TestCase):
       result.tf = self.make_fake_mod('fake_tf', *symbols)
       result.autograph_utils = utils
       result.autograph_api = self.make_fake_mod('fake_api', converted_call)
+      result.__dict__['__ops'] = operators
       yield result
     except Exception:  # pylint:disable=broad-except
       if source is None:
diff --git a/tensorflow/contrib/autograph/converters/for_loops.py b/tensorflow/contrib/autograph/converters/for_loops.py
deleted file mode 100644
index 4999c47bdc..0000000000
--- a/tensorflow/contrib/autograph/converters/for_loops.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Canonicalizes for loops into while loops.
-
-This canonicalizer uses the len function on its argument. That should be
-converted to a tf.shape separately.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.autograph.pyct import anno
-from tensorflow.contrib.autograph.pyct import templates
-from tensorflow.contrib.autograph.pyct import transformer
-from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
-
-
-class ForLoopCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes for loops (e.g. into while loops)."""
-
-  def __init__(self, context):
-    super(ForLoopCanonicalizationTransformer, self).__init__(context)
-
-  def visit_For(self, node):
-    self.generic_visit(node)
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    i_var = self.context.namer.new_symbol('i', body_scope.referenced)
-    smart_loop_iter_var = self.context.namer.new_symbol('smart_loop_iter',
-                                                        body_scope.referenced)
-    cont_var = self.context.namer.new_symbol('cont', body_scope.referenced)
-    # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
-    if anno.hasanno(node, 'extra_cond'):
-      template = """
-        i = 0
-        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
-        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-        while cont and extra_cond:
-          body
-          i += 1
-          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-      """
-      return templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=i_var,
-          smart_loop_iter=smart_loop_iter_var,
-          cont=cont_var,
-          extra_cond=anno.getanno(node, 'extra_cond'))
-    else:
-      template = """
-        i = 0
-        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
-        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-        while cont:
-          body
-          i += 1
-          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-      """
-      repl = templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=i_var,
-          smart_loop_iter=smart_loop_iter_var,
-          cont=cont_var)
-      return repl
-
-  def visit_Continue(self, node):
-    assert False, 'continue statement should be desugared at this point'
-
-  def visit_Break(self, node):
-    assert False, 'break statement should be desugared at this point'
-
-
-def transform(node, context):
-  return ForLoopCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/for_loops_test.py b/tensorflow/contrib/autograph/converters/for_loops_test.py
deleted file mode 100644
index 943f52de55..0000000000
--- a/tensorflow/contrib/autograph/converters/for_loops_test.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for for_loops module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.autograph.converters import converter_test_base
-from tensorflow.contrib.autograph.converters import for_loops
-from tensorflow.python.platform import test
-
-
-class ControlFlowTest(converter_test_base.TestCase):
-
-  def test_basic_for(self):
-
-    def test_fn(l):
-      s = 0
-      for e in l:
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      l = [1, 2, 3]
-      self.assertEqual(test_fn(l), result.test_fn(l))
-      l = []
-      self.assertEqual(test_fn(l), result.test_fn(l))
-
-  def test_for_with_iterated_expression(self):
-
-    eval_count = [0]
-
-    def count_evals(x):
-      eval_count[0] += 1
-      return x
-
-    def test_fn(n):
-      s = 0
-      for e in count_evals(range(n)):
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      result.count_evals = count_evals
-      self.assertEqual(test_fn(5), result.test_fn(5))
-      # count_evals ran twice, once for test_fn and another for result.test_fn
-      self.assertEqual(eval_count[0], 2)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index ee2d301d75..f9db07778a 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -37,8 +37,12 @@ class ApiTest(test.TestCase):
   def setUp(self):
     config.COMPILED_IMPORT_STATEMENTS = (
         'from __future__ import print_function',
-        'from tensorflow.contrib.autograph import utils as '
-        'autograph_utils', 'tf = autograph_utils.fake_tf()')
+        'from tensorflow.contrib.autograph import utils'
+        ' as autograph_utils',
+        'from tensorflow.contrib.autograph import operators'
+        ' as __ops',
+        'tf = autograph_utils.fake_tf()',
+    )
 
   def test_decorator_recurses(self):
 
@@ -197,8 +201,7 @@ class ApiTest(test.TestCase):
 
     compiled_code = api.to_code(test_fn)
 
-    # Just check for some key words and that it is parseable Python code.
-    self.assertRegexpMatches(compiled_code, 'autograph_utils\\.run_while')
+    # Just check that it is parseable Python code.
     self.assertIsNotNone(parser.parse_str(compiled_code))
 
 
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 62a49cd92d..3bacc94300 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -28,7 +28,6 @@ from tensorflow.contrib.autograph.converters import call_trees
 from tensorflow.contrib.autograph.converters import continue_statements
 from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import decorators
-from tensorflow.contrib.autograph.converters import for_loops
 from tensorflow.contrib.autograph.converters import ifexp
 from tensorflow.contrib.autograph.converters import lists
 from tensorflow.contrib.autograph.converters import logical_expressions
@@ -324,8 +323,6 @@ def node_to_graph(node, ctx, nocompile_decorators):
 
   node = _static_analysis_pass(node, ctx)
   node = lists.transform(node, ctx)
-  node = for_loops.transform(node, ctx)
-  # for_loops may insert new global references.
   node = builtin_functions.transform(node, ctx)
 
   node = _static_analysis_pass(node, ctx)
diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
index 7856c253bd..4c62468575 100644
--- a/tensorflow/contrib/autograph/operators/BUILD
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -2,6 +2,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -18,8 +20,21 @@ py_library(
     name = "operators",
     srcs = [
         "__init__.py",
+        "control_flow.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
-    deps = [],
+    deps = [
+        "//tensorflow/contrib/autograph/utils",
+    ],
+)
+
+py_test(
+    name = "control_flow_test",
+    srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
 )
diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py
index c3f4cab69e..04b4734551 100644
--- a/tensorflow/contrib/autograph/operators/__init__.py
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -22,3 +22,8 @@ closures for the body.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+# TODO(mdan): Add a container for implementation-specific toggles (throughout).
+
+from tensorflow.contrib.autograph.operators.control_flow import for_loop
+from tensorflow.contrib.autograph.operators.control_flow import while_loop
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
new file mode 100644
index 0000000000..5b8cb2d63c
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -0,0 +1,179 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Control flow statements: loops, conditionals, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+
+
+def for_loop(iterated, extra_cond, loop_body, init_state):
+  """Functional form of a for statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations, excluding the iterate. In what follows we
+  refer to state as either a tuple of entities that represent an actual state,
+  or a list of arguments of the corresponding types.
+
+  Args:
+    iterated: The entity being iterated over.
+    extra_cond: Callable with the state as arguments, and boolean return type.
+        An additionnal loop condition.
+    loop_body: Callable with the iterate and the state as arguments, and
+        state as return type. The actual loop body.
+    init_state: Tuple containing the initial state.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  if tensor_util.is_tensor(iterated):
+    return _known_len_for_loop(iterated, extra_cond, loop_body, init_state)
+  elif isinstance(iterated, dataset_ops.Dataset):
+    return _dataset_for_loop(iterated, extra_cond, loop_body, init_state)
+  else:
+    return _py_for_loop(iterated, extra_cond, loop_body, init_state)
+
+
+def _py_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that executes a Python for loop."""
+  state = init_state
+  for iterate in iterated:
+    if not extra_cond(*state):
+      break
+    state = loop_body(iterate, *state)
+
+  # TODO(mdan): Remove this special case.
+  if len(state) == 1:
+    return state[0]
+  return state
+
+
+def _known_len_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over objects that define a length."""
+  n = builtins.dynamic_len(iterated)
+
+  def while_body(iterate_index, *state):
+    iterate = iterated[iterate_index]
+    new_state = loop_body(iterate, *state)
+    return (iterate_index + 1,) + new_state
+
+  def while_cond(iterate_index, *state):
+    return gen_math_ops.logical_and(iterate_index < n, extra_cond(*state))
+
+  results = while_loop(
+      while_cond,
+      while_body,
+      init_state=(0,) + init_state,
+      extra_deps=(iterated,))
+  # Dropping the iteration index because it's not syntactically visible.
+  results = results[1:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def _dataset_for_loop(ds, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over TF Datasets."""
+  # Because Datsets only expose get_next, in the style of Python iterators,
+  # we are forced to unpack the loop as:
+  #
+  # epoch_number, iterate = ds.get_next()
+  # while epoch_number < 2:
+  #   <body>
+  #   epoch_number, iterate = ds.get_next()
+  epoch_numbers = dataset_ops.Dataset.range(2)
+  def tag_with(ds, tag):
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
+  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
+
+  iterator = ds_with_epoch.make_initializable_iterator()
+  with ops.control_dependencies((iterator.initializer,)):
+    epoch_number, iterate = iterator.get_next()
+
+    def while_body(epoch_number, iterate, *state):
+      new_state = loop_body(iterate, *state)
+      epoch_number, iterate = iterator.get_next()
+      return (epoch_number, iterate) + new_state
+
+    def while_cond(epoch_number, iterate, *state):
+      del iterate
+      return gen_math_ops.logical_and(epoch_number < 1, extra_cond(*state))
+
+    results = while_loop(
+        while_cond,
+        while_body,
+        init_state=(epoch_number, iterate) + init_state,
+        extra_deps=())
+  # Dropping the epoch number and iterate because they are not not syntactically
+  # visible.
+  results = results[2:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def while_loop(loop_cond, loop_body, init_state, extra_deps):
+  """Functional form of a while statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations. In what follows we refer to state as either
+  a tuple of entities that represent an actual state, or a list of arguments
+  of the corresponding types.
+
+  Args:
+    loop_cond: Callable with the state as arguments, and boolean return type.
+        The loop condition.
+    loop_body: Callable with the state as arguments, and state as return type.
+        The actual loop body.
+    init_state: Tuple containing the initial state.
+    extra_deps: Tuple containing additional entities on which the loop may
+        depend, such as loop invariants referenced by loop_cond. Used
+        exclusively for dispatch control.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
+  # That could be somethins as simple as a collection of dispatch rules, with
+  # some prioritization.
+  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+    return _tf_while_loop(loop_cond, loop_body, init_state)
+  else:
+    return _py_while_loop(loop_cond, loop_body, init_state)
+
+
+def _tf_while_loop(loop_cond, loop_body, init_state):
+  """Overload of while_loop that stages a TF while_loop."""
+  return control_flow_ops.while_loop(loop_cond, loop_body, init_state)
+
+
+def _py_while_loop(loop_cond, loop_body, init_state):
+  """Overload of while_loop that executes a Python while loop."""
+  state = init_state
+  while loop_cond(*state):
+    state = loop_body(*state)
+  return state
diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py
new file mode 100644
index 0000000000..9112b1627f
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow_test.py
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for control_flow module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph import operators
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ForLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    s = operators.for_loop(
+        constant_op.constant([1, 2, 3, 4]),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+  def test_python(self):
+    s = operators.for_loop(
+        range(5),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    self.assertEqual(10, s)
+
+  def test_dataset(self):
+    to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
+    s = operators.for_loop(
+        dataset_ops.Dataset.range(5).map(to_int32),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+
+class WhileLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    n = constant_op.constant(5)
+    results = operators.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i,),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    with self.test_session() as sess:
+      self.assertEqual((5, 10), sess.run(results))
+
+  def test_python(self):
+    n = 5
+    results = operators.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    self.assertEqual((5, 10), results)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 4f76a69522..4a70bab440 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -28,7 +28,7 @@ from tensorflow.contrib.autograph.pyct import anno
 class CleanCopier(gast.NodeVisitor):
   """Copy AST nodes.
 
-  The copied nodes will ignore almost all fields that prefixed by '__'.
+  The copied nodes will ignore almost all fields that are prefixed by '__'.
   Exceptions make some annotations.
   """
 
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index d19c6ed75e..30a5961821 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -74,6 +74,12 @@ def getmethodclass(m):
     ValueError: if the class could not be resolved for any unexpected reason.
   """
 
+  # Callable objects: return their own class.
+  if (not hasattr(m, '__name__') and hasattr(m, '__class__') and
+      hasattr(m, '__call__')):
+    if isinstance(m.__class__, six.class_types):
+      return m.__class__
+
   # Instance method and class methods: should be bound to a non-null "self".
   # If self is a class, then it's a class method.
   if hasattr(m, '__self__'):
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index ddca6f963b..eda3fc13fd 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -225,6 +225,15 @@ class InspectUtilsTest(test.TestCase):
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
         LocalClass)
 
+  def test_getmethodclass_callables(self):
+    class TestCallable(object):
+
+      def __call__(self):
+        pass
+
+    c = TestCallable()
+    self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py
index 4d5764a974..583cf7ecd7 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names.py
@@ -112,6 +112,29 @@ class QN(object):
       raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
     return self._parent
 
+  @property
+  def support_set(self):
+    """Returns the set of simple symbols that this QN relies on.
+
+    This would be the smallest set of symbols necessary for the QN to
+    statically resolve (assuming properties and index ranges are verified
+    at runtime).
+
+    Examples:
+      'a.b' has only one support symbol, 'a'
+      'a[i]' has two roots, 'a' and 'i'
+    """
+    # TODO(mdan): This might be the set of Name nodes in the AST. Track those?
+    roots = set()
+    if self.has_attr():
+      roots.update(self.parent.support_set)
+    elif self.has_subscript():
+      roots.update(self.parent.support_set)
+      roots.update(self.qn[1].support_set)
+    else:
+      roots.add(self)
+    return roots
+
   def __hash__(self):
     return hash(self.qn + (self._has_attr, self._has_subscript))
 
diff --git a/tensorflow/contrib/autograph/pyct/qual_names_test.py b/tensorflow/contrib/autograph/pyct/qual_names_test.py
index 103bd25aa3..264afd508c 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names_test.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names_test.py
@@ -154,6 +154,21 @@ class QNTest(test.TestCase):
     a_sub_three = QN(a, subscript=QN(qual_names.NumberLiteral(3)))
     self.assertEqual(a_sub_three.ast().slice.value.n, 3)
 
+  def test_support_set(self):
+    a = QN('a')
+    b = QN('b')
+    c = QN('c')
+    a_sub_b = QN(a, subscript=b)
+    a_dot_b = QN(a, attr='b')
+    a_dot_b_dot_c = QN(a_dot_b, attr='c')
+    a_dot_b_sub_c = QN(a_dot_b, subscript=c)
+
+    self.assertSetEqual(a.support_set, set((a,)))
+    self.assertSetEqual(a_sub_b.support_set, set((a, b)))
+    self.assertSetEqual(a_dot_b.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_dot_c.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_sub_c.support_set, set((a, c)))
+
 
 class QNResolverTest(test.TestCase):
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index da6a2f6f05..6dd53091fa 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -265,10 +265,10 @@ class ActivityAnalizer(transformer.Base):
       qn = QN(node.name)
       self.scope.mark_write(qn)
     current_scope = self.scope
-    fndef_scope = Scope(current_scope, isolated=True)
-    self.scope = fndef_scope
+    body_scope = Scope(current_scope, isolated=True)
+    self.scope = body_scope
     self.generic_visit(node)
-    anno.setanno(node, NodeAnno.BODY_SCOPE, fndef_scope)
+    anno.setanno(node, NodeAnno.BODY_SCOPE, body_scope)
     self.scope = current_scope
     return node
 
@@ -282,7 +282,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_If(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
@@ -297,7 +303,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_While(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index 37c28872bb..1e6c686b01 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -204,6 +204,8 @@ class ActivityAnalizerTest(test.TestCase):
     self.assertScopeIsRmc(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
         ('b', 'c'), ('a', 'b', 'c'))
+    self.assertScopeIsRmc(
+        anno.getanno(while_node, NodeAnno.COND_SCOPE), ('b',), (), ())
 
   def test_for(self):
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index 5254b83ca7..d6d9f7e1a6 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -43,6 +43,7 @@ class NodeAnno(NoValue):
   # Scopes
   # Scopes are represented by objects of type activity.Scope.
   ARGS_SCOPE = 'The scope for the argument list of a function call.'
+  COND_SCOPE = 'The scope for the test node of a conditional statement.'
   BODY_SCOPE = (
       'The scope for the main body of a statement (True branch for if '
       'statements, main body for loops).')
diff --git a/tensorflow/contrib/autograph/utils/__init__.py b/tensorflow/contrib/autograph/utils/__init__.py
index 22898b17e9..817d4126d1 100644
--- a/tensorflow/contrib/autograph/utils/__init__.py
+++ b/tensorflow/contrib/autograph/utils/__init__.py
@@ -19,8 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.autograph.utils.builtins import dynamic_builtin
-from tensorflow.contrib.autograph.utils.builtins import dynamic_dataset
-from tensorflow.contrib.autograph.utils.builtins import dynamic_for_cond
 from tensorflow.contrib.autograph.utils.builtins import dynamic_print
 from tensorflow.contrib.autograph.utils.builtins import dynamic_range
 from tensorflow.contrib.autograph.utils.context_managers import control_dependency_on_returns
@@ -28,7 +26,6 @@ from tensorflow.contrib.autograph.utils.misc import alias_tensors
 from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is
 from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is_not
 from tensorflow.contrib.autograph.utils.multiple_dispatch import run_cond
-from tensorflow.contrib.autograph.utils.multiple_dispatch import run_while
 from tensorflow.contrib.autograph.utils.py_func import wrap_py_func
 from tensorflow.contrib.autograph.utils.tensor_list import dynamic_list_append
 from tensorflow.contrib.autograph.utils.testing import fake_tf
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index c6af0e4d13..7fbb7c09d8 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -24,10 +24,8 @@ import six
 
 from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.contrib.autograph.utils import type_check
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_inspect
@@ -106,69 +104,3 @@ def dynamic_print(*values):
 
   return py_func.wrap_py_func(
       flushed_print, None, values, use_dummy_return=True)
-
-
-def dynamic_dataset(iterated):
-  """Implementartion of smart tf.data.Dataset epoch wrapping.
-
-  The function checks if the input is a tf.data.Dataset and if so then wraps it
-  so that for each element it returns it also returns the current epoch the
-  dataset iteration is in, for two epochs.  If the input is not a
-  tf.data.Dataset then it just returns the input.
-
-  Args:
-    iterated: The iterable or tf.data.Dataset that is being iterated over.
-  Returns:
-    Either just the untouched input, or in the case of input being a
-    tf.data.Dataset then it returns a wrapped  tf.data.Dataset where for each
-    element it returns it also returns the current epoch the dataset iteration
-    is in.
-  """
-  if not isinstance(iterated, dataset_ops.Dataset):
-    return iterated
-
-  def epoch_dataset_number_helper(i):
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(i).repeat(), iterated))
-
-  epoch_numbers = dataset_ops.Dataset.range(2)
-  return epoch_numbers.flat_map(epoch_dataset_number_helper)
-
-
-def dynamic_for_cond(iteration, iterated):
-  """Implementartion of smart while-loop condition using dynamic dispatch.
-
-  The function checks if it is iterating over a tf.data.Dataset or not, and in
-  the case it is not then it simply returns if we are still in range of the
-  iterated and the next element.  If it is iterating over a dataset then it only
-  iterates for a single epoch.
-
-  Args:
-    iteration: The current iteration of the loop.
-    iterated: The iterable or tf.data.Dataset that is being iterated over.
-  Returns:
-    A tuple of a bool that indicates whether the loop should continue, and the
-    next element in iterated.
-  """
-  # TODO(znado): Clean up.
-  # TODO(znado): This won't work for unpacked iterates. Fix.
-  if isinstance(iterated, dataset_ops.Dataset):
-    curr_epoch, next_elem = iterated.make_one_shot_iterator().get_next()
-    return math_ops.less(curr_epoch, 1), next_elem
-  elif tensor_util.is_tensor(iterated):
-    if iterated.shape.ndims > 1:
-      elem_shape = array_ops.shape(iterated)[1:]
-    else:
-      elem_shape = ()
-    if iterated.shape.ndims == 0 or iterated.shape[0] == 0:
-      return False, array_ops.zeros(elem_shape, iterated.dtype)
-    return control_flow_ops.cond(
-        math_ops.less(iteration, dynamic_len(iterated)),
-        lambda: (True, iterated[iteration]),
-        lambda: (False, array_ops.zeros(elem_shape, iterated.dtype)))
-  elif hasattr(iterated, '__len__'):
-    if iteration < len(iterated):
-      return True, iterated[iteration]
-    return False, None
-  else:
-    raise NotImplementedError('Python iterators not yet supported.')
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch.py b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
index 47049255f3..70eef5676f 100644
--- a/tensorflow/contrib/autograph/utils/multiple_dispatch.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
 from tensorflow.contrib.autograph.utils.type_check import is_tensor
 from tensorflow.python.ops import control_flow_ops
 
@@ -66,42 +64,3 @@ def py_cond(condition, true_fn, false_fn):
   if len(results) == 1:
     return results[0]
   return results
-
-
-def run_while(cond_fn, body_fn, init_args):
-  """Type-dependent functional while loop.
-
-  Args:
-    cond_fn: A Python callable implementing the stop conditions of the loop.
-    body_fn: A Python callable implementing the body of the loop.
-    init_args: The initial values of the arguments that will be passed to both
-      cond_fn and body_fn.
-
-  Returns:
-    result: A list of values with the same shape and type as init_args. If any
-    of the init_args, or any variables closed-over in cond_fn are Tensors,
-    tf.while_loop will be used, otherwise a Python while loop will be ran.
-
-  Raises:
-    ValueError: if init_args is not a tuple or list with one or more elements.
-  """
-  if not isinstance(init_args, (tuple, list)) or not init_args:
-    raise ValueError(
-        'init_args must be a non-empty list or tuple, found %s' % init_args)
-
-  # TODO(alexbw): statically determine all active variables in cond_fn,
-  # and pass them directly
-  closure_vars = tuple(
-      [c.cell_contents for c in six.get_function_closure(cond_fn) or []])
-  possibly_tensors = tuple(init_args) + closure_vars
-  if is_tensor(*possibly_tensors):
-    return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
-  else:
-    return py_while_loop(cond_fn, body_fn, init_args)
-
-
-def py_while_loop(cond_fn, body_fn, init_args):
-  state = init_args
-  while cond_fn(*state):
-    state = body_fn(*state)
-  return state
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
index e6a41bb416..f72f8e94a0 100644
--- a/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
@@ -70,29 +70,6 @@ class MultipleDispatchTest(test.TestCase):
       out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
       self.assertEqual(sess.run(out), 3)
 
-  def test_run_while_python(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 1.0, 0.5])
-    self.assertEqual(x, 0.75)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 4.0, 0.5])
-    self.assertEqual(x, 3.0)
-
-  def test_run_while_tf(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    with Session() as sess:
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 1.0, 0.5])
-      self.assertEqual(sess.run(x), 0.75)
-
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 4.0, 0.5])
-      self.assertEqual(sess.run(x), 3.0)
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 95c5c920aa..5a2771229d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -61,11 +61,13 @@ class TPUClusterResolver(ClusterResolver):
       return False
     return True
 
-  def _inGke(self):
+  @staticmethod
+  def _inGke():
     """When running in GKE, the environment variable will be set."""
     return _GKE_ENV_VARIABLE in os.environ
 
-  def _gkeMaster(self):
+  @staticmethod
+  def _gkeMaster():
     return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
 
   def __init__(self,
@@ -119,8 +121,9 @@ class TPUClusterResolver(ClusterResolver):
             'Using multiple TPUs in a single session is not yet implemented')
       tpu = tpu[0]
 
+    in_gke = self._inGke()
     # When using GKE with Cloud TPUs, the env variable will be set.
-    if tpu is None and self._inGke():
+    if tpu is None and in_gke:
       tpu = self._gkeMaster()
 
     self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
@@ -158,7 +161,8 @@ class TPUClusterResolver(ClusterResolver):
       self._service = service
 
     self._coordinator_name = coordinator_name
-    if coordinator_name and not coordinator_address and should_resolve:
+    if coordinator_name and not coordinator_address and (should_resolve or
+                                                         in_gke):
       self._start_local_server()
     else:
       self._coordinator_address = coordinator_address
@@ -204,31 +208,50 @@ class TPUClusterResolver(ClusterResolver):
     Raises:
       RuntimeError: If the provided TPU is not healthy.
     """
-    if not self._shouldResolve():
-      return server_lib.ClusterSpec({})
-
-    full_name = 'projects/%s/locations/%s/nodes/%s' % (
-        self._project, self._zone, compat.as_text(self._tpu))
-    request = self._service.projects().locations().nodes().get(name=full_name)
-    response = request.execute()
-
-    if 'health' in response and response['health'] != 'HEALTHY':
-      raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
-                                                          response['health']))
-
-    if 'networkEndpoints' in response:
-      worker_list = [
-          '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
-          for endpoint in response['networkEndpoints']
-      ]
+    ############################################################################
+    # There are 5 potential cases this code must handle:
+    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
+    #      a. Create a ClusterSpec that includes the coordinator job
+    #      b. Create a ClusterSpec without the coordinator job.
+    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
+    #     tasks and
+    #      a. Create a ClusterSpec with the coordinator
+    #      b. Create a ClusterSpec without the coordinator
+    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
+    ############################################################################
+
+    if self._shouldResolve():
+      # Case 1.
+      full_name = 'projects/%s/locations/%s/nodes/%s' % (
+          self._project, self._zone, compat.as_text(self._tpu))
+      request = self._service.projects().locations().nodes().get(name=full_name)
+      response = request.execute()
+
+      if 'health' in response and response['health'] != 'HEALTHY':
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
+                                                            response['health']))
+
+      if 'networkEndpoints' in response:
+        worker_list = [
+            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
+            for endpoint in response['networkEndpoints']
+        ]
+      else:
+        # Fall back to the deprecated response format
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list = [instance_url]
+
+      cluster_spec = {self._job_name: worker_list}
     else:
-      # Fall back to the deprecated response format
-      instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-      worker_list = [instance_url]
-
-    cluster_spec = {self._job_name: worker_list}
+      if not self._tpu.startswith(compat.as_bytes('grpc://')):
+        # Case 3.
+        return server_lib.ClusterSpec({})
+      # Case 2.
+      cluster_spec = {self._job_name: [self._tpu[len(
+          compat.as_bytes('grpc://')):]]}
 
     if self._coordinator_address:
+      # {1, 2}.a
       cluster_spec[self._coordinator_name] = [self._coordinator_address]
 
     return server_lib.ClusterSpec(cluster_spec)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index e1e3e6867a..dff7a03b68 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -362,14 +362,10 @@ class TPUClusterResolverTest(test.TestCase):
   def testGkeEnvironment(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
     self.assertTrue('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' in os.environ)
-    tpu_cluster_resolver = TPUClusterResolver()
-    self.assertTrue(tpu_cluster_resolver._inGke())
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver._gkeMaster()))
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.get_master()))
+        compat.as_bytes(TPUClusterResolver._gkeMaster()))
     del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
 
 
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index 8b5d13f725..d68015ae15 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -25,6 +25,7 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 9897c31a98..9cc6ca09ad 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import argparse
 import collections
+import functools
 import itertools
 import os
 import sys
@@ -28,13 +29,14 @@ import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework.test_util import TensorFlowTestCase
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -265,7 +267,7 @@ def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
     return outputs, (output_state_fw, output_state_bw)
 
 
-class CudnnRNNTestBasic(TensorFlowTestCase):
+class CudnnRNNTestBasic(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -467,7 +469,7 @@ class CudnnRNNTestBasic(TensorFlowTestCase):
 
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
+class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
 
   def _CompareWeights(self, lhs, rhs):
     self.assertEqual(len(lhs), len(rhs))
@@ -701,9 +703,146 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
+class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+
+  def _VerifyCheckpoint(
+      self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
+      num_layers, input_size, expected_variable_values, num_applications=3):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with ops.device("gpu:0"):
+      cudnn_layer = cudnn_cell_fn()
+      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      status = cudnn_checkpoint.restore(checkpoint_path)
+      inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
+                                   dtype=dtypes.float32)
+      cudnn_output, _ = cudnn_layer(inputs)
+      status.assert_consumed().run_restore_ops()
+    second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
+    restore_layer = compatible_cell_fn()
+    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+        cell=restore_layer)
+    status = restore_layer_checkpoint.restore(second_save_path)
+    current_state = restore_layer.zero_state(1, dtypes.float32)
+    for _ in range(num_applications):
+      restore_layer_output, current_state = restore_layer(
+          inputs=3. * array_ops.ones([1, input_size]),
+          state=current_state)
+    status.assert_consumed().run_restore_ops()
+    self.assertTrue(restore_layer.variables)
+    for variable, expected_value in zip(
+        restore_layer.variables, expected_variable_values):
+      self.assertAllClose(expected_value, self.evaluate(variable))
+    self.assertAllClose(self.evaluate(restore_layer_output),
+                        self.evaluate(cudnn_output)[-1, -1:, ...])
+
+  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn):
+    # Single-layer cuDNN cells with object-based checkpointing should be
+    # checkpoint compatible with either single CudnnCompatible cells or
+    # MultiRnnCells with one cell.
+    input_size = 3
+    save_cell_layer = single_cell_fn()
+    save_cell_layer(
+        inputs=array_ops.ones([1, input_size]),
+        state=save_cell_layer.zero_state(1, dtypes.float32))
+    self.assertTrue(save_cell_layer.variables)
+    expected_values = []
+    np.random.seed(10)
+    for variable in save_cell_layer.variables:
+      value = np.random.normal(size=variable.shape)
+      expected_values.append(value)
+      self.evaluate(variable.assign(value))
+    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=
+        lambda: rnn_cell_impl.MultiRNNCell([single_cell_fn()]),
+        cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=1,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testLSTMCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=1, num_units=num_units,
+            direction=direction, name="awesome_lstm"))
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testGRUCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    with self.assertRaises(NotImplementedError):
+      # TODO(allenl): Implement object-based saving for GRUs and other cells.
+      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+          single_cell_fn=functools.partial(
+              cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
+          cudnn_cell_fn=functools.partial(
+              cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
+              direction=direction, name="awesome_gru"))
+
+  def _CheckpointableMultiLayerTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn, num_layers):
+
+    def _MultiCellFn():
+      return rnn_cell_impl.MultiRNNCell(
+          [single_cell_fn() for _ in range(num_layers)])
+    input_size = 3
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      save_layer = _MultiCellFn()
+      save_layer(inputs=array_ops.ones([1, input_size]),
+                 state=save_layer.zero_state(1, dtypes.float32))
+      self.assertTrue(save_layer.variables)
+      expected_values = []
+      np.random.seed(10)
+      for variable in save_layer.variables:
+        value = np.random.normal(size=variable.shape)
+        expected_values.append(value)
+        self.evaluate(variable.assign(value))
+      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=_MultiCellFn, cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=num_layers,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testCudnnCompatibleLSTMCheckpointablMultiLayer(self):
+    num_units = 2
+    num_layers = 3
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableMultiLayerTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=num_layers, num_units=num_units,
+            direction=direction, name="awesome_lstm"),
+        num_layers=num_layers)
+
+
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
+class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -884,7 +1023,7 @@ class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
                               rtol=2e-5)
 
 
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
+class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
   def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size,
                             dtype, direction):
@@ -931,7 +1070,7 @@ class CudnnRNNTestParamsSize(TensorFlowTestCase):
                                    dtype, direction)
 
 
-class CudnnRNNTestTraining(TensorFlowTestCase):
+class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
   def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1):
     """Compute the numeric gradient of y wrt to x.
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 36fba917a8..00d9544602 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -142,6 +142,9 @@ class _CudnnRNN(base_layer.Layer):
   """
   # pylint:enable=line-too-long
 
+  # TODO(allenl): Document object-based saving and checkpoint compatibility once
+  # it's implemented for more cuDNN Layers.
+
   # The following are constants defined by subclasses.
   # Type of RNN cell.
   _rnn_mode = None
@@ -363,6 +366,11 @@ class _CudnnRNN(base_layer.Layer):
       self._create_saveable()
     self.built = True
 
+  def _gather_saveables_for_checkpoint(self):
+    raise NotImplementedError(
+        "This cell does not yet support object-based saving. File a feature "
+        "request if this limitation bothers you.")
+
   def call(self, inputs, initial_state=None, training=True):
     """Runs the forward step for the RNN model.
 
@@ -499,6 +507,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
+    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
+        checkpointable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
@@ -521,6 +531,16 @@ class CudnnLSTM(_CudnnRNN):
     return ([self.num_layers * self.num_dirs, batch_size, self.num_units],
             [self.num_layers * self.num_dirs, batch_size, self.num_units])
 
+  @property
+  def _gather_saveables_for_checkpoint(self):
+    if self._direction == CUDNN_RNN_UNIDIRECTION:
+      # Skip one inheritance level to avoid NotImplementedError.
+      return super(_CudnnRNN, self)._gather_saveables_for_checkpoint
+    else:
+      raise NotImplementedError(
+          "Object-based saving does not currently support bidirectional LSTM "
+          "cells. File a feature request if this limitation bothers you.")
+
 
 class _CudnnRNNNoInputC(_CudnnRNN):
   """Abstract simple CudnnRNN layer without input_c."""
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 2ac9442406..9796aae4b0 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
@@ -31,6 +32,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import saver
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
@@ -266,13 +268,16 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     # instead of having the master pull all slices and then save them.
     slice_spec = ""
     params = weights + biases
-    param_names = weight_names + bias_names
+    self._weight_names = weight_names
+    self._bias_names = bias_names
+    self._param_names = weight_names + bias_names
+    prefixed_param_names = weight_names + bias_names
     if self._scope:
-      param_names = ["%s/%s" % (self._scope, pn) for pn in param_names]
-
+      prefixed_param_names = [
+          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names]
     specs = [
         saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
-        for param, param_name in zip(params, param_names)
+        for param, param_name in zip(params, prefixed_param_names)
     ]
     super(CudnnOpaqueParamsSaveable, self).__init__(
         array_ops.identity(self._variables), specs, name)
@@ -285,6 +290,45 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
+  def _checkpointable_save(self, save_buffer):
+    weights, biases = self._OpaqueParamsToCanonical()
+    with ops.device("gpu:0"):
+      (weights, _), (biases, _) = self._TransformCanonical(
+          weights, biases)
+    for name, tensor in zip(self._param_names, weights + biases):
+      save_buffer[name] = array_ops.identity(tensor)
+
+  def _checkpointable_restore(self, restore_buffer):
+    tensors = [array_ops.identity(restore_buffer[name])
+               for name in self._param_names]
+    return self.restore(
+        restored_tensors=tensors,
+        restored_shapes=None  # Unused
+    )
+
+  def _add_checkpointable_dependencies(self, checkpointable, dtype):
+    """Add canonical weight dependencies to `checkpointable`.
+
+    When saving or restoring, converts to or from the opaque buffer
+    format. Weights are saved and loaded in the configuration expected by
+    cuDNN-compatible cells.
+
+    Args:
+      checkpointable: An object inheriting from `CheckpointableBase` to add
+        dependencies too (typically the cuDNN `Layer`).
+      dtype: The dtype for the canonical parameter Tensors.
+    """
+    split_dependencies = checkpointable_utils.split_dependency(
+        component_names=self._param_names,
+        component_dtypes=(dtype,) * len(self._param_names),
+        fill_save_buffer_fn=self._checkpointable_save,
+        consume_restore_buffer_fn=self._checkpointable_restore)
+    self._checkpointable_track_params(checkpointable, split_dependencies)
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Tracks parameters in a canonical configuration."""
+    return  # NotImplementedError raised by the Layer.
+
   def _TFCanonicalNamePrefix(self, layer, is_fwd=True):
     if self._direction == CUDNN_RNN_UNIDIRECTION:
       return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
@@ -574,6 +618,29 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_biases.append(b)
     tf_bias_names.append(prefix + "/bias")
 
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
+    biases = []
+    weights = []
+    for name in self._weight_names:
+      weights.append(params[name])
+    for name in self._bias_names:
+      biases.append(params[name])
+    assert len(params) == len(weights) + len(biases)
+    if len(weights) == 1 and len(biases) == 1:
+      # For single-layer cells, allow substituting a cell with no MultiRNNCell
+      # wrapping.
+      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
+      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
+      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
+      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+    assert len(biases) == len(weights)
+    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
+      cell = checkpointable_lib.Checkpointable()
+      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell.bias = bias
+      cell.kernel = kernel
+
 
 class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   """SaveableObject implementation handling Cudnn GRU opaque params."""
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 4b50260670..b08132cd72 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
@@ -265,6 +266,43 @@ class PrefetchToDeviceTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchSparseTensorsToDevice(self):
+    def make_tensor(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])
+    host_dataset = dataset_ops.Dataset.range(10).map(make_tensor)
+
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        actual = sess.run(next_element)
+        self.assertAllEqual([i], actual.values)
+        self.assertAllEqual([[0, 0]], actual.indices)
+        self.assertAllEqual([2, 2], actual.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 6ee1b572f1..f3e9302409 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -271,7 +271,8 @@ class ReadBatchFeaturesTest(test.TestCase):
                            reader_num_threads=1,
                            parser_num_threads=1,
                            shuffle=False,
-                           shuffle_seed=None):
+                           shuffle_seed=None,
+                           drop_final_batch=False):
     self.filenames = filenames
     self.num_epochs = num_epochs
     self.batch_size = batch_size
@@ -289,7 +290,8 @@ class ReadBatchFeaturesTest(test.TestCase):
         shuffle=shuffle,
         shuffle_seed=shuffle_seed,
         reader_num_threads=reader_num_threads,
-        parser_num_threads=parser_num_threads).make_one_shot_iterator(
+        parser_num_threads=parser_num_threads,
+        drop_final_batch=drop_final_batch).make_one_shot_iterator(
         ).get_next()
 
   def _record(self, f, r):
@@ -559,6 +561,20 @@ class ReadBatchFeaturesTest(test.TestCase):
               with self.assertRaises(errors.OutOfRangeError):
                 self._next_actual_batch(sess)
 
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default():
+          # Basic test: read from file 0.
+          self.outputs = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              drop_final_batch=True)
+          for _, tensor in self.outputs.items():
+            if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
+              self.assertEqual(tensor.shape[0], batch_size)
+
 
 class MakeCsvDatasetTest(test.TestCase):
 
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 77e23d0319..89c04dc89a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -25,10 +25,11 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -111,19 +112,7 @@ class _PrefetchToDeviceIterator(object):
           self._input_iterator.output_shapes,
           self._input_iterator.output_classes)
       ret = remote_iterator.get_next()
-
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
-      ret = nest.pack_sequence_as(ret, [
-          sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
-      ])
-
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
-      return nest.flatten(ret)
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
@@ -179,6 +168,68 @@ class _PrefetchToDeviceIterator(object):
   @property
   def output_types(self):
     return self._input_dataset.output_types
+
+
+class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               device,
+               buffer_size):
+    with ops.device("/device:CPU:0"):
+      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
+      input_iterator_handle = core_gen_dataset_ops.iterator_to_string_handle(
+          self._resource)
+
+    self._device = device
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self.output_types, self.output_shapes, self.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    _prefetch_fn.add_to_graph(None)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=gen_dataset_ops.iterator_get_device(self._resource),
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=iterator_ops._generate_shared_name(
+              "function_buffer_resource"))
+
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        ret = gen_dataset_ops.function_buffering_resource_get_next(
+            function_buffer_resource=self._buffering_resource,
+            output_types=self._flat_output_types)
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
 # pylint: enable=protected-access
 
 
@@ -190,12 +241,37 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset):
     self._device = device
     self._buffer_size = buffer_size if buffer_size is not None else 1
 
+  # The static analysis cannot tell that the eager iterator's superclass has
+  # a `next()` method.
+  # pylint: disable=non-iterator-returned
+  def __iter__(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    The returned iterator implements the Python iterator protocol and therefore
+    can only be used in eager mode.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      raise RuntimeError("dataset.__iter__() is only supported when eager "
+                         "execution is enabled.")
+  # pylint: enable=non-iterator-returned
+
   def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=True,
-        device=self._device,
-        buffer_size=self._buffer_size)
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
+                                       device=self._device,
+                                       buffer_size=self._buffer_size)
 
   def make_initializable_iterator(self, shared_name=None):
     return _PrefetchToDeviceIterator(
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 9a48aa02fb..b8eb09978e 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -370,7 +370,8 @@ def make_batched_features_dataset(file_pattern,
                                   prefetch_buffer_size=1,
                                   reader_num_threads=1,
                                   parser_num_threads=2,
-                                  sloppy_ordering=False):
+                                  sloppy_ordering=False,
+                                  drop_final_batch=False):
   """Returns a `Dataset` of feature dictionaries from `Example` protos.
 
   Example:
@@ -443,6 +444,9 @@ def make_batched_features_dataset(file_pattern,
       produced is deterministic prior to shuffling (elements are still
       randomized if `shuffle=True`. Note that if the seed is set, then order
       of elements after shuffling is deterministic). Defaults to `False`.
+    drop_final_batch: If `True`, and the batch size does not evenly divide the
+      input dataset size, the final smaller batch will be dropped. Defaults to
+      `False`.
 
   Returns:
     A dataset of `dict` elements. Each `dict` maps feature keys to
@@ -481,7 +485,10 @@ def make_batched_features_dataset(file_pattern,
   elif shuffle:
     dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
 
-  dataset = dataset.batch(batch_size)
+  if drop_final_batch:
+    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
+  else:
+    dataset = dataset.batch(batch_size)
 
   # Parse `Example` tensors to a dictionary of `Feature` tensors.
   dataset = dataset.map(
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 28483f4c88..14de1e8f49 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -117,7 +117,7 @@ in the input function gives a solid boost in performance. When using
 This feature is in early stages and there are a lot of improvements forthcoming:
 
 * Metrics are not yet supported during distributed training.
-* Summaries are currently computed in every tower.
+* Summaries are only computed in the first tower in `MirroredStrategy`.
 * Evaluation is not yet distributed.
 * Eager support is in the works; performance can be more challenging with eager
 execution.
@@ -129,10 +129,6 @@ effective batch size will be `num_gpus * batch_size`. Therefore, consider
 adjusting your learning rate or batch size according to the number of GPUs.
 We are working on addressing this limitation by splitting each batch across GPUs
 instead.
-* Dictionaries inside dataset in the input are not supported when prefetching
-on GPUs is turned on. (If you need to use dictionaries in the dataset, turn off
-prefetching on GPUs by passing param `prefetch_on_device=False` to
-`MirroredStrategy`)
 * PartitionedVariables are not supported yet.
 
 ## What's next?
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index 2b49b8f4ef..c5a520ab5a 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -61,7 +61,7 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           mode=['graph'],
           distribution=[
               combinations.one_device_strategy,
-              combinations.mirrored_strategy_without_prefetch
+              combinations.mirrored_strategy_with_gpu_and_cpu
           ]))
   def test_complete_flow_with_mode(self, distribution):
     label_dimension = 2
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
index e1ddf3cece..dfcbb8568f 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -45,10 +45,12 @@ class _PrefetchToDeviceIterator(object):
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
           handle, input_iterator.output_types, input_iterator.output_shapes,
           input_iterator.output_classes)
-      return remote_iterator.get_next()
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     target_device = gen_dataset_ops.iterator_get_device(
         input_iterator._iterator_resource)
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 9799901483..fec6eafd4a 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -490,6 +490,16 @@ cuda_py_test(
     tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
+cuda_py_test(
+    name = "seed_stream_test",
+    size = "small",
+    srcs = ["python/kernel_tests/seed_stream_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 cuda_py_test(
     name = "statistical_testing_test",
     size = "medium",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 4d4489468d..ddf59891e6 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -59,6 +59,7 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
+from tensorflow.contrib.distributions.python.ops.seed_stream import *
 from tensorflow.contrib.distributions.python.ops.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.test_util import *
 from tensorflow.contrib.distributions.python.ops.vector_diffeomixture import *
@@ -126,6 +127,7 @@ _allowed_symbols = [
     'NormalWithSoftplusScale',
     'Poisson',
     'PoissonLogNormalQuadratureCompound',
+    'SeedStream',
     'SinhArcsinh',
     'StudentT',
     'StudentTWithAbsDfSoftplusScale',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
index c6c8d2cf6e..59d549b7b8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -536,14 +536,14 @@ class _BatchReshapeTest(object):
 
     if self.is_static_shape:
       with self.assertRaisesRegexp(NotImplementedError,
-                                   "too few event dims"):
+                                   "too few batch and event dims"):
         poisson_141_reshaped.log_prob(x_4)
       with self.assertRaisesRegexp(NotImplementedError,
                                    "unexpected batch and event shape"):
         poisson_141_reshaped.log_prob(x_114)
       return
 
-    with self.assertRaisesOpError("too few event dims"):
+    with self.assertRaisesOpError("too few batch and event dims"):
       with self.test_session():
         poisson_141_reshaped.log_prob(x_4).eval()
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
new file mode 100644
index 0000000000..9680573317
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the SeedStream class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import seed_stream
+from tensorflow.python.platform import test
+
+
+class SeedStreamTest(test.TestCase):
+
+  def assertAllUnique(self, items):
+    self.assertEqual(len(items), len(set(items)))
+
+  def testNonRepetition(self):
+    # The probability of repetitions in a short stream from a correct
+    # PRNG is negligible; this test catches bugs that prevent state
+    # updates.
+    strm = seed_stream.SeedStream(seed=4, salt="salt")
+    output = [strm() for _ in range(50)]
+    self.assertEqual(sorted(output), sorted(list(set(output))))
+
+  def testReproducibility(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm3 = seed_stream.SeedStream(seed=4, salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertEqual(outputs, [strm2() for _ in range(50)])
+    self.assertEqual(outputs, [strm3() for _ in range(50)])
+
+  def testSeededDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=5, salt="salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testSaltedDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="another salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testNestingRobustness(self):
+    # SeedStreams started from generated seeds should not collide with
+    # the master or with each other, even if the salts are the same.
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(strm1(), salt="salt")
+    strm3 = seed_stream.SeedStream(strm1(), salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertAllUnique(
+        outputs + [strm2() for _ in range(50)] + [strm3() for _ in range(50)])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
index 3e6c35e0d6..bf5590cd55 100644
--- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -290,7 +290,7 @@ class BatchReshape(distribution_lib.Distribution):
           isinstance(expected_batch_event_ndims, int)):
         if x_ndims < expected_batch_event_ndims:
           raise NotImplementedError(
-              "Broadcasting is not supported; too few event dims "
+              "Broadcasting is not supported; too few batch and event dims "
               "(expected at least {}, saw {}).".format(
                   expected_batch_event_ndims, x_ndims))
         ndims_assertion = []
@@ -299,7 +299,8 @@ class BatchReshape(distribution_lib.Distribution):
             check_ops.assert_greater_equal(
                 x_ndims,
                 expected_batch_event_ndims,
-                message="Broadcasting is not supported; too few event dims.",
+                message=("Broadcasting is not supported; too few "
+                         "batch and event dims."),
                 name="assert_batch_and_event_ndims_large_enough"),
         ]
 
diff --git a/tensorflow/contrib/distributions/python/ops/seed_stream.py b/tensorflow/contrib/distributions/python/ops/seed_stream.py
new file mode 100644
index 0000000000..056d349688
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/seed_stream.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Local PRNG for amplifying seed entropy into seeds for base operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import hashlib
+
+
+class SeedStream(object):
+  """Local PRNG for amplifying seed entropy into seeds for base operations.
+
+  Writing sampling code which correctly sets the pseudo-random number
+  generator (PRNG) seed is surprisingly difficult.  This class serves as
+  a helper for the TensorFlow Probability coding pattern designed to
+  avoid common mistakes.
+
+  # Motivating Example
+
+  A common first-cut implementation of a sampler for the beta
+  distribution is to compute the ratio of a gamma with itself plus
+  another gamma.  This code snippet tries to do that, but contains a
+  surprisingly common error:
+
+  ```python
+  def broken_beta(shape, alpha, beta, seed):
+    x = tf.random_gamma(shape, alpha, seed=seed)
+    y = tf.random_gamma(shape, beta, seed=seed)
+    return x / (x + y)
+  ```
+
+  The mistake is that the two gamma draws are seeded with the same
+  seed.  This causes them to always produce the same results, which,
+  in turn, leads this code snippet to always return `0.5`.  Because it
+  can happen across abstraction boundaries, this kind of error is
+  surprisingly easy to make when handling immutable seeds.
+
+  # Goals
+
+  TensorFlow Probability adopts a code style designed to eliminate the
+  above class of error, without exacerbating others.  The goals of
+  this code style are:
+
+  - Support reproducibility of results (by encouraging seeding of all
+    pseudo-random operations).
+
+  - Avoid shared-write global state (by not relying on a global PRNG).
+
+  - Prevent accidental seed reuse by TF Probability implementers.  This
+    goal is served with the local pseudo-random seed generator provided
+    in this module.
+
+  - Mitigate potential accidental seed reuse by TF Probability clients
+    (with a salting scheme).
+
+  - Prevent accidental resonances with downstream PRNGs (by hashing the
+    output).
+
+  ## Non-goals
+
+  - Implementing a high-performance PRNG for generating large amounts of
+    entropy.  That's the job of the underlying TensorFlow PRNG we are
+    seeding.
+
+  - Avoiding random seed collisions, aka "birthday attacks".
+
+  # Code pattern
+
+  ```python
+  def random_beta(shape, alpha, beta, seed):        # (a)
+    seed = SeedStream(seed, salt="random_beta")     # (b)
+    x = tf.random_gamma(shape, alpha, seed=seed())  # (c)
+    y = tf.random_gamma(shape, beta, seed=seed())   # (c)
+    return x / (x + y)
+  ```
+
+  The elements of this pattern are:
+
+  - Accept an explicit seed (line a) as an argument in all public
+    functions, and write the function to be deterministic (up to any
+    numerical issues) for fixed seed.
+
+    - Rationale: This provides the client with the ability to reproduce
+      results.  Accepting an immutable seed rather than a mutable PRNG
+      object reduces code coupling, permitting different sections to be
+      reproducible independently.
+
+  - Use that seed only to initialize a local `SeedStream` instance (line b).
+
+    - Rationale: Avoids accidental seed reuse.
+
+  - Supply the name of the function being implemented as a salt to the
+    `SeedStream` instance (line b).  This serves to keep the salts
+    unique; unique salts ensure that clients of TF Probability will see
+    different functions always produce independent results even if
+    called with the same seeds.
+
+  - Seed each callee operation with the output of a unique call to the
+    `SeedStream` instance (lines c).  This ensures reproducibility of
+    results while preventing seed reuse across callee invocations.
+
+  # Why salt?
+
+  Salting the `SeedStream` instances (with unique salts) is defensive
+  programming against a client accidentally committing a mistake
+  similar to our motivating example.  Consider the following situation
+  that might arise without salting:
+
+  ```python
+  def tfp_foo(seed):
+    seed = SeedStream(seed, salt="")
+    foo_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def tfp_bar(seed):
+    seed = SeedStream(seed, salt="")
+    bar_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def client_baz(seed):
+    foo = tfp_foo(seed=seed)
+    bar = tfp_bar(seed=seed)
+    ...
+  ```
+
+  The client should have used different seeds as inputs to `foo` and
+  `bar`.  However, because they didn't, *and because `foo` and `bar`
+  both sample a Gaussian internally as their first action*, the
+  internal `foo_stuff` and `bar_stuff` will be the same, and the
+  returned `foo` and `bar` will not be independent, leading to subtly
+  incorrect answers from the client's simulation.  This kind of bug is
+  particularly insidious for the client, because it depends on a
+  Distributions implementation detail, namely the order in which `foo`
+  and `bar` invoke the samplers they depend on.  In particular, a
+  Bayesflow team member can introduce such a bug in previously
+  (accidentally) correct client code by performing an internal
+  refactoring that causes this operation order alignment.
+
+  A salting discipline eliminates this problem by making sure that the
+  seeds seen by `foo`'s callees will differ from those seen by `bar`'s
+  callees, even if `foo` and `bar` are invoked with the same input
+  seed.
+  """
+
+  def __init__(self, seed, salt):
+    """Initializes a `SeedStream`.
+
+    Args:
+      seed: Any Python object convertible to string, supplying the
+        initial entropy.  If `None`, operations seeded with seeds
+        drawn from this `SeedStream` will follow TensorFlow semantics
+        for not being seeded.
+      salt: Any Python object convertible to string, supplying
+        auxiliary entropy.  Must be unique across the Distributions
+        and TensorFlow Probability code base.  See class docstring for
+        rationale.
+    """
+    self._seed = seed
+    self._salt = salt
+    self._counter = 0
+
+  def __call__(self):
+    """Returns a fresh integer usable as a seed in downstream operations.
+
+    If this `SeedStream` was initialized with `seed=None`, returns
+    `None`.  This has the effect that downstream operations (both
+    `SeedStream`s and primitive TensorFlow ops) will behave as though
+    they were unseeded.
+
+    The returned integer is non-negative, and uniformly distributed in
+    the half-open interval `[0, 2**512)`.  This is consistent with
+    TensorFlow, as TensorFlow operations internally use the residue of
+    the given seed modulo `2**31 - 1` (see
+    `tensorflow/python/framework/random_seed.py`).
+
+    Returns:
+      seed: A fresh integer usable as a seed in downstream operations,
+        or `None`.
+    """
+    self._counter += 1
+    if self._seed is None:
+      return None
+    composite = str((self._seed, self._counter, self._salt)).encode("utf-8")
+    return int(hashlib.sha512(composite).hexdigest(), 16)
+
+  @property
+  def original_seed(self):
+    return self._seed
+
+  @property
+  def salt(self):
+    return self._salt
+
+# Design rationales for the SeedStream class
+#
+# - Salts are accepted for the reason given above to supply them.
+#
+# - A `None` seed propagates to downstream seeds, so they exhibit
+#   their "unseeded" behavior.
+#
+# - The return value is a Python int so it can be passed directly to
+#   TensorFlow operations as a seed.  It is large to avoid losing seed
+#   space needlessly (TF will internally read only the last 31 bits).
+#
+# - The output is hashed with a crypto-grade hash function as a form
+#   of defensive programming: this reliably prevents all possible
+#   accidental resonances with all possible downstream PRNGs.  The
+#   specific function used is not important; SHA512 was ready to hand.
+#
+# - The internal state update is a simple counter because (a) given
+#   that the output is hashed anyway, this is enough, and (b) letting
+#   it be this predictable permits a future "generate many seeds in
+#   parallel" operation whose results would agree with running
+#   sequentially.
diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
index d66c34cc1a..5c52015e5f 100644
--- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -12,7 +12,114 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Statistical test assertions calibrated for their error rates."""
+"""Statistical test assertions calibrated for their error rates.
+
+Statistical tests have an inescapable probability of error: a correct
+sampler can still fail a test by chance, and an incorrect sampler can
+still pass a test by chance.  This library is about bounding both of
+those error rates.  This requires admitting a task-specific notion of
+"discrepancy": Correct code will fail rarely, code that misbehaves by
+more than the discrepancy will pass rarely, and nothing reliable can
+be said about code that misbehaves, but misbehaves by less than the
+discrepancy.
+
+# Example
+
+Consider testing that the mean of a scalar probability distribution P
+is some expected constant.  Suppose the support of P is the interval
+`[0, 1]`.  Then you might do this:
+
+```python
+tfd = tf.contrib.distributions
+
+expected_mean = ...
+num_samples = 5000
+samples = ... draw 5000 samples from P
+
+# Check that the mean looks right
+check1 = tfd.assert_true_mean_equal_by_dkwm(
+    samples, low=0., high=1., expected=expected_mean,
+    false_fail_rate=1e-6)
+
+# Check that the difference in means detectable with 5000 samples is
+# small enough
+check2 = tf.assert_less(
+    tfd.min_discrepancy_of_true_means_detectable_by_dkwm(
+        num_samples, low=0., high=1.0,
+        false_fail_rate=1e-6, false_pass_rate=1e-6),
+    0.01)
+
+# Be sure to execute both assertion ops
+sess.run([check1, check2])
+```
+
+The second assertion is an instance of experiment design.  It's a
+deterministic computation (independent of the code under test) that
+checks that `5000` samples is enough to reliably resolve mean
+differences of `0.01` or more.  Here "reliably" means that if the code
+under test is correct, the probability of drawing an unlucky sample
+that causes this test to fail is at most 1e-6; and if the code under
+test is incorrect enough that its true mean is 0.01 more or less than
+expected, then the probability of drawing a "lucky" sample that causes
+the test to false-pass is also at most 1e-6.
+
+# Overview
+
+Every function in this library can be characterized in terms of:
+
+- The property being tested, such as the full density of the
+  distribution under test, or just its true mean, or a single
+  Bernoulli probability, etc.
+
+- The relation being asserted, e.g., whether the mean is less, more,
+  or equal to the given expected value.
+
+- The stochastic bound being relied upon, such as the
+  [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
+  or the CDF of the binomial distribution (for assertions about
+  Bernoulli probabilities).
+
+- The number of sample sets in the statistical test.  For example,
+  testing equality of means has a one-sample variant, where the
+  expected mean is given exactly, and a two-sample variant, where the
+  expected mean is itself given by a set of samples (e.g., from an
+  alternative algorithm).
+
+- What operation(s) of the test are to be performed.  Each test has
+  three of these:
+
+  1. `assert` executes the test.  Specifically, it creates a TF op that
+     produces an error if it has enough evidence to prove that the
+     property under test is violated.  These functions depend on the
+     desired false failure rate, because that determines the sizes of
+     appropriate confidence intervals, etc.
+
+  2. `min_discrepancy` computes the smallest difference reliably
+     detectable by that test, given the sample count and error rates.
+     What it's a difference of is test-specific.  For example, a test
+     for equality of means would make detection guarantees about the
+     difference the true means.
+
+  3. `min_num_samples` computes the minimum number of samples needed
+     to reliably detect a given discrepancy with given error rates.
+
+  The latter two are for experimental design, and are meant to be
+  usable either interactively or inline in the overall test method.
+
+This library follows a naming convention, to make room for every
+combination of the above.  A name mentions the operation first, then
+the property, then the relation, then the bound, then, if the test
+takes more than one set of samples, a token indicating this.  For
+example, `assert_true_mean_equal_by_dkwm` (which is implicitly
+one-sample).  Each name is a grammatically sound noun phrase (or verb
+phrase, for the asserts).
+
+# Asymptotic properties
+
+The number of samples needed tends to scale as `O(1/discrepancy**2)` and
+as `O(log(1/error_rate))`.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -40,7 +147,7 @@ __all__ = [
 
 
 def _batch_sort_vector(x, ascending=True, name=None):
-  with ops.name_scope(name, "sort_each_row", [x]):
+  with ops.name_scope(name, "_batch_sort_vector", [x]):
     x = ops.convert_to_tensor(x, name="x")
     n = array_ops.shape(x)[-1]
     if ascending:
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 99b1e098d5..0783d1b5d7 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -71,8 +71,15 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
       dataset: A `tf.data.Dataset` object.
 
     Raises:
+      TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
+    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
+      raise TypeError(
+          "`tf.contrib.data.prefetch_to_device()` is not compatible with "
+          "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
+          "over the dataset instead.")
+
     super(Iterator, self).__init__(dataset)
     if not context.context().device_spec.device_type:
       is_remote_device = False
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index c658505de4..f76a896d3d 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -24,6 +24,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
 from tensorflow.contrib.eager.python import checkpointable_utils
@@ -192,6 +193,18 @@ class IteratorTest(test.TestCase):
       x = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], x.numpy())
 
+  def testTensorsExplicitPrefetchToDevice(self):
+    ds = Dataset.from_tensor_slices([0., 1.])
+    ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
+
+    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
+      datasets.Iterator(ds)
+
+    for i, x in enumerate(ds):
+      with ops.device(test.gpu_device_name()):
+        x = math_ops.add(x, x)
+        self.assertEqual(float(i) + float(i), x.numpy())
+
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 266ae93305..201699ed77 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -97,7 +97,10 @@ def add_metrics(estimator, metric_fn):
   return estimator_lib.Estimator(
       model_fn=new_model_fn,
       model_dir=estimator.model_dir,
-      config=estimator.config)
+      config=estimator.config,
+      # pylint: disable=protected-access
+      warm_start_from=estimator._warm_start_settings)
+      # pylint: enable=protected-access
 
 
 def clip_gradients_by_norm(optimizer, clip_norm):
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 85ef3291ba..ae2fd8b490 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -41,11 +41,10 @@ from tensorflow.python.training import training_util
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
 
-# TODO(b/65403806): Switch loss_reduction default to SUM_OVER_BATCH_SIZE.
 def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM,
+                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
@@ -86,7 +85,8 @@ def multi_class_head(n_classes,
       have any value in `label_vocabulary`. Note that errors will be raised if
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -111,7 +111,7 @@ def binary_classification_head(
     weight_column=None,
     thresholds=None,
     label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     loss_fn=None,
     name=None):
   """Creates a `_Head` for single label binary classification.
@@ -155,7 +155,8 @@ def binary_classification_head(
       `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
diff --git a/tensorflow/contrib/integrate/__init__.py b/tensorflow/contrib/integrate/__init__.py
index 68bf511099..694f0c14bd 100644
--- a/tensorflow/contrib/integrate/__init__.py
+++ b/tensorflow/contrib/integrate/__init__.py
@@ -18,6 +18,7 @@
 See the @{$python/contrib.integrate} guide.
 
 @@odeint
+@@odeint_fixed
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index ced1110676..d11c9c8288 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -85,9 +85,9 @@ class FisherEstimator(object):
     """Create a FisherEstimator object.
 
     Args:
-      variables: A list of the variables for which to estimate the Fisher. This
-          must match the variables registered in layer_collection (if it is not
-          None).
+      variables: A `list` of variables or `callable` which returns the variables
+          for which to estimate the Fisher. This must match the variables
+          registered in layer_collection (if it is not None).
       cov_ema_decay: The decay factor used when calculating the covariance
           estimate moving averages.
       damping: float. The damping factor used to stabilize training due to
@@ -147,7 +147,10 @@ class FisherEstimator(object):
 
   @property
   def variables(self):
-    return self._variables
+    if callable(self._variables):
+      return self._variables()
+    else:
+      return self._variables
 
   @property
   def damping(self):
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 19608aca47..411da033c3 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -84,7 +84,7 @@ _EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES = {
     APPROX_KRONECKER_INDEP_NAME: fb.EmbeddingKFACMultiIndepFB
 }
 
-# Possible value for 'reuse' keyword argument. Sets 'reuse' to
+# Possible value for `reuse` keyword argument. Sets `reuse` to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
@@ -294,8 +294,8 @@ class LayerCollection(object):
       layer_key: A variable or tuple of variables. The key to check for in
           existing registrations and to register if valid.
       fisher_block: The associated `FisherBlock`.
-      reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
-        or 'VARIABLE_SCOPE'.
+      reuse: Method to use for inserting new `FisherBlock's. One of True, False,
+        or `VARIABLE_SCOPE`.
 
     Raises:
       ValueError: If `layer_key` was already registered and reuse is `False`,
@@ -359,15 +359,14 @@ class LayerCollection(object):
         is None.
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
+      reuse: (OPTIONAL) bool or str.  If True, adds `loss` as an additional
+        tower for the existing loss function.
 
     Raises:
       ValueError: If reuse == True and name == None.
       ValueError: If reuse == True and seed != None.
-      KeyError: If reuse == True and no existing LossFunction with 'name' found.
-      KeyError: If reuse == False and existing LossFunction with 'name' found.
+      KeyError: If reuse == True and no existing LossFunction with `name` found.
+      KeyError: If reuse == False and existing LossFunction with `name` found.
     """
 
     name = name or self._graph.unique_name(base_name)
@@ -491,24 +490,24 @@ class LayerCollection(object):
     """
     params = frozenset(utils.ensure_sequence(params))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.fisher_blocks.keys()'.
+    # Check if any of the variables in `params` is already in
+    # 'self.fisher_blocks.keys()`.
     for registered_params, fisher_block in self.fisher_blocks.items():
       registered_params_set = set(utils.ensure_sequence(registered_params))
       for variable in params:
         if (variable in registered_params_set and
             params != registered_params_set):
           raise ValueError(
-              "Can't link parameters {}, variable {} was already registered in "
+              "Can`t link parameters {}, variable {} was already registered in "
               "group {} with layer {}".format(params, variable,
                                               registered_params, fisher_block))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.linked_parameters'.
+    # Check if any of the variables in `params` is already in
+    # 'self.linked_parameters`.
     for variable in params:
       for other_linked_params in self.linked_parameters:
         if variable in other_linked_params:
-          raise ValueError("Can't link parameters {}, variable {} was already "
+          raise ValueError("Can`t link parameters {}, variable {} was already "
                            "linked in group {}.".format(params, variable,
                                                         other_linked_params))
     self._linked_parameters[params] = approximation
@@ -576,15 +575,15 @@ class LayerCollection(object):
         produced by layer.
       approx: str or None. If not None must be "kron".  The Fisher
         approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -618,15 +617,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
@@ -669,15 +668,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
@@ -686,7 +685,7 @@ class LayerCollection(object):
         _CONV2D_APPROX_TO_BLOCK_TYPES)
 
     # It feels bad to pass in configuration that has to do with the internal
-    # implementation.  And then we can't use the same constructor for both
+    # implementation.  And then we can`t use the same constructor for both
     # anymore and are thus forced to use this ugly if-statement.
     # TODO(b/74793309): Clean this up?
     if approx == APPROX_KRONECKER_NAME:
@@ -749,15 +748,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     # TODO(b/74793309): Have this use _get_block_type like the other
@@ -804,15 +803,15 @@ class LayerCollection(object):
       data_format: str or None. Format of data.
       approx: str or None. If not None must "diagonal".  The Fisher
         approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     # TODO(b/74793309): Have this use _get_block_type like the other
@@ -872,15 +871,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     self.register_depthwise_conv2d(
@@ -917,14 +916,14 @@ class LayerCollection(object):
       approx: str or None. It not None, must be one of "full" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str. If True, this adds 'batch_size' to the total
+      reuse: bool or str. If True, this adds `batch_size` to the total
         mini-batch size use when estimating the Fisher block for this layer
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -954,10 +953,10 @@ class LayerCollection(object):
         correspond to a "time-step" in an RNN). OR, can be single Tensor, of
         shape [num_uses * batch_size , input_size], which is a reshaped version
         of a Tensor of shape [num_uses, batch_size, input_size].
-      outputs: A list of Tensors, the same length as 'inputs', each of shape
+      outputs: A list of Tensors, the same length as `inputs`, each of shape
         [batch_size, output_size]. Outputs produced by layer. The list indexes
         each use in the graph (which might correspond to a "time-step" in an
-        RNN). Needs to correspond with the order used in 'inputs'.  OR, can be
+        RNN). Needs to correspond with the order used in `inputs`.  OR, can be
         a single Tensor of shape [num_uses * batch_size, output_size], which is
         a reshaped version of a Tensor of shape [num_uses, batch_size,
         output_size].
@@ -967,16 +966,16 @@ class LayerCollection(object):
       approx: str or None. If not None, must be of "kron_indep", "kron_series_1"
         or "kron_series_2". The Fisher approximation to use. If None the default
         value is used. (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
+      ValueError: For improper value to `approx`.
     """
     block_type, approx = self._get_block_type(
         params, approx, self.default_fully_connected_multi_approximation,
@@ -1025,7 +1024,7 @@ class LayerCollection(object):
       outputs: A list of Tensors, each of shape [batch_size, height, width,
         out_channels]. Output produced by layer. The list indexes each use
         in the graph (which might correspond to a "time-step" in an RNN).
-        Needs to correspond with the order used in 'inputs'.  OR, can be a
+        Needs to correspond with the order used in `inputs`.  OR, can be a
         single Tensor, of shape [num_uses * batch_size, height, width,
         out_channels], which is a reshaped version of a Tensor of shape
         [num_uses, batch_size, height, width, out_channels].
@@ -1037,17 +1036,17 @@ class LayerCollection(object):
       approx: str or None. If not None must by "kron_indep". The Fisher
         approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -1098,7 +1097,7 @@ class LayerCollection(object):
       outputs: A list of Tensors, each of shape [batch_size, embedding_size].
         Outputs produced by layer. The list indexes each use in the graph
         (which might correspond to a "time-step" in an RNN). Needs to
-        correspond with the order used in 'inputs'. OR, can be a
+        correspond with the order used in `inputs`. OR, can be a
         single Tensor, of shape [num_uses * batch_size, embedding_size], which
         is a reshaped version of a Tensor of shape [num_uses, batch_size,
         embedding_size].
@@ -1108,17 +1107,17 @@ class LayerCollection(object):
       approx: str or None. If not None must by "kron_indep". The Fisher
         approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -1156,7 +1155,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'logits' as an additional
+      reuse: bool or str.  If True, this adds `logits` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1190,7 +1189,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'mean' and 'var' as an additional
+      reuse: bool or str.  If True, this adds `mean` and `var` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1219,7 +1218,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'logits' as an additional
+      reuse: bool or str.  If True, this adds `logits` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1231,18 +1230,18 @@ class LayerCollection(object):
                                 name=name, reuse=reuse)
 
   def make_or_get_factor(self, cls, args):
-    """Insert 'cls(args)' into 'self.fisher_factors' if not already present.
+    """Insert `cls(args)` into 'self.fisher_factors` if not already present.
 
-    Wraps constructor in 'tf.variable_scope()' to ensure variables constructed
-    in 'cls.__init__' are placed under this LayerCollection's scope.
+    Wraps constructor in `tf.variable_scope()` to ensure variables constructed
+    in `cls.__init__` are placed under this LayerCollection's scope.
 
     Args:
       cls: Class that implements FisherFactor.
-      args: Tuple of arguments to pass into 'cls's constructor. Must be
+      args: Tuple of arguments to pass into `cls's constructor. Must be
         hashable.
 
     Returns:
-      Instance of 'cls' found in self.fisher_factors.
+      Instance of `cls` found in self.fisher_factors.
     """
     try:
       hash(args)
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 843aeef7d8..f01c5a8322 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -108,13 +108,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       ValueError: If momentum is non-zero and momentum_type is not 'regular'
           or 'adam'.
     """
-
-    variables = var_list
-    if variables is None:
-      variables = tf_variables.trainable_variables()
-
     # Parameters to be passed to the Fisher estimator:
-    self._variables = variables
+    self._variables = var_list or tf_variables.trainable_variables
     self._cov_ema_decay = cov_ema_decay
     self._layers = layer_collection
     self._estimation_mode = estimation_mode
@@ -235,7 +230,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
 
   @property
   def variables(self):
-    return self._variables
+    return self._fisher_est.variables
 
   @property
   def damping(self):
@@ -373,6 +368,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     else:
       kwargs["var_list"] = kwargs.get("var_list") or self.variables
       var_list = kwargs["var_list"]
+
     if set(var_list) != set(self.variables):
       raise ValueError("var_list doesn't match with set of Fisher-estimating "
                        "variables.")
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index cfe62fac43..ac50699f59 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import random
 import threading
 
 from tensorflow.contrib.linear_optimizer.python.ops.sdca_ops import SdcaModel
@@ -102,6 +103,33 @@ def make_example_dict(example_protos, example_weights):
       example_ids=['%d' % i for i in range(0, len(example_protos))])
 
 
+def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero):
+  random.seed(1)
+  sparse_features = [
+      SparseFeatureColumn(
+          [int(i / num_non_zero) for i in range(num_examples * num_non_zero)],
+          [int(random.random() * dim) for _ in range(
+              num_examples * num_non_zero)],
+          [num_non_zero**(-0.5) for _ in range(num_examples * num_non_zero)])
+  ]
+  examples_dict = dict(
+      sparse_features=sparse_features,
+      dense_features=[],
+      example_weights=[random.random() for _ in range(num_examples)],
+      example_labels=[
+          1. if random.random() > 0.5 else 0. for _ in range(num_examples)
+      ],
+      example_ids=[str(i) for i in range(num_examples)])
+
+  weights = variables_lib.Variable(
+      array_ops.zeros([dim], dtype=dtypes.float32))
+  variables_dict = dict(
+      sparse_features_weights=[weights],
+      dense_features_weights=[])
+
+  return examples_dict, variables_dict
+
+
 def make_variable_dict(max_age, max_gender):
   # TODO(sibyl-toe9oF2e):  Figure out how to derive max_age & max_gender from
   # examples_dict.
@@ -235,6 +263,32 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         self.assertAllClose(
             0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2)
 
+  def testSparseRandom(self):
+    dim = 20
+    num_examples = 1000
+    # Number of non-zero features per example.
+    non_zeros = 10
+    # Setup test data.
+    with self._single_threaded_test_session():
+      examples, variables = make_random_examples_and_variables_dicts(
+          num_examples, dim, non_zeros)
+      options = dict(
+          symmetric_l2_regularization=.1,
+          symmetric_l1_regularization=0,
+          num_table_shards=1,
+          adaptive=False,
+          loss_type='logistic_loss')
+
+      lr = SdcaModel(examples, variables, options)
+      variables_lib.global_variables_initializer().run()
+      train_op = lr.minimize()
+      for _ in range(4):
+        train_op.run()
+      lr.update_weights(train_op).run()
+      # Duality gap is 1.4e-5.
+      # It would be 0.01 without shuffling and 0.02 with adaptive sampling.
+      self.assertNear(0.0, lr.approximate_duality_gap().eval(), err=1e-3)
+
   def testDistributedSimple(self):
     # Setup test data
     example_protos = [
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 3f5fdc18bb..f980746a19 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -168,6 +168,10 @@ class SdcaModel(object):
     # of workers
     return self._options.get('num_loss_partitions', 1)
 
+  def _adaptive(self):
+    # Perform adaptive sampling.
+    return self._options.get('adaptive', True)
+
   def _num_table_shards(self):
     # Number of hash table shards.
     # Return 1 if not specified or if the value is 'None'
@@ -344,7 +348,8 @@ class SdcaModel(object):
           l1=self._options['symmetric_l1_regularization'],
           l2=self._symmetric_l2_regularization(),
           num_loss_partitions=self._num_loss_partitions(),
-          num_inner_iterations=1)
+          num_inner_iterations=1,
+          adaptative=self._adaptive())
       # pylint: enable=protected-access
 
       with ops.control_dependencies([esu]):
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 92d022f2a3..dffdddacfb 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -71,12 +71,14 @@ class SDCAOptimizer(object):
                num_loss_partitions=1,
                num_table_shards=None,
                symmetric_l1_regularization=0.0,
-               symmetric_l2_regularization=1.0):
+               symmetric_l2_regularization=1.0,
+               adaptive=True):
     self._example_id_column = example_id_column
     self._num_loss_partitions = num_loss_partitions
     self._num_table_shards = num_table_shards
     self._symmetric_l1_regularization = symmetric_l1_regularization
     self._symmetric_l2_regularization = symmetric_l2_regularization
+    self._adaptive = adaptive
 
   def get_name(self):
     return 'SDCAOptimizer'
@@ -101,6 +103,10 @@ class SDCAOptimizer(object):
   def symmetric_l2_regularization(self):
     return self._symmetric_l2_regularization
 
+  @property
+  def adaptive(self):
+    return self._adaptive
+
   def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
                      features, targets, global_step):
     """Returns the training operation of an SdcaModel optimizer."""
@@ -228,6 +234,7 @@ class SDCAOptimizer(object):
         options=dict(
             symmetric_l1_regularization=self._symmetric_l1_regularization,
             symmetric_l2_regularization=self._symmetric_l2_regularization,
+            adaptive=self._adaptive,
             num_loss_partitions=self._num_loss_partitions,
             num_table_shards=self._num_table_shards,
             loss_type=loss_type))
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 7f7a2632dd..b14230acd7 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -46,6 +46,23 @@ android_library(
     ],
 )
 
+java_library(
+    name = "ovicbenchmarkerlib",
+    srcs = [
+        "ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+    ],
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlite_java",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
 java_library(
     name = "tensorflowlitelib",
     srcs = glob(
@@ -147,6 +164,28 @@ java_test(
     ],
 )
 
+java_test(
+    name = "OvicClassifierTest",
+    size = "medium",
+    srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+    data = [
+        "ovic/src/testdata/float_model.lite",
+        "ovic/src/testdata/labels.txt",
+        "ovic/src/testdata/low_res_model.lite",
+        "ovic/src/testdata/quantized_model.lite",
+        "ovic/src/testdata/test_image_128.jpg",
+        "ovic/src/testdata/test_image_224.jpg",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.ovic.OvicClassifierTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":ovicbenchmarkerlib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 filegroup(
     name = "libtensorflowlite_jni",
     srcs = select({
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
new file mode 100644
index 0000000000..76c33838bf
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -0,0 +1,83 @@
+# Benchmarker for LPIRC Workshop at CVPR 2018
+
+This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
+
+## Pre-requesits
+
+Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
+
+## To test the benchmarker:
+
+The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
+
+Note: for now the tests only provides correctness checks, i.e. classifier predicts the correct category on the test image, but no on-device latency measurements. To test the latency measurement functionality, the tests will print the latency running on a desktop computer, which is not indicative of the on-device run-time.
+We are releasing an benchmarker Apk that would allow developers to measure latency on their own devices.
+
+### Obtain the sample models
+
+The test data (models and images) should be downloaded automatically for you by Bazel. In case they are not, you can manually install them as below.
+
+Note: all commands should be called from your tensorflow installation folder (under this folder you should find `tensorflow/contrib/lite`).
+
+
+* Download the [testdata package](https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip):
+
+```sh
+curl -L https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip -o /tmp/ovic.zip
+```
+
+* Unzip the package into the testdata folder:
+
+```sh
+unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+### Run tests
+
+You can run test with Bazel as below. This helps to ensure that the installation is correct.
+
+```sh
+bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java:OvicClassifierTest --test_output=all
+```
+
+### Test your submissions
+
+Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it as below.
+
+* Move your submission to the testdata folder:
+
+Let say the submission file is located at `/tmp/my_model.lite`, then
+
+```sh
+cp /tmp/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+* Resize the test image to the resolutions that are expected by your submission:
+
+The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
+
+* Add your model and test image to the BUILD rule:
+
+```JSON
+java_test(
+  name = "OvicClassifierTest",
+  size = "medium",
+  srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+  data = [
+      "ovic/src/testdata/float_model.lite",
+      "ovic/src/testdata/labels.txt",
+      "ovic/src/testdata/low_res_model.lite",
+      "ovic/src/testdata/quantized_model.lite",
+      "ovic/src/testdata/test_image_128.jpg",
+      "ovic/src/testdata/test_image_224.jpg",
+      "ovic/src/testdata/my_model.lite",        # <--- Your submission.
+      "ovic/src/testdata/my_test_image.jpg",    # <--- Your test image.
+  ],
+      ...
+```
+
+* Modify `OvicClassifierTest.java` to test your model.
+
+Change `TEST_IMAGE_PATH` to `testdata/my_test_image.jpg`. If your model runs inference in floating point, change `FLOAT_MODEL_PATH` to `testdata/my_model.lite`. If your model runs [quantized inference](https://www.tensorflow.org/performance/quantization), change `QUANTIZED_MODEL_PATH` to `testdata/my_model.lite`.
+
+Now you can run the bazel tests to catch any runtime issues with the submission.
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
index 4fd23a99d2..098ed8ceba 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -26,7 +26,6 @@ import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
-import java.nio.file.Paths;
 import javax.imageio.ImageIO;
 import org.junit.Before;
 import org.junit.Test;
@@ -45,27 +44,33 @@ public final class OvicClassifierTest {
   private ByteBuffer testImage = null;
   private ByteBuffer lowResTestImage = null;
   private OvicSingleImageResult testResult = null;
-  private static final String LABELS_PATH = "testdata/labels.txt";
-  private static final String QUANTIZED_MODEL_PATH = "testdata/quantized_model.lite";
-  private static final String LOW_RES_MODEL_PATH = "testdata/low_res_model.lite";
-  private static final String FLOAT_MODEL_PATH = "testdata/float_model.lite";
-  private static final String TEST_IMAGE_PATH = "testdata/test_image_224.jpg";
-  private static final String TEST_LOW_RES_IMAGE_PATH = "testdata/test_image_128.jpg";
+  private static final String LABELS_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+  private static final String QUANTIZED_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/quantized_model.lite";
+  private static final String LOW_RES_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/low_res_model.lite";
+  private static final String FLOAT_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/float_model.lite";
+  private static final String TEST_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_224.jpg";
+  private static final String TEST_LOW_RES_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_128.jpg";
   private static final int TEST_IMAGE_GROUNDTRUTH = 653; // "military uniform"
 
   @Before
   public void setUp() {
     try {
-      File labelsfile = new File(getTestDir(LABELS_PATH));
+      File labelsfile = new File(LABELS_PATH);
       labelsInputStream = new FileInputStream(labelsfile);
-      quantizedModel = loadModelFile(getTestDir(QUANTIZED_MODEL_PATH));
-      floatModel = loadModelFile(getTestDir(FLOAT_MODEL_PATH));
-      lowResModel = loadModelFile(getTestDir(LOW_RES_MODEL_PATH));
-      File imageFile = new File(getTestDir(TEST_IMAGE_PATH));
+      quantizedModel = loadModelFile(QUANTIZED_MODEL_PATH);
+      floatModel = loadModelFile(FLOAT_MODEL_PATH);
+      lowResModel = loadModelFile(LOW_RES_MODEL_PATH);
+      File imageFile = new File(TEST_IMAGE_PATH);
       BufferedImage img = ImageIO.read(imageFile);
       testImage = toByteBuffer(img);
       // Low res image and models.
-      imageFile = new File(getTestDir(TEST_LOW_RES_IMAGE_PATH));
+      imageFile = new File(TEST_LOW_RES_IMAGE_PATH);
       img = ImageIO.read(imageFile);
       lowResTestImage = toByteBuffer(img);
     } catch (IOException e) {
@@ -74,10 +79,6 @@ public final class OvicClassifierTest {
     System.out.println("Successful setup");
   }
 
-  private static String getTestDir(String testfile) throws IOException {
-    return Paths.get("third_party/tensorflow/contrib/lite/java/ovic/src/", testfile).toString();
-  }
-
   @Test
   public void ovicClassifier_quantizedModelCreateSuccess() throws Exception {
     classifier = new OvicClassifier(labelsInputStream, quantizedModel);
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index a619ada86a..45ea8d0049 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -67,10 +67,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
     TF_LITE_ENSURE_EQ(context, t->dims->size, t0->dims->size);
     TF_LITE_ENSURE_EQ(context, t->type, input_type);
-    if (input_type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, t->params.zero_point, t0->params.zero_point);
-      TF_LITE_ENSURE_EQ(context, t->params.scale, t0->params.scale);
-    }
     for (int d = 0; d < t0->dims->size; ++d) {
       if (d == axis) {
         sum_axis += t->dims->data[axis];
@@ -87,11 +83,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
-  if (input_type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point,
-                      t0->params.zero_point);
-    TF_LITE_ENSURE_EQ(context, output->params.scale, t0->params.scale);
-  }
 
   return context->ResizeTensor(context, output, output_size);
 }
@@ -115,6 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
       GetTensorDims(output))
 
+#define TF_LITE_CONCATENATION_QUANTIZED(type)                                  \
+  VectorOfQuantizedTensors all_inputs(*context, *node->inputs);                \
+  type::Concatenation(                                                         \
+      RemapDim(NumDimensions(output), axis), all_inputs.data(),                \
+      all_inputs.dims(), all_inputs.zero_point(), all_inputs.scale(),          \
+      node->inputs->size, GetTensorData<uint8>(output), GetTensorDims(output), \
+      output->params.zero_point, output->params.scale)
+
   switch (output->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
@@ -125,9 +124,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(reference_ops);
       } else {
-        TF_LITE_CONCATENATION(optimized_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
       }
       break;
     default:
@@ -136,6 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 
+#undef TF_LITE_CONCATENATION_QUANTIZED
 #undef TF_LITE_CONCATENATION
 
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index ba1ffc5f84..467ff6f7e1 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -28,6 +28,7 @@ class BaseConcatenationOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): Also test different activation types, axis, input
   // dimensions.
+  BaseConcatenationOpModel() {}
   BaseConcatenationOpModel(const TensorData& input_template, int axis,
                            int num_inputs) {
     std::vector<std::vector<int>> all_input_shapes;
@@ -60,6 +61,23 @@ class ConcatenationOpModel : public BaseConcatenationOpModel {
 class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
  public:
   using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  QuantizedConcatenationOpModel(const std::vector<TensorData>& input_template,
+                                int axis, int num_inputs,
+                                const TensorData& output_template) {
+    std::vector<std::vector<int>> all_input_shapes;
+    CHECK_EQ(input_template.size(), num_inputs);
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template[i].shape);
+      AddInput(input_template[i]);
+    }
+    output_ = AddOutput({output_template.type, /*shape=*/{},
+                         output_template.min, output_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
   void SetInput(int index, std::initializer_list<float> data) {
     QuantizeAndPopulate<uint8_t>(index, data);
   }
@@ -168,6 +186,56 @@ TEST(ConcatenationOpTest, FourInputsQuantized) {
                               }));
 }
 
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRangeClampingLogic) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -1., 1.});
+
+  m0.SetInput(0, {1.0f, -3.0f, -4.0f, -7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, -3.2f, -4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f,   //
+                      -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,  //
+                  },
+                  4e-3)));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  255, 0, 255, 255, 255, 0, 255, 255,  //
+                                  0, 0, 255, 255, 0, 255, 255, 255,    //
+                              }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 3642da311c..9a274612ad 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2732,6 +2732,62 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the reference implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 3575974ae9..31e190e248 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1566,6 +1566,61 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the optimized implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int64_t outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 62e38e0d4c..4bce2ffaaf 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -126,6 +126,29 @@ class VectorOfTensors {
   std::vector<Dims<4>*> all_dims_ptr_;
 };
 
+// A list of quantized tensors in a format that can be used by kernels like
+// split and concatenation.
+class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfQuantizedTensors(const TfLiteContext& context,
+                           const TfLiteIntArray& tensor_list)
+      : VectorOfTensors<uint8>(context, tensor_list) {
+    for (int i = 0; i < tensor_list.size; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      zero_point_.push_back(t->params.zero_point);
+      scale_.push_back(t->params.scale);
+    }
+  }
+
+  const float* scale() const { return scale_.data(); }
+  const int32* zero_point() const { return zero_point_.data(); }
+
+ private:
+  std::vector<int32> zero_point_;
+  std::vector<float> scale_;
+};
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 606f4a5635..3448de68e8 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,6 +30,13 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+// Ensure that ErrorReporter is non-null.
+ErrorReporter* ValidateErrorReporter(ErrorReporter* e) {
+  return e ? e : DefaultErrorReporter();
+}
+}  // namespace
+
 const char* kEmptyTensorName = "";
 
 TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
@@ -78,6 +85,8 @@ std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
     const char* filename, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
@@ -89,6 +98,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
 std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
     const char* filename, TfLiteVerifier* verifier,
     ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
@@ -104,6 +115,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
     const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   Allocation* allocation =
       new MemoryAllocation(buffer, buffer_size, error_reporter);
@@ -114,6 +127,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
     const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   model.reset(new FlatBufferModel(model_spec, error_reporter));
   if (!model->initialized()) model.reset();
@@ -133,15 +148,13 @@ bool FlatBufferModel::CheckModelIdentifier() const {
 
 FlatBufferModel::FlatBufferModel(const Model* model,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
   model_ = model;
 }
 
 FlatBufferModel::FlatBufferModel(Allocation* allocation,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
   allocation_ = allocation;
   if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
@@ -154,7 +167,7 @@ InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
                                        const OpResolver& op_resolver)
     : model_(model.GetModel()),
       op_resolver_(op_resolver),
-      error_reporter_(model.error_reporter()),
+      error_reporter_(ValidateErrorReporter(model.error_reporter())),
       allocation_(model.allocation()) {}
 
 InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
@@ -162,8 +175,7 @@ InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
                                        ErrorReporter* error_reporter)
     : model_(model),
       op_resolver_(op_resolver),
-      error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {}
+      error_reporter_(ValidateErrorReporter(error_reporter)) {}
 
 TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
   TfLiteStatus status = kTfLiteOk;
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 036dc46e03..5a55b031a8 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -56,27 +56,37 @@ class TfLiteVerifier {
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
  public:
-  // Builds a model based on a file. Returns a nullptr in case of failure.
+  // Builds a model based on a file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromFile(
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Verifies whether the content of the file is legit, then builds a model
-  // based on the file. Returns a nullptr in case of failure.
+  // based on the file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
       const char* filename, TfLiteVerifier* verifier = nullptr,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model directly from a flatbuffer pointer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromModel(
       const tflite::Model* model_spec,
       ErrorReporter* error_reporter = DefaultErrorReporter());
@@ -100,7 +110,10 @@ class FlatBufferModel {
 
  private:
   // Loads a model from a given allocation. FlatBufferModel will take over the
-  // ownership of `allocation`, and delete it in desctructor.
+  // ownership of `allocation`, and delete it in destructor. The ownership of
+  // `error_reporter`remains with the caller and must have lifetime at least
+  // as much as FlatBufferModel. This is to allow multiple models to use the
+  // same ErrorReporter instance.
   FlatBufferModel(Allocation* allocation,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
@@ -111,7 +124,10 @@ class FlatBufferModel {
   // Flatbuffer traverser pointer. (Model* is a pointer that is within the
   // allocated memory of the data allocated by allocation's internals.
   const tflite::Model* model_ = nullptr;
+  // The error reporter to use for model errors and subsequent errors when
+  // the interpreter is created
   ErrorReporter* error_reporter_;
+  // The allocator used for holding memory of the model.
   Allocation* allocation_ = nullptr;
 };
 
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 621fbcb98d..1f3ea2e1c7 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -200,6 +200,12 @@ void DeallocateTransientArray(const Model& model, const string& array_name,
   allocator->Deallocate(*array->alloc);
 }
 
+void PushBackIfNotFound(const string& s, std::vector<string>* v) {
+  if (std::find(v->begin(), v->end(), s) == v->end()) {
+    v->push_back(s);
+  }
+}
+
 }  // namespace
 
 void AllocateTransientArrays(Model* model,
@@ -251,18 +257,12 @@ void AllocateTransientArrays(Model* model,
     std::vector<string> arrays_to_allocate;
     for (const auto& input : op->inputs) {
       if (StartsAt(array_lifespans[input], op_index)) {
-        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
-                      input) == arrays_to_allocate.end()) {
-          arrays_to_allocate.push_back(input);
-        }
+        PushBackIfNotFound(input, &arrays_to_allocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (StartsAt(array_lifespans[output], op_index)) {
-        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
-                      output) == arrays_to_allocate.end()) {
-          arrays_to_allocate.push_back(output);
-        }
+        PushBackIfNotFound(output, &arrays_to_allocate);
       }
     }
     for (const string& array : arrays_to_allocate) {
@@ -274,18 +274,12 @@ void AllocateTransientArrays(Model* model,
     std::vector<string> arrays_to_deallocate;
     for (const auto& input : op->inputs) {
       if (EndsAt(array_lifespans[input], op_index)) {
-        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
-                      input) == arrays_to_deallocate.end()) {
-          arrays_to_deallocate.push_back(input);
-        }
+        PushBackIfNotFound(input, &arrays_to_deallocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (EndsAt(array_lifespans[output], op_index)) {
-        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
-                      output) == arrays_to_deallocate.end()) {
-          arrays_to_deallocate.push_back(output);
-        }
+        PushBackIfNotFound(output, &arrays_to_deallocate);
       }
     }
     for (const string& array : arrays_to_deallocate) {
@@ -310,17 +304,21 @@ void AllocateTransientArrays(Model* model,
     // for each operator, compute the sum of the sizes of the array that must
     // be live during the execution of this operator, plus the size of
     // persistent arrays that must be live at all times.
-    std::size_t size = persistent_alloc_size;
+    std::vector<string> non_persistent_edges;
     for (const auto& input : op->inputs) {
       if (!array_lifespans[input].persistent) {
-        size += TransientArraySize(*model, input, transient_data_alignment);
+        PushBackIfNotFound(input, &non_persistent_edges);
       }
     }
     for (const auto& output : op->outputs) {
       if (!array_lifespans[output].persistent) {
-        size += TransientArraySize(*model, output, transient_data_alignment);
+        PushBackIfNotFound(output, &non_persistent_edges);
       }
     }
+    std::size_t size = persistent_alloc_size;
+    for (const string& edge : non_persistent_edges) {
+      size += TransientArraySize(*model, edge, transient_data_alignment);
+    }
     // The optimal total size is the maximum of all operator-specific sizes.
     optimal_transient_alloc_size = std::max(optimal_transient_alloc_size, size);
   }
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 39e49bc347..7a7059e357 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -202,6 +202,7 @@ struct ParsedModelFlags {
   Arg<toco::IntList> input_shape;
   Arg<toco::StringMapList> rnn_states;
   Arg<toco::StringMapList> model_checks;
+  Arg<bool> change_concat_input_ranges = Arg<bool>(true);
   // Debugging output options.
   // TODO(benoitjacob): these shouldn't be ModelFlags.
   Arg<string> graphviz_first_array;
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 5d51431005..4a77196aab 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -37,6 +37,7 @@ limitations under the License.
 
 using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT16;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
 using tensorflow::DT_UINT8;
@@ -1868,6 +1869,9 @@ void AddPlaceholder(const string& name, ArrayDataType type,
     case ArrayDataType::kInt64:
       (*placeholder->mutable_attr())["dtype"].set_type(DT_INT64);
       break;
+    case ArrayDataType::kInt16:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT16);
+      break;
     default:
       LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 23c9e3246b..437e30a918 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -95,30 +95,37 @@ bool HardcodeMinMaxForConcatenation(Model* model, Operator* op) {
   overall_minmax.min = overall_min;
   overall_minmax.max = overall_max;
   bool changed = false;
-  for (const auto& input : op->inputs) {
-    auto& array = model->GetArray(input);
-    if (!array.minmax) {
-      changed = true;
-    } else if (!(overall_minmax == array.GetMinMax())) {
-      changed = true;
-      LOG(WARNING)
-          << "Tweaking the MinMax of array " << input << ", which is "
-          << "an input to " << LogName(*op) << ", because we want all inputs "
-          << "and outputs of a Concatenation operator to have the same MinMax "
-          << "so that it can be implemented as a pure byte-copy, no "
-             "arithmetic.";
+  if (model->flags.change_concat_input_ranges()) {
+    for (const auto& input : op->inputs) {
+      auto& array = model->GetArray(input);
+      if (!array.minmax) {
+        changed = true;
+      } else if (!(overall_minmax == array.GetMinMax())) {
+        changed = true;
+        LOG(WARNING)
+            << "Tweaking the MinMax of array " << input << ", which is "
+            << "an input to " << LogName(*op) << ", because we want all inputs "
+            << "and outputs of a Concatenation operator to have the same "
+            << "MinMax so that it can be implemented as a pure byte-copy, no "
+               "arithmetic.";
+      }
+      array.GetOrCreateMinMax() = overall_minmax;
     }
-    array.GetOrCreateMinMax() = overall_minmax;
   }
   if (!output.minmax) {
     changed = true;
   } else if (!(overall_minmax == output.GetMinMax())) {
-    changed = true;
-    LOG(WARNING)
-        << "Tweaking the MinMax of the output array of " << LogName(*op)
-        << ", because we want all inputs "
-        << "and outputs of a Concatenation operator to have the same MinMax "
-        << "so that it can be implemented as a pure byte-copy, no arithmetic.";
+    if (model->flags.change_concat_input_ranges()) {
+      changed = true;
+      LOG(WARNING)
+          << "Tweaking the MinMax of the output array of " << LogName(*op)
+          << ", because we want all inputs "
+          << "and outputs of a Concatenation operator to have the same MinMax "
+          << "so that it can be implemented as a pure byte-copy, no "
+          << "arithmetic.";
+    } else {
+      return false;
+    }
   }
   output.GetOrCreateMinMax() = overall_minmax;
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
index 935da9f966..183b3d3f2e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -78,15 +78,21 @@ bool AddDequantizeOperatorToInput(const string& input_name, const Operator* op,
   image_input_op->outputs = {dequantized_input_name};
   model->operators.emplace(model->operators.begin(), image_input_op);
 
-  CHECK(input_array.final_data_type == ArrayDataType::kUint8);
-  input_array.data_type = ArrayDataType::kUint8;
   dequantized_input_array.data_type = ArrayDataType::kFloat;
   const auto& input_minmax = input_array.GetMinMax();
   auto& dequantized_input_minmax = dequantized_input_array.GetOrCreateMinMax();
   dequantized_input_minmax = input_minmax;
   auto& input_qparams = input_array.GetOrCreateQuantizationParams();
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
-                                                         &input_qparams);
+  input_array.data_type = input_array.final_data_type;
+  if (input_array.data_type == ArrayDataType::kUint8) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
+                                                           &input_qparams);
+  } else if (input_array.data_type == ArrayDataType::kInt16) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(input_minmax,
+                                                           &input_qparams);
+  } else {
+    LOG(FATAL) << "unhandled data type";
+  }
 
   transformation->AddMessageF(
       "Created %s"
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 7784558b22..5b1268f9a9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -431,7 +431,8 @@ bool ChooseQuantizationForOperatorOutput(
       (op.type == OperatorType::kSpaceToDepth) ||
       (op.type == OperatorType::kTensorFlowReshape) ||
       (op.type == OperatorType::kTensorFlowSplit) ||
-      (op.type == OperatorType::kConcatenation)) {
+      (op.type == OperatorType::kConcatenation &&
+       model->flags.change_concat_input_ranges())) {
     int data_input_index = 0;
     if (op.type == OperatorType::kTensorFlowSplit) {
       data_input_index = 1;
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 245eb52444..7bbeab7c9d 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -165,6 +165,11 @@ bool ParseModelFlagsFromCommandLineFlags(
            "Path to an optional file containing a serialized ModelFlags proto. "
            "Options specified on the command line will override the values in "
            "the proto."),
+      Flag("change_concat_input_ranges",
+           parsed_flags.change_concat_input_ranges.bind(),
+           parsed_flags.change_concat_input_ranges.default_value(),
+           "Boolean to change the behavior of min/max ranges for inputs and"
+           " output of the concat operators."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -399,12 +404,15 @@ void ReadModelFlagsFromCommandLineFlags(
       parsed_model_flags.allow_nonascii_arrays.value());
   model_flags->set_allow_nonexistent_arrays(
       parsed_model_flags.allow_nonexistent_arrays.value());
+  model_flags->set_change_concat_input_ranges(
+      parsed_model_flags.change_concat_input_ranges.value());
 
   if (parsed_model_flags.arrays_extra_info_file.specified()) {
     string arrays_extra_info_file_contents;
-    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
-                            &arrays_extra_info_file_contents,
-                            port::file::Defaults());
+    CHECK(port::file::GetContents(
+              parsed_model_flags.arrays_extra_info_file.value(),
+              &arrays_extra_info_file_contents, port::file::Defaults())
+              .ok());
     ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
                                       model_flags->mutable_arrays_extra_info());
   }
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 835dea49eb..d23e80c464 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -128,7 +128,7 @@ message ArraysExtraInfo {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 19.
+// Next ID to USE: 20.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -175,4 +175,8 @@ message ModelFlags {
   // If set, this ArraysExtraInfo allows to pass extra information about arrays
   // not specified in the input model file, such as extra MinMax information.
   optional ArraysExtraInfo arrays_extra_info = 18;
+
+  // When set to false, toco will not change the input ranges and the output
+  // ranges of concat operator to the overlap of all input ranges.
+  optional bool change_concat_input_ranges = 19 [default = true];
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 76e9a27aef..96c5ebd64f 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -130,20 +130,26 @@ bool SupportsPreallocatedWorkspace(FileFormat format) {
 }
 
 bool IsRealValued(toco::ArrayDataType type) {
+  // TODO(benoitjacob) - this is hardcoding that uint8 and int16 are only used
+  // for quantized real-number values, and no other integer type is ever used
+  // for that. This is dirty, should be resolved as part of a more general push
+  // to more explicitly distinguish between true-integers and
+  // integers used as quantized values representing real numbers.
   return static_cast<bool>(type == toco::ArrayDataType::kFloat ||
-                           type == toco::ArrayDataType::kUint8);
+                           type == toco::ArrayDataType::kUint8 ||
+                           type == toco::ArrayDataType::kInt16);
 }
 
 void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   ArrayDataType type;
-  if (toco_flags.has_inference_input_type()) {
+  if (!SupportsQuantization(output_format)) {
+    // Data type is implicitly float for non-quantized formats
+    type = ArrayDataType::kFloat;
+  } else if (toco_flags.has_inference_input_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_input_type());
   } else if (toco_flags.has_inference_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_type());
-  } else if (!SupportsQuantization(output_format)) {
-    // Data type is implicitly float for non-quantized formats
-    type = ArrayDataType::kFloat;
   } else {
     // Nothing to do. Data types stay as-is.
     return;
@@ -198,11 +204,6 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 }
 
 void Transform(const TocoFlags& toco_flags, Model* model) {
-  // Clean up after import.
-  SetFinalDataTypeOnInputs(toco_flags, model);
-  UseArraysExtraInfo(model);
-  FinishBuildingRNNStates(model);
-
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
@@ -215,6 +216,11 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         << "Quantized inference is not allowed with float inputs.";
   }
 
+  // Clean up after import.
+  SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model, quantize_output);
+  FinishBuildingRNNStates(model);
+
   // Remove unused ops before performing any other optimizations. This is to
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 56fa8f4b69..b72f5fa2a7 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1378,12 +1378,22 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     const float mean_value = input_array_proto.mean_value();
     const float std_value = input_array_proto.std_value();
     MinMax input_minmax;
-    input_minmax.min = (0.f - mean_value) / std_value;
-    input_minmax.max = (255.f - mean_value) / std_value;
+    float qmin = 0, qmax = 255;
+    if (input_array.data_type == ArrayDataType::kInt16) {
+      qmin = -32768;
+      qmax = 32767;
+    }
+    input_minmax.min = (qmin - mean_value) / std_value;
+    input_minmax.max = (qmax - mean_value) / std_value;
     if (input_array.minmax) {
       if (input_array_proto.has_mean_value() ||
           input_array_proto.has_std_value()) {
-        CHECK(input_minmax == *input_array.minmax)
+        const double width = input_minmax.max - input_minmax.min;
+        const double kMinMaxAllowedDiff = 1e-6 * width;
+        CHECK(std::abs(input_minmax.min - input_array.minmax->min) <
+                  kMinMaxAllowedDiff &&
+              std::abs(input_minmax.max - input_array.minmax->max) <
+                  kMinMaxAllowedDiff)
             << input_minmax.min << ", " << input_minmax.max
             << " != " << input_array.minmax->min << ", "
             << input_array.minmax->max;
@@ -1403,7 +1413,8 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       CHECK(input_array.shape().dims_size());
     }
   }
-
+  model->flags.set_change_concat_input_ranges(
+      model_flags.change_concat_input_ranges());
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
   model->flags.set_allow_nonexistent_arrays(
       model_flags.allow_nonexistent_arrays());
@@ -2000,7 +2011,7 @@ void FinishBuildingRNNStates(Model* model) {
   }
 }
 
-void UseArraysExtraInfo(Model* model) {
+void UseArraysExtraInfo(Model* model, bool quantize_output) {
   for (const auto& entry : model->flags.arrays_extra_info().entries()) {
     if (!model->HasArray(entry.name())) {
       continue;
@@ -2012,7 +2023,7 @@ void UseArraysExtraInfo(Model* model) {
       minmax.min = entry.min();
       minmax.max = entry.max();
     }
-    if (entry.has_data_type()) {
+    if (entry.has_data_type() && quantize_output) {
       array.final_data_type =
           ConvertIODataTypeToArrayDataType(entry.data_type());
     }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 259ee7fbd0..dfd81173c3 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -285,7 +285,7 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 // already quantized, then case (a) should hold.
 void FinishBuildingRNNStates(Model* model);
 
-void UseArraysExtraInfo(Model* model);
+void UseArraysExtraInfo(Model* model, bool quantize_output);
 
 }  // namespace toco
 
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 63fdd91d36..c7d85862f6 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -842,12 +842,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[6.450831e-04, 4.697885e-04], [9.862894e-05, 7.212213e-04],
-           [4.401947e-04, 9.143004e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[4.621217e-04, 3.365449e-04], [7.438179e-05, 5.439147e-04],
-           [3.347936e-04, 6.953785e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
diff --git a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
index 15a415df30..eac34afc4a 100644
--- a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
+++ b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
@@ -52,6 +52,7 @@ class FakeSummaryWriter(object):
     self._added_graphs = []
     self._added_meta_graphs = []
     self._added_session_logs = []
+    self._added_run_metadata = {}
 
   @property
   def summaries(self):
@@ -127,6 +128,11 @@ class FakeSummaryWriter(object):
     # pylint: disable=unused-argument
     self._added_session_logs.append(session_log)
 
+  def add_run_metadata(self, run_metadata, tag, global_step=None):
+    if (global_step is not None) and (global_step < 0):
+      raise ValueError('Invalid global_step %s.' % global_step)
+    self._added_run_metadata[tag] = run_metadata
+
   def flush(self):
     pass
 
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index f2003e04dd..6b198dbc16 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -64,9 +64,11 @@ Status ValidateHostPortPair(const string& host_port) {
   return Status::OK();
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms,
-                        const string& repository_root, const string& session_id,
-                        const ProfileOptions& opts) {
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+             const string& repository_root, const string& session_id,
+             const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
@@ -94,7 +96,31 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
           channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-  return response;
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
+        logdir, session_id, "", response, &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  return response.encoded_trace().empty();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+bool NewSession(const string& service_addr,
+                const std::vector<tensorflow::string>& hostnames,
+                int duration_ms, const string& repository_root,
+                const string& session_id, const ProfileOptions& opts) {
+  return true;
 }
 
 }  // namespace
@@ -104,12 +130,16 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
+  tensorflow::string FLAGS_workers_list;
   int FLAGS_duration_ms = 2000;
   int FLAGS_num_tracing_attempts = 3;
   bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
+      tensorflow::Flag("workers_list", &FLAGS_workers_list,
+                       "The list of worker TPUs that we are about to profile "
+                       "in the current session."),
       tensorflow::Flag("logdir", &FLAGS_logdir,
                        "Path of TensorBoard log directory e.g. /tmp/tb_log, "
                        "gs://tb_bucket"),
@@ -153,18 +183,30 @@ int main(int argc, char** argv) {
   constexpr char kProfilePluginDirectory[] = "plugins/profile/";
   tensorflow::string repository_root =
       ::tensorflow::io::JoinPath(FLAGS_logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(FLAGS_workers_list, ",");
+
+  bool empty_trace = false;
   while (true) {
     std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
               << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms,
-                                        repository_root, session_id, opts);
-    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    if (hostnames.empty()) {
+      empty_trace = tensorflow::tpu::Profile(FLAGS_service_addr, FLAGS_logdir,
+                                             duration_ms, repository_root,
+                                             session_id, opts);
+    } else {
+      tensorflow::string tpu_master = FLAGS_service_addr;
+      empty_trace =
+          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
+                                      repository_root, session_id, opts);
+    }
+    if (remaining_attempts <= 0 || !empty_trace) break;
     std::cout << "No trace event is collected. Automatically retrying."
               << std::endl
               << std::endl;
   }
 
-  if (response.encoded_trace().empty()) {
+  if (empty_trace) {
     std::cout << "No trace event is collected after "
               << FLAGS_num_tracing_attempts << " attempt(s). "
               << "Perhaps, you want to try again (with more attempts?)."
@@ -175,13 +217,5 @@ int main(int argc, char** argv) {
     return 0;
   }
 
-  TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-      FLAGS_logdir, session_id, response, &std::cout));
-  // Print this at the end so that it's not buried in irrelevant LOG messages.
-  std::cout
-      << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl
-      << "Set an appropriate duration (with --duration_ms) if you "
-         "don't see a full step in your trace or the captured trace is too "
-         "large."
-      << std::endl;
+  return 0;
 }
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ebd6185faa..ae508583f8 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -41,6 +41,7 @@ namespace {
 using ::tensorflow::io::JoinPath;
 using ::tensorflow::protobuf::util::JsonOptions;
 using ::tensorflow::protobuf::util::MessageToJsonString;
+using ::tensorflow::strings::StrCat;
 
 constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
 constexpr char kJsonOpProfileFileName[] = "op_profile.json";
@@ -61,28 +62,33 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
   return Status::OK();
 }
 
-Status DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace,
-                               std::ostream* os) {
+Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
+                               const string& encoded_trace, std::ostream* os) {
   string proto_path = JoinPath(run_dir, kProtoTraceFileName);
   TF_RETURN_IF_ERROR(
       WriteStringToFile(Env::Default(), proto_path, encoded_trace));
   LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
 
-  string json_path = JoinPath(run_dir, kJsonTraceFileName);
+  string json_path = JoinPath(run_dir, StrCat(host_prefix, kJsonTraceFileName));
   Trace trace;
   trace.ParseFromString(encoded_trace);
-  *os << "Trace contains " << trace.trace_events_size() << " events."
-      << std::endl;
+  if (os) {
+    *os << "Trace contains " << trace.trace_events_size() << " events."
+        << std::endl;
+  }
   TF_RETURN_IF_ERROR(
       WriteGzippedDataToFile(json_path, TraceEventsToJson(trace)));
-  *os << "Dumped JSON trace data to " << json_path << std::endl;
+  if (os) {
+    *os << "Dumped JSON trace data to " << json_path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
+                                   const string& host_prefix,
                                    const tpu::op_profile::Profile& profile,
                                    std::ostream* os) {
-  string path = JoinPath(run_dir, kJsonOpProfileFileName);
+  string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
   JsonOptions options;
   options.always_print_primitive_fields = true;
@@ -93,49 +99,20 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
         string(status.error_message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
-  *os << "Dumped json op profile data to " << path << std::endl;
+  if (os) {
+    *os << "Dumped json op profile data to " << path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
+                                  const string& host_prefix,
                                   const tensorflow::ProfileToolData& tool,
                                   std::ostream* os) {
-  string path = JoinPath(run_dir, tool.name());
+  string path = JoinPath(run_dir, StrCat(host_prefix, tool.name()));
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, tool.data()));
-  *os << "Dumped tool data for " << tool.name() << " to " << path << std::endl;
-  return Status::OK();
-}
-
-Status DumpGraphEvents(const string& logdir, const string& run,
-                       const ProfileResponse& response, std::ostream* os) {
-  int num_graphs = response.computation_graph_size();
-  if (response.computation_graph_size() == 0) return Status::OK();
-  // The server might generates multiple graphs for one program; we simply
-  // pick the first one.
-  if (num_graphs > 1) {
-    *os << num_graphs
-        << " TPU program variants observed over the profiling period. "
-        << "One computation graph will be chosen arbitrarily." << std::endl;
-  }
-  // The graph plugin expects the graph in <logdir>/<run>/<event.file>.
-  string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run));
-  TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir));
-  EventsWriter event_writer(JoinPath(run_dir, "events"));
-  Event event;
-  // Add the computation graph.
-  event.set_graph_def(response.computation_graph(0).SerializeAsString());
-  event_writer.WriteEvent(event);
-  *os << "Wrote a HLO graph to " << event_writer.FileName() << std::endl;
-
-  if (response.has_hlo_metadata()) {
-    tensorflow::TaggedRunMetadata tagged_run_metadata;
-    tagged_run_metadata.set_tag(run);
-    tagged_run_metadata.set_run_metadata(
-        response.hlo_metadata().SerializeAsString());
-    tensorflow::Event meta_event;
-    *meta_event.mutable_tagged_run_metadata() = tagged_run_metadata;
-    event_writer.WriteEvent(meta_event);
-    *os << "Wrote HLO ops run metadata to " << event_writer.FileName()
+  if (os) {
+    *os << "Dumped tool data for " << tool.name() << " to " << path
         << std::endl;
   }
   return Status::OK();
@@ -144,27 +121,29 @@ Status DumpGraphEvents(const string& logdir, const string& run,
 }  // namespace
 
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os) {
   // Dumps profile data to <logdir>/plugins/profile/<run>/.
+  string host_prefix = host.empty() ? "" : StrCat(host, ".");
   string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
   if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
-    TF_RETURN_IF_ERROR(
-        DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
+    TF_RETURN_IF_ERROR(DumpTraceToLogDirectory(profile_run_dir, host_prefix,
+                                               response.encoded_trace(), os));
   }
   if (response.has_op_profile() &&
       (response.op_profile().has_by_program_structure() ||
        response.op_profile().has_by_category())) {
-    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
+    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, host_prefix,
                                                    response.op_profile(), os));
   }
   for (const auto& tool_data : response.tool_data()) {
-    TF_RETURN_IF_ERROR(
-        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
+    TF_RETURN_IF_ERROR(DumpToolDataToLogDirectory(profile_run_dir, host_prefix,
+                                                  tool_data, os));
   }
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 29ef977bac..ecf21b1de2 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -32,6 +32,7 @@ namespace tpu {
 // Note: this function creates a directory even when all fields in
 // ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index cddc3cd1b4..8505c4bc69 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -21,6 +21,17 @@ message ProfileOptions {
   // next-field: 2
 }
 
+message ToolRequestOptions {
+  // Required formats for the tool, it should be one of "json", "proto", "raw"
+  // etc. If not specified (backward compatible), use default format, i.e. most
+  // tools use json format.
+  string output_formats = 2;
+
+  // Whether save the result directly to repository or pass it back to caller.
+  // Default to false for backward compatibilities.
+  bool save_to_repo = 3;
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -30,9 +41,12 @@ message ProfileRequest {
   // events.
   uint64 max_events = 2;
 
-  // required profiling tools name such as "input_pipeline_analyzer" etc
+  // Required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Specifies the requirement for each tools.
+  map<string, ToolRequestOptions> tool_options = 8;
+
   // Optional profiling options that control how a TF session will be profiled.
   ProfileOptions opts = 4;
 
@@ -43,10 +57,14 @@ message ProfileRequest {
   // The user provided profile session identifier.
   string session_id = 6;
 
+  // The hostname of system where the profile should happen.
+  // We use it as identifier in part of our output filename.
+  string host_name = 7;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 7
+  // next-field: 9
 }
 
 message ProfileToolData {
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 01da54fcb3..64adf35c5e 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -66,8 +66,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     std::unique_ptr<ClientGraph> cg,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
-                    GraphExecutionState* execution_state, bool is_partial,
-                    WorkerCacheInterface* worker_cache, bool should_deregister)
+                    bool is_partial, WorkerCacheInterface* worker_cache,
+                    bool should_deregister)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
@@ -80,8 +80,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
-    // Initialize a name to node map for testing that fetches are reachable.
-    for (Node* n : execution_state->full_graph()->nodes()) {
+    // Initialize a name to node map for processing device stats.
+    for (Node* n : client_graph_->graph.nodes()) {
       name_to_node_.insert({n->name(), n});
     }
   }
@@ -829,8 +829,6 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
 // TODO(suharsh,mrry): Build a map from fetch target to set of feeds it depends
 // on once at setup time to prevent us from computing the dependencies
 // everytime.
-// TODO(suharshs,mrry): Consider removing the need for execution_state to reduce
-// contention.
 Status MasterSession::ReffedClientGraph::CheckFetches(
     const RunStepRequestWrapper& req, const RunState* run_state,
     GraphExecutionState* execution_state) {
@@ -840,8 +838,8 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
     // Skip if already fed.
     if (input.second) continue;
     TensorId id(ParseTensorName(input.first));
-    const auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Feed ", input.first, ": not found");
     }
     pending_feeds.insert(id);
@@ -856,11 +854,11 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
   for (size_t i = 0; i < req.num_fetches(); ++i) {
     const string& fetch = req.fetch_name(i);
     const TensorId id(ParseTensorName(fetch));
-    auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Fetch ", fetch, ": not found");
     }
-    stack.push_back(it->second);
+    stack.push_back(n);
   }
 
   // Any tensor needed for fetches can't be in pending_feeds.
@@ -1293,8 +1291,8 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       WorkerCacheInterface* worker_cache = get_worker_cache();
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
-          stats_publisher_factory_, execution_state_.get(), is_partial,
-          worker_cache, !should_delete_worker_sessions_);
+          stats_publisher_factory_, is_partial, worker_cache,
+          !should_delete_worker_sessions_);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index b4d18d8607..63745e8ebd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -47,11 +47,11 @@ namespace tensorflow {
 
 class GrpcMasterService : public AsyncServiceInterface {
  public:
-  GrpcMasterService(Master* master, int64 default_timeout_in_ms,
+  GrpcMasterService(Master* master, const ConfigProto& default_session_config,
                     ::grpc::ServerBuilder* builder)
       : master_impl_(master),
-        default_timeout_in_ms_(default_timeout_in_ms),
-        is_shutdown_(false) {
+        is_shutdown_(false),
+        default_session_config_(default_session_config) {
     builder->RegisterService(&master_service_);
     cq_ = builder->AddCompletionQueue();
   }
@@ -129,12 +129,12 @@ class GrpcMasterService : public AsyncServiceInterface {
 
  private:
   Master* master_impl_ = nullptr;  // Not owned.
-  const int64 default_timeout_in_ms_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   grpc::MasterService::AsyncService master_service_;
 
   mutex mu_;
   bool is_shutdown_ GUARDED_BY(mu_);
+  const ConfigProto default_session_config_;
   ::grpc::Alarm* shutdown_alarm_ = nullptr;
 
   template <class RequestMessage, class ResponseMessage>
@@ -144,9 +144,13 @@ class GrpcMasterService : public AsyncServiceInterface {
   // RPC handler for creating a session.
   void CreateSessionHandler(
       MasterCall<CreateSessionRequest, CreateSessionResponse>* call) {
-    master_impl_->CreateSession(&call->request, &call->response,
-                                [call](const Status& status) {
+    CreateSessionRequest* rewritten_req = new CreateSessionRequest;
+    rewritten_req->mutable_config()->MergeFrom(default_session_config_);
+    rewritten_req->MergeFrom(call->request);
+    master_impl_->CreateSession(rewritten_req, &call->response,
+                                [call, rewritten_req](const Status& status) {
                                   call->SendResponse(ToGrpcStatus(status));
+                                  delete rewritten_req;
                                 });
     ENQUEUE_REQUEST(CreateSession, true);
   }
@@ -178,7 +182,7 @@ class GrpcMasterService : public AsyncServiceInterface {
     if (call->request.options().timeout_in_ms() > 0) {
       call_opts->SetTimeout(call->request.options().timeout_in_ms());
     } else {
-      call_opts->SetTimeout(default_timeout_in_ms_);
+      call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
     }
     RunStepRequestWrapper* wrapped_request =
         new ProtoRunStepRequest(&call->request);
@@ -249,10 +253,10 @@ class GrpcMasterService : public AsyncServiceInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
 };
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder) {
-  return new GrpcMasterService(master, default_timeout_in_ms, builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder) {
+  return new GrpcMasterService(master, default_session_config, builder);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 473604f257..f0fe5b0c4e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/master.pb.h"
 
 namespace grpc {
 class ServerBuilder;
@@ -28,9 +29,9 @@ namespace tensorflow {
 class AsyncServiceInterface;
 class Master;
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index a6f4be3eaf..be19103582 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -183,8 +183,7 @@ Status GrpcServer::Init(
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   master_impl_ = CreateMaster(&master_env_);
-  master_service_ = NewGrpcMasterService(
-      master_impl_.get(), config.operation_timeout_in_ms(), &builder);
+  master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
   worker_impl_ =
       worker_func ? worker_func(&worker_env_) : NewGrpcWorker(&worker_env_);
   worker_service_ =
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 14e46ecdd9..79735e6cc2 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -459,11 +459,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   Costs costs;
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
-  if (compute_memory_overlap_) {
-    costs.execution_time = std::max(compute_cost, memory_cost);
-  } else {
-    costs.execution_time = compute_cost + memory_cost;
-  }
+  CombineCostsAndUpdateExecutionTime(&costs);
   return costs;
 }
 
@@ -1375,5 +1371,14 @@ Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
   return costs;
 }
 
+void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
+    Costs* costs) const {
+  if (compute_memory_overlap_) {
+    costs->execution_time = std::max(costs->compute_time, costs->memory_time);
+  } else {
+    costs->execution_time = costs->compute_time + costs->memory_time;
+  }
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index fcbecbb6dc..7080264698 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -173,6 +173,11 @@ class OpLevelCostEstimator {
       const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
+  // This method calculates the execution time depending on whether IO can
+  // overlap with computation. It assumes the memory and the compute times have
+  // already been calculated.
+  void CombineCostsAndUpdateExecutionTime(Costs* costs) const;
+
  protected:
   std::map<string, int> elementwise_ops_;
   typedef std::function<Costs(const OpContext& op_context)> CostImpl;
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 59a5695af0..7bf264ba30 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -237,17 +237,16 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
     return false;
   }
 
-  // Now, src_shape and dst_shape have at most one dimension with unknown
-  // sizes, and are compatible. Therefore, the reshape is a no-op when
-  //
-  // 1. at least one of them is fully-defined, or
-  // 2. both are partially defined and the -1 appears on the same dimension,
-  //    i.e., IsIdenticalTo returns true.
-  if (src_num_unknown_dim_sizes == 1 && dst_num_unknown_dim_sizes == 1) {
-    return dst_shape.IsIdenticalTo(src_shape);
+  // If dst_num_unknown_dim_sizes != src_num_unknown_dim_sizes we would weaken
+  // shape inference in subsequent passes if we removed this reshape.
+  if (src_num_unknown_dim_sizes != dst_num_unknown_dim_sizes) {
+    return false;
   }
 
-  return true;
+  // Remove the reshape if both are fully defined or partially defined and the
+  // unknown or symbolic shape appears on the same dimension, i.e., if
+  // IsIdenticalTo returns true.
+  return dst_shape.IsIdenticalTo(src_shape);
 }
 
 NodeDef* GetTailOfValuePreservingChain(
@@ -727,7 +726,9 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
-          new_add_node->set_input(i, unique_factors[i]);
+          const string& unique_factor_i = unique_factors[i];
+          new_add_node->set_input(i, unique_factor_i);
+          ctx_.node_map->AddOutput(unique_factor_i, new_add_node->name());
         }
 
         // Add control deps on add node
@@ -859,13 +860,18 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
     NodeDef* node_perm;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
+    if (!IsConstant(*node_perm)) {
+      return Status::OK();
+    }
     std::vector<int64> node_perm_values;
     TF_RETURN_IF_ERROR(GetPermutation(*node_perm, &node_perm_values));
-
     if (input->op() == node->op()) {
       // Remove pairs of transposes that cancel each other.
       NodeDef* input_perm;
       TF_RETURN_IF_ERROR(GetInputNode(input->input(1), &input_perm));
+      if (!IsConstant(*input_perm)) {
+        return Status::OK();
+      }
       std::vector<int64> input_perm_values;
       TF_RETURN_IF_ERROR(GetPermutation(*input_perm, &input_perm_values));
       if (AreInversePermutations(node_perm_values, input_perm_values)) {
@@ -1337,9 +1343,9 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     //      ^      |
     //      |      |
     //    input ---+
-    NodeDef* reshape = node_map_->GetNode(node->name());
+    NodeDef* reshape = const_cast<NodeDef*>(node);
     int output_pos = 0;
-    string input_node_name = ParseNodeName(node->input(0), &output_pos);
+    string input_node_name = ParseNodeName(reshape->input(0), &output_pos);
     const NodeDef* input = node_map_->GetNode(input_node_name);
     if (input->op() == "Reshape" && !HasControlInputs(*input)) {
       reshape->set_input(0, input->input(0));
@@ -1653,7 +1659,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   return "";
 }
 
-Status ArithmeticOptimizer::SimplifyArithmeticOps() {
+Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   SetVector<NodeDef*> nodes_to_simplify;
   nodes_to_simplify.Reserve(optimized_graph_->node_size());
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
@@ -1668,11 +1674,11 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
   const auto stop = [](const string& result) { return !result.empty(); };
   GraphOptimizerStagePipeline<string> pipeline(stop);
 
-  if (options_.combine_add_to_addn)
+  if (options_.combine_add_to_addn && can_use_shapes)
     pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
-  if (options_.hoist_common_factor_out_of_aggregation)
+  if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
     pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
-  if (options_.remove_identity_transpose)
+  if (options_.remove_identity_transpose && can_use_shapes)
     pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
   if (options_.remove_redundant_bitcast)
     pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
@@ -1759,10 +1765,14 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
 
   // Shapes are only needed in aggressive mode.
   graph_properties_.reset(new GraphProperties(item));
-  TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
+  const Status status = graph_properties_->InferStatically(false);
+  const bool can_use_shapes = status.ok();
+  if (!can_use_shapes) {
+    VLOG(1) << "Shape inference failed." << status.error_message();
+  }
 
   // Perform the optimizations.
-  TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
+  TF_RETURN_IF_ERROR(SimplifyArithmeticOps(can_use_shapes));
 
   optimized_graph->Swap(optimized_graph_);
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 7e81ed0a1f..39b89dedba 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -105,7 +105,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
 
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
-  Status SimplifyArithmeticOps();
+  Status SimplifyArithmeticOps(bool can_use_shapes);
   // Tries to simplify the expression that roots at `node` and replaces the uses
   // of `node` to the simplified expression. Returns the name of the simplified
   // tensor (e.g. "split:1") or an emtpy string if no simplification is
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index d941a0b3f9..b2a1ce6ab6 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -298,7 +298,8 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
   for (int node_idx = 0; node_idx < node_count; ++node_idx) {
     NodeDef* node = graph_->mutable_node(node_idx);
     const string op = node->op();
-    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
+    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN" &&
+        op != "TensorArraySizeV3") {
       continue;
     }
 
@@ -349,6 +350,36 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
       continue;
     }
 
+    if (op == "TensorArraySizeV3") {
+      const NodeDef* array = node_map_->GetNode(node->input(0));
+      if (array->attr().count("dynamic_size") != 0 &&
+          array->attr().at("dynamic_size").b()) {
+        continue;
+      }
+      const NodeDef* array_size = node_map_->GetNode(array->input(0));
+      if (IsReallyConstant(*array_size)) {
+        // Don't materialize 0 sizes to avoid triggering incorrect static
+        // checks. A 0 sized array that can't grow isn't useful anyway.
+        const TensorProto& raw_val = array_size->attr().at("value").tensor();
+        if (raw_val.dtype() != DT_INT32) {
+          continue;
+        }
+        Tensor value(raw_val.dtype(), raw_val.tensor_shape());
+        if (!value.FromProto(raw_val)) {
+          continue;
+        }
+        if (value.flat<int32>()(0) == 0) {
+          continue;
+        }
+        node->set_op("Const");
+        *node->mutable_attr() = array_size->attr();
+        node->set_input(0, AsControlDependency(NodeName(node->input(0))));
+        node->set_input(1, AddControlDependency(NodeName(node->input(1)),
+                                                graph_, node_map_.get()));
+      }
+      continue;
+    }
+
     // Handle ShapeN materialization case.
     // It's possible that not all input tensors have known shapes.
     CHECK_EQ(op, "ShapeN");
@@ -552,7 +583,6 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
 
   const DataType type = node.attr().at("T").type();
   NodeDef* out[2];
-  bool created_const = false;
   for (int j = 0; j < 2; ++j) {
     int reduction_indices = reduce_dims[j].size();
     Tensor value(type, TensorShape({reduction_indices}));
@@ -576,20 +606,17 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
           AddControlDependency(node.name(), graph_, node_map_.get());
       *out[j]->add_input() = ctrl_dep;
       node_map_->AddOutput(NodeName(ctrl_dep), const_name);
-      created_const = true;
     }
   }
 
-  if (created_const) {
-    const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
-    for (NodeDef* output : outputs) {
-      for (int k = 0; k < output->input_size(); ++k) {
-        int port;
-        string node_name = ParseNodeName(output->input(k), &port);
-        if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
-          *output->mutable_input(k) = out[port]->name();
-          node_map_->UpdateInput(output->name(), node_name, out[port]->name());
-        }
+  const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
+  for (NodeDef* output : outputs) {
+    for (int k = 0; k < output->input_size(); ++k) {
+      int port;
+      string node_name = ParseNodeName(output->input(k), &port);
+      if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
+        *output->mutable_input(k) = out[port]->name();
+        node_map_->UpdateInput(output->name(), node_name, out[port]->name());
       }
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 71ee81dfde..08c92687e3 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2402,6 +2402,48 @@ TEST_F(ConstantFoldingTest, Enter) {
   }
 }
 
+TEST_F(ConstantFoldingTest, TensorArraySize) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output size = ops::Const(scope.WithOpName("size"), 5, TensorShape({}));
+  auto dynamic_array =
+      ops::TensorArray(scope.WithOpName("dynamic"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(true));
+  auto static_array =
+      ops::TensorArray(scope.WithOpName("static"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(false));
+  auto dynamic_sz = ops::TensorArraySize(
+      scope.WithOpName("dynamic_sz"), dynamic_array.handle, dynamic_array.flow);
+  auto static_sz = ops::TensorArraySize(scope.WithOpName("static_sz"),
+                                        static_array.handle, static_array.flow);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  auto tensors_expected =
+      EvaluateNodes(item.graph, {"dynamic_sz", "static_sz"});
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ("dynamic_sz", output.node(3).name());
+  EXPECT_EQ("TensorArraySizeV3", output.node(3).op());
+  EXPECT_EQ("static_sz", output.node(4).name());
+  EXPECT_EQ("Const", output.node(4).op());
+
+  auto tensors_actual = EvaluateNodes(output, {"dynamic_sz", "static_sz"});
+  EXPECT_EQ(2, tensors_expected.size());
+  EXPECT_EQ(2, tensors_actual.size());
+  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_actual[0]);
+  test::ExpectTensorEqual<int32>(tensors_expected[1], tensors_actual[1]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index f1da469a6c..343c89a9da 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -36,8 +36,11 @@ namespace {
 
 class FunctionInliningContext {
  public:
-  explicit FunctionInliningContext(const GrapplerItem& item)
-      : library_(&item.graph.library()), functions_(InliningCandidates(item)) {}
+  explicit FunctionInliningContext(const GrapplerItem& item,
+                                   RewriterConfig::Toggle opt_level)
+      : library_(&item.graph.library()),
+        opt_level_(opt_level),
+        functions_(InliningCandidates(item)) {}
 
   const FunctionDefLibrary& Library() const { return *library_; }
 
@@ -59,13 +62,9 @@ class FunctionInliningContext {
     std::unordered_map<string, const FunctionDef*> functions;
     for (const FunctionDef& func : item.graph.library().function()) {
       // Don't inline functions marked as noinline
-      if (func.attr().count("_noinline") != 0) {
-        continue;
-      }
-      // Don't touch anything marked XLA to prevent XLA failures further down
-      // the road.
-      if (func.attr().count("_XlaCompile") > 0 &&
-          func.attr().at("_XlaCompile").b()) {
+      if (func.attr().count("_noinline") != 0 &&
+          func.attr().at("_noinline").b() &&
+          opt_level_ != RewriterConfig::AGGRESSIVE) {
         continue;
       }
       // Can't create IdentityN nodes with no input or output: skip these
@@ -80,6 +79,7 @@ class FunctionInliningContext {
   }
 
   const FunctionDefLibrary* library_;
+  RewriterConfig::Toggle opt_level_;
   std::unordered_map<string, const FunctionDef*> functions_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
@@ -206,6 +206,10 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
       TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
                                         ctx, optimized_graph));
     } else {
+      // Annotate the node with the function attributes.
+      for (const auto& attr : func.attr()) {
+        func_body_node.mutable_attr()->insert(attr);
+      }
       // Move the node to the main graph
       optimized_graph->add_node()->Swap(&func_body_node);
     }
@@ -367,7 +371,7 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  FunctionInliningContext function_inlining_ctx(item);
+  FunctionInliningContext function_inlining_ctx(item, opt_level_);
 
   // Nothing to do here.
   if (!function_inlining_ctx.HasInlinedFunctions()) {
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 41444e4673..b124efe01d 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,7 +26,7 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) {}
+  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
   ~FunctionOptimizer() override {}
 
   string name() const override { return "function_optimizer"; };
@@ -36,6 +36,9 @@ class FunctionOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
+
+ private:
+  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index c804d75756..fe26a56fc2 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -412,7 +412,7 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
       {mul_func, square_func});
 
   GraphDef output;
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
@@ -508,7 +508,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -550,7 +550,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -613,7 +613,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   // The optimizer should succeed but the graphs should be the same.
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
index 7044705ade..1ea57f7b4f 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -42,6 +42,10 @@ Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
 Status GetTensorProperties(const GraphOptimizerContext& ctx,
                            const string& tensor,
                            OpInfo::TensorProperties* properties) {
+  if (ctx.graph_properties == nullptr) {
+    return errors::InvalidArgument("Graph properties are unknown.");
+  }
+
   int port;
   string tensor_node_name = ParseNodeName(tensor, &port);
   if (port < 0) {
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index d2a2cdd13d..1857d8d655 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2265,6 +2265,7 @@ tf_cc_tests(
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -5905,6 +5906,7 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -6180,3 +6182,12 @@ cc_library(
         "@gemmlowp",
     ],
 )
+
+# Header-only version of cwise_lib for clients that want to use the cwise_ops
+# functionality in their own custom ops.
+cc_header_only_library(
+    name = "cwise_lib_hdrs",
+    deps = [
+        ":cwise_lib",
+    ],
+)
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index a35e1b0788..709082e799 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -242,7 +243,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input image must be 4-D"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "input image must be 4-D"))
       << s;
 }
 
@@ -255,7 +256,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
+      str_util::StrContains(s.ToString(), "box_index has incompatible shape"))
       << s;
 }
 
@@ -267,8 +268,8 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   AddInputFromArray<int32>(TensorShape({2}), {3, 3});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_index has values outside [0, batch_size)"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "box_index has values outside [0, batch_size)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index ba9686e94e..07dc786d9b 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -104,6 +104,7 @@ namespace {
 using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
+using perftools::gputools::dnn::AlgorithmConfig;
 using perftools::gputools::dnn::RnnDirectionMode;
 using perftools::gputools::dnn::RnnInputMode;
 using perftools::gputools::dnn::RnnMode;
@@ -544,9 +545,10 @@ class CudnnRNNKernelCommon : public OpKernel {
     auto* stream = context->op_device_context()->stream();
     // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
     // random number generator, therefore set state_allocator to nullptr.
+    const AlgorithmConfig algo_config;
     auto rnn_desc_s = stream->parent()->createRnnDescriptor(
         num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
+        rnn_mode(), ToDataType<T>::value, algo_config, dropout(), seed(),
         nullptr /* state_allocator */);
     if (!rnn_desc_s.ok()) {
       return FromExecutorStatus(rnn_desc_s);
@@ -891,22 +893,24 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
         CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
             new CudnnRNNPersistentSpaceAllocator(context);
         rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        const AlgorithmConfig algo_config;
         auto rnn_desc_s = executor->createRnnDescriptor(
             model_shapes.num_layers, model_shapes.num_units,
             model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+            rnn_mode(), data_type, algo_config, dropout(), seed(),
+            dropout_state_allocator);
         OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
         rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
       }
       launch_status =
           stream
-              ->ThenRnnForward(*rnn_state.rnn_desc, *input_desc, input_data,
-                               *hidden_state_desc, input_h_data,
-                               *hidden_state_desc, input_c_data, params_data,
-                               *output_desc, &output_data, *hidden_state_desc,
-                               &output_h_data, *hidden_state_desc,
-                               &output_c_data, is_training_,
-                               &reserve_space_allocator, &workspace_allocator)
+              ->ThenRnnForward(
+                  *rnn_state.rnn_desc, *input_desc, input_data,
+                  *hidden_state_desc, input_h_data, *hidden_state_desc,
+                  input_c_data, params_data, *output_desc, &output_data,
+                  *hidden_state_desc, &output_h_data, *hidden_state_desc,
+                  &output_c_data, is_training_, &reserve_space_allocator,
+                  &workspace_allocator, /* output_result_profile */ nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -1095,25 +1099,27 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
         CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
             new CudnnRNNPersistentSpaceAllocator(context);
         rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        const AlgorithmConfig algo_config;
         auto rnn_desc_s = executor->createRnnDescriptor(
             model_shapes.num_layers, model_shapes.num_units,
             model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+            rnn_mode(), data_type, algo_config, dropout(), seed(),
+            dropout_state_allocator);
         OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
         rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
       }
       launch_status =
           stream
-              ->ThenRnnBackward(*rnn_state.rnn_desc, *input_desc, input_data,
-                                *hidden_state_desc, input_h_data,
-                                *hidden_state_desc, input_c_data, params_data,
-                                *output_desc, output_data, *hidden_state_desc,
-                                output_h_data, *hidden_state_desc,
-                                output_c_data, output_backprop_data,
-                                output_h_backprop_data, output_c_backprop_data,
-                                &input_backprop_data, &input_h_backprop_data,
-                                &input_c_backprop_data, &params_backprop_data,
-                                &reserve_space_uint8, &workspace_allocator)
+              ->ThenRnnBackward(
+                  *rnn_state.rnn_desc, *input_desc, input_data,
+                  *hidden_state_desc, input_h_data, *hidden_state_desc,
+                  input_c_data, params_data, *output_desc, output_data,
+                  *hidden_state_desc, output_h_data, *hidden_state_desc,
+                  output_c_data, output_backprop_data, output_h_backprop_data,
+                  output_c_backprop_data, &input_backprop_data,
+                  &input_h_backprop_data, &input_c_backprop_data,
+                  &params_backprop_data, &reserve_space_uint8,
+                  &workspace_allocator, /* output_result_profile */ nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 912d04c153..2cafa44f37 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -41,9 +41,9 @@ enum FileFormat {
 // Classify the contents of a file based on starting bytes (the magic number).
 FileFormat ClassifyFileFormat(StringPiece data) {
   // The 4th byte of JPEG is '\xe0' or '\xe1', so check just the first three
-  if (data.starts_with("\xff\xd8\xff")) return kJpgFormat;
-  if (data.starts_with("\x89PNG\r\n\x1a\n")) return kPngFormat;
-  if (data.starts_with("\x47\x49\x46\x38")) return kGifFormat;
+  if (str_util::StartsWith(data, "\xff\xd8\xff")) return kJpgFormat;
+  if (str_util::StartsWith(data, "\x89PNG\r\n\x1a\n")) return kPngFormat;
+  if (str_util::StartsWith(data, "\x47\x49\x46\x38")) return kGifFormat;
   return kUnknownFormat;
 }
 
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
index 9a7ed0af21..17eb4e24b7 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -153,8 +154,8 @@ TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "partitions[2] = 99 is not in [0, 4)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
index 6775893ce6..7fa6e320f5 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -88,9 +89,9 @@ TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [5] does not start with "
-                            "indices[1].shape = [1,5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [5] does not start with indices[1].shape = [1,5]"))
       << s;
 }
 
@@ -103,9 +104,9 @@ TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [1,5] does not start with "
-                            "indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [1,5] does not start with indices[1].shape = [5]"))
       << s;
 }
 
@@ -119,9 +120,10 @@ TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) {
   AddInputFromArray<float>(TensorShape({4, 2}),
                            {10, 11, 60, 61, 20, 21, 30, 31});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Need data[0].shape[1:] = data[1].shape[1:], "
-                            "got data[0].shape = [3,1], data[1].shape = [4,2]"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Need data[0].shape[1:] = data[1].shape[1:], got "
+                            "data[0].shape = [3,1], data[1].shape = [4,2]"))
       << s;
 }
 
@@ -134,10 +136,9 @@ TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "data[1].shape = [4] does not start with indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [4] does not start with indices[1].shape = [5]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
index 3edcb34bca..0409cadb67 100644
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -171,7 +172,7 @@ TEST_F(GatherOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 67d9217b95..9387fb13bc 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -147,7 +148,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -160,7 +161,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
@@ -308,7 +309,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -322,7 +323,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index 5ffcc7d65d..e41df12d91 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
@@ -379,8 +380,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
@@ -401,8 +402,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) {
   AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
index d5b37b1ce1..9217c25978 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -181,7 +182,7 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
     int cluster_count = 0;
     for (const NodeDef& node_def : output_graph_def_.node()) {
       const string& name = node_def.name();
-      if (StringPiece(name).starts_with(REMOTE_FUSED_GRAPH_NODE_NAME)) {
+      if (str_util::StartsWith(name, REMOTE_FUSED_GRAPH_NODE_NAME)) {
         ++cluster_count;
         RemoteFusedGraphExecuteInfo info;
         string serialized_proto;
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 25a37d5e1a..c23570d885 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -218,9 +219,8 @@ TEST_F(ResizeBicubicOpTest, TestBicubic2x2To0x0) {
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index a920e60281..6d57892828 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -457,9 +458,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidOutputSize) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
@@ -467,8 +467,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) {
   AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: input must be 4-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: input must be 4-dimensional"))
       << s;
 }
 
@@ -476,8 +476,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must be 1-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must be 1-dimensional"))
       << s;
 }
 
@@ -485,8 +485,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must have two elements"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must have two elements"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
index 90b6f8d0f3..e431226aa6 100644
--- a/tensorflow/core/kernels/roll_op_test.cc
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -372,7 +373,8 @@ TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(), "input must be 1-D or higher"))
       << s;
 }
 
@@ -384,8 +386,8 @@ TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("axis must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "axis must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -397,8 +399,8 @@ TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -410,8 +412,8 @@ TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
   AddInputFromArray<int32>(TensorShape({1}), {1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift and axis must have the same size"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift and axis must have the same size"))
       << s;
 }
 
@@ -423,7 +425,7 @@ TEST_F(RollOpTest, Error_AxisOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "is out of range")) << s;
 }
 
 // isd - (inner shift dimension) The inner most dimension to be shifted.
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index ae81efa31d..c134a8dd5b 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -183,9 +184,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid indices: [2,0] = [99] does not index into [5,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid indices: [2,0] = [99] does not index into [5,3]"))
       << s;
 }
 
@@ -198,10 +198,10 @@ TEST_F(ScatterNdUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("The outermost dimension of updates and indices "
-                            "must match. Got indices.shape [1,3,1], "
-                            "updates.shape [3,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match. Got "
+      "indices.shape [1,3,1], updates.shape [3,3]"))
       << s;
 }
 
@@ -216,10 +216,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Must have updates.shape = indices.shape[:batch_dim]"))
-
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Must have updates.shape = indices.shape[:batch_dim]"))
       << s;
 }
 
@@ -233,10 +231,9 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "The outermost dimension of updates and indices must match."))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match."))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
index 5b3537b94c..2ec8c42233 100644
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -170,7 +171,7 @@ TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) {
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
@@ -183,8 +184,9 @@ TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
@@ -200,8 +202,9 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
 
       << s;
@@ -217,8 +220,9 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 5a389a6548..623de2a482 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -302,6 +302,11 @@ Status Examples::SampleAdaptiveProbabilities(
   return Status::OK();
 }
 
+void Examples::RandomShuffle() {
+  std::iota(sampled_index_.begin(), sampled_index_.end(), 0);
+  std::random_shuffle(sampled_index_.begin(), sampled_index_.end());
+}
+
 // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
 Status Examples::Initialize(OpKernelContext* const context,
                             const ModelWeights& weights,
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 1665b1210e..bfdb3febdc 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -322,10 +322,7 @@ class Examples {
     return examples_.at(example_index);
   }
 
-  int sampled_index(const int id, const bool adaptive) const {
-    if (adaptive) return sampled_index_[id];
-    return id;
-  }
+  int sampled_index(const int id) const { return sampled_index_[id]; }
 
   // Adaptive SDCA in the current implementation only works for
   // binary classification, where the input argument for num_weight_vectors
@@ -337,6 +334,8 @@ class Examples {
       const std::unique_ptr<DualLossUpdater>& loss_updater,
       const int num_weight_vectors);
 
+  void RandomShuffle();
+
   int num_examples() const { return examples_.size(); }
 
   int num_features() const { return num_features_; }
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 5b63057f3f..55e68b348b 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -153,8 +153,9 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
                        options.num_loss_partitions, options.regularizations,
                        model_weights, example_state_data, options.loss_updater,
                        /*num_weight_vectors =*/1));
+  } else {
+    examples.RandomShuffle();
   }
-
   mutex mu;
   Status train_step_status GUARDED_BY(mu);
   std::atomic<std::int64_t> atomic_index(-1);
@@ -162,8 +163,7 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
     for (int id = static_cast<int>(begin); id < end; ++id) {
-      const int64 example_index =
-          examples.sampled_index(++atomic_index, options.adaptive);
+      const int64 example_index = examples.sampled_index(++atomic_index);
       const Example& example = examples.example(example_index);
       const float dual = example_state_data(example_index, 0);
       const float example_weight = example.example_weight();
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
index a545fb146c..9cd590ae61 100644
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ b/tensorflow/core/kernels/shape_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -62,8 +63,8 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
 REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE",
                                       GetShapeFromKnownVecSize);
 
-static void ExpectHasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+static void ExpectHasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index e1712ac239..e72608945b 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -55,7 +56,7 @@ template <typename Device, typename T>
 class SoftmaxOp : public OpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
index 130d693dbd..b63dcbb163 100644
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -128,7 +129,7 @@ template <typename T>
 class SoftmaxOpGPU : public OpKernel {
  public:
   explicit SoftmaxOpGPU(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index fe198af7e6..29577ebb4e 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -32,7 +33,7 @@ namespace tensorflow {
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
index 3c46abb8ab..9dcabcc584 100644
--- a/tensorflow/core/kernels/summary_op_test.cc
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -122,7 +123,7 @@ TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({3}), {1.0f, -0.73f, 10000.0f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not the same shape")) << s;
 }
 
 TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
@@ -133,7 +134,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -145,7 +146,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -256,7 +257,7 @@ TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
   AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
@@ -266,7 +267,7 @@ TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 // --------------------------------------------------------------------------
@@ -365,7 +366,7 @@ TEST_F(SummaryMergeOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}),
                             {s1.SerializeAsString(), s2.SerializeAsString()});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "Duplicate tag")) << s;
 }
 
 }  // namespace
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 6119edfd5a..1b1faed703 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -67,11 +67,18 @@ limitations under the License.
 #define TF_EXPORT __attribute__((visibility("default")))
 #endif  // COMPILER_MSVC
 
-// GCC can be told that a certain branch is not likely to be taken (for
-// instance, a CHECK failure), and use that information in static analysis.
-// Giving it this information can help it optimize for the common case in
-// the absence of better information (ie. -fprofile-arcs).
-#if defined(COMPILER_GCC3)
+#ifdef __has_builtin
+#define TF_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TF_HAS_BUILTIN(x) 0
+#endif
+
+// Compilers can be told that a certain branch is not likely to be taken
+// (for instance, a CHECK failure), and use that information in static
+// analysis. Giving it this information can help it optimize for the
+// common case in the absence of better information (ie.
+// -fprofile-arcs).
+#if TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3)
 #define TF_PREDICT_FALSE(x) (__builtin_expect(x, 0))
 #define TF_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
 #else
diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
index 61bb66bd21..4ccd43ce68 100644
--- a/tensorflow/core/profiler/g3doc/profile_model_architecture.md
+++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
@@ -45,22 +45,22 @@ sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 
 For an operation to have float operation statistics:
 
-* It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
-use the definition to calculate float operations. Contributes are welcome.
-
-* It must have known "shape" information for RegisterStatistics('flops')
-to calculate the statistics. It is suggested to pass in `-run_meta_path` if
-shape is only known during runtime. tfprof can fill in the missing shape with
-the runtime shape information from RunMetadata.
-Hence, it is suggested to use `-account_displayed_op_only`
-option so that you know the statistics are only for the operations printed out.
-
-* If no RunMetadata provided, tfprof count float_ops of each graph node once,
-even if it is defined in tf.while_loop. This is because tfprof doesn't know
-how many times are run statically. If RunMetadata provided, tfprof calculate
-float_ops as float_ops * run_count.
-
-
+*   It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
+    uses the definition to calculate float operations. Contributions are
+    welcomed.
+
+*   It must have known "shape" information for RegisterStatistics('flops') to
+    calculate the statistics. It is suggested to pass in `-run_meta_path` if
+    shape is only known during runtime. tfprof can fill in the missing shape
+    with the runtime shape information from RunMetadata. Hence, it is suggested
+    to use `-account_displayed_op_only` option so that you know the statistics
+    are only for the operations printed out.
+
+*   If no RunMetadata is provided, tfprof counts float_ops of each graph node
+    once, even if it is defined in a tf.while_loop. This is because tfprof
+    doesn't know statically how many times each graph node is run. If
+    RunMetadata is provided, tfprof calculates float_ops as float_ops *
+    run_count.
 
 ```python
 # To profile float opertions in commandline, you need to pass --graph_path
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
index 96392a3c9b..4133bc172a 100644
--- a/tensorflow/docs_src/mobile/tflite/devguide.md
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -190,7 +190,7 @@ graph visualization.
 
 ## 3. Use the TensorFlow Lite model for inference in a mobile app
 
-After completing the prior steps, you should now have a .tflite model file.
+After completing the prior steps, you should now have a `.tflite` model file.
 
 ### Android
 
@@ -222,3 +222,10 @@ trained Tensorflow models to the
 [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
 devices. To use the converter, refer to the
 [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+
+### Raspberry Pi
+
+Compile Tensorflow Lite for a Raspberry Pi by following the
+[RPi build instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/rpi.md)
+This compiles a static library file (`.a`) used to build your app. There are
+plans for Python bindings and a demo app.
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index d11a7e5d07..1f894c39fe 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,3 +1,4 @@
+index.md
 performance_guide.md
 datasets_performance.md
 performance_models.md
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 99a71206ac..fcc191250f 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -870,15 +870,16 @@ def run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
     resized_image_tensor: The input node of the recognition graph.
     bottleneck_tensor: The bottleneck output layer of the CNN graph.
   """
-  (sess, bottleneck_input, ground_truth_input, evaluation_step,
-   prediction) = build_eval_session(model_info, class_count)
-
   test_bottlenecks, test_ground_truth, test_filenames = (
       get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
                                     'testing', FLAGS.bottleneck_dir,
                                     FLAGS.image_dir, jpeg_data_tensor,
                                     decoded_image_tensor, resized_image_tensor,
                                     bottleneck_tensor, FLAGS.architecture))
+
+  (sess, bottleneck_input, ground_truth_input, evaluation_step,
+   prediction) = build_eval_session(model_info, class_count)
+
   test_accuracy, predictions = sess.run(
       [evaluation_step, prediction],
       feed_dict={
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index b48d758e4a..b6481e7e29 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -629,15 +629,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
   TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
 }
 
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status) {
-  std::vector<int64_t> dims(num_dims);
-  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
-  return dims;
-}
-
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results) {
   int num_missing_unused_input_mappings;
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index d2b4abc476..cfd27c2bee 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -229,13 +229,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
                                     const std::vector<int64_t>& dims,
                                     bool unknown_shape, TF_Status* status);
 
-// Return the shape of output. `num_dims` should be the output of
-// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status);
-
 // Returns the string representations of the missing unused input mappings.
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results);
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 7ad37058fd..3aad4a114a 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -217,10 +217,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_identity(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
+          30000)
 
   def benchmark_tf_gradient_forward_identity(self):
     with backprop.GradientTape() as tape:
@@ -236,10 +237,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_no_op(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(lambda x: x, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(lambda x: x, [0])(m),
+          30000)
 
   def _benchmark_np_matmul(self, m, transpose_b, num_iters):
     a = m.cpu().numpy()
@@ -271,11 +273,12 @@ class MicroBenchmarks(test.Benchmark):
     # pylint: disable=protected-access
     ctx_handle = context.context()._handle
     # pylint: enable=protected-access
+    device = context.context().device_name
     attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
              m.dtype.as_datatype_enum)
     def func():
-      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "MatMul", inputs,
-                                       attrs, 1)
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul",
+                                       inputs, attrs, 1)
 
     self._run(func, num_iters)
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 711eddcec1..61859d6be3 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -294,7 +294,7 @@ class _EagerDefinedFunction(object):
     self.signature = function_def.signature
     self.grad_func_name = None
     self.python_grad_func = None
-    self._c_func = fn
+    self._c_func = c_api_util.ScopedTFFunction(fn)
     self._grad_func = None
 
 
@@ -661,7 +661,7 @@ def _defun_internal(name, func, args, kwds):
   if context.executing_eagerly():
     for f in tmp_graph._functions.values():  # pylint: disable=protected-access
       # TODO(ashankar): What about the gradient registry?
-      _register(f._c_func)  # pylint: disable=protected-access
+      _register(f._c_func.func)  # pylint: disable=protected-access
   return GraphModeFunction(
       fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
       func_outputs, output_shapes, variables)
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index ee5d87f083..d40ea982c7 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -325,7 +325,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register(f._c_func)  # pylint: disable=protected-access
+    function._register(f._c_func.func)  # pylint: disable=protected-access
   initializer_function = function.GraphModeFunction(
       initialization_name,
       placeholder_inputs,
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 4356a534b4..7bbe3183df 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -63,6 +63,32 @@ class ScopedTFImportGraphDefOptions(object):
       c_api.TF_DeleteImportGraphDefOptions(self.options)
 
 
+class ScopedTFImportGraphDefResults(object):
+  """Wrapper around TF_ImportGraphDefOptions that handles deletion."""
+
+  def __init__(self, results):
+    self.results = results
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteImportGraphDefResults is not None:
+      c_api.TF_DeleteImportGraphDefResults(self.results)
+
+
+class ScopedTFFunction(object):
+  """Wrapper around TF_Function that handles deletion."""
+
+  def __init__(self, func):
+    self.func = func
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteFunction is not None:
+      c_api.TF_DeleteFunction(self.func)
+
+
 @tf_contextlib.contextmanager
 def tf_buffer(data=None):
   """Context manager that creates and deletes TF_Buffer.
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index c5caf9ebc0..9570f009a5 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -274,7 +274,7 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     if self._c_func:
       with c_api_util.tf_buffer() as buf:
-        c_api.TF_FunctionToFunctionDef(self._c_func, buf)
+        c_api.TF_FunctionToFunctionDef(self._c_func.func, buf)
         fdef = function_pb2.FunctionDef()
         proto_data = c_api.TF_GetBuffer(buf)
         fdef.ParseFromString(compat.as_bytes(proto_data))
@@ -397,7 +397,7 @@ class _DefinedFunction(object):
                       if self._out_names else [])
       description = self._func.__doc__ or None
       # pylint: disable=protected-access
-      self._c_func = c_api.TF_GraphToFunction_wrapper(
+      c_func = c_api.TF_GraphToFunction_wrapper(
           temp_graph._c_graph,
           base_func_name,
           self._func_name is None,  # append_hash_to_fn_name
@@ -407,6 +407,7 @@ class _DefinedFunction(object):
           output_names,
           None,  # opts
           description)
+      self._c_func = c_api_util.ScopedTFFunction(c_func)
       # pylint: enable=protected-access
       self._set_c_attrs(kwargs_attr)
 
@@ -429,7 +430,7 @@ class _DefinedFunction(object):
       serialized = attr_value.SerializeToString()
       # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
       # It might be worth creating a convenient way to re-use the same status.
-      c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
+      c_api.TF_FunctionSetAttrValueProto(self._c_func.func, compat.as_str(name),
                                          serialized)
 
   def _create_hash_str(self, input_arg, output_arg, node_def):
@@ -825,7 +826,8 @@ def _from_definition(fdef, grad_func=None):
   # pylint: disable=protected-access
   if ops._USE_C_API:
     serialized = fdef.SerializeToString()
-    result._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+    c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+    result._c_func = c_api_util.ScopedTFFunction(c_func)
     result._extra_inputs = []
   else:
     result._definition = fdef
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 83d256fab6..c05396b06e 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -58,12 +58,32 @@ def _OptimizerOptions():
   for cse in [False, True]:
     for inline in [False, True]:
       for cfold in [False, True]:
-        yield config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-            optimizer_options=config_pb2.OptimizerOptions(
-                opt_level=config_pb2.OptimizerOptions.L0,
-                do_common_subexpression_elimination=cse,
-                do_function_inlining=inline,
-                do_constant_folding=cfold)))
+        cfg = config_pb2.ConfigProto(
+            graph_options=config_pb2.GraphOptions(
+                optimizer_options=config_pb2.OptimizerOptions(
+                    opt_level=config_pb2.OptimizerOptions.L0,
+                    do_common_subexpression_elimination=cse,
+                    do_function_inlining=inline,
+                    do_constant_folding=cfold)))
+        if cse:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if inline:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if cfold:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        yield cfg
 
 
 @test_util.with_c_api
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 23f529b988..3f8a8c4bef 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -487,6 +487,7 @@ def import_graph_def(graph_def,
         try:
           results = c_api.TF_GraphImportGraphDefWithResults(
               graph._c_graph, serialized, options)  # pylint: disable=protected-access
+          results = c_api_util.ScopedTFImportGraphDefResults(results)
         except errors.InvalidArgumentError as e:
           # Convert to ValueError for backwards compatibility.
           raise ValueError(str(e))
@@ -515,7 +516,7 @@ def import_graph_def(graph_def,
     # they are likely to be due to a typo.
     missing_unused_input_keys = (
         c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
-            results))
+            results.results))
     if missing_unused_input_keys:
       missing_unused_input_keys = [
           compat.as_str(s) for s in missing_unused_input_keys
@@ -527,7 +528,7 @@ def import_graph_def(graph_def,
     if return_elements is None:
       return None
     else:
-      return _GatherReturnElements(return_elements, graph, results)
+      return _GatherReturnElements(return_elements, graph, results.results)
 
   else:
     g = graph
@@ -684,11 +685,10 @@ def import_graph_def(graph_def,
                      ', '.join(x.name for x in op._input_types))))
         # pylint: enable=protected-access
 
-        if not g._is_function(op.type):  # pylint: disable=protected-access
-          # Execute shape inference for this op.
-          # NOTE(mrry): If the graph contains a cycle, the full shape
-          # information may not be available for this op's inputs.
-          ops.set_shapes_for_outputs(op)
+        # Execute shape inference for this op.
+        # NOTE(mrry): If the graph contains a cycle, the full shape
+        # information may not be available for this op's inputs.
+        ops.set_shape_and_handle_data_for_outputs(op)
         # For nodes with _output_shapes set, set the output shapes.
         if '_output_shapes' in op.node_def.attr:
           for i, output in enumerate(op.outputs):
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 2d55f98a1c..2574fa57a4 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -289,15 +289,26 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-    self._shape_val = tensor_shape.unknown_shape()
+
+    if _USE_C_API:
+      # This will be set by set_shape_and_handle_data_for_outputs.
+      self._shape_val = None
+    else:
+      # The Python code requires all tensors start with a shape to support shape
+      # inference on imported while loops. This isn't necessary with the C API
+      # enabled because the C API provides the shapes for imported nodes.
+      # TODO(skyewm): remove when _USE_C_API is removed.
+      self._shape_val = tensor_shape.unknown_shape()
+
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
 
-    # Attributes used for C++ shape inference. Not inspected, only forwarded.
-    # If set, will be a HandleData object from cpp_shape_inference.proto.
-    # TODO(b/74620627): remove when _USE_C_SHAPES is removed
-    self._handle_data = None
+    if not _USE_C_SHAPES:
+      # Attributes used for C++ shape inference. Not inspected, only forwarded.
+      # If set, will be a HandleData object from cpp_shape_inference.proto.
+      self._handle_data = None
+
     self._id = uid()
 
   @property
@@ -371,18 +382,45 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
-    graph = self._op._graph._c_graph # pylint: disable=protected-access
-    if graph and _USE_C_SHAPES:
-      num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output())
-      if num_dims == -1:
-        dim_list = None
+    if self._shape_val is None:
+      if _USE_C_SHAPES:
+        self._shape_val = self._c_api_shape()
       else:
-        dim_list = c_api.TF_GraphGetTensorShape_wrapper(
-            graph, self._as_tf_output(), num_dims)
-        dim_list = [None if i == -1 else i for i in dim_list]
-      return tensor_shape.TensorShape(dim_list)
+        assert _USE_C_API
+        # Call set_shape_and_handle_data_for_outputs in topological order on all
+        # ops that are needed to compute self.op's shape. We do this instead of
+        # having set_shape_and_handle_data_for_outputs recursively call
+        # Operation.shape on self.op.inputs to overflowing the call stack.
+        need_shapes = self._get_input_ops_without_shapes(self.op)
+        need_shapes.sort(key=lambda op: op._id)
+        for op in need_shapes:
+          set_shape_and_handle_data_for_outputs(op)
     return self._shape_val
 
+  def _get_input_ops_without_shapes(self, target_op):
+    """Returns ops needing shape inference to compute target_op's shape."""
+    result = []
+    stack = [self._op]
+    visited = set()
+    while stack:
+      op = stack.pop()
+      if op in visited: continue
+      result.append(op)
+      stack.extend(t.op for t in op.inputs if t._shape_val is None)
+      visited.add(op)
+    return result
+
+  def _c_api_shape(self):
+    """Returns the TensorShape of this tensor according to the C API."""
+    c_graph = self._op._graph._c_graph  # pylint: disable=protected-access
+    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+        c_graph, self._as_tf_output())
+    if unknown_shape:
+      return tensor_shape.unknown_shape()
+    else:
+      shape_vector = [None if d == -1 else d for d in shape_vector]
+      return tensor_shape.TensorShape(shape_vector)
+
   @property
   def _shape(self):
     logging.warning("Tensor._shape is private, use Tensor.shape "
@@ -466,8 +504,11 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if not _USE_C_SHAPES:  # pylint: disable=protected-access
-      self._shape_val = self._shape_val.merge_with(shape)
+    if _USE_C_SHAPES:  # pylint: disable=protected-access
+      # Reset cached shape.
+      self._shape_val = None
+    else:
+      self._shape_val = self.shape.merge_with(shape)
 
     if not self._op._graph._c_graph: return
 
@@ -579,6 +620,16 @@ class Tensor(_TensorLike):
     # Necessary to support Python's collection membership operators
     return id(self) == id(other)
 
+  def __copy__(self):
+    # Make sure _shape_val is computed before we copy.
+    # TODO(b/77597810): get rid of Tensor copies.
+    if self._shape_val is None:
+      set_shape_and_handle_data_for_outputs(self.op)
+    cls = self.__class__
+    result = cls.__new__(cls)
+    result.__dict__.update(self.__dict__)
+    return result
+
   # NOTE(mrry): This enables the Tensor's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Tensor class higher priority than an ndarray, or a
@@ -1932,6 +1983,13 @@ class Operation(object):
     if not isinstance(tensor, Tensor):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
+
+    # Make sure output shapes are already computed for this op in case we create
+    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
+    # lazily upon request.
+    if not _USE_C_SHAPES:
+      set_shape_and_handle_data_for_outputs(self)
+
     if self._c_op:
       # Reset cached inputs.
       self._inputs_val = None
@@ -2474,35 +2532,41 @@ class RegisterShape(object):
     return f
 
 
-def _set_shapes_for_outputs_c_api(op):
-  """set_shapes_for_outputs implementation when C API is enabled."""
-  # The C API computes the shapes when the TF_Operation is created. Fetch the
-  # output shapes from the C object.
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def _set_shape_and_handle_data_for_outputs_c_api(op):
+  """Set shapes and resource handle data using info from the C API."""
+  assert not _USE_C_SHAPES
   for output in op.outputs:
-    # pylint: disable=protected-access
-    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+    output._shape_val = output._c_api_shape()
+    # Set the resource handle data for compatibility with the Python shape
+    # inference code.
+    serialized = c_api.ResourceHandleShapeAndType(
         op._graph._c_graph, output._as_tf_output())
-    # pylint: enable=protected-access
-    if unknown_shape:
-      output.set_shape(tensor_shape.unknown_shape())
-    elif not shape_vector:
-      output.set_shape(tensor_shape.scalar())
-    else:
-      shape_vector = [None if d == -1 else d for d in shape_vector]
-      output.set_shape(tensor_shape.TensorShape(shape_vector))
-
-    serialized = c_api.ResourceHandleShapeAndType(op._graph._c_graph,
-                                                  output._as_tf_output())
     if serialized:
       output._handle_data = (
-          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
-              compat.as_bytes(serialized)))
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
+          .FromString(compat.as_bytes(serialized)))
     else:
       output._handle_data = None
 
-# TODO(skyewm): remove this when _USE_C_API flag is removed.
-def _set_shapes_for_outputs(op):
-  """set_shapes_for_outputs implementation when C API is disabled."""
+
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def set_shape_and_handle_data_for_outputs(op):
+  """Set the shapes and resource handle data for op's outputs.
+
+  When _USE_C_API = True, this is lazily called when a tensor's shape is first
+  requested. Usually this should work automatically, but some edge cases may
+  require manaully calling this first to make sure Tensor._shape_val and
+  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
+  Tensor).
+  """
+  if _USE_C_SHAPES: return
+
+  if op.graph._is_function(op.type):
+    for output in op.outputs:
+      output._shape_val = tensor_shape.unknown_shape()
+    return
+
   try:
     shape_func = _shape_registry.lookup(op.type)
   except LookupError:
@@ -2521,8 +2585,10 @@ def _set_shapes_for_outputs(op):
     shapes = shapes_dict["shapes"]
     handle_datas = shapes_dict["handle_data"]
     for output, handle_data in zip(op.outputs, handle_datas):
+      # Don't override any existing handle data that may have been manually set.
       # pylint: disable=protected-access
-      output._handle_data = handle_data
+      if output._handle_data is None:
+        output._handle_data = handle_data
       # pylint: enable=protected-access
 
   if len(op.outputs) != len(shapes):
@@ -2530,15 +2596,8 @@ def _set_shapes_for_outputs(op):
         "Shape function for op %s returned %d shapes but expected %d %s %s" %
         (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
   for output, s in zip(op.outputs, shapes):
-    output.set_shape(s)
-
-
-def set_shapes_for_outputs(op):
-  """Set the shapes for op's outputs."""
-  if op._c_op and _USE_C_SHAPES:  # pylint: disable=protected-access
-    return _set_shapes_for_outputs_c_api(op)
-  else:
-    return _set_shapes_for_outputs(op)
+    output._shape_val = tensor_shape.unknown_shape()
+    output._shape_val = output._shape_val.merge_with(s)
 
 
 class OpStats(object):
@@ -3216,9 +3275,11 @@ class Graph(object):
       # as this will be unnecessary.
       if not function._c_func:
         serialized = function.definition.SerializeToString()
-        function._c_func = c_api.TF_FunctionImportFunctionDef(serialized)
-      gradient = function._grad_func._c_func if function._grad_func else None
-      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient)
+        c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+        function._c_func = c_api_util.ScopedTFFunction(c_func)
+      gradient = (function._grad_func._c_func.func if function._grad_func
+                  else None)
+      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
     else:
       # If there is already a function with the same name, raise an error
       # if bodies are different. Else, do nothing. The C API version above
@@ -3329,18 +3390,14 @@ class Graph(object):
           original_op=self._default_original_op,
           op_def=op_def)
 
-      # TODO(vrv): Instead of eagerly filling in shape property for every op,
-      # only populate the shape when requested.
+      # Note: shapes are lazily computed with the C API enabled.
       #
       # TODO(skyewm): unlike in the original Python implementation, the C API
       # always computes shape information (even for function calls, which the
       # original Python shape inference code doesn't handle). Deprecate the
       # compute_shapes argument.
-      #
-      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
-      # is removed
-      if (ret._c_op and _USE_C_SHAPES) or compute_shapes:  # pylint: disable=protected-access
-        set_shapes_for_outputs(ret)
+      if not _USE_C_API and compute_shapes:
+        set_shape_and_handle_data_for_outputs(ret)
 
       self._create_op_helper(ret, compute_shapes=compute_shapes,
                              compute_device=compute_device)
@@ -3482,18 +3539,17 @@ class Graph(object):
         for c_op in c_api_util.new_tf_operations(self)
     ]
 
+    # pylint: disable=protected-access
     for op in new_ops:
       # Operations created by the C API always retrieve shapes from the C API so
       # we preserve the shapes of ops created in import_graph_def (from the
       # "_output_shapes" attr of the imported NodeDef).
-      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
-      # is removed.
-      _set_shapes_for_outputs_c_api(op)
+      if not _USE_C_SHAPES:
+        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
-      # pylint: disable=protected-access
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
-      # pylint: enable=protected-access
+    # pylint: enable=protected-access
 
     return new_ops
 
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 984bcecdfe..64b0fa6c00 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,7 +22,6 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -828,7 +827,7 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   Returns:
     A `TensorShape` based on the constant value of the given `tensor`.
   """
-  if context.executing_eagerly():
+  if isinstance(tensor, ops.EagerTensor):
     return tensor_shape.as_shape(
         [dim if dim != -1 else None for dim in tensor.numpy()])
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index ad96b53a45..12775fccec 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -84,11 +84,13 @@ from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import DepthwiseConv2D
 from tensorflow.python.keras._impl.keras.layers import Dropout
 from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import Reshape
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -116,195 +118,6 @@ def preprocess_input(x):
   return imagenet_utils.preprocess_input(x, mode='tf')
 
 
-class DepthwiseConv2D(Conv2D):
-  """Depthwise separable 2D convolution.
-
-  Depthwise Separable convolutions consists in performing
-  just the first step in a depthwise spatial convolution
-  (which acts on each input channel separately).
-  The `depth_multiplier` argument controls how many
-  output channels are generated per input channel in the depthwise step.
-
-  Arguments:
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          width and height of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the width and height.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `'valid'` or `'same'` (case-insensitive).
-      depth_multiplier: The number of depthwise convolution output channels
-          for each input channel.
-          The total number of depthwise convolution output
-          channels will be equal to `filters_in * depth_multiplier`.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be 'channels_last'.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. 'linear' activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix.
-      bias_initializer: Initializer for the bias vector.
-      depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its 'activation')..
-      depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-
-  Input shape:
-      4D tensor with shape:
-      `[batch, channels, rows, cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, rows, cols, channels]` if data_format='channels_last'.
-
-  Output shape:
-      4D tensor with shape:
-      `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
-  """
-
-  def __init__(self,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               depth_multiplier=1,
-               data_format=None,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               depthwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super(DepthwiseConv2D, self).__init__(
-        filters=None,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = initializers.get(depthwise_initializer)
-    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
-    self.depthwise_constraint = constraints.get(depthwise_constraint)
-    self.bias_initializer = initializers.get(bias_initializer)
-
-  @shape_type_conversion
-  def build(self, input_shape):
-    if len(input_shape) < 4:
-      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
-                       'Received input shape:', str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = 3
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs to '
-                       '`DepthwiseConv2D` '
-                       'should be defined. Found `None`.')
-    input_dim = int(input_shape[channel_axis])
-    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
-                              input_dim, self.depth_multiplier)
-
-    self.depthwise_kernel = self.add_weight(
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        name='depthwise_kernel',
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint)
-
-    if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(input_dim * self.depth_multiplier,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs, training=None):
-    outputs = K.depthwise_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        strides=self.strides,
-        padding=self.padding,
-        dilation_rate=self.dilation_rate,
-        data_format=self.data_format)
-
-    if self.bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-
-    return outputs
-
-  @shape_type_conversion
-  def compute_output_shape(self, input_shape):
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-      out_filters = input_shape[1] * self.depth_multiplier
-    elif self.data_format == 'channels_last':
-      rows = input_shape[1]
-      cols = input_shape[2]
-      out_filters = input_shape[3] * self.depth_multiplier
-
-    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
-                                         self.padding, self.strides[0])
-    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
-                                         self.padding, self.strides[1])
-
-    if self.data_format == 'channels_first':
-      return (input_shape[0], out_filters, rows, cols)
-    elif self.data_format == 'channels_last':
-      return (input_shape[0], rows, cols, out_filters)
-
-  def get_config(self):
-    config = super(DepthwiseConv2D, self).get_config()
-    config.pop('filters')
-    config.pop('kernel_initializer')
-    config.pop('kernel_regularizer')
-    config.pop('kernel_constraint')
-    config['depth_multiplier'] = self.depth_multiplier
-    config['depthwise_initializer'] = initializers.serialize(
-        self.depthwise_initializer)
-    config['depthwise_regularizer'] = regularizers.serialize(
-        self.depthwise_regularizer)
-    config['depthwise_constraint'] = constraints.serialize(
-        self.depthwise_constraint)
-    return config
-
-
 @tf_export('keras.applications.MobileNet',
            'keras.applications.mobilenet.MobileNet')
 def MobileNet(input_shape=None,
@@ -318,18 +131,11 @@ def MobileNet(input_shape=None,
               classes=1000):
   """Instantiates the MobileNet architecture.
 
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
   To load a MobileNet model via `load_model`, import the custom
-  objects `relu6` and `DepthwiseConv2D` and pass them to the
-  `custom_objects` parameter.
+  objects `relu6` and pass them to the `custom_objects` parameter.
   E.g.
   model = load_model('mobilenet.h5', custom_objects={
-                     'relu6': mobilenet.relu6,
-                     'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+                     'relu6': mobilenet.relu6})
 
   Arguments:
       input_shape: optional shape tuple, only to be specified
@@ -383,11 +189,6 @@ def MobileNet(input_shape=None,
           backend that does not support separable convolutions.
   """
 
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('Only TensorFlow backend is currently supported, '
-                       'as other backends do not support '
-                       'depthwise convolution.')
-
   if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
                      '`None` (random initialization), `imagenet` '
@@ -522,7 +323,7 @@ def MobileNet(input_shape=None,
   # load weights
   if weights == 'imagenet':
     if K.image_data_format() == 'channels_first':
-      raise ValueError('Weights for "channels_last" format '
+      raise ValueError('Weights for "channels_first" format '
                        'are not available.')
     if alpha == 1.0:
       alpha_text = '1_0'
@@ -598,14 +399,14 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   filters = int(filters * alpha)
+  x = ZeroPadding2D(padding=(1, 1), name='conv1_pad')(inputs)
   x = Conv2D(
       filters,
       kernel,
-      padding='same',
+      padding='valid',
       use_bias=False,
       strides=strides,
-      name='conv1')(
-          inputs)
+      name='conv1')(x)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -665,15 +466,14 @@ def _depthwise_conv_block(inputs,
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   pointwise_conv_filters = int(pointwise_conv_filters * alpha)
-
+  x = ZeroPadding2D(padding=(1, 1), name='conv_pad_%d' % block_id)(inputs)
   x = DepthwiseConv2D(  # pylint: disable=not-callable
       (3, 3),
-      padding='same',
+      padding='valid',
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(
-          inputs)
+      name='conv_dw_%d' % block_id)(x)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 46c0e63557..f8c6aff4f2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -45,6 +45,7 @@ from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -236,9 +237,9 @@ def ResNet50(include_top=True,
   else:
     bn_axis = 1
 
+  x = ZeroPadding2D(padding=(3, 3), name='conv1_pad')(img_input)
   x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
-          img_input)
+      64, (7, 7), strides=(2, 2), padding='valid', name='conv1')(x)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 162ae6c28f..7cdebc6aa4 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -27,6 +27,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
@@ -1024,6 +1025,200 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.DepthwiseConv2D')
+class DepthwiseConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  Depthwise Separable convolutions consists in performing
+  just the first step in a depthwise spatial convolution
+  (which acts on each input channel separately).
+  The `depth_multiplier` argument controls how many
+  output channels are generated per input channel in the depthwise step.
+
+  Arguments:
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+        width and height of the 2D convolution window.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+        specifying the strides of the convolution along the width and height.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: one of `'valid'` or `'same'` (case-insensitive).
+    depth_multiplier: The number of depthwise convolution output channels
+        for each input channel.
+        The total number of depthwise convolution output
+        channels will be equal to `filters_in * depth_multiplier`.
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, height, width, channels)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, channels, height, width)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be 'channels_last'.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. 'linear' activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    depthwise_initializer: Initializer for the depthwise kernel matrix.
+    bias_initializer: Initializer for the bias vector.
+    depthwise_regularizer: Regularizer function applied to
+        the depthwise kernel matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+        the output of the layer (its 'activation').
+    depthwise_constraint: Constraint function applied to
+        the depthwise kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+    4D tensor with shape:
+    `[batch, channels, rows, cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+  Output shape:
+    4D tensor with shape:
+    `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               depth_multiplier=1,
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(DepthwiseConv2D, self).__init__(
+        filters=None,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  def build(self, input_shape):
+    if len(input_shape) < 4:
+      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+                       'Received input shape:', str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = 3
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs to '
+                       '`DepthwiseConv2D` '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    depthwise_kernel_shape = (self.kernel_size[0],
+                              self.kernel_size[1],
+                              input_dim,
+                              self.depth_multiplier)
+
+    self.depthwise_kernel = self.add_weight(
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        name='depthwise_kernel',
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs, training=None):
+    outputs = K.depthwise_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        strides=self.strides,
+        padding=self.padding,
+        dilation_rate=self.dilation_rate,
+        data_format=self.data_format)
+
+    if self.bias:
+      outputs = K.bias_add(
+          outputs,
+          self.bias,
+          data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+
+    return outputs
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+      out_filters = input_shape[1] * self.depth_multiplier
+    elif self.data_format == 'channels_last':
+      rows = input_shape[1]
+      cols = input_shape[2]
+      out_filters = input_shape[3] * self.depth_multiplier
+
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return (input_shape[0], out_filters, rows, cols)
+    elif self.data_format == 'channels_last':
+      return (input_shape[0], rows, cols, out_filters)
+
+  def get_config(self):
+    config = super(DepthwiseConv2D, self).get_config()
+    config.pop('filters')
+    config.pop('kernel_initializer')
+    config.pop('kernel_regularizer')
+    config.pop('kernel_constraint')
+    config['depth_multiplier'] = self.depth_multiplier
+    config['depthwise_initializer'] = initializers.serialize(
+        self.depthwise_initializer)
+    config['depthwise_regularizer'] = regularizers.serialize(
+        self.depthwise_regularizer)
+    config['depthwise_constraint'] = constraints.serialize(
+        self.depthwise_constraint)
+    return config
+
+
 @tf_export('keras.layers.UpSampling1D')
 class UpSampling1D(Layer):
   """Upsampling layer for 1D inputs.
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index b78962d66a..6b2a1d98fe 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
 """Convolutional-recurrent layers.
 """
 from __future__ import absolute_import
@@ -26,181 +27,456 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
+from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask
+from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-class ConvRecurrent2D(Recurrent):
-  """Abstract base class for convolutional recurrent layers.
-
-  Do not use in a model -- it's not a functional layer!
+class ConvRNN2D(RNN):
+  """Base class for convolutional-recurrent layers.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of n integers, specifying the
-          dimensions of the convolution window.
-      strides: An integer or tuple/list of n integers,
-          specifying the strides of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of n integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
+    cell: A RNN cell instance. A RNN cell is a class that has:
+        - a `call(input_at_t, states_at_t)` method, returning
+            `(output_at_t, states_at_t_plus_1)`. The call method of the
+            cell can also take the optional argument `constants`, see
+            section "Note on passing external constants" below.
+        - a `state_size` attribute. This can be a single integer
+            (single state) in which case it is
+            the number of channels of the recurrent state
+            (which should be the same as the number of channels of the cell
+            output). This can also be a list/tuple of integers
+            (one size per state). In this case, the first entry
+            (`state_size[0]`) should be the same as
+            the size of the cell output.
+    return_sequences: Boolean. Whether to return the last output.
+        in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+        in addition to the output.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards and return the
+        reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    input_shape: Use this argument to specify the shape of the
+        input when this layer is the first one in a model.
 
   Input shape:
-      5D tensor with shape `(num_samples, timesteps, channels, rows, cols)`.
+    5D tensor with shape:
+    `(samples, timesteps, channels, rows, cols)`
+    if data_format='channels_first' or 5D tensor with shape:
+    `(samples, timesteps, rows, cols, channels)`
+    if data_format='channels_last'.
 
   Output shape:
-      - if `return_sequences`: 5D tensor with shape
-          `(num_samples, timesteps, channels, rows, cols)`.
-      - else, 4D tensor with shape `(num_samples, channels, rows, cols)`.
-
-  # Masking
-      This layer supports masking for input data with a variable number
-      of timesteps. To introduce masks to your data,
-      use an `Embedding` layer with the `mask_zero` parameter
-      set to `True`.
-      **Note:** for the time being, masking is only supported with Theano.
-
-  # Note on using statefulness in RNNs
-      You can set RNN layers to be 'stateful', which means that the states
-      computed for the samples in one batch will be reused as initial states
-      for the samples in the next batch.
-      This assumes a one-to-one mapping between
-      samples in different successive batches.
-
-      To enable statefulness:
-          - specify `stateful=True` in the layer constructor.
-          - specify a fixed batch size for your model, by passing
-              a `batch_input_size=(...)` to the first layer in your model.
-              This is the expected shape of your inputs *including the batch
-              size*.
-              It should be a tuple of integers, e.g. `(32, 10, 100)`.
-
-      To reset the states of your model, call `.reset_states()` on either
-      a specific layer, or on your entire model.
+    - if `return_state`: a list of tensors. The first tensor is
+        the output. The remaining tensors are the last states,
+        each 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to padding.
+    - if `return_sequences`: 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+    - else, 4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+
+  Masking:
+    This layer supports masking for input data with a variable number
+    of timesteps. To introduce masks to your data,
+    use an Embedding layer with the `mask_zero` parameter
+    set to `True`.
+
+  Note on using statefulness in RNNs:
+    You can set RNN layers to be 'stateful', which means that the states
+    computed for the samples in one batch will be reused as initial states
+    for the samples in the next batch. This assumes a one-to-one mapping
+    between samples in different successive batches.
+    To enable statefulness:
+        - specify `stateful=True` in the layer constructor.
+        - specify a fixed batch size for your model, by passing
+             - if sequential model:
+                `batch_input_shape=(...)` to the first layer in your model.
+             - if functional model with 1 or more Input layers:
+                `batch_shape=(...)` to all the first layers in your model.
+                This is the expected shape of your inputs
+                *including the batch size*.
+                It should be a tuple of integers,
+                e.g. `(32, 10, 100, 100, 32)`.
+                Note that the number of rows and columns should be specified
+                too.
+        - specify `shuffle=False` when calling fit().
+    To reset the states of your model, call `.reset_states()` on either
+    a specific layer, or on your entire model.
+
+  Note on specifying the initial state of RNNs:
+    You can specify the initial state of RNN layers symbolically by
+    calling them with the keyword argument `initial_state`. The value of
+    `initial_state` should be a tensor or list of tensors representing
+    the initial state of the RNN layer.
+    You can specify the initial state of RNN layers numerically by
+    calling `reset_states` with the keyword argument `states`. The value of
+    `states` should be a numpy array or list of numpy arrays representing
+    the initial state of the RNN layer.
+
+  Note on passing external constants to RNNs:
+    You can pass "external" constants to the cell using the `constants`
+    keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
+    requires that the `cell.call` method accepts the same keyword argument
+    `constants`. Such constants can be used to condition the cell
+    transformation on additional static inputs (not changing over time),
+    a.k.a. an attention mechanism.
   """
 
   def __init__(self,
-               filters,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               data_format=None,
-               dilation_rate=(1, 1),
+               cell,
                return_sequences=False,
+               return_state=False,
                go_backwards=False,
                stateful=False,
+               unroll=False,
                **kwargs):
-    super(ConvRecurrent2D, self).__init__(**kwargs)
-    self.filters = filters
-    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
-    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
-                                                    'dilation_rate')
-    self.return_sequences = return_sequences
-    self.go_backwards = go_backwards
-    self.stateful = stateful
+    if unroll:
+      raise TypeError('Unrolling isn\'t possible with '
+                      'convolutional RNNs.')
+    if isinstance(cell, (list, tuple)):
+      # The StackedConvRNN2DCells isn't implemented yet.
+      raise TypeError('It is not possible at the moment to'
+                      'stack convolutional cells.')
+    super(ConvRNN2D, self).__init__(cell,
+                                    return_sequences,
+                                    return_state,
+                                    go_backwards,
+                                    stateful,
+                                    unroll,
+                                    **kwargs)
     self.input_spec = [InputSpec(ndim=5)]
-    self.state_spec = None
+    self.states = None
 
   @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    if self.data_format == 'channels_first':
+
+    cell = self.cell
+    if cell.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
-    elif self.data_format == 'channels_last':
+    elif cell.data_format == 'channels_last':
       rows = input_shape[2]
       cols = input_shape[3]
-    rows = conv_utils.conv_output_length(
-        rows,
-        self.kernel_size[0],
-        padding=self.padding,
-        stride=self.strides[0],
-        dilation=self.dilation_rate[0])
-    cols = conv_utils.conv_output_length(
-        cols,
-        self.kernel_size[1],
-        padding=self.padding,
-        stride=self.strides[1],
-        dilation=self.dilation_rate[1])
+    rows = conv_utils.conv_output_length(rows,
+                                         cell.kernel_size[0],
+                                         padding=cell.padding,
+                                         stride=cell.strides[0],
+                                         dilation=cell.dilation_rate[0])
+    cols = conv_utils.conv_output_length(cols,
+                                         cell.kernel_size[1],
+                                         padding=cell.padding,
+                                         stride=cell.strides[1],
+                                         dilation=cell.dilation_rate[1])
+
+    if cell.data_format == 'channels_first':
+      output_shape = input_shape[:2] + (cell.filters, rows, cols)
+    elif cell.data_format == 'channels_last':
+      output_shape = input_shape[:2] + (rows, cols, cell.filters)
+
+    if not self.return_sequences:
+      output_shape = output_shape[:1] + output_shape[2:]
+
+    if self.return_state:
+      output_shape = [output_shape]
+      if cell.data_format == 'channels_first':
+        output_shape += [(input_shape[0], cell.filters, rows, cols)
+                         for _ in range(2)]
+      elif cell.data_format == 'channels_last':
+        output_shape += [(input_shape[0], rows, cols, cell.filters)
+                         for _ in range(2)]
+    return output_shape
+
+  @shape_type_conversion
+  def build(self, input_shape):
+    # Note input_shape will be list of shapes of initial states and
+    # constants if these are passed in __call__.
+    if self._num_constants is not None:
+      constants_shape = input_shape[-self._num_constants:]
+    else:
+      constants_shape = None
+
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:5])
+
+    # allow cell (if layer) to build before we set or validate state_spec
+    if isinstance(self.cell, Layer):
+      step_input_shape = (input_shape[0],) + input_shape[2:]
+      if constants_shape is not None:
+        self.cell.build([step_input_shape] + constants_shape)
+      else:
+        self.cell.build(step_input_shape)
+
+    # set or validate state_spec
+    if hasattr(self.cell.state_size, '__len__'):
+      state_size = list(self.cell.state_size)
+    else:
+      state_size = [self.cell.state_size]
+
+    if self.state_spec is not None:
+      # initial_state was passed in call, check compatibility
+      if self.cell.data_format == 'channels_first':
+        ch_dim = 1
+      elif self.cell.data_format == 'channels_last':
+        ch_dim = 3
+      if [spec.shape[ch_dim] for spec in self.state_spec] != state_size:
+        raise ValueError(
+            'An initial_state was passed that is not compatible with '
+            '`cell.state_size`. Received `state_spec`={}; '
+            'However `cell.state_size` is '
+            '{}'.format([spec.shape for spec in self.state_spec],
+                        self.cell.state_size))
+    else:
+      if self.cell.data_format == 'channels_first':
+        self.state_spec = [InputSpec(shape=(None, dim, None, None))
+                           for dim in state_size]
+      elif self.cell.data_format == 'channels_last':
+        self.state_spec = [InputSpec(shape=(None, None, None, dim))
+                           for dim in state_size]
+    if self.stateful:
+      self.reset_states()
+    self.built = True
+
+  def get_initial_state(self, inputs):
+    # (samples, timesteps, rows, cols, filters)
+    initial_state = K.zeros_like(inputs)
+    # (samples, rows, cols, filters)
+    initial_state = K.sum(initial_state, axis=1)
+    shape = list(self.cell.kernel_shape)
+    shape[-1] = self.cell.filters
+    initial_state = self.cell.input_conv(initial_state,
+                                         K.zeros(tuple(shape)),
+                                         padding=self.cell.padding)
+
+    if hasattr(self.cell.state_size, '__len__'):
+      return [initial_state for _ in self.cell.state_size]
+    else:
+      return [initial_state]
+
+  def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
+    inputs, initial_state, constants = self._standardize_args(
+        inputs, initial_state, constants)
+
+    if initial_state is None and constants is None:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+    # If any of `initial_state` or `constants` are specified and are Keras
+    # tensors, then add them to the inputs and temporarily modify the
+    # input_spec to include them.
+
+    additional_inputs = []
+    additional_specs = []
+    if initial_state is not None:
+      kwargs['initial_state'] = initial_state
+      additional_inputs += initial_state
+      self.state_spec = []
+      for state in initial_state:
+        shape = K.int_shape(state)
+        self.state_spec.append(InputSpec(shape=shape))
+
+      additional_specs += self.state_spec
+    if constants is not None:
+      kwargs['constants'] = constants
+      additional_inputs += constants
+      self.constants_spec = [InputSpec(shape=K.int_shape(constant))
+                             for constant in constants]
+      self._num_constants = len(constants)
+      additional_specs += self.constants_spec
+    # at this point additional_inputs cannot be empty
+    for tensor in additional_inputs:
+      if K.is_keras_tensor(tensor) != K.is_keras_tensor(additional_inputs[0]):
+        raise ValueError('The initial state or constants of an RNN'
+                         ' layer cannot be specified with a mix of'
+                         ' Keras tensors and non-Keras tensors')
+
+    if K.is_keras_tensor(additional_inputs[0]):
+      # Compute the full input spec, including state and constants
+      full_input = [inputs] + additional_inputs
+      full_input_spec = self.input_spec + additional_specs
+      # Perform the call with temporarily replaced input_spec
+      original_input_spec = self.input_spec
+      self.input_spec = full_input_spec
+      output = super(ConvRNN2D, self).__call__(full_input, **kwargs)
+      self.input_spec = original_input_spec
+      return output
+    else:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           initial_state=None,
+           constants=None):
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' +
+                       str(len(initial_state)) +
+                       ' initial states.')
+    timesteps = K.int_shape(inputs)[1]
+
+    kwargs = {}
+    if generic_utils.has_arg(self.cell.call, 'training'):
+      kwargs['training'] = training
+
+    if constants:
+      if not generic_utils.has_arg(self.cell.call, 'constants'):
+        raise ValueError('RNN cell does not support constants')
+
+      def step(inputs, states):
+        constants = states[-self._num_constants:]
+        states = states[:-self._num_constants]
+        return self.cell.call(inputs, states, constants=constants,
+                              **kwargs)
+    else:
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+    last_output, outputs, states = K.rnn(step,
+                                         inputs,
+                                         initial_state,
+                                         constants=constants,
+                                         go_backwards=self.go_backwards,
+                                         mask=mask,
+                                         input_length=timesteps)
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(K.update(self.states[i], states[i]))
+      self.add_update(updates, inputs=True)
+
     if self.return_sequences:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
-                        cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], input_shape[1], rows, cols,
-                        self.filters)
+      output = outputs
     else:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], self.filters, rows, cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], rows, cols, self.filters)
+      output = last_output
+
+    # Properly set learning phase
+    if getattr(last_output, '_uses_learning_phase', False):
+      output._uses_learning_phase = True
 
     if self.return_state:
-      if self.data_format == 'channels_first':
-        output_shape = [output_shape] + [
-            (input_shape[0], self.filters, rows, cols) for _ in range(2)
-        ]
-      elif self.data_format == 'channels_last':
-        output_shape = [output_shape] + [
-            (input_shape[0], rows, cols, self.filters) for _ in range(2)
-        ]
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      else:
+        states = list(states)
+      return [output] + states
+    else:
+      return output
 
-    return output_shape
+  def reset_states(self, states=None):
+    if not self.stateful:
+      raise AttributeError('Layer must be stateful.')
+    input_shape = self.input_spec[0].shape
+    state_shape = self.compute_output_shape(input_shape)
+    if self.return_state:
+      state_shape = state_shape[0]
+    if self.return_sequences:
+      state_shape = state_shape[:1].concatenate(state_shape[2:])
+    if None in state_shape:
+      raise ValueError('If a RNN is stateful, it needs to know '
+                       'its batch size. Specify the batch size '
+                       'of your input tensors: \n'
+                       '- If using a Sequential model, '
+                       'specify the batch size by passing '
+                       'a `batch_input_shape` '
+                       'argument to your first layer.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a '
+                       '`batch_shape` argument to your Input layer.\n'
+                       'The same thing goes for the number of rows and '
+                       'columns.')
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'return_sequences': self.return_sequences,
-        'go_backwards': self.go_backwards,
-        'stateful': self.stateful
-    }
-    base_config = super(ConvRecurrent2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    # helper function
+    def get_tuple_shape(nb_channels):
+      result = list(state_shape)
+      if self.cell.data_format == 'channels_first':
+        result[1] = nb_channels
+      elif self.cell.data_format == 'channels_last':
+        result[3] = nb_channels
+      else:
+        raise KeyError
+      return tuple(result)
 
+    # initialize state if None
+    if self.states[0] is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        self.states = [K.zeros(get_tuple_shape(dim))
+                       for dim in self.cell.state_size]
+      else:
+        self.states = [K.zeros(get_tuple_shape(self.cell.state_size))]
+    elif states is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        for state, dim in zip(self.states, self.cell.state_size):
+          K.set_value(state, np.zeros(get_tuple_shape(dim)))
+      else:
+        K.set_value(self.states[0],
+                    np.zeros(get_tuple_shape(self.cell.state_size)))
+    else:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      if len(states) != len(self.states):
+        raise ValueError('Layer ' + self.name + ' expects ' +
+                         str(len(self.states)) + ' states, ' +
+                         'but it received ' + str(len(states)) +
+                         ' state values. Input received: ' + str(states))
+      for index, (value, state) in enumerate(zip(states, self.states)):
+        if hasattr(self.cell.state_size, '__len__'):
+          dim = self.cell.state_size[index]
+        else:
+          dim = self.cell.state_size
+        if value.shape != get_tuple_shape(dim):
+          raise ValueError('State ' + str(index) +
+                           ' is incompatible with layer ' +
+                           self.name + ': expected shape=' +
+                           str(get_tuple_shape(dim)) +
+                           ', found shape=' + str(value.shape))
+        # TODO(anjalisridhar): consider batch calls to `set_value`.
+        K.set_value(state, value)
 
-@tf_export('keras.layers.ConvLSTM2D')
-class ConvLSTM2D(ConvRecurrent2D):
-  """Convolutional LSTM.
 
-  It is similar to an LSTM layer, but the input transformations
-  and recurrent transformations are both convolutional.
+class ConvLSTM2DCell(Layer):
+  """Cell class for the ConvLSTM2D layer.
 
-  Arguments:
+  # Arguments
       filters: Integer, the dimensionality of the output space
           (i.e. the number of output filters in the convolution).
       kernel_size: An integer or tuple/list of n integers, specifying the
@@ -212,11 +488,6 @@ class ConvLSTM2D(ConvRecurrent2D):
       padding: One of `"valid"` or `"same"` (case-insensitive).
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -231,71 +502,32 @@ class ConvLSTM2D(ConvRecurrent2D):
           for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
+          used for the linear transformation of the recurrent state.
       bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Use in combination with `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et
-            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+          This is recommended in [Jozefowicz et al.]
+          (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
           the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
           the `recurrent_kernel` weights matrix.
       bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
           the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
           the `recurrent_kernel` weights matrix.
       bias_constraint: Constraint function applied to the bias vector.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-
-  Input shape:
-      - if data_format='channels_first'
-          5D tensor with shape:
-          `(samples,time, channels, rows, cols)`
-      - if data_format='channels_last'
-          5D tensor with shape:
-          `(samples,time, rows, cols, channels)`
-
-   Output shape:
-      - if `return_sequences`
-           - if data_format='channels_first'
-              5D tensor with shape:
-              `(samples, time, filters, output_row, output_col)`
-           - if data_format='channels_last'
-              5D tensor with shape:
-              `(samples, time, output_row, output_col, filters)`
-      - else
-          - if data_format ='channels_first'
-              4D tensor with shape:
-              `(samples, filters, output_row, output_col)`
-          - if data_format='channels_last'
-              4D tensor with shape:
-              `(samples, output_row, output_col, filters)`
-          where o_row and o_col depend on the shape of the filter and
-          the padding
-
-  Raises:
-      ValueError: in case of invalid constructor arguments.
-
   """
 
   def __init__(self,
@@ -315,27 +547,20 @@ class ConvLSTM2D(ConvRecurrent2D):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
-               return_sequences=False,
-               go_backwards=False,
-               stateful=False,
                dropout=0.,
                recurrent_dropout=0.,
                **kwargs):
-    super(ConvLSTM2D, self).__init__(
-        filters,
-        kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        return_sequences=return_sequences,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        **kwargs)
+    super(ConvLSTM2DCell, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
+                                                    'dilation_rate')
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
     self.use_bias = use_bias
@@ -348,7 +573,6 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -356,45 +580,29 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
+    self.state_size = (self.filters, self.filters)
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-    if self.stateful:
-      self.reset_states()
-    else:
-      # initial states: 2 all-zero tensor of shape (filters)
-      self.states = [None, None]
 
     if self.data_format == 'channels_first':
-      channel_axis = 2
+      channel_axis = 1
     else:
       channel_axis = -1
     if input_shape[channel_axis] is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = input_shape[channel_axis]
-    state_shape = [None] * 4
-    state_shape[channel_axis] = input_dim
-    state_shape = tuple(state_shape)
-    self.state_spec = [
-        InputSpec(shape=state_shape),
-        InputSpec(shape=state_shape)
-    ]
     kernel_shape = self.kernel_size + (input_dim, self.filters * 4)
     self.kernel_shape = kernel_shape
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
 
-    self.kernel = self.add_weight(
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+    self.kernel = self.add_weight(shape=kernel_shape,
+                                  initializer=self.kernel_initializer,
+                                  name='kernel',
+                                  regularizer=self.kernel_regularizer,
+                                  constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
         shape=recurrent_kernel_shape,
         initializer=self.recurrent_initializer,
@@ -402,25 +610,24 @@ class ConvLSTM2D(ConvRecurrent2D):
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.filters * 4,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      self.bias = self.add_weight(shape=(self.filters * 4,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
       if self.unit_forget_bias:
         bias_value = np.zeros((self.filters * 4,))
-        bias_value[self.filters:self.filters * 2] = 1.
+        bias_value[self.filters: self.filters * 2] = 1.
         K.set_value(self.bias, bias_value)
     else:
       self.bias = None
 
     self.kernel_i = self.kernel[:, :, :, :self.filters]
     self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters]
-    self.kernel_f = self.kernel[:, :, :, self.filters:self.filters * 2]
+    self.kernel_f = self.kernel[:, :, :, self.filters: self.filters * 2]
     self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters:
                                                     self.filters * 2]
-    self.kernel_c = self.kernel[:, :, :, self.filters * 2:self.filters * 3]
+    self.kernel_c = self.kernel[:, :, :, self.filters * 2: self.filters * 3]
     self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2:
                                                     self.filters * 3]
     self.kernel_o = self.kernel[:, :, :, self.filters * 3:]
@@ -428,8 +635,8 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     if self.use_bias:
       self.bias_i = self.bias[:self.filters]
-      self.bias_f = self.bias[self.filters:self.filters * 2]
-      self.bias_c = self.bias[self.filters * 2:self.filters * 3]
+      self.bias_f = self.bias[self.filters: self.filters * 2]
+      self.bias_c = self.bias[self.filters * 2: self.filters * 3]
       self.bias_o = self.bias[self.filters * 3:]
     else:
       self.bias_i = None
@@ -438,166 +645,419 @@ class ConvLSTM2D(ConvRecurrent2D):
       self.bias_o = None
     self.built = True
 
-  def get_initial_state(self, inputs):
-    # (samples, timesteps, rows, cols, filters)
-    initial_state = array_ops.zeros_like(inputs)
-    # (samples, rows, cols, filters)
-    initial_state = math_ops.reduce_sum(initial_state, axis=1)
-    shape = list(self.kernel_shape)
-    shape[-1] = self.filters
-    initial_state = self.input_conv(
-        initial_state, K.zeros(tuple(shape)), padding=self.padding)
-
-    initial_states = [initial_state for _ in range(2)]
-    return initial_states
+  def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          K.ones_like(inputs),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          K.ones_like(states[1]),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
 
-  def reset_states(self):
-    if not self.stateful:
-      raise RuntimeError('Layer must be stateful.')
-    input_shape = self.input_spec[0].shape
+    # dropout matrices for input units
+    dp_mask = self._dropout_mask
+    # dropout matrices for recurrent units
+    rec_dp_mask = self._recurrent_dropout_mask
 
-    if not input_shape[0]:
-      raise ValueError('If a RNN is stateful, a complete '
-                       'input_shape must be provided '
-                       '(including batch size). '
-                       'Got input shape: ' + str(input_shape))
+    h_tm1 = states[0]  # previous memory state
+    c_tm1 = states[1]  # previous carry state
 
-    if self.return_state:
-      output_shape = tuple(self.compute_output_shape(input_shape)[0].as_list())
-    else:
-      output_shape = tuple(self.compute_output_shape(input_shape).as_list())
-    if self.return_sequences:
-      output_shape = (input_shape[0],) + output_shape[2:]
+    if 0 < self.dropout < 1.:
+      inputs_i = inputs * dp_mask[0]
+      inputs_f = inputs * dp_mask[1]
+      inputs_c = inputs * dp_mask[2]
+      inputs_o = inputs * dp_mask[3]
     else:
-      output_shape = (input_shape[0],) + output_shape[1:]
+      inputs_i = inputs
+      inputs_f = inputs
+      inputs_c = inputs
+      inputs_o = inputs
 
-    if hasattr(self, 'states'):
-      K.set_value(self.states[0],
-                  np.zeros(output_shape))
-      K.set_value(self.states[1],
-                  np.zeros(output_shape))
+    if 0 < self.recurrent_dropout < 1.:
+      h_tm1_i = h_tm1 * rec_dp_mask[0]
+      h_tm1_f = h_tm1 * rec_dp_mask[1]
+      h_tm1_c = h_tm1 * rec_dp_mask[2]
+      h_tm1_o = h_tm1 * rec_dp_mask[3]
     else:
-      self.states = [
-          K.zeros(output_shape),
-          K.zeros(output_shape)
-      ]
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
-      ones = array_ops.zeros_like(inputs)
-      ones = math_ops.reduce_sum(ones, axis=1)
-      ones += 1
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(dp_mask)
-    else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-
-    if 0 < self.recurrent_dropout < 1:
-      shape = list(self.kernel_shape)
-      shape[-1] = self.filters
-      ones = array_ops.zeros_like(inputs)
-      ones = math_ops.reduce_sum(ones, axis=1)
-      ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
-      ones += 1.
-
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
-
-      rec_dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(rec_dp_mask)
-    else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-    return constants
+      h_tm1_i = h_tm1
+      h_tm1_f = h_tm1
+      h_tm1_c = h_tm1
+      h_tm1_o = h_tm1
+
+    x_i = self.input_conv(inputs_i, self.kernel_i, self.bias_i,
+                          padding=self.padding)
+    x_f = self.input_conv(inputs_f, self.kernel_f, self.bias_f,
+                          padding=self.padding)
+    x_c = self.input_conv(inputs_c, self.kernel_c, self.bias_c,
+                          padding=self.padding)
+    x_o = self.input_conv(inputs_o, self.kernel_o, self.bias_o,
+                          padding=self.padding)
+    h_i = self.recurrent_conv(h_tm1_i,
+                              self.recurrent_kernel_i)
+    h_f = self.recurrent_conv(h_tm1_f,
+                              self.recurrent_kernel_f)
+    h_c = self.recurrent_conv(h_tm1_c,
+                              self.recurrent_kernel_c)
+    h_o = self.recurrent_conv(h_tm1_o,
+                              self.recurrent_kernel_o)
+
+    i = self.recurrent_activation(x_i + h_i)
+    f = self.recurrent_activation(x_f + h_f)
+    c = f * c_tm1 + i * self.activation(x_c + h_c)
+    o = self.recurrent_activation(x_o + h_o)
+    h = o * self.activation(c)
+
+    if 0 < self.dropout + self.recurrent_dropout:
+      if training is None:
+        h._uses_learning_phase = True
+
+    return h, [h, c]
 
   def input_conv(self, x, w, b=None, padding='valid'):
-    conv_out = K.conv2d(
-        x,
-        w,
-        strides=self.strides,
-        padding=padding,
-        data_format=self.data_format,
-        dilation_rate=self.dilation_rate)
+    conv_out = K.conv2d(x, w, strides=self.strides,
+                        padding=padding,
+                        data_format=self.data_format,
+                        dilation_rate=self.dilation_rate)
     if b is not None:
-      conv_out = K.bias_add(conv_out, b, data_format=self.data_format)
+      conv_out = K.bias_add(conv_out, b,
+                            data_format=self.data_format)
     return conv_out
 
   def recurrent_conv(self, x, w):
-    conv_out = K.conv2d(
-        x, w, strides=(1, 1), padding='same', data_format=self.data_format)
+    conv_out = K.conv2d(x, w, strides=(1, 1),
+                        padding='same',
+                        data_format=self.data_format)
     return conv_out
 
-  def step(self, inputs, states):
-    assert len(states) == 4
-    h_tm1 = states[0]
-    c_tm1 = states[1]
-    dp_mask = states[2]
-    rec_dp_mask = states[3]
-
-    x_i = self.input_conv(
-        inputs * dp_mask[0], self.kernel_i, self.bias_i, padding=self.padding)
-    x_f = self.input_conv(
-        inputs * dp_mask[1], self.kernel_f, self.bias_f, padding=self.padding)
-    x_c = self.input_conv(
-        inputs * dp_mask[2], self.kernel_c, self.bias_c, padding=self.padding)
-    x_o = self.input_conv(
-        inputs * dp_mask[3], self.kernel_o, self.bias_o, padding=self.padding)
-    h_i = self.recurrent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
-    h_f = self.recurrent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
-    h_c = self.recurrent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
-    h_o = self.recurrent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
+  def get_config(self):
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
+    base_config = super(ConvLSTM2DCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
-    i = self.recurrent_activation(x_i + h_i)
-    f = self.recurrent_activation(x_f + h_f)
-    c = f * c_tm1 + i * self.activation(x_c + h_c)
-    o = self.recurrent_activation(x_o + h_o)
-    h = o * self.activation(c)
-    return h, [h, c]
+@tf_export('keras.layers.ConvLSTM2D')
+class ConvLSTM2D(ConvRNN2D):
+  """Convolutional LSTM.
+
+  It is similar to an LSTM layer, but the input transformations
+  and recurrent transformations are both convolutional.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space
+        (i.e. the number output of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+        dimensions of the convolution window.
+    strides: An integer or tuple/list of n integers,
+        specifying the strides of the convolution.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, time, ..., channels)`
+        while `channels_first` corresponds to
+        inputs with shape `(batch, time, channels, ...)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be "channels_last".
+    dilation_rate: An integer or tuple/list of n integers, specifying
+        the dilation rate to use for dilated convolution.
+        Currently, specifying any `dilation_rate` value != 1 is
+        incompatible with specifying any `strides` value != 1.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+        for the recurrent step.
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+        used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+        weights matrix,
+        used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+        If True, add 1 to the bias of the forget gate at initialization.
+        Use in combination with `bias_initializer="zeros"`.
+        This is recommended in [Jozefowicz et al.]
+        (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to
+        the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to.
+    kernel_constraint: Constraint function applied to
+        the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    return_sequences: Boolean. Whether to return the last output
+        in the output sequence, or the full sequence.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the recurrent state.
+
+  Input shape:
+    - if data_format='channels_first'
+        5D tensor with shape:
+        `(samples,time, channels, rows, cols)`
+    - if data_format='channels_last'
+        5D tensor with shape:
+        `(samples,time, rows, cols, channels)`
+
+  Output shape:
+    - if `return_sequences`
+         - if data_format='channels_first'
+            5D tensor with shape:
+            `(samples, time, filters, output_row, output_col)`
+         - if data_format='channels_last'
+            5D tensor with shape:
+            `(samples, time, output_row, output_col, filters)`
+    - else
+        - if data_format ='channels_first'
+            4D tensor with shape:
+            `(samples, filters, output_row, output_col)`
+        - if data_format='channels_last'
+            4D tensor with shape:
+            `(samples, output_row, output_col, filters)`
+        where o_row and o_col depend on the shape of the filter and
+        the padding
+
+  Raises:
+    ValueError: in case of invalid constructor arguments.
+
+  References:
+    - [Convolutional LSTM Network: A Machine Learning Approach for
+    Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
+    The current implementation does not include the feedback loop on the
+    cells output.
+
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    cell = ConvLSTM2DCell(filters=filters,
+                          kernel_size=kernel_size,
+                          strides=strides,
+                          padding=padding,
+                          data_format=data_format,
+                          dilation_rate=dilation_rate,
+                          activation=activation,
+                          recurrent_activation=recurrent_activation,
+                          use_bias=use_bias,
+                          kernel_initializer=kernel_initializer,
+                          recurrent_initializer=recurrent_initializer,
+                          bias_initializer=bias_initializer,
+                          unit_forget_bias=unit_forget_bias,
+                          kernel_regularizer=kernel_regularizer,
+                          recurrent_regularizer=recurrent_regularizer,
+                          bias_regularizer=bias_regularizer,
+                          kernel_constraint=kernel_constraint,
+                          recurrent_constraint=recurrent_constraint,
+                          bias_constraint=bias_constraint,
+                          dropout=dropout,
+                          recurrent_dropout=recurrent_dropout)
+    super(ConvLSTM2D, self).__init__(cell,
+                                     return_sequences=return_sequences,
+                                     go_backwards=go_backwards,
+                                     stateful=stateful,
+                                     **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    return super(ConvLSTM2D, self).call(inputs,
+                                        mask=mask,
+                                        training=training,
+                                        initial_state=initial_state)
+
+  @property
+  def filters(self):
+    return self.cell.filters
+
+  @property
+  def kernel_size(self):
+    return self.cell.kernel_size
+
+  @property
+  def strides(self):
+    return self.cell.strides
+
+  @property
+  def padding(self):
+    return self.cell.padding
+
+  @property
+  def data_format(self):
+    return self.cell.data_format
+
+  @property
+  def dilation_rate(self):
+    return self.cell.dilation_rate
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def recurrent_activation(self):
+    return self.cell.recurrent_activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def unit_forget_bias(self):
+    return self.cell.unit_forget_bias
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
 
   def get_config(self):
-    config = {
-        'activation':
-            activations.serialize(self.activation),
-        'recurrent_activation':
-            activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'recurrent_initializer':
-            initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'unit_forget_bias':
-            self.unit_forget_bias,
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'recurrent_regularizer':
-            regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'recurrent_constraint':
-            constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
-    }
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'activity_regularizer': regularizers.serialize(
+                  self.activity_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
     base_config = super(ConvLSTM2D, self).get_config()
+    del base_config['cell']
     return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
index 60137bdd72..9e768b4e95 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
@@ -64,6 +64,7 @@ class ConvLSTMTest(test.TestCase):
           self.assertEqual(len(states), 2)
           model = keras.models.Model(x, states[0])
           state = model.predict(inputs)
+
           self.assertAllClose(
               keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index f4a134b96c..12b4267675 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -961,5 +961,43 @@ class CroppingTest(test.TestCase):
       keras.layers.Cropping3D(cropping=None)
 
 
+class DepthwiseConv2DTest(test.TestCase):
+
+  def _run_test(self, kwargs, arg, values):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    test_kwargs = copy.copy(kwargs)
+    for value in values:
+      test_kwargs[arg] = value
+      with self.test_session(use_gpu=True):
+        testing_utils.layer_test(
+            keras.layers.DepthwiseConv2D,
+            kwargs=test_kwargs,
+            input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_depthwise_conv2d(self):
+    kwargs = {'kernel_size': (3, 3)}
+
+    self._run_test(kwargs, 'padding', ['valid', 'same'])
+    self._run_test(kwargs, 'strides', [(2, 2)])
+    if test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs, 'data_format', ['channels_first'])
+    self._run_test(kwargs, 'depth_multiplier', [1, 2])
+
+    kwargs = {'kernel_size': 3,
+              'padding': 'valid',
+              'data_format': 'channels_first',
+              'activation': None,
+              'depthwise_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'activity_regularizer': 'l2',
+              'depthwise_constraint': 'unit_norm',
+              'strides': (2, 2),
+             }
+    self._run_test(kwargs, 'depth_multiplier', [1])
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 7f9f77c296..f53db987ff 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -251,7 +251,7 @@ class RNN(Layer):
           It is also possible for `cell` to be a list of RNN cell instances,
           in which cases the cells get stacked on after the other in the RNN,
           implementing an efficient stacked RNN.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -797,10 +797,10 @@ class RNN(Layer):
 
   @property
   def losses(self):
-    losses = []
+    layer_losses = super(RNN, self).losses
     if isinstance(self.cell, Layer):
-      losses += self.cell.losses
-    return losses + self._losses
+      return self.cell.losses + layer_losses
+    return layer_losses
 
   @property
   def updates(self):
@@ -1017,7 +1017,7 @@ class SimpleRNN(RNN):
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1237,6 +1237,9 @@ class GRUCell(Layer):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
   """
 
   def __init__(self,
@@ -1256,6 +1259,7 @@ class GRUCell(Layer):
                dropout=0.,
                recurrent_dropout=0.,
                implementation=1,
+               reset_after=False,
                **kwargs):
     super(GRUCell, self).__init__(**kwargs)
     self.units = units
@@ -1278,6 +1282,7 @@ class GRUCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
+    self.reset_after = reset_after
     self.state_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
@@ -1299,12 +1304,25 @@ class GRUCell(Layer):
         constraint=self.recurrent_constraint)
 
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.units * 3,),
-          name='bias',
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      if not self.reset_after:
+        bias_shape = (3 * self.units,)
+      else:
+        # separate biases for input and recurrent kernels
+        # Note: the shape is intentionally different from CuDNNGRU biases
+        # `(2 * 3 * self.units,)`, so that we can distinguish the classes
+        # when loading and converting saved weights.
+        bias_shape = (2, 3 * self.units)
+      self.bias = self.add_weight(shape=bias_shape,
+                                  name='bias',
+                                  initializer=self.bias_initializer,
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+      if not self.reset_after:
+        self.input_bias, self.recurrent_bias = self.bias, None
+      else:
+        self.input_bias = K.flatten(self.bias[0])
+        self.recurrent_bias = K.flatten(self.bias[1])
+
     else:
       self.bias = None
     self.built = True
@@ -1340,13 +1358,15 @@ class GRUCell(Layer):
         inputs_z = inputs
         inputs_r = inputs
         inputs_h = inputs
+
       x_z = K.dot(inputs_z, self.kernel[:, :self.units])
       x_r = K.dot(inputs_r, self.kernel[:, self.units:self.units * 2])
       x_h = K.dot(inputs_h, self.kernel[:, self.units * 2:])
+
       if self.use_bias:
-        x_z = K.bias_add(x_z, self.bias[:self.units])
-        x_r = K.bias_add(x_r, self.bias[self.units:self.units * 2])
-        x_h = K.bias_add(x_h, self.bias[self.units * 2:])
+        x_z = K.bias_add(x_z, self.input_bias[:self.units])
+        x_r = K.bias_add(x_r, self.input_bias[self.units: self.units * 2])
+        x_h = K.bias_add(x_h, self.input_bias[self.units * 2:])
 
       if 0. < self.recurrent_dropout < 1.:
         h_tm1_z = h_tm1 * rec_dp_mask[0]
@@ -1356,42 +1376,70 @@ class GRUCell(Layer):
         h_tm1_z = h_tm1
         h_tm1_r = h_tm1
         h_tm1_h = h_tm1
-      z = self.recurrent_activation(
-          x_z + K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units]))
-      r = self.recurrent_activation(
-          x_r + K.dot(h_tm1_r, self.recurrent_kernel[:, self.units:
-                                                     self.units * 2]))
-
-      hh = self.activation(x_h + K.dot(r * h_tm1_h,
-                                       self.recurrent_kernel[:,
-                                                             self.units * 2:]))
+
+      recurrent_z = K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units])
+      recurrent_r = K.dot(h_tm1_r,
+                          self.recurrent_kernel[:, self.units:self.units * 2])
+      if self.reset_after and self.use_bias:
+        recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias[:self.units])
+        recurrent_r = K.bias_add(recurrent_r,
+                                 self.recurrent_bias[self.units:
+                                                     self.units * 2])
+
+      z = self.recurrent_activation(x_z + recurrent_z)
+      r = self.recurrent_activation(x_r + recurrent_r)
+
+      # reset gate applied after/before matrix multiplication
+      if self.reset_after:
+        recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
+        if self.use_bias:
+          recurrent_h = K.bias_add(recurrent_h,
+                                   self.recurrent_bias[self.units * 2:])
+        recurrent_h = r * recurrent_h
+      else:
+        recurrent_h = K.dot(r * h_tm1_h,
+                            self.recurrent_kernel[:, self.units * 2:])
+
+      hh = self.activation(x_h + recurrent_h)
     else:
       if 0. < self.dropout < 1.:
         inputs *= dp_mask[0]
+
+      # inputs projected by all gate matrices at once
       matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
-        matrix_x = K.bias_add(matrix_x, self.bias)
+        # biases: bias_z_i, bias_r_i, bias_h_i
+        matrix_x = K.bias_add(matrix_x, self.input_bias)
+
+      x_z = matrix_x[:, :self.units]
+      x_r = matrix_x[:, self.units: 2 * self.units]
+      x_h = matrix_x[:, 2 * self.units:]
+
       if 0. < self.recurrent_dropout < 1.:
         h_tm1 *= rec_dp_mask[0]
       matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
 
-      x_z = matrix_x[:, :self.units]
-      x_r = matrix_x[:, self.units:2 * self.units]
       recurrent_z = matrix_inner[:, :self.units]
       recurrent_r = matrix_inner[:, self.units:2 * self.units]
 
       z = self.recurrent_activation(x_z + recurrent_z)
       r = self.recurrent_activation(x_r + recurrent_r)
 
-      x_h = matrix_x[:, 2 * self.units:]
-      recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
+      if self.reset_after:
+        recurrent_h = r * matrix_inner[:, 2 * self.units:]
+      else:
+        recurrent_h = K.dot(r * h_tm1,
+                            self.recurrent_kernel[:, 2 * self.units:])
+
       hh = self.activation(x_h + recurrent_h)
+    # previous and candidate state mixed by update gate
     h = z * h_tm1 + (1 - z) * hh
     if 0 < self.dropout + self.recurrent_dropout:
       if training is None and not context.executing_eagerly():
         # This would be harmless to set in eager mode, but eager tensors
         # disallow setting arbitrary attributes.
         h._uses_learning_phase = True
+
     return h, [h]
 
   def get_config(self):
@@ -1415,7 +1463,8 @@ class GRUCell(Layer):
         'bias_constraint': constraints.serialize(self.bias_constraint),
         'dropout': self.dropout,
         'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'implementation': self.implementation,
+        'reset_after': self.reset_after
     }
     base_config = super(GRUCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1423,9 +1472,16 @@ class GRUCell(Layer):
 
 @tf_export('keras.layers.GRU')
 class GRU(RNN):
-  """Gated Recurrent Unit - Cho et al.
+  """Gated Recurrent Unit - Cho et al. 2014.
 
-  2014.
+  There are two variants. The default one is based on 1406.1078v3 and
+  has reset gate applied to hidden state before matrix multiplication. The
+  other one is based on original 1406.1078v1 and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. Use `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
@@ -1469,7 +1525,7 @@ class GRU(RNN):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1485,6 +1541,9 @@ class GRU(RNN):
           Unrolling can speed-up a RNN,
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
 
   """
 
@@ -1511,6 +1570,7 @@ class GRU(RNN):
                go_backwards=False,
                stateful=False,
                unroll=False,
+               reset_after=False,
                **kwargs):
     if implementation == 0:
       logging.warning('`implementation=0` has been deprecated, '
@@ -1532,7 +1592,8 @@ class GRU(RNN):
         bias_constraint=bias_constraint,
         dropout=dropout,
         recurrent_dropout=recurrent_dropout,
-        implementation=implementation)
+        implementation=implementation,
+        reset_after=reset_after)
     super(GRU, self).__init__(
         cell,
         return_sequences=return_sequences,
@@ -1613,6 +1674,10 @@ class GRU(RNN):
   def implementation(self):
     return self.cell.implementation
 
+  @property
+  def reset_after(self):
+    return self.cell.reset_after
+
   def get_config(self):
     config = {
         'units':
@@ -1648,7 +1713,9 @@ class GRU(RNN):
         'recurrent_dropout':
             self.recurrent_dropout,
         'implementation':
-            self.implementation
+            self.implementation,
+        'reset_after':
+            self.reset_after
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1929,7 +1996,7 @@ class LSTMCell(Layer):
 
 @tf_export('keras.layers.LSTM')
 class LSTM(RNN):
-  """Long-Short Term Memory layer - Hochreiter 1997.
+  """Long Short-Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index fb743b617f..641b563a25 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -232,6 +232,7 @@ class RNNTest(test.TestCase):
       cell = RNNCellWithConstants(32)
       layer = keras.layers.RNN(cell)
       y = layer(x, constants=c)
+
       model = keras.models.Model([x, c], y)
       model.compile(optimizer='rmsprop', loss='mse')
       model.train_on_batch(
@@ -279,6 +280,20 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
+    with self.test_session():
+      # Test GRUCell reset_after property.
+      x = keras.Input((None, 5))
+      c = keras.Input((3,))
+      cells = [keras.layers.recurrent.GRUCell(32, reset_after=True)]
+      layer = keras.layers.recurrent.RNN(cells)
+      y = layer(x, constants=c)
+      model = keras.models.Model([x, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+          np.zeros((6, 32))
+      )
+
     with self.test_session():
       # Test stacked RNN serialization
       x_np = np.random.random((6, 5, 5))
@@ -541,6 +556,5 @@ class RNNTest(test.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 84ee5040dc..b45cafed31 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -49,6 +49,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
+from tensorflow.python.keras._impl.keras.layers.convolutional import DepthwiseConv2D
 
 # Image processing layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 68d446602e..fa26e07c85 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1566,6 +1566,16 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
 # pylint: enable=invalid-name
 
 
+def _constant_if_small(value, shape, dtype, name):
+  try:
+    if np.prod(shape) < 1000:
+      return constant(value, shape=shape, dtype=dtype, name=name)
+  except TypeError:
+    # Happens when shape is a Tensor, list with Tensor elements, etc.
+    pass
+  return None
+
+
 @tf_export("zeros")
 def zeros(shape, dtype=dtypes.float32, name=None):
   """Creates a tensor with all elements set to zero.
@@ -1596,8 +1606,15 @@ def zeros(shape, dtype=dtypes.float32, name=None):
       zero = ""
     else:
       zero = 0
+
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(zero, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
@@ -1729,6 +1746,12 @@ def ones(shape, dtype=dtypes.float32, name=None):
     one = True if dtype == dtypes.bool else 1
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(one, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index b460ce5b95..01d670ea2d 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1402,10 +1402,11 @@ def reduce_sum(input_tensor,
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
-    The reduced tensor.
+    The reduced tensor, of the same dtype as the input_tensor.
 
   @compatibility(numpy)
-  Equivalent to np.sum
+  Equivalent to np.sum appart the fact that numpy upcast uint8 and int32 to
+  int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 07e25e540c..508ba9bfee 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -72,7 +72,12 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # know the shape and dtype of the variable pointed to by a handle. Since
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
-    handle._handle_data = h._handle_data  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    if h._handle_data is None:
+      ops.set_shape_and_handle_data_for_outputs(h.op)
+    handle._handle_data = h._handle_data
+    # pylint: enable=protected-access
+
   # Clean up our reference cycles to avoid making the garbage collector run.
   # pylint: disable=protected-access
   # OrderedDict, constructed on Graph creation, makes a simple reference loop
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 7acb8eeb1a..5ee55301df 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -120,9 +120,9 @@ limitations under the License.
 
 }
 %typemap(out) (TFE_Context*) {
-  if ($1 == nullptr) {
-    SWIG_fail;
-  } else {
+  // When the TFE_Context* returned is a nullptr, we expect the status is not
+  // OK. This will raise an error (happens in another typemap).
+  if ($1 != nullptr) {
     $result = PyCapsule_New($1, nullptr, TFE_DeleteContextCapsule);
   }
 }
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index aae757b99a..094a9e886b 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -859,6 +859,7 @@ class ProfilerHook(session_run_hook.SessionRunHook):
           showing the sizes and lifetimes of tensors.
     """
     self._output_file = os.path.join(output_dir, "timeline-{}.json")
+    self._file_writer = SummaryWriterCache.get(output_dir)
     self._show_dataflow = show_dataflow
     self._show_memory = show_memory
     self._timer = SecondOrStepTimer(
@@ -889,6 +890,8 @@ class ProfilerHook(session_run_hook.SessionRunHook):
       self._save(global_step,
                  self._output_file.format(global_step),
                  run_values.run_metadata.step_stats)
+      self._file_writer.add_run_metadata(run_values.run_metadata,
+                                         "step_%d" % global_step)
 
     self._next_step = global_step + 1
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2547661e52..f39a5261a9 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -1274,6 +1274,19 @@ class ProfilerHookTest(test.TestCase):
         sess.run(self.train_op)  # Saved.
         self.assertEqual(3, self._count_timeline_files())
 
+  def test_run_metadata_saves_in_first_step(self):
+    writer_cache.FileWriterCache.clear()
+    fake_summary_writer.FakeSummaryWriter.install()
+    fake_writer = writer_cache.FileWriterCache.get(self.output_dir)
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.ProfilerHook(
+          save_secs=2, output_dir=self.output_dir)
+      with monitored_session.SingularMonitoredSession(hooks=[hook]) as sess:
+        sess.run(self.train_op)  # Saved.
+        self.assertEqual(
+            list(fake_writer._added_run_metadata.keys()), ['step_1'])
+    fake_summary_writer.FakeSummaryWriter.uninstall()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 16e200d64d..c6b2dcdf98 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -1226,13 +1226,16 @@ _default_tower_mode = _DefaultTowerThreadMode()
 # So here we catch any attempts to deserialize variables
 # when using distribution strategies.
 # pylint: disable=protected-access
+_original_from_proto = resource_variable_ops._from_proto_fn
+
+
 def _from_proto_fn(v, import_scope=None):
   if has_distribution_strategy():
     raise NotImplementedError(
         "Deserialization of variables is not yet supported when using"
         "distributed strategies.")
   else:
-    resource_variable_ops._from_proto_fn(v, import_scope=import_scope)
+    return _original_from_proto(v, import_scope=import_scope)
 
 resource_variable_ops._from_proto_fn = _from_proto_fn
 # pylint: enable=protected-access
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index c563f8f931..1c550dbb13 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -2076,12 +2076,6 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2114,12 +2108,6 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2188,12 +2176,6 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index f408c06f46..3fd9275289 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -297,6 +297,9 @@ CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 namespace {
 
+// Forward declaration.
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
+
 cudnnHandle_t ToHandle(void* opaque_handle) {
   return static_cast<cudnnHandle_t>(opaque_handle);
 }
@@ -381,6 +384,23 @@ port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
   }
   return port::Status::OK();
 }
+
+cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
+  if (algorithm.is_default()) {
+    return CUDNN_RNN_ALGO_STANDARD;
+  } else {
+    cudnnRNNAlgo_t algo = static_cast<cudnnRNNAlgo_t>(algorithm.algo_id());
+    switch (algo) {
+      case CUDNN_RNN_ALGO_STANDARD:
+      case CUDNN_RNN_ALGO_PERSIST_STATIC:
+      case CUDNN_RNN_ALGO_PERSIST_DYNAMIC:
+        return algo;
+      default:
+        LOG(FATAL) << "Unsupported Cudnn RNN algorithm: "
+                   << algorithm.algo_id();
+    }
+  }
+}
 #endif
 
 port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
@@ -1124,6 +1144,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
                      cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
+                     cudnnDataType_t compute_type,
+                     const dnn::AlgorithmConfig& algorithm_config,
                      float dropout, uint64 seed,
                      ScratchAllocator* state_allocator)
       : parent_(parent),
@@ -1134,7 +1156,9 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
-        data_type_(data_type) {
+        data_type_(data_type),
+        compute_type_(compute_type),
+        algorithm_config_(algorithm_config) {
     // Create the dropout handle.
     cudnn_dropout_desc_.reset(new CudnnDropoutDescriptor(
         parent, cudnn_handle, dropout, seed, state_allocator));
@@ -1148,18 +1172,20 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
 #if CUDNN_VERSION >= 6000
     // TODO: allow the user to choose an algorithm.
-    cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
+    cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config_.algorithm());
     status = wrap::cudnnSetRNNDescriptor_v6(
         parent, cudnn_handle, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, rnn_algo /*algo*/, data_type /*dataType*/);
+        rnn_mode /*mode*/, rnn_algo /*algo*/, compute_type /*dataType*/);
 #else
+    CHECK(algorithm_config_.is_default())
+        << "Non-default algorithm not supported for CUDA version < 6.0";
     status = wrap::cudnnSetRNNDescriptor(
         parent, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, data_type /*dataType*/);
+        rnn_mode /*mode*/, compute_type /*dataType*/);
 #endif
     CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
 
@@ -1170,9 +1196,7 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
       SetFailure(cudnn_params_desc_->Status());
       return;
     }
-    if (data_type == CUDNN_DATA_HALF) {
-      set_use_tensor_op_math(true);
-    }
+    set_use_tensor_op_math(algorithm_config_.algorithm().tensor_ops_enabled());
   }
   ~CudnnRnnDescriptor() override {
     if (rnn_desc_) {
@@ -1206,6 +1230,10 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   cudnnDirectionMode_t direction_mode() const { return direction_mode_; }
   cudnnRNNMode_t rnn_mode() const { return rnn_mode_; }
   cudnnDataType_t data_type() const { return data_type_; }
+  cudnnDataType_t compute_type() const { return compute_type_; }
+  const dnn::AlgorithmConfig& algorithm_config() const {
+    return algorithm_config_;
+  }
   int64 ParamsSizeInBytes() const override {
     return cudnn_params_desc_->params_size_in_bytes();
   }
@@ -1236,6 +1264,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
   cudnnDataType_t data_type_;
+  cudnnDataType_t compute_type_;
+  dnn::AlgorithmConfig algorithm_config_;
   std::unique_ptr<CudnnDropoutDescriptor> cudnn_dropout_desc_;
   std::unique_ptr<CudnnRnnParamsDescriptor> cudnn_params_desc_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
@@ -1608,7 +1638,8 @@ bool CudnnSupport::DoRnnForwardImpl(
     const CudnnRnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<T>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1665,9 +1696,24 @@ bool CudnnSupport::DoRnnForwardImpl(
     }
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the forward call
+  cudnnStatus_t status;
   if (!is_training) {
-    cudnnStatus_t status = wrap::cudnnRNNForwardInference(
+    status = wrap::cudnnRNNForwardInference(
         parent_, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
@@ -1679,13 +1725,8 @@ bool CudnnSupport::DoRnnForwardImpl(
         output_c_desc.handle() /*cyDesc*/, output_c_data->opaque() /*cy*/,
         workspace.opaque() /*workspace*/,
         workspace.size() /*workSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardInference: "
-                 << ToString(status);
-      return false;
-    }
   } else {
-    cudnnStatus_t status = wrap::cudnnRNNForwardTraining(
+    status = wrap::cudnnRNNForwardTraining(
         parent_, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
@@ -1699,8 +1740,24 @@ bool CudnnSupport::DoRnnForwardImpl(
         workspace.size() /*workSpaceSizeInBytes*/,
         reserve_space.opaque() /*reserveSpace*/,
         reserve_space.size() /*reserveSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardTraining"
+  }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    if (status == CUDNN_STATUS_SUCCESS) {
+      auto algo_desc = rnn_desc.algorithm_config().algorithm();
+      output_profile_result->set_algorithm(algo_desc);
+      output_profile_result->set_elapsed_time_in_ms(
+          timer->GetElapsedMilliseconds());
+    }
+  }
+  if (status != CUDNN_STATUS_SUCCESS) {
+    // Silently return when we are profiling.
+    if (!is_profiling) {
+      LOG(ERROR) << "Failed to call "
+                 << (is_training ? "cudnnRNNForwardTraining "
+                                 : "cudnnRNNForwardInference ")
                  << ToString(status);
       return false;
     }
@@ -1732,7 +1789,8 @@ bool CudnnSupport::DoRnnBackwardImpl(
     DeviceMemory<T>* input_c_backprop_data,
     DeviceMemory<T>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1761,6 +1819,20 @@ bool CudnnSupport::DoRnnBackwardImpl(
     return false;
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the backward data call
   cudnnStatus_t status = wrap::cudnnRNNBackwardData(
       parent_, ToHandle(dnn_handle_) /*handle*/, rnn_desc.handle() /*rnnDesc*/,
@@ -1781,7 +1853,11 @@ bool CudnnSupport::DoRnnBackwardImpl(
       workspace.size() /*workSpaceSizeInBytes*/,
       reserve_space_data->opaque() /*reserveSpace*/,
       reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+
   if (status != CUDNN_STATUS_SUCCESS) {
+    if (is_profiling) {
+      timer->Stop(AsCUDAStream(stream));
+    }
     LOG(ERROR) << "Failed to call cudnnRNNBackwardData: " << ToString(status);
     return false;
   }
@@ -1803,11 +1879,23 @@ bool CudnnSupport::DoRnnBackwardImpl(
         reserve_space_data->opaque() /*reserveSpace*/,
         reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
     if (status != CUDNN_STATUS_SUCCESS) {
+      if (is_profiling) {
+        timer->Stop(AsCUDAStream(stream));
+      }
       LOG(ERROR) << "Failed to call cudnnRNNBackwardWeights: "
                  << ToString(status);
       return false;
     }
   }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+  }
 
   return true;
 }
@@ -1819,15 +1907,17 @@ CudnnSupport::createRnnDescriptor(int num_layers, int hidden_size,
                                   int input_size, dnn::RnnInputMode input_mode,
                                   dnn::RnnDirectionMode direction_mode,
                                   dnn::RnnMode rnn_mode,
-                                  dnn::DataType data_type, float dropout,
-                                  uint64 seed,
+                                  dnn::DataType data_type,
+                                  const dnn::AlgorithmConfig& algorithm_config,
+                                  float dropout, uint64 seed,
                                   ScratchAllocator* state_allocator) {
 #if CUDNN_VERSION >= 5000
   mutex_lock lock{dnn_handle_mutex_};
   std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
       parent_, ToHandle(dnn_handle_), num_layers, hidden_size, input_size,
       ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
-      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type), dropout, seed,
+      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type),
+      GetRnnComputeType(data_type), algorithm_config, dropout, seed,
       state_allocator));
   if (!rnn_desc->ok()) {
     return rnn_desc->Status();
@@ -1904,7 +1994,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1925,7 +2016,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1946,7 +2038,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<float>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1967,7 +2060,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1989,7 +2083,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<double>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2010,7 +2105,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2039,7 +2135,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<Eigen::half>* input_c_backprop_data,
     DeviceMemory<Eigen::half>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2063,7 +2160,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2091,7 +2188,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<float>* input_c_backprop_data,
     DeviceMemory<float>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2115,7 +2213,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2144,7 +2242,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<double>* input_c_backprop_data,
     DeviceMemory<double>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2168,7 +2267,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2363,6 +2462,33 @@ cudnnDataType_t GetConvComputeType<double>() {
   return CUDNN_DATA_DOUBLE;
 }
 
+// A helper struct to decide whether to use FP32 as the internal compute type
+// for rnn when the input data type is FP16. By default it is turned on,
+// users can explicitly disable them (choose to use FP16 as the internal compute
+// type) through an env-var "TF_FP16_RNN_USE_FP32_COMPUTE=0".
+struct RnnDoFP32ComputationFP16Input {
+  static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
+  static constexpr bool kDefaultFlag = true;
+};
+
+// A helper function to return the internal compute type for
+// RNNs in cudnn.
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+      return CUDNN_DATA_FLOAT;
+    case dnn::DataType::kDouble:
+      return CUDNN_DATA_DOUBLE;
+    case dnn::DataType::kHalf:
+      if (CudnnEnvVar<RnnDoFP32ComputationFP16Input>::IsEnabled()) {
+        return CUDNN_DATA_FLOAT;
+      } else {
+        return CUDNN_DATA_HALF;
+      }
+    default:
+      LOG(FATAL) << "Invalid RNN data type: " << static_cast<int>(data_type);
+  }
+}
 }  // namespace
 
 template <class T>
@@ -2742,6 +2868,30 @@ bool CudnnSupport::GetConvolveAlgorithms(
   return true;
 }
 
+bool CudnnSupport::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
+  // clang-format off
+#if CUDNN_VERSION >= 6000
+    CUDNN_RNN_ALGO_STANDARD,
+    CUDNN_RNN_ALGO_PERSIST_STATIC,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
+#endif
+    // clang-format on
+  };
+
+  out_algorithms->clear();
+  for (auto i : algo_types) {
+    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
+#if CUDNN_VERSION >= 7100
+    if (RnnTensorOpMathEnabled()) {
+      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
+    }
+#endif
+  }
+  return true;
+}
+
 bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 48d56f71e3..e40ba9b012 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -50,8 +50,9 @@ class CudnnSupport : public dnn::DnnSupport {
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator* state_allocator) override;
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
   createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
@@ -77,7 +78,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -94,7 +96,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<float>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -111,7 +114,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<double>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -135,7 +139,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* input_c_backprop_data,
                      DeviceMemory<Eigen::half>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -159,7 +164,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<float>* input_c_backprop_data,
                      DeviceMemory<float>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -183,12 +189,16 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<double>* input_c_backprop_data,
                      DeviceMemory<double>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool GetConvolveAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
 
+  bool GetRnnAlgorithms(
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
@@ -746,7 +756,8 @@ class CudnnSupport : public dnn::DnnSupport {
                         const CudnnRnnStateTensorDescriptor& output_c_desc,
                         DeviceMemory<T>* output_c_data, bool is_training,
                         ScratchAllocator* reserve_space_allocator,
-                        ScratchAllocator* workspace_allocator);
+                        ScratchAllocator* workspace_allocator,
+                        dnn::ProfileResult* output_profile_result);
 
   template <class T>
   bool DoRnnBackwardImpl(Stream* stream, const CudnnRnnDescriptor& rnn_desc,
@@ -771,7 +782,8 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemory<T>* input_c_backprop_data,
                          DeviceMemory<T>* params_backprop_data,
                          DeviceMemory<uint8>* reserve_space_data,
-                         ScratchAllocator* workspace_allocator);
+                         ScratchAllocator* workspace_allocator,
+                         dnn::ProfileResult* output_profile_result);
 
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 4a2714dc1f..2abc55ec94 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -77,6 +77,13 @@ class CUDATimer : public internal::TimerInterface {
                          // executing in a stream.
 };
 
+struct TimerDeleter {
+  void operator()(CUDATimer *t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
 }  // namespace cuda
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 44144a0613..0a3c4bcf50 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -28,6 +28,10 @@ bool DnnSupport::GetConvolveAlgorithms(
   return false;
 }
 
+bool DnnSupport::GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms) {
+  return false;
+}
+
 bool DnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index b41536e638..43cfd313c1 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1195,6 +1195,9 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  // Returns a list of supported rnn algorithms.
+  virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
+
   // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
   // coefficient_scales specifies the scaling of each column of coefficients:
   // original float coefficient[row * num_columns + column] =
@@ -2001,6 +2004,7 @@ class DnnSupport {
                       dnn::RnnInputMode input_mode,
                       dnn::RnnDirectionMode direction_mode,
                       dnn::RnnMode rnn_mode, dnn::DataType data_type,
+                      const dnn::AlgorithmConfig& algorithm_config,
                       float dropout, uint64 seed,
                       ScratchAllocator* state_allocator) {
     return port::Status{port::error::UNIMPLEMENTED,
@@ -2076,7 +2080,8 @@ class DnnSupport {
                             DeviceMemory<Eigen::half>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2096,7 +2101,8 @@ class DnnSupport {
                             DeviceMemory<float>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2116,7 +2122,8 @@ class DnnSupport {
                             DeviceMemory<double>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
   // Enqueue a backward operation of the RNN model onto the stream.
@@ -2183,7 +2190,8 @@ class DnnSupport {
       DeviceMemory<Eigen::half>* input_c_backprop_data,
       DeviceMemory<Eigen::half>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2210,7 +2218,8 @@ class DnnSupport {
       DeviceMemory<float>* input_c_backprop_data,
       DeviceMemory<float>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2237,7 +2246,8 @@ class DnnSupport {
       DeviceMemory<double>* input_c_backprop_data,
       DeviceMemory<double>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 1e3afde268..fe498507a8 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -4795,7 +4795,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<Eigen::half> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4803,7 +4804,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4827,7 +4829,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<float> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4835,7 +4838,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4860,7 +4864,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<double> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4868,7 +4873,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4900,7 +4906,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<Eigen::half> *input_c_backprop_data,
     DeviceMemory<Eigen::half> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4910,7 +4917,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4941,7 +4949,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<float> *input_c_backprop_data,
     DeviceMemory<float> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4951,7 +4960,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4983,7 +4993,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<double> *input_c_backprop_data,
     DeviceMemory<double> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4993,7 +5004,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index d7d1131569..4af426001f 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -1802,7 +1802,8 @@ class Stream {
                          DeviceMemory<Eigen::half> *output_c_data,
                          bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1819,7 +1820,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<float> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1836,7 +1838,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<double> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   // Enqueue a backward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnBackward for more details.
@@ -1863,7 +1866,8 @@ class Stream {
       DeviceMemory<Eigen::half> *input_c_backprop_data,
       DeviceMemory<Eigen::half> *params_backprop_data,
       DeviceMemory<uint8> *reserve_space_data,
-      ScratchAllocator *workspace_allocator);
+      ScratchAllocator *workspace_allocator,
+      dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1887,7 +1891,8 @@ class Stream {
                           DeviceMemory<float> *input_c_backprop_data,
                           DeviceMemory<float> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1911,7 +1916,8 @@ class Stream {
                           DeviceMemory<double> *input_c_backprop_data,
                           DeviceMemory<double> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   // Enqueue onto the stream a operation that transforms a tensor.
   // See DnnSupport::DoTransformTensor for more details.
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index afca1c2e59..f55fa68402 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -305,6 +305,15 @@ bool StreamExecutor::GetConvolveAlgorithms(
                                             cc_minor, out_algorithms);
 }
 
+bool StreamExecutor::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc> *out_algorithms) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return false;
+  }
+  return dnn_support->GetRnnAlgorithms(out_algorithms);
+}
+
 bool StreamExecutor::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused,
     std::vector<dnn::AlgorithmDesc> *out_algorithms) {
@@ -344,7 +353,8 @@ port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
 StreamExecutor::createRnnDescriptor(
     int num_layers, int hidden_size, int input_size,
     dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-    dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout, uint64 seed,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
     ScratchAllocator *state_allocator) {
   dnn::DnnSupport *dnn_support = AsDnn();
   if (!dnn_support) {
@@ -353,7 +363,7 @@ StreamExecutor::createRnnDescriptor(
   }
   return dnn_support->createRnnDescriptor(
       num_layers, hidden_size, input_size, input_mode, direction_mode, rnn_mode,
-      data_type, dropout, seed, state_allocator);
+      data_type, algorithm_config, dropout, seed, state_allocator);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index a2a77218cb..69d0374d73 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -349,10 +349,14 @@ class StreamExecutor {
   // platform that underlies this interface.
   bool SupportsDnn() const;
 
-  // Get the list of supported algorithms for the forward convolution opeartion.
+  // Returns the list of supported algorithms for the forward convolution
+  // operation.
   bool GetConvolveAlgorithms(bool with_winograd_nonfused,
                              std::vector<dnn::AlgorithmDesc> *out_algorithms);
 
+  // Returns the list of supported algorithms for rnn operation.
+  bool GetRnnAlgorithms(std::vector<dnn::AlgorithmDesc> *out_algorithms);
+
   // Get the list of supported algorithms for the backward convolution on data.
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused,
@@ -372,8 +376,9 @@ class StreamExecutor {
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator *state_allocator);
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator *state_allocator);
 
   // Create a RNN sequence descriptor that specifies either the input or output
   // sequence. The caller retains the ownership of the returned descriptor.
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 6a7da1aef8..a535f18170 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -1,20 +1,52 @@
 path: "tensorflow.keras.layers.ConvLSTM2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvLSTM2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRecurrent2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "data_format"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dilation_rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "filters"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
@@ -35,6 +67,22 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_size"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -67,10 +115,42 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "padding"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_activation"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strides"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -79,10 +159,18 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "unit_forget_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -143,10 +231,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -187,28 +271,12 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_conv"
-    argspec: "args=[\'self\', \'x\', \'w\', \'b\', \'padding\'], varargs=None, keywords=None, defaults=[\'None\', \'valid\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "recurrent_conv"
-    argspec: "args=[\'self\', \'x\', \'w\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
new file mode 100644
index 0000000000..b38716aa2c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -0,0 +1,187 @@
+path: "tensorflow.keras.layers.DepthwiseConv2D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'kernel_size\', \'strides\', \'padding\', \'depth_multiplier\', \'data_format\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'1\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 1fd3febad2..4274b8d425 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -91,7 +91,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index f5f41d879d..8d9f06083c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -122,6 +122,10 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "reset_after"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
@@ -160,7 +164,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index 088c8e88e2..affc9bd09b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -116,6 +116,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DepthwiseConv2D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Dot"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 438c5d52f6..5e9ae497e1 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,6 +42,14 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
+skip_test=0
+
+for ARG in "$@"; do
+  if [[ "$ARG" == --skip_test ]]; then
+    skip_test=1
+  fi
+done
+
 run_configure_for_cpu_build
 
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
@@ -49,6 +57,10 @@ run_configure_for_cpu_build
 BUILD_OPTS="--define=override_eigen_strong_inline=true"
 bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
+if [[ "$skip_test" == 1 ]]; then
+  exit 0
+fi
+
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
 create_python_test_dir "${PY_TEST_DIR}"
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 26523bb020..018a395063 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -221,11 +221,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "png_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
-          "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
+          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
+          "https://github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
       ],
-      sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
-      strip_prefix = "libpng-1.2.53",
+      sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
+      strip_prefix = "libpng-1.6.34",
       build_file = clean_dep("//third_party:png.BUILD"),
   )
 
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index 6a7ad719aa..76ab32d69c 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -9,15 +9,20 @@ cc_library(
     name = "png",
     srcs = [
         "png.c",
+        "pngdebug.h",
         "pngerror.c",
         "pngget.c",
+        "pnginfo.h",
+        "pnglibconf.h",
         "pngmem.c",
         "pngpread.c",
+        "pngpriv.h",
         "pngread.c",
         "pngrio.c",
         "pngrtran.c",
         "pngrutil.c",
         "pngset.c",
+        "pngstruct.h",
         "pngtrans.c",
         "pngwio.c",
         "pngwrite.c",
@@ -33,3 +38,10 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = ["@zlib_archive//:zlib"],
 )
+
+genrule(
+    name = "snappy_stubs_public_h",
+    srcs = ["scripts/pnglibconf.h.prebuilt"],
+    outs = ["pnglibconf.h"],
+    cmd = "sed -e 's/PNG_ZLIB_VERNUM 0/PNG_ZLIB_VERNUM 0x12b0/' $< >$@",
+)
-- 
GitLab


From cb43bb37bfd5468efd92b03848edf6f3f06bfd5b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 14:32:28 -0700
Subject: [PATCH 0389/1262] Add RNNClassifier

PiperOrigin-RevId: 191941174
---
 tensorflow/contrib/estimator/BUILD            |   55 +
 tensorflow/contrib/estimator/__init__.py      |    1 +
 .../contrib/estimator/python/estimator/rnn.py |  481 +++++++
 .../estimator/python/estimator/rnn_test.py    | 1131 +++++++++++++++++
 tensorflow/python/ops/rnn_cell_impl.py        |   11 +-
 5 files changed, 1678 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/estimator/python/estimator/rnn.py
 create mode 100644 tensorflow/contrib/estimator/python/estimator/rnn_test.py

diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index bec0329ebb..9f4cd44afb 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -23,6 +23,7 @@ py_library(
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
+        ":rnn",
         "//tensorflow/python:util",
     ],
 )
@@ -412,3 +413,57 @@ cuda_py_test(
         "notap",
     ],
 )
+
+py_library(
+    name = "rnn",
+    srcs = ["python/estimator/rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":extenders",
+        "//tensorflow/contrib/feature_column:feature_column_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:head",
+        "//tensorflow/python/estimator:optimizers",
+        "//tensorflow/python/feature_column",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "rnn_test",
+    size = "medium",
+    srcs = ["python/estimator/rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":rnn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index d2fc2c4bfa..9a87fa915d 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -52,6 +52,7 @@ _allowed_symbols = [
     'linear_logit_fn_builder',
     'replicate_model_fn',
     'TowerOptimizer',
+    'RNNClassifier',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
new file mode 100644
index 0000000000..b475c12f5a
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -0,0 +1,481 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent Neural Network estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import training_util
+
+
+# The defaults are historical artifacts of the initial implementation, but seem
+# reasonable choices.
+_DEFAULT_LEARNING_RATE = 0.05
+_DEFAULT_CLIP_NORM = 5.0
+
+_CELL_TYPES = {'basic_rnn': rnn_cell.BasicRNNCell,
+               'lstm': rnn_cell.BasicLSTMCell,
+               'gru': rnn_cell.GRUCell}
+
+# Indicates no value was provided by the user to a kwarg.
+USE_DEFAULT = object()
+
+
+def _single_rnn_cell(num_units, cell_type):
+  cell_type = _CELL_TYPES.get(cell_type, cell_type)
+  if not cell_type or not issubclass(cell_type, rnn_cell.RNNCell):
+    raise ValueError('Supported cell types are {}; got {}'.format(
+        list(_CELL_TYPES.keys()), cell_type))
+  return cell_type(num_units=num_units)
+
+
+def _make_rnn_cell_fn(num_units, cell_type='basic_rnn'):
+  """Convenience function to create `rnn_cell_fn` for canned RNN Estimators.
+
+  Args:
+    num_units: Iterable of integer number of hidden units per RNN layer.
+    cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+      the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+      `'gru'`.
+
+  Returns:
+    A function that takes a single argument, an instance of
+    `tf.estimator.ModeKeys`, and returns an instance derived from
+    `tf.nn.rnn_cell.RNNCell`.
+
+  Raises:
+    ValueError: If cell_type is not supported.
+  """
+  def rnn_cell_fn(mode):
+    # Unused. Part of the rnn_cell_fn interface since user specified functions
+    # may need different behavior across modes (e.g. dropout).
+    del mode
+    cells = [_single_rnn_cell(n, cell_type) for n in num_units]
+    if len(cells) == 1:
+      return cells[0]
+    return rnn_cell.MultiRNNCell(cells)
+  return rnn_cell_fn
+
+
+def _concatenate_context_input(sequence_input, context_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
+def _select_last_activations(activations, sequence_lengths):
+  """Selects the nth set of activations for each n in `sequence_length`.
+
+  Returns a `Tensor` of shape `[batch_size, k]`. If `sequence_length` is not
+  `None`, then `output[i, :] = activations[i, sequence_length[i] - 1, :]`. If
+  `sequence_length` is `None`, then `output[i, :] = activations[i, -1, :]`.
+
+  Args:
+    activations: A `Tensor` with shape `[batch_size, padded_length, k]`.
+    sequence_lengths: A `Tensor` with shape `[batch_size]` or `None`.
+  Returns:
+    A `Tensor` of shape `[batch_size, k]`.
+  """
+  with ops.name_scope(
+      'select_last_activations', values=[activations, sequence_lengths]):
+    activations_shape = array_ops.shape(activations)
+    batch_size = activations_shape[0]
+    padded_length = activations_shape[1]
+    output_units = activations_shape[2]
+    if sequence_lengths is None:
+      sequence_lengths = padded_length
+    start_indices = math_ops.to_int64(
+        math_ops.range(batch_size) * padded_length)
+    last_indices = start_indices + sequence_lengths - 1
+    reshaped_activations = array_ops.reshape(
+        activations, [batch_size * padded_length, output_units])
+
+    last_activations = array_ops.gather(reshaped_activations, last_indices)
+    last_activations.set_shape([activations.shape[0], activations.shape[2]])
+    return last_activations
+
+
+def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
+                          context_feature_columns, input_layer_partitioner):
+  """Function builder for a rnn logit_fn.
+
+  Args:
+    output_units: An int indicating the dimension of the logit layer.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent sequential input.
+    context_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent contextual input.
+    input_layer_partitioner: Partitioner for input layer.
+
+  Returns:
+    A logit_fn (see below).
+
+  Raises:
+    ValueError: If output_units is not an int.
+  """
+  if not isinstance(output_units, int):
+    raise ValueError('output_units must be an int.  Given type: {}'.format(
+        type(output_units)))
+
+  def rnn_logit_fn(features, mode):
+    """Recurrent Neural Network logit_fn.
+
+    Args:
+      features: This is the first item returned from the `input_fn`
+                passed to `train`, `evaluate`, and `predict`. This should be a
+                single `Tensor` or `dict` of same.
+      mode: Optional. Specifies if this training, evaluation or prediction. See
+            `ModeKeys`.
+
+    Returns:
+      A `Tensor` representing the logits.
+    """
+    with variable_scope.variable_scope(
+        'sequence_input_layer',
+        values=tuple(six.itervalues(features)),
+        partitioner=input_layer_partitioner):
+      sequence_input, sequence_length = seq_fc.sequence_input_layer(
+          features=features, feature_columns=sequence_feature_columns)
+      summary.histogram('sequence_length', sequence_length)
+
+      if context_feature_columns:
+        context_input = feature_column_lib.input_layer(
+            features=features,
+            feature_columns=context_feature_columns)
+        sequence_input = _concatenate_context_input(sequence_input,
+                                                    context_input)
+
+    cell = rnn_cell_fn(mode)
+    # Ignore output state.
+    rnn_outputs, _ = rnn.dynamic_rnn(
+        cell=cell,
+        inputs=sequence_input,
+        dtype=dtypes.float32,
+        time_major=False)
+    last_activations = _select_last_activations(rnn_outputs, sequence_length)
+
+    with variable_scope.variable_scope('logits', values=(rnn_outputs,)):
+      logits = core_layers.dense(
+          last_activations,
+          units=output_units,
+          activation=None,
+          kernel_initializer=init_ops.glorot_uniform_initializer())
+    return logits
+
+  return rnn_logit_fn
+
+
+def _rnn_model_fn(features,
+                  labels,
+                  mode,
+                  head,
+                  rnn_cell_fn,
+                  sequence_feature_columns,
+                  context_feature_columns,
+                  optimizer='Adagrad',
+                  input_layer_partitioner=None,
+                  config=None):
+  """Recurrent Neural Net model_fn.
+
+  Args:
+    features: dict of `Tensor` and `SparseTensor` objects returned from
+      `input_fn`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] with labels.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: Iterable containing `FeatureColumn`s that
+      represent sequential model inputs.
+    context_feature_columns: Iterable containing `FeatureColumn`s that
+      represent model inputs not associated with a specific timestep.
+    optimizer: String, `tf.Optimizer` object, or callable that creates the
+      optimizer to use for training. If not specified, will use the Adagrad
+      optimizer with a default learning rate of 0.05 and gradient clip norm of
+      5.0.
+    input_layer_partitioner: Partitioner for input layer. Defaults
+      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Returns:
+    An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: If mode or optimizer is invalid, or features has the wrong type.
+  """
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
+
+  # If user does not provide an optimizer instance, use the optimizer specified
+  # by the string with default learning rate and gradient clipping.
+  if not isinstance(optimizer, optimizer_lib.Optimizer):
+    optimizer = optimizers.get_optimizer_instance(
+        optimizer, learning_rate=_DEFAULT_LEARNING_RATE)
+    optimizer = extenders.clip_gradients_by_norm(optimizer, _DEFAULT_CLIP_NORM)
+
+  num_ps_replicas = config.num_ps_replicas if config else 0
+  partitioner = partitioned_variables.min_max_variable_partitioner(
+      max_partitions=num_ps_replicas)
+  with variable_scope.variable_scope(
+      'rnn',
+      values=tuple(six.itervalues(features)),
+      partitioner=partitioner):
+    input_layer_partitioner = input_layer_partitioner or (
+        partitioned_variables.min_max_variable_partitioner(
+            max_partitions=num_ps_replicas,
+            min_slice_size=64 << 20))
+
+    logit_fn = _rnn_logit_fn_builder(
+        output_units=head.logits_dimension,
+        rnn_cell_fn=rnn_cell_fn,
+        sequence_feature_columns=sequence_feature_columns,
+        context_feature_columns=context_feature_columns,
+        input_layer_partitioner=input_layer_partitioner)
+    logits = logit_fn(features=features, mode=mode)
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizer.minimize(
+          loss,
+          global_step=training_util.get_global_step())
+
+    return head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+
+class RNNClassifier(estimator.Estimator):
+  """A classifier for TensorFlow RNN models.
+
+  Trains a recurrent neural network model to classify instances into one of
+  multiple classes.
+
+  Example:
+
+  ```python
+  token_sequence = sequence_categorical_column_with_hash_bucket(...)
+  token_emb = embedding_column(categorical_column=token_sequence, ...)
+
+  estimator = RNNClassifier(
+      num_units=[32, 16], cell_type='lstm',
+      sequence_feature_columns=[token_emb])
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train, steps=100)
+
+  def input_fn_eval: # returns x, y
+    pass
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    pass
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+    `key=weight_column` whose value is a `Tensor`.
+  * for each `column` in `sequence_feature_columns`:
+    - a feature with `key=column.name` whose `value` is a `SparseTensor`.
+  * for each `column` in `context_feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss is calculated by using softmax cross entropy.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               sequence_feature_columns,
+               context_feature_columns=None,
+               num_units=None,
+               cell_type=USE_DEFAULT,
+               rnn_cell_fn=None,
+               model_dir=None,
+               n_classes=2,
+               weight_column=None,
+               label_vocabulary=None,
+               optimizer='Adagrad',
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a `RNNClassifier` instance.
+
+    Args:
+      sequence_feature_columns: An iterable containing the `FeatureColumn`s
+        that represent sequential input. All items in the set should either be
+        sequence columns (e.g. `sequence_numeric_column`) or constructed from
+        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
+        input).
+      context_feature_columns: An iterable containing the `FeatureColumn`s
+        for contextual input. The data represented by these columns will be
+        replicated and given to the RNN at each timestep. These columns must be
+        instances of classes derived from `_DenseColumn` such as
+        `numeric_column`, not the sequential variants.
+      num_units: Iterable of integer number of hidden units per RNN layer. If
+        set, `cell_type` must also be specified and `rnn_cell_fn` must be
+        `None`.
+      cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+        the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+        `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn`
+        must be `None`.
+      rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+        returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to
+        construct the RNN. If set, `num_units` and `cell_type` cannot be set.
+        This is for advanced users who need additional customization beyond
+        `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is
+        needed for stacked RNNs.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      n_classes: Number of label classes. Defaults to 2, namely binary
+        classification. Must be > 1.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
+        to Adagrad optimizer.
+      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
+        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not
+        compatible.
+    """
+    if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT):
+      raise ValueError(
+          'num_units and cell_type must not be specified when using rnn_cell_fn'
+      )
+    if not rnn_cell_fn:
+      if cell_type == USE_DEFAULT:
+        cell_type = 'basic_rnn'
+      rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type)
+
+    if n_classes == 2:
+      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
+          weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    else:
+      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
+          n_classes, weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _rnn_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          rnn_cell_fn=rnn_cell_fn,
+          sequence_feature_columns=tuple(sequence_feature_columns or []),
+          context_feature_columns=tuple(context_feature_columns or []),
+          optimizer=optimizer,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+    super(RNNClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
new file mode 100644
index 0000000000..393f94f5c7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
@@ -0,0 +1,1131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rnn.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import rnn
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+# Names of variables created by BasicRNNCell model.
+TOKEN_EMBEDDING_NAME = 'rnn/sequence_input_layer/input_layer/tokens_sequential_embedding/embedding_weights'
+CELL_WEIGHTS_NAME = 'rnn/rnn/basic_rnn_cell/kernel'
+CELL_BIAS_NAME = 'rnn/rnn/basic_rnn_cell/bias'
+MULTI_CELL_WEIGHTS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/kernel'
+MULTI_CELL_BIAS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/bias'
+LOGITS_WEIGHTS_NAME = 'rnn/logits/dense/kernel'
+LOGITS_BIAS_NAME = 'rnn/logits/dense/bias'
+
+
+def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=('Condition expected =~ actual did not hold element-wise:'
+              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
+              'rtol = ', rtol,),
+        name=scope)
+
+
+def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
+                      global_step, model_dir):
+  """Create checkpoint file with provided model weights.
+
+  Args:
+    rnn_weights: Iterable of values of weights for the RNN cell.
+    rnn_biases: Iterable of values of biases for the RNN cell.
+    logits_weights: Iterable of values for matrix connecting RNN output to
+      logits.
+    logits_biases: Iterable of values for logits bias term.
+    global_step: Initial global step to save in checkpoint.
+    model_dir: Directory into which checkpoint is saved.
+  """
+  model_weights = {}
+  model_weights[CELL_WEIGHTS_NAME] = rnn_weights
+  model_weights[CELL_BIAS_NAME] = rnn_biases
+  model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
+  model_weights[LOGITS_BIAS_NAME] = logits_biases
+
+  with ops.Graph().as_default():
+    # Create model variables.
+    for k, v in six.iteritems(model_weights):
+      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
+
+    # Create non-model variables.
+    global_step_var = training_util.create_global_step()
+    assign_op = global_step_var.assign(global_step)
+
+    # Initialize vars and save checkpoint.
+    with monitored_session.MonitoredTrainingSession(
+        checkpoint_dir=model_dir) as sess:
+      sess.run(assign_op)
+
+
+class RNNLogitFnTest(test.TestCase):
+  """Tests correctness of logits calculated from _rnn_logit_fn_builder."""
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
+                   sequence_feature_columns, context_feature_columns,
+                   expected_logits):
+    """Tests that the expected logits are calculated."""
+    with ops.Graph().as_default():
+      # Global step needed for MonitoredSession, which is in turn used to
+      # explicitly set variable weights through a checkpoint.
+      training_util.create_global_step()
+      # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
+      # the checkpoint naming is shared.
+      with variable_scope.variable_scope('rnn'):
+        input_layer_partitioner = (
+            partitioned_variables.min_max_variable_partitioner(
+                max_partitions=0, min_slice_size=64 << 20))
+        logit_fn = rnn._rnn_logit_fn_builder(
+            output_units=logits_dimension,
+            rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
+            sequence_feature_columns=sequence_feature_columns,
+            context_feature_columns=context_feature_columns,
+            input_layer_partitioner=input_layer_partitioner)
+        # Features are constructed within this function, otherwise the Tensors
+        # containing the features would be defined outside this graph.
+        logits = logit_fn(features=features_fn(), mode=mode)
+        with monitored_session.MonitoredTrainingSession(
+            checkpoint_dir=self._model_dir) as sess:
+          self.assertAllClose(expected_logits, sess.run(logits), atol=1e-4)
+
+  def testOneDimLogits(self):
+    """Tests one-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3]] = [[-0.6033]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033]])
+
+  def testMultiDimLogits(self):
+    """Tests multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [0.5*0.53 + 0.3*0.37 + 0.4],
+              [0.2*0.53 - 0.1*0.37 + 0.5]
+           = [[-0.6033, 0.7777, 0.5698]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698]])
+
+  def testMultiExampleMultiDim(self):
+    """Tests multiple examples and multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [7]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [tanh(.1*7 + .2*.38 + .3*.10 +.2),
+                              tanh(-.2*7 - .3*.38 - .4*.10 +.5)]]
+                          = [[0.53, -0.37], [0.76, -0.78]
+    logits = [[-1*0.53 - 1*0.37 + 0.3,
+               0.5*0.53 + 0.3*0.37 + 0.4,
+               0.2*0.53 - 0.1*0.37 + 0.5],
+              [-1*0.76 - 1*0.78 + 0.3,
+               0.5*0.76 +0.3*0.78 + 0.4,
+               0.2*0.76 -0.1*0.78 + 0.5]]
+           = [[-0.6033, 0.7777, 0.5698], [-1.2473, 1.0170, 0.5745]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))
+    ]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698],
+                           [-1.2473, 1.0170, 0.5745]])
+
+  def testMultiExamplesDifferentLength(self):
+    """Tests multiple examples with different lengths.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.53, -0.37], [<ignored-padding>]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [-1*0.38 + 1*0.10 + 0.3]]
+           = [[-0.6033], [0.0197]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033], [0.0197]])
+
+  def testMultiExamplesWithContext(self):
+    """Tests multiple examples with context features.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10, -0.5], [5, -0.5]], [[2, 0.8], [0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 - 1*.5 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - 0.9*.5 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + 1*.8 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 + .9*.8 - .3*0 - .4*0 +.5)]]
+                          = [[0.60, -0.96], [0.83, 0.68]]
+    rnn_output_timestep_2 = [[tanh(.1*5 - 1*.5 + .2*.60 - .3*.96 +.2),
+                              tanh(-.2*5 - .9*.5 - .3*.60 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.03, -0.63], [<ignored-padding>]]
+    logits = [[-1*0.03 - 1*0.63 + 0.3],
+              [-1*0.83 + 1*0.68 + 0.3]]
+           = [[-0.3662], [0.1414]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # Context features weights are inserted between input and state weights.
+        rnn_weights=[[.1, -.2], [1., 0.9], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'context': [[-0.5], [0.8]],
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = [fc.numeric_column('context', shape=(1,))]
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.3662], [0.1414]])
+
+  def testMultiExamplesMultiFeatures(self):
+    """Tests examples with multiple sequential feature columns.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[1, 0, 10], [0, 1, 5]], [[1, 0, 2], [0, 0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.5*1 + 1*0 + .1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.5*1 + 1*0 + .1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.94, -0.96], [0.72, -0.38]]
+    rnn_output_timestep_2 = [[tanh(.5*0 + 1*1 + .1*5 + .2*.94 - .3*.96 +.2),
+                              tanh(-.5*0 - 1*1 - .2*5 - .3*.94 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.92, -0.88], [<ignored-padding>]]
+    logits = [[-1*0.92 - 1*0.88 + 0.3],
+              [-1*0.72 - 1*0.38 + 0.3]]
+           = [[-1.5056], [-0.7962]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # FeatureColumns are sorted alphabetically, so on_sale weights are
+        # inserted before price.
+        rnn_weights=[[.5, -.5], [1., -1.], [.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'on_sale':
+              sparse_tensor.SparseTensor(
+                  values=[0, 1, 0],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    price_column = seq_fc.sequence_numeric_column('price', shape=(1,))
+    on_sale_column = fc.indicator_column(
+        seq_fc.sequence_categorical_column_with_identity(
+            'on_sale', num_buckets=2))
+    sequence_feature_columns = [price_column, on_sale_column]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-1.5056], [-0.7962]])
+
+
+class RNNClassifierTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _assert_checkpoint(
+      self, n_classes, input_units, cell_units, expected_global_step):
+
+    shapes = {
+        name: shape for (name, shape) in
+        checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        expected_global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    # RNN Cell variables.
+    if len(cell_units) > 1:
+      for i, cell_unit in enumerate(cell_units):
+        self.assertEqual([input_units + cell_unit, cell_unit],
+                         shapes[MULTI_CELL_WEIGHTS_NAME_PATTERN % i])
+        self.assertEqual([cell_unit],
+                         shapes[MULTI_CELL_BIAS_NAME_PATTERN % i])
+        input_units = cell_unit
+    elif len(cell_units) == 1:
+      self.assertEqual([input_units + cell_unit, cell_unit],
+                       shapes[CELL_WEIGHTS_NAME])
+      self.assertEqual([cell_unit], shapes[CELL_BIAS_NAME])
+
+    # Logits variables.
+    logits_dimension = n_classes if n_classes > 2 else 1
+    self.assertEqual([cell_units[-1], logits_dimension],
+                     shapes[LOGITS_WEIGHTS_NAME])
+    self.assertEqual([logits_dimension], shapes[LOGITS_BIAS_NAME])
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s/part_0:0' % CELL_BIAS_NAME,
+        '%s/part_0:0' % CELL_WEIGHTS_NAME,
+        '%s/part_0:0' % LOGITS_BIAS_NAME,
+        '%s/part_0:0' % LOGITS_WEIGHTS_NAME,
+    ]
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = _assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def testConflictingRNNCellFn(self):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    cell_units = [4, 2]
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          num_units=cell_units)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          cell_type='lstm')
+
+  def _testFromScratchWithDefaultOptimizer(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        num_units=cell_units,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=2)
+
+  def testMultiClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=4)
+
+  def testFromScratchWithCustomRNNCellFn(self):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+    cell_units = [4, 2]
+    n_classes = 2
+
+    def rnn_cell_fn(mode):
+      del mode  # unused
+      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
+      return rnn_cell.MultiRNNCell(cells)
+
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        rnn_cell_fn=rnn_cell_fn,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def _testExampleWeight(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat', 'dog', 'barked'],
+                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
+                  dense_shape=[2, 3]),
+          'w': [[1], [2]],
+      }, [[1], [0]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=[embed],
+        n_classes=n_classes,
+        weight_column='w',
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=2)
+
+  def testMultiClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=4)
+
+  def testBinaryClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testBinaryClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=1.119661)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+  def testMultiClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testMultiClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=2.662932)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+
+def sorted_key_dict(unsorted_dict):
+  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
+
+
+class RNNClassifierEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExamplesWithDifferentLength.
+    # See that test for logits calculation.
+    # logits = [[-0.603282], [0.019719]]
+    # probability = exp(logits) / (1 + exp(logits)) = [[0.353593], [0.504930]]
+    # loss = -label * ln(p) - (1 - label) * ln(1 - p)
+    #      = [[0.436326], [0.683335]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 1.119661,
+        metric_keys.MetricKeys.LOSS_MEAN: 0.559831,
+        metric_keys.MetricKeys.ACCURACY: 1.0,
+        metric_keys.MetricKeys.PREDICTION_MEAN: 0.429262,
+        metric_keys.MetricKeys.LABEL_MEAN: 0.5,
+        metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
+        # With default threshold of 0.5, the model is a perfect classifier.
+        metric_keys.MetricKeys.RECALL: 1.0,
+        metric_keys.MetricKeys.PRECISION: 1.0,
+        # Positive example is scored above negative, so AUC = 1.0.
+        metric_keys.MetricKeys.AUC: 1.0,
+        metric_keys.MetricKeys.AUC_PR: 1.0,
+    }
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+  def testMultiClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExampleMultiDim.
+    # See that test for logits calculation.
+    # logits = [[-0.603282, 0.777708, 0.569756],
+    #           [-1.247356, 1.017018, 0.574481]]
+    # logits_exp = exp(logits) / (1 + exp(logits))
+    #            = [[0.547013, 2.176468, 1.767836],
+    #               [0.287263, 2.764937, 1.776208]]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [[0.121793, 0.484596, 0.393611],
+    #                          [0.059494, 0.572639, 0.367866]]
+    # loss = -1. * log(softmax[label])
+    #      = [[2.105432], [0.557500]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 2.662932,
+        metric_keys.MetricKeys.LOSS_MEAN: 1.331466,
+        metric_keys.MetricKeys.ACCURACY: 0.5,
+    }
+
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+
+class RNNClassifierPredictionTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testOneDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282]
+    # logistic = exp(-0.6033) / (1 + exp(-0.6033)) = [0.353593]
+    # probabilities = [0.646407, 0.353593]
+    # class_ids = argmax(probabilities) = [0]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose([0.353593],
+                        predictions[prediction_keys.PredictionKeys.LOGISTIC])
+    self.assertAllClose(
+        [0.646407, 0.353593],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([0],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_0'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+  def testMultiClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1', 'class_2']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testMultiDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282, 0.777708, 0.569756]
+    # logits_exp = exp(logits) = [0.547013, 2.176468, 1.767836]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [0.121793, 0.484596, 0.393611]
+    # class_ids = argmax(probabilities) = [1]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282, 0.777708, 0.569756],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose(
+        [0.121793, 0.484596, 0.393611],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([1],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_1'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+
+class RNNClassifierIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, n_classes,
+      batch_size):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    feature_columns = [embed]
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=feature_columns,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUATE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predicted_proba = np.array([
+        x[prediction_keys.PredictionKeys.PROBABILITIES]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
+
+    # EXPORT
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def testNumpyInputFn(self):
+    """Tests complete flow with numpy_input_fn."""
+    n_classes = 3
+    batch_size = 10
+    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
+    # Numpy only supports dense input, so all examples will have same length.
+    # TODO(b/73160931): Update test when support for prepadded data exists.
+    sequence_length = 3
+
+    features = []
+    for _ in range(batch_size):
+      sentence = random.sample(words, sequence_length)
+      features.append(sentence)
+
+    x_data = np.array(features)
+    y_data = np.random.randint(n_classes, size=batch_size)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+  def testParseExampleInputFn(self):
+    """Tests complete flow with input_fn constructed from parse_example."""
+    n_classes = 3
+    batch_size = 10
+    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']
+
+    serialized_examples = []
+    for _ in range(batch_size):
+      sequence_length = random.randint(1, len(words))
+      sentence = random.sample(words, sequence_length)
+      label = random.randint(0, n_classes - 1)
+      example = example_pb2.Example(features=feature_pb2.Features(
+          feature={
+              'tokens':
+                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                      value=sentence)),
+              'label':
+                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                      value=[label])),
+          }))
+      serialized_examples.append(example.SerializeToString())
+
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    def _train_input_fn():
+      features = parsing_ops.parse_example(serialized_examples, feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _eval_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _predict_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features.pop('label')
+      return features, None
+
+    self._test_complete_flow(
+        train_input_fn=_train_input_fn,
+        eval_input_fn=_eval_input_fn,
+        predict_input_fn=_predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index fe380c44da..cbc2dcf419 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -1206,7 +1206,16 @@ class DeviceWrapper(RNNCell):
 
 @tf_export("nn.rnn_cell.MultiRNNCell")
 class MultiRNNCell(RNNCell):
-  """RNN cell composed sequentially of multiple simple cells."""
+  """RNN cell composed sequentially of multiple simple cells.
+
+  Example:
+
+  ```python
+  num_units = [128, 64]
+  cells = [BasicLSTMCell(num_units=n) for n in num_units]
+  stacked_rnn_cell = MultiRNNCell(cells)
+  ```
+  """
 
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
-- 
GitLab


From 98c63b61e21dca08af1f46294ed90c02753ab816 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 6 Apr 2018 15:06:05 -0700
Subject: [PATCH 0390/1262] Create tuple if body doesn't return one.

Fixes #18257.

PiperOrigin-RevId: 191946459
---
 tensorflow/python/ops/control_flow_ops.py     |  8 ++++++-
 .../python/ops/control_flow_ops_test.py       | 23 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 1278768d8b..e56ab93666 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -3181,12 +3181,18 @@ def while_loop(cond,
         body = lambda i, lv: (i + 1, orig_body(*lv))
 
     if context.executing_eagerly():
+      try_to_pack = len(loop_vars) == 1
+      packed = False  # whether the body result was packed into a 1-item tuple
+
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
+        if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
+          packed = True
+          loop_vars = (loop_vars,)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
-        return loop_vars
+        return loop_vars[0] if packed else loop_vars
 
     if shape_invariants is not None:
       if maximum_iterations is not None:
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index f22f3059d1..289df6f301 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -947,5 +947,28 @@ class CaseTest(test_util.TensorFlowTestCase):
         sess.run(output, feed_dict={x: 4})
 
 
+@test_util.with_c_api
+class WhileLoopTestCase(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testWhileLoopWithSingleVariable(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: math_ops.add(i, 1)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    self.assertEqual(self.evaluate(r), 10)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerWhileLoopWithSingleVariable_bodyReturnsTuple(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: (math_ops.add(i, 1),)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    # Expect a tuple since that is what the body returns.
+    self.assertEqual(self.evaluate(r), (10,))
+
+
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From 7b566ca32afc55dcd41b664161e8a0bc0d15dd8b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 6 Apr 2018 15:38:05 -0700
Subject: [PATCH 0391/1262] [tf.data] Replace the Reader-oriented documentation
 for supporting new datasets with a tf.data version.

PiperOrigin-RevId: 191950831
---
 .../docs_src/extend/new_data_formats.md       | 395 +++++++++++-------
 1 file changed, 237 insertions(+), 158 deletions(-)

diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index 10e717c280..2c33a6b6f7 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -1,4 +1,4 @@
-# Custom Data Readers
+# Reading custom file and record formats
 
 PREREQUISITES:
 
@@ -9,187 +9,273 @@ PREREQUISITES:
 
 We divide the task of supporting a file format into two pieces:
 
-*   File formats: We use a *Reader* Op to read a *record* (which can be any
-    string) from a file.
-*   Record formats: We use decoder or parsing Ops to turn a string record
+*   File formats: We use a reader `tf.data.Dataset` to read raw *records* (which
+    are typically represented by scalar string tensors, but can have more
+    structure) from a file.
+*   Record formats: We use decoder or parsing ops to turn a string record
     into tensors usable by TensorFlow.
 
 For example, to read a
 [CSV file](https://en.wikipedia.org/wiki/Comma-separated_values), we use
-@{tf.TextLineReader$a Reader for text files}
-followed by
-@{tf.decode_csv$an Op that parses CSV data from a line of text}.
+@{tf.data.TextLineDataset$a dataset for reading text files line-by-line}
+and then @{tf.data.Dataset.map$map} an
+@{tf.decode_csv$op} that parses CSV data from each line of text in the dataset.
 
 [TOC]
 
-## Writing a Reader for a file format
+## Writing a `Dataset` for a file format
 
-A `Reader` is something that reads records from a file.  There are some examples
-of Reader Ops already built into TensorFlow:
+A @{tf.data.Dataset} represents a sequence of *elements*, which can be the
+individual records in a file. There are several examples of "reader" datasets
+that are already built into TensorFlow:
 
-*   @{tf.TFRecordReader}
-    ([source in `kernels/tf_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/tf_record_reader_op.cc))
-*   @{tf.FixedLengthRecordReader}
-    ([source in `kernels/fixed_length_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/fixed_length_record_reader_op.cc))
-*   @{tf.TextLineReader}
-    ([source in `kernels/text_line_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/text_line_reader_op.cc))
+*   @{tf.data.TFRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.FixedLengthRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.TextLineDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
 
-You can see these all expose the same interface, the only differences
-are in their constructors.  The most important method is `read`.
-It takes a queue argument, which is where it gets filenames to
-read from whenever it needs one (e.g. when the `read` op first runs, or
-the previous `read` reads the last record from a file).  It produces
-two scalar tensors: a string key and a string value.
+Each of these implementations comprises three related classes:
 
-To create a new reader called `SomeReader`, you will need to:
+* A `tensorflow::DatasetOpKernel` subclass (e.g. `TextLineDatasetOp`), which
+  tells TensorFlow how to construct a dataset object from the inputs to and
+  attrs of an op, in its `MakeDataset()` method.
 
-1.  In C++, define a subclass of
-    [`tensorflow::ReaderBase`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h)
-    called `SomeReader`.
-2.  In C++, register a new reader op and kernel with the name `"SomeReader"`.
-3.  In Python, define a subclass of @{tf.ReaderBase} called `SomeReader`.
+* A `tensorflow::GraphDatasetBase` subclass (e.g. `TextLineDatasetOp::Dataset`),
+  which represents the *immutable* definition of the dataset itself, and tells
+  TensorFlow how to construct an iterator object over that dataset, in its
+  `MakeIterator()` method.
 
-You can put all the C++ code in a file in
-`tensorflow/core/user_ops/some_reader_op.cc`. The code to read a file will live
-in a descendant of the C++ `ReaderBase` class, which is defined in
-[`tensorflow/core/kernels/reader_base.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h).
-You will need to implement the following methods:
+* A `tensorflow::DatasetIterator<Dataset>` subclass (e.g.
+  `TextLineDatasetOp::Dataset::Iterator`), which represents the *mutable* state
+  of an iterator over a particular dataset, and tells TensorFlow how to get the
+  next element from the iterator, in its `GetNextInternal()` method.
 
-*   `OnWorkStartedLocked`: open the next file
-*   `ReadLocked`: read a record or report EOF/error
-*   `OnWorkFinishedLocked`: close the current file, and
-*   `ResetLocked`: get a clean slate after, e.g., an error
+The most important method is the `GetNextInternal()` method, since it defines
+how to actually read records from the file and represent them as one or more
+`Tensor` objects.
 
-These methods have names ending in "Locked" since `ReaderBase` makes sure
-to acquire a mutex before calling any of these methods, so you generally don't
-have to worry about thread safety (though that only protects the members of the
-class, not global state).
+To create a new reader dataset called (for example) `MyReaderDataset`, you will
+need to:
 
-For `OnWorkStartedLocked`, the name of the file to open is the value returned by
-the `current_work()` method.  `ReadLocked` has this signature:
+1. In C++, define subclasses of `tensorflow::DatasetOpKernel`,
+   `tensorflow::GraphDatasetBase`, and `tensorflow::DatasetIterator<Dataset>`
+   that implement the reading logic.
+2. In C++, register a new reader op and kernel with the name
+   `"MyReaderDataset"`.
+3. In Python, define a subclass of @{tf.data.Dataset} called `MyReaderDataset`.
 
-```c++
-Status ReadLocked(string* key, string* value, bool* produced, bool* at_end)
-```
-
-If `ReadLocked` successfully reads a record from the file, it should fill in:
-
-*   `*key`: with an identifier for the record, that a human could use to find
-    this record again.  You can include the filename from `current_work()`,
-    and append a record number or whatever.
-*   `*value`: with the contents of the record.
-*   `*produced`: set to `true`.
-
-If you hit the end of a file (EOF), set `*at_end` to `true`.  In either case,
-return `Status::OK()`.  If there is an error, simply return it using one of the
-helper functions from
-[`tensorflow/core/lib/core/errors.h`](https://www.tensorflow.org/code/tensorflow/core/lib/core/errors.h)
-without modifying any arguments.
-
-Next you will create the actual Reader op.  It will help if you are familiar
-with @{$adding_an_op$the adding an op how-to}.  The main steps
-are:
-
-*   Registering the op.
-*   Define and register an `OpKernel`.
-
-To register the op, you will use a `REGISTER_OP` call defined in
-[`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h).
-Reader ops never take any input and always have a single output with type
-`resource`.  They should have string `container` and `shared_name` attrs.
-You may optionally define additional attrs
-for configuration or include documentation in a `Doc`.  For examples, see
-[`tensorflow/core/ops/io_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/ops/io_ops.cc),
-e.g.:
+You can put all the C++ code in a single file, such as
+`my_reader_dataset_op.cc`. It will help if you are
+familiar with @{$adding_an_op$the adding an op how-to}. The following skeleton
+can be used as a starting point for your implementation:
 
 ```c++
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
-REGISTER_OP("TextLineReader")
-    .Output("reader_handle: resource")
-    .Attr("skip_header_lines: int = 0")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the lines of a file delimited by '\n'.
-)doc");
-```
-
-To define an `OpKernel`, Readers can use the shortcut of descending from
-`ReaderOpKernel`, defined in
-[`tensorflow/core/framework/reader_op_kernel.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_op_kernel.h),
-and implement a constructor that calls `SetReaderFactory`.  After defining
-your class, you will need to register it using `REGISTER_KERNEL_BUILDER(...)`.
-An example with no attrs:
+namespace tensorflow {
+namespace {
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TFRecordReaderOp : public ReaderOpKernel {
+class MyReaderDatasetOp : public DatasetOpKernel {
  public:
-  explicit TFRecordReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    Env* env = context->env();
-    SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); });
-  }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU),
-                        TFRecordReaderOp);
-```
+  MyReaderDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    // Parse and validate any attrs that define the dataset using
+    // `ctx->GetAttr()`, and store them in member variables.
+  }
 
-An example with attrs:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    // Parse and validate any input tensors 0that define the dataset using
+    // `ctx->input()` or the utility function
+    // `ParseScalarArgument<T>(ctx, &arg)`.
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TextLineReaderOp : public ReaderOpKernel {
- public:
-  explicit TextLineReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    int skip_header_lines = -1;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("skip_header_lines", &skip_header_lines));
-    OP_REQUIRES(context, skip_header_lines >= 0,
-                errors::InvalidArgument("skip_header_lines must be >= 0 not ",
-                                        skip_header_lines));
-    Env* env = context->env();
-    SetReaderFactory([this, skip_header_lines, env]() {
-      return new TextLineReader(name(), skip_header_lines, env);
-    });
+    // Create the dataset object, passing any (already-validated) arguments from
+    // attrs or input tensors.
+    *output = new Dataset(ctx);
   }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
-                        TextLineReaderOp);
-```
-
-The last step is to add the Python wrapper.  You can either do this by
-@{$adding_an_op#build_the_op_library$compiling a dynamic library}
-or, if you are building TensorFlow from source, adding to `user_ops.py`.
-For the latter, you will import `tensorflow.python.ops.io_ops` in
-[`tensorflow/python/user_ops/user_ops.py`](https://www.tensorflow.org/code/tensorflow/python/user_ops/user_ops.py)
-and add a descendant of [`io_ops.ReaderBase`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx) : GraphDatasetBase(ctx) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::MyReader")}));
+    }
+
+    // Record structure: Each record is represented by a scalar string tensor.
+    //
+    // Dataset elements can have a fixed number of components of different
+    // types and shapes; replace the following two methods to customize this
+    // aspect of the dataset.
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "MyReaderDatasetOp::Dataset"; }
+
+   protected:
+    // Optional: Implementation of `GraphDef` serialization for this dataset.
+    //
+    // Implement this method if you want to be able to save and restore
+    // instances of this dataset (and any iterators over it).
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      // Construct nodes to represent any of the input tensors from this
+      // object's member variables using `b->AddScalar()` and `b->AddVector()`.
+      std::vector<Node*> input_tensors;
+      TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      // Implementation of the reading logic.
+      //
+      // The example implementation in this file yields the string "MyReader!"
+      // ten times. In general there are three cases:
+      //
+      // 1. If an element is successfully read, store it as one or more tensors
+      //    in `*out_tensors`, set `*end_of_sequence = false` and return
+      //    `Status::OK()`.
+      // 2. If the end of input is reached, set `*end_of_sequence = true` and
+      //    return `Status::OK()`.
+      // 3. If an error occurs, return an error status using one of the helper
+      //    functions from "tensorflow/core/lib/core/errors.h".
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        // NOTE: `GetNextInternal()` may be called concurrently, so it is
+        // recommended that you protect the iterator state with a mutex.
+        mutex_lock l(mu_);
+        if (i_ < 10) {
+          // Create a scalar string tensor and add it to the output.
+          Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+          record_tensor.scalar<string>()() = "MyReader!";
+          out_tensors->emplace_back(std::move(record_tensor));
+          ++i_;
+          *end_of_sequence = false;
+        } else {
+          *end_of_sequence = true;
+        }
+        return Status::OK();
+      }
+
+     protected:
+      // Optional: Implementation of iterator state serialization for this
+      // iterator.
+      //
+      // Implement these two methods if you want to be able to save and restore
+      // instances of this iterator.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+        return Status::OK();
+      }
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+    };
+  };
+};
 
-```python
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import common_shapes
-from tensorflow.python.ops import io_ops
+// Register the op definition for MyReaderDataset.
+//
+// Dataset ops always have a single output, of type `variant`, which represents
+// the constructed `Dataset` object.
+//
+// Add any attrs and input tensors that define the dataset here.
+REGISTER_OP("MyReaderDataset")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
 
-class SomeReader(io_ops.ReaderBase):
+// Register the kernel implementation for MyReaderDataset.
+REGISTER_KERNEL_BUILDER(Name("MyReaderDataset").Device(DEVICE_CPU),
+                        MyReaderDatasetOp);
 
-    def __init__(self, name=None):
-        rr = gen_user_ops.some_reader(name=name)
-        super(SomeReader, self).__init__(rr)
+}  // namespace
+}  // namespace tensorflow
+```
 
+The last step is to build the C++ code and add a Python wrapper. The easiest way
+to do this is by @{$adding_an_op#build_the_op_library$compiling a dynamic
+library} (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
+that subclasses @{tf.data.Dataset} to wrap it. An example Python program is
+given here:
 
-ops.NotDifferentiable("SomeReader")
+```python
+import tensorflow as tf
+
+# Assumes the file is in the current working directory.
+my_reader_dataset_module = tf.load_op_library("./my_reader_dataset_op.so")
+
+class MyReaderDataset(tf.data.Dataset):
+
+  def __init__(self):
+    super(MyReaderDataset, self).__init__()
+    # Create any input attrs or tensors as members of this class.
+
+  def _as_variant_tensor(self):
+    # Actually construct the graph node for the dataset op.
+    #
+    # This method will be invoked when you create an iterator on this dataset
+    # or a dataset derived from it.
+    return my_reader_dataset_module.my_reader_dataset()
+
+  # The following properties define the structure of each element: a scalar
+  # `tf.string` tensor. Change these properties to match the `output_dtypes()`
+  # and `output_shapes()` methods of `MyReaderDataset::Dataset` if you modify
+  # the structure of each element.
+  @property
+  def output_types(self):
+    return tf.string
+
+  @property
+  def output_shapes(self):
+    return tf.TensorShape([])
+
+  @property
+  def output_classes(self):
+    return tf.Tensor
+
+if __name__ == "__main__":
+  # Create a MyReaderDataset and print its elements.
+  with tf.Session() as sess:
+    iterator = MyReaderDataset().make_one_shot_iterator()
+    next_element = iterator.get_next()
+    try:
+      while True:
+        print(sess.run(next_element))  # Prints "MyReader!" ten times.
+    except tf.errors.OutOfRangeError:
+      pass
 ```
 
-You can see some examples in
-[`tensorflow/python/ops/io_ops.py`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+You can see some examples of `Dataset` wrapper classes in
+[`tensorflow/python/data/ops/dataset_ops.py`](https://www.tensorflow.org/code/tensorflow/python/data/ops/dataset_ops.py).
 
 ## Writing an Op for a record format
 
@@ -201,9 +287,7 @@ track down where the bad data came from.
 
 Examples of Ops useful for decoding records:
 
-*   @{tf.parse_single_example}
-    (and
-    @{tf.parse_example})
+*   @{tf.parse_single_example} (and @{tf.parse_example})
 *   @{tf.decode_csv}
 *   @{tf.decode_raw}
 
@@ -211,11 +295,6 @@ Note that it can be useful to use multiple Ops to decode a particular record
 format.  For example, you may have an image saved as a string in
 [a `tf.train.Example` protocol buffer](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 Depending on the format of that image, you might take the corresponding output
-from a
-@{tf.parse_single_example}
-op and call @{tf.image.decode_jpeg},
-@{tf.image.decode_png}, or
-@{tf.decode_raw}.  It is common to
-take the output of `tf.decode_raw` and use
-@{tf.slice} and
-@{tf.reshape} to extract pieces.
+from a @{tf.parse_single_example} op and call @{tf.image.decode_jpeg},
+@{tf.image.decode_png}, or @{tf.decode_raw}.  It is common to take the output
+of `tf.decode_raw` and use @{tf.slice} and @{tf.reshape} to extract pieces.
-- 
GitLab


From d017e6f030c06d4803897a0321144254ad563165 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 15:51:49 -0700
Subject: [PATCH 0392/1262] Follow up on earlier change, which tried to avoid
 reading the input file twice for InitializableLookupTable in combination with
 HashTable.

It turns out all files end at some point and thus and OutOfRange status is encountered on all successful reads. The old code would then compare next_id_ to total_size(), to see whether or not we should return an error. But this is exactly what we tried to prevent. Instead use vocab_size_ if it was initialized or don't return an error.

PiperOrigin-RevId: 191952441
---
 tensorflow/core/kernels/lookup_util.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 27031d9216..77386a16e0 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -101,9 +101,10 @@ class TextFileLineIterator
     string line;
     status_ = input_buffer_->ReadLine(&line);
     if (!status_.ok()) {
-      if (errors::IsOutOfRange(status_) && next_id_ != total_size()) {
+      if (errors::IsOutOfRange(status_) && vocab_size_ != -1 &&
+          next_id_ != vocab_size_) {
         status_ = errors::InvalidArgument("Invalid vocab_size in ", filename_,
-                                          ": expected ", total_size(),
+                                          ": expected ", vocab_size_,
                                           " but got ", next_id_);
       }
       valid_ = false;
-- 
GitLab


From d8ec179569514c068284c84540826d077a30485d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 16:00:41 -0700
Subject: [PATCH 0393/1262] Refactor LoopOptimizer:   * Put loop-invariant node
 motion in its own class.   * Add granular control of which passes to run.
 Swap order of LINM and stack push removal.

PiperOrigin-RevId: 191953537
---
 .../grappler/optimizers/loop_optimizer.cc     | 247 ++++++++++--------
 .../core/grappler/optimizers/loop_optimizer.h |  47 ++--
 .../optimizers/loop_optimizer_test.cc         |  43 ++-
 3 files changed, 193 insertions(+), 144 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index a063dc3381..28ce2c7a55 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -16,18 +16,17 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <deque>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -46,74 +45,36 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-std::vector<int> GetStackPushNodesToConvert(
-    const SimpleGraphView& graph_view,
-    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
-  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
-  const std::unordered_set<string> op_types_to_traverse(
-      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
-       "Identity", "RefIdentity"});
-  std::vector<int> nodes_to_convert;
-  std::set<int> fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
-  for (int fanout_idx : fanout) {
-    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
-    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
-    if (IsStackPushOp(fanout_node)) {
-      nodes_to_convert.push_back(fanout_idx);
-    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
-               op_types_to_traverse.find(fanout_node.op()) !=
-                   op_types_to_traverse.end()) {
-      continue;
-    } else if (!IsStackPopOp(fanout_node) ||
-               (!graph_view.outputs(fanout_idx).empty() ||
-                nodes_to_preserve.find(fanout_node.name()) !=
-                    nodes_to_preserve.end())) {
-      // The node is either a stack pop with consumers or something unexpected
-      // so we leave the graph alone.
-      nodes_to_convert.clear();
-      break;
-    }
-  }
-  return nodes_to_convert;
-}
+class LoopInvariantNodeMotionOptimizer {
+ public:
+  explicit LoopInvariantNodeMotionOptimizer(GraphDef* optimized_graph)
+      : optimized_graph_(optimized_graph) {}
+  virtual ~LoopInvariantNodeMotionOptimizer() = default;
+  Status Optimize();
 
-Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
-  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
-  const GraphDef& graph = item.graph;
-  *optimized_graph = graph;
-  NodeMap node_map(optimized_graph);
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
-  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
-    if (IsStackOp(graph.node(node_idx))) {
-      for (int push_node_idx : GetStackPushNodesToConvert(
-               graph_view, nodes_to_preserve, node_idx)) {
-        // We found push nodes without corresponding pops. Convert them to
-        // Identity passing the data through and add a control dependency from
-        // the op supplying the stack handle.
-        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
-        VLOG(1) << "Converting " << push_node_idx << " : "
-                << push_node->DebugString();
-        if (push_node->attr().count("swap_memory") != 0) {
-          push_node->mutable_attr()->erase("swap_memory");
-        }
-        push_node->set_op("Identity");
-        push_node->mutable_input()->SwapElements(0, 1);
-        const string ctrl_dep = ConstantFolding::AddControlDependency(
-            push_node->input(1), optimized_graph, &node_map);
-        push_node->set_input(1, ctrl_dep);
-        VLOG(1) << "After converting: " << push_node->DebugString();
-      }
-    }
-  }
-  return Status::OK();
-}
+ private:
+  Status FindInvariantNodes(NodeDef* node);
+  Status RevertInvariantNodes();
+  Status MoveInvariantNodes(const int frame_id);
+  Status HandleInvariantNode(NodeDef* node, const int num_outputs,
+                             const int frame_id);
+  Status HandleConst(NodeDef* node, const int num_outputs, const int frame_id);
+  Status HandleInvariantEnter(NodeDef* node, const int num_outputs);
 
-}  // namespace
+  GraphDef* optimized_graph_;  // Not owned.
+  std::unique_ptr<NodeMap> node_map_;
+  std::map<NodeDef*, int> invariant_nodes_;
+  std::set<int> empty_set_;
+  // TODO(rmlarsen): Use vector instead of map, since frames ids are dense.
+  std::map<int, std::set<int>> frame_children_;
+  std::map<int, int> frame_parent_;
+  std::map<int, const NodeDef*> loop_cond_;
+  std::map<int, std::vector<NodeDef*>> invariant_enters_;
+  int new_enter_id_;
+};
 
-Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
-                                               const int num_outputs) {
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantEnter(
+    NodeDef* node, const int num_outputs) {
   auto consumers = node_map_->GetOutputs(node->name());
   std::vector<string> enter_control_inputs;
   string enter_input;
@@ -142,8 +103,9 @@ Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::LINMHandleConst(NodeDef* node,
-    const int num_outputs, const int frame_id) {
+Status LoopInvariantNodeMotionOptimizer::HandleConst(NodeDef* node,
+                                                     const int num_outputs,
+                                                     const int frame_id) {
   NodeDef* const_node;
   if (num_outputs == 0) {
     // all successor nodes are invariant
@@ -185,8 +147,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
     int parent_id = parent_it->second;
     auto loop_cond_it = loop_cond_.find(parent_id);
     if (loop_cond_it == loop_cond_.end()) {
-      return errors::InvalidArgument(
-          "Frame ", frame_id, " doesn't have a LoopCond node");
+      return errors::InvalidArgument("Frame ", frame_id,
+                                     " doesn't have a LoopCond node");
     }
     auto& loop_cond_name = loop_cond_it->second->name();
     NodeDef* switch_node = nullptr;
@@ -197,9 +159,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
       }
     }
     if (!switch_node) {
-      return errors::InvalidArgument(
-          "LoopCond node of Frame ", frame_id,
-          " doesn't connect to any Switch node");
+      return errors::InvalidArgument("LoopCond node of Frame ", frame_id,
+                                     " doesn't connect to any Switch node");
     }
     string switch_output = StrCat(switch_node->name(), ":1");
     const string ctrl_dep = ConstantFolding::AddControlDependency(
@@ -210,8 +171,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
-    const int num_outputs, const int frame_id) {
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantNode(
+    NodeDef* node, const int num_outputs, const int frame_id) {
   // have to remove control inputs to the invariant node from the same frame
   // when moving this node out of this frame
   for (int i = 0; i < node->input_size(); ++i) {
@@ -228,16 +189,14 @@ Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
   DataTypeVector output_types;
   OpRegistryInterface* op_registry = OpRegistry::Global();
   const OpRegistrationData* op_reg_data = nullptr;
-  TF_RETURN_IF_ERROR(
-      op_registry->LookUp(node->op(), &op_reg_data));
-  TF_RETURN_IF_ERROR(
-      InOutTypesForNode(*node, op_reg_data->op_def,
-                        &input_types, &output_types));
+  TF_RETURN_IF_ERROR(op_registry->LookUp(node->op(), &op_reg_data));
+  TF_RETURN_IF_ERROR(InOutTypesForNode(*node, op_reg_data->op_def, &input_types,
+                                       &output_types));
 
   auto consumers = node_map_->GetOutputs(node->name());
   string fname = invariant_enters_[frame_id][0]->attr().at("frame_name").s();
-  int piterations = invariant_enters_[frame_id][0]
-                    ->attr().at("parallel_iterations").i();
+  int piterations =
+      invariant_enters_[frame_id][0]->attr().at("parallel_iterations").i();
   for (auto* consumer : consumers) {
     if (!invariant_nodes_.count(consumer)) {
       for (int i = 0; i < consumer->input_size(); ++i) {
@@ -281,28 +240,27 @@ Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::MoveInvariantNodes(const int frame_id) {
-  for (auto iter = invariant_nodes_.begin();
-       iter != invariant_nodes_.end(); ++iter) {
+Status LoopInvariantNodeMotionOptimizer::MoveInvariantNodes(
+    const int frame_id) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();
+       ++iter) {
     auto* invariant_node = iter->first;
     const int num_outputs = iter->second;
     if (IsEnter(*invariant_node)) {
-      TF_RETURN_IF_ERROR(
-          LINMHandleInvariantEnter(invariant_node, num_outputs));
+      TF_RETURN_IF_ERROR(HandleInvariantEnter(invariant_node, num_outputs));
     } else if (IsConstant(*invariant_node)) {
-      TF_RETURN_IF_ERROR(
-          LINMHandleConst(invariant_node, num_outputs, frame_id));
+      TF_RETURN_IF_ERROR(HandleConst(invariant_node, num_outputs, frame_id));
     } else {
       TF_RETURN_IF_ERROR(
-          LINMHandleInvariantNode(invariant_node, num_outputs, frame_id));
+          HandleInvariantNode(invariant_node, num_outputs, frame_id));
     }
   }
   return Status::OK();
 }
 
-Status LoopOptimizer::RevertInvariantNodes() {
+Status LoopInvariantNodeMotionOptimizer::RevertInvariantNodes() {
   std::deque<const NodeDef*> reverted_nodes;
-  for (auto iter=invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
     bool erased = false;
     const auto* node = iter->first;
     if (!IsConstant(*node) && !IsEnter(*node) && iter->second > 0) {
@@ -331,8 +289,8 @@ Status LoopOptimizer::RevertInvariantNodes() {
       auto* producer = node_map_->GetNode(input);
       auto iter = invariant_nodes_.find(producer);
       if (iter != invariant_nodes_.end()) {
-        if (IsControlInput(input) &&
-            !IsConstant(*producer) && !IsEnter(*producer)) {
+        if (IsControlInput(input) && !IsConstant(*producer) &&
+            !IsEnter(*producer)) {
           reverted_nodes.push_back(producer);
           invariant_nodes_.erase(iter);
         } else {
@@ -357,12 +315,11 @@ Status LoopOptimizer::RevertInvariantNodes() {
   return Status::OK();
 }
 
-Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
+Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(NodeDef* node) {
   auto consumers = node_map_->GetOutputs(node->name());
   invariant_nodes_.insert(std::make_pair(node, consumers.size()));
   for (auto* consumer : consumers) {
-    if (invariant_nodes_.count(consumer) ||
-        ModifiesFrameInfo(*consumer)) {
+    if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
       continue;
     }
     bool is_invariant = true;
@@ -399,9 +356,14 @@ Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
   return Status::OK();
 }
 
-Status LoopOptimizer::LoopInvariantNodeMotion() {
+Status LoopInvariantNodeMotionOptimizer::Optimize() {
+  node_map_.reset(new NodeMap(optimized_graph_));
+  FrameMap frame_map;
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map, &num_frames));
   std::deque<int> worklist;
-  for (auto iter = frame_map_.begin(); iter != frame_map_.end(); ++iter) {
+  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
     auto* node = iter->first;
     auto& frame_ids = iter->second;
     if (frame_ids.size() >= 3) {
@@ -467,19 +429,82 @@ Status LoopOptimizer::LoopInvariantNodeMotion() {
   return Status::OK();
 }
 
-Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
+std::vector<int> GetStackPushNodesToConvert(
+    const SimpleGraphView& graph_view,
+    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
+  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+  const std::unordered_set<string> op_types_to_traverse(
+      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
+       "Identity", "RefIdentity"});
+  std::vector<int> nodes_to_convert;
+  std::set<int> fanout;
+  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
+  for (int fanout_idx : fanout) {
+    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
+    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
+    if (IsStackPushOp(fanout_node)) {
+      nodes_to_convert.push_back(fanout_idx);
+    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
+               op_types_to_traverse.find(fanout_node.op()) !=
+                   op_types_to_traverse.end()) {
+      continue;
+    } else if (!IsStackPopOp(fanout_node) ||
+               (!graph_view.outputs(fanout_idx).empty() ||
+                nodes_to_preserve.find(fanout_node.name()) !=
+                    nodes_to_preserve.end())) {
+      // The node is either a stack pop with consumers or something unexpected
+      // so we leave the graph alone.
+      nodes_to_convert.clear();
+      break;
+    }
+  }
+  return nodes_to_convert;
+}
+
+Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
+  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const GraphDef& graph = item.graph;
+  *optimized_graph = graph;
+  NodeMap node_map(optimized_graph);
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
+  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
+    if (IsStackOp(graph.node(node_idx))) {
+      for (int push_node_idx : GetStackPushNodesToConvert(
+               graph_view, nodes_to_preserve, node_idx)) {
+        // We found push nodes without corresponding pops. Convert them to
+        // Identity passing the data through and add a control dependency from
+        // the op supplying the stack handle.
+        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
+        VLOG(1) << "Converting " << push_node_idx << " : "
+                << push_node->DebugString();
+        if (push_node->attr().count("swap_memory") != 0) {
+          push_node->mutable_attr()->erase("swap_memory");
+        }
+        push_node->set_op("Identity");
+        push_node->mutable_input()->SwapElements(0, 1);
+        const string ctrl_dep = ConstantFolding::AddControlDependency(
+            push_node->input(1), optimized_graph, &node_map);
+        push_node->set_input(1, ctrl_dep);
+        VLOG(1) << "After converting: " << push_node->DebugString();
+      }
+    }
+  }
+  return Status::OK();
+}
 
-  TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
+}  // namespace
 
-  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
-    optimized_graph_ = optimized_graph;
-    // Set up helper data structures.
-    node_map_.reset(new NodeMap(optimized_graph_));
-    int num_frames;
-    TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                                 &frame_map_, &num_frames));
-    TF_RETURN_IF_ERROR(LoopInvariantNodeMotion());
+Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+  // Set up helper data structures.
+  if (options_.enable_loop_invariant_node_motion) {
+    LoopInvariantNodeMotionOptimizer linm_optimizer(optimized_graph);
+    TF_RETURN_IF_ERROR(linm_optimizer.Optimize());
+  }
+  if (options_.enable_stack_push_removal) {
+    TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index c1b0321e4e..83c499bbe7 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -30,9 +30,13 @@ constexpr char kLoopOptimizer[] = "LoopOptimizer";
 
 class LoopOptimizer : public GraphOptimizer {
  public:
-  LoopOptimizer() : opt_level_(RewriterConfig::ON) {}
+  LoopOptimizer()
+      : opt_level_(RewriterConfig::ON),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
   explicit LoopOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+      : opt_level_(opt_level),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
+
   ~LoopOptimizer() override {}
 
   string name() const override { return "loop_optimizer"; };
@@ -44,29 +48,24 @@ class LoopOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  Status LoopInvariantNodeMotion();
-  Status FindInvariantNodes(NodeDef* node);
-  Status RevertInvariantNodes();
-  Status MoveInvariantNodes(const int frame_id);
-  Status LINMHandleInvariantNode(NodeDef* node, const int num_outputs,
-      const int frame_id);
-  Status LINMHandleConst(NodeDef* node, const int num_outputs,
-      const int frame_id);
-  Status LINMHandleInvariantEnter(NodeDef* node, const int num_outputs);
-
-  std::map<NodeDef*, int> invariant_nodes_;
-  std::set<int> empty_set_;
-  std::map<int, std::set<int>> frame_children_;
-  std::map<int, int> frame_parent_;
-  std::map<int, const NodeDef*> loop_cond_;
-  std::map<int, std::vector<NodeDef*>> invariant_enters_;
-  int new_enter_id_;
-  RewriterConfig::Toggle opt_level_;
+  friend class LoopOptimizerTest;
+
+  // Granular control for loop optimizer stages.
+  struct LoopOptimizerOptions {
+    bool enable_loop_invariant_node_motion = false;
+    bool enable_stack_push_removal = true;
+
+    static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) {
+      LoopOptimizerOptions options;
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.enable_loop_invariant_node_motion = true;
+      }
+      return options;
+    }
+  };
 
-  std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
-  std::unique_ptr<GraphProperties> graph_properties_;
-  GraphDef* optimized_graph_;  // Not owned.
+  RewriterConfig::Toggle opt_level_;
+  LoopOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index a0bd335197..10ec544424 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
 class LoopOptimizerTest : public GrapplerTest {
  protected:
@@ -57,6 +56,23 @@ class LoopOptimizerTest : public GrapplerTest {
     attributes.emplace_back("T", type);
     AddNode(name, op, inputs, attributes, graph);
   }
+
+  void DisableAllStages(LoopOptimizer* optimizer) {
+    LoopOptimizer::LoopOptimizerOptions options;
+    options.enable_loop_invariant_node_motion = false;
+    options.enable_stack_push_removal = false;
+    optimizer->options_ = options;
+  }
+
+  void EnableOnlyLoopInvariantNodeMotion(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_loop_invariant_node_motion = true;
+  }
+
+  void EnableOnlyStackPushRemoval(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_stack_push_removal = true;
+  }
 };
 
 TEST_F(LoopOptimizerTest, Basic) {
@@ -81,7 +97,8 @@ TEST_F(LoopOptimizerTest, Basic) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -128,7 +145,8 @@ TEST_F(LoopOptimizerTest, Const) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -175,7 +193,8 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -235,7 +254,8 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -302,7 +322,8 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -365,7 +386,8 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -429,7 +451,8 @@ TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -475,6 +498,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   CHECK(fake_input.NextItem(&item));
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -504,6 +528,7 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   AddSimpleNode("stop", "StopGradient", {"stack3"}, &graph);
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -534,6 +559,7 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   item.fetch.push_back("pop4");
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -563,6 +589,5 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   }
 }
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From 32bb7c14eca309778853e91daed59a96be7033d9 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Fri, 6 Apr 2018 16:04:46 -0700
Subject: [PATCH 0394/1262] [TF] Add half precision to the supported data types
 for tensorflow operations.

Enable most of the half precision XLA compiler tests for the cpu backend,
except for two which are disabled and documented in a bug.

PiperOrigin-RevId: 191954183
---
 tensorflow/compiler/tests/build_defs.bzl      |  2 +-
 .../compiler/tests/spacetobatch_op_test.py    |  3 ++
 tensorflow/compiler/tests/unary_ops_test.py   |  3 ++
 tensorflow/compiler/tf2xla/xla_helpers.cc     |  3 ++
 tensorflow/core/ops/array_ops.cc              | 31 ++++++-----
 tensorflow/core/ops/math_ops.cc               | 52 +++++++++----------
 6 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index a9db1c173d..45b6a6eb86 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -51,7 +51,7 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     if backend == "cpu":
       backend_args += [
           "--test_device=XLA_CPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
       ]
     elif backend == "gpu":
       backend_args += [
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index 6083981493..ef47187477 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -163,6 +163,9 @@ class SpaceToBatchNDTest(XLATestCase):
         # error.
         if dtype == dtypes.bfloat16.as_numpy_dtype:
           continue
+        # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
+        if dtype == np.float16 and self.device == "XLA_CPU":
+          continue
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
         x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 17149aa1c8..ba79f393a8 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -154,6 +154,9 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
+      if dtype == np.float16 and self.device == "XLA_CPU":
+        continue
       x = np.arange(-0.90, 0.90, 0.25)
       self._assertOpOutputMatchesExpected(
           math_ops.acos,
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 3b0b2f06eb..62a5114837 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -122,6 +122,9 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_HALF:
+      return b->ConstantR0<Eigen::half>(
+          static_cast<Eigen::half>(Eigen::NumTraits<Eigen::half>::epsilon()));
     case DT_BFLOAT16:
       return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 62ce70eb6b..4b119e2ebf 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -622,7 +622,7 @@ REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "T: {bfloat16, half, float, double, int8, uint8, int16, uint16, int32, "
         "int64, complex64, complex128, bool}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -630,7 +630,9 @@ REGISTER_OP("OnesLike")
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -645,7 +647,9 @@ REGISTER_OP("Diag")
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -789,7 +793,7 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, bfloat16, half, "
         "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
@@ -1165,7 +1169,7 @@ REGISTER_OP("PreventGradient")
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .Attr("message: string")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -2450,13 +2454,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "T: {bfloat16, half, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32}")
     .Attr(
-        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
-        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "type: {bfloat16, half, float, double, int64, int32, uint8, uint16, "
+        "int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, "
+        "qint32}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       if (!c->RankKnown(input)) {
@@ -2552,7 +2555,7 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Deprecated(22, "Replaced by QuantizeAndDequantizeV2");
 
@@ -2565,7 +2568,7 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -2582,7 +2585,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f33d51d5a..1180973e41 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -65,7 +65,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -133,7 +133,7 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ComplexAbs")
@@ -148,27 +148,27 @@ REGISTER_OP("ComplexAbs")
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
       .Attr(                                                             \
-          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "T: {bfloat16, half, float, double, int32, int64, complex64, " \
           "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_REAL()                              \
   Input("x: T")                                   \
       .Output("y: T")                             \
-      .Attr("T: {half, bfloat16, float, double}") \
+      .Attr("T: {bfloat16, half, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_COMPLEX()                                                  \
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_GRADIENT_COMPLEX()                                         \
   Input("y: T")                                                          \
       .Input("dy: T")                                                    \
       .Output("z: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 REGISTER_OP("Neg").UNARY();
@@ -246,57 +246,57 @@ REGISTER_OP("Atan").UNARY();
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
 #define BINARY_MORE()                                                          \
   Input("x: T").Input("y: T").Output("z: T").Attr(                             \
-      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "T: {bfloat16, half, float, double, uint8, int8, uint16, int16, int32, " \
       "int64, complex64, complex128}")
 
 #define BINARY_FEWER()                                               \
   Input("x: T").Input("y: T").Output("z: T").Attr(                   \
-      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "T: {bfloat16, half, float, double, int32, int64, complex64, " \
       "complex128}")
 
 REGISTER_OP("Add")
@@ -304,7 +304,7 @@ REGISTER_OP("Add")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -315,7 +315,7 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
@@ -412,7 +412,7 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -437,7 +437,7 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -445,21 +445,21 @@ REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, float16, half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Pow")
@@ -467,7 +467,7 @@ REGISTER_OP("Pow")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, float, half, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -503,7 +503,7 @@ REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Betainc")
@@ -574,7 +574,7 @@ REGISTER_OP("GreaterEqual").COMPARISON();
       .Output("z: bool")                                                   \
       .SetIsCommutative()                                                  \
       .Attr(                                                               \
-          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "T: {bfloat16, half, float, double, uint8, int8, int16, int32, " \
           "int64, complex64, quint8, qint8, qint32, string, bool, "        \
           "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -713,7 +713,7 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .SetShapeFn(shape_inference::MatMulShape);
 
 REGISTER_OP("SparseMatMul")
-- 
GitLab


From 59ce970732e7f8f1a22c12e52819ee43a4d3fec3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 16:09:37 -0700
Subject: [PATCH 0395/1262] Add support for ResourceVariable to recompute_grad

PiperOrigin-RevId: 191954813
---
 .../contrib/layers/python/layers/rev_block_lib.py  |  5 ++---
 .../layers/python/layers/rev_block_lib_test.py     | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index e49589ddf6..02d294c68f 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -247,9 +247,7 @@ class RevBlock(base.Layer):
     f_vars_idxs = [[] for _ in range(self.num_layers)]
     g_vars_idxs = [[] for _ in range(self.num_layers)]
 
-    for i, t in enumerate(variables):
-      ref = _underlying_variable_ref(t)
-
+    for i, ref in enumerate(variables):
       # Use the name to identify the layer number and function (f or g)
       regex = LAYER_RE.match(ref.name)
       layer_no = int(regex.group(1))
@@ -604,6 +602,7 @@ def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
     """Custom grad fn applying grad_fn for identity Defun."""
     fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as(
         defun_inputs, list(op.inputs))
+    fn_vars = [_underlying_variable_ref(v) for v in fn_vars]
     dys = list(dys)
     assert len(fn_outputs) == len(outputs)
     assert len(fn_outputs) == len(dys)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index d1ad4e8c98..392a490be1 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -304,6 +304,20 @@ class RecomputeTest(test.TestCase):
           self.assertAllClose(current, g)
           current = g
 
+  def testResourceVariable(self):
+    @rev_block_lib.recompute_grad(tupleize_grads=True)
+    def layer_with_recompute(inputs):
+      var = variable_scope.get_variable("var", ())
+      return var * inputs
+
+    inputs = array_ops.ones((), dtypes.float32)
+    with variable_scope.variable_scope("layer", use_resource=True):
+      outputs = layer_with_recompute(inputs)
+      loss = math_ops.square(outputs)
+      grads = gradients_impl.gradients(loss, variables.trainable_variables())
+      self.assertEqual(1, len(grads))
+      self.assertTrue(grads[0] is not None)
+
 
 class FnWithCustomGradTest(test.TestCase):
 
-- 
GitLab


From b79b11519abdda73e4ba9d9ddf690d7918fcca9d Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Fri, 6 Apr 2018 16:11:10 -0700
Subject: [PATCH 0396/1262] Revert to the previous version of the ResNet50
 model.

PiperOrigin-RevId: 191955019
---
 tensorflow/python/keras/_impl/keras/applications/resnet50.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index f8c6aff4f2..c3a92bea89 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -237,9 +237,8 @@ def ResNet50(include_top=True,
   else:
     bn_axis = 1
 
-  x = ZeroPadding2D(padding=(3, 3), name='conv1_pad')(img_input)
   x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='valid', name='conv1')(x)
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-- 
GitLab


From 0cc22638cf96c16933632de61ee8d454693f9ebd Mon Sep 17 00:00:00 2001
From: Patrick Nguyen <drpng@google.com>
Date: Fri, 6 Apr 2018 16:33:11 -0700
Subject: [PATCH 0397/1262] Move inplace update operators.

The ops are not part of the public API.

PiperOrigin-RevId: 191957660
---
 .../api_def/base_api/api_def_DeepCopy.pbtxt   |  15 +
 .../core/api_def/base_api/api_def_Empty.pbtxt |  23 ++
 .../api_def/base_api/api_def_InplaceAdd.pbtxt |  28 ++
 .../api_def/base_api/api_def_InplaceSub.pbtxt |  28 ++
 .../base_api/api_def_InplaceUpdate.pbtxt      |  28 ++
 .../api_def/python_api/api_def_DeepCopy.pbtxt |   4 +
 .../api_def/python_api/api_def_Empty.pbtxt    |   4 +
 .../python_api/api_def_InplaceAdd.pbtxt       |   4 +
 .../python_api/api_def_InplaceSub.pbtxt       |   4 +
 .../python_api/api_def_InplaceUpdate.pbtxt    |   4 +
 tensorflow/core/kernels/inplace_ops.cc        | 296 +++++++++++++++++-
 tensorflow/core/kernels/inplace_ops_functor.h |  17 +
 .../kernels/inplace_ops_functor_gpu.cu.cc     |  97 ++++++
 tensorflow/core/ops/array_ops.cc              |  45 +++
 tensorflow/python/BUILD                       |   5 +-
 tensorflow/python/kernel_tests/BUILD          |  16 +
 .../python/kernel_tests/inplace_ops_test.py   | 198 ++++++++++++
 tensorflow/python/ops/inplace_ops.py          | 227 ++++++++++++++
 18 files changed, 1039 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
 create mode 100644 tensorflow/python/kernel_tests/inplace_ops_test.py
 create mode 100644 tensorflow/python/ops/inplace_ops.py

diff --git a/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000..fe0fc3823f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "DeepCopy"
+  in_arg {
+    name: "x"
+    description: "The source tensor of type `T`."
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+    y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+      is not an alias of `x`.
+END
+  }
+  summary: "Makes a copy of `x`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000..746f561e92
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "Empty"
+  in_arg {
+    name: "shape"
+    description: "1-D. Represents the shape of the output tensor."
+  }
+  attr {
+    name: "init"
+    description:
+        "If True, initialize the returned tensor with the default value "
+        "of dtype.  Otherwise, the implementation is free not to initialize"
+        "the tensor's content."
+  }
+  out_arg {
+    name: "output"
+    description: "A `Tensor` of type `T`."
+  }
+  summary: <<END
+Creates a tensor with the given shape.
+
+This operation creates a tensor of `shape` and `dtype`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000..3654286cc3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceAdd"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Adds v into specified rows of x.
+
+    Computes y = x; y[i, :] += v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000..a9480b4a38
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceSub"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Subtracts `v` into specified rows of `x`.
+
+    Computes y = x; y[i, :] -= v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000..2fcd3659dc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  in_arg {
+    name: "x"
+    description: "A tensor of type `T`."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Updates specified rows with values in `v`.
+
+    Computes `x[i, :] = v; return x`.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000..2d5ed2b432
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeepCopy"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000..0b863520e9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Empty"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000..390e3bbf97
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000..af9634f9b2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000..5fa9d778ea
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index a71d047ed1..ef6ce0546b 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -213,13 +213,13 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_EMPTY(type)                                  \
+#define REGISTER_PARALLEL_CONCAT_START(type)                  \
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatStart")        \
                               .Device(DEVICE_GPU)             \
                               .TypeConstraint<type>("dtype"), \
                           ParallelConcatStart<GPUDevice, type>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_EMPTY)
-#undef REGISTER_EMPTY
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_PARALLEL_CONCAT_START)
+#undef REGISTER_PARALLEL_CONCAT_START
 
 #define REGISTER_PARALLEL_CONCAT(type)                                     \
   REGISTER_KERNEL_BUILDER(                                                 \
@@ -248,5 +248,295 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                         ParallelConcatUpdate<CPUDevice>);
 #endif
 
+class InplaceOpBase : public OpKernel {
+ public:
+  explicit InplaceOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    auto i = ctx->input(1);
+    auto v = ctx->input(2);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(i.shape()),
+                errors::InvalidArgument("i must be a vector. ",
+                                        i.shape().DebugString()));
+    OP_REQUIRES(ctx, x.dims() == v.dims(),
+                errors::InvalidArgument(
+                    "x and v shape doesn't match (ranks differ): ",
+                    x.shape().DebugString(), " vs. ", v.shape().DebugString()));
+    for (int i = 1; i < x.dims(); ++i) {
+      OP_REQUIRES(
+          ctx, x.dim_size(i) == v.dim_size(i),
+          errors::InvalidArgument("x and v shape doesn't match at index ", i,
+                                  " : ", x.shape().DebugString(), " vs. ",
+                                  v.shape().DebugString()));
+    }
+    OP_REQUIRES(ctx, i.dim_size(0) == v.dim_size(0),
+                errors::InvalidArgument(
+                    "i and x shape doesn't match at index 0: ",
+                    i.shape().DebugString(), " vs. ", v.shape().DebugString()));
+
+    Tensor y = x;  // This creates an alias intentionally.
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, i, v, &y));
+    ctx->set_output(0, y);
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& i,
+                           const Tensor& v, Tensor* y) = 0;
+};
+
+}  // end namespace
+
+namespace functor {
+
+template <typename T>
+void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<T>();
+  auto Ty = y->flat_outer_dims<T>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    switch (op) {
+      case I_UPDATE:
+        Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+        break;
+      case I_ADD:
+        Ty.template chip<0>(r).device(d) += Tv.template chip<0>(j);
+        break;
+      case I_SUB:
+        Ty.template chip<0>(r).device(d) -= Tv.template chip<0>(j);
+        break;
+    }
+  }
+}
+
+// String type only supports inplace update.
+void DoInplaceStringUpdateOp(const CPUDevice& d, const Tensor& i,
+                             const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<string>();
+  auto Ty = y->flat_outer_dims<string>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+  }
+}
+
+template <>
+Status DoInplace(const CPUDevice& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  if (op == I_UPDATE) {
+    if (v.dtype() == DT_STRING) {
+      DoInplaceStringUpdateOp(device, i, v, y);
+      return Status::OK();
+    } else if (v.dtype() == DT_BOOL) {
+      DoInplaceOp<bool>(device, op, i, v, y);
+      return Status::OK();
+    }
+  }
+  switch (v.dtype()) {
+#define CASE(type)                          \
+  case DataTypeToEnum<type>::value:         \
+    DoInplaceOp<type>(device, op, i, v, y); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, functor::InplaceOpType op>
+class InplaceOp : public InplaceOpBase {
+ public:
+  explicit InplaceOp(OpKernelConstruction* ctx) : InplaceOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& i, const Tensor& v,
+                   Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoInplace(d, op, i, v, y);
+  }
+};
+
+class CopyOpBase : public OpKernel {
+ public:
+  explicit CopyOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    Tensor* y;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x.shape(), &y));
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, x, y));
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& x,
+                           Tensor* y) = 0;
+};
+
+template <typename Device>
+class CopyOp : public CopyOpBase {
+ public:
+  explicit CopyOp(OpKernelConstruction* ctx) : CopyOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& x, Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoCopy(d, x, y);
+  }
+};
+
+}  // end namespace
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <>
+Status DoCopy(const CPUDevice& device, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                                   \
+  case DataTypeToEnum<type>::value:                  \
+    y->flat<type>().device(device) = x.flat<type>(); \
+    break;
+
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_bool(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", x.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, typename T>
+class EmptyOp : public OpKernel {
+ public:
+  explicit EmptyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(shape.shape()),
+        errors::InvalidArgument("shape must be a vector of int32, got shape ",
+                                shape.shape().DebugString()));
+    auto dims = shape.flat<int32>();
+    TensorShape out_shape;
+    OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                            reinterpret_cast<const int32*>(dims.data()),
+                            dims.size(), &out_shape));
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+    if (init_) {
+      functor::SetZeroFunctor<Device, T>()(ctx->eigen_device<Device>(),
+                                           out->flat<T>());
+    }
+  }
+
+ private:
+  bool init_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+REGISTER_KERNEL_BUILDER(Name("DeepCopy").Device(DEVICE_CPU), CopyOp<CPUDevice>);
+
+#define REGISTER_EMPTY(type, dev)                             \
+  REGISTER_KERNEL_BUILDER(Name("Empty")                       \
+                              .Device(DEVICE_##dev)           \
+                              .HostMemory("shape")            \
+                              .TypeConstraint<type>("dtype"), \
+                          EmptyOp<dev##Device, type>)
+
+REGISTER_EMPTY(float, CPU)
+REGISTER_EMPTY(double, CPU)
+REGISTER_EMPTY(Eigen::half, CPU)
+REGISTER_EMPTY(string, CPU)
+REGISTER_EMPTY(int32, CPU)
+REGISTER_EMPTY(int64, CPU)
+REGISTER_EMPTY(bool, CPU)
+
+#if GOOGLE_CUDA
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceUpdate").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      InplaceOp<GPUDevice, functor::I_UPDATE>);                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceAdd").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_ADD>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceSub").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_SUB>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DeepCopy").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),      \
+      CopyOp<GPUDevice>);
+
+REGISTER(float);
+REGISTER(double);
+REGISTER(Eigen::half);
+REGISTER(int64);
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+
+REGISTER_KERNEL_BUILDER(Name("DeepCopy")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        CopyOp<CPUDevice>);
+REGISTER_EMPTY(float, GPU);
+REGISTER_EMPTY(double, GPU);
+REGISTER_EMPTY(Eigen::half, GPU);
+REGISTER_EMPTY(int64, GPU);
+
+#endif  // GOOGLE_CUDA
+
 }  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops_functor.h b/tensorflow/core/kernels/inplace_ops_functor.h
index 53529f5165..b806787e91 100644
--- a/tensorflow/core/kernels/inplace_ops_functor.h
+++ b/tensorflow/core/kernels/inplace_ops_functor.h
@@ -26,6 +26,23 @@ template <typename Device>
 Status DoParallelConcat(const Device& device, const Tensor& value, int32 loc,
                         Tensor* output);
 
+// Inplace update/add/sub values in 'y'. It computes
+//   y[i, :] = v if op is I_UPDATE
+//   y[i, :] += v if op is I_ADD
+//   y[i, :] -= v if op is I_SUB
+// Returns an error if the operation fails.
+enum InplaceOpType {
+  I_UPDATE,  // x = y
+  I_ADD,     // x += y
+  I_SUB,     // x -= y
+};
+template <typename Device>
+Status DoInplace(const Device& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y);
+// Copies x into y.
+template <typename Device>
+Status DoCopy(const Device& device, const Tensor& x, Tensor* y);
+
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 8467360435..f1616b1ea8 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -77,6 +77,103 @@ Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
   return Status::OK();
 }
 
+template <typename T, InplaceOpType op>
+__global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
+                                  const int64 cols, const int64 n, const T* src,
+                                  const int32* rowids, T* dst) {
+  CUDA_1D_KERNEL_LOOP(idx, nthreads) {
+    int64 r = idx / cols;
+    int64 c = idx % cols;
+    r = (rowids[r] % rows + rows) % rows;  // Guard index range.
+    T* p = dst + r * cols + c;
+    const T* q = src + idx;
+    switch (op) {
+      case I_UPDATE:
+        *p = ldg(q);
+        break;
+      case I_ADD:
+        *p += ldg(q);
+        break;
+      case I_SUB:
+        *p -= ldg(q);
+        break;
+    }
+  }
+}
+
+template <typename T>
+void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  const int64 nelem = v.NumElements();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
+  auto Ty = y->flat_outer_dims<T>();
+  const int64 nrows = Ty.dimension(0);
+  const int64 ncols = Ty.dimension(1);
+  const int64 n = i.NumElements();
+  const T* src = v.flat<T>().data();
+  // TODO(sjhwang): Check that first dimension fits in int32 range.
+  const int32* rowids = i.flat<int32>().data();
+  T* dst = y->flat<T>().data();
+  switch (op) {
+    case I_UPDATE:
+      DoInplaceOpKernel<T, I_UPDATE>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_ADD:
+      DoInplaceOpKernel<T, I_ADD>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_SUB:
+      DoInplaceOpKernel<T, I_SUB>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+  }
+}
+
+template <>
+Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  switch (v.dtype()) {
+#define CASE(type)                     \
+  case DataTypeToEnum<type>::value:    \
+    DoInplaceOp<type>(d, op, i, v, y); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+template <>
+Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                              \
+  case DataTypeToEnum<type>::value:             \
+    y->flat<type>().device(d) = x.flat<type>(); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported dtype: ", x.dtype());
+  }
+  return Status::OK();
+}
+
 }  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 4b119e2ebf..2a8b9f9bee 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -27,6 +27,7 @@ namespace tensorflow {
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
 
 namespace {
 
@@ -341,6 +342,50 @@ REGISTER_OP("Pack")
       return Status::OK();
     });
 
+REGISTER_OP("DeepCopy")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetIsStateful()
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceUpdate")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceAdd")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceSub")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("Empty")
+    .Input("shape: int32")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("init: bool = false")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 REGISTER_OP("Unpack")
     .Input("value: T")
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a8f1318509..01962fcf44 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1616,7 +1616,10 @@ py_library(
 
 py_library(
     name = "array_ops",
-    srcs = ["ops/array_ops.py"],
+    srcs = [
+        "ops/array_ops.py",
+        "ops/inplace_ops.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops_gen",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 6c34ea1816..3033b48977 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1190,6 +1190,22 @@ cuda_py_test(
     shard_count = 10,
 )
 
+cuda_py_test(
+    name = "inplace_ops_test",
+    size = "small",
+    srcs = ["inplace_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+    ],
+    shard_count = 10,
+)
+
 cuda_py_test(
     name = "batch_matmul_op_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
new file mode 100644
index 0000000000..0f95e13187
--- /dev/null
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -0,0 +1,198 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for inplace_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class InplaceOpsTest(test_util.TensorFlowTestCase):
+
+  def testBasicUpdate(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] = 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [-1],
+                                       array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] = 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] = 7
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicUpdateBool(self):
+    with self.test_session(use_gpu=True):
+      x = array_ops.ones([7, 3], dtypes.bool)
+      y = np.ones([7, 3], dtypes.bool.as_numpy_dtype)
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3],
+                                                            dtypes.bool))
+      y[3, :] = True
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [-1],
+                                     array_ops.zeros([1, 3], dtypes.bool))
+      y[-1, :] = False
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, 5, array_ops.zeros([3], dtypes.bool))
+      y[5, :] = False
+      self.assertAllClose(x.eval(), y)
+
+  def testBasicAdd(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = array_ops.inplace_add(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] += 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] += 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] += 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] += 99
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicSub(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] -= 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] -= 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] -= 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] -= 99
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom(self):
+    with self.test_session(use_gpu=True):
+      d0, d1, d2 = 100, 3, 5
+      x = array_ops.zeros([d0, d1, d2])
+      y = np.zeros([d0, d1, d2])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10, d1, d2))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx, :] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx, :] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx, :] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom1D(self):
+    with self.test_session(use_gpu=True):
+      d0 = 100
+      x = array_ops.zeros([d0])
+      y = np.zeros([d0])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testAlias(self):
+    with self.test_session(use_gpu=True) as sess:
+      x = array_ops.ones([2, 3])
+      y = inplace_ops.alias_inplace_add(x, [0], [[1, 2, 3]])
+      with ops.control_dependencies([y]):
+        z = array_ops.identity(x)
+        _, vy, vz = sess.run([x, y, z])
+      self.assertAllClose(vy, vz)
+
+  def testError(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a vector"):
+        _ = inplace_ops.inplace_update([[1.]], [[0]], [[10]]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "x and v shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0], [10]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "i and x shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0, 1], [[10]]).eval()
+
+  def testEmpty(self):
+    for dtype in [
+        dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool
+    ]:
+      with self.test_session(use_gpu=True):
+        test_shapes = [(), (1,), (2, 3), (0, 2), (2, 3, 5), (2, 0, 5)]
+        for shape in test_shapes:
+          val = inplace_ops.empty(shape, dtype).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty(shape, dtype, init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+          val = inplace_ops.empty_like(array_ops.zeros(shape, dtype)).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty_like(
+              array_ops.zeros(shape, dtype), init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=False).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/ops/inplace_ops.py b/tensorflow/python/ops/inplace_ops.py
new file mode 100644
index 0000000000..e5b000086b
--- /dev/null
+++ b/tensorflow/python/ops/inplace_ops.py
@@ -0,0 +1,227 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inplace operations.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _inplace_helper(x, i, v, op):
+  """Applies an inplace op on (x, i, v).
+
+  op is one of gen_array_ops.alias_inplace_update,
+  gen_array_ops.alias_inplace_add, or gen_array_ops.alias_inplace_sub.
+
+  If i is None, x and v must be the same shape. Computes
+    x op v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] op v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] op v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+    op: alias_inplace_update, alias_inplace_add, or alias_inplace_sub.
+
+  Returns:
+    Returns x.
+
+  """
+  x = ops.convert_to_tensor(x)
+  v = ops.convert_to_tensor(v, x.dtype)
+  if i is None:
+    # Full tensor.
+    return array_ops.reshape(
+        op(array_ops.reshape(x, [1, -1]), [0], array_ops.reshape(v, [1, -1])),
+        array_ops.shape(x))
+  i = math_ops.to_int32(i)
+  if i.get_shape().ndims == 0:
+    # Single 0-dim update.
+    return op(x, array_ops.reshape(i, [1]), array_ops.expand_dims(v, 0))
+  return op(x, i, v)
+
+
+def alias_inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_update)
+
+
+def alias_inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_add)
+
+
+def alias_inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_sub)
+
+
+def empty_like(x, init=None):
+  """Returns a non-initialized tensor with the same shape and dtype as x.
+
+  Args:
+    x: A Tensor.
+    init: Initialize the returned tensor with the default value of
+      x.dtype(), if True. Otherwise, do not initialize. Defaults to
+      None.
+
+  Returns:
+    A tensor y, whose dtype and shape are the same as those of x.
+    y is guaranteed not to be an alias of x. Upon return, y may contain
+    arbitrary data.
+
+  """
+  x = ops.convert_to_tensor(x)
+  return gen_array_ops.empty(array_ops.shape(x), x.dtype, init=init)
+
+
+def inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_update(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_add(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_sub(gen_array_ops.deep_copy(x), i, v)
+
+empty = gen_array_ops.empty
-- 
GitLab


From e834ba02059dec03dd6673175483f01704363df8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 16:34:46 -0700
Subject: [PATCH 0398/1262] [XLA] Redesign: implement and test Pad.

PiperOrigin-RevId: 191957827
---
 .../xla/client/xla_client/xla_builder.cc      | 17 +++++-
 tensorflow/compiler/xla/tests/BUILD           |  2 +-
 .../xla/tests/client_library_test_base.cc     |  8 +++
 .../xla/tests/client_library_test_base.h      |  5 ++
 tensorflow/compiler/xla/tests/pad_test.cc     | 54 +++++++++----------
 5 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 2d587cc3b9..e623639577 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -548,7 +548,22 @@ XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
 
 XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
                       const PaddingConfig& padding_config) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape,
+                        GetShape(padding_value));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferPadShape(operand_shape, padding_value_shape,
+                                      padding_config));
+
+    *instr.mutable_padding_config() = padding_config;
+
+    return AddInstruction(std::move(instr), HloOpcode::kPad,
+                          {operand, padding_value});
+  });
 }
 
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6f58c20f34..072c5cd149 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1266,9 +1266,9 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 17c6a83c1a..9124ccdb46 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -595,6 +595,14 @@ ComputationDataHandle ClientLibraryTestBase::AddParam(
   return data_handle;
 }
 
+XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
+                                      XlaBuilder* builder) {
+  XlaOp data_handle;
+  arguments_.push_back(CreateParameterAndTransferLiteral(
+      arguments_.size(), argument, "", builder, &data_handle));
+  return data_handle;
+}
+
 ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
     const Literal& literal, ComputationBuilder* builder) {
   return builder->ConstantLiteral(
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 52f31b0669..80e1bbbae8 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -300,12 +300,17 @@ class ClientLibraryTestBase : public ::testing::Test {
   // set exactly once. The first added parameter gets index 0, then 1 and so on.
   ComputationDataHandle AddParam(const Literal& argument,
                                  ComputationBuilder* builder);
+  XlaOp AddParam(const Literal& argument, XlaBuilder* builder);
 
   template <class T>
   ComputationDataHandle AddParam(const Array<T>& argument,
                                  ComputationBuilder* builder) {
     return AddParam(*Literal::CreateFromArray(argument), builder);
   }
+  template <class T>
+  XlaOp AddParam(const Array<T>& argument, XlaBuilder* builder) {
+    return AddParam(*Literal::CreateFromArray(argument), builder);
+  }
 
   // Creates a constant instruction with the given literal. When the
   // use_bfloat16 flag is set but the literal has F32 elements, the elements
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 8cef8dd34d..ce295b832d 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -85,7 +85,7 @@ class PadTestFloat : public PadTest,
 
 // Tests a Pad() with a zero-element input and output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -100,7 +100,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -115,7 +115,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -130,7 +130,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
         AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
@@ -138,7 +138,7 @@ XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
       {1.0f, 2.0f},  // row 0
@@ -162,7 +162,7 @@ TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -181,7 +181,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -223,7 +223,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -266,7 +266,7 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
 }
 
 XLA_TEST_F(PadTest, Pad4DU8Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<uint8>>(1, 1, 3, 2);
   Array2D<uint8> input_xy({
       {1, 2},  // row 0
@@ -290,7 +290,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
 }
 
 XLA_TEST_F(PadTest, Pad4DPredArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   // Since bool is currently not well supported, use Broadcast operation to
   // create the operand for Pad.
@@ -317,7 +317,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
 }
 
 XLA_TEST_P(PadTestFloat, Large2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   auto ones = MakeUnique<Array2D<float>>(4, 4);
   ones->Fill(1.0f);
@@ -329,15 +329,14 @@ XLA_TEST_P(PadTestFloat, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
@@ -352,15 +351,14 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, High2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -378,8 +376,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -387,7 +384,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativePadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -406,8 +403,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -415,7 +411,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 8;
   constexpr int64 in_cols = 11;
@@ -434,8 +430,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -444,20 +439,19 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
 
 // Regression test for b/31827337.
 XLA_TEST_P(PadTestFloat, ReducePad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
   ones->Fill(1.0);
   auto input = AddParam(*ones, &b);
 
-  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  XlaComputation add = CreateScalarAddComputation(FloatType(), &b);
   auto reduce =
       b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
-- 
GitLab


From d8d7d8ba35b9de83fbc983f753acf53e5185dfc0 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 6 Apr 2018 16:50:01 -0700
Subject: [PATCH 0399/1262] Water down some tests so they work in the Python
 2.7.9 release builds

Removing reference cycle checks from the tests that failed in
http://ci.tensorflow.org/view/Release/job/release-debian-cpu/99/consoleFull

uuid4() created reference cycles until Python 2.7.11. Removes checks for
reference cycles from tests which indirectly call it.

This issue will probably keep coming up until we move off of 2.7.9 (since
there's no presubmit), but this CL is an effort to fix the issues that came up
for the 1.7 release.

PiperOrigin-RevId: 191959519
---
 tensorflow/contrib/eager/python/BUILD                |  3 +--
 .../eager/python/checkpointable_utils_test.py        | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index edb9130266..4e088503bf 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -272,8 +272,7 @@ cuda_py_test(
         "//tensorflow/python/keras",
     ],
     tags = [
-        "no_oss",  # b/74395663
         "no_windows",  # TODO: needs investigation on Windows
-        "notsan",
+        "notsan",  # b/74395663
     ],
 )
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 891c093a0f..e6498ddb06 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -714,7 +714,7 @@ class CheckpointingTests(test.TestCase):
     status.run_restore_ops()
     self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
@@ -779,7 +779,7 @@ class CheckpointingTests(test.TestCase):
       self.evaluate(train_op)
     slot_status.assert_consumed()
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testOverlappingRestores(self):
     checkpoint_directory = self.get_temp_dir()
     save_root = checkpointable.Checkpointable()
@@ -830,7 +830,7 @@ class CheckpointingTests(test.TestCase):
     second_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(load_dep.var))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testAmbiguousLoad(self):
     # Not OK to split one checkpoint object into two
     checkpoint_directory = self.get_temp_dir()
@@ -853,7 +853,7 @@ class CheckpointingTests(test.TestCase):
                                  "resolved to different objects"):
       load_root.dep_two.dep_three = checkpointable.Checkpointable()
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testObjectsCombined(self):
     # Currently fine to load two checkpoint objects into one Python object
     checkpoint_directory = self.get_temp_dir()
@@ -1154,7 +1154,7 @@ class CheckpointingTests(test.TestCase):
 
 class TemplateTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def test_checkpointable_save_restore(self):
 
     def _templated():
@@ -1185,7 +1185,7 @@ class TemplateTests(test.TestCase):
     self.assertAllEqual([13.], self.evaluate(var_plus_one))
     self.assertAllEqual([14.], self.evaluate(var2))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def test_checkpointable_save_restore_nested(self):
 
     def _inner_template():
-- 
GitLab


From ba17f2a81949a0b35a92a4d6f7704d0fb2917bd3 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Fri, 6 Apr 2018 16:51:05 -0700
Subject: [PATCH 0400/1262] Update docs to include the most relevant paper.

PiperOrigin-RevId: 191959657
---
 tensorflow/contrib/quantize/README.md | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 348c824a40..c83623ec94 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -2,14 +2,17 @@
 
 tf.contrib.quantize provides tools for transforming graphs to include ops to
 model quantization of weights, biases and activations during both training and
-inference. This is done using the
+inference. The details of the transformation implemented in this package is
+described here [1].
+
+This is done using the
 [fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization).
 
-Recent literature has shown that fixed point networks provide comparable
-performance to floating point networks [1]. This is achieved by modeling the
-quantization operation during training in both the forward and backward passes.
+Literature has shown that fixed point networks provide comparable performance to
+floating point networks [2]. This is achieved by modeling the quantization
+operation during training in both the forward and backward passes.
 The fake quantization operator achieves this by modeling the quantizer as a pass
-through estimator [2]. Note that during back propagation, the parameters are
+through estimator [3]. Note that during back propagation, the parameters are
 updated at high precision as this is needed to ensure sufficient precision in
 accumulating tiny adjustments to the parameters. However, for the forward pass,
 the parameters and activations are quantized to the desired lower precision.
@@ -61,9 +64,11 @@ These rewrites are an active area of research and experimentation, so the
 rewrites and quantized training will likely not work across all models, though
 we hope to work towards generalizing these techniques.
 
+[1] B.Jacob et al., "Quantization and Training of Neural Networks for Efficient
+Integer-Arithmetic-Only Inference", https://arxiv.org/abs/1712.05877
 
-[1] P.Gysel, "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
+[2] P.Gysel et al., "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
 NEURAL NETWORKS", https://arxiv.org/pdf/1604.03168.pdf
 
-[2] Y.Bengio, "Estimating or Propagating Gradients Through Stochastic Neurons
-for Conditional Computation", https://arxiv.org/abs/1308.3432
+[3] Y.Bengio et al., "Estimating or Propagating Gradients Through Stochastic
+Neurons for Conditional Computation", https://arxiv.org/abs/1308.3432
-- 
GitLab


From dbfc3d5d364a61dcf8b2867aee6afd6dc387b34b Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Fri, 6 Apr 2018 16:58:18 -0700
Subject: [PATCH 0401/1262] Add methods to TestReporter to log extras for
 benchmarks.

PiperOrigin-RevId: 191960433
---
 tensorflow/core/util/reporter.cc      | 12 ++++++++++++
 tensorflow/core/util/reporter.h       | 10 +++++++++-
 tensorflow/core/util/reporter_test.cc | 23 +++++++++++++++++++++++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/util/reporter.cc b/tensorflow/core/util/reporter.cc
index ee38f81f3e..a595c9509e 100644
--- a/tensorflow/core/util/reporter.cc
+++ b/tensorflow/core/util/reporter.cc
@@ -47,6 +47,18 @@ Status TestReporter::Benchmark(int64 iters, double cpu_time, double wall_time,
   return Status::OK();
 }
 
+Status TestReporter::SetProperty(const string& name, const string& value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_string_value(value);
+  return Status::OK();
+}
+
+Status TestReporter::SetProperty(const string& name, double value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_double_value(value);
+  return Status::OK();
+}
+
 Status TestReporter::Initialize() {
   if (fname_.empty()) {
     return Status::OK();
diff --git a/tensorflow/core/util/reporter.h b/tensorflow/core/util/reporter.h
index bcae12204e..e551e2e4f5 100644
--- a/tensorflow/core/util/reporter.h
+++ b/tensorflow/core/util/reporter.h
@@ -34,11 +34,13 @@ namespace tensorflow {
 //
 // If this environment variable is not defined, no logging is performed.
 //
-// The intended use is via the following 4 lines:
+// The intended use is via the following lines:
 //
 //  TestReporter reporter(test_name);
 //  TF_CHECK_OK(reporter.Initialize()));
 //  TF_CHECK_OK(reporter.Benchmark(iters, cpu_time, wall_time, throughput));
+//  TF_CHECK_OK(reporter.SetProperty("some_string_property", "some_value");
+//  TF_CHECK_OK(reporter.SetProperty("some_double_property", double_value);
 //  TF_CHECK_OK(reporter.Close());
 //
 // For example, if the environment variable
@@ -75,6 +77,12 @@ class TestReporter {
   Status Benchmark(int64 iters, double cpu_time, double wall_time,
                    double throughput);
 
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, double value);
+
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, const string& value);
+
   // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
   ~TestReporter() { Close().IgnoreError(); }  // Autoclose in destructor.
 
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 90ea09876e..0972b86ea5 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -115,5 +115,28 @@ TEST(TestReporter, Benchmark) {
   EXPECT_EQ(benchmark_entry.throughput(), 3.0);
 }
 
+TEST(TestReporter, SetProperties) {
+  string fname =
+      strings::StrCat(testing::TmpDir(), "/test_reporter_benchmarks_");
+  TestReporter test_reporter(fname, "b2/3/4");
+  TF_EXPECT_OK(test_reporter.Initialize());
+  TF_EXPECT_OK(test_reporter.SetProperty("string_prop", "abc"));
+  TF_EXPECT_OK(test_reporter.SetProperty("double_prop", 4.0));
+
+  TF_EXPECT_OK(test_reporter.Close());
+  string expected_fname = strings::StrCat(fname, "b2__3__4");
+  string read;
+  TF_EXPECT_OK(ReadFileToString(Env::Default(), expected_fname, &read));
+
+  BenchmarkEntries benchmark_entries;
+  ASSERT_TRUE(benchmark_entries.ParseFromString(read));
+  ASSERT_EQ(1, benchmark_entries.entry_size());
+  const BenchmarkEntry& benchmark_entry = benchmark_entries.entry(0);
+  const auto& extras = benchmark_entry.extras();
+  ASSERT_EQ(2, extras.size());
+  EXPECT_EQ("abc", extras.at("string_prop").string_value());
+  EXPECT_EQ(4.0, extras.at("double_prop").double_value());
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From 49c6489368ea98feb3259d54a10c6fdfd01caf44 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 6 Apr 2018 17:02:16 -0700
Subject: [PATCH 0402/1262] Add CallableOptions.tensor_connection for feeding a
 tensor to another tensor.

PiperOrigin-RevId: 191960845
---
 .../common_runtime/direct_session_test.cc     | 210 +++++++++++++++++-
 .../common_runtime/graph_execution_state.cc   | 105 ++++++++-
 tensorflow/core/protobuf/config.proto         |  18 +-
 3 files changed, 327 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index f95cecfc66..fbe7b7daaf 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -81,6 +81,7 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     test::FillValues<float>(&a_tensor, a_values);
     Node* a = test::graph::Constant(&graph, a_tensor);
     a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
+    a_ = a->name();
 
     Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
     test::FillValues<float>(&x_tensor, {1, 1});
@@ -97,12 +98,18 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     y_neg_ = y_neg->name();
     y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
 
+    Node* z = test::graph::Unary(&graph, "Identity", y_neg);
+    z_ = z->name();
+    z->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
+
     test::graph::ToGraphDef(&graph, &def_);
   }
 
+  string a_;
   string x_;
   string y_;
   string y_neg_;
+  string z_;
   GraphDef def_;
 };
 
@@ -133,7 +140,6 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
   TF_ASSERT_OK(session->Create(def_));
-  std::vector<std::pair<string, Tensor>> inputs;
 
   // Run the test twice to ensure that the Make/Run/Release cycle is hermetic.
   for (int i = 0; i < 2; ++i) {
@@ -175,6 +181,159 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
   }
 }
 
+TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);".
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(1, outputs.size());
+    auto mat = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);"; also fetch the result of a.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(a_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(2, outputs.size());
+    auto mat_a = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(3.0, mat_a(0, 0));
+    EXPECT_FLOAT_EQ(2.0, mat_a(0, 1));
+    EXPECT_FLOAT_EQ(-1.0, mat_a(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_a(1, 1));
+
+    auto mat_y_neg = outputs[1].matrix<float>();
+    ASSERT_TRUE(outputs[1].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat_y_neg(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat_y_neg(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat_y_neg(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_y_neg(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Wire the output of "Neg(Matmul(a, x))" to the output of "a",
+    // creating an invalid cycle.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(y_ + ":0");
+    c->set_to_tensor(a_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(
+        StringPiece(s.error_message()).contains("would create a cycle"));
+  }
+
+  {
+    // Attempt to wire a non-existent node to a node that does exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor("unknown_node:0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(StringPiece(s.error_message()).contains("unknown node"));
+  }
+
+  {
+    // Attempt to wire a non-existent output from a node that does
+    // exist to another node.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":17");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(StringPiece(s.error_message()).contains("unknown edge"));
+  }
+
+  {
+    // Attempt to wire a tensor to a node that doesn't exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor("unknown_node:0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsNotFound(s));
+    EXPECT_TRUE(
+        StringPiece(s.error_message()).contains("unable to find feed output"));
+  }
+
+  {
+    // Attempt to wire two tensors to the same tensor.
+    CallableOptions callable_options;
+    TensorConnection* c1 = callable_options.add_tensor_connection();
+    c1->set_from_tensor(a_ + ":0");
+    c1->set_to_tensor(y_neg_ + ":0");
+    TensorConnection* c2 = callable_options.add_tensor_connection();
+    c2->set_from_tensor(x_ + ":0");
+    c2->set_to_tensor(y_neg_ + ":0");
+    callable_options.add_fetch(z_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  }
+
+  {
+    // Attempt to wire a tensor to a tensor that is also being fed.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_feed(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  }
+}
+
 TEST_F(DirectSessionMinusAXTest, TestFeed) {
   Initialize({1, 2, 3, 4});
   auto session = CreateSession();
@@ -654,6 +813,55 @@ TEST(DirectSessionTest, MultipleFeedTest_Callable) {
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
+TEST(DirectSessionTest, TestTensorConnectionUseTwice) {
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a_tensor, {1.0, 2.0, 3.0, 4.0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+
+  Tensor dummy_tensor(DT_FLOAT, TensorShape({1}));
+  test::FillValues<float>(&dummy_tensor, {-1.0});
+
+  Node* left = test::graph::Constant(&graph, dummy_tensor);
+  Node* right = test::graph::Constant(&graph, dummy_tensor);
+
+  // y = A * x
+  Node* y = test::graph::Add(&graph, left, right);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  CallableOptions callable_options;
+  // Directly wire the output of node a to the outputs of nodes left
+  // and right, making the callable graph into "a + a;".
+  TensorConnection* c_left = callable_options.add_tensor_connection();
+  c_left->set_from_tensor(a->name() + ":0");
+  c_left->set_to_tensor(left->name() + ":0");
+  TensorConnection* c_right = callable_options.add_tensor_connection();
+  c_right->set_from_tensor(a->name() + ":0");
+  c_right->set_to_tensor(right->name() + ":0");
+
+  callable_options.add_fetch(y->name() + ":0");
+
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+  ASSERT_EQ(1, outputs.size());
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(2.0, mat(0, 0));
+  EXPECT_FLOAT_EQ(4.0, mat(0, 1));
+  EXPECT_FLOAT_EQ(6.0, mat(1, 0));
+  EXPECT_FLOAT_EQ(8.0, mat(1, 1));
+  TF_ASSERT_OK(session->ReleaseCallable(handle));
+}
+
 TEST(DirectSessionTest, FetchMultipleTimes) {
   Graph g(OpRegistry::Global());
   Tensor seven_tensor(DT_INT32, TensorShape());
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 2f17af273f..6a3e6906a3 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
@@ -237,6 +239,50 @@ void GraphExecutionState::RestoreStatefulNodes(Graph* graph) {
   }
 }
 
+namespace {
+
+class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
+ public:
+  TensorConnectionPruneRewrite(const string* endpoint_name,
+                               NodeBuilder::NodeOut from_tensor)
+      : subgraph::PruneRewrite(endpoint_name, nullptr /* device_info */),
+        from_tensor_(std::move(from_tensor)) {}
+
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override {
+    Status s;
+    auto check_no_cycle_fn = [this, feed_tensor, &s](Node* n) {
+      if (n == feed_tensor.node) {
+        s.Update(errors::InvalidArgument(
+            "Requested Tensor connection between nodes \"",
+            feed_tensor.node->name(), "\" and \"", from_tensor_.node->name(),
+            "\" would create a cycle."));
+      }
+    };
+    ReverseDFSFrom(*g, {from_tensor_.node}, std::move(check_no_cycle_fn),
+                   nullptr);
+    TF_RETURN_IF_ERROR(s);
+
+    TF_RETURN_IF_ERROR(
+        NodeBuilder(strings::StrCat("_identity_", feed_tensor.node->name(), "_",
+                                    feed_tensor.index),
+                    "Identity")
+            .Input(from_tensor_)
+            .Attr("T",
+                  BaseType(from_tensor_.node->output_type(from_tensor_.index)))
+            .Finalize(g, out_node));
+
+    (*out_node)->set_assigned_device_name(
+        feed_tensor.node->assigned_device_name());
+    return Status::OK();
+  }
+
+ private:
+  NodeBuilder::NodeOut from_tensor_;
+};
+
+}  // namespace
+
 Status GraphExecutionState::PruneGraph(
     const BuildGraphOptions& options, Graph* graph,
     subgraph::RewriteGraphMetadata* out_rewrite_metadata) {
@@ -265,12 +311,48 @@ Status GraphExecutionState::PruneGraph(
           new subgraph::SendFetchRewrite(&fetch, device_info));
     }
   }
+
+  for (const TensorConnection& tensor_connection :
+       options.callable_options.tensor_connection()) {
+    Node* from_node = nullptr;
+    TensorId from_id(ParseTensorName(tensor_connection.from_tensor()));
+
+    for (Node* n : graph->nodes()) {
+      if (n->name() == from_id.first) {
+        from_node = n;
+        break;
+      }
+    }
+    if (from_node == nullptr) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown node: \"",
+          tensor_connection.to_tensor(), "\".");
+    }
+    if (from_id.second >= from_node->num_outputs()) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown edge: \"",
+          tensor_connection.to_tensor(),
+          "\" (actual number of outputs = ", from_node->num_outputs(), ").");
+    }
+
+    feed_rewrites.emplace_back(new TensorConnectionPruneRewrite(
+        &tensor_connection.to_tensor(), {from_node, from_id.second}));
+  }
+
   std::vector<string> target_node_names(
       options.callable_options.target().begin(),
       options.callable_options.target().end());
-  return subgraph::RewriteGraphForExecution(graph, feed_rewrites,
-                                            fetch_rewrites, target_node_names,
-                                            out_rewrite_metadata);
+  TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
+      graph, feed_rewrites, fetch_rewrites, target_node_names,
+      out_rewrite_metadata));
+
+  CHECK_EQ(out_rewrite_metadata->feed_types.size(),
+           options.callable_options.feed_size() +
+               options.callable_options.tensor_connection_size());
+  for (int i = 0; i < options.callable_options.tensor_connection_size(); ++i) {
+    out_rewrite_metadata->feed_types.pop_back();
+  }
+  return Status::OK();
 }
 
 Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
@@ -340,7 +422,13 @@ Status GraphExecutionState::OptimizeGraph(
                       options.callable_options.target().begin(),
                       options.callable_options.target().end());
 
-    if (!options.callable_options.feed().empty()) {
+    for (const TensorConnection& tensor_connection :
+         options.callable_options.tensor_connection()) {
+      item.fetch.push_back(tensor_connection.from_tensor());
+    }
+
+    if (!(options.callable_options.feed().empty() &&
+          options.callable_options.tensor_connection().empty())) {
       std::unordered_set<string> feeds;
       for (const string& feed : options.callable_options.feed()) {
         TensorId id = ParseTensorName(feed);
@@ -349,6 +437,15 @@ Status GraphExecutionState::OptimizeGraph(
         }
         feeds.insert(id.first.ToString());
       }
+      for (const TensorConnection& tensor_connection :
+           options.callable_options.tensor_connection()) {
+        TensorId id = ParseTensorName(tensor_connection.to_tensor());
+        if (id.second != 0) {
+          return errors::InvalidArgument("Unsupported feed: ",
+                                         tensor_connection.to_tensor());
+        }
+        feeds.insert(id.first.ToString());
+      }
       for (const NodeDef& node : original_graph_def_.node()) {
         if (feeds.find(node.name()) == feeds.end()) {
           continue;
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index a3557e4721..c1a0075b64 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -409,6 +409,17 @@ message RunMetadata {
   repeated GraphDef partition_graphs = 3;
 }
 
+// Defines a connection between two tensors in a `GraphDef`.
+message TensorConnection {
+  // A tensor name. The value of this tensor will be substituted for
+  // the tensor named in `to_tensor`.
+  string from_tensor = 1;
+
+  // A tensor name. The value of this tensor will be bound to the
+  // value of the tensor named in `from_tensor`.
+  string to_tensor = 2;
+}
+
 // Defines a subgraph in another `GraphDef` as a set of feed points and nodes
 // to be fetched or executed.
 //
@@ -429,5 +440,10 @@ message CallableOptions {
   // Options that will be applied to each run.
   RunOptions run_options = 4;
 
-  // Next: 5
+  // Tensors to be connected in the callable. Each TensorConnection denotes
+  // a pair of tensors in the graph, between which an edge will be created
+  // in the callable.
+  repeated TensorConnection tensor_connection = 5;
+
+  // Next: 6
 }
-- 
GitLab


From ddf54d1c24a2b4dcfd8eb52d21dc1f393785f1e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 17:13:13 -0700
Subject: [PATCH 0403/1262] Remove zipped argument, and use an implicit
 dispatch mechanism

PiperOrigin-RevId: 191962157
---
 tensorflow/contrib/lite/build_def.bzl         |   3 +-
 .../contrib/lite/testing/generate_examples.py | 106 ++++++++----------
 2 files changed, 45 insertions(+), 64 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 2813d1c347..b8f6b7fd59 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -200,8 +200,7 @@ def gen_zipped_test_files(name, files):
     native.genrule(
         name = name + "_" + f + ".files",
         cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
-               + " --zip_to_output " + f +
-               " $(@D) zipped"),
+               + " --zip_to_output " + f + " $(@D)"),
         outs = [out_file],
         tools = [
             ":generate_examples",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 8045052452..f919517e93 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -17,10 +17,9 @@
 
 Usage:
 
-generate_examples <output directory> zipped
+generate_examples <output directory>
 
 bazel run //tensorflow/contrib/lite/testing:generate_examples
-    third_party/tensorflow/contrib/lite/testing/generated_examples zipped
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -52,8 +51,6 @@ from tensorflow.python.ops import rnn
 parser = argparse.ArgumentParser(description="Script to generate TFLite tests.")
 parser.add_argument("output_path",
                     help="Directory where the outputs will be go.")
-# TODO(ahentz): remove this flag
-parser.add_argument("type", help="zipped")
 parser.add_argument("--zip_to_output",
                     type=str,
                     help="Particular zip to output.",
@@ -543,6 +540,18 @@ def make_pool_tests(pool_op_in):
   return f
 
 
+def make_l2_pool_tests(zip_path):
+  make_pool_tests(make_l2_pool)(zip_path)
+
+
+def make_avg_pool_tests(zip_path):
+  make_pool_tests(tf.nn.avg_pool)(zip_path)
+
+
+def make_max_pool_tests(zip_path):
+  make_pool_tests(tf.nn.max_pool)(zip_path)
+
+
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
 
@@ -902,6 +911,22 @@ def make_binary_op_tests_func(binary_operator):
   return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
 
 
+def make_add_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.add)
+
+
+def make_div_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.div)
+
+
+def make_sub_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.subtract)
+
+
+def make_mul_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.multiply)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -1169,7 +1194,7 @@ def make_split_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_concatenation_tests(zip_path):
+def make_concat_tests(zip_path):
   """Make a set of tests to do concatenation."""
 
   test_parameters = [{
@@ -1966,69 +1991,26 @@ def main(unused_args):
       if not os.path.isdir(x):
         raise RuntimeError("Failed to create dir %r" % x)
 
-  if FLAGS.type == "zipped":
-    opstest_path = os.path.join(FLAGS.output_path)
-    mkdir_if_not_exist(opstest_path)
-    def _path(filename):
-      return os.path.join(opstest_path, filename)
-
-    dispatch = {
-        "control_dep.zip": make_control_dep_tests,
-        "add.zip": make_binary_op_tests_func(tf.add),
-        "space_to_batch_nd.zip": make_space_to_batch_nd_tests,
-        "div.zip": make_binary_op_tests_func(tf.div),
-        "sub.zip": make_binary_op_tests_func(tf.subtract),
-        "batch_to_space_nd.zip": make_batch_to_space_nd_tests,
-        "conv.zip": make_conv_tests,
-        "constant.zip": make_constant_tests,
-        "depthwiseconv.zip": make_depthwiseconv_tests,
-        "concat.zip": make_concatenation_tests,
-        "fully_connected.zip": make_fully_connected_tests,
-        "global_batch_norm.zip": make_global_batch_norm_tests,
-        "gather.zip": make_gather_tests,
-        "fused_batch_norm.zip": make_fused_batch_norm_tests,
-        "l2norm.zip": make_l2norm_tests,
-        "local_response_norm.zip": make_local_response_norm_tests,
-        "mul.zip": make_binary_op_tests_func(tf.multiply),
-        "relu.zip": make_relu_tests,
-        "relu1.zip": make_relu1_tests,
-        "relu6.zip": make_relu6_tests,
-        "prelu.zip": make_prelu_tests,
-        "l2_pool.zip": make_pool_tests(make_l2_pool),
-        "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
-        "max_pool.zip": make_pool_tests(tf.nn.max_pool),
-        "pad.zip": make_pad_tests,
-        "reshape.zip": make_reshape_tests,
-        "resize_bilinear.zip": make_resize_bilinear_tests,
-        "sigmoid.zip": make_sigmoid_tests,
-        "softmax.zip": make_softmax_tests,
-        "space_to_depth.zip": make_space_to_depth_tests,
-        "topk.zip": make_topk_tests,
-        "split.zip": make_split_tests,
-        "transpose.zip": make_transpose_tests,
-        "mean.zip": make_mean_tests,
-        "squeeze.zip": make_squeeze_tests,
-        "strided_slice.zip": make_strided_slice_tests,
-        "exp.zip": make_exp_tests,
-        "log_softmax.zip": make_log_softmax_tests,
-        "lstm.zip": make_lstm_tests,
-        "maximum.zip": make_maximum_tests,
-    }
-    out = FLAGS.zip_to_output
-    bin_path = FLAGS.toco
-    if out in dispatch:
-      dispatch[out](_path(out))
-    else:
-      raise RuntimeError("Invalid zip to output %r" % out)
+  opstest_path = os.path.join(FLAGS.output_path)
+  mkdir_if_not_exist(opstest_path)
 
-  else:
-    raise RuntimeError("Invalid argument for type of generation.")
+  out = FLAGS.zip_to_output
+  bin_path = FLAGS.toco
+  test_function = ("make_%s_tests" % out.replace(".zip", ""))
+  if test_function not in globals():
+    raise RuntimeError("Can't find a test function to create %r. Tried %r" %
+                       (out, test_function))
+
+  # TODO(ahentz): accessing globals() is not very elegant. We should either
+  # break this file into multiple tests or use decorator-based registration to
+  # avoid using globals().
+  globals()[test_function](os.path.join(opstest_path, out))
 
 
 if __name__ == "__main__":
   FLAGS, unparsed = parser.parse_known_args()
 
   if unparsed:
-    print("Usage: %s <path out> zipped <zip file to generate>")
+    print("Usage: %s <path out> <zip file to generate>")
   else:
     tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
-- 
GitLab


From 5e11bbacaffdf7bc4a9363301de6a0755f95e9c0 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Fri, 6 Apr 2018 17:17:22 -0700
Subject: [PATCH 0404/1262] Open sourcing proto/rpc ops.

PiperOrigin-RevId: 191962572
---
 tensorflow/contrib/cmake/python_modules.txt   |    6 +
 tensorflow/contrib/cmake/tf_core_ops.cmake    |    3 +
 tensorflow/contrib/cmake/tf_python.cmake      |    3 +
 tensorflow/contrib/makefile/tf_op_files.txt   |    2 +
 tensorflow/contrib/proto/BUILD                |   16 +
 tensorflow/contrib/proto/__init__.py          |   28 +
 tensorflow/contrib/proto/python/ops/BUILD     |   44 +
 .../proto/python/ops/decode_proto_op.py       |   25 +
 .../proto/python/ops/encode_proto_op.py       |   25 +
 tensorflow/contrib/rpc/BUILD                  |   13 +
 tensorflow/contrib/rpc/__init__.py            |   28 +
 tensorflow/contrib/rpc/python/ops/BUILD       |   24 +
 tensorflow/contrib/rpc/python/ops/rpc_op.py   |   26 +
 tensorflow/core/BUILD                         |    9 +
 .../base_api/api_def_DecodeProtoV2.pbtxt      |  116 ++
 .../base_api/api_def_EncodeProto.pbtxt        |   81 ++
 .../core/api_def/base_api/api_def_Rpc.pbtxt   |  108 ++
 .../api_def/base_api/api_def_TryRpc.pbtxt     |  123 ++
 tensorflow/core/distributed_runtime/rpc/BUILD |   30 +
 .../rpc/grpc_rpc_factory.cc                   |  213 ++++
 .../rpc/grpc_rpc_factory.h                    |   59 +
 .../rpc/grpc_rpc_factory_registration.cc      |   34 +
 tensorflow/core/kernels/BUILD                 |   47 +
 tensorflow/core/kernels/decode_proto_op.cc    | 1011 +++++++++++++++++
 tensorflow/core/kernels/encode_proto_op.cc    |  591 ++++++++++
 tensorflow/core/kernels/rpc_op.cc             |  129 +++
 tensorflow/core/ops/decode_proto_ops.cc       |   67 ++
 tensorflow/core/ops/encode_proto_ops.cc       |   49 +
 tensorflow/core/ops/rpc_ops.cc                |   81 ++
 tensorflow/core/util/proto/BUILD              |   62 +
 tensorflow/core/util/proto/decode.h           |  592 ++++++++++
 .../util/proto/descriptor_pool_registry.cc    |   45 +
 .../util/proto/descriptor_pool_registry.h     |   76 ++
 .../proto/descriptor_pool_registry_test.cc    |   43 +
 tensorflow/core/util/proto/descriptors.cc     |   85 ++
 tensorflow/core/util/proto/descriptors.h      |   42 +
 .../local_descriptor_pool_registration.cc     |   39 +
 tensorflow/core/util/rpc/BUILD                |   48 +
 tensorflow/core/util/rpc/call_container.h     |   90 ++
 tensorflow/core/util/rpc/rpc_factory.cc       |   53 +
 tensorflow/core/util/rpc/rpc_factory.h        |   70 ++
 .../core/util/rpc/rpc_factory_registry.cc     |   44 +
 .../core/util/rpc/rpc_factory_registry.h      |   72 ++
 .../util/rpc/rpc_factory_registry_test.cc     |   41 +
 tensorflow/python/BUILD                       |    1 +
 45 files changed, 4394 insertions(+)
 create mode 100644 tensorflow/contrib/proto/BUILD
 create mode 100644 tensorflow/contrib/proto/__init__.py
 create mode 100644 tensorflow/contrib/proto/python/ops/BUILD
 create mode 100644 tensorflow/contrib/proto/python/ops/decode_proto_op.py
 create mode 100644 tensorflow/contrib/proto/python/ops/encode_proto_op.py
 create mode 100644 tensorflow/contrib/rpc/BUILD
 create mode 100644 tensorflow/contrib/rpc/__init__.py
 create mode 100644 tensorflow/contrib/rpc/python/ops/BUILD
 create mode 100644 tensorflow/contrib/rpc/python/ops/rpc_op.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
 create mode 100644 tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
 create mode 100644 tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
 create mode 100644 tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
 create mode 100644 tensorflow/core/kernels/decode_proto_op.cc
 create mode 100644 tensorflow/core/kernels/encode_proto_op.cc
 create mode 100644 tensorflow/core/kernels/rpc_op.cc
 create mode 100644 tensorflow/core/ops/decode_proto_ops.cc
 create mode 100644 tensorflow/core/ops/encode_proto_ops.cc
 create mode 100644 tensorflow/core/ops/rpc_ops.cc
 create mode 100644 tensorflow/core/util/proto/BUILD
 create mode 100644 tensorflow/core/util/proto/decode.h
 create mode 100644 tensorflow/core/util/proto/descriptor_pool_registry.cc
 create mode 100644 tensorflow/core/util/proto/descriptor_pool_registry.h
 create mode 100644 tensorflow/core/util/proto/descriptor_pool_registry_test.cc
 create mode 100644 tensorflow/core/util/proto/descriptors.cc
 create mode 100644 tensorflow/core/util/proto/descriptors.h
 create mode 100644 tensorflow/core/util/proto/local_descriptor_pool_registration.cc
 create mode 100644 tensorflow/core/util/rpc/BUILD
 create mode 100644 tensorflow/core/util/rpc/call_container.h
 create mode 100644 tensorflow/core/util/rpc/rpc_factory.cc
 create mode 100644 tensorflow/core/util/rpc/rpc_factory.h
 create mode 100644 tensorflow/core/util/rpc/rpc_factory_registry.cc
 create mode 100644 tensorflow/core/util/rpc/rpc_factory_registry.h
 create mode 100644 tensorflow/core/util/rpc/rpc_factory_registry_test.cc

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 02c456c199..8e83b4e176 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -354,6 +354,9 @@ tensorflow/contrib/periodic_resample
 tensorflow/contrib/periodic_resample/python
 tensorflow/contrib/periodic_resample/python/ops
 tensorflow/contrib/predictor
+tensorflow/contrib/proto
+tensorflow/contrib/proto/python
+tensorflow/contrib/proto/python/ops
 tensorflow/contrib/quantization
 tensorflow/contrib/quantization/python
 tensorflow/contrib/quantize
@@ -382,6 +385,9 @@ tensorflow/contrib/rnn/ops
 tensorflow/contrib/rnn/python
 tensorflow/contrib/rnn/python/kernel_tests
 tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/rpc
+tensorflow/contrib/rpc/python
+tensorflow/contrib/rpc/python/ops
 tensorflow/contrib/saved_model
 tensorflow/contrib/saved_model/python
 tensorflow/contrib/saved_model/python/saved_model
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 092a48bc6b..e558691de4 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -25,6 +25,8 @@ set(tf_op_lib_names
     "cudnn_rnn_ops"
     "data_flow_ops"
     "dataset_ops"
+    "decode_proto_ops"
+    "encode_proto_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
@@ -40,6 +42,7 @@ set(tf_op_lib_names
     "random_ops"
     "remote_fused_graph_ops"
     "resource_variable_ops"
+    "rpc_ops"
     "script_ops"
     "sdca_ops"
     "set_ops"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index fae45ead5c..1a5ec34844 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,6 +330,8 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops")
+GENERATE_PYTHON_OP_LIB("encode_proto_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
@@ -343,6 +345,7 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index b6acf71b9d..0bc4c5d473 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -301,3 +301,5 @@ tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
 tensorflow/core/ops/audio_ops.cc
+tensorflow/core/kernels/decode_proto_op.cc
+tensorflow/core/kernels/encode_proto_op.cc
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
new file mode 100644
index 0000000000..046652cbc5
--- /dev/null
+++ b/tensorflow/contrib/proto/BUILD
@@ -0,0 +1,16 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "proto",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = [
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+    ],
+)
diff --git a/tensorflow/contrib/proto/__init__.py b/tensorflow/contrib/proto/__init__.py
new file mode 100644
index 0000000000..bc5a49de78
--- /dev/null
+++ b/tensorflow/contrib/proto/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to proto.
+
+@@decode_proto
+@@encode_proto
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.decode_proto_op import decode_proto
+from tensorflow.contrib.proto.python.ops.encode_proto_op import encode_proto
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/proto/python/ops/BUILD b/tensorflow/contrib/proto/python/ops/BUILD
new file mode 100644
index 0000000000..f17065477e
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+)
+
+py_library(
+    name = "decode_proto_op_py",
+    srcs = ["decode_proto_op.py"],
+    deps = [
+        ":gen_decode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_decode_proto_op_py",
+    out = "gen_decode_proto_op.py",
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "encode_proto_op_py",
+    srcs = ["encode_proto_op.py"],
+    deps = [
+        ":gen_encode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_encode_proto_op_py",
+    out = "gen_encode_proto_op.py",
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/ops/decode_proto_op.py b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
new file mode 100644
index 0000000000..7dc000ebe4
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer decoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_decode_proto_op import decode_proto_v2 as decode_proto
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("DecodeProtoV2")
diff --git a/tensorflow/contrib/proto/python/ops/encode_proto_op.py b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
new file mode 100644
index 0000000000..ac12198b2e
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer encoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_encode_proto_op import encode_proto
+from tensorflow.python.framework import ops
+
+ops.NotDifferentiable("EncodeProto")
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
new file mode 100644
index 0000000000..597f18c771
--- /dev/null
+++ b/tensorflow/contrib/rpc/BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "rpc",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
+)
diff --git a/tensorflow/contrib/rpc/__init__.py b/tensorflow/contrib/rpc/__init__.py
new file mode 100644
index 0000000000..c65c1a05de
--- /dev/null
+++ b/tensorflow/contrib/rpc/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to RPC.
+
+@@rpc
+@@try_rpc
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.rpc_op import try_rpc
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/rpc/python/ops/BUILD b/tensorflow/contrib/rpc/python/ops/BUILD
new file mode 100644
index 0000000000..84d2a1832f
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+py_library(
+    name = "rpc_op_py",
+    srcs = ["rpc_op.py"],
+    deps = [
+        ":gen_rpc_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_rpc_op_py",
+    out = "gen_rpc_op.py",
+    deps = [
+        "//tensorflow/core:rpc_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/ops/rpc_op.py b/tensorflow/contrib/rpc/python/ops/rpc_op.py
new file mode 100644
index 0000000000..e1b6c41137
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/rpc_op.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""RPC communication."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import try_rpc
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("Rpc")
+ops.NotDifferentiable("TryRpc")
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7d5ae1c5b5..1eebeb3995 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -637,6 +637,8 @@ tf_gen_op_libs(
         "ctc_ops",
         "data_flow_ops",
         "dataset_ops",
+        "decode_proto_ops",
+        "encode_proto_ops",
         "function_ops",
         "functional_ops",
         "image_ops",
@@ -653,6 +655,7 @@ tf_gen_op_libs(
         "random_ops",
         "remote_fused_graph_ops",
         "resource_variable_ops",
+        "rpc_ops",
         "scoped_allocator_ops",
         "sdca_ops",
         "set_ops",
@@ -751,6 +754,8 @@ cc_library(
         ":cudnn_rnn_ops_op_lib",
         ":data_flow_ops_op_lib",
         ":dataset_ops_op_lib",
+        ":decode_proto_ops_op_lib",
+        ":encode_proto_ops_op_lib",
         ":function_ops_op_lib",
         ":functional_ops_op_lib",
         ":image_ops_op_lib",
@@ -767,6 +772,7 @@ cc_library(
         ":random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
+        ":rpc_ops_op_lib",
         ":scoped_allocator_ops_op_lib",
         ":script_ops_op_lib",
         ":sdca_ops_op_lib",
@@ -893,6 +899,8 @@ cc_library(
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
         "//tensorflow/core/kernels:dataset_ops",
+        "//tensorflow/core/kernels:decode_proto_op",
+        "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:functional_ops",
@@ -914,6 +922,7 @@ cc_library(
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
         "//tensorflow/core/kernels:resource_variable_ops",
+        "//tensorflow/core/kernels:rpc_op",
         "//tensorflow/core/kernels:scoped_allocator_ops",
         "//tensorflow/core/kernels:sdca_ops",
         "//tensorflow/core/kernels:set_kernels",
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
new file mode 100644
index 0000000000..c8152f53c4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
@@ -0,0 +1,116 @@
+op {
+  graph_op_name: "DecodeProtoV2"
+  in_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  out_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+Each entry is the number of values found for the corresponding field.
+Optional fields may have 0 or 1 values.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+`values[i]` has datatype `output_types[i]`
+and shape `[batch_shape, max(sizes[...,i])]`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+List of TF types to use for the respective field in field_names.
+END
+  }
+  attr {
+    name: "descriptor_source"
+    description: <<END
+Either the special value `local://` or a path to a file containing
+a serialized `FileDescriptorSet`.
+END
+  }
+  attr {
+    name: "message_format"
+    description: <<END
+Either `binary` or `text`.
+END
+  }
+  attr {
+    name: "sanitize"
+    description: <<END
+Whether to sanitize the result or not.
+END
+  }
+  summary: <<END
+The op extracts fields from a serialized protocol buffers message into tensors.
+END
+  description: <<END
+The `decode_proto` op extracts fields from a serialized protocol buffers
+message into tensors.  The fields in `field_names` are decoded and converted
+to the corresponding `output_types` if possible.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+Each output tensor is a dense tensor. This means that it is padded to
+hold the largest number of repeated elements seen in the input
+minibatch. (The shape is also padded by one to prevent zero-sized
+dimensions). The actual repeat counts for each example in the
+minibatch can be found in the `sizes` output. In many cases the output
+of `decode_proto` is fed immediately into tf.squeeze if missing values
+are not a concern. When using tf.squeeze, always pass the squeeze
+dimension explicitly to avoid surprises.
+
+For the most part, the mapping between Proto field types and
+TensorFlow dtypes is straightforward. However, there are a few
+special cases:
+
+- A proto field that contains a submessage or group can only be converted
+to `DT_STRING` (the serialized submessage). This is to reduce the
+complexity of the API. The resulting string can be used as input
+to another instance of the decode_proto op.
+
+- TensorFlow lacks support for unsigned integers. The ops represent uint64
+types as a `DT_INT64` with the same twos-complement bit pattern
+(the obvious way). Unsigned int32 values can be represented exactly by
+specifying type `DT_INT64`, or using twos-complement if the caller
+specifies `DT_INT32` in the `output_types` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+Both binary and text proto serializations are supported, and can be
+chosen using the `format` attribute.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
new file mode 100644
index 0000000000..fdbe47f236
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
@@ -0,0 +1,81 @@
+op {
+  graph_op_name: "EncodeProto"
+  in_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+END
+  }
+  out_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "Tinput_types"
+    description: <<END
+The input types.
+END
+  }
+  summary: <<END
+The op serializes protobuf messages provided in the input tensors.
+END
+  description: <<END
+The types of the tensors in `values` must match the schema for the
+fields specified in `field_names`. All the tensors in `values` must
+have a common shape prefix, *batch_shape*.
+
+The `sizes` tensor specifies repeat counts for each field.  The repeat
+count (last dimension) of a each tensor in `values` must be greater
+than or equal to corresponding repeat count in `sizes`.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+There are a few special cases in the value mapping:
+
+Submessage and group fields must be pre-serialized as TensorFlow strings.
+
+TensorFlow lacks support for unsigned int64s, so they must be
+represented as `tf.int64` with the same twos-complement bit pattern
+(the obvious way).
+
+Unsigned int32 values can be represented exactly with `tf.int64`, or
+with sign wrapping if the input is of type `tf.int32`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
new file mode 100644
index 0000000000..344ef191fd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
@@ -0,0 +1,108 @@
+op {
+  graph_op_name: "Rpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the RPC method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+If the connection fails or the remote worker returns an error
+status, the op reraises this exception locally.
+
+See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
new file mode 100644
index 0000000000..bded00e83c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
@@ -0,0 +1,123 @@
+op {
+  graph_op_name: "TryRpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  out_arg {
+    name: "status_code"
+    description: <<END
+Same shape as `request`.  Values correspond to tensorflow Status enum codes.
+END
+  }
+  out_arg {
+    name: "status_message"
+    description: <<END
+Same shape as `request`.  Values correspond to Status messages
+returned from the RPC calls.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+Unlike the standard `Rpc` op, if the connection fails or the remote worker
+returns an error status, this op does **not** reraise the exception.
+Instead, the `status_code` and `status_message` entry for the corresponding RPC
+call is set with the error returned from the RPC call.  The `response` tensor
+will contain valid response values for those minibatch entries whose RPCs did
+not fail; the rest of the entries will have empty strings.
+END
+}
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 9c655bfa31..d3478dfc38 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -499,3 +499,33 @@ tf_cuda_cc_test(
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
+
+cc_library(
+    name = "grpc_rpc_factory",
+    srcs = [
+        "grpc_rpc_factory.cc",
+    ],
+    hdrs = ["grpc_rpc_factory.h"],
+    deps = [
+        ":grpc_state",
+        ":grpc_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+    ],
+)
+
+cc_library(
+    name = "grpc_rpc_factory_registration",
+    srcs = [
+        "grpc_rpc_factory_registration.cc",
+    ],
+    deps = [
+        ":grpc_rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
new file mode 100644
index 0000000000..d004abd1c1
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -0,0 +1,213 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+
+namespace tensorflow {
+
+namespace {
+class GrpcCall {
+ public:
+  explicit GrpcCall(CallContainer<GrpcCall>* container, int index, bool try_rpc,
+                    const string* request_msg, string* response_msg,
+                    int32* status_code, string* status_message)
+      : container_(container),
+        index_(index),
+        try_rpc_(try_rpc),
+        request_msg_(request_msg),
+        response_msg_(response_msg),
+        status_code_(status_code),
+        status_message_(status_message) {}
+
+  void StartCancel() { call_opts_.StartCancel(); }
+
+  void Done(const Status& s) {
+    DCHECK(container_ != nullptr);
+    if (!s.ok() && try_rpc_) {
+      DCHECK(status_code_ != nullptr);
+      DCHECK(status_message_ != nullptr);
+      *status_code_ = s.code();
+      *status_message_ = s.error_message();
+    }
+    container_->Done(s, index_);
+  }
+
+  const string& request() const { return *request_msg_; }
+  string* response() const { return response_msg_; }
+  CallOptions* call_opts() { return &call_opts_; }
+
+ private:
+  CallContainer<GrpcCall>* const container_;
+  const int index_;
+  bool try_rpc_;
+  CallOptions call_opts_;
+  const string* request_msg_;
+  string* response_msg_;
+  int* status_code_;
+  string* status_message_;
+};
+
+}  // namespace
+
+GrpcRPCFactory::GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                               int64 timeout_in_ms)
+    : RPCFactory(), fail_fast_(fail_fast), timeout_in_ms_(timeout_in_ms) {
+  // TODO(ebrevdo): Investigate possible performance improvements by
+  // replacing this thread with a threadpool.
+  polling_thread_ =
+      ctx->env()->StartThread(ThreadOptions(), "rpc_op_grpc_factory", [this]() {
+        void* tag;
+        bool ok;
+        while (completion_queue_.Next(&tag, &ok)) {
+          GrpcClientCQTag* callback_tag = static_cast<GrpcClientCQTag*>(tag);
+          callback_tag->OnCompleted(ok);
+        }
+      });
+}
+
+GrpcRPCFactory::~GrpcRPCFactory() {
+  // The amount of time we wait depends on several parameters, including:
+  //   - the value of the fail_fast attribute.
+  //   - the timeout option of the rpc call in the proto declaration.
+  //   - the network roundtrip time and service's execution time.
+  //
+  // If a connection is made but the service doesn't ever respond, and
+  // there is no timeout option set for this rpc call, then it is
+  // possible the RPC request will wait forever.
+  //
+  completion_queue_.Shutdown();
+  delete polling_thread_;
+}
+
+void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements,
+                          const Tensor& address_t, const Tensor& method_t,
+                          const Tensor& request_t, const bool try_rpc,
+                          Tensor* response_t, Tensor* status_code_t,
+                          Tensor* status_message_t,
+                          AsyncOpKernel::DoneCallback done) {
+  auto address = address_t.flat<string>();
+  auto method = method_t.flat<string>();
+  auto request = request_t.flat<string>();
+
+  // Stubs are maintained by the GrpcRPCFactory class and will be
+  // deleted when the class is destroyed.
+  ::grpc::GenericStub* singleton_stub = nullptr;
+  if (address.size() == 1) {
+    singleton_stub = GetOrCreateStubForAddress(address(0));
+  }
+  auto get_stub = [&address, this,
+                   singleton_stub](int64 ix) -> ::grpc::GenericStub* {
+    return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix))
+                                : singleton_stub;
+  };
+  auto get_method_ptr = [&method](int64 ix) -> const string* {
+    return (method.size() > 1) ? &(method(ix)) : &(method(0));
+  };
+  auto get_request_ptr = [&request](int64 ix) -> const string* {
+    return (request.size() > 1) ? &(request(ix)) : &(request(0));
+  };
+
+  if (try_rpc) {
+    // In this case status_code will never be set in the response,
+    // so we just set it to OK.
+    DCHECK(status_code_t != nullptr);
+    status_code_t->flat<int32>().setConstant(
+        static_cast<int>(errors::Code::OK));
+  }
+
+  CancellationManager* cm = ctx->cancellation_manager();
+  CancellationToken cancellation_token = cm->get_cancellation_token();
+
+  // This object will delete itself when done.
+  auto* container =
+      new CallContainer<GrpcCall>(ctx, num_elements, fail_fast_, try_rpc,
+                                  std::move(done), cancellation_token);
+
+  auto response = response_t->flat<string>();
+  int32* status_code_ptr = nullptr;
+  string* status_message_ptr = nullptr;
+  if (try_rpc) {
+    status_code_ptr = status_code_t->flat<int32>().data();
+    status_message_ptr = status_message_t->flat<string>().data();
+  }
+  for (int i = 0; i < num_elements; ++i) {
+    container->calls()->emplace_back(
+        container, i, try_rpc, get_request_ptr(i), &response(i),
+        (try_rpc) ? &status_code_ptr[i] : nullptr,
+        (try_rpc) ? &status_message_ptr[i] : nullptr);
+  }
+
+  int i = 0;
+  for (GrpcCall& call : *(container->calls())) {
+    // This object will delete itself when done.
+    new RPCState<string>(get_stub(i), &completion_queue_, *get_method_ptr(i),
+                         call.request(), call.response(),
+                         /*done=*/[&call](const Status& s) { call.Done(s); },
+                         call.call_opts(), fail_fast_, timeout_in_ms_);
+    ++i;
+  }
+
+  // Need to register this callback after all the RPCs are in
+  // flight; otherwise we may try to cancel an RPC *before* it
+  // launches, which is a no-op, and then fall into a deadlock.
+  bool is_cancelled = !cm->RegisterCallback(
+      cancellation_token, [container]() { container->StartCancel(); });
+
+  if (is_cancelled) {
+    ctx->SetStatus(errors::Cancelled("Operation has been cancelled."));
+    // container's reference counter will take care of calling done().
+    container->StartCancel();
+  }
+}
+
+::grpc::GenericStub* GrpcRPCFactory::GetOrCreateStubForAddress(
+    const string& address) {
+  mutex_lock lock(mu_);
+
+  auto stub = stubs_.find(address);
+  if (stub != stubs_.end()) return stub->second.get();
+
+  ChannelPtr channel = CreateChannelForAddress(address);
+  auto* created = new ::grpc::GenericStub(channel);
+  stubs_[address].reset(created);
+  return created;
+}
+
+GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
+    const string& address) {
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+
+  // Set a standard backoff timeout of 1s instead of the
+  // (sometimes default) 20s.
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  return ::grpc::CreateCustomChannel(
+      /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
new file mode 100644
index 0000000000..34ec235aaf
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class GrpcRPCFactory : public RPCFactory {
+ public:
+  explicit GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                          int64 timeout_in_ms);
+
+  // Explicit destructor to control destruction order.
+  ~GrpcRPCFactory() override;
+
+  void Call(OpKernelContext* ctx, int64 num_elements, const Tensor& address_t,
+            const Tensor& method_t, const Tensor& request_t, const bool try_rpc,
+            Tensor* response_t, Tensor* status_code_t, Tensor* status_message_t,
+            AsyncOpKernel::DoneCallback done) override;
+
+ protected:
+  typedef std::shared_ptr<::grpc::Channel> ChannelPtr;
+  virtual ChannelPtr CreateChannelForAddress(const string& address);
+
+ private:
+  ::grpc::GenericStub* GetOrCreateStubForAddress(const string& address);
+
+  bool fail_fast_;
+  int64 timeout_in_ms_;
+  ::grpc::CompletionQueue completion_queue_;
+  Thread* polling_thread_;  // Owned.
+
+  mutex mu_;
+  typedef std::unique_ptr<::grpc::GenericStub> StubPtr;
+  std::unordered_map<string, StubPtr> stubs_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
new file mode 100644
index 0000000000..b884489378
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Used for adding the grpc factory to the RPC factory registry.
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return new GrpcRPCFactory(ctx, fail_fast, timeout_in_ms);
+  }
+};
+
+REGISTER_RPC_FACTORY("grpc", Value::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1857d8d655..783de6af88 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5121,6 +5121,9 @@ filegroup(
             "summary_interface.*",
             "summary_kernels.*",
             "spectrogram_convert_test_data.cc",
+            "decode_proto_op.cc",
+            "encode_proto_op.cc",
+            "rpc_op.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
@@ -6153,6 +6156,50 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "decode_proto_op",
+    srcs = [
+        "decode_proto_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:decode",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "encode_proto_op",
+    srcs = ["encode_proto_op.cc"],
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "rpc_op",
+    srcs = [
+        "rpc_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:rpc_ops_op_lib",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+        "//third_party/eigen3",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
new file mode 100644
index 0000000000..b4e5b776ed
--- /dev/null
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -0,0 +1,1011 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// DecodeProto is a TensorFlow Op which extracts arbitrary fields
+// from protos serialized as strings.
+//
+// See docs in ../ops/decode_proto_op.cc.
+//
+// This implementation reads the serialized format using a handful of
+// calls from the WireFormatLite API used by generated proto code.
+// WireFormatLite is marked as an "internal" proto API but is widely
+// used in practice and highly unlikely to change.
+// This will be much faster than the previous implementation based on
+// constructing a temporary dynamic message in memory and using the
+// proto reflection api to read it.
+// It can be used with any proto whose descriptors are available at
+// runtime but should be competitive in speed with approaches that
+// compile in the proto definitions.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/decode.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::MakeUnique;
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::DynamicMessageFactory;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::Message;
+using ::tensorflow::protobuf::TextFormat;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedInputStream;
+
+const bool kFailOnDecodeError = true;
+
+// Returns true if the proto field type can be converted to the
+// tensorflow::DataType.
+bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) {
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return output_type == tensorflow::DT_FLOAT ||
+             output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_BOOL:
+      return output_type == tensorflow::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_GROUP:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_MESSAGE:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_BYTES:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_UINT32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_ENUM:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_SINT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SINT64:
+      return output_type == tensorflow::DT_INT64;
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// A FieldInfo holds a handful of information from the FieldDescriptor
+// and user attributes.
+struct FieldInfo {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index)
+      : output_index(user_index) {
+    // Without this intermediate data structure, the profile had hotspots
+    // calling methods of FieldDescriptor.
+    number = field_desc->number();
+
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    type = static_cast<WireFormatLite::FieldType>(field_desc->type());
+    is_repeated = field_desc->is_repeated();
+  }
+
+  // Disable copy and move.
+  FieldInfo(const FieldInfo&) = delete;
+  FieldInfo& operator=(const FieldInfo&) = delete;
+
+  // Internally we sort field descriptors by wire number for
+  // fast lookup. In general this is different from the order
+  // given by the user. Output_index gives the index into
+  // the field_names and output_types attributes and into
+  // the output tensor list.
+  int output_index = -1;
+
+  // This is a cache of the relevant fields from `FieldDescriptorProto`.
+  // This was added after noticing that FieldDescriptor->type() was
+  // using 6% of the cpu profile.
+  WireFormatLite::FieldType type;
+  int number;
+  bool is_repeated;
+};
+
+// A CountCollector counts sizes of repeated and optional fields in a proto.
+//
+// Each field is tracked by a single CountCollector instance. The
+// instance manages a single count, which is stored as a pointer (it
+// is intended to be a reference to the `sizes` output which is being
+// filled in). The pointer is passed in at initialization.
+//
+// Counting is done as a separate pass in order to allocate output tensors
+// all at once. This allows the TensorFlow runtime to optimize allocation
+// for the consumer, while removing the need for copying inside this op.
+// After this pass, the DenseCollector class (below) gathers the data:
+// It is more complex and provides better motivation for the API here.
+class CountCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  CountCollector() = default;
+
+  // The count may be stored inside an Eigen Tensor to eliminate copying.
+  explicit CountCollector(int32* count) : count_ptr_(count) {}
+
+  // Reads (in this case counts) a single value.
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // Only repeated fields can have count > 1.
+    if (*count_ptr_ == 0 || field.is_repeated) {
+      (*count_ptr_)++;
+    }
+    // We expect a wire type based on the schema field_type, to allow
+    // a little more checking.
+    if (!SkipValue(input, field)) {
+      return errors::DataLoss("ReadValue: Failed skipping field when counting");
+    }
+    return Status::OK();
+  }
+
+  // Reads (in this case counts) a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          size_t buf_size) {
+    if (buf_size == 0) {
+      return Status::OK();
+    }
+
+    const void* tmpbuf;
+    int unused_max_buf_size;
+
+    input->GetDirectBufferPointerInline(&tmpbuf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    const uint8* buf = reinterpret_cast<const uint8*>(tmpbuf);
+
+    // Important: we skipped the input->{Push,Pop}Limit() calls for speed,
+    // so the bounds check on buf_size inside Skip() is critical, and
+    // must be done before scanning the contents.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss("ReadPackedValues: Skipping packed field failed");
+    }
+
+    // Dispatch to the appropriately typed field reader based on the
+    // schema type.
+    Status st;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        st = CountPackedFixed<double>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FLOAT:
+        st = CountPackedFixed<float>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_UINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED64:
+        st = CountPackedFixed<uint64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED32:
+        st = CountPackedFixed<uint32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_BOOL:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_STRING:
+        st = errors::DataLoss("TYPE_STRING encountered as packed");
+        break;
+      case WireFormatLite::TYPE_GROUP:
+        st = errors::DataLoss("TYPE_GROUP encountered as packed");
+        break;
+      case WireFormatLite::TYPE_MESSAGE:
+        st = errors::DataLoss("TYPE_MESSAGE encountered as packed");
+        break;
+      case WireFormatLite::TYPE_BYTES:
+        st = errors::DataLoss("TYPE_BYTES encountered as packed");
+        break;
+      case WireFormatLite::TYPE_UINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_ENUM:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED32:
+        st = CountPackedFixed<int32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED64:
+        st = CountPackedFixed<int64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+        // default: intentionally omitted in order to enable static checking.
+    }
+    if (!st.ok()) {
+      return st;
+    }
+
+    if (!field.is_repeated && *count_ptr_ > 1) {
+      *count_ptr_ = 1;
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Skips a length-delimited value.
+  static bool SkipBytes(CodedInputStream* input) {
+    uint32 length;
+    if (!input->ReadVarint32(&length)) {
+      return false;
+    }
+    return input->Skip(length);
+  }
+
+  // Counts the number of packed varints in an array.
+  // The end of a varint is signaled by a value < 0x80,
+  // so counting them requires parsing the bytestream.
+  // It is the caller's responsibility to ensure that len > 0.
+  Status CountPackedVarint(const uint8* buf, size_t len) {
+    const uint8* bound = buf + len;
+    int count;
+
+    // The last byte in a valid encoded varint is guaranteed to have
+    // the high bit unset. We rely on this property to prevent
+    // ReadVarint64FromArray from going out of bounds, so validate
+    // the end of the buf before scanning anything.
+    if (bound[-1] & 0x80) {
+      return errors::DataLoss("Corrupt packed varint");
+    }
+
+    // Now we can trust ReadVarint64FromArray to stay in bounds.
+    for (count = 0; buf < bound; ++count) {
+      uint64 temp;
+      bool ok;
+      buf = internal::ReadVarint64FromArray(buf, &ok, &temp);
+      if (!ok) {
+        return errors::DataLoss("Corrupt packed varint");
+      }
+    }
+
+    *count_ptr_ += count;
+    return Status::OK();
+  }
+
+  // Counts the number of fixed-size values in a packed field.
+  // This can be done without actually parsing anything.
+  template <typename T>
+  Status CountPackedFixed(const uint8* unused_buf, size_t len) {
+    int count = len / sizeof(T);
+    if (count * sizeof(T) != len) {
+      return errors::DataLoss(
+          "Illegal data length for packed fixed-size type: ", len);
+    }
+    *count_ptr_ += len / sizeof(T);
+    return Status::OK();
+  }
+
+  // Skips a single value in the input stream.
+  // Dispatches to the appropriately typed field skipper based on the
+  // schema type tag.
+  // This is not as permissive as just handling the wire type.
+  static bool SkipValue(CodedInputStream* input, const FieldInfo& field) {
+    uint32 tmp32;
+    protobuf_uint64 tmp64;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FLOAT:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_INT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_UINT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_INT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_FIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_BOOL:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_STRING:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_GROUP:
+        return WireFormatLite::SkipField(
+            input, WireFormatLite::MakeTag(
+                       field.number, WireFormatLite::WIRETYPE_START_GROUP));
+      case WireFormatLite::TYPE_MESSAGE:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_BYTES:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_UINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_ENUM:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_SINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SINT64:
+        return input->ReadVarint64(&tmp64);
+        // default: intentionally omitted in order to enable static checking.
+    }
+  }
+
+  int32* count_ptr_ = nullptr;
+};
+
+// A DenseCollector accumulates values from a proto into a tensor.
+//
+// There is an instance of DenseCollector for each field of each
+// proto. The DenseCollector deserializes the value from the wire
+// directly into the preallocated output Tensor.
+//
+// This class is named DenseCollector because in the future there should
+// be a SparseCollector that accumulates field data into sparse tensors if
+// the user requests it.
+class DenseCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  DenseCollector() = default;
+
+  // A DenseCollector applies to one field of a serialized message.
+  DenseCollector(uint8* datap, DataType dtype, int max_repeat_count)
+      : datap_(datap), dtype_(dtype), max_repeat_count_(max_repeat_count) {}
+
+  // Reads a value from the input stream and stores it.
+  //
+  // Always inlining gave a ~50% speedup on microbenchmarks at one point.
+  // TODO(nix): try removing it to see if that still holds.
+  // TODO(jsimsa): ABSL_ATTRIBUTE_ALWAYS_INLINE
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // For required and optional fields, we overwrite values[0] with
+    // the latest one in the wire stream.
+    // See https://developers.google.com/protocol-buffers/docs/encoding#optional
+    // Only for repeated fields do we advance the next_repeat_index_ past 1.
+    // TODO(nix): to handle oneof we must also zero out any previous values
+    //  seen on the wire.
+    int32 index = 0;
+    if (field.is_repeated) {
+      index = next_repeat_index_;
+    }
+    next_repeat_index_ = index + 1;
+
+    return internal::ReadValue(input, field.type, field.number, dtype_, index,
+                               datap_);
+  }
+
+  // Reads and stores a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          const size_t buf_size) {
+    const void* buf;
+    int unused_max_buf_size;
+    input->GetDirectBufferPointerInline(&buf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss(
+          "ReadPackedValues: Skipping packed field failed.  Field tag: ",
+          field.number);
+    }
+
+    // Setting stride=0 causes new values to overwrite old ones for
+    // non-repeated fields.
+    const int stride = field.is_repeated ? 1 : 0;
+
+    if (next_repeat_index_ >= max_repeat_count_) {
+      return errors::DataLoss(
+          "ReadPackedValues: Tried to write more entries than allowed.  "
+          "Field tag: ",
+          field.number, ", Max entries allowed: ", max_repeat_count_);
+    } else {
+      return internal::ReadPackedFromArray(buf, buf_size, field.type,
+                                           field.number, dtype_, stride,
+                                           &next_repeat_index_, datap_);
+    }
+  }
+
+  // Fills in any missing values in the output array with defaults.
+  // Dispatches to the appropriately typed field default based on the
+  // runtime type tag.
+  Status FillWithDefaults() {
+    switch (dtype_) {
+      case DataType::DT_FLOAT:
+        return FillDefault<float>();
+      case DataType::DT_DOUBLE:
+        return FillDefault<double>();
+      case DataType::DT_INT32:
+        return FillDefault<int32>();
+      case DataType::DT_UINT8:
+        return FillDefault<uint8>();
+      case DataType::DT_INT8:
+        return FillDefault<int8>();
+      case DataType::DT_STRING:
+        return FillDefault<string>();
+      case DataType::DT_INT64:
+        return FillDefault<int64>();
+      case DataType::DT_BOOL:
+        return FillDefault<bool>();
+      default:
+        // There are many tensorflow dtypes not handled here, but they
+        // should not come up unless type casting is added to the Op.
+        // Chaining with tf.cast() should do the right thing until then.
+        return errors::DataLoss(
+            "Failed filling defaults in unknown tf::DataType");
+    }
+  }
+
+ private:
+  // Fills empty values in the dense representation with a
+  // default value. This uses next_repeat_index_ which counts the number
+  // of parsed values for the field.
+  template <class T>
+  Status FillDefault() {
+    for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
+      reinterpret_cast<T*>(datap_)[i] = T();
+    }
+    return Status::OK();
+  }
+
+  int32 next_repeat_index_ = 0;
+
+  // This is a pointer to data_[message_index_].
+  // There is no bounds checking at this level: we computed the max
+  // repeat size for each field in CountCollector and use the same
+  // code to traverse it here, so we are guaranteed not to be called
+  // for more items than we have allocated space.
+  void* const datap_ = nullptr;
+
+  const DataType dtype_ = DataType::DT_INVALID;
+  const int max_repeat_count_ = 0;
+};
+
+class DecodeProtoOp : public OpKernel {
+ public:
+  explicit DecodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    std::vector<string> field_names;
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names));
+    std::vector<DataType> output_types;
+    OP_REQUIRES_OK(context, context->GetAttr("output_types", &output_types));
+    OP_REQUIRES(
+        context, field_names.size() == output_types.size(),
+        errors::InvalidArgument("field_names and output_types attributes must "
+                                "have the same length"));
+
+    // Gather the field descriptors and check that requested output types match.
+
+    int field_index = 0;
+    std::vector<const FieldDescriptor*> field_descs;
+    for (const string& name : field_names) {
+      auto fd = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, fd != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+      OP_REQUIRES(context,
+                  CheckOutputType(fd->type(), output_types[field_index]),
+                  // Many TensorFlow types don't have corresponding proto types
+                  // and the user will get an error if they are requested. It
+                  // would be nice to allow conversions here, but tf.cast
+                  // already exists so we don't duplicate the functionality.
+                  // Known unhandled types:
+                  //   DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+                  //   DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+                  errors::InvalidArgument("Unexpected output type for ",
+                                          fd->full_name(), ": ", fd->cpp_type(),
+                                          " to ", output_types[field_index]));
+
+      field_index++;
+      field_descs.push_back(fd);
+    }
+
+    // Internally we want the field_descs sorted by their number on the wire.
+    // But the output tensors are allocated in the order given by the caller.
+    // Build a mapping i->j, where field_descs[i] corresponds to outputs[j].
+    std::vector<int> output_indices;
+    output_indices.reserve(field_names.size());
+    for (int i = 0; i < field_names.size(); i++) {
+      output_indices.push_back(i);
+    }
+    std::sort(output_indices.begin(), output_indices.end(),
+              [field_descs](int a, int b) {
+                return field_descs[a]->number() < field_descs[b]->number();
+              });
+
+    // Now store the fields in sorted order.
+    for (int i = 0; i < field_names.size(); i++) {
+      fields_.push_back(MakeUnique<FieldInfo>(field_descs[output_indices[i]],
+                                              output_indices[i]));
+    }
+
+    message_prototype_ = message_factory_.GetPrototype(message_desc);
+    OP_REQUIRES(context, message_prototype_ != nullptr,
+                errors::InvalidArgument("Couldn't get prototype message: ",
+                                        message_desc->full_name()));
+    string format;
+    OP_REQUIRES_OK(context, context->GetAttr("message_format", &format));
+    OP_REQUIRES(
+        context, format == "binary" || format == "text",
+        errors::InvalidArgument("format must be one of binary or text"));
+    is_binary_ = format == "binary";
+
+    // Enable the initial protobuf sanitizer, which is much
+    // more expensive than the decoder.
+    // TODO(nix): Remove this once the fast decoder
+    // has passed security review.
+    OP_REQUIRES_OK(context, context->GetAttr("sanitize", &sanitize_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& buf_tensor = ctx->input(0);
+    int message_count = buf_tensor.NumElements();
+    OP_REQUIRES(ctx, message_count >= 1,
+                errors::InvalidArgument(
+                    "Bufs argument must contain at least one value"));
+
+    int field_count = fields_.size();
+
+    // Save the argument shape for later, then flatten the input
+    // Tensor since we are working componentwise. We will restore
+    // the same shape in the returned Tensor.
+    const TensorShape& shape_prefix = buf_tensor.shape();
+
+    TensorShape sizes_shape = shape_prefix;
+    sizes_shape.AddDim(field_count);
+    Tensor* sizes_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, sizes_shape, &sizes_tensor));
+
+    // This is used to allocate binary bufs if used. It serves only
+    // to define memory ownership.
+    std::vector<string> tmp_binary_bufs(message_count);
+
+    // These are the actual buffers to use, which may be in tmp_binary_bufs
+    // or may be pointers into the buf_tensor. Either way they are not owned
+    // here.
+    std::vector<const string*> bufs;
+
+    if (is_binary_ && !sanitize_) {
+      // Fast path.
+      for (int mi = 0; mi < message_count; ++mi) {
+        const string* buf = &buf_tensor.flat<string>()(mi);
+        bufs.push_back(buf);
+      }
+    } else {
+      // We will have to allocate a copy, either to convert from text to
+      // binary or to sanitize a binary proto.
+      for (int mi = 0; mi < message_count; ++mi) {
+        ReserializeMessage(ctx, buf_tensor.flat<string>()(mi),
+                           &tmp_binary_bufs[mi]);
+        if (!ctx->status().ok()) {
+          return;
+        }
+        bufs.push_back(&tmp_binary_bufs[mi]);
+      }
+    }
+
+    // Walk through all the strings in the input tensor, counting
+    // the number of fields in each.
+    // We can't allocate our actual output Tensor until we know the
+    // maximum repeat count, so we do a first pass through the serialized
+    // proto just counting fields.
+    // We always allocate at least one value so that optional fields
+    // are populated with default values - this avoids a TF
+    // conditional when handling the output data.
+    // The caller can distinguish between real data and defaults
+    // using the repeat count matrix that is returned by decode_proto.
+    std::vector<int32> max_sizes(field_count, 1);
+    for (int mi = 0; mi < message_count; ++mi) {
+      CountFields(ctx, mi, *bufs[mi], sizes_tensor, &max_sizes);
+      if (!ctx->status().ok()) {
+        return;
+      }
+    }
+
+    // Allocate the output tensors now that we've seen the max size.
+    // TODO(nix): Use allocate_output_or_forward_input for the largest
+    //   output tensor. This can avoid one large allocation by re-using
+    //   the memory of the input tensor.
+    std::vector<Tensor*> outputs(field_count);
+    for (int fi = 0; fi < field_count; ++fi) {
+      TensorShape flat_shape = {static_cast<int64>(message_count),
+                                max_sizes[fi]};
+      TensorShape out_shape = shape_prefix;
+      out_shape.AddDim(max_sizes[fi]);
+
+      // Surprisingly we don't specify the types from the output_types
+      // attribute: that is done for us based on the Op declaration:
+      //  REGISTER_OP(...)
+      //    .Attr("output_types: list(type) >= 0")
+      //    .Output("values: output_types")
+      OP_REQUIRES_OK(ctx,
+                     // ctx->allocate_output(output_indices_[fi] + 1,
+                     ctx->allocate_output(fields_[fi]->output_index + 1,
+                                          out_shape, &outputs[fi]));
+    }
+
+    // Make the second pass through the serialized proto, decoding
+    // into preallocated tensors.
+    AccumulateFields(ctx, bufs, outputs);
+  }
+
+ private:
+  // Copy a serialized message to binary, e.g. to handle text proto inputs.
+  void ReserializeMessage(OpKernelContext* ctx, const string& buf,
+                          string* binary_buf) {
+    // Handle text protos by translating them to binary.
+    std::unique_ptr<Message> message(message_prototype_->New());
+    OP_REQUIRES(ctx, message, errors::DataLoss("Initializing message failed"));
+
+    if (is_binary_) {
+      // If we get here we are sanitizing the input protobuf by parsing
+      // and reserializing it with a trusted (but very slow) library.
+      OP_REQUIRES(ctx, message->ParseFromString(buf),
+                  errors::DataLoss("Unable to parse binary protobuf"));
+    } else {
+      OP_REQUIRES(ctx, TextFormat::ParseFromString(buf, message.get()),
+                  errors::DataLoss("Unable to parse text protobuf"));
+    }
+
+    OP_REQUIRES(ctx, message->SerializeToString(binary_buf),
+                errors::DataLoss("Unable to reserialize text proto as binary"));
+  }
+
+  // Count the number of occurrences of each requested field in a message batch.
+  void CountFields(OpKernelContext* ctx, int message_index, const string& buf,
+                   Tensor* sizes_tensor, std::vector<int32>* max_sizes) {
+    int field_count = fields_.size();
+
+    CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                           buf.size());
+
+    std::vector<int32> field_sizes(field_count, 0);
+    std::vector<CountCollector> counters;
+    counters.reserve(field_count);
+    for (int i = 0; i < field_count; i++) {
+      counters.emplace_back(&field_sizes[i]);
+    }
+
+    Status st = Collect(&input, &counters);
+    if (st.ok() && !input.ConsumedEntireMessage()) {
+      st = errors::DataLoss("CountFields: Failed to consume entire buffer");
+    }
+    if (kFailOnDecodeError) {
+      OP_REQUIRES_OK(ctx, st);  // NOLINT
+    }
+    if (!st.ok()) {
+      // This code suppresses the corrupt proto, treating it as empty
+      // to avoid crashing the process.
+      LOG(WARNING) << "Proto counting error for message type " << message_type_
+                   << ": " << st;
+
+      for (int fi = 0; fi < field_count; fi++) {
+        field_sizes[fi] = 0;
+      }
+      // Finished decoding this message.
+      return;
+    }
+
+    // Update the size tensor and max repeat size for each field.
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+    for (int fi = 0; fi < field_count; fi++) {
+      int32 size = field_sizes[fi];
+      sizes(message_index, fields_[fi]->output_index) = size;
+      if ((*max_sizes)[fi] < size) {
+        (*max_sizes)[fi] = size;
+      }
+    }
+  }
+
+  // Parse fields from a serialized message into preallocated tensors.
+  void AccumulateFields(OpKernelContext* ctx,
+                        const std::vector<const string*>& bufs,
+                        std::vector<Tensor*> outputs) {
+    struct TensorInfo {
+      explicit TensorInfo(Tensor* tensor) {
+        // Note that we can decode only max_repeat_count values before overflow.
+        // No other bounds checking is done for repeated fields. For
+        // optional fields there is a check to make sure that only the last
+        // value on the wire appears in the output tensor.
+        dtype = tensor->dtype();
+        last_dim_size = tensor->dim_size(tensor->dims() - 1);
+
+        if (dtype != DT_STRING) {
+          const int element_size = DataTypeSize(dtype);
+          CHECK_GT(element_size, 0);
+          stride = last_dim_size * element_size;
+
+          const int64 flatshape[1] = {tensor->NumElements() * element_size};
+          data = tensor->bit_casted_shaped<uint8, 1>(flatshape).data();
+        } else {
+          // DataTypeSize() returns 0 for string types.
+          stride = last_dim_size * sizeof(string);
+          data = reinterpret_cast<uint8*>(tensor->flat<string>().data());
+        }
+      }
+
+      DataType dtype;
+      int last_dim_size;
+      int stride;
+      uint8* data;
+    };
+
+    int field_count = fields_.size();
+
+    std::vector<TensorInfo> tensors;
+    tensors.reserve(field_count);
+    for (int fi = 0; fi < field_count; fi++) {
+      tensors.emplace_back(outputs[fi]);
+    }
+
+    for (int message_index = 0; message_index < bufs.size(); ++message_index) {
+      const string& buf = *bufs[message_index];
+
+      std::vector<DenseCollector> collectors;
+      collectors.reserve(field_count);
+      for (const TensorInfo& info : tensors) {
+        collectors.emplace_back(info.data + message_index * info.stride,
+                                info.dtype, info.last_dim_size);
+      }
+
+      // Fill in output tensors from the wire.
+      CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                             buf.size());
+      Status st = Collect(&input, &collectors);
+      if (st.ok() && !input.ConsumedEntireMessage()) {
+        st = errors::DataLoss(
+            "AccumulateFields: Failed to consume entire buffer");
+      }
+      if (kFailOnDecodeError) {
+        OP_REQUIRES_OK(ctx, st);  // NOLINT
+      }
+      if (!st.ok()) {
+        // This code suppresses the corrupt proto, treating it as empty
+        // to avoid crashing training.
+        LOG(WARNING) << "Proto counting error for message type "
+                     << message_type_ << ": " << st;
+      }
+
+      // Fill the remainder of the dense outputs with default values.
+      for (auto& collector : collectors) {
+        OP_REQUIRES_OK(ctx, collector.FillWithDefaults());
+      }
+    }
+  }
+
+  // Look up the FieldDescriptor for a particular field number.
+  bool LookupField(int field_number, int* field_index) {
+    // Look up the FieldDescriptor using linear search.
+    // TODO(nix): this could be sped up with binary search, but we are
+    // already way off the fastpath at this point. If you see a hotspot
+    // here, somebody is sending you very inefficient protos.
+    for (int fi = fields_.size() - 1; fi >= 0; fi--) {
+      if (field_number == fields_[fi]->number) {
+        *field_index = fi;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Traverses a serialized protobuf, dispatching values to the collectors.
+  template <class CollectorClass>
+  Status Collect(CodedInputStream* input,
+                 std::vector<CollectorClass>* collectors) {
+    int last_good_field_index = -1;
+    bool fields_disordered = false;
+    int prev_field_number = -1;
+    int field_number = -1;
+    int last_good_field_number = -1;
+    int next_good_field_number = fields_[0]->number;
+
+    // The 'tag' variable should always be treated as tainted.
+    for (uint32 tag = input->ReadTag();
+         tag != 0 && WireFormatLite::GetTagWireType(tag) !=
+                         WireFormatLite::WIRETYPE_END_GROUP;
+         tag = input->ReadTag(), prev_field_number = field_number) {
+      field_number = WireFormatLite::GetTagFieldNumber(tag);
+      const FieldInfo* field = nullptr;
+
+      // This takes advantage of the sorted field numbers in most serialized
+      // protos: it tries the next expected field first rather than doing
+      // a lookup by field number.
+      // TODO(nix): haberman@ suggests a hybrid approach with a lookup table
+      // for small field numbers and a hash table for larger ones. This would
+      // be a simpler approach that should offer comparable speed in most
+      // cases.
+      if (field_number == last_good_field_number) {
+        field = fields_[last_good_field_index].get();
+      } else {
+        if (field_number < prev_field_number) {
+          fields_disordered = true;
+        }
+
+        // If fields are out of order, fall back to slow lookup.
+        if (fields_disordered) {
+          int field_index;
+          if (LookupField(field_number, &field_index)) {
+            field = fields_[field_index].get();
+            last_good_field_index = field_index;
+          }
+        } else {
+          // If we see a field that is past the next field we want,
+          // it was empty. Look for the one after that.
+          // Repeat until we run out of fields that we care about.
+          while (field_number >= next_good_field_number) {
+            if (field_number == next_good_field_number) {
+              last_good_field_number = field_number;
+              field = fields_[last_good_field_index + 1].get();
+            }
+
+            // Start looking for the field after the current one.
+            ++last_good_field_index;
+            if (last_good_field_index < fields_.size() - 1) {
+              next_good_field_number =
+                  fields_[last_good_field_index + 1]->number;
+            } else {
+              // Saw something past the last field we care about.
+              // Continue parsing the message just in case there
+              // are disordered fields later, but any remaining
+              // ordered fields will have no effect.
+              next_good_field_number = INT_MAX;
+            }
+          }
+        }
+      }
+
+      if (!field) {
+        // Unknown and unrequested fields are skipped.
+        if (!WireFormatLite::SkipField(input, tag)) {
+          return errors::DataLoss("Failed skipping unrequested field");
+        }
+        continue;
+      }
+
+      Status st = CollectField(*field, WireFormatLite::GetTagWireType(tag),
+                               input, &(*collectors)[last_good_field_index]);
+      if (!st.ok()) {
+        return st;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Collects values for a single field.
+  template <class CollectorClass>
+  Status CollectField(const FieldInfo& field,
+                      WireFormatLite::WireType wire_type,
+                      CodedInputStream* input, CollectorClass* collector) {
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    WireFormatLite::WireType schema_wire_type =
+        WireFormatLite::WireTypeForFieldType(field.type);
+
+    // Handle packed repeated fields. SkipField would skip the
+    // whole length-delimited blob without letting us count the
+    // values, so we have to scan them ourselves.
+    if (wire_type == WireFormatLite::WIRETYPE_LENGTH_DELIMITED &&
+        schema_wire_type != WireFormatLite::WIRETYPE_LENGTH_DELIMITED) {
+      // Handle packed repeated primitives.
+      int length;
+      if (!input->ReadVarintSizeAsInt(&length)) {
+        return errors::DataLoss("CollectField: Failed reading packed size");
+      }
+      Status st = collector->ReadPackedValues(input, field, length);
+      if (!st.ok()) {
+        return st;
+      }
+      return Status::OK();
+    }
+
+    // Read ordinary values, including strings, bytes, and messages.
+    if (wire_type != schema_wire_type) {
+      if (!WireFormatLite::SkipField(
+              input, WireFormatLite::MakeTag(field.number, wire_type))) {
+        return errors::DataLoss(
+            "CollectField: Failed skipping malformed field");
+      }
+      return Status::OK();
+    }
+    return collector->ReadValue(input, field);
+  }
+
+  string message_type_;
+  // Note that fields are sorted by increasing field number,
+  // which is not in general the order given by the user-specified
+  // field_names and output_types Op attributes.
+  std::vector<std::unique_ptr<const FieldInfo>> fields_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+  DynamicMessageFactory message_factory_;
+  const Message* message_prototype_;
+
+  // True if decoding binary format, false if decoding text format.
+  bool is_binary_;
+
+  // True if the protos should be sanitized before parsing.
+  // Enables the initial protobuf sanitizer, which is much
+  // more expensive than the decoder. The flag defaults to true
+  // but can be set to false for trusted sources.
+  // TODO(nix): flip the default to false when the fast decoder
+  // has passed security review.
+  bool sanitize_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DecodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeProtoV2").Device(DEVICE_CPU),
+                        DecodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
new file mode 100644
index 0000000000..3b02ae52a2
--- /dev/null
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -0,0 +1,591 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// EncodeProto is a TensorFlow Op which serializes tensors into
+// arbitrary protobufs.
+//
+// See the docstring in ../ops/encode_proto_op.cc for usage of the op.
+//
+// This implementation writes the serialized format using a handful of
+// calls from the WireFormatLite API.
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedOutputStream;
+using ::tensorflow::protobuf::io::StringOutputStream;
+
+// Computes the total serialized size for a packed repeated field.
+// For fixed-size types this can just multiply, but for variable-sized
+// types it has to iterate through the values in the tensor.
+template <WireFormatLite::FieldType FieldType, typename TensorT>
+size_t TotalPackedSize(const Tensor& input, int message_index, int size);
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_DOUBLE, double>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kDoubleSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, double>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, float>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT64, int64>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT32, int32>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED64, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int32>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_BOOL, bool>(const Tensor& input,
+                                                        int message_index,
+                                                        int size) {
+  return size * WireFormatLite::kBoolSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_ENUM, int32>(const Tensor& input,
+                                                         int message_index,
+                                                         int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size +=
+        WireFormatLite::EnumSize(input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED32, int32>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED64, int64>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+// Writes a possibly repeated primitive field.
+// TensorFlow does not have unsigned types, so we decode them to signed and
+// encode them back to unsigned.
+template <typename TensorT, typename ProtoT,
+          WireFormatLite::FieldType FieldType,
+          void Writer(ProtoT, CodedOutputStream*)>
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto wire_type = WireFormatLite::WireTypeForFieldType(
+      WireFormatLite::FieldType(field_desc.type()));
+
+  auto input_t = input.flat_inner_dims<TensorT>();
+  if (field_desc.options().packed()) {
+    // Write the tag for the packed field.
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
+
+    // Write the total packed length.
+    size_t data_size =
+        TotalPackedSize<FieldType, TensorT>(input, message_index, size);
+    output->WriteVarint32(data_size);
+
+    // Write individual values.
+    for (int64 i = 0; i < size; i++) {
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  } else {
+    for (int64 i = 0; i < size; i++) {
+      WireFormatLite::WriteTag(field_desc.number(), wire_type, output);
+
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  }
+}
+
+// Writes a possibly repeated string, bytes, or message field.
+template <typename T, void Writer(int, const T&, CodedOutputStream*)>
+void WriteVarLenField(const FieldDescriptor& field_desc, const Tensor& input,
+                      int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<T>();
+  for (int64 i = 0; i < size; i++) {
+    const T& value = input_t(static_cast<int64>(message_index), i);
+    // TODO(nix): there doesn't seem to be an inlined version of
+    // WireFormatLite::WriteString or its relatives, which might allow a
+    // small speedup.
+    Writer(field_desc.number(), value, output);
+  }
+}
+
+// Writes a group field.
+// Groups are treated like submessages, but tag-delimited
+// instead of length-delimited. WireFormatLite handles this
+// differently so we code it ourselves.
+void WriteGroup(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<string>();
+  for (int64 i = 0; i < size; i++) {
+    const string& value = input_t(static_cast<int64>(message_index), i);
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_START_GROUP, output);
+    // Note the use of WriteRaw instead of WriteString to skip the length.
+    output->WriteRaw(value.data(), value.size());
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_END_GROUP, output);
+  }
+}
+
+// Writes a (possibly repeated) field into an output stream.
+// It is the caller's responsibility to ensure that the type of
+// the input tensor is compatible with the type of the proto
+// field descriptor, and that (message_index, size-1) is within
+// bounds.
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  DataType tf_type = input.dtype();
+
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return WriteField<double, double, WireFormatLite::TYPE_DOUBLE,
+                        WireFormatLite::WriteDoubleNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FLOAT:
+      switch (tf_type) {
+        case DataType::DT_FLOAT:
+          return WriteField<float, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_DOUBLE:
+          return WriteField<double, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_INT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_INT64,
+                        WireFormatLite::WriteInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_UINT64,
+                        WireFormatLite::WriteUInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_INT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_INT32,
+                        WireFormatLite::WriteInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_FIXED64,
+                        WireFormatLite::WriteFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_BOOL:
+      return WriteField<bool, bool, WireFormatLite::TYPE_BOOL,
+                        WireFormatLite::WriteBoolNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_STRING:
+      return WriteVarLenField<string, WireFormatLite::WriteString>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_GROUP:
+      return WriteGroup(field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_MESSAGE:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_BYTES:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_ENUM:
+      return WriteField<int32, int32, WireFormatLite::TYPE_ENUM,
+                        WireFormatLite::WriteEnumNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SFIXED32,
+                        WireFormatLite::WriteSFixed32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SFIXED64,
+                        WireFormatLite::WriteSFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SINT32,
+                        WireFormatLite::WriteSInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SINT64,
+                        WireFormatLite::WriteSInt64NoTag>(
+          field_desc, input, message_index, size, output);
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// Checks that a Protobuf field is compatible with a TensorFlow datatype.
+// This is separated from WriteField to lift it out of the inner loop.
+bool IsCompatibleType(const FieldDescriptor& field_desc, DataType tf_type) {
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return tf_type == DataType::DT_FLOAT || tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SFIXED64:
+    case WireFormatLite::TYPE_SINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_SFIXED32:
+    case WireFormatLite::TYPE_SINT32:
+      return tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return tf_type == DataType::DT_INT64 || tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_BOOL:
+      return tf_type == DataType::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return tf_type == DataType::DT_STRING;
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return false;
+}
+
+class EncodeProtoOp : public OpKernel {
+ public:
+  explicit EncodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names_));
+
+    // Gather the field descriptors for the given field_names.
+    field_descs_.resize(field_names_.size());
+    for (int i = 0; i < field_names_.size(); i++) {
+      const string& name = field_names_[i];
+      auto field_desc = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, field_desc != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+
+      field_descs_[i] = field_desc;
+    }
+
+    // Build a list of indices into field_descs sorted by increasing
+    // field_number. This will be used to output fields in sorted order,
+    // which is strongly encouraged when serializing protobufs.
+    sorted_field_index_.resize(field_names_.size());
+    // Start with the fields sorted by current index.
+    for (int i = 0; i < field_names_.size(); i++) sorted_field_index_[i] = i;
+    // Then sort the field indices by their proto field number.
+    std::sort(sorted_field_index_.begin(), sorted_field_index_.end(),
+              [this](int a, int b) -> bool {
+                return field_descs_[a]->number() < field_descs_[b]->number();
+              });
+  }
+
+  void Compute(OpKernelContext* cx) override {
+    const Tensor* sizes_tensor;
+    OP_REQUIRES_OK(cx, cx->input("sizes", &sizes_tensor));
+
+    OpInputList values;
+    OP_REQUIRES_OK(cx, cx->input_list("values", &values));
+
+    OP_REQUIRES(cx, field_descs_.size() == values.size(),
+                errors::InvalidArgument(
+                    "Length of inputs list must match field_names"));
+
+    // Check the arguments for consistency.
+    TensorShape common_prefix;
+    int message_count;
+    for (int i = 0; i < field_descs_.size(); i++) {
+      const Tensor& v = values[i];
+
+      // The type of each value tensor must match the corresponding field.
+      OP_REQUIRES(cx, IsCompatibleType(*field_descs_[i], v.dtype()),
+                  errors::InvalidArgument(
+                      "Incompatible type for field " + field_names_[i] +
+                          ".  Saw dtype: ",
+                      DataTypeString(v.dtype()),
+                      " but field type is: ", field_descs_[i]->type_name()));
+
+      // All value tensors must have the same shape prefix (i.e. batch size).
+      TensorShape shape_prefix = v.shape();
+      shape_prefix.RemoveDim(shape_prefix.dims() - 1);
+
+      // Do some initialization on the first input value. The rest will
+      // have to match this one.
+      if (i == 0) {
+        OP_REQUIRES(cx, v.dims() >= 1,
+                    errors::InvalidArgument(
+                        "Expected value to be at least a vector, saw shape: ",
+                        v.shape().DebugString()));
+        common_prefix = shape_prefix;
+        message_count = common_prefix.num_elements();
+      } else {
+        OP_REQUIRES(cx, shape_prefix == common_prefix,
+                    errors::InvalidArgument(
+                        "Values must match up to the last dimension"));
+      }
+    }
+
+    TensorShape expected_sizes_shape = common_prefix;
+    expected_sizes_shape.AddDim(field_descs_.size());
+
+    OP_REQUIRES(cx, sizes_tensor->shape() == expected_sizes_shape,
+                errors::InvalidArgument(
+                    "sizes should be batch_size + [len(field_names)].  Saw: ",
+                    sizes_tensor->shape().DebugString(),
+                    " but expected: ", expected_sizes_shape.DebugString()));
+
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+
+    for (int i = 0; i < field_descs_.size(); ++i) {
+      const Tensor& v = values[i];
+      int max_size = v.dim_size(v.dims() - 1);
+
+      // The last dimension of a value tensor must be greater than the
+      // corresponding
+      // size in the sizes tensor.
+      for (int message_index = 0; message_index < message_count;
+           message_index++) {
+        OP_REQUIRES(
+            cx, sizes(message_index, i) <= max_size,
+            errors::InvalidArgument(
+                "Size to write must not be larger than value tensor; but saw: ",
+                sizes(message_index, i), " > ", max_size, " at message ",
+                message_index, " field ", i));
+      }
+    }
+
+    // This pointer is owned by the context.
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(cx, cx->allocate_output(0, common_prefix, &output_tensor));
+
+    auto bufs = output_tensor->flat<string>();
+    for (int message_index = 0; message_index < message_count;
+         message_index++) {
+      // TODO(nix): possibly optimize allocation here by calling
+      //   bufs(message_index).reserve(DEFAULT_BUF_SIZE);
+      StringOutputStream output_string(&bufs(message_index));
+      CodedOutputStream out(&output_string);
+      // Write fields in ascending field_number order.
+      for (int i : sorted_field_index_) {
+        auto& field_desc = *field_descs_[i];
+        const Tensor& v = values[i];
+        int size = sizes(message_index, i);
+        if (!size) continue;
+        WriteField(field_desc, v, message_index, size, &out);
+      }
+    }
+  }
+
+ private:
+  std::vector<string> field_names_;
+  std::vector<const FieldDescriptor*> field_descs_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+
+  // Contains indices into field_names_, sorted by field number since
+  // that's the order of writing.
+  std::vector<int> sorted_field_index_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EncodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("EncodeProto").Device(DEVICE_CPU), EncodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/rpc_op.cc b/tensorflow/core/kernels/rpc_op.cc
new file mode 100644
index 0000000000..2447ef5040
--- /dev/null
+++ b/tensorflow/core/kernels/rpc_op.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// RpcOp is a TensorFlow op that sends and receives arbitrary messages.
+//
+// See docs in ../ops/rpc_op.cc.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+class RpcOp : public AsyncOpKernel {
+ public:
+  explicit RpcOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("protocol", &protocol_));
+    OP_REQUIRES(context, !protocol_.empty(),
+                errors::InvalidArgument("protocol must be non-empty."));
+    bool fail_fast;
+    OP_REQUIRES_OK(context, context->GetAttr("fail_fast", &fail_fast));
+    int64 timeout_in_ms;
+    OP_REQUIRES_OK(context, context->GetAttr("timeout_in_ms", &timeout_in_ms));
+
+    RPCFactoryRegistry::RPCFactoryFn* rpc_factory_fn =
+        RPCFactoryRegistry::Global()->Get(protocol_);
+    OP_REQUIRES(context, rpc_factory_fn != nullptr,
+                errors::InvalidArgument("The protocol ", protocol_,
+                                        " was not recognized."));
+
+    rpc_factory_.reset((*rpc_factory_fn)(context, fail_fast, timeout_in_ms));
+  }
+
+  ~RpcOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor& address_t = ctx->input(0);
+    const Tensor& method_t = ctx->input(1);
+    const Tensor& request_t = ctx->input(2);
+
+    OP_REQUIRES_ASYNC(
+        ctx, address_t.dims() == 0 || address_t.dims() == 1,
+        errors::InvalidArgument("address must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, method_t.dims() == 0 || method_t.dims() == 1,
+        errors::InvalidArgument("method must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, request_t.dims() == 0 || request_t.dims() == 1,
+        errors::InvalidArgument("request must be a scalar or vector."), done);
+
+    TensorShape output_shape({});
+    for (const Tensor& t : {address_t, method_t, request_t}) {
+      if (t.dims() == 1) {
+        OP_REQUIRES_ASYNC(
+            ctx,
+            output_shape.dims() == 0 ||
+                output_shape.dim_size(0) == t.dim_size(0),
+            errors::InvalidArgument(
+                "Input vector shapes don't match: ", output_shape.DebugString(),
+                " vs. ", t.shape().DebugString()),
+            done);
+        output_shape = t.shape();
+      }
+    }
+
+    Tensor* response_t;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, ctx->allocate_output(0, output_shape, &response_t), done);
+
+    const bool try_rpc = (ctx->num_outputs() > 1);
+
+    Tensor* status_code_t = nullptr;
+    Tensor* status_message_t = nullptr;
+    if (try_rpc) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(1, output_shape, &status_code_t), done);
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(2, output_shape, &status_message_t), done);
+    }
+
+    if (request_t.NumElements() == 0) {
+      // Special case, we finished early!
+      done();
+      return;
+    }
+
+    int64 num_elements = output_shape.num_elements();
+
+    rpc_factory_->Call(ctx, num_elements, address_t, method_t, request_t,
+                       try_rpc, response_t, status_code_t, status_message_t,
+                       std::move(done));
+  }
+
+ private:
+  string protocol_;
+  std::unique_ptr<RPCFactory> rpc_factory_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RpcOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Rpc").Device(DEVICE_CPU), RpcOp);
+REGISTER_KERNEL_BUILDER(Name("TryRpc").Device(DEVICE_CPU), RpcOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/decode_proto_ops.cc b/tensorflow/core/ops/decode_proto_ops.cc
new file mode 100644
index 0000000000..3f6fb2f582
--- /dev/null
+++ b/tensorflow/core/ops/decode_proto_ops.cc
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("DecodeProtoV2")
+    .Input("bytes: string")
+    .Attr("message_type: string")
+    .Attr("field_names: list(string)")
+    .Attr("output_types: list(type) >= 0")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("message_format: string = 'binary'")
+    .Attr("sanitize: bool = false")
+    .Output("sizes: int32")
+    .Output("values: output_types")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input = c->input(0);
+
+      std::vector<tensorflow::DataType> output_types;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_types", &output_types));
+
+      ShapeHandle sizes;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(input, c->Vector(output_types.size()), &sizes));
+      c->set_output(0, sizes);
+
+      // TODO(nix): to do the best possible job of shape inference, we
+      // should examine the proto descriptors here in order to set shape
+      // indices to 1 instead of unknown for optional or required fields.
+      // Any general-purpose code will have to handle the unknown case,
+      // but there might be XLA code that could be sped up with the additional
+      // knowledge.
+      for (int i = 0; i < output_types.size(); ++i) {
+        ShapeHandle values;
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(input, c->Vector(c->UnknownDim()), &values));
+        c->set_output(i + 1, values);
+      }
+
+      return Status::OK();
+    });
+
+// TODO(nix): Consider adding an additional input argument that truncates
+// repeated fields to a maximum count. For now this could be done by passing
+// the output through tf.slice.
+
+// TODO(nix): define missing value behavior.
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/encode_proto_ops.cc b/tensorflow/core/ops/encode_proto_ops.cc
new file mode 100644
index 0000000000..f5ec3056e3
--- /dev/null
+++ b/tensorflow/core/ops/encode_proto_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("EncodeProto")
+    .Input("sizes: int32")
+    .Input("values: Tinput_types")
+    .Attr("field_names: list(string)")
+    .Attr("message_type: string")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("Tinput_types: list(type)")
+    .Output("bytes: string")
+    .SetShapeFn([](InferenceContext* c) {
+      int first_field_index = 1;
+      int num_fields = c->num_inputs() - 1;
+
+      ShapeHandle output;
+      for (int i = num_fields - 1; i >= 0; --i) {
+        ShapeHandle input = c->input(first_field_index + i);
+        TF_RETURN_IF_ERROR(c->WithRankAtLeast(input, 2, &input));
+        ShapeHandle inner;
+        TF_RETURN_IF_ERROR(c->Subshape(input, 0, -1, &inner));
+        TF_RETURN_IF_ERROR(c->Merge(inner, output, &output));
+      }
+
+      c->set_output(0, output);
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/rpc_ops.cc b/tensorflow/core/ops/rpc_ops.cc
new file mode 100644
index 0000000000..72fda5e6eb
--- /dev/null
+++ b/tensorflow/core/ops/rpc_ops.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::DimensionHandle;
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+Status RpcShapeOp(InferenceContext* c, bool try_rpc) {
+  ShapeHandle address;
+  ShapeHandle method;
+  ShapeHandle request;
+  ShapeHandle output;
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &address));
+  if (c->Rank(address) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, address, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &method));
+  if (c->Rank(method) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, method, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &request));
+  if (c->Rank(request) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, request, &output));
+  }
+  if (!c->RankKnown(output)) {
+    output = request;
+  }
+  c->set_output(0, output);  // response
+  if (try_rpc) {
+    c->set_output(1, output);  // status_code
+    c->set_output(2, output);  // status_message
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("Rpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/false);
+    });
+
+REGISTER_OP("TryRpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .Output("status_code: int32")
+    .Output("status_message: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/true);
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
new file mode 100644
index 0000000000..ade14ed162
--- /dev/null
+++ b/tensorflow/core/util/proto/BUILD
@@ -0,0 +1,62 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "decode",
+    hdrs = ["decode.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptors",
+    srcs = ["descriptors.cc"],
+    hdrs = ["descriptors.h"],
+    deps = [
+        ":descriptor_pool_registry",
+        ":local_descriptor_pool_registration",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptor_pool_registry",
+    srcs = ["descriptor_pool_registry.cc"],
+    hdrs = ["descriptor_pool_registry.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "descriptor_pool_registry_test",
+    srcs = ["descriptor_pool_registry_test.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Depending on this target adds support for using the special
+# value "local://" (or "") for descriptor source, in which case
+# descriptors linked into the code will be searched.
+cc_library(
+    name = "local_descriptor_pool_registration",
+    srcs = ["local_descriptor_pool_registration.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
new file mode 100644
index 0000000000..74634a356a
--- /dev/null
+++ b/tensorflow/core/util/proto/decode.h
@@ -0,0 +1,592 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Inline functions for parsing the protocol buffers wire format.
+//
+// These functions have been optimized at the expense of safety.
+// They are broken out into a separate file for readability but are
+// not intended for use by clients other than the decode_proto op.
+//
+// The calling code in the decode_proto op does some fairly
+// complicated things to ensure that this code is called
+// safely. Changes to this code should be thoroughly fuzz tested.
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace internal {
+
+using tensorflow::protobuf::internal::WireFormatLite;
+using tensorflow::protobuf::io::CodedInputStream;
+using tensorflow::protobuf::io::CodedOutputStream;
+using tensorflow::protobuf::io::StringOutputStream;
+
+// Converts an uint64 to an int64 without loss of information.
+// Unsigned values greater than INT64_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int64 WrapUnsignedAsSigned64(uint64 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT64_MAX) {
+    return static_cast<int64>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT64_MIN) {
+    return static_cast<int64>(unsigned_value - INT64_MIN) + INT64_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Converts an uint32 to an int32 without loss of information.
+// Unsigned values greater than INT_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT_MAX) {
+    return static_cast<int32>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT_MIN) {
+    return static_cast<int32>(unsigned_value - INT_MIN) + INT_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value);
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+// This is slightly less efficient than the private version in
+// coded_stream.cc but we duplicate less code by calling
+// the 64 bit version instead of copying the code.
+inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
+                                          uint32* value) {
+  uint64 tmp;
+  const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
+  *value = tmp & 0xffffffff;
+  return buf;
+}
+
+// Reads a single proto field value from a byte array into an array.
+// The array is part of a Tensor that was allocated by the caller
+// with type TensorType, while DeclaredType is the proto field type.
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+const uint8* ReadFromArray(const uint8* buf, TensorType* value);
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int32>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int64>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_uint64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_uint64,
+                                               WireFormatLite::TYPE_FIXED64>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SFIXED32>(
+    const uint8* buf, int32* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<int32,
+                                                WireFormatLite::TYPE_SFIXED32>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SFIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_int64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_int64,
+                                               WireFormatLite::TYPE_SFIXED64>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<float, WireFormatLite::TYPE_FLOAT>(
+    const uint8* buf, float* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<float,
+                                                WireFormatLite::TYPE_FLOAT>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
+    const uint8* buf, double* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<double,
+                                                WireFormatLite::TYPE_DOUBLE>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
+    const uint8* buf, bool* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = temp != 0;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
+    const uint8* buf, int* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int>(temp);
+  return buf;
+}
+
+// Reads packed values from an array.
+// Stride is set to 1 for repeated fields, and 0 for non-repeated fields
+// (where any value overwrites previous values).
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+inline int ReadPackedPrimitives(const void* bufp, const size_t len,
+                                const int index, const int stride,
+                                void* datap) {
+  const uint8* buf = reinterpret_cast<const uint8*>(bufp);
+  const uint8* bound = buf + len;
+  TensorType* data = reinterpret_cast<TensorType*>(datap) + index;
+  int count;
+
+  // This could overrun the bound by stride-1. This is defended
+  // against in the caller, where it ensures that the input buffer
+  // contains complete values.
+  for (count = 0; buf < bound; count += stride) {
+    buf = ReadFromArray<TensorType, DeclaredType>(buf, data + count);
+  }
+  return count;
+}
+
+// Reads a primitive value field from a serialized proto.
+// The value is parsed from the serialized format, then static_cast
+// to the desired type for TensorFlow and stored.
+template <class ValueType, class TensorType,
+          enum WireFormatLite::FieldType DeclaredType>
+inline Status ReadPrimitive(CodedInputStream* input, int index, void* data) {
+  ValueType v;
+  if (!WireFormatLite::ReadPrimitive<ValueType, DeclaredType>(input, &v)) {
+    return errors::DataLoss("Failed reading primitive");
+  }
+
+  reinterpret_cast<TensorType*>(data)[index] = v;
+  return Status::OK();
+}
+
+// Reads a string, submessage, or other variable-length field from a
+// serialized proto.
+// May read all or part of a repeated field.
+inline Status ReadBytes(CodedInputStream* input, int index, void* datap) {
+  string* data = reinterpret_cast<string*>(datap) + index;
+  if (!WireFormatLite::ReadBytes(input, data)) {
+    return errors::DataLoss("Failed reading bytes");
+  }
+  return Status::OK();
+}
+
+// Reads a tag-delimited field (TYPE_GROUP) from a serialized proto,
+// as a bytestring.
+inline Status ReadGroupBytes(CodedInputStream* input, int field_number,
+                             int index, void* datap) {
+  // WireFormatLite::SkipField has an option to emit the
+  // skipped bytes to an output stream. We could do better by implementing our
+  // own scanner but this is simpler for now.
+  // TODO(nix): there is a faster way to grab TYPE_GROUP bytes by relying
+  // on input->IsFlat() == true and using input->GetDirectBufferPointer()
+  // with input->CurrentPosition().
+  string* data = reinterpret_cast<string*>(datap) + index;
+  StringOutputStream string_stream(data);
+  CodedOutputStream out(&string_stream);
+  if (!WireFormatLite::SkipField(
+          input,
+          WireFormatLite::MakeTag(field_number,
+                                  WireFormatLite::WIRETYPE_START_GROUP),
+          &out)) {
+    return errors::DataLoss("Failed reading group");
+  }
+  return Status::OK();
+}
+
+// Reads a single field value from a CodedInputStream into a tensor.
+inline Status ReadValue(CodedInputStream* input,
+                        WireFormatLite::FieldType field_type, int field_number,
+                        DataType dtype, int index, void* datap) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return ReadPrimitive<double, double, WireFormatLite::TYPE_DOUBLE>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FLOAT:
+      if (dtype == DataType::DT_FLOAT) {
+        return ReadPrimitive<float, float, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_DOUBLE) {
+        return ReadPrimitive<float, double, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FLOAT");
+    case WireFormatLite::TYPE_INT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_INT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_UINT64:
+      return ReadPrimitive<protobuf_uint64, int64, WireFormatLite::TYPE_UINT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_INT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_INT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FIXED64:
+      return ReadPrimitive<protobuf_uint64, int64,
+                           WireFormatLite::TYPE_FIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      return ReadPrimitive<bool, bool, WireFormatLite::TYPE_BOOL>(input, index,
+                                                                  datap);
+    case WireFormatLite::TYPE_STRING:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_GROUP:
+      return ReadGroupBytes(input, field_number, index, datap);
+    case WireFormatLite::TYPE_MESSAGE:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_BYTES:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_ENUM>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SFIXED32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED64:
+      return ReadPrimitive<protobuf_int64, int64,
+                           WireFormatLite::TYPE_SFIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_SINT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SINT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SINT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_SINT64>(
+          input, index, datap);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads and stores a length-delimited list of values.
+inline Status ReadPackedFromArray(const void* buf, size_t buf_size,
+                                  const WireFormatLite::FieldType field_type,
+                                  const int field_number, const DataType dtype,
+                                  const int stride, int* index, void* data) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      *index += ReadPackedPrimitives<double, WireFormatLite::TYPE_DOUBLE>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FLOAT:
+      *index += ReadPackedPrimitives<float, WireFormatLite::TYPE_FLOAT>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_INT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_UINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_INT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      *index += ReadPackedPrimitives<bool, WireFormatLite::TYPE_BOOL>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return errors::DataLoss("Non-primitive type encountered as packed");
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_ENUM>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_SFIXED32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SFIXED32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SFIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SFIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SINT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads a varint from the given buffer, write it to *value, and return the
+// new buffer pointer.
+// This was copied from coded_stream.cc where it is private.
+// Important: This routine may read as much as kMaxVarintBytes from
+// the buffer. It is the caller's responsibility to make sure that there is
+// enough space in the buffer.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value) {
+  const uint8* ptr = buffer;
+  uint32 b;
+
+  // Splitting into 32-bit pieces gives better performance on 32-bit
+  // processors.
+  uint32 part0 = 0, part1 = 0, part2 = 0;
+
+  b = *(ptr++);
+  part0 = b;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80;
+  b = *(ptr++);
+  part0 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 7;
+  b = *(ptr++);
+  part0 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 14;
+  b = *(ptr++);
+  part0 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 21;
+  b = *(ptr++);
+  part1 = b;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80;
+  b = *(ptr++);
+  part1 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 7;
+  b = *(ptr++);
+  part1 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 14;
+  b = *(ptr++);
+  part1 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 21;
+  b = *(ptr++);
+  part2 = b;
+  if (!(b & 0x80)) goto done;
+  part2 -= 0x80;
+  b = *(ptr++);
+  part2 += b << 7;
+  if (!(b & 0x80)) goto done;
+  // "part2 -= 0x80 << 7" is irrelevant because (0x80 << 7) << 56 is 0.
+
+  // We have overrun the maximum size of a varint (10 bytes).  Assume
+  // the data is corrupt.
+  *ok = false;
+  return ptr;
+
+done:
+  *ok = true;
+  *value = (static_cast<uint64>(part0)) | (static_cast<uint64>(part1) << 28) |
+           (static_cast<uint64>(part2) << 56);
+  return ptr;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.cc b/tensorflow/core/util/proto/descriptor_pool_registry.cc
new file mode 100644
index 0000000000..5f0423f76b
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+
+DescriptorPoolRegistry* DescriptorPoolRegistry::Global() {
+  static DescriptorPoolRegistry* registry = new DescriptorPoolRegistry;
+  return registry;
+}
+
+DescriptorPoolRegistry::DescriptorPoolFn* DescriptorPoolRegistry::Get(
+    const string& source) {
+  auto found = fns_.find(source);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void DescriptorPoolRegistry::Register(
+    const string& source,
+    const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+  auto existing = Get(source);
+  CHECK_EQ(existing, nullptr)
+      << "descriptor pool for source: " << source << " already registered";
+  fns_.insert(std::pair<const string&, DescriptorPoolFn>(source, pool_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.h b/tensorflow/core/util/proto/descriptor_pool_registry.h
new file mode 100644
index 0000000000..66c20e9e41
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.h
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+class DescriptorPoolRegistry {
+ public:
+  typedef std::function<Status(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool)>
+      DescriptorPoolFn;
+
+  // Returns a pointer to a global DescriptorPoolRegistry object.
+  static DescriptorPoolRegistry* Global();
+
+  // Returns a pointer to a descriptor pool function for the given source.
+  DescriptorPoolFn* Get(const string& source);
+
+  // Registers a descriptor pool factory.
+  void Register(const string& source, const DescriptorPoolFn& pool_fn);
+
+ private:
+  std::map<string, DescriptorPoolFn> fns_;
+};
+
+namespace descriptor_pool_registration {
+
+class DescriptorPoolRegistration {
+ public:
+  DescriptorPoolRegistration(
+      const string& source,
+      const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+    DescriptorPoolRegistry::Global()->Register(source, pool_fn);
+  }
+};
+
+}  // namespace descriptor_pool_registration
+
+#define REGISTER_DESCRIPTOR_POOL(source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(__COUNTER__, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(ctr, source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)       \
+  static descriptor_pool_registration::DescriptorPoolRegistration \
+      descriptor_pool_registration_fn_##ctr(source, pool_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry_test.cc b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
new file mode 100644
index 0000000000..a6899998ab
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("TEST POOL 1", Value::Function);
+REGISTER_DESCRIPTOR_POOL("TEST POOL 2", Value::Function);
+}  // namespace
+
+TEST(DescriptorPoolRegistryTest, TestBasic) {
+  EXPECT_EQ(DescriptorPoolRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto pool1 = DescriptorPoolRegistry::Global()->Get("TEST POOL 1");
+  EXPECT_NE(pool1, nullptr);
+  auto pool2 = DescriptorPoolRegistry::Global()->Get("TEST POOL 2");
+  EXPECT_NE(pool2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc
new file mode 100644
index 0000000000..271c85efd8
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+// Build a `DescriptorPool` from the named file or URI. The file or URI
+// must be available to the current TensorFlow environment.
+//
+// The file must contiain a serialized `FileDescriptorSet`. See
+// `GetDescriptorPool()` for more information.
+Status GetDescriptorPoolFromFile(
+    tensorflow::Env* env, const string& filename,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  Status st = env->FileExists(filename);
+  if (!st.ok()) {
+    return st;
+  }
+
+  // Read and parse the FileDescriptorSet.
+  tensorflow::protobuf::FileDescriptorSet descs;
+  std::unique_ptr<tensorflow::ReadOnlyMemoryRegion> buf;
+  st = env->NewReadOnlyMemoryRegionFromFile(filename, &buf);
+  if (!st.ok()) {
+    return st;
+  }
+  if (!descs.ParseFromArray(buf->data(), buf->length())) {
+    return errors::InvalidArgument(
+        "descriptor_source contains invalid FileDescriptorSet: ", filename);
+  }
+
+  // Build a DescriptorPool from the FileDescriptorSet.
+  owned_desc_pool->reset(new tensorflow::protobuf::DescriptorPool());
+  for (const auto& filedesc : descs.file()) {
+    if ((*owned_desc_pool)->BuildFile(filedesc) == nullptr) {
+      return errors::InvalidArgument(
+          "Problem loading FileDescriptorProto (missing dependencies?): ",
+          filename);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  // Attempt to lookup the pool in the registry.
+  auto pool_fn = DescriptorPoolRegistry::Global()->Get(descriptor_source);
+  if (pool_fn != nullptr) {
+    return (*pool_fn)(desc_pool, owned_desc_pool);
+  }
+
+  // If there is no pool function registered for the given source, let the
+  // runtime find the file or URL.
+  Status status =
+      GetDescriptorPoolFromFile(env, descriptor_source, owned_desc_pool);
+  if (status.ok()) {
+    *desc_pool = owned_desc_pool->get();
+  }
+  *desc_pool = owned_desc_pool->get();
+  return status;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.h b/tensorflow/core/util/proto/descriptors.h
new file mode 100644
index 0000000000..92ee8997ab
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+class Env;
+class Status;
+
+// Get a `DescriptorPool` object from the named `descriptor_source`.
+// `descriptor_source` may be a path to a file accessible to TensorFlow, in
+// which case it is parsed as a `FileDescriptorSet` and used to build the
+// `DescriptorPool`.
+//
+// `owned_desc_pool` will be filled in with the same pointer as `desc_pool` if
+// the caller should take ownership.
+extern tensorflow::Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
diff --git a/tensorflow/core/util/proto/local_descriptor_pool_registration.cc b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
new file mode 100644
index 0000000000..48fe0102d0
--- /dev/null
+++ b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+namespace {
+
+struct LocalDescriptorPool {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    *desc_pool = ::tensorflow::protobuf::DescriptorPool::generated_pool();
+    if (*desc_pool == nullptr) {
+      return errors::InvalidArgument("Problem loading protobuf generated_pool");
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("", LocalDescriptorPool::Function);
+REGISTER_DESCRIPTOR_POOL("local://", LocalDescriptorPool::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/BUILD b/tensorflow/core/util/rpc/BUILD
new file mode 100644
index 0000000000..f0f161ecc0
--- /dev/null
+++ b/tensorflow/core/util/rpc/BUILD
@@ -0,0 +1,48 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "call_container",
+    hdrs = ["call_container.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory",
+    srcs = ["rpc_factory.cc"],
+    hdrs = ["rpc_factory.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory_registry",
+    srcs = ["rpc_factory_registry.cc"],
+    hdrs = ["rpc_factory_registry.h"],
+    deps = [
+        ":rpc_factory",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "rpc_factory_registry_test",
+    srcs = ["rpc_factory_registry_test.cc"],
+    deps = [
+        ":rpc_factory_registry",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h
new file mode 100644
index 0000000000..7f36056797
--- /dev/null
+++ b/tensorflow/core/util/rpc/call_container.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+#define TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+
+#include <list>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+namespace tensorflow {
+
+template <typename Call>
+class CallContainer {
+ public:
+  explicit CallContainer(OpKernelContext* ctx, int num_calls, bool fail_fast,
+                         bool try_rpc, AsyncOpKernel::DoneCallback done,
+                         CancellationToken token)
+      : ctx_(ctx),
+        done_(std::move(done)),
+        token_(token),
+        fail_fast_(fail_fast),
+        try_rpc_(try_rpc) {
+    CHECK_GT(num_calls, 0);
+
+    // This will run when all RPCs are finished.
+    reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
+      ctx_->cancellation_manager()->DeregisterCallback(token_);
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    });
+
+    // Subtract reference count from the initial creation.
+    core::ScopedUnref unref(reffed_status_callback_);
+
+    for (int i = 0; i < num_calls; ++i) {
+      // Increase the reference on the callback for each new RPC.
+      reffed_status_callback_->Ref();
+    }
+  }
+
+  std::list<Call>* calls() { return &calls_; }
+
+  void StartCancel() {
+    // Once this loop is done, can no longer assume anything is valid
+    // because "delete this" may have been immediately called.
+    // Nothing should run after this loop.
+    for (auto& call : calls_) {
+      call.StartCancel();
+    }
+  }
+
+  void Done(const Status& s, int index) {
+    if (!try_rpc_) {
+      reffed_status_callback_->UpdateStatus(s);
+    }
+    reffed_status_callback_->Unref();
+  }
+
+ private:
+  OpKernelContext* ctx_;
+  std::list<Call> calls_;
+  const AsyncOpKernel::DoneCallback done_;
+  const CancellationToken token_;
+  const bool fail_fast_;
+  const bool try_rpc_;
+
+  // Performs its own reference counting.
+  ReffedStatusCallback* reffed_status_callback_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory.cc b/tensorflow/core/util/rpc/rpc_factory.cc
new file mode 100644
index 0000000000..8530f02b6e
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/numbers.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+template <>
+bool GetEnvVar(const char* key, const string& default_value, string* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+  } else {
+    *value = env_value;
+  }
+  return true;
+}
+
+template <>
+bool GetEnvVar(const char* key, const int64& default_value, int64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strto64(env_value, value);
+}
+
+template <>
+bool GetEnvVar(const char* key, const uint64& default_value, uint64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strtou64(env_value, value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory.h b/tensorflow/core/util/rpc/rpc_factory.h
new file mode 100644
index 0000000000..9bf078c0f4
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.h
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+// Return the environment variable `key`.  If the variable is not set,
+// use the default value.  If it is set but could not be parsed,
+// return `false`.  Otherwise set `value` and return `true`.
+template <typename T>
+bool GetEnvVar(const char* key, const T& default_value, T* value);
+
+class RPCFactory {
+ public:
+  RPCFactory() {}
+  virtual ~RPCFactory() {}
+
+  // Start a Call() to methods `method_t` at addresses `address_t` with
+  // request strings from `request_t`.  Any of these may be scalar
+  // Tensors, in which case the operands are broadcasted.
+  // Upon completion of all requests, `response_t` will be populated.
+  //
+  // If `try_rpc` is `true`, then `status_message_t` and
+  // `status_code_t` will be populated as well.
+  //
+  // If `try_rpc` is `false`, then `status_message_t` and
+  // `status_code_t` are ignored (and may be nullptr).  Instead, the
+  // status of any failed call will be propagated to the op.
+  //
+  // REQUIRES:
+  //   - `response_t` is not null, and is a string Tensor with the same shape as
+  //     `request_t`.
+  //
+  //   If `try_rpc` is `true`:
+  //      - `status_code_t` and `status_message_t` are not null.
+  //      - `status_code_t` is an int32 Tensor with the same shape as
+  //        `request_t`.
+  //      - `status_message_t` is a string Tensor with the same shape as
+  //        `request_t`.
+  virtual void Call(OpKernelContext* ctx, int64 num_elements,
+                    const Tensor& address_t, const Tensor& method_t,
+                    const Tensor& request_t, const bool try_rpc,
+                    Tensor* response_t, Tensor* status_code_t,
+                    Tensor* status_message_t,
+                    AsyncOpKernel::DoneCallback done) = 0;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(RPCFactory);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.cc b/tensorflow/core/util/rpc/rpc_factory_registry.cc
new file mode 100644
index 0000000000..a148b5c04d
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+RPCFactoryRegistry* RPCFactoryRegistry::Global() {
+  static RPCFactoryRegistry* registry = new RPCFactoryRegistry;
+  return registry;
+}
+
+RPCFactoryRegistry::RPCFactoryFn* RPCFactoryRegistry::Get(
+    const string& protocol) {
+  auto found = fns_.find(protocol);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void RPCFactoryRegistry::Register(const string& protocol,
+                                  const RPCFactoryFn& factory_fn) {
+  auto existing = Get(protocol);
+  CHECK_EQ(existing, nullptr)
+      << "RPC factory for protocol: " << protocol << " already registered";
+  fns_.insert(std::pair<const string&, RPCFactoryFn>(protocol, factory_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.h b/tensorflow/core/util/rpc/rpc_factory_registry.h
new file mode 100644
index 0000000000..2635a4012e
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+
+#include <map>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class RPCFactoryRegistry {
+ public:
+  typedef std::function<RPCFactory*(OpKernelConstruction* ctx, bool fail_fast,
+                                    int64 timeout_in_ms)>
+      RPCFactoryFn;
+
+  // Returns a pointer to a global RPCFactoryRegistry object.
+  static RPCFactoryRegistry* Global();
+
+  // Returns a pointer to an function that creates an RPC factory for the given
+  // protocol.
+  RPCFactoryFn* Get(const string& protocol);
+
+  // Registers a function that creates and RPC factory for the given protocol.
+  // The function should transfer the ownership of the factory to its caller.
+  void Register(const string& protocol, const RPCFactoryFn& factory_fn);
+
+ private:
+  std::map<string, RPCFactoryFn> fns_;
+};
+
+namespace rpc_factory_registration {
+
+class RPCFactoryRegistration {
+ public:
+  RPCFactoryRegistration(const string& protocol,
+                         const RPCFactoryRegistry::RPCFactoryFn& factory_fn) {
+    RPCFactoryRegistry::Global()->Register(protocol, factory_fn);
+  }
+};
+
+}  // namespace rpc_factory_registration
+
+#define REGISTER_RPC_FACTORY(protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ_HELPER(__COUNTER__, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ_HELPER(ctr, protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn) \
+  static rpc_factory_registration::RPCFactoryRegistration    \
+      rpc_factory_registration_fn_##ctr(protocol, factory_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry_test.cc b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
new file mode 100644
index 0000000000..cfd0f95016
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return nullptr;
+  }
+};
+
+REGISTER_RPC_FACTORY("TEST FACTORY 1", Value::Function);
+REGISTER_RPC_FACTORY("TEST FACTORY 2", Value::Function);
+}  // namespace
+
+TEST(RPCFactoryRegistryTest, TestBasic) {
+  EXPECT_EQ(RPCFactoryRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto factory1 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 1");
+  EXPECT_NE(factory1, nullptr);
+  auto factory2 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 2");
+  EXPECT_NE(factory2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 01962fcf44..a22b9f40b1 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3370,6 +3370,7 @@ tf_py_wrap_cc(
         "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_rpc_factory_registration",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
         "//tensorflow/core/grappler:grappler_item",
-- 
GitLab


From a1e0ac4b627619051aee448b750d8ef312c3fb71 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 17:18:33 -0700
Subject: [PATCH 0405/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 191962652
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 2476 ++++++++++++++++-
 tensorflow/core/ops/ops.pbtxt                 |  242 +-
 2 files changed, 2547 insertions(+), 171 deletions(-)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 10b24c2d34..1fc1de22bb 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -64,6 +64,31 @@ op {
     }
   }
 }
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "AccumulateNV2"
   input_arg {
@@ -607,6 +632,33 @@ op {
     }
   }
 }
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Acosh"
   input_arg {
@@ -656,6 +708,31 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -725,6 +802,41 @@ op {
     }
   }
 }
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "AddManySparseToTensorsMap"
   input_arg {
@@ -1094,6 +1206,42 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AdjustContrast"
   input_arg {
@@ -6166,6 +6314,33 @@ op {
     }
   }
 }
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Asinh"
   input_arg {
@@ -6215,6 +6390,31 @@ op {
     }
   }
 }
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Assert"
   input_arg {
@@ -6761,6 +6961,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Atan2"
   input_arg {
@@ -6812,6 +7039,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Atanh"
   input_arg {
@@ -6861,6 +7115,31 @@ op {
     }
   }
 }
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -8328,6 +8607,50 @@ op {
     }
   }
 }
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "BatchMatrixBandPart"
   input_arg {
@@ -10154,6 +10477,67 @@ op {
     }
   }
 }
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
 op {
   name: "BitwiseAnd"
   input_arg {
@@ -11081,6 +11465,29 @@ op {
     }
   }
 }
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "CheckNumerics"
   input_arg {
@@ -11134,6 +11541,33 @@ op {
     type: "string"
   }
 }
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
 op {
   name: "Cholesky"
   input_arg {
@@ -13318,6 +13752,31 @@ op {
     }
   }
 }
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Cosh"
   input_arg {
@@ -13367,6 +13826,31 @@ op {
     }
   }
 }
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "CountUpTo"
   input_arg {
@@ -16004,6 +16488,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -17054,6 +17554,58 @@ op {
     }
   }
 }
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DiagPart"
   input_arg {
@@ -17069,6 +17621,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -17095,6 +17648,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -17150,6 +17704,29 @@ op {
     }
   }
 }
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Dilation2D"
   input_arg {
@@ -17923,6 +18500,41 @@ op {
     }
   }
 }
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DrawBoundingBoxes"
   input_arg {
@@ -18218,6 +18830,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -18524,6 +19159,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Erf"
   input_arg {
@@ -18569,6 +19244,29 @@ op {
     }
   }
 }
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Erfc"
   input_arg {
@@ -18614,6 +19312,29 @@ op {
     }
   }
 }
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -18678,6 +19399,31 @@ op {
     }
   }
 }
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExpandDims"
   input_arg {
@@ -18759,6 +19505,31 @@ op {
     }
   }
 }
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExtractGlimpse"
   input_arg {
@@ -20539,6 +21310,29 @@ op {
     }
   }
 }
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FloorDiv"
   input_arg {
@@ -20608,6 +21402,41 @@ op {
     }
   }
 }
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "FloorMod"
   input_arg {
@@ -20663,6 +21492,35 @@ op {
     }
   }
 }
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FlushSummaryWriter"
   input_arg {
@@ -23248,6 +24106,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "InterleaveDataset"
   input_arg {
@@ -23480,6 +24407,33 @@ op {
     }
   }
 }
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "InvGrad"
   input_arg {
@@ -23664,6 +24618,35 @@ op {
     }
   }
 }
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Invert"
   input_arg {
@@ -23797,6 +24780,29 @@ op {
     }
   }
 }
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsInf"
   input_arg {
@@ -23842,6 +24848,29 @@ op {
     }
   }
 }
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsNan"
   input_arg {
@@ -23887,6 +24916,29 @@ op {
     }
   }
 }
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsVariableInitialized"
   input_arg {
@@ -24849,6 +25901,29 @@ op {
     }
   }
 }
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "LinSpace"
   input_arg {
@@ -25065,6 +26140,31 @@ op {
     }
   }
 }
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Log1p"
   input_arg {
@@ -25114,6 +26214,31 @@ op {
     }
   }
 }
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "LogMatrixDeterminant"
   input_arg {
@@ -26129,6 +27254,50 @@ op {
     }
   }
 }
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatchingFiles"
   input_arg {
@@ -30007,6 +31176,36 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Mean"
   input_arg {
@@ -30662,6 +31861,36 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "MirrorPad"
   input_arg {
@@ -30801,6 +32030,36 @@ op {
     }
   }
 }
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Mul"
   input_arg {
@@ -30872,6 +32131,42 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Multinomial"
   input_arg {
@@ -31474,6 +32769,33 @@ op {
     }
   }
 }
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "NegTrain"
   input_arg {
@@ -31657,6 +32979,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "NthElement"
   input_arg {
@@ -31924,6 +33286,38 @@ op {
     }
   }
 }
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
 op {
   name: "OrderedMapClear"
   attr {
@@ -33347,28 +34741,58 @@ op {
   }
 }
 op {
-  name: "PopulationCount"
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
-    type: DT_UINT8
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -33393,6 +34817,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -33422,9 +34847,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
@@ -34111,68 +35536,256 @@ op {
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 21
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
+  input_arg {
     name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  deprecation {
-    version: 21
+    type_attr: "T"
   }
-}
-op {
-  name: "QuantizeAndDequantize"
   input_arg {
-    name: "input"
+    name: "input_max"
     type_attr: "T"
   }
   output_arg {
@@ -34200,20 +35813,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -34224,16 +35823,21 @@ op {
       }
     }
   }
-  deprecation {
-    version: 22
-  }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -34259,20 +35863,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -34284,9 +35874,6 @@ op {
       }
     }
   }
-  deprecation {
-    version: 22
-  }
 }
 op {
   name: "QuantizeAndDequantizeV2"
@@ -34332,6 +35919,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34339,7 +35928,7 @@ op {
   }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -34352,6 +35941,10 @@ op {
     name: "input_max"
     type_attr: "T"
   }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -34363,18 +35956,11 @@ op {
       b: true
     }
   }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
   attr {
     name: "range_given"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
@@ -34382,7 +35968,6 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34430,6 +36015,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34478,6 +36064,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -38530,6 +40117,41 @@ op {
     }
   }
 }
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Reciprocal"
   input_arg {
@@ -38583,6 +40205,33 @@ op {
     }
   }
 }
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ReciprocalGrad"
   input_arg {
@@ -38668,6 +40317,35 @@ op {
     }
   }
 }
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "RecordInput"
   output_arg {
@@ -48266,6 +49944,56 @@ op {
     }
   }
 }
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "RightShift"
   input_arg {
@@ -48341,6 +50069,29 @@ op {
     }
   }
 }
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Roll"
   input_arg {
@@ -48437,6 +50188,33 @@ op {
     }
   }
 }
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -48486,6 +50264,31 @@ op {
     }
   }
 }
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "RsqrtGrad"
   input_arg {
@@ -48571,6 +50374,35 @@ op {
     }
   }
 }
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SampleDistortedBoundingBox"
   input_arg {
@@ -52767,6 +54599,31 @@ op {
     }
   }
 }
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SigmoidGrad"
   input_arg {
@@ -52852,6 +54709,35 @@ op {
     }
   }
 }
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sign"
   input_arg {
@@ -52905,6 +54791,33 @@ op {
     }
   }
 }
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sin"
   input_arg {
@@ -52954,6 +54867,31 @@ op {
     }
   }
 }
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sinh"
   input_arg {
@@ -53003,6 +54941,31 @@ op {
     }
   }
 }
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Size"
   input_arg {
@@ -62161,6 +64124,31 @@ op {
     }
   }
 }
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SqrtGrad"
   input_arg {
@@ -62246,6 +64234,35 @@ op {
     }
   }
 }
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Square"
   input_arg {
@@ -62299,6 +64316,33 @@ op {
     }
   }
 }
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SquaredDifference"
   input_arg {
@@ -62362,6 +64406,38 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Squeeze"
   input_arg {
@@ -63579,6 +65655,41 @@ op {
     }
   }
 }
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Substr"
   input_arg {
@@ -64219,6 +66330,33 @@ op {
     }
   }
 }
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Tanh"
   input_arg {
@@ -64268,6 +66406,31 @@ op {
     }
   }
 }
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TanhGrad"
   input_arg {
@@ -64353,6 +66516,35 @@ op {
     }
   }
 }
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TemporaryVariable"
   output_arg {
@@ -66630,6 +68822,41 @@ op {
     }
   }
 }
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateMod"
   input_arg {
@@ -66685,6 +68912,35 @@ op {
     }
   }
 }
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "TruncatedNormal"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 5764976aee..2b56339f40 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -30,8 +30,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -210,8 +210,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -237,8 +237,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -266,8 +266,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -423,8 +423,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -1932,8 +1932,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -1959,8 +1959,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2191,8 +2191,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2223,6 +2223,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2244,8 +2245,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -3004,8 +3005,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -3854,6 +3855,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3869,7 +3871,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -3879,6 +3880,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3894,7 +3896,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4637,8 +4638,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4660,8 +4661,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5759,8 +5760,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5784,8 +5785,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -7408,6 +7409,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -7960,6 +7977,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7986,6 +8004,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -8011,8 +8030,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8217,8 +8236,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8432,6 +8451,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -8678,8 +8720,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8714,8 +8756,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8737,8 +8779,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8775,8 +8817,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8832,8 +8874,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -9700,8 +9742,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9727,8 +9769,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -9765,6 +9807,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11621,6 +11664,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "InterleaveDataset"
   input_arg {
@@ -11680,8 +11792,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -11711,8 +11823,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11799,8 +11911,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11822,8 +11934,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11845,8 +11957,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12360,8 +12472,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12508,8 +12620,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12533,8 +12645,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -13390,8 +13502,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14625,8 +14737,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14881,8 +14993,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14997,6 +15109,8 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -15023,8 +15137,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15445,8 +15559,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -15581,8 +15695,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15746,6 +15860,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT8
@@ -17024,9 +17139,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
@@ -17456,6 +17571,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17511,6 +17627,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17559,6 +17676,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -20106,8 +20224,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -20137,8 +20255,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -20168,8 +20286,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23457,8 +23575,8 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BOOL
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23516,6 +23634,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -23580,8 +23699,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -23607,8 +23726,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23636,8 +23755,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25487,8 +25606,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25516,8 +25635,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25541,8 +25660,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -25568,8 +25687,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25593,8 +25712,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -28988,8 +29107,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29017,8 +29136,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29042,8 +29161,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29073,8 +29192,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30022,8 +30141,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -30407,8 +30526,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30434,8 +30553,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -30463,8 +30582,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -32085,8 +32204,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -32123,6 +32242,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
-- 
GitLab


From 21ab9e8aa4b541b40ab27b394d390b08ea54a6d8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 17:19:45 -0700
Subject: [PATCH 0406/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 191962763
---
 tensorflow/compiler/xla/tests/xla_internal_test_main.cc   | 8 +++++---
 tensorflow/core/framework/op_kernel.cc                    | 2 +-
 .../core/grappler/optimizers/arithmetic_optimizer.cc      | 2 +-
 tensorflow/core/platform/s3/s3_file_system.cc             | 2 +-
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 0af40bc15a..a9f2915b45 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -32,14 +33,15 @@ GTEST_API_ int main(int argc, char** argv) {
   // tests.
   for (int i = 1; i < argc; i++) {
     tensorflow::StringPiece arg(argv[i]);
-    if (arg == "--benchmarks" || arg.starts_with("--benchmarks=")) {
+    if (arg == "--benchmarks" ||
+        tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
       const char* pattern = nullptr;
-      if (arg.starts_with("--benchmarks=")) {
+      if (tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
         pattern = argv[i] + strlen("--benchmarks=");
       } else {
         // Handle flag of the form '--benchmarks foo' (no '=').
         if (i + 1 >= argc ||
-            tensorflow::StringPiece(argv[i + 1]).starts_with("--")) {
+            tensorflow::str_util::StartsWith(argv[i + 1], "--")) {
           LOG(ERROR) << "--benchmarks flag requires an argument.";
           return 2;
         }
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index cfde1e8ea3..05171006b0 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -96,7 +96,7 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       output_memory_types_(context->output_memory_types().begin(),
                            context->output_memory_types().end()),
       graph_def_version_(context->graph_def_version()),
-      is_internal_(StringPiece(type_string()).starts_with("_")),
+      is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 7bf264ba30..da8d677737 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -475,7 +475,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
       return false;
     }
     // it must not be created by this stage at any of previous optimization runs
-    if (StringPiece(node->name()).contains(stage_name_)) {
+    if (str_util::StrContains(node->name(), stage_name_)) {
       return false;
     }
     // should not drive or be driven by control dependency
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index ee423699b2..6da679dc75 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -156,7 +156,7 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("S3 path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("S3 path doesn't contain an object name: ",
-- 
GitLab


From 38d1ac1e4f5b2a6e88eee43d332292898e0afc41 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 17:31:43 -0700
Subject: [PATCH 0407/1262] Initial Python API for specifying
 outside_compilation blocks that call out from a TPU computation.

For now outside_compilation cannot occur inside any compiled control flow (while loop or conditional). If the computation is replicated, the outside_compilation ops are also replicated. Both of these restrictions will be lifted in followup CLs.

PiperOrigin-RevId: 191963758
---
 .../tf2xla/functionalize_control_flow.cc      |   8 +
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 211 +++++++++++++++++-
 tensorflow/contrib/tpu/python/tpu/tpu_test.py |   2 +-
 tensorflow/python/eager/function.py           |  10 +
 tensorflow/python/framework/ops.py            |  24 +-
 tensorflow/python/ops/control_flow_ops.py     |  10 +
 tensorflow/python/ops/gradients_impl.py       |  48 +++-
 7 files changed, 292 insertions(+), 21 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 8b7beef83e..16b9142cbf 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -901,6 +901,14 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       int src_depth = switch_depth[src_id];
       if (!e->IsControlEdge() || new_switch_depth == src_depth) {
         if (src_depth != new_switch_depth) {
+          // TODO(b/77601805) remove this when outside_compilation supports
+          // control flow.
+          if (str_util::StrContains(src->name(), "outside_compilation") ||
+              str_util::StrContains(n->name(), "outside_compilation")) {
+            return errors::InvalidArgument(
+                "outside_compilation is not yet supported within TensorFlow "
+                "control flow constructs b/77601805");
+          }
           return errors::InvalidArgument(
               "Unable to functionalize control flow in graph: Operand ('",
               src->name(), "') and operator ('", n->name(),
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 3f2db548ac..a1690dadff 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -25,6 +25,8 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -56,6 +58,7 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
 def _tpu_system_device_name(job):
@@ -121,8 +124,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name):
+  def __init__(self, name, num_replicas):
     super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
 
@@ -136,6 +147,143 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
         logging.warning("... and %d more" %
                         (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an
+          # outside_compilation cluster C in a forward computation we
+          # would like to put the ops corresponding to the gradient of
+          # X into a new outside_compilation cluster C'. However, if
+          # we take the gradient of X twice, the second one should get
+          # yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is
+          # the cluster that X was in before we took gradients, and a
+          # 'gradient_uid' which is different for every invocation of
+          # gradients, and put the gradient of X in cluster
+          # 'root_cluster.gradient_uid'.
+          #
+          # When the gradient code adds multiple Ops, it asks them to
+          # be colocated either with the original Op X, or with one of
+          # the preceding Ops that was added to the gradient. In other
+          # words, we want to detect the case where we are colocating
+          # with an Op that is in cluster root_cluster.gradient_uid
+          # and put the new Op in that same cluster if the
+          # gradient_uid is the same (the case that we are in the same
+          # invocation of gradients, and just adding new Ops to the
+          # cluster); and in a different cluster if the gradient_uids
+          # are different (the case that we are in a new invocation of
+          # gradients, taking the gradient of a previously-computed
+          # gradient).
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          if len(parts) > 1:
+            uid = parts[-1]
+            if uid == gradient_uid:
+              # Keep using the same cluster
+              cluster = outside_attr
+            else:
+              # We're taking the gradient of a gradient so make a new
+              # cluster attr, adding a new '.uid' on the end to
+              # preserve the invariant that the gradient_uid is the
+              # suffix after the last '.' in the attr.
+              cluster = outside_attr + "." + gradient_uid
+          else:
+            # We're taking the gradient of an Op in the forward pass, so
+            # make a new cluster combining the Op's cluster and the
+            # gradient id.
+            cluster = outside_attr + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        self._device = device.to_string()
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def Exit(self):
+    super(TPUReplicateContext, self).Exit()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
   def AddOp(self, op):
     self._AddOpInternal(op)
 
@@ -157,9 +305,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise ValueError("TPU computations cannot be nested")
     op._set_attr(_TPU_REPLICATE_ATTR,
                  attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
-    # pylint: enable=protected-access
-    op.graph.prevent_feeding(op)
-    op.graph.prevent_fetching(op)
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
 
   def AddValue(self, val):
     result = val
@@ -181,6 +336,45 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
+def outside_compilation(computation, args=None):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    args: Inputs to pass to computation.
+  Returns:
+    The Tensors returned by computation.
+  """
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
@@ -280,7 +474,8 @@ def replicate(computation,
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
-  context = TPUReplicateContext(name=graph.unique_name("cluster"))
+  context = TPUReplicateContext(
+      name=graph.unique_name("cluster"), num_replicas=num_replicas)
   try:
     context.Enter()
 
@@ -361,6 +556,12 @@ def replicate(computation,
   finally:
     context.report_unsupported_operations()
     context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
 
   # Fan-out: Builds a TPUReplicatedOutput node for each output.
   outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index 336d8260c3..c3882b8a27 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -37,7 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context")
+    context = tpu.TPUReplicateContext(b"context", 1)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 61859d6be3..5168ad3b18 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -223,6 +223,16 @@ class HelperContext(object):
     else:
       return val
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def __enter__(self):
     # pylint: disable=protected-access
     self._g = ops.get_default_graph()
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 2574fa57a4..e3ca5a4977 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -4179,6 +4179,19 @@ class Graph(object):
     """
     return self._name_stack
 
+  @tf_contextlib.contextmanager
+  def _colocate_with_for_gradient(self, op, gradient_uid,
+                                  ignore_existing=False):
+    with self.colocate_with(op, ignore_existing):
+      if gradient_uid is not None and self._control_flow_context is not None:
+        try:
+          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
+          yield
+        finally:
+          self._control_flow_context.ExitGradientColocation(op, gradient_uid)
+      else:
+        yield
+
   @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
@@ -4958,8 +4971,7 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
-@tf_export("colocate_with")
-def colocate_with(op, ignore_existing=False):
+def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
   if context.executing_eagerly():
     if op is not None:
       return device(op.device)
@@ -4973,7 +4985,13 @@ def colocate_with(op, ignore_existing=False):
       else:
         raise ValueError("Encountered an Eager-defined Tensor during graph "
                          "construction, but a function was not being built.")
-    return default_graph.colocate_with(op, ignore_existing)
+    return default_graph._colocate_with_for_gradient(
+        op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
+
+
+@tf_export("colocate_with")
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
 
 
 @tf_export("control_dependencies")
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index e56ab93666..7be8628073 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1595,6 +1595,16 @@ class ControlFlowContext(object):
     last_context = self._context_stack.pop()
     graph._set_control_flow_context(last_context)
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 44473ec69c..13420b7f0e 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -208,7 +208,10 @@ def _AsList(x):
   return x if isinstance(x, (list, tuple)) else [x]
 
 
-def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
   """Fill in default values for grad_ys.
 
   Args:
@@ -216,6 +219,9 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
     ys: List of tensors.
     colocate_gradients_with_ops: If True, try colocating gradients with
       the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
 
   Returns:
     A list of gradients to use, without None.
@@ -231,7 +237,7 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
   for i in xrange(len(grad_ys)):
     grad_y = grad_ys[i]
     y = ys[i]
-    with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
       if grad_y is None:
         if y.dtype.is_complex:
           raise TypeError(
@@ -338,10 +344,10 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
 
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, colocate_gradients_with_ops):
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
   """Context to colocate with `op` if `colocate_gradients_with_ops`."""
   if colocate_gradients_with_ops:
-    with ops.colocate_with(op):
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
       yield
   else:
     yield
@@ -506,6 +512,9 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
   with ops.name_scope(
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = [
         x.handle if resource_variable_ops.is_resource_variable(x) else x
@@ -513,7 +522,8 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     ]
     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
         xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
 
     # The approach we take here is as follows: Create a list of all ops in the
     # subgraph between the ys and xs.  Visit these ops in reverse order of ids
@@ -570,10 +580,11 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     while queue:
       # generate gradient subgraph for op.
       op = queue.popleft()
-      with _maybe_colocate_with(op, colocate_gradients_with_ops):
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=True)
 
@@ -633,7 +644,10 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
               if gate_gradients and len([x for x in in_grads
                                          if x is not None]) > 1:
                 with ops.device(None):
-                  with ops.colocate_with(None, ignore_existing=True):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
@@ -789,7 +803,7 @@ def _LogOpGradients(op, out_grads, in_grads):
                ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
 
 
-def _MultiDeviceAddN(tensor_list):
+def _MultiDeviceAddN(tensor_list, gradient_uid):
   """Adds tensors from potentially multiple devices."""
   # Basic function structure comes from control_flow_ops.group().
   # Sort tensors according to their devices.
@@ -808,7 +822,10 @@ def _MultiDeviceAddN(tensor_list):
 
   for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
     tensors = tensors_on_device[dev]
-    with ops.colocate_with(tensors[0].op, ignore_existing=True):
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
       summands.append(math_ops.add_n(tensors))
 
   return math_ops.add_n(summands)
@@ -834,12 +851,19 @@ class AggregationMethod(object):
   EXPERIMENTAL_ACCUMULATE_N = 2
 
 
-def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
   """Get the aggregated gradients for op.
 
   Args:
     grads: The map of memoized gradients.
     op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
     loop_state: An object for maintaining the state of the while loops in the
                 graph. It is of type ControlFlowState. None if the graph
                 contains no while loops.
@@ -916,7 +940,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
             out_grads[i] = running_sum
         else:
           used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad)
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
         logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                      tensor_shape, used)
       else:
-- 
GitLab


From 470cc0f75108e68965f89026399f7b3a7a08196b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 6 Apr 2018 17:39:17 -0700
Subject: [PATCH 0408/1262] Add remote session support for the MakeCallable
 API.

PiperOrigin-RevId: 191964391
---
 .../core/distributed_runtime/local_master.cc  |  41 ++
 .../core/distributed_runtime/local_master.h   |  10 +
 tensorflow/core/distributed_runtime/master.cc |  51 ++
 tensorflow/core/distributed_runtime/master.h  |   7 +
 .../distributed_runtime/master_interface.h    |  10 +
 .../distributed_runtime/master_session.cc     | 499 +++++++++++++-----
 .../core/distributed_runtime/master_session.h |  28 +-
 .../distributed_runtime/message_wrappers.cc   |  26 +
 .../distributed_runtime/message_wrappers.h    |   9 +
 .../rpc/grpc_master_service.cc                |  46 ++
 .../rpc/grpc_master_service_impl.cc           |  35 +-
 .../rpc/grpc_master_service_impl.h            |  45 ++
 .../rpc/grpc_remote_master.cc                 |  22 +
 .../distributed_runtime/rpc/grpc_session.cc   |  78 ++-
 .../distributed_runtime/rpc/grpc_session.h    |  27 +-
 .../rpc/grpc_session_test.cc                  |  43 ++
 tensorflow/core/protobuf/master.proto         |  68 +++
 tensorflow/core/protobuf/master_service.proto |   9 +
 18 files changed, 898 insertions(+), 156 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index aaa4cfa734..76315462a7 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -157,6 +157,47 @@ Status LocalMaster::Reset(CallOptions* call_options,
   return ret;
 }
 
+Status LocalMaster::MakeCallable(CallOptions* call_options,
+                                 const MakeCallableRequest* request,
+                                 MakeCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->MakeCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::RunCallable(CallOptions* call_options,
+                                const RunCallableRequest* request,
+                                RunCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->RunCallable(call_options, request, response,
+                            [&n, &ret](const Status& s) {
+                              ret.Update(s);
+                              n.Notify();
+                            });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::ReleaseCallable(CallOptions* call_options,
+                                    const ReleaseCallableRequest* request,
+                                    ReleaseCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->ReleaseCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+
 namespace {
 mutex* get_local_master_registry_lock() {
   static mutex local_master_registry_lock(LINKER_INITIALIZED);
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index c20b40329a..cad6babad8 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -71,6 +71,16 @@ class LocalMaster : public MasterInterface {
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override;
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override;
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override;
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response);
+
   // Registers the mapping from the given `target` to the given `master`.
   //
   // WARNING: The `master` pointer remains owned by the caller. It is
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 1a488303ac..f47502e844 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -611,4 +611,55 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
   });
 }
 
+void Master::MakeCallable(const MakeCallableRequest* req,
+                          MakeCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->MakeCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                         RunCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, opts, req, resp](MyClosure done) {
+        Status s = session->RunCallable(opts, *req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::ReleaseCallable(const ReleaseCallableRequest* req,
+                             ReleaseCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->ReleaseCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index 678fc46bd7..dbb337fd48 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -61,6 +61,13 @@ class Master {
   // See tensorflow::Reset() and the comment on ResetRequest.
   void Reset(const ResetRequest* req, ResetResponse* resp, MyClosure done);
 
+  void MakeCallable(const MakeCallableRequest* req, MakeCallableResponse* resp,
+                    MyClosure done);
+  void RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                   RunCallableResponse* resp, MyClosure done);
+  void ReleaseCallable(const ReleaseCallableRequest* req,
+                       ReleaseCallableResponse* resp, MyClosure done);
+
  private:
   typedef Master ME;
 
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index bf6a2db3e2..a8ae3cba3c 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -89,6 +89,16 @@ class MasterInterface {
   virtual Status Reset(CallOptions* call_options, const ResetRequest* request,
                        ResetResponse* response) = 0;
 
+  virtual Status MakeCallable(CallOptions* call_options,
+                              const MakeCallableRequest* request,
+                              MakeCallableResponse* response) = 0;
+  virtual Status RunCallable(CallOptions* call_options,
+                             const RunCallableRequest* request,
+                             RunCallableResponse* response) = 0;
+  virtual Status ReleaseCallable(CallOptions* call_options,
+                                 const ReleaseCallableRequest* request,
+                                 ReleaseCallableResponse* response) = 0;
+
  protected:
   // NOTE: This should only be called by implementations of this
   // interface whose CreateRunStepResponse() method returns a
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 64adf35c5e..e0a5bb4c53 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -72,7 +72,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
-        debug_opts_(bopts.callable_options.run_options().debug_options()),
+        callable_opts_(bopts.callable_options),
         worker_cache_(worker_cache),
         should_deregister_(should_deregister) {
     VLOG(1) << "Created ReffedClientGraph for node with "
@@ -94,12 +94,18 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   const ClientGraph* client_graph() { return client_graph_.get(); }
 
+  const CallableOptions& callable_options() { return callable_opts_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
     return stats_publisher_->GetProfileHandler(step, execution_count, ropts);
   }
 
+  int64 get_and_increment_execution_count() {
+    return execution_count_.fetch_add(1);
+  }
+
   // Turn RPC logging on or off, both at the WorkerCache used by this
   // master process, and at each remote worker in use for the current
   // partitions.
@@ -178,6 +184,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                        CallOptions* opts, const RunStepRequestWrapper& req,
                        MutableRunStepResponseWrapper* resp,
                        CancellationManager* cm, const bool is_last_partial_run);
+  Status RunPartitions(const MasterEnv* env, int64 step_id,
+                       int64 execution_count, PerStepState* pss,
+                       CallOptions* call_opts, const RunCallableRequest& req,
+                       RunCallableResponse* resp, CancellationManager* cm);
 
   // Calls workers to cleanup states for the step "step_id".  Calls
   // `done` when all cleanup RPCs have completed.
@@ -211,10 +221,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const std::unique_ptr<ClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
-  const DebugOptions& debug_opts_;
+  const CallableOptions callable_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
   const bool should_deregister_;
+  std::atomic<int64> execution_count_ = {0};
 
   // Graph partitioned into per-location subgraphs.
   struct Part {
@@ -269,6 +280,17 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
       const PartitionOptions& popts,
       std::unordered_map<string, GraphDef> graph_partitions);
 
+  // Prepares a number of calls to workers. One call per partition.
+  // This is a generic method that handles Run, PartialRun, and RunCallable.
+  template <class FetchListType, class ClientRequestType,
+            class ClientResponseType>
+  Status RunPartitionsHelper(
+      const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+      const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+      int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+      const ClientRequestType& req, ClientResponseType* resp,
+      CancellationManager* cm, bool is_last_partial_run);
+
   // Deregisters the partitions on the workers.  Called in the
   // destructor and does not wait for the rpc completion.
   void DeregisterPartitions();
@@ -411,7 +433,8 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     c->req.set_session_handle(session_handle_);
     c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]);
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
-    *c->req.mutable_debug_options() = debug_opts_;
+    *c->req.mutable_debug_options() =
+        callable_opts_.run_options().debug_options();
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -490,24 +513,46 @@ class RunManyGraphs {
   TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
 };
 
-Status MasterSession::ReffedClientGraph::RunPartitions(
-    const MasterEnv* env, int64 step_id, int64 execution_count,
-    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
-    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
-    const bool is_last_partial_run) {
-  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
-          << execution_count;
-  // Maps the names of fed tensors to their index in `req`.
-  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+namespace {
+Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunStepRequest(client_req, index, send_key);
+}
 
-  for (size_t i = 0; i < req.num_feeds(); ++i) {
-    if (!feeds.insert({req.feed_name(i), i}).second) {
-      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
-    }
-  }
+Status AddSendFromClientRequest(const RunCallableRequest& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunCallableRequest(client_req, index, send_key);
+}
 
-  // Prepares a number of calls to workers. One call per partition.
+// TODO(mrry): Add a full-fledged wrapper that avoids TensorProto copies for
+// in-process messages.
+struct RunCallableResponseWrapper {
+  RunCallableResponse* resp;  // Not owned.
+  std::unordered_map<string, TensorProto> fetch_key_to_protos;
+
+  RunMetadata* mutable_metadata() { return resp->mutable_metadata(); }
 
+  Status AddTensorFromRunGraphResponse(
+      const string& tensor_name, MutableRunGraphResponseWrapper* worker_resp,
+      size_t index) {
+    // TODO(b/74355905): Add a specialized implementation that avoids
+    // copying the tensor into the RunCallableResponse when at least
+    // two of the {client, master, worker} are in the same process.
+    return worker_resp->RecvValue(index, &fetch_key_to_protos[tensor_name]);
+  }
+};
+}  // namespace
+
+template <class FetchListType, class ClientRequestType,
+          class ClientResponseType>
+Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
+    const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+    const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+    int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+    const ClientRequestType& req, ClientResponseType* resp,
+    CancellationManager* cm, bool is_last_partial_run) {
   // Collect execution cost stats on a smoothly decreasing frequency.
   ExecutorOpts exec_opts;
   if (pss->report_tensor_allocations_upon_oom) {
@@ -553,28 +598,19 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
     // We keep these as separate paths for now, to ensure we aren't
     // inadvertently slowing down the normal run path.
     if (is_partial_) {
-      for (size_t i = 0; i < req.num_feeds(); ++i) {
-        const string& name = req.feed_name(i);
-        const auto iter = part.feed_key.find(name);
+      for (const auto& name_index : feeds) {
+        const auto iter = part.feed_key.find(name_index.first.ToString());
         if (iter == part.feed_key.end()) {
           // The provided feed must be for a different partition.
           continue;
         }
         const string& key = iter->second;
-        auto feeds_iter = feeds.find(name);
-        if (feeds_iter == feeds.end()) {
-          return errors::InvalidArgument("No feed is provided for feed=", name,
-                                         ", key=", key);
-        } else if (feeds_iter->second != static_cast<size_t>(i)) {
-          return errors::Internal("Cannot find feed named \"", name,
-                                  " in request.");
-        }
-        TF_RETURN_IF_ERROR(c->req->AddSendFromRunStepRequest(req, i, key));
+        TF_RETURN_IF_ERROR(AddSendFromClientRequest(req, c->req.get(),
+                                                    name_index.second, key));
       }
       // TODO(suharshs): Make a map from feed to fetch_key to make this faster.
       // For now, we just iterate through partitions to find the matching key.
-      for (int i = 0; static_cast<size_t>(i) < req.num_fetches(); ++i) {
-        const string& req_fetch = req.fetch_name(i);
+      for (const string& req_fetch : fetches) {
         for (const auto& key_fetch : part.key_fetch) {
           if (key_fetch.second == req_fetch) {
             c->req->add_recv_key(key_fetch.first);
@@ -586,9 +622,13 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       for (const auto& feed_key : part.feed_key) {
         const string& feed = feed_key.first;
         const string& key = feed_key.second;
-        const int64 feed_index = feeds[feed];
+        auto iter = feeds.find(feed);
+        if (iter == feeds.end()) {
+          return errors::Internal("No feed index found for feed: ", feed);
+        }
+        const int64 feed_index = iter->second;
         TF_RETURN_IF_ERROR(
-            c->req->AddSendFromRunStepRequest(req, feed_index, key));
+            AddSendFromClientRequest(req, c->req.get(), feed_index, key));
       }
       for (const auto& key_fetch : part.key_fetch) {
         const string& key = key_fetch.first;
@@ -622,50 +662,115 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   } else {
     return errors::Cancelled("Step was cancelled");
   }
+  TF_RETURN_IF_ERROR(calls.status());
 
-  // Collects fetches.
-  Status status = calls.status();
-  if (status.ok()) {
-    for (int i = 0; i < num; ++i) {
-      const Part& part = partitions_[i];
-      MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
-      for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
-        auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
-        if (iter == part.key_fetch.end()) {
-          status.Update(errors::Internal("Unexpected fetch key: ",
-                                         run_graph_resp->recv_key(j)));
-          break;
-        }
-        const string& fetch = iter->second;
-        status.Update(
-            resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
-        if (!status.ok()) {
-          break;
-        }
+  // Collects fetches and metadata.
+  Status status;
+  for (int i = 0; i < num; ++i) {
+    const Part& part = partitions_[i];
+    MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
+    for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
+      auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
+      if (iter == part.key_fetch.end()) {
+        status.Update(errors::Internal("Unexpected fetch key: ",
+                                       run_graph_resp->recv_key(j)));
+        break;
       }
-      if (pss->collect_timeline) {
-        pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+      const string& fetch = iter->second;
+      status.Update(
+          resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
+      if (!status.ok()) {
+        break;
       }
-      if (pss->collect_costs) {
-        CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
-        for (int j = 0; j < cost_graph->node_size(); ++j) {
-          resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
-              cost_graph->mutable_node(j));
-        }
+    }
+    if (pss->collect_timeline) {
+      pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+    }
+    if (pss->collect_costs) {
+      CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
+      for (int j = 0; j < cost_graph->node_size(); ++j) {
+        resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
+            cost_graph->mutable_node(j));
       }
-      if (pss->collect_partition_graphs) {
-        protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
-            resp->mutable_metadata()->mutable_partition_graphs();
-        for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
-          partition_graph_defs->Add()->Swap(
-              run_graph_resp->mutable_partition_graph(i));
-        }
+    }
+    if (pss->collect_partition_graphs) {
+      protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+          resp->mutable_metadata()->mutable_partition_graphs();
+      for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
+        partition_graph_defs->Add()->Swap(
+            run_graph_resp->mutable_partition_graph(i));
       }
     }
   }
   return status;
 }
 
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
+    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
+    const bool is_last_partial_run) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < req.num_feeds(); ++i) {
+    if (!feeds.insert({req.feed_name(i), i}).second) {
+      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
+    }
+  }
+
+  std::vector<string> fetches;
+  fetches.reserve(req.num_fetches());
+  for (size_t i = 0; i < req.num_fetches(); ++i) {
+    fetches.push_back(req.fetch_name(i));
+  }
+
+  return RunPartitionsHelper(feeds, fetches, env, step_id, execution_count, pss,
+                             call_opts, req, resp, cm, is_last_partial_run);
+}
+
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunCallableRequest& req,
+    RunCallableResponse* resp, CancellationManager* cm) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < callable_opts_.feed_size(); ++i) {
+    if (!feeds.insert({callable_opts_.feed(i), i}).second) {
+      // MakeCallable will fail if there are two feeds with the same name.
+      return errors::Internal("Duplicated feeds in callable: ",
+                              callable_opts_.feed(i));
+    }
+  }
+
+  // Create a wrapped response object to collect the fetched values and
+  // rearrange them for the RunCallableResponse.
+  RunCallableResponseWrapper wrapped_resp;
+  wrapped_resp.resp = resp;
+
+  TF_RETURN_IF_ERROR(RunPartitionsHelper(
+      feeds, callable_opts_.fetch(), env, step_id, execution_count, pss,
+      call_opts, req, &wrapped_resp, cm, false /* is_last_partial_run */));
+
+  // Collects fetches.
+  // TODO(b/74355905): Add a specialized implementation that avoids
+  // copying the tensor into the RunCallableResponse when at least
+  // two of the {client, master, worker} are in the same process.
+  for (const string& fetch : callable_opts_.fetch()) {
+    TensorProto* fetch_proto = resp->mutable_fetch()->Add();
+    auto iter = wrapped_resp.fetch_key_to_protos.find(fetch);
+    if (iter == wrapped_resp.fetch_key_to_protos.end()) {
+      return errors::Internal("Worker did not return a value for fetch: ",
+                              fetch);
+    }
+    fetch_proto->Swap(&iter->second);
+  }
+  return Status::OK();
+}
+
 namespace {
 
 class CleanupBroadcastHelper {
@@ -1266,15 +1371,11 @@ WorkerCacheInterface* MasterSession::get_worker_cache() const {
   return env_->worker_cache;
 }
 
-Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
-                                ReffedClientGraph** rcg, bool is_partial) {
+Status MasterSession::StartStep(const BuildGraphOptions& opts, bool is_partial,
+                                ReffedClientGraph** out_rcg, int64* out_count) {
   const uint64 hash = HashBuildGraphOptions(opts);
   {
     mutex_lock l(mu_);
-    // Keep track of how many times this subgraph has been executed in
-    // this session.
-    int64* c = &subgraph_execution_counts_[hash];
-    *count = (*c)++;
     // TODO(suharshs): We cache partial run graphs and run graphs separately
     // because there is preprocessing that needs to only be run for partial
     // run calls.
@@ -1296,8 +1397,9 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
-    *rcg = iter->second;
-    (*rcg)->Ref();
+    *out_rcg = iter->second;
+    (*out_rcg)->Ref();
+    *out_count = (*out_rcg)->get_and_increment_execution_count();
   }
   return Status::OK();
 }
@@ -1316,6 +1418,12 @@ void MasterSession::ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
   rcg_map->clear();
 }
 
+namespace {
+uint64 MakeStepId() {
+  return (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+}
+}  // namespace
+
 Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
                                       PartialRunSetupResponse* resp) {
   std::vector<string> inputs, outputs, targets;
@@ -1332,15 +1440,15 @@ Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
   string handle = std::to_string(partial_run_handle_counter_.fetch_add(1));
 
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
 
   // Prepare.
   BuildGraphOptions opts;
   BuildBuildGraphOptions(*req, &opts);
-  TF_RETURN_IF_ERROR(StartStep(opts, &count, &rcg, true));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(opts, true, &rcg, &count));
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
   rcg->Ref();
@@ -1585,6 +1693,73 @@ Status MasterSession::CreateDebuggerState(
   return Status::OK();
 }
 
+void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                                     const RunOptions& run_options,
+                                     uint64 step_id, int64 count,
+                                     PerStepState* out_pss,
+                                     std::unique_ptr<ProfileHandler>* out_ph) {
+  out_pss->collect_timeline =
+      run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->collect_rpcs = run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->report_tensor_allocations_upon_oom =
+      run_options.report_tensor_allocations_upon_oom();
+  // Build the cost model every 'build_cost_model_every' steps after skipping an
+  // initial 'build_cost_model_after' steps.
+  const int64 build_cost_model_after =
+      session_opts_.config.graph_options().build_cost_model_after();
+  const int64 build_cost_model_every =
+      session_opts_.config.graph_options().build_cost_model();
+  out_pss->collect_costs =
+      build_cost_model_every > 0 &&
+      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
+  out_pss->collect_partition_graphs = run_options.output_partition_graphs();
+
+  *out_ph = rcg->GetProfileHandler(step_id, count, run_options);
+  if (*out_ph) {
+    out_pss->collect_timeline = true;
+    out_pss->collect_rpcs = (*out_ph)->should_collect_rpcs();
+  }
+}
+
+Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
+                                     uint64 step_id,
+                                     const RunOptions& run_options,
+                                     PerStepState* pss,
+                                     const std::unique_ptr<ProfileHandler>& ph,
+                                     const Status& run_status,
+                                     RunMetadata* out_run_metadata) {
+  Status s = run_status;
+  if (s.ok()) {
+    pss->end_micros = Env::Default()->NowMicros();
+
+    // Schedule post-processing and cleanup to be done asynchronously.
+    rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
+  } else if (errors::IsCancelled(s)) {
+    mutex_lock l(mu_);
+    if (closed_) {
+      if (garbage_collected_) {
+        s = errors::Cancelled(
+            "Step was cancelled because the session was garbage collected due "
+            "to inactivity.");
+      } else {
+        s = errors::Cancelled(
+            "Step was cancelled by an explicit call to `Session::Close()`.");
+      }
+    }
+  }
+  Ref();
+  rcg->Ref();
+  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
+    if (!s.ok()) {
+      LOG(ERROR) << "Cleanup partition error: " << s;
+    }
+    rcg->Unref();
+    MarkRunCompletion();
+    Unref();
+  });
+  return s;
+}
+
 Status MasterSession::DoRunWithLocalExecution(
     CallOptions* opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp) {
@@ -1597,8 +1772,8 @@ Status MasterSession::DoRunWithLocalExecution(
   BuildGraphOptions bgopts;
   BuildBuildGraphOptions(req, &bgopts);
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
-  TF_RETURN_IF_ERROR(StartStep(bgopts, &count, &rcg, false));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(bgopts, false, &rcg, &count));
 
   // Unref "rcg" when out of scope.
   core::ScopedUnref unref(rcg);
@@ -1614,64 +1789,133 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
-  pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.report_tensor_allocations_upon_oom =
-      req.options().report_tensor_allocations_upon_oom();
-  // Build the cost model every 'build_cost_model_every' steps after skipping an
-  // initial 'build_cost_model_after' steps.
-  const int64 build_cost_model_after =
-      session_opts_.config.graph_options().build_cost_model_after();
-  const int64 build_cost_model_every =
-      session_opts_.config.graph_options().build_cost_model();
-  pss.collect_costs =
-      build_cost_model_every > 0 &&
-      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
-  pss.collect_partition_graphs = req.options().output_partition_graphs();
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, req.options(), step_id, count, &pss, &ph);
 
-  std::unique_ptr<ProfileHandler> ph =
-      rcg->GetProfileHandler(step_id, count, req.options());
-  if (ph) {
-    pss.collect_timeline = true;
-    pss.collect_rpcs = ph->should_collect_rpcs();
+  Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
+                                &cancellation_manager_, false);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, req.options(), &pss, ph, s,
+                        resp->mutable_metadata());
+}
+
+Status MasterSession::MakeCallable(const MakeCallableRequest& req,
+                                   MakeCallableResponse* resp) {
+  UpdateLastAccessTime();
+
+  BuildGraphOptions opts;
+  opts.callable_options = req.options();
+  opts.use_function_convention = false;
+
+  ReffedClientGraph* callable;
+
+  {
+    mutex_lock l(mu_);
+    if (closed_) {
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    std::unique_ptr<ClientGraph> client_graph;
+    TF_RETURN_IF_ERROR(execution_state_->BuildGraph(opts, &client_graph));
+    callable = new ReffedClientGraph(handle_, opts, std::move(client_graph),
+                                     session_opts_, stats_publisher_factory_,
+                                     false /* is_partial */, get_worker_cache(),
+                                     !should_delete_worker_sessions_);
+  }
+
+  Status s = BuildAndRegisterPartitions(callable);
+  if (!s.ok()) {
+    callable->Unref();
+    return s;
   }
 
+  uint64 handle;
+  {
+    mutex_lock l(mu_);
+    handle = next_callable_handle_++;
+    callables_[handle] = callable;
+  }
+
+  resp->set_handle(handle);
+  return Status::OK();
+}
+
+Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                                    const RunCallableRequest& req,
+                                    RunCallableResponse* resp) {
+  VLOG(2) << "DoRunCallable req: " << req.DebugString();
+  PerStepState pss;
+  pss.start_micros = Env::Default()->NowMicros();
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
+
+  // Prepare.
+  int64 count = rcg->get_and_increment_execution_count();
+
+  // Keeps the highest 8 bits 0x01: we reserve some bits of the
+  // step_id for future use.
+  const uint64 step_id = MakeStepId();
+  TRACEPRINTF("stepid %llu", step_id);
+
+  const RunOptions& run_options = rcg->callable_options().run_options();
+
+  if (run_options.timeout_in_ms() != 0) {
+    opts->SetTimeout(run_options.timeout_in_ms());
+  }
+
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, run_options, step_id, count, &pss, &ph);
   Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
-                                &cancellation_manager_, false);
-  if (s.ok()) {
-    pss.end_micros = Env::Default()->NowMicros();
+                                &cancellation_manager_);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, run_options, &pss, ph, s,
+                        resp->mutable_metadata());
+}
 
-    // Schedule post-processing and cleanup to be done asynchronously.
-    rcg->ProcessStats(step_id, &pss, ph.get(), req.options(),
-                      resp->mutable_metadata());
-  } else if (errors::IsCancelled(s)) {
+Status MasterSession::RunCallable(CallOptions* opts,
+                                  const RunCallableRequest& req,
+                                  RunCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* callable;
+  {
     mutex_lock l(mu_);
     if (closed_) {
-      if (garbage_collected_) {
-        s = errors::Cancelled(
-            "Step was cancelled because the session was garbage collected due "
-            "to inactivity.");
-      } else {
-        s = errors::Cancelled(
-            "Step was cancelled by an explicit call to `Session::Close()`.");
-      }
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    int64 handle = req.handle();
+    if (handle >= next_callable_handle_) {
+      return errors::InvalidArgument("No such callable handle: ", handle);
+    }
+    auto iter = callables_.find(req.handle());
+    if (iter == callables_.end()) {
+      return errors::InvalidArgument(
+          "Attempted to run callable after handle was released: ", handle);
     }
+    callable = iter->second;
+    callable->Ref();
+    ++num_running_;
   }
-  Ref();
-  rcg->Ref();
-  cleanup.release();  // MarkRunCompletion called in done closure.
-  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
-    if (!s.ok()) {
-      LOG(ERROR) << "Cleanup partition error: " << s;
+  core::ScopedUnref unref_callable(callable);
+  return DoRunCallable(opts, callable, req, resp);
+}
+
+Status MasterSession::ReleaseCallable(const ReleaseCallableRequest& req,
+                                      ReleaseCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* to_unref = nullptr;
+  {
+    mutex_lock l(mu_);
+    auto iter = callables_.find(req.handle());
+    if (iter != callables_.end()) {
+      to_unref = iter->second;
+      callables_.erase(iter);
     }
-    rcg->Unref();
-    MarkRunCompletion();
-    Unref();
-  });
-  return s;
+  }
+  if (to_unref != nullptr) {
+    to_unref->Unref();
+  }
+  return Status::OK();
 }
 
 Status MasterSession::Close() {
@@ -1688,6 +1932,7 @@ Status MasterSession::Close() {
     }
     ClearRunsTable(&to_unref, &run_graphs_);
     ClearRunsTable(&to_unref, &partial_run_graphs_);
+    ClearRunsTable(&to_unref, &callables_);
   }
   for (ReffedClientGraph* rcg : to_unref) rcg->Unref();
   if (should_delete_worker_sessions_) {
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 4bd4e1367a..a05419904f 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -89,6 +89,15 @@ class MasterSession : public core::RefCounted {
 
   Status ListDevices(ListDevicesResponse* resp) const;
 
+  Status MakeCallable(const MakeCallableRequest& req,
+                      MakeCallableResponse* resp);
+
+  Status RunCallable(CallOptions* opts, const RunCallableRequest& req,
+                     RunCallableResponse* resp);
+
+  Status ReleaseCallable(const ReleaseCallableRequest& req,
+                         ReleaseCallableResponse* resp);
+
   // Close this session and delete "*this". Returns OK if all known
   // states are cleanup successfully.
   //
@@ -140,6 +149,8 @@ class MasterSession : public core::RefCounted {
   typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
   RCGMap run_graphs_ GUARDED_BY(mu_);
   RCGMap partial_run_graphs_ GUARDED_BY(mu_);
+  int64 next_callable_handle_ GUARDED_BY(mu_) = 0;
+  RCGMap callables_ GUARDED_BY(mu_);
 
   struct PerStepState {
     bool collect_costs = false;
@@ -205,15 +216,28 @@ class MasterSession : public core::RefCounted {
   bool should_delete_worker_sessions_ = false;
   Status DeleteWorkerSessions();
 
-  Status StartStep(const BuildGraphOptions& opts, int64* count,
-                   ReffedClientGraph** graph, bool is_partial);
+  Status StartStep(const BuildGraphOptions& opts, bool is_partial,
+                   ReffedClientGraph** out_rcg, int64* out_count);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
                       RCGMap* rcg_map) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  void FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                        const RunOptions& run_options, uint64 step_id,
+                        int64 count, PerStepState* out_pss,
+                        std::unique_ptr<ProfileHandler>* out_ph);
   Status DoRunWithLocalExecution(CallOptions* opts,
                                  const RunStepRequestWrapper& req,
                                  MutableRunStepResponseWrapper* resp);
   Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req,
                       MutableRunStepResponseWrapper* resp);
+  Status DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                       const RunCallableRequest& req,
+                       RunCallableResponse* resp);
+  Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg, uint64 step_id,
+                        const RunOptions& run_options, PerStepState* pss,
+                        const std::unique_ptr<ProfileHandler>& ph,
+                        const Status& run_status,
+                        RunMetadata* out_run_metadata);
+
   void MarkRunCompletion();
   void UpdateLastAccessTime();
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 66ebb3080a..18668b44d3 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -326,6 +326,20 @@ Status InMemoryRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  Tensor tensor;
+  if (!ParseTensorProtoToTensor(run_callable_request.feed(i), &tensor)) {
+    return errors::InvalidArgument("Invalid TensorProto for feed value ", i);
+  }
+  sends_.emplace_back(send_key, std::move(tensor));
+  return Status::OK();
+}
+
 size_t InMemoryRunGraphRequest::num_recvs() const { return recvs_.size(); }
 
 const string& InMemoryRunGraphRequest::recv_key(size_t i) const {
@@ -439,6 +453,18 @@ Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status MutableProtoRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  NamedTensorProto* send = request_.add_send();
+  send->set_name(send_key);
+  *send->mutable_tensor() = run_callable_request.feed(i);
+  return Status::OK();
+}
+
 size_t MutableProtoRunGraphRequest::num_recvs() const {
   return request_.recv_key_size();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 79fa6f926e..1f7cdb98a4 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -302,6 +302,9 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
   virtual Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) = 0;
+  virtual Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) = 0;
 
   virtual void add_recv_key(const string& recv_key) = 0;
   virtual void set_is_partial(bool is_partial) = 0;
@@ -334,6 +337,9 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
@@ -385,6 +391,9 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index 63745e8ebd..23968e24c8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -111,6 +111,11 @@ class GrpcMasterService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(CloseSession, false);
     ENQUEUE_REQUEST(ListDevices, false);
     ENQUEUE_REQUEST(Reset, false);
+    ENQUEUE_REQUEST(MakeCallable, false);
+    for (int i = 0; i < 100; ++i) {
+      ENQUEUE_REQUEST(RunCallable, true);
+    }
+    ENQUEUE_REQUEST(ReleaseCallable, false);
 
     void* tag;
     bool ok;
@@ -236,6 +241,47 @@ class GrpcMasterService : public AsyncServiceInterface {
                         });
     ENQUEUE_REQUEST(Reset, false);
   }
+
+  // RPC handler for making a callable.
+  void MakeCallableHandler(
+      MasterCall<MakeCallableRequest, MakeCallableResponse>* call) {
+    master_impl_->MakeCallable(&call->request, &call->response,
+                               [call](const Status& status) {
+                                 call->SendResponse(ToGrpcStatus(status));
+                               });
+    ENQUEUE_REQUEST(MakeCallable, false);
+  }
+
+  // RPC handler for running a callable.
+  void RunCallableHandler(
+      MasterCall<RunCallableRequest, RunCallableResponse>* call) {
+    auto* trace = TraceRpc("RunCallable/Server", call->client_metadata());
+    CallOptions* call_opts = new CallOptions;
+    // The timeout may be overridden by a non-zero timeout in the
+    // callable's `RunOptions`; this overriding will happen inside the
+    // `MasterSession` implementation.
+    call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
+    call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+    master_impl_->RunCallable(call_opts, &call->request, &call->response,
+                              [call, call_opts, trace](const Status& status) {
+                                call->ClearCancelCallback();
+                                delete call_opts;
+                                delete trace;
+                                call->SendResponse(ToGrpcStatus(status));
+                              });
+    ENQUEUE_REQUEST(RunCallable, false);
+  }
+
+  // RPC handler for making a callable.
+  void ReleaseCallableHandler(
+      MasterCall<ReleaseCallableRequest, ReleaseCallableResponse>* call) {
+    master_impl_->ReleaseCallable(&call->request, &call->response,
+                                  [call](const Status& status) {
+                                    call->SendResponse(ToGrpcStatus(status));
+                                  });
+    ENQUEUE_REQUEST(ReleaseCallable, false);
+  }
+
 #undef ENQUEUE_REQUEST
 
   // Start tracing, including the ID attached to the RPC.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index e2016e824c..c832adbbbf 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -36,6 +36,9 @@ static const char* grpcMasterService_method_names[] = {
     "/tensorflow.MasterService/CloseSession",
     "/tensorflow.MasterService/ListDevices",
     "/tensorflow.MasterService/Reset",
+    "/tensorflow.MasterService/MakeCallable",
+    "/tensorflow.MasterService/RunCallable",
+    "/tensorflow.MasterService/ReleaseCallable",
 };
 
 std::unique_ptr<MasterService::Stub> MasterService::NewStub(
@@ -64,7 +67,14 @@ MasterService::Stub::Stub(
       rpcmethod_ListDevices_(grpcMasterService_method_names[5],
                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_Reset_(grpcMasterService_method_names[6],
-                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
+                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_MakeCallable_(grpcMasterService_method_names[7],
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_RunCallable_(grpcMasterService_method_names[8],
+                             ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_ReleaseCallable_(grpcMasterService_method_names[9],
+                                 ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                 channel) {}
 
 ::grpc::Status MasterService::Stub::CreateSession(
     ::grpc::ClientContext* context, const CreateSessionRequest& request,
@@ -115,8 +125,29 @@ MasterService::Stub::Stub(
                                              context, request, response);
 }
 
+::grpc::Status MasterService::Stub::MakeCallable(
+    ::grpc::ClientContext* context, const MakeCallableRequest& request,
+    MakeCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_MakeCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::RunCallable(
+    ::grpc::ClientContext* context, const RunCallableRequest& request,
+    RunCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_RunCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::ReleaseCallable(
+    ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+    ReleaseCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReleaseCallable_, context, request, response);
+}
+
 MasterService::AsyncService::AsyncService() {
-  for (int i = 0; i < 7; ++i) {
+  for (int i = 0; i < 10; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 6ae94b7441..3c382738c4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -79,6 +79,15 @@ class MasterService final {
     virtual ::grpc::Status Reset(::grpc::ClientContext* context,
                                  const ResetRequest& request,
                                  ResetResponse* response) = 0;
+    virtual ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                        const MakeCallableRequest& request,
+                                        MakeCallableResponse* response) = 0;
+    virtual ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                                       const RunCallableRequest& request,
+                                       RunCallableResponse* response) = 0;
+    virtual ::grpc::Status ReleaseCallable(
+        ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+        ReleaseCallableResponse* response) = 0;
   };
   class Stub final : public StubInterface {
    public:
@@ -104,6 +113,15 @@ class MasterService final {
     ::grpc::Status Reset(::grpc::ClientContext* context,
                          const ResetRequest& request,
                          ResetResponse* response) override;
+    ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                const MakeCallableRequest& request,
+                                MakeCallableResponse* response) override;
+    ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                               const RunCallableRequest& request,
+                               RunCallableResponse* response) override;
+    ::grpc::Status ReleaseCallable(::grpc::ClientContext* context,
+                                   const ReleaseCallableRequest& request,
+                                   ReleaseCallableResponse* response) override;
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
@@ -114,6 +132,9 @@ class MasterService final {
     const ::grpc::internal::RpcMethod rpcmethod_CloseSession_;
     const ::grpc::internal::RpcMethod rpcmethod_ListDevices_;
     const ::grpc::internal::RpcMethod rpcmethod_Reset_;
+    const ::grpc::internal::RpcMethod rpcmethod_MakeCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_RunCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReleaseCallable_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -179,6 +200,30 @@ class MasterService final {
       ::grpc::Service::RequestAsyncUnary(6, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+    void RequestMakeCallable(
+        ::grpc::ServerContext* context, MakeCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<MakeCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(7, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestRunCallable(
+        ::grpc::ServerContext* context, RunCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<RunCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(8, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestReleaseCallable(
+        ::grpc::ServerContext* context, ReleaseCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<ReleaseCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(9, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
   };
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 1088e9be66..1b92a79a67 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -95,6 +95,28 @@ class GrpcRemoteMaster : public MasterInterface {
                 &MasterServiceStub::Reset);
   }
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::MakeCallable);
+  }
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::RunCallable);
+  }
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::ReleaseCallable);
+  }
+
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
   port::Tracing::TraceMe TraceRpc(StringPiece name,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 3e79a40683..fd1c150fa7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -91,6 +91,15 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+Status GrpcSession::Handle(string* out_handle) {
+  mutex_lock l(mu_);
+  if (handle_.empty()) {
+    return errors::InvalidArgument("A session is not created yet....");
+  }
+  *out_handle = handle_;
+  return Status::OK();
+}
+
 Status GrpcSession::CreateImpl(CallOptions* call_options,
                                const GraphDef& graph) {
   {
@@ -274,14 +283,9 @@ Status GrpcSession::Run(const std::vector<std::pair<string, Tensor>>& inputs,
 Status GrpcSession::RunProto(CallOptions* call_options,
                              MutableRunStepRequestWrapper* req,
                              MutableRunStepResponseWrapper* resp) {
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req->set_session_handle(handle_);
-  }
+  string handle;
+  TF_RETURN_IF_ERROR(Handle(&handle));
+  req->set_session_handle(handle);
   return master_->RunStep(call_options, req, resp);
 }
 
@@ -293,14 +297,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   PartialRunSetupRequest req;
   PartialRunSetupResponse resp;
   CallOptions call_options;
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req.set_session_handle(handle_);
-  }
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   for (const string& feed : input_names) {
     req.add_feed(feed);
   }
@@ -400,6 +397,55 @@ Status GrpcSession::Reset(const SessionOptions& options,
   return ret;
 }
 
+Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
+                                 CallableHandle* out_handle) {
+  MakeCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  *req.mutable_options() = callable_options;
+  MakeCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->MakeCallable(&call_options, &req, &resp));
+  *out_handle = resp.handle();
+  return Status::OK();
+}
+
+Status GrpcSession::RunCallable(CallableHandle handle,
+                                const std::vector<Tensor>& feed_tensors,
+                                std::vector<Tensor>* fetch_tensors,
+                                RunMetadata* run_metadata) {
+  RunCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  for (const Tensor& feed : feed_tensors) {
+    feed.AsProtoTensorContent(req.mutable_feed()->Add());
+  }
+
+  RunCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->RunCallable(&call_options, &req, &resp));
+  for (const TensorProto& fetch : resp.fetch()) {
+    Tensor fetch_tensor;
+    if (!fetch_tensor.FromProto(cpu_allocator(), fetch)) {
+      return errors::Internal(
+          "Could not parse fetched tensor data in response from master.");
+    }
+    fetch_tensors->push_back(std::move(fetch_tensor));
+  }
+  return Status::OK();
+}
+
+Status GrpcSession::ReleaseCallable(CallableHandle handle) {
+  ReleaseCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  ReleaseCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  return master_->ReleaseCallable(&call_options, &req, &resp);
+}
+
 class GrpcSessionFactory : public SessionFactory {
  public:
   bool AcceptsOptions(const SessionOptions& options) override {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index d87956a135..63795117f9 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -82,20 +82,27 @@ class GrpcSession : public Session {
   Status Close() override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRunSetup(const std::vector<string>& input_names,
-                                 const std::vector<string>& output_names,
-                                 const std::vector<string>& target_nodes,
-                                 string* handle) override;
+  Status PRunSetup(const std::vector<string>& input_names,
+                   const std::vector<string>& output_names,
+                   const std::vector<string>& target_nodes,
+                   string* handle) override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRun(
-      const string& handle,
-      const std::vector<std::pair<string, Tensor> >& inputs,
-      const std::vector<string>& output_names,
-      std::vector<Tensor>* outputs) override;
+  Status PRun(const string& handle,
+              const std::vector<std::pair<string, Tensor> >& inputs,
+              const std::vector<string>& output_names,
+              std::vector<Tensor>* outputs) override;
 
   Status ListDevices(std::vector<DeviceAttributes>* response) override;
 
+  Status MakeCallable(const CallableOptions& callable_options,
+                      CallableHandle* out_handle) override;
+  Status RunCallable(CallableHandle handle,
+                     const std::vector<Tensor>& feed_tensors,
+                     std::vector<Tensor>* fetch_tensors,
+                     RunMetadata* run_metadata) override;
+  Status ReleaseCallable(CallableHandle handle) override;
+
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
@@ -111,6 +118,8 @@ class GrpcSession : public Session {
   // The current version of the graph.
   int64 current_graph_version_ GUARDED_BY(mu_);
 
+  Status Handle(string* out_handle) LOCKS_EXCLUDED(mu_);
+
   Status RunHelper(const RunOptions& run_options,
                    const std::vector<std::pair<string, Tensor> >& inputs,
                    const std::vector<string>& output_tensor_names,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index 335c3febe2..45b15a54a2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -120,6 +120,49 @@ TEST(GrpcSessionTest, BasicNonProtoAPI) {
   }
 }
 
+TEST(GrpcSessionTest, BasicCallable) {
+  GraphDef graph;
+  string node_names[3];
+  // c = a * b
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  for (int iters = 0; iters < 25; ++iters) {
+    TF_CHECK_OK(session->Create(graph));
+    {
+      // Just run to target node
+      CallableOptions opts;
+      opts.add_target(node_names[2]);
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      TF_CHECK_OK(session->RunCallable(handle, {}, nullptr, nullptr));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+    {
+      // Run to a target node and a real tensor
+      CallableOptions opts;
+      opts.add_target(node_names[1]);
+      opts.add_fetch(node_names[2] + ":0");
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      std::vector<Tensor> outputs;
+      TF_CHECK_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+      ASSERT_EQ(1, outputs.size());
+      ASSERT_TRUE(outputs[0].IsInitialized());
+      ASSERT_EQ(4.0, outputs[0].flat<float>()(0));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+
+    TF_CHECK_OK(session->Close());
+  }
+}
+
 TEST(GrpcSessionTest, BasicNonProtoAPIConsistentOrder) {
   GraphDef graph;
   string node_names[3];
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 0437cb1b83..96c91536f7 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -23,6 +23,7 @@ option java_package = "org.tensorflow.distruntime";
 
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
+import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
@@ -264,3 +265,70 @@ message ListDevicesResponse {
   repeated DeviceAttributes local_device = 1;
   repeated DeviceAttributes remote_device = 2;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// MakeCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message MakeCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // Options that define the behavior of the created callable.
+  CallableOptions options = 2;
+}
+
+message MakeCallableResponse {
+  // A handle to the created callable.
+  int64 handle = 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// RunCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message RunCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+
+  // Values of the tensors passed as arguments to the callable, in the order
+  // defined in the CallableOptions.feed field passed to MakeCallable.
+  repeated TensorProto feed = 3;
+}
+
+message RunCallableResponse {
+  // Values of the tensors returned by the callable, in the order defined in the
+  // CallableOptions.fetch field passed to MakeCallable.
+  repeated TensorProto fetch = 1;
+
+  // Returned metadata if requested in the options.
+  RunMetadata metadata = 2;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ReleaseCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message ReleaseCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+}
+
+message ReleaseCallableResponse {
+}
diff --git a/tensorflow/core/protobuf/master_service.proto b/tensorflow/core/protobuf/master_service.proto
index 771c80562a..1170611f37 100644
--- a/tensorflow/core/protobuf/master_service.proto
+++ b/tensorflow/core/protobuf/master_service.proto
@@ -107,4 +107,13 @@ service MasterService {
   // will no longer affect fresh ones via the resources in containers listed in
   // the ResetRequest.  See ResetRequest for more details.
   rpc Reset(ResetRequest) returns (ResetResponse);
+
+  // Registers a callable for execution with RunCallable.
+  rpc MakeCallable(MakeCallableRequest) returns (MakeCallableResponse);
+
+  // Executes a callable registered with MakeCallable.
+  rpc RunCallable(RunCallableRequest) returns (RunCallableResponse);
+
+  // Frees resources associated with a callable registered with MakeCallable.
+  rpc ReleaseCallable(ReleaseCallableRequest) returns (ReleaseCallableResponse);
 }
-- 
GitLab


From 1b6202dd44a3f8881bcaa1034543af9c981067c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 17:46:39 -0700
Subject: [PATCH 0409/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 191964971

---
 tensorflow/go/op/wrappers.go | 2450 +++++++++++++++++-----------------
 1 file changed, 1225 insertions(+), 1225 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 0fd2177df7..53959d4956 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -1845,6 +1845,262 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCounts",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
+
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+// find the unique elements.
+//
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueV2",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// UniqueAttr is an optional argument to Unique.
+type UniqueAttr func(optionalAttr)
+
+// UniqueOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueOutIdx(value tf.DataType) UniqueAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.
+func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unique",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Shuffle dimensions of x according to a permutation and conjugate the result.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConjugateTranspose",
+		Input: []tf.Input{
+			x, perm,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a tensor.
+//
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
+//
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+//
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
+//
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
+//
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+//
+// # -1 can also be used to infer the shape
+//
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Reshape",
+		Input: []tf.Input{
+			tensor, shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the complex conjugate of a complex number.
 //
 // Given a tensor `input` of complex numbers, this operation returns a tensor of
@@ -2671,120 +2927,6 @@ func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
-
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns (x - y)(x - y) element-wise.
-//
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Forwards the input to the output.
-//
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
-//
-// Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
-//
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LoopCond",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ApproximateEqualAttr is an optional argument to ApproximateEqual.
 type ApproximateEqualAttr func(optionalAttr)
 
@@ -3257,6 +3399,69 @@ func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Transpose",
+		Input: []tf.Input{
+			x, perm,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
+
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the minimum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Min",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
 type Conv2DBackpropFilterAttr func(optionalAttr)
 
@@ -4419,6 +4624,66 @@ func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the inverse permutation of a tensor.
+//
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
+//
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+//
+// The values must include 0. There can be no duplicate values or negative values.
+//
+// For example:
+//
+// ```
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InvertPermutation",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes log softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogSoftmax",
+		Input: []tf.Input{
+			logits,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the truth value of (x <= y) element-wise.
 //
 // *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
@@ -5657,70 +5922,10 @@ func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
-		Input: []tf.Input{
-			logits,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
-
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
+
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the bias tensor will be added to the last dimension
@@ -5910,10 +6115,367 @@ func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "Acos",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
+
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
+//
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeAndDequantize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns locations of nonzero / true values in a tensor.
+//
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
+//
+// For example:
+//
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
+//
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+//
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+//
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Where",
+		Input: []tf.Input{
+			condition,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
+
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues a tuple of one or more tensors from the given queue.
+//
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueDequeueV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
+	}
+	return components
+}
+
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Erf",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Floor",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
+
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
+	}
+}
+
+// Returns a one-hot tensor.
+//
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
+//
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
+//
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+//
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OneHot",
 		Input: []tf.Input{
-			x,
+			indices, depth, on_value, off_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -6541,6 +7103,34 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 	return op.Output(0)
 }
 
+// Reads the value of a variable.
+//
+// The tensor returned by this operation is immutable.
+//
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "ReadVariableOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes tan of x element-wise.
 func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -6843,60 +7433,6 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
-
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
 type StatelessRandomNormalAttr func(optionalAttr)
 
@@ -7475,85 +8011,6 @@ func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candida
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
-
-// CumsumReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Cumsum",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
 type QuantizedRelu6Attr func(optionalAttr)
 
@@ -8108,101 +8565,23 @@ func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	}
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
-		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
-
-// CumprodReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8211,14 +8590,13 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			x, axis,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Computes the mean along segments of a tensor.
@@ -9607,24 +9985,6 @@ func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (o
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-//
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
 type TensorArrayGatherV3Attr func(optionalAttr)
 
@@ -9857,249 +10217,61 @@ func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
 	opspec := tf.OpSpec{
 		Type: "GetSessionHandle",
 		Input: []tf.Input{
-			value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erf",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
-
-// OneHotAxis sets the optional axis attribute to value.
-//
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
-//
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
-//
-//
-// Examples
-// =========
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-//
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
-//
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[2 x 2 x 3]`:
-//
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
-//
-// Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-//
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OneHot",
-		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// The tensor returned by this operation is immutable.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			resource,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
@@ -11406,6 +11578,97 @@ func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LogUniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+//
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes softmax cross entropy cost and gradients to backpropagate.
 //
 // Inputs are the logits, not probabilities.
@@ -12768,69 +13031,6 @@ func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Min",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes sigmoid of `x` element-wise.
 //
 // Specifically, `y = 1 / (1 + exp(-x))`.
@@ -16533,30 +16733,6 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
-//
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ShapeAttr is an optional argument to Shape.
 type ShapeAttr func(optionalAttr)
 
@@ -16578,20 +16754,44 @@ func ShapeOutType(value tf.DataType) ShapeAttr {
 // # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 // shape(t) ==> [2, 2, 3]
 // ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Shape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "Pow",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -16951,6 +17151,47 @@ func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (pr
 	return op.Output(0)
 }
 
+// Returns (x - y)(x - y) element-wise.
+//
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Forwards the input to the output.
+//
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
+//
+// Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LoopCond",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the gradient for the inverse of `x` wrt its input.
 //
 // Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
@@ -17053,272 +17294,75 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
-
-// RandomGammaSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from the Gamma distribution(s) described by alpha.
-//
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
-//
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
-//
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomGamma",
-		Input: []tf.Input{
-			shape, alpha,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_max"] = value
-	}
-}
-
-// Use QuantizeAndDequantizeV2 instead.
-//
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns locations of nonzero / true values in a tensor.
-//
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
-//
-// For example:
-//
-// ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
-//
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-//
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-//
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			condition,
+			input, input_min, input_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["seed"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			handle,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
 // RandomUniformIntAttr is an optional argument to RandomUniformInt.
@@ -17816,6 +17860,164 @@ func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_uppe
 	return op.Output(0)
 }
 
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
+//
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumsum",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
+//
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumprodReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumprod",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
 type QuantizedMatMulAttr func(optionalAttr)
 
@@ -21902,80 +22104,64 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
-// Reshapes a tensor.
-//
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
-// For example:
+// **A note about the input flow_in:**
 //
-// ```
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
 //
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
 //
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+// **A note about the source attribute:**
 //
-// # -1 can also be used to infer the shape
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
 //
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
 //
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
 //
 // Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "Reshape",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			tensor, shape,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // Creates a dataset that splits a SparseTensor into elements row-wise.
@@ -24260,66 +24446,6 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
-//
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
-//
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
-//
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
-//
-// Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
 // Each comparison returns a boolean `true` (if `input_value > threshold`)
@@ -26991,58 +27117,6 @@ func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (aud
 	return op.Output(0), op.Output(1)
 }
 
-// UniqueAttr is an optional argument to Unique.
-type UniqueAttr func(optionalAttr)
-
-// UniqueOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueOutIdx(value tf.DataType) UniqueAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.1-D.
-func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unique",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Concatenates a list of `N` tensors along the first dimension.
 //
 // The input tensors are all required to have size 1 in the first dimension.
@@ -27813,77 +27887,3 @@ func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Ou
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Shuffle dimensions of x according to a permutation and conjugate the result.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConjugateTranspose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UniqueV2Attr is an optional argument to UniqueV2.
-type UniqueV2Attr func(optionalAttr)
-
-// UniqueV2OutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
-// find the unique elements.
-//
-// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
-// value of x in the output y.
-func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniqueV2",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-- 
GitLab


From c7c108bfca264aa82a01f0c30d4db386f8e20bff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 17:49:47 -0700
Subject: [PATCH 0410/1262] [XLA] Redesign: implement Collapse and migrate
 reshape_test.

PiperOrigin-RevId: 191965245
---
 .../xla/client/xla_client/xla_builder.cc      |  40 ++++-
 tensorflow/compiler/xla/tests/BUILD           |   3 +-
 tensorflow/compiler/xla/tests/reshape_test.cc | 162 +++++++++---------
 3 files changed, 120 insertions(+), 85 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index e623639577..3d0cb35b48 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -593,7 +593,45 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
 
 XlaOp XlaBuilder::Collapse(const XlaOp& operand,
                            tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (dimensions.size() <= 1) {
+      // Not collapsing anything, trivially we can return the operand versus
+      // enqueueing a trivial reshape.
+      return operand;
+    }
+
+    // Out-of-order collapse is not supported.
+    // Checks that the collapsed dimensions are in order and consecutive.
+    for (tensorflow::gtl::ArraySlice<int64>::size_type i = 1;
+         i < dimensions.size(); ++i) {
+      if (dimensions[i] - 1 != dimensions[i - 1]) {
+        return InvalidArgument(
+            "Collapsed dimensions are not in consecutive order.");
+      }
+    }
+
+    // Create a new sizes vector from the old shape, replacing the collapsed
+    // dimensions by the product of their sizes.
+    TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand));
+
+    VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape);
+    VLOG(3) << "dims to collapse: "
+            << tensorflow::str_util::Join(dimensions, ",");
+
+    std::vector<int64> new_sizes;
+    for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
+      if (i <= dimensions.front() || i > dimensions.back()) {
+        new_sizes.push_back(original_shape.dimensions(i));
+      } else {
+        new_sizes.back() *= original_shape.dimensions(i);
+      }
+    }
+
+    VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",")
+            << "]";
+
+    return Reshape(operand, new_sizes);
+  });
 }
 
 void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 072c5cd149..0276db9925 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1372,11 +1372,10 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 02272d6017..d7462d581b 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -20,11 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -53,11 +52,11 @@ class ReshapeTest : public ::testing::WithParamInterface<bool>,
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
 XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -68,9 +67,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
@@ -81,9 +80,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
@@ -95,11 +94,11 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
 XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -112,15 +111,14 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   auto a = builder.Neg(parameter);
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
+  builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
   auto expected_literal = Literal::CreateR1<float>({-1.0f});
   ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
@@ -131,10 +129,10 @@ XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(0, 3);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -147,11 +145,11 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -164,10 +162,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(3, 0);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -178,9 +176,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
 
 // Collapses a 2-dimensional row vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -191,9 +189,9 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) {
 
 // Collapses a 2-dimensional column vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial3x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -344,9 +342,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -359,10 +357,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -379,9 +377,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
 // with an incorrect result rank.
 //
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -394,10 +392,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -421,9 +419,9 @@ static Array3D<float> ArrayForDocR3Tests() {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -436,9 +434,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -456,9 +454,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -471,9 +469,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -491,9 +489,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -521,12 +519,12 @@ XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
   auto input_literal = Literal::CreateFromArray(t2x2x2x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
@@ -540,7 +538,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
 
 // As above, but uses reshape directly.
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
   t(0, 0, 0, 1) = 1;
@@ -551,7 +549,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
   auto input_literal = Literal::CreateFromArray(t);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -566,7 +564,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
 // Reshape various ranks to a scalar.
 XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
@@ -574,7 +572,7 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
     std::vector<int64> zeros(rank, 0);  // this is {0, ..., 0}.
     input_literal.Set<float>(zeros, 83.0f);
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input = CreateParameterAndTransferLiteral(0, input_literal, "input",
                                                    &b, &parameter);
     b.Reshape(parameter, dimensions, {});
@@ -586,9 +584,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, BadDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {}, {});
@@ -598,9 +596,9 @@ XLA_TEST_P(ReshapeTest, BadDimensions) {
 }
 
 XLA_TEST_P(ReshapeTest, BadNewSizes) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {1}, {});
@@ -609,7 +607,7 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) {
 }
 
 XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
@@ -635,7 +633,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
 
@@ -646,7 +644,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
       {222, 333, 444, 555, 666, 777, 888, 999},
   });
 
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
@@ -664,13 +662,13 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
 }
 
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
@@ -691,13 +689,13 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
@@ -717,7 +715,7 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -727,7 +725,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
@@ -739,7 +737,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -749,7 +747,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
@@ -762,7 +760,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
 XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -772,7 +770,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
@@ -789,7 +787,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
 }
 
 XLA_TEST_P(ReshapeTest, NoopReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -799,12 +797,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
@@ -827,12 +825,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -846,8 +844,8 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
@@ -880,8 +878,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -909,8 +907,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -938,8 +936,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -968,8 +966,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -997,8 +995,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
-- 
GitLab


From 12576beec31ae0d73cce8f96e418e628a0c01654 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Fri, 6 Apr 2018 17:51:40 -0700
Subject: [PATCH 0411/1262] We no longer need updates_collections in quant ops
 since we rely on the data dependency from Assign ops.

PiperOrigin-RevId: 191965466
---
 tensorflow/contrib/quantize/python/quant_ops.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index a4f7b1b221..5c0e17dc86 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -51,7 +51,6 @@ def LastValueQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
                       init_max=6.0,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='LastValueQuant',
                       reuse=None,
@@ -69,8 +68,6 @@ def LastValueQuantize(inputs,
       quantization ranges per output channel.
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -133,7 +130,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -146,7 +142,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
@@ -163,7 +158,6 @@ def MovingAvgQuantize(inputs,
                       init_min=-6.0,
                       init_max=6.0,
                       ema_decay=0.999,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='MovingAvgQuantize',
                       reuse=None,
@@ -182,8 +176,6 @@ def MovingAvgQuantize(inputs,
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
     ema_decay: EMA decay parameter.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -246,7 +238,6 @@ def MovingAvgQuantize(inputs,
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = moving_averages.assign_moving_average(
         min_var, batch_min, ema_decay, name='AssignMinEma')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -260,7 +251,6 @@ def MovingAvgQuantize(inputs,
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = moving_averages.assign_moving_average(
         max_var, batch_max, ema_decay, name='AssignMaxEma')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
-- 
GitLab


From a0268d58d6668fe2471b21a63db4ccc0179405de Mon Sep 17 00:00:00 2001
From: Yuanzhong Xu <yuanzx@google.com>
Date: Fri, 6 Apr 2018 18:27:18 -0700
Subject: [PATCH 0412/1262] [XLA] Make HloTestBase's hlo verifier to allow
 mixed precision.

PiperOrigin-RevId: 191968158
---
 tensorflow/compiler/xla/tests/hlo_test_base.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index e574644dea..21f71fc91b 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -91,7 +91,7 @@ HloTestBase::HloTestBase()
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
-  hlo_verifier_ = MakeUnique<HloVerifier>();
+  hlo_verifier_ = MakeUnique<HloVerifier>(/*allow_mixed_precision=*/true);
 }
 
 /* static */
@@ -142,8 +142,7 @@ StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
           "reference preprocessor must not modify the program shape");
     }
   }
-  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
-                                     reference_module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(reference_module.get()).status());
   return std::move(reference_module);
 }
 
@@ -151,8 +150,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
     std::unique_ptr<HloModule> module, const ArraySlice<Literal*> arguments,
     const optional<ErrorSpec>& error, bool run_hlo_passes,
     const std::function<void(HloModule*)>& reference_preprocessor) {
-  TF_RETURN_IF_ERROR(
-      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(module.get()).status());
   TF_ASSIGN_OR_RETURN(auto reference_module,
                       MakeReferenceModule(*module, reference_preprocessor));
 
-- 
GitLab


From 6cb842114d1c016fafddbd6397b83df736b5b45e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 19:00:59 -0700
Subject: [PATCH 0413/1262] Automated g4 rollback of changelist 191963758

PiperOrigin-RevId: 191970209
---
 .../tf2xla/functionalize_control_flow.cc      |   8 -
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 211 +-----------------
 tensorflow/contrib/tpu/python/tpu/tpu_test.py |   2 +-
 tensorflow/python/eager/function.py           |  10 -
 tensorflow/python/framework/ops.py            |  24 +-
 tensorflow/python/ops/control_flow_ops.py     |  10 -
 tensorflow/python/ops/gradients_impl.py       |  48 +---
 7 files changed, 21 insertions(+), 292 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 16b9142cbf..8b7beef83e 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -901,14 +901,6 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       int src_depth = switch_depth[src_id];
       if (!e->IsControlEdge() || new_switch_depth == src_depth) {
         if (src_depth != new_switch_depth) {
-          // TODO(b/77601805) remove this when outside_compilation supports
-          // control flow.
-          if (str_util::StrContains(src->name(), "outside_compilation") ||
-              str_util::StrContains(n->name(), "outside_compilation")) {
-            return errors::InvalidArgument(
-                "outside_compilation is not yet supported within TensorFlow "
-                "control flow constructs b/77601805");
-          }
           return errors::InvalidArgument(
               "Unable to functionalize control flow in graph: Operand ('",
               src->name(), "') and operator ('", n->name(),
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index a1690dadff..3f2db548ac 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -25,8 +25,6 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -58,7 +56,6 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
-_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
 def _tpu_system_device_name(job):
@@ -124,16 +121,8 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name, num_replicas):
+  def __init__(self, name):
     super(TPUReplicateContext, self).__init__()
-    self._num_replicas = num_replicas
-    self._outer_device_function_stack = None
-    self._oc_dev_fn_stack = None
-    self._outside_compilation_cluster = None
-    self._outside_compilation_counter = 0
-    self._in_gradient_colocation = None
-    self._gradient_colocation_stack = []
-    self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
 
@@ -147,143 +136,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
         logging.warning("... and %d more" %
                         (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
-  def EnterGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      self._gradient_colocation_stack.append(op)
-      if not self._outside_compilation_cluster:
-        try:
-          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
-          if self._in_gradient_colocation:
-            raise NotImplementedError(
-                "Cannot nest gradient colocation operations outside compilation"
-            )
-          if gradient_uid == "__unsupported__":
-            raise NotImplementedError(
-                "No gradient_uid calling gradient within outside_compilation")
-          # When we take the gradient of an op X in an
-          # outside_compilation cluster C in a forward computation we
-          # would like to put the ops corresponding to the gradient of
-          # X into a new outside_compilation cluster C'. However, if
-          # we take the gradient of X twice, the second one should get
-          # yet another new outside_compilation cluster C''.
-          #
-          # The mechanism we adopt is to use a 'root_cluster' which is
-          # the cluster that X was in before we took gradients, and a
-          # 'gradient_uid' which is different for every invocation of
-          # gradients, and put the gradient of X in cluster
-          # 'root_cluster.gradient_uid'.
-          #
-          # When the gradient code adds multiple Ops, it asks them to
-          # be colocated either with the original Op X, or with one of
-          # the preceding Ops that was added to the gradient. In other
-          # words, we want to detect the case where we are colocating
-          # with an Op that is in cluster root_cluster.gradient_uid
-          # and put the new Op in that same cluster if the
-          # gradient_uid is the same (the case that we are in the same
-          # invocation of gradients, and just adding new Ops to the
-          # cluster); and in a different cluster if the gradient_uids
-          # are different (the case that we are in a new invocation of
-          # gradients, taking the gradient of a previously-computed
-          # gradient).
-          self._in_gradient_colocation = op
-          parts = outside_attr.split(".")
-          if len(parts) > 1:
-            uid = parts[-1]
-            if uid == gradient_uid:
-              # Keep using the same cluster
-              cluster = outside_attr
-            else:
-              # We're taking the gradient of a gradient so make a new
-              # cluster attr, adding a new '.uid' on the end to
-              # preserve the invariant that the gradient_uid is the
-              # suffix after the last '.' in the attr.
-              cluster = outside_attr + "." + gradient_uid
-          else:
-            # We're taking the gradient of an Op in the forward pass, so
-            # make a new cluster combining the Op's cluster and the
-            # gradient id.
-            cluster = outside_attr + "." + gradient_uid
-          self._EnterOutsideCompilationScope(cluster=cluster)
-        except ValueError:
-          # The attr was not present: do nothing.
-          pass
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      if not self._gradient_colocation_stack:
-        raise errors.InternalError(
-            op.node_def, op,
-            "Badly nested gradient colocation: empty stack when popping Op " +
-            op.name)
-      last_op = self._gradient_colocation_stack.pop()
-      if op is last_op:
-        if op is self._in_gradient_colocation:
-          self._in_gradient_colocation = None
-          self._ExitOutsideCompilationScope()
-      else:
-        raise errors.InternalError(
-            op.node_def, op, "Badly nested gradient colocation, expected " +
-            last_op + ", got " + op.name)
-
-  def _EnterOutsideCompilationScope(self, cluster=None):
-
-    class FakeOp(object):
-      """A helper class to determine the current device.
-
-      Supports only the device set/get methods needed to run the
-      graph's _apply_device_function method.
-      """
-
-      def __init__(self):
-        self._device = ""
-
-      @property
-      def device(self):
-        return self._device
-
-      def _set_device(self, device):
-        self._device = device.to_string()
-
-    if self._outside_compilation_cluster:
-      raise NotImplementedError("Cannot nest outside_compilation clusters")
-    if cluster:
-      self._outside_compilation_cluster = cluster
-    else:
-      self._outside_compilation_cluster = str(self._outside_compilation_counter)
-      self._outside_compilation_counter += 1
-    graph = ops.get_default_graph()
-    fake_op = FakeOp()
-    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
-    device = pydev.DeviceSpec.from_string(fake_op.device)
-    if (device.device_type == "TPU_REPLICATED_CORE" and
-        device.device_index is not None):
-      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
-                                     str(device.device_index))
-    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
-    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
-
-  def _ExitOutsideCompilationScope(self):
-    if not self._outside_compilation_cluster:
-      raise NotImplementedError(
-          "Attempted to exit outside_compilation scope when not in scope")
-    self._outside_compilation_cluster = None
-    graph = ops.get_default_graph()
-    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
-
-  def Enter(self):
-    if not self._outer_device_function_stack:
-      # Capture the device function stack at the time of first entry
-      # since that is the stack that will be used outside_compilation.
-      graph = ops.get_default_graph()
-      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
-    super(TPUReplicateContext, self).Enter()
-
-  def Exit(self):
-    super(TPUReplicateContext, self).Exit()
-
-  def HostComputeCore(self):
-    return self._host_compute_core
-
   def AddOp(self, op):
     self._AddOpInternal(op)
 
@@ -305,16 +157,9 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise ValueError("TPU computations cannot be nested")
     op._set_attr(_TPU_REPLICATE_ATTR,
                  attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
-    if self._outside_compilation_cluster:
-      op._set_attr(
-          _OUTSIDE_COMPILATION_ATTR,
-          attr_value_pb2.AttrValue(
-              s=compat.as_bytes(self._outside_compilation_cluster)))
-    if self._num_replicas > 1 or not self._outside_compilation_cluster:
-      # Prevent feeding or fetching anything that is being compiled,
-      # and any replicated outside_compilation Op.
-      op.graph.prevent_feeding(op)
-      op.graph.prevent_fetching(op)
+    # pylint: enable=protected-access
+    op.graph.prevent_feeding(op)
+    op.graph.prevent_fetching(op)
 
   def AddValue(self, val):
     result = val
@@ -336,45 +181,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
-def outside_compilation(computation, args=None):
-  """Builds part of a computation outside any current TPU replicate scope.
-
-  Args:
-    computation: A Python function that builds the computation to
-      place on the host.
-    args: Inputs to pass to computation.
-  Returns:
-    The Tensors returned by computation.
-  """
-  graph = ops.get_default_graph()
-
-  # If we are in a TPUReplicateContext, signal that we are now
-  # outside_compilation
-  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  retval = computation(*args)
-
-  # If we are in a TPUReplicateContext, signal that we are no longer
-  # outside_compilation
-  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  if initial_context is not final_context:
-    raise NotImplementedError(
-        "Control-flow context cannot be different at start and end of an "
-        "outside_compilation scope")
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  return retval
-
-
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
@@ -474,8 +280,7 @@ def replicate(computation,
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
-  context = TPUReplicateContext(
-      name=graph.unique_name("cluster"), num_replicas=num_replicas)
+  context = TPUReplicateContext(name=graph.unique_name("cluster"))
   try:
     context.Enter()
 
@@ -556,12 +361,6 @@ def replicate(computation,
   finally:
     context.report_unsupported_operations()
     context.Exit()
-    host_compute_core = context.HostComputeCore()
-
-  if host_compute_core:
-    attr_value = attr_value_pb2.AttrValue()
-    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
-    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
 
   # Fan-out: Builds a TPUReplicatedOutput node for each output.
   outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index c3882b8a27..336d8260c3 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -37,7 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context", 1)
+    context = tpu.TPUReplicateContext(b"context")
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 5168ad3b18..61859d6be3 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -223,16 +223,6 @@ class HelperContext(object):
     else:
       return val
 
-  def EnterGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.EnterGradientColocation(op, gradient_uid)
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.ExitGradientColocation(op, gradient_uid)
-
   def __enter__(self):
     # pylint: disable=protected-access
     self._g = ops.get_default_graph()
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e3ca5a4977..2574fa57a4 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -4179,19 +4179,6 @@ class Graph(object):
     """
     return self._name_stack
 
-  @tf_contextlib.contextmanager
-  def _colocate_with_for_gradient(self, op, gradient_uid,
-                                  ignore_existing=False):
-    with self.colocate_with(op, ignore_existing):
-      if gradient_uid is not None and self._control_flow_context is not None:
-        try:
-          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
-          yield
-        finally:
-          self._control_flow_context.ExitGradientColocation(op, gradient_uid)
-      else:
-        yield
-
   @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
@@ -4971,7 +4958,8 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
-def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
+@tf_export("colocate_with")
+def colocate_with(op, ignore_existing=False):
   if context.executing_eagerly():
     if op is not None:
       return device(op.device)
@@ -4985,13 +4973,7 @@ def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
       else:
         raise ValueError("Encountered an Eager-defined Tensor during graph "
                          "construction, but a function was not being built.")
-    return default_graph._colocate_with_for_gradient(
-        op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
-
-
-@tf_export("colocate_with")
-def colocate_with(op, ignore_existing=False):
-  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+    return default_graph.colocate_with(op, ignore_existing)
 
 
 @tf_export("control_dependencies")
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 7be8628073..e56ab93666 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1595,16 +1595,6 @@ class ControlFlowContext(object):
     last_context = self._context_stack.pop()
     graph._set_control_flow_context(last_context)
 
-  def EnterGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.EnterGradientColocation(op, gradient_uid)
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.ExitGradientColocation(op, gradient_uid)
-
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 13420b7f0e..44473ec69c 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -208,10 +208,7 @@ def _AsList(x):
   return x if isinstance(x, (list, tuple)) else [x]
 
 
-def _DefaultGradYs(grad_ys,
-                   ys,
-                   colocate_gradients_with_ops,
-                   gradient_uid="__unsupported__"):
+def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
   """Fill in default values for grad_ys.
 
   Args:
@@ -219,9 +216,6 @@ def _DefaultGradYs(grad_ys,
     ys: List of tensors.
     colocate_gradients_with_ops: If True, try colocating gradients with
       the corresponding op.
-    gradient_uid: A unique identifier within the graph indicating
-      which invocation of gradients is being executed. Used to cluster
-      ops for compilation.
 
   Returns:
     A list of gradients to use, without None.
@@ -237,7 +231,7 @@ def _DefaultGradYs(grad_ys,
   for i in xrange(len(grad_ys)):
     grad_y = grad_ys[i]
     y = ys[i]
-    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
+    with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
       if grad_y is None:
         if y.dtype.is_complex:
           raise TypeError(
@@ -344,10 +338,10 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
 
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
+def _maybe_colocate_with(op, colocate_gradients_with_ops):
   """Context to colocate with `op` if `colocate_gradients_with_ops`."""
   if colocate_gradients_with_ops:
-    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
+    with ops.colocate_with(op):
       yield
   else:
     yield
@@ -512,9 +506,6 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
   with ops.name_scope(
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
-    # Get a uid for this call to gradients that can be used to help
-    # cluster ops for compilation.
-    gradient_uid = ops.get_default_graph().unique_name("uid")
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = [
         x.handle if resource_variable_ops.is_resource_variable(x) else x
@@ -522,8 +513,7 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     ]
     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
         xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
-                             gradient_uid)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
 
     # The approach we take here is as follows: Create a list of all ops in the
     # subgraph between the ys and xs.  Visit these ops in reverse order of ids
@@ -580,11 +570,10 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     while queue:
       # generate gradient subgraph for op.
       op = queue.popleft()
-      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
+      with _maybe_colocate_with(op, colocate_gradients_with_ops):
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
-                                     aggregation_method)
+        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=True)
 
@@ -644,10 +633,7 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
               if gate_gradients and len([x for x in in_grads
                                          if x is not None]) > 1:
                 with ops.device(None):
-                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
-                      None,
-                      gradient_uid,
-                      ignore_existing=True):
+                  with ops.colocate_with(None, ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
@@ -803,7 +789,7 @@ def _LogOpGradients(op, out_grads, in_grads):
                ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
 
 
-def _MultiDeviceAddN(tensor_list, gradient_uid):
+def _MultiDeviceAddN(tensor_list):
   """Adds tensors from potentially multiple devices."""
   # Basic function structure comes from control_flow_ops.group().
   # Sort tensors according to their devices.
@@ -822,10 +808,7 @@ def _MultiDeviceAddN(tensor_list, gradient_uid):
 
   for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
     tensors = tensors_on_device[dev]
-    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
-        tensors[0].op,
-        gradient_uid,
-        ignore_existing=True):
+    with ops.colocate_with(tensors[0].op, ignore_existing=True):
       summands.append(math_ops.add_n(tensors))
 
   return math_ops.add_n(summands)
@@ -851,19 +834,12 @@ class AggregationMethod(object):
   EXPERIMENTAL_ACCUMULATE_N = 2
 
 
-def _AggregatedGrads(grads,
-                     op,
-                     gradient_uid,
-                     loop_state,
-                     aggregation_method=None):
+def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
   """Get the aggregated gradients for op.
 
   Args:
     grads: The map of memoized gradients.
     op: The op to get gradients for.
-    gradient_uid: A unique identifier within the graph indicating
-      which invocation of gradients is being executed. Used to cluster
-      ops for compilation.
     loop_state: An object for maintaining the state of the while loops in the
                 graph. It is of type ControlFlowState. None if the graph
                 contains no while loops.
@@ -940,7 +916,7 @@ def _AggregatedGrads(grads,
             out_grads[i] = running_sum
         else:
           used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
+          out_grads[i] = _MultiDeviceAddN(out_grad)
         logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                      tensor_shape, used)
       else:
-- 
GitLab


From 992d1ebaab7f234bc0b8f28c524236e3cea580ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 6 Apr 2018 19:18:11 -0700
Subject: [PATCH 0414/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 191971265
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 175 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 175 ++++++++++++++++++
 2 files changed, 350 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 1fc1de22bb..81546d52f2 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -16390,6 +16390,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -19014,6 +19063,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -50215,6 +50300,47 @@ op {
     }
   }
 }
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -69036,6 +69162,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2b56339f40..0cf66d2bd6 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7345,6 +7345,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -8635,6 +8684,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -23711,6 +23796,47 @@ op {
     }
   }
 }
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -32297,6 +32423,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
-- 
GitLab


From 4ce9d3a577ba3d9a0c46f05534510c00028652e6 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Fri, 6 Apr 2018 19:52:27 -0700
Subject: [PATCH 0415/1262] Place data format op on host if input tensor is in
 host memory.

PiperOrigin-RevId: 191972759
---
 tensorflow/core/grappler/optimizers/BUILD     |  1 +
 .../grappler/optimizers/layout_optimizer.cc   | 47 ++++++++++++++++++-
 .../optimizers/layout_optimizer_test.cc       | 24 +++++++++-
 3 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 122fd48584..e4bc030885 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -480,6 +480,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:virtual_placer",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 308eecd420..561226f945 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -17,9 +17,13 @@ limitations under the License.
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -363,6 +367,28 @@ std::vector<int> DataInputPos(const NodeDef& node) {
   return {};
 }
 
+bool IsHostMemory(const NodeDef& node, int output_port) {
+  DeviceNameUtils::ParsedName parsed_name;
+  if (DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) {
+    DeviceType device_type(parsed_name.type);
+    Status s = FindKernelDef(device_type, node, nullptr, nullptr);
+    if (s.ok()) {
+      tensorflow::MemoryTypeVector in_mtypes;
+      tensorflow::MemoryTypeVector out_mtypes;
+      s = tensorflow::MemoryTypesForNode(OpRegistry::Global(), device_type,
+                                         node, &in_mtypes, &out_mtypes);
+      if (s.ok()) {
+        if (out_mtypes[output_port] == HOST_MEMORY) {
+          return true;
+        }
+      }
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
 class GraphProcessor {
  public:
   GraphProcessor(const GraphProperties& graph_properties,
@@ -883,6 +909,23 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(3, w);
   }
 
+  string MaybeGetHostDevice(const string& input_name) const {
+    string device = node_->device();
+    DeviceNameUtils::ParsedName parsed_name;
+    if (DeviceNameUtils::ParseFullName(device, &parsed_name)) {
+      if (parsed_name.type != "CPU") {
+        NodeDef* input = node_map_->GetNode(input_name);
+        int port;
+        ParseNodeName(input_name, &port);
+        if (IsHostMemory(*input, port)) {
+          parsed_name.type = "CPU";
+          device = DeviceNameUtils::ParsedNameToString(parsed_name);
+        }
+      }
+    }
+    return device;
+  }
+
   NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
                                const string& op, DataType dtype,
                                bool nhwc_to_nchw) {
@@ -890,7 +933,9 @@ class NodeProcessor : public GraphProcessor {
     added_node->set_name(name);
     added_node->set_op(op);
     node_map_->AddNode(added_node->name(), added_node);
-    added_node->set_device(node_->device());
+    // The inputs of a DataFormat op could be in host memory for ops such as
+    // Reshape.
+    added_node->set_device(MaybeGetHostDevice(input_name));
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     added_node->mutable_attr()->insert({"T", attr_data_type});
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 1c912fcaa2..260347b0e8 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -158,7 +159,7 @@ class LayoutOptimizerTest : public ::testing::Test {
     return output.x_backprop;
   }
 
-  std::unique_ptr<VirtualCluster> virtual_cluster_;
+  std::unique_ptr<Cluster> virtual_cluster_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -1130,6 +1131,27 @@ TEST_F(LayoutOptimizerTest, LoopNoLiveLock) {
   EXPECT_EQ(mul_node->input(0),
             "Conv2D-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
 }
+
+TEST_F(LayoutOptimizerTest, DevicePlacement) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto shape = ops::Shape(s.WithOpName("s"), conv);
+  auto i = ops::Identity(s.WithOpName("i"), shape);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  VirtualPlacer virtual_placer(virtual_cluster_.get());
+  for (auto& node : *item.graph.mutable_node()) {
+    string device = virtual_placer.get_canonical_device_name(node);
+    node.set_device(device);
+  }
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto vec_permute =
+      node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(vec_permute->device(), "/device:CPU:0");
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From b874783ccdf4cc36cb3546e6b6a998cb8f3470bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 7 Apr 2018 11:05:10 +0800
Subject: [PATCH 0416/1262] tf.Dimension raises TypeError for tf.DType (#17086)

* BUG: raise error for Dtype

* TST: add test case
---
 tensorflow/python/BUILD                          | 1 +
 tensorflow/python/framework/tensor_shape.py      | 3 +++
 tensorflow/python/framework/tensor_shape_test.py | 5 +++++
 3 files changed, 9 insertions(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a8f1318509..753be82425 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -835,6 +835,7 @@ py_library(
     srcs = ["framework/tensor_shape.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dtypes",
         ":util",
         "//tensorflow/core:protos_all_py",
     ],
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index af2a5b1a7e..26069d9d90 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -30,6 +31,8 @@ class Dimension(object):
     """Creates a new Dimension with the given value."""
     if value is None:
       self._value = None
+    elif isinstance(value, dtypes.DType):
+      raise TypeError("Cannot convert %s to Dimension" % value)
     else:
       self._value = int(value)
       if (not isinstance(value, compat.bytes_or_text_types) and
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 4e8ce4d889..4f23922833 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -184,6 +185,10 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(str(tensor_shape.Dimension(7)), "7")
     self.assertEqual(str(tensor_shape.Dimension(None)), "?")
 
+  def testUnsupportedType(self):
+    with self.assertRaises(TypeError):
+      tensor_shape.Dimension(dtypes.string)
+      
   def testMod(self):
     four = tensor_shape.Dimension(4)
     nine = tensor_shape.Dimension(9)
-- 
GitLab


From 7f97f1bf69765be51b9f79f5134eb44736d216eb Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Fri, 6 Apr 2018 20:26:07 -0700
Subject: [PATCH 0417/1262] eager: s/tfe.GradientTape/tf.GradientTape/

In the next release (and at HEAD), GradientTape has graduated
out of the tf.contrib.eager namespace.

PiperOrigin-RevId: 191974294
---
 .../docs_src/programmers_guide/eager.md       | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index dc5b403428..595e6be4af 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -102,11 +102,11 @@ print(a.numpy())
 #     [3 4]]
 ```
 
-The `tfe` module contains symbols available to both eager and graph execution
+The `tf.contrib.eager` module contains symbols available to both eager and graph execution
 environments and is useful for writing code to [work with graphs](#work_with_graphs):
 
 ```py
-import tensorflow.contrib.eager as tfe
+tfe = tf.contrib.eager
 ```
 
 ## Dynamic control flow
@@ -213,25 +213,25 @@ their objects.
 [Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
 is useful for implementing machine learning algorithms such as
 [backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
-neural networks. During eager execution, use `tfe.GradientTape` to trace
+neural networks. During eager execution, use `tf.GradientTape` to trace
 operations for computing gradients later.
 
-`tfe.GradientTape` is an opt-in feature to provide maximal performance when
+`tf.GradientTape` is an opt-in feature to provide maximal performance when
 not tracing. Since different operations can occur during each call, all
 forward-pass operations get recorded to a "tape". To compute the gradient, play
-the tape backwards and then discard. A particular `tfe.GradientTape` can only
+the tape backwards and then discard. A particular `tf.GradientTape` can only
 compute one gradient; subsequent calls throw a runtime error.
 
 ```py
 w = tfe.Variable([[1.0]])
-with tfe.GradientTape() as tape:
+with tf.GradientTape() as tape:
   loss = w * w
 
 grad = tape.gradient(loss, [w])
 print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]
 ```
 
-Here's an example of `tfe.GradientTape` that records forward-pass operations
+Here's an example of `tf.GradientTape` that records forward-pass operations
 to train a simple model:
 
 ```py
@@ -251,8 +251,8 @@ def loss(weights, biases):
 
 # Return the derivative of loss with respect to weight and bias
 def grad(weights, biases):
-  with tfe.GradientTape() as tape:
-    loss_value = loss(weights, biases) 
+  with tf.GradientTape() as tape:
+    loss_value = loss(weights, biases)
   return tape.gradient(loss_value, [weights, biases])
 
 train_steps = 200
@@ -292,7 +292,7 @@ Final loss: 0.974
 W = 3.01582956314, B = 2.1191945076
 ```
 
-Replay the `tfe.GradientTape` to compute the gradients and apply them in a
+Replay the `tf.GradientTape` to compute the gradients and apply them in a
 training loop. This is demonstrated in an excerpt from the
 [mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py)
 example:
@@ -301,9 +301,9 @@ example:
 dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
                                               data.train.labels))
 ...
-for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
+for (batch, (images, labels)) in enumerate(dataset):
   ...
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     logits = model(images, training=True)
     loss_value = loss(logits, labels)
   ...
@@ -353,17 +353,17 @@ def loss(model, x, y):
   return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction)
 
 def grad(model, inputs, targets):
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     loss_value = loss(model, inputs, targets)
   return tape.gradient(loss_value, model.variables)
 
 optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
 
-x, y = tfe.Iterator(dataset_train).next()
+x, y = iter(dataset_train).next()
 print("Initial loss: {:.3f}".format(loss(model, x, y)))
 
 # Training loop
-for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+for (i, (x, y)) in enumerate(dataset_train):
   # Calculate derivatives of the input function with respect to its parameters.
   grads = grad(model, x, y)
   # Apply the gradient to the model
@@ -398,7 +398,7 @@ And for faster training, move the computation to a GPU:
 
 ```py
 with tf.device("/gpu:0"):
-  for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+  for (i, (x, y)) in enumerate(dataset_train):
     # minimize() is equivalent to the grad() and apply_gradients() calls.
     optimizer.minimize(lambda: loss(model, x, y),
                        global_step=tf.train.get_or_create_global_step())
@@ -411,7 +411,7 @@ training to make automatic differentiation easier. The parameters of a model can
 be encapsulated in classes as variables.
 
 Better encapsulate model parameters by using `tfe.Variable` with
-`tfe.GradientTape`. For example, the automatic differentiation example above
+`tf.GradientTape`. For example, the automatic differentiation example above
 can be rewritten:
 
 ```py
@@ -435,7 +435,7 @@ def loss(model, inputs, targets):
   return tf.reduce_mean(tf.square(error))
 
 def grad(model, inputs, targets):
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     loss_value = loss(model, inputs, targets)
   return tape.gradient(loss_value, [model.W, model.B])
 
@@ -585,14 +585,14 @@ for _ in range(iterations):
 
 ### Dynamic models
 
-`tfe.GradientTape` can also be used in dynamic models. This example for a
+`tf.GradientTape` can also be used in dynamic models. This example for a
 [backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
 algorithm looks like normal NumPy code, except there are gradients and is
 differentiable, despite the complex control flow:
 
 ```py
 def line_search_step(fn, init_x, rate=1.0):
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     # Variables are automatically recorded, but manually watch a tensor
     tape.watch(init_x)
     value = fn(init_x)
@@ -608,7 +608,7 @@ def line_search_step(fn, init_x, rate=1.0):
 
 ### Additional functions to compute gradients
 
-`tfe.GradientTape` is a powerful interface for computing gradients, but there
+`tf.GradientTape` is a powerful interface for computing gradients, but there
 is another [Autograd](https://github.com/HIPS/autograd)-style API available for
 automatic differentiation. These functions are useful if writing math code with
 only tensors and gradient functions, and without `tfe.Variables`:
-- 
GitLab


From 273495dc2c957402f832cae31a438e550db2b7f0 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Fri, 6 Apr 2018 21:00:42 -0700
Subject: [PATCH 0418/1262] Improvements to ResourceVariable + Variant code.

* Works in graph + eager modes
* Fixed shape inference
* Updated shape inference + refiner + constant eval code to support static shape tensor of `-1` meaning unknown shape.
* Gather and Scatter for Variants now properly supported.
* Variable copy-on-write for Variants now does a more shallow copy (as Variants are not expected to be updated "in-place" inside a variable; instead Variants will be updated via read-update-write inside a CriticalSection)

PiperOrigin-RevId: 191975898
---
 tensorflow/contrib/makefile/tf_op_files.txt   |   1 +
 .../core/common_runtime/shape_refiner.cc      |  26 ++++
 tensorflow/core/framework/shape_inference.cc  |  78 ++++++++++-
 tensorflow/core/framework/shape_inference.h   |  11 ++
 .../core/framework/shape_inference_test.cc    |  13 +-
 tensorflow/core/kernels/BUILD                 |   1 +
 .../core/kernels/dense_update_functor.cc      |  56 ++++++++
 .../core/kernels/dense_update_functor.h       |  14 ++
 tensorflow/core/kernels/gather_functor.h      |  13 +-
 .../core/kernels/resource_variable_ops.cc     | 118 ++++++++---------
 tensorflow/core/kernels/scatter_functor.h     | 118 ++++++++++++++---
 tensorflow/core/kernels/training_op_helpers.h |  30 +++--
 tensorflow/core/ops/list_ops.cc               |   4 +-
 tensorflow/python/framework/tensor_util.py    |  19 ++-
 .../python/kernel_tests/list_ops_test.py      | 121 +++++++++++++-----
 tensorflow/python/ops/list_ops.py             |  11 +-
 16 files changed, 491 insertions(+), 143 deletions(-)

diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 0bc4c5d473..d4c3f2eda8 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -151,6 +151,7 @@ tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
 tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
+tensorflow/core/kernels/dense_update_functor.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
 tensorflow/core/kernels/decode_wav_op.cc
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 1b7e3138ee..06dbe04986 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -431,6 +431,32 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   InferenceContext* src_context = GetContext(input_edge->src());
   if (src_context == nullptr) return errors::Internal("Missing src context");
   ShapeHandle src_shape = src_context->output(input_edge->src_output());
+
+  if (src_context->Value(src_context->Rank(src_shape)) == 0) {
+    Tensor t;
+    bool evaluated = false;
+    TF_RETURN_IF_ERROR(
+        EvaluateConstantTensorForEdge(node, dst_idx, &evaluated, &t));
+    if (!evaluated) {
+      return errors::InvalidArgument(
+          "Received a shape scalar with unknown static value.  A static value "
+          "of '-1' is required to represent an unknown shape.");
+    }
+    if (t.dims() == 0) {
+      if (t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      } else if (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      }
+    }
+    return errors::InvalidArgument(
+        "Received an invalid shape scalar with a static value that is not "
+        "'-1': ",
+        t.DebugString());
+  }
+
   TF_RETURN_IF_ERROR(src_context->WithRank(src_shape, 1, &src_shape));
 
   const string& src_op = input_edge->src()->type_string();
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 54ecaa5dd4..cc1ec47a83 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -726,6 +726,24 @@ ShapeHandle InferenceContext::Matrix(DimensionOrConstant dim1,
   return MakeShape({dim1, dim2});
 }
 
+Status InferenceContext::MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+    int input_idx, ShapeHandle* out) {
+  ShapeHandle input_shape;
+  TF_RETURN_IF_ERROR(WithRankAtMost(input(input_idx), 1, &input_shape));
+
+  requested_input_tensor_as_partial_shape_[input_idx] = true;
+  if (input_idx < input_tensors_as_shapes_.size() &&
+      input_tensors_as_shapes_[input_idx].IsSet() &&
+      RankKnown(input_tensors_as_shapes_[input_idx])) {
+    *out = input_tensors_as_shapes_[input_idx];
+    return Status::OK();
+  }
+
+  return InternalMakeShapeFromTensor(
+      true /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
+}
+
 Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
                                                   ShapeHandle* out) {
   ShapeHandle input_shape;
@@ -739,13 +757,31 @@ Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
     return Status::OK();
   }
 
-  return MakeShapeFromTensor(input_tensor(input_idx), input_shape, out);
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
 }
 
 Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
                                              ShapeHandle tensor_shape,
                                              ShapeHandle* out) {
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */, t, tensor_shape,
+      out);
+}
+
+Status InferenceContext::InternalMakeShapeFromTensor(
+    bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+    ShapeHandle tensor_shape, ShapeHandle* out) {
+  // Only callers who have set
+  if (!treat_unknown_scalar_tensor_as_unknown_shape) {
+    TF_RETURN_IF_ERROR(WithRank(tensor_shape, 1, &tensor_shape));
+  }
   if (t == nullptr) {
+    // This is guarded by the check above.
+    if (Rank(tensor_shape) == 0) {
+      return ReturnUnknownShape(out);
+    }
     // Shape tensor is not known, but if the shape of the shape tensor is then
     // the right number of unknown dims can be created.
     DimensionHandle shape_dim = Dim(tensor_shape, 0);
@@ -759,10 +795,46 @@ Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
     return ReturnCreatedShape(dims, out);
   }
 
+  if (t->shape().dims() == 0) {
+    if (t->dtype() == DataType::DT_INT32) {
+      auto flat_t = t->scalar<int32>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else if (t->dtype() == DataType::DT_INT64) {
+      auto flat_t = t->scalar<int64>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else {
+      *out = nullptr;
+      return errors::InvalidArgument(
+          "Input tensor must be int32 or int64, but was ",
+          DataTypeString(t->dtype()));
+    }
+  }
+
   if (t->shape().dims() != 1) {
     *out = nullptr;
-    return errors::InvalidArgument("Input tensor must be rank 1, but was rank ",
-                                   t->shape().dims());
+    return errors::InvalidArgument(
+        "Input tensor must be rank 1, but was rank ", t->shape().dims(), ".",
+        ((t->shape().dims() == 0)
+             ? "If it is rank 0 rank 0 it must have statically known value -1 "
+               "(representing an unknown shape). "
+             : " "),
+        "Saw tensor shape ", t->shape().DebugString());
   }
   std::vector<DimensionHandle> dims;
   if (t->dtype() == DataType::DT_INT32) {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index accc587000..cdb4bd79bb 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -463,6 +463,12 @@ class InferenceContext {
   // the input tensor is NULL, then an unknown shape is returned.
   Status MakeShapeFromShapeTensor(int input_idx, ShapeHandle* out);
 
+  // Like the function above, but treats scalar values as unknown
+  // shapes.  **NOTE** If the scalar is statically known, its value
+  // must be -1 or an error is returned.
+  Status MakeShapeFromShapeTensorTreatScalarAsUnknownShape(int input_idx,
+                                                           ShapeHandle* out);
+
   // Returns in <out> a new shape corresponding to <proto>.
   Status MakeShapeFromShapeProto(const TensorShapeProto& proto,
                                  ShapeHandle* out);
@@ -708,6 +714,11 @@ class InferenceContext {
     merged_dims_.clear();
   }
 
+  // Helper method for MakeShapeFromTensor and MakeShapeFromShapeTensor.
+  Status InternalMakeShapeFromTensor(
+      bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+      ShapeHandle tensor_shape, ShapeHandle* out);
+
   ShapeManager shape_manager_;
 
   // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index da103bfec9..586c38e43b 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -1081,17 +1081,26 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) {
   t = ::tensorflow::test::AsTensor<int64>({});
   EXPECT_EQ("[]", create(&t));
 
+  // Test negative scalar
+  t = ::tensorflow::test::AsScalar<int32>(-1);
+  EXPECT_EQ("?", create(&t));
+
   t = ::tensorflow::test::AsTensor<float>({1, 2, 3});
   EXPECT_TRUE(str_util::StrContains(
       create(&t), "Input tensor must be int32 or int64, but was float"));
 
   t = ::tensorflow::test::AsScalar<int32>(1);
+  auto s_scalar = create(&t);
   EXPECT_TRUE(str_util::StrContains(
-      create(&t), "Input tensor must be rank 1, but was rank 0"));
+      s_scalar,
+      "Input tensor must be rank 1, or if its rank 0 it must have value -1"))
+      << s_scalar;
 
   t = ::tensorflow::test::AsTensor<int32>({1, 2}, TensorShape{2, 1});
+  auto s_matrix = create(&t);
   EXPECT_TRUE(str_util::StrContains(
-      create(&t), "Input tensor must be rank 1, but was rank 2"));
+      s_matrix, "Input tensor must be rank 1, but was rank 2"))
+      << s_matrix;
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int64>({3, -2, 1});
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 783de6af88..b931f79b72 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1395,6 +1395,7 @@ tf_kernel_library(
     visibility = [":friends"],
     deps = [
         ":bounds_check",
+        ":dense_update_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc
index a878fe9a97..3ed3794e01 100644
--- a/tensorflow/core/kernels/dense_update_functor.cc
+++ b/tensorflow/core/kernels/dense_update_functor.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/dense_update_functor.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -70,4 +71,59 @@ struct DenseUpdate<CPUDevice, string, ASSIGN> {
 
 }  // namespace functor
 
+#define CPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+
+#define INSTANTIATE_GET_VARIANT_COPY_FN(DEVICE, TYPE_CALLER, TYPE_DENSE_COPY) \
+  template <>                                                                 \
+  Status VariantCopyFn<DEVICE>(OpKernelContext * context, const Tensor& from, \
+                               Tensor* to) {                                  \
+    PersistentTensor tmp;                                                     \
+    Tensor* tensor;                                                           \
+    AllocatorAttributes attr;                                                 \
+    attr.set_gpu_compatible(true);                                            \
+    attr.set_nic_compatible(true);                                            \
+    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
+        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
+    switch (from.dtype()) {                                                   \
+      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
+      default:                                                                \
+        return errors::InvalidArgument(                                       \
+            "VariantCopyFn: Could not perform a deep copy of variant "        \
+            "element of type: ",                                              \
+            DataTypeString(from.dtype()),                                     \
+            " using device: ", context->device()->name());                    \
+    }                                                                         \
+    *to = *tensor;                                                            \
+    return Status::OK();                                                      \
+  }
+
+INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
+
+#if GOOGLE_CUDA
+#define GPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
+  TF_CALL_GPU_ALL_TYPES(T);                 \
+  TF_CALL_int32(T);                         \
+  TF_CALL_int64(T);
+INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
+                                GPU_DENSE_COPY);
+#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
+#undef GPU_DENSE_COPY
+#endif  // GOOGLE_CUDA
+
+#undef CPU_DENSE_COPY
+#undef INSTANTIATE_GET_VARIANT_COPY_FN
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.h b/tensorflow/core/kernels/dense_update_functor.h
index 4aefe26c54..240c13261e 100644
--- a/tensorflow/core/kernels/dense_update_functor.h
+++ b/tensorflow/core/kernels/dense_update_functor.h
@@ -19,11 +19,14 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
@@ -89,6 +92,17 @@ struct DenseUpdate<SYCLDevice, T, ASSIGN> {
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
+
+template <typename Device>
+Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
+
+template <>
+Status VariantCopyFn<CPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+template <>
+Status VariantCopyFn<GPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 16ccb03b85..2c6e8bf3bc 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
@@ -50,7 +51,7 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   }
   // Compute slice_bytes here so that static knowledge is available
   const size_t slice_bytes = slice_elems * sizeof(T);
-  auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+  auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
   // Store the value of invalidate index for printing error information, it's a
   // shared variable.
@@ -162,6 +163,16 @@ struct GatherFunctor<CPUDevice, T, Index> {
   }
 };
 
+template <typename Index>
+struct GatherFunctor<GPUDevice, Variant, Index> {
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<Variant, 3>::ConstTensor params,
+                   typename TTypes<Index>::ConstFlat indices,
+                   typename TTypes<Variant, 3>::Tensor out) {
+    return GatherFunctorCPU<Variant, Index>()(ctx, params, indices, out);
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index f49a05c70a..72504200cc 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -279,64 +279,6 @@ class AssignVariableOp : public OpKernel {
   DataType dtype_;
 };
 
-template <typename Device>
-Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
-
-#define CPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-
-#define INSTANTIATE_GET_VARIANT_COPY_FN(Device, TYPE_CALLER, TYPE_DENSE_COPY) \
-  template <>                                                                 \
-  Status VariantCopyFn<Device>(OpKernelContext * context, const Tensor& from, \
-                               Tensor* to) {                                  \
-    PersistentTensor tmp;                                                     \
-    Tensor* tensor;                                                           \
-    AllocatorAttributes attr;                                                 \
-    attr.set_gpu_compatible(true);                                            \
-    attr.set_nic_compatible(true);                                            \
-    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
-        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
-    switch (from.dtype()) {                                                   \
-      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
-      default:                                                                \
-        return errors::InvalidArgument(                                       \
-            "VariantCopyFn: Could not perform a deep copy of variant "        \
-            "element of type: ",                                              \
-            DataTypeString(from.dtype()),                                     \
-            " using device: ", context->device()->name());                    \
-    }                                                                         \
-    *to = *tensor;                                                            \
-    return Status::OK();                                                      \
-  }
-
-INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
-
-#if GOOGLE_CUDA
-#define GPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
-  TF_CALL_GPU_ALL_TYPES(T);                 \
-  TF_CALL_int32(T);                         \
-  TF_CALL_int64(T);
-INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
-                                GPU_DENSE_COPY);
-#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
-#undef GPU_DENSE_COPY
-#endif  // GOOGLE_CUDA
-
-#undef CPU_DENSE_COPY
-#undef INSTANTIATE_GET_VARIANT_COPY_FN
-
 template <typename Device>
 class AssignVariableOp<Device, Variant> : public OpKernel {
  public:
@@ -370,9 +312,16 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
     // Note that Variant objects themselves always reside on host.
+    //
+    // We nevertheless want to signal to the runtime that the tensor
+    // should reside in memory of the associated device, as Variant
+    // tensors may be marked as sitting on either CPU or GPU.  This
+    // helps to elide one or more copies.
     std::unique_ptr<Tensor> input_alias = context->forward_input(
         1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
-        value.shape(), HOST_MEMORY, attr);
+        value.shape(),
+        std::is_same<Device, CPUDevice>::value ? HOST_MEMORY : DEVICE_MEMORY,
+        attr);
 
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
@@ -396,12 +345,8 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
 
     const auto elements_in = value.flat<Variant>();
     auto elements_out = variable->tensor()->flat<Variant>();
-    auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
-                             std::placeholders::_1, std::placeholders::_2);
     for (int64 i = 0; i < elements_in.size(); ++i) {
-      OP_REQUIRES_OK(context, VariantDeviceCopy(
-                                  VariantDeviceCopyDirection::DEVICE_TO_DEVICE,
-                                  elements_in(i), &elements_out(i), copy_fn));
+      elements_out(i) = elements_in(i);
     }
   }
 
@@ -560,7 +505,14 @@ class ResourceGatherOp : public OpKernel {
     }
 
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    Tensor tmp;
+    if (params.dtype() == DT_VARIANT) {
+      tmp = Tensor(DT_VARIANT, result_shape);
+      c->set_output(0, tmp);
+      out = &tmp;
+    } else {
+      OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    }
     if (N > 0) {
       const int64 gather_dim_size = params.dim_size(0);
       int64 inner_size = 1;
@@ -607,6 +559,23 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 
+// Variant objects themselves sit on CPU, even if they contain data
+// pointing to a device.
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int64>)
+
 #endif  // GOOGLE_CUDA
 
 #undef REGISTER_GATHER_CPU
@@ -721,6 +690,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
 
 REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
+REGISTER_SCATTER_KERNEL(Variant, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
@@ -733,6 +704,23 @@ REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
 
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int32,
+                                                scatter_op::UpdateOp::ASSIGN>)
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int64,
+                                                scatter_op::UpdateOp::ASSIGN>)
+
 #endif  // GOOGLE_CUDA
 
 #undef REGISTER_SCATTER_ARITHMETIC
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 52666645bf..ebaa2bd9c6 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -20,8 +20,11 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -203,9 +206,9 @@ struct ScatterFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -216,6 +219,42 @@ struct ScatterFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   typename TTypes<Variant>::ConstMatrix updates,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    DCHECK_EQ(N, updates.dimension(0));
+    DCHECK_EQ(cols, updates.dimension(1));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Copy last Ndim-1 dimensions of updates[i] to params[index]
+      for (int j = 0; j < cols; ++j) {
+        const Variant& to_scatter = updates(i, j);
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterFunctor<CPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<CPUDevice, Index> {};
+
+template <typename Index>
+struct ScatterFunctor<GPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
@@ -227,9 +266,9 @@ struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -252,9 +291,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
     const Index limit = static_cast<Index>(params.dimension(0));
     if (!std::is_same<T, string>::value) {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         memmove(params.data() + index * params.dimension(1),
@@ -263,9 +303,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
       }
     } else {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -321,9 +362,9 @@ struct ScatterScalarFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
@@ -334,6 +375,41 @@ struct ScatterScalarFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterScalarFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   const typename TTypes<Variant>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    const Variant& to_scatter = update();
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      for (Index j = 0; j < cols; ++j) {
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterScalarFunctor<CPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<CPUDevice, Index> {};
+template <typename Index>
+struct ScatterScalarFunctor<GPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
@@ -345,9 +421,9 @@ struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
@@ -370,9 +446,9 @@ struct ScatterScalarFunctorBase<CPUDevice, T, Index,
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index f6e2a5ae25..857daae177 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
@@ -40,14 +41,27 @@ Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
     // updating.
     PersistentTensor unused;
     Tensor* tmp;
-    AllocatorAttributes attr;
-    attr.set_gpu_compatible(true);
-    attr.set_nic_compatible(true);
-    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-        tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
-    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
-    copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
-                 const_cast<const Tensor*>(tensor)->flat<T>());
+    if (std::is_same<T, Variant>::value) {
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+
+      const auto elements_in = tensor->flat<Variant>();
+      auto elements_out = tmp->flat<Variant>();
+      for (int64 i = 0; i < elements_in.size(); ++i) {
+        elements_out(i) = elements_in(i);
+      }
+    } else {
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+      functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+      copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
+                   const_cast<const Tensor*>(tensor)->flat<T>());
+    }
     *tensor = *tmp;
   }
   return Status::OK();
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index cad617638f..c151055ee6 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -30,7 +30,8 @@ REGISTER_OP("EmptyTensorList")
       DataType t;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
       shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(
+          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
       c->set_output_handle_shapes_and_types(
           0, std::vector<shape_inference::ShapeAndType>{{s, t}});
       return Status::OK();
@@ -193,6 +194,7 @@ REGISTER_OP("TensorListReserve")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
       shape_inference::ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       DataType t;
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 64b0fa6c00..8cf24206ed 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -822,17 +822,32 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   all-or-nothing.
 
   Args:
-    tensor: The rank-1 Tensor to be evaluated.
+    tensor: The rank-0 or rank-1 Tensor to be evaluated.
 
   Returns:
     A `TensorShape` based on the constant value of the given `tensor`.
+
+  Raises:
+    ValueError: If the shape is rank-0 and is not statically known to be -1.
   """
   if isinstance(tensor, ops.EagerTensor):
     return tensor_shape.as_shape(
         [dim if dim != -1 else None for dim in tensor.numpy()])
 
+  if tensor.get_shape().ndims == 0:
+    value = constant_value(tensor)
+    if value is None:
+      raise ValueError(
+          "Received a scalar with unknown value as shape; require a statically "
+          "known scalar with value '-1' to describe an unknown shape.")
+    if value != -1:
+      raise ValueError(
+          "Received a scalar value '%s' as shape; require a statically known "
+          "scalar with value '-1' to describe an unknown shape." % value)
+    return tensor_shape.unknown_shape()
+
   shape = tensor.get_shape().with_rank(1)
-  if tensor.get_shape() == [0]:
+  if shape == [0]:
     return tensor_shape.scalar()
   elif tensor.op.type == "Shape":
     return tensor.op.inputs[0].get_shape()
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index dbbed39c72..d969f0e03a 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -31,8 +31,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -43,71 +46,83 @@ def scalar_shape():
 
 class ListOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPop(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testPushPop()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStack(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [1.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testStack()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 2.0)
+    self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
-    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
+    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFromTensorGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
-    self.assertAllEqual(e0, 1.0)
+    self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [3.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [3.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testGetSetItem()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUnknownShape(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=-1)
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=-1)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
-    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testCPUGPUCopy(self):
     if not context.num_gpus():
       return
@@ -116,15 +131,16 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
-          list_ops.tensor_list_pop_back(
-              l_gpu, element_dtype=dtypes.float32)[1],
-          2.0)
+          self.evaluate(
+              list_ops.tensor_list_pop_back(
+                  l_gpu, element_dtype=dtypes.float32)[1]), 2.0)
     l_cpu = array_ops.identity(l_gpu)
     self.assertAllEqual(
-        list_ops.tensor_list_pop_back(
-            l_cpu, element_dtype=dtypes.float32)[1],
-        2.0)
+        self.evaluate(
+            list_ops.tensor_list_pop_back(
+                l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStack(self):
     with context.graph_mode(), self.test_session():
       tl = list_ops.empty_tensor_list(
@@ -132,9 +148,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
           element_dtype=dtypes.int32)
       tl = list_ops.tensor_list_push_back(tl, [1])
       self.assertAllEqual(
-          list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32).eval(),
+          self.evaluate(
+              list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)),
           [[1]])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackInLoop(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
@@ -149,9 +167,10 @@ class ListOpsTest(test_util.TensorFlowTestCase):
 
       i, t1 = control_flow_ops.while_loop(lambda i, t1: math_ops.less(i, 4),
                                           body, [i, t1])
-      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32).eval()
-      self.assertAllEqual(s1, [0, 1, 2, 3])
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32)
+      self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       list_ = list_ops.empty_tensor_list(
@@ -169,11 +188,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       for _ in range(2):
         list_, m = body(list_, m)
 
-      s1 = list_ops.tensor_list_stack(
-          list_, element_dtype=dtypes.float32).eval()
+      s1 = list_ops.tensor_list_stack(list_, element_dtype=dtypes.float32)
       np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
-      self.assertAllEqual(s1, np_s1)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackInLoopSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
@@ -193,10 +212,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
 
       i, m, t1 = control_flow_ops.while_loop(
           lambda i, m, t1: math_ops.less(i, 4), body, [i, m, t1])
-      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32).eval()
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32)
       np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)])
-      self.assertAllEqual(s1, np_s1)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSerialize(self):
     # pylint: disable=g-import-not-at-top
     try:
@@ -226,8 +246,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
               l_ps, element_dtype=dtypes.float32)
         with ops.device("/job:worker"):
           worker_e = array_ops.identity(e)
-        self.assertAllEqual(worker_e.eval(), [2.0])
+        self.assertAllEqual(self.evaluate(worker_e), [2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
       l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
@@ -237,18 +258,24 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       l = list_ops.tensor_list_push_back(l, c)
       l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       e = 2 * e
-    self.assertAllEqual(tape.gradient(e, [c])[0], 2.0)
+    self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackFromTensorGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
       l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-      c2 = list_ops.tensor_list_stack(
-          l, element_dtype=dtypes.float32)
+      c2 = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       result = c2 * 2.0
-    self.assertAllEqual(tape.gradient(result, [c])[0], [2.0, 2.0])
-
+    if context.in_eager_mode():
+      # TODO(b/77609620): Fix this in graph mode.
+      grad = tape.gradient(result, [c])[0]
+    else:
+      grad = gradients.gradients(result, [c])[0]
+    self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
@@ -261,16 +288,40 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       ee = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       y = e * e + ee * ee
     grad_c, grad_c2 = tape.gradient(y, [c, c2])
-    self.assertAllEqual(grad_c, [0.0, 4.0])
-    self.assertAllEqual(grad_c2, 6.0)
+    self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
+    self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
     with self.assertRaises(errors.InvalidArgumentError):
-      list_ops.tensor_list_set_item(l, 20, 3.0)
+      self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testResourceVariableScatterGather(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
+    v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
+    self.evaluate(v.initializer)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_0_stacked))
+    v_r_sparse_stacked = list_ops.tensor_list_stack(
+        v.sparse_read(0), dtypes.float32)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
+    l_new_0 = list_ops.tensor_list_from_tensor(
+        [3.0, 4.0], element_shape=scalar_shape())
+    l_new_1 = list_ops.tensor_list_from_tensor(
+        [5.0, 6.0], element_shape=scalar_shape())
+    updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
+    updated_v_elems = array_ops.unstack(updated_v)
+    updated_v_stacked = [
+        list_ops.tensor_list_stack(el, dtypes.float32) for el in updated_v_elems
+    ]
+    expected = ([[1.0, 2.0]] * 3 + [[3.0, 4.0], [1.0, 2.0], [5.0, 6.0]] +
+                [[1.0, 2.0]] * 4)
+    self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
 
 if __name__ == "__main__":
-  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index bba59ebcef..bdf0774bbf 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -54,8 +54,8 @@ def _TensorListStackGrad(unused_op, dtensor):
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape[0] is not None:
-    num_elements = op.inputs[0].shape[0]
+  if op.inputs[0].shape[0].value is not None:
+    num_elements = op.inputs[0].shape[0].value
   else:
     num_elements = None
   if dlist is None:
@@ -63,9 +63,10 @@ def _TensorListFromTensorGrad(op, dlist):
         element_dtype=op.inputs[0].dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype,
-      num_elements=num_elements)
+  tensor_grad = gen_list_ops.tensor_list_stack(
+      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+  shape_grad = None
+  return tensor_grad, shape_grad
 
 
 @ops.RegisterGradient("TensorListGetItem")
-- 
GitLab


From 0b9eedd684b4085ab65d60627efa8594a92a0b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 7 Apr 2018 11:47:03 +0800
Subject: [PATCH 0419/1262] TST: add test case for duplicate indices

---
 .../kernel_tests/scatter_nd_ops_test.py       | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 03b2f892c6..dfe9600dbb 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -366,13 +366,35 @@ class ScatterNdTest(test.TestCase):
 
   def testString(self):
     indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
-    updates = constant_op.constant(["four", "three", "one", "seven"], dtype=dtypes.string)
+    updates = constant_op.constant(["four", "three", "one", "seven"],
+                                   dtype=dtypes.string)
     expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
 
+    # Same indice is updated twice by same value.
+    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "b", "c"],
+                                   dtype=dtypes.string)
+    expected = np.array(["", "", "", "bb", "a", "", "", "c"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
+
+    # Same indice is updated twice by different value.
+    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "c", "d"],
+                                   dtype=dtypes.string)
+    expected = [np.array(["", "", "", "bc", "a", "", "", "d"]),
+                np.array(["", "", "", "cb", "a", "", "", "d"])]
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
-      self.assertTrue(np.array_equal(result, expected))
+      self.assertTrue(np.array_equal(result, expected[0]) or
+                      np.array_equal(result, expected[1]))
 
   def testRank3ValidShape(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
-- 
GitLab


From 9e1bbbc0fb770f077d9de295b53181e3592f1d24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 7 Apr 2018 12:07:11 +0800
Subject: [PATCH 0420/1262] DOC: remove the misleading 'empty tensor'

---
 tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4e95895f54..58753a651a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,7 +25,7 @@ A new tensor with the given shape and updates applied according
 to the indices.
 END
   }
-  summary: "Scatter `updates` into a new empty tensor according to `indices`."
+  summary: "Scatter `updates` into a new tensor according to `indices`."
   description: <<END
 Creates a new tensor by applying sparse `updates` to individual values or
 slices within a tensor (initially zero for numeric, empty for string) of
-- 
GitLab


From 30e2b97897d05e47b457ab1d5d0d9c4227b87845 Mon Sep 17 00:00:00 2001
From: Rob Sloan <varomodt@google.com>
Date: Fri, 6 Apr 2018 21:55:10 -0700
Subject: [PATCH 0421/1262] Add analytical cost model for
 FusedConv2DBiasActivation.

PiperOrigin-RevId: 191978272
---
 .../grappler/costs/op_level_cost_estimator.cc | 165 +++++++++++++++++-
 .../grappler/costs/op_level_cost_estimator.h  |  26 +++
 .../costs/op_level_cost_estimator_test.cc     |  64 ++++++-
 3 files changed, 249 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 79735e6cc2..087190ad2a 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -30,6 +30,7 @@ constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
+constexpr char kFusedConv2dBiasActivation[] = "FusedConv2DBiasActivation";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
@@ -196,6 +197,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
       {kConv2dBackpropInput,
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
+      {kFusedConv2dBiasActivation,
+       wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
@@ -545,7 +548,6 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   ops *= conv_dims.kx * conv_dims.ky;
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for Conv2D " << ops;
 
   if (conv_info != nullptr) {
     *conv_info = conv_dims;
@@ -983,6 +985,91 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
+    const OpContext& op_context) const {
+  // FusedConv2DBiasActivation computes a fused kernel which implements:
+  // 2D convolution, adds side input with separate scaling on convolution and
+  // side inputs, then adds bias, and finally applies the ReLU activation
+  // function to the result:
+  //
+  // Input -> Conv2D  ->  Add  -> BiasAdd  -> ReLU
+  //            ^          ^         ^
+  //          Filter   Side Input   Bias
+  //
+  // Note that when adding the side input, the operation multiplies the output
+  // of Conv2D by conv_input_scale, confusingly, and the side_input by
+  // side_input_scale.
+  //
+  // Note that in the special case that side_input_scale is 0, which we infer
+  // from side_input having dimensions [], we skip that addition operation.
+  //
+  // For more information, see
+  // contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+  auto& conv_input = op_context.op_info.inputs(0);
+  auto& filter = op_context.op_info.inputs(1);
+  auto& bias = op_context.op_info.inputs(2);
+  auto& side_input = op_context.op_info.inputs(3);
+  auto& conv_input_scale = op_context.op_info.inputs(4);
+  auto& side_input_scale = op_context.op_info.inputs(5);
+
+  // Manually compute our convolution dimensions.
+  bool found_unknown_shapes = false;
+  auto dims = ConvolutionDimensionsFromInputs(
+      conv_input.shape(), filter.shape(), op_context.op_info,
+      &found_unknown_shapes);
+
+  // Construct the shape of our output tensor from our convolution dimensions
+  // and format, as it may not be available yet.
+  //
+  // TODO(varomodt): should we centralize the Conv2D input/output shapes?
+  bool unknown_conv_format = false;
+  OpInfo::TensorProperties output;
+  switch (GetConvolutionFormat(op_context)) {
+    case NCHW:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.oz, dims.ox, dims.oy});
+      break;
+    case NHWC:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      break;
+    default:
+      // TODO(b/77722245): support cost estimation for NCHW_VECT_C.
+      LOG(WARNING) << "unsupported data format: "
+                   << GetDataFormat(op_context.op_info)
+                   << " Defaulting to NHWC.";
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      unknown_conv_format = true;
+      break;
+  }
+
+  // Add the operations the fused op always computes.
+  std::vector<OpContext> component_ops = {
+      FusedChildContext(op_context, "Conv2D", output, {conv_input, filter}),
+      FusedChildContext(op_context, "Mul", output, {output, conv_input_scale}),
+      FusedChildContext(op_context, "BiasAdd", output, {output, bias}),
+      FusedChildContext(op_context, "Relu", output, {output})};
+
+  // Add our side_input iff it's non-empty.
+  if (side_input.shape().dim_size() > 0) {
+    component_ops.push_back(FusedChildContext(op_context, "Mul", side_input,
+                                              {side_input, side_input_scale}));
+    component_ops.push_back(
+        FusedChildContext(op_context, "Add", output, {side_input, output}));
+  }
+
+  // Construct an op_context which definitely has our output shape.
+  auto op_context_with_output = op_context;
+  op_context_with_output.op_info.mutable_outputs()->Clear();
+  *op_context_with_output.op_info.mutable_outputs()->Add() = output;
+
+  // Construct component operations and run the cost computation.
+  auto costs = PredictFusedOp(op_context_with_output, component_ops);
+  costs.inaccurate |= found_unknown_shapes || unknown_conv_format;
+  return costs;
+}
+
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
   bool found_unknown_shapes = false;
@@ -1086,6 +1173,66 @@ Costs OpLevelCostEstimator::PredictGatherOrSlice(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedOp(
+    const OpContext& op_context,
+    const std::vector<OpContext>& fused_op_contexts) const {
+  // Note that PredictOpCountBasedCost will get the correct memory_time from
+  // the node's inputs and outputs; but we don't want to have to re-implement
+  // the logic for computing the operation count of each of our component
+  // operations here; so we simply add the compute times of each component
+  // operation, then update the execution time.
+  Costs fused_cost = PredictOpCountBasedCost(0, op_context.op_info);
+  fused_cost.compute_time = 0;
+  fused_cost.inaccurate = false;
+  for (auto& fused_op : fused_op_contexts) {
+    auto op_cost = PredictCosts(fused_op);
+    fused_cost.compute_time += op_cost.compute_time;
+    fused_cost.inaccurate |= op_cost.inaccurate;
+  }
+
+  CombineCostsAndUpdateExecutionTime(&fused_cost);
+  return fused_cost;
+}
+
+/* static */
+OpContext OpLevelCostEstimator::FusedChildContext(
+    const OpContext& parent, const string& op_name,
+    const OpInfo::TensorProperties& output,
+    const std::vector<OpInfo::TensorProperties>& inputs) {
+  // Setup the base parameters of our new context.
+  OpContext new_context;
+  new_context.name = op_name;
+  new_context.device_name = parent.device_name;
+  new_context.op_info = parent.op_info;
+  new_context.op_info.set_op(op_name);
+
+  // Setup the inputs of our new context.
+  new_context.op_info.mutable_inputs()->Clear();
+  for (const auto& input : inputs) {
+    *new_context.op_info.mutable_inputs()->Add() = input;
+  }
+
+  // Setup the output of our new context.
+  new_context.op_info.mutable_outputs()->Clear();
+  *new_context.op_info.mutable_outputs()->Add() = output;
+
+  return new_context;
+}
+
+/* static */
+OpInfo::TensorProperties OpLevelCostEstimator::DescribeTensor(
+    DataType type, const std::vector<int64>& dims) {
+  OpInfo::TensorProperties ret;
+  ret.set_dtype(type);
+
+  auto shape = ret.mutable_shape();
+  for (const int dim : dims) {
+    shape->add_dim()->set_size(dim);
+  }
+
+  return ret;
+}
+
 /* static */
 OpLevelCostEstimator::ConvolutionDimensions
 OpLevelCostEstimator::OpDimensionsFromInputs(
@@ -1371,6 +1518,21 @@ Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
   return costs;
 }
 
+/* static */
+OpLevelCostEstimator::ConvolutionFormat
+OpLevelCostEstimator::GetConvolutionFormat(const OpContext& op_context) {
+  auto data_format = GetDataFormat(op_context.op_info);
+  if (data_format == "NCHW") {
+    return NCHW;
+  } else if (data_format == "NHWC") {
+    return NHWC;
+  } else if (data_format == "NCHW_VECT_C") {
+    return NCHW_VECT_C;
+  }
+
+  return UNKNOWN_CONVOLUTION_FORMAT;
+}
+
 void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
     Costs* costs) const {
   if (compute_memory_overlap_) {
@@ -1379,6 +1541,5 @@ void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
     costs->execution_time = costs->compute_time + costs->memory_time;
   }
 }
-
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 7080264698..35649f7ee9 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -82,6 +82,13 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
+  enum ConvolutionFormat {
+    UNKNOWN_CONVOLUTION_FORMAT,
+    NHWC,
+    NCHW,
+    NCHW_VECT_C,
+    NCHW_VECT_W,
+  };
   int64 CountConv2DOperations(const OpInfo& op_features,
                               bool* found_unknown_shapes) const;
   int64 CountConv2DOperations(const OpInfo& op_features,
@@ -138,6 +145,7 @@ class OpLevelCostEstimator {
   Costs PredictCwiseOp(const OpContext& op_context) const;
   Costs PredictConv2DBackpropInput(const OpContext& op_context) const;
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
+  Costs PredictFusedConv2DBiasActivation(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
   Costs PredictIdentity(const OpContext& op_context) const;
@@ -152,6 +160,10 @@ class OpLevelCostEstimator {
   Costs PredictFusedBatchNorm(const OpContext& op_context) const;
   Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
 
+  // Generic cost prediction method for fused operations.
+  Costs PredictFusedOp(const OpContext& op_context,
+                       const std::vector<OpContext>& fused_op_contexts) const;
+
   // Utility function for safe division. Returns 0
   // if rhs is 0 or negative.
   static double SafeDiv(const double lhs, const double rhs) {
@@ -173,6 +185,20 @@ class OpLevelCostEstimator {
       const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
+  // Helper to construct child operation contexts for the component operations
+  // of fused ops.
+  static OpContext FusedChildContext(
+      const OpContext& parent, const string& op_name,
+      const OpInfo::TensorProperties& output,
+      const std::vector<OpInfo::TensorProperties>& inputs);
+
+  // Helper to construct tensor shapes.
+  static OpInfo::TensorProperties DescribeTensor(
+      DataType type, const std::vector<int64>& dims);
+
+  // Returns the Conv2D format for this operation.
+  static ConvolutionFormat GetConvolutionFormat(const OpContext& op_context);
+
   // This method calculates the execution time depending on whether IO can
   // overlap with computation. It assumes the memory and the compute times have
   // already been calculated.
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index d797a8a8c1..13ea43bed6 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -93,6 +93,14 @@ OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
   return op_context;
 }
 
+// Wrangles the minimum number of proto fields to set up a 1D Tensor for cost
+// estimation purposes.
+void DescribeTensor1D(int dim0, OpInfo::TensorProperties* tensor) {
+  auto shape = tensor->mutable_shape();
+  shape->add_dim()->set_size(dim0);
+  tensor->set_dtype(DT_FLOAT);
+}
+
 // Wrangles the minimum number of proto fields to set up a 4D Tensor for cost
 // estimation purposes.
 void DescribeTensor4D(int dim0, int dim1, int dim2, int dim3,
@@ -120,6 +128,38 @@ OpContext DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2,
   return op_context;
 }
 
+// DescribeFusedConv2DBiasActivation constructs an OpContext for a
+// FusedConv2DBiasActivation applied to a convolution input tensor with shape
+// (batch, ix, iy, iz1), a kernel tensor with shape (kx, ky, iz2, oz), a
+// bias tensor with shape (oz), a side input tensor with shape
+// (batch, ox, oy, oz) if has_side_input is set, and two scaling tensors with
+// shape (1).
+//
+// Note that this assumes the NHWC data format.
+OpContext DescribeFusedConv2DBiasActivation(int batch, int ix, int iy, int iz1,
+                                            int iz2, int kx, int ky, int ox,
+                                            int oy, int oz,
+                                            bool has_side_input) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("FusedConv2DBiasActivation");
+  DescribeTensor4D(batch, ix, iy, iz1, op_context.op_info.add_inputs());
+  DescribeTensor4D(kx, ky, iz2, oz, op_context.op_info.add_inputs());
+  DescribeTensor1D(oz, op_context.op_info.add_inputs());
+
+  // Add the side_input, if any.
+  auto side_input = op_context.op_info.add_inputs();
+  if (has_side_input) {
+    DescribeTensor4D(batch, ox, oy, oz, side_input);
+  }
+
+  // Add the scaling tensors.
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+
+  return op_context;
+}
+
 // DescribeUnaryOp constructs an OpContext for the given operation applied to
 // a 4-tensor with shape (size1, 1, 1, 1).
 OpContext DescribeUnaryOp(const string& op, int size1) {
@@ -162,12 +202,9 @@ OpContext DescribeBiasAdd(int size1, int size2) {
   op_context.op_info.set_op("BiasAdd");
 
   DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_inputs());
+  DescribeTensor1D(size1, op_context.op_info.add_inputs());
   DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_outputs());
 
-  auto bias = op_context.op_info.add_inputs();
-  bias->mutable_shape()->add_dim()->set_size(size1);
-  bias->set_dtype(DT_FLOAT);
-
   return op_context;
 }
 
@@ -486,6 +523,25 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(false);  // Set it back to default.
 }
 
+TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true));
+  EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest,
+       FusedConv2DBiasActivationNoSideInputExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ false));
+  EXPECT_EQ(Costs::Duration(825345), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355321038), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(356146383), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
   auto cost = PredictCosts(DescribeBinaryOp("Mul", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-- 
GitLab


From 994fef1b4e702fb6cf178cff8a30cd75794c6451 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 7 Apr 2018 02:03:00 -0700
Subject: [PATCH 0422/1262] Remove 'Print' in DebugStripper.

PiperOrigin-RevId: 191989327
---
 tensorflow/core/grappler/op_types.cc          |  2 ++
 tensorflow/core/grappler/op_types.h           |  1 +
 .../grappler/optimizers/debug_stripper.cc     | 17 +++++++--
 .../optimizers/debug_stripper_test.cc         | 36 +++++++++++++++++++
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index a24d2dbd9f..1fb1711f54 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -245,6 +245,8 @@ bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
 
 bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
 
+bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
+
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 8667f72c7e..d516baebf3 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -95,6 +95,7 @@ bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
 bool IsPolygamma(const NodeDef& node);
+bool IsPrint(const NodeDef& node);
 bool IsProd(const NodeDef& node);
 bool IsPow(const NodeDef& node);
 bool IsReal(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc
index 8bd10171f1..9701a038d0 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -40,10 +41,22 @@ Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item,
           inp = AsControlDependency(inp);
         }
       }
-    } else if (IsCheckNumerics(node)) {
+    } else if (IsCheckNumerics(node) || IsPrint(node)) {
       // Replace with Identity op which will be pruned later.
       node.set_op("Identity");
-      node.mutable_attr()->erase("message");
+      // Only preserve T attribute.
+      protobuf::Map<string, AttrValue> new_attr;
+      if (node.attr().find("T") != node.attr().end()) {
+        new_attr.insert({"T", node.attr().at("T")});
+      }
+      node.mutable_attr()->swap(new_attr);
+      // As Identity op only takes one input, mark redundant inputs as control
+      // input.
+      for (size_t i = 1; i < node.input_size(); ++i) {
+        if (!IsControlInput(node.input(i))) {
+          *node.mutable_input(i) = AsControlDependency(node.input(i));
+        }
+      }
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
index 3f11febc64..96ceee791f 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -164,6 +164,42 @@ TEST_F(DebugStripperTest, StripCheckNumericsFromGraph) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
+TEST_F(DebugStripperTest, StripPrintFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output print = ops::Print(s.WithOpName("Print"), x, {x});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Print") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ(1, node.attr_size());
+    }
+  }
+
+  EXPECT_EQ(2, output.node_size());
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"Print"}, {{"x", x_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"Print"}, {{"x", x_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From 9dac3ba23fad14c6be2482eaae5ea4f2d34c9893 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sat, 7 Apr 2018 22:42:10 +0900
Subject: [PATCH 0423/1262] move dependency

---
 tensorflow/python/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9dad747ac0..7d40c133c4 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1970,6 +1970,7 @@ py_library(
         ":array_ops",
         ":control_flow_ops",
         ":framework_for_generated_wrappers",
+        ":functional_ops",
         ":linalg_ops",
         ":math_ops",
         "//tensorflow/python/ops/linalg:linalg_impl",
@@ -1984,7 +1985,6 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
-        ":functional_ops",
         ":linalg_ops_gen",
         ":math_ops",
         "//third_party/py/numpy",
-- 
GitLab


From e7ea87f97e03360719d132a71acc1eb2f93c249f Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Sat, 7 Apr 2018 10:15:58 -0700
Subject: [PATCH 0424/1262] Automated g4 rollback of changelist 191938267

PiperOrigin-RevId: 192007784
---
 .../internal/optimized/optimized_ops.h        |  28 ++--
 .../internal/reference/reference_ops.h        |  13 +-
 tensorflow/contrib/lite/kernels/pad.cc        |  27 ++--
 tensorflow/contrib/lite/kernels/pad_test.cc   | 129 +++---------------
 4 files changed, 39 insertions(+), 158 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 7a383fba18..9a274612ad 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5067,7 +5067,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims, const int32_t pad_value) {
+                const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Pad");
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
@@ -5087,27 +5087,27 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, pad_value,
+    memset(output_data, 0,
            left_b_padding * output_height * output_width * output_depth *
                sizeof(T));
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
              left_h_padding * output_width * output_depth * sizeof(T));
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
                left_w_padding * output_depth * sizeof(T));
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
-                 pad_value, left_d_padding * sizeof(T));
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
+                 left_d_padding * sizeof(T));
         }
 
         T* out = output_data +
@@ -5121,21 +5121,20 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
           memset(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              pad_value, right_d_padding * sizeof(T));
+              0, right_d_padding * sizeof(T));
         }
       }
       if (right_w_padding != 0) {
         memset(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            pad_value, right_w_padding * output_depth * sizeof(T));
+            0, right_w_padding * output_depth * sizeof(T));
       }
     }
     if (right_h_padding != 0) {
       memset(output_data + Offset(output_dims, 0, 0,
                                   output_height - right_h_padding, out_b),
-             pad_value,
-             right_h_padding * output_width * output_depth * sizeof(T));
+             0, right_h_padding * output_width * output_depth * sizeof(T));
     }
   }
   if (right_b_padding != 0) {
@@ -5147,15 +5146,6 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-inline void Pad(const T* input_data, const Dims<4>& input_dims,
-                const std::vector<int>& left_paddings,
-                const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
-  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
-      output_dims, 0);
-}
-
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 3245bf615e..31e190e248 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3086,7 +3086,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims, const int32_t pad_value) {
+                const Dims<4>& output_dims) {
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -3116,7 +3116,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = static_cast<T>(pad_value);
+            *out_ptr++ = 0;
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -3126,15 +3126,6 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-inline void Pad(const T* input_data, const Dims<4>& input_dims,
-                const std::vector<int>& left_paddings,
-                const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
-  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
-      output_dims, 0);
-}
-
 inline bool LoopCondition(int index, int stop, int stride) {
   return stride > 0 ? index < stop : index > stop;
 }
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 4f9449a225..c29da3862e 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -119,46 +119,39 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar, pad_value)                                \
+#define TF_LITE_PAD(type, scalar)                                           \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
             GetTensorDims(op_context.input), before_padding, after_padding, \
             GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output), pad_value)
+            GetTensorDims(op_context.output))
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float, 0);
+        TF_LITE_PAD(reference_ops, float);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float, 0);
+        TF_LITE_PAD(optimized_ops, float);
       }
       break;
     case kTfLiteUInt8:
-      // Quantized Pad requires that 0 is represented in the quantized range.
-      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
-                                  std::numeric_limits<uint8_t>::min());
-      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
-                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t,
-                    op_context.output->params.zero_point);
+        TF_LITE_PAD(reference_ops, uint8_t);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t,
-                    op_context.output->params.zero_point);
+        TF_LITE_PAD(optimized_ops, uint8_t);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t, 0);
+        TF_LITE_PAD(reference_ops, int32_t);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t, 0);
+        TF_LITE_PAD(optimized_ops, int32_t);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t, 0);
+        TF_LITE_PAD(reference_ops, int64_t);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t, 0);
+        TF_LITE_PAD(optimized_ops, int64_t);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index c06237e572..28834ad071 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -22,7 +22,6 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
-using ::testing::Matcher;
 
 class PadOpModel : public SingleOpModel {
  public:
@@ -30,10 +29,6 @@ class PadOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
-  void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
-  }
-
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
@@ -41,11 +36,6 @@ class PadOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
-  std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
-  }
-
  protected:
   int input_;
   int output_;
@@ -60,17 +50,16 @@ class PadOpModel : public SingleOpModel {
 //    m.Invoke();
 class PadOpConstModel : public PadOpModel {
  public:
-  PadOpConstModel(const TensorData& input,
+  PadOpConstModel(std::initializer_list<int> input_shape,
                   std::initializer_list<int> paddings_shape,
-                  std::initializer_list<int> paddings,
-                  const TensorData& output) {
-    input_ = AddInput(input);
+                  std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
-    output_ = AddOutput(output);
+    output_ = AddOutput(TensorType_FLOAT32);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input.shape});
+    BuildInterpreter({input_shape});
   }
 };
 
@@ -83,45 +72,40 @@ class PadOpConstModel : public PadOpModel {
 //    m.Invoke();
 class PadOpDynamicModel : public PadOpModel {
  public:
-  PadOpDynamicModel(const TensorData& input,
-                    std::initializer_list<int> paddings_shape,
-                    const TensorData& output) {
-    input_ = AddInput(input);
+  PadOpDynamicModel(std::initializer_list<int> input_shape,
+                    std::initializer_list<int> paddings_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(output);
+    output_ = AddOutput(TensorType_FLOAT32);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input.shape, paddings_shape});
+    BuildInterpreter({input_shape, paddings_shape});
   }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
-                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9},
-                      {TensorType_FLOAT32}),
+      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
       "dims != 4");
 }
 
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
-                               {1, 1, 2, 2, 3, 3}, {TensorType_FLOAT32}),
+  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
                "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
   EXPECT_DEATH(
-      PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
-                      {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
+      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
       "Pad value has to be greater than equal to 0.");
 }
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
-                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
+  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -130,8 +114,7 @@ TEST(PadOpTest, SimpleConstTest) {
 }
 
 TEST(PadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
-                      {TensorType_FLOAT32});
+  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -141,8 +124,7 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
-                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
+  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -152,8 +134,7 @@ TEST(PadOpTest, AdvancedConstTest) {
 }
 
 TEST(PadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
-                      {TensorType_FLOAT32});
+  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -163,80 +144,6 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
-class QuantizedPadOpTest : public ::testing::Test {
- protected:
-  std::vector<Matcher<float>> DequantizedArrayNear(
-      const std::vector<float>& values, const float min, const float max) {
-    const float quantization_tolerance = (max - min) / 255.0;
-    return ArrayFloatNear(values, quantization_tolerance);
-  }
-};
-
-TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
-  // The test_util and actual quantization code currently ensure that the range
-  // must include zero, but if that ever changes, this test will catch it.
-  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
-                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                                 {TensorType_UINT8, {}, 1.0, 2.0}),
-               ".*Check failed: f_min <= 0.*");
-}
-
-TEST_F(QuantizedPadOpTest, SimpleConstTest) {
-  // Padding is represented as four 2-D lists representing above padding and
-  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
-                    {0, 0, 1, 1, 1, 1, 0, 0},
-                    {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
-  m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(DequantizedArrayNear(
-                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
-                  -1.0, 1.0)));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-}
-
-TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
-                      {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
-  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
-  m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(DequantizedArrayNear(
-                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
-                  -1.0, 1.0)));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-}
-
-TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
-                    {0, 0, 0, 2, 1, 3, 0, 0},
-                    {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
-  m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(DequantizedArrayNear(
-                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
-                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
-                  -1.0, 1.0)));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
-}
-
-TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
-                      {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
-  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
-  m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(DequantizedArrayNear(
-                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
-                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
-                  -1.0, 1.0)));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
-}
-
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 1cd76c209ce6f74298843568a7fc397c2e6f958f Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sat, 7 Apr 2018 11:42:43 -0700
Subject: [PATCH 0425/1262] [XLA:GPU] Eliminate the guard around Winograd
 non-fused convolutions with cudnn7.

Adds DnnSupport::GetVersion() and uses this to unguard Winograd
non-fused convolutions if you're using cudnn7.

PiperOrigin-RevId: 192010450
---
 .../gpu/cudnn_convolution_algorithm_picker.cc | 30 +++++++++++--------
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  7 +++++
 tensorflow/stream_executor/cuda/cuda_dnn.h    |  1 +
 tensorflow/stream_executor/dnn.h              |  7 +++++
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1792893ae4..d6b457a91b 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -94,11 +94,17 @@ se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
-//
-// TODO(jlebar): We shouldn't need this check for cuDNNv7.
-bool ShouldIncludeWinogradNonfusedAlgo(
-    const Shape& input_shape, const Shape& output_shape,
-    const ConvolutionDimensionNumbers& dnums) {
+bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
+                                       const Shape& output_shape,
+                                       const ConvolutionDimensionNumbers& dnums,
+                                       se::StreamExecutor* stream_exec) {
+  // Skip this check for cudnn7 and newer.
+  se::port::StatusOr<std::tuple<int, int, int>> version =
+      stream_exec->AsDnn()->GetVersion();
+  if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+    return true;
+  }
+
   int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
   int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
   int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
@@ -118,20 +124,20 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          bool with_winograd_nonfused,
-                                         se::StreamExecutor* stream_exec_) {
+                                         se::StreamExecutor* stream_exec) {
   std::vector<AlgorithmDesc> algorithms;
   switch (kind) {
     case CudnnConvKind::kBackwardFilter:
-      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kBackwardInput:
-      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kForward:
-      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
-                                                &algorithms));
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
+                                               &algorithms));
       break;
   }
 
@@ -209,8 +215,8 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     return nullopt;
   }
 
-  const bool use_winograd_nonfused =
-      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
+      input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 3fd9275289..fa5b90c945 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -478,6 +478,13 @@ port::Status CudnnSupport::Init() {
                                    ToString(status))};
 }
 
+port::StatusOr<std::tuple<int, int, int>> CudnnSupport::GetVersion() {
+  CudnnVersion version;
+  TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
+  return std::make_tuple(version.major_version, version.minor_version,
+                         version.patch_level);
+}
+
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
 class ScopedTensorDescriptor {
  public:
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index e40ba9b012..0e5368aca8 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -46,6 +46,7 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
+  port::StatusOr<std::tuple<int, int, int>> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 43cfd313c1..3c47d2c2e8 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <functional>
 #include <limits>
 #include <memory>
+#include <tuple>
 
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -885,6 +886,12 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
+  // Gets the version of the backing library, as a {major, minor, patch} tuple.
+  virtual port::StatusOr<std::tuple<int, int, int>> GetVersion() {
+    return port::UnimplementedError(
+        "DnnSupport::GetVersion not implemented on this platform.");
+  }
+
   // Performs a single-precision forward batch normalization operation onto
   // the stream.
   //
-- 
GitLab


From 7c95ee3ca48f4e50818f12daf749cbe050a8643f Mon Sep 17 00:00:00 2001
From: Brett Koonce <koonce@hello.com>
Date: Sun, 18 Mar 2018 13:41:12 -0700
Subject: [PATCH 0426/1262] contrib: minor spelling tweaks

packages:
  data
  training
  tensor_forest
---
 .../python/kernel_tests/dataset_serialization_test_base.py  | 2 +-
 .../data/python/kernel_tests/interleave_dataset_op_test.py  | 4 ++--
 tensorflow/contrib/data/python/ops/scan_ops.py              | 2 +-
 tensorflow/contrib/tensor_forest/client/random_forest.py    | 2 +-
 .../hybrid/core/ops/hard_routing_function_op.cc             | 2 +-
 .../hybrid/core/ops/stochastic_hard_routing_function_op.cc  | 2 +-
 .../hybrid/core/ops/stochastic_hard_routing_gradient_op.cc  | 2 +-
 tensorflow/contrib/tensor_forest/kernels/tree_utils.cc      | 4 ++--
 tensorflow/contrib/tensor_forest/kernels/tree_utils.h       | 2 +-
 .../tensor_forest/kernels/v4/decision-tree-resource.h       | 2 +-
 .../tensor_forest/kernels/v4/decision_node_evaluator.h      | 2 +-
 tensorflow/contrib/tensor_forest/ops/model_ops.cc           | 2 +-
 tensorflow/contrib/tensor_forest/ops/stats_ops.cc           | 4 ++--
 tensorflow/contrib/tensor_forest/python/tensor_forest.py    | 2 +-
 tensorflow/contrib/training/python/training/resample.py     | 2 +-
 tensorflow/contrib/training/python/training/sampling_ops.py | 6 +++---
 .../python/training/sequence_queueing_state_saver.py        | 4 ++--
 17 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index dbc35097dd..78ecce8f7d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -163,7 +163,7 @@ class DatasetSerializationTestBase(test.TestCase):
                                  num_outputs,
                                  sparse_tensors=False,
                                  verify_exhausted=True):
-    """Verifies that restoring into an already initilized iterator works.
+    """Verifies that restoring into an already initialized iterator works.
 
     Args:
       ds_fn: See `run_core_tests`.
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 256ad8d94d..6a88a7caf6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -338,7 +338,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRaces(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
     Args:
@@ -424,7 +424,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRacesAndBlocking(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
 
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 1c88366273..fe49ee8b19 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -57,7 +57,7 @@ class _ScanDataset(dataset_ops.Dataset):
     self._output_shapes = None
     self._output_types = None
 
-    # Iteratively rerun the scan function until reaching a fixed pont on
+    # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
     need_to_rerun = True
     while need_to_rerun:
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 4abcc20ed3..35e8c92aba 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -399,7 +399,7 @@ def get_combined_model_fn(model_fns):
   training ops: tf.group them.
   loss: average them.
   predictions: concat probabilities such that predictions[*][0-C1] are the
-    probablities for output 1 (where C1 is the number of classes in output 1),
+    probabilities for output 1 (where C1 is the number of classes in output 1),
     predictions[*][C1-(C1+C2)] are the probabilities for output 2 (where C2
     is the number of classes in output 2), etc.  Also stack predictions such
     that predictions[i][j] is the class prediction for example i and output j.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index cf0db788a4..06bfe871fd 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -80,7 +80,7 @@ REGISTER_OP("HardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index c9df09bfda..1a055756c0 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -85,7 +85,7 @@ REGISTER_OP("StochasticHardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index b0d8b832b5..7d092bbc24 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -81,7 +81,7 @@ REGISTER_OP("StochasticHardRoutingGradient")
   tree_biases: `tree_biases[i]` gives the bias of the logistic
    regression model that translates from node features to
    probabilities.
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 44997ec5d6..cefcc96051 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -421,7 +421,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
                            const std::vector<float>& mu2) {
   // Math time!!
   // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface.
-  // Using Langrange multipliers, we get
+  // Using Lagrange multipliers, we get
   //   partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x
   //   partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y
   // or
@@ -485,7 +485,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
   }
 
   double sdiscrim = sqrt(discrim);
-  // TODO(thomaswc): Analyze whetever one of these is always closer.
+  // TODO(thomaswc): Analyze whatever one of these is always closer.
   double v1 = (-b + sdiscrim) / (2 * a);
   double v2 = (-b - sdiscrim) / (2 * a);
   double dist1 = getDistanceFromLambda3(v1, mu1, mu2);
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index edbac67006..03aab1b61e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -123,7 +123,7 @@ bool BestSplitDominatesRegression(const Tensor& total_sums,
                                   const Tensor& split_squares,
                                   int32 accumulator);
 
-// Performs booststrap_samples bootstrap samples of the best split's class
+// Performs bootstrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
 // least dominate_fraction of the time, the former has a better (lower)
 // Gini impurity.  Does not take over ownership of *rand.
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index 328af28725..d3edb43733 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -60,7 +60,7 @@ class DecisionTreeResource : public ResourceBase {
   mutex* get_mutex() { return &mu_; }
 
   // Return the TreeNode for the leaf that the example ends up at according
-  // to decsion_tree_. Also fill in that leaf's depth if it isn't nullptr.
+  // to decision_tree_. Also fill in that leaf's depth if it isn't nullptr.
   int32 TraverseTree(const std::unique_ptr<TensorDataSet>& input_data,
                      int example, int32* depth, TreePath* path) const;
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index bf2b2aaa3c..3db351c328 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -60,7 +60,7 @@ class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
   bool include_equals_;
 };
 
-// Evalutor for splits with multiple weighted features.
+// Evaluator for splits with multiple weighted features.
 class ObliqueInequalityDecisionNodeEvaluator
     : public BinaryDecisionNodeEvaluator {
  public:
diff --git a/tensorflow/contrib/tensor_forest/ops/model_ops.cc b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
index 3099cccdf8..98124d519c 100644
--- a/tensorflow/contrib/tensor_forest/ops/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
@@ -165,7 +165,7 @@ tree_handle: The handle to the tree.
 leaf_ids: `leaf_ids[i]` is the leaf id for input i.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 )doc");
 
diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
index e8b5c5d8a6..be0a11546d 100644
--- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
@@ -83,7 +83,7 @@ Grows the tree for finished nodes and allocates waiting nodes.
 params: A serialized TensorForestParams proto.
 tree_handle: The handle to the tree.
 stats_handle: The handle to the stats.
-finshed_nodes: A 1-d Tensor of finished node ids from ProcessInput.
+finished_nodes: A 1-d Tensor of finished node ids from ProcessInput.
 )doc");
 
 REGISTER_OP("ProcessInputV4")
@@ -119,7 +119,7 @@ sparse_input_values: The values tensor from the SparseTensor input.
 sparse_input_shape: The shape tensor from the SparseTensor input.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 finished_nodes: A 1-d tensor of node ids that have finished and are ready to
   grow.
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 3650b5d52f..b9bcbb170b 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -212,7 +212,7 @@ class ForestHParams(object):
     self.regression = getattr(self, 'regression', False)
 
     # Num_outputs is the actual number of outputs (a single prediction for
-    # classification, a N-dimenensional point for regression).
+    # classification, a N-dimensional point for regression).
     self.num_outputs = self.num_classes if self.regression else 1
 
     # Add an extra column to classes for storing counts, which is needed for
diff --git a/tensorflow/contrib/training/python/training/resample.py b/tensorflow/contrib/training/python/training/resample.py
index b16159bc16..7b8332b1d6 100644
--- a/tensorflow/contrib/training/python/training/resample.py
+++ b/tensorflow/contrib/training/python/training/resample.py
@@ -77,7 +77,7 @@ def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False):
 
   Args:
     inputs: A list of tensors, each of which has a shape of `[batch_size, ...]`
-    rates: A tensor of shape `[batch_size]` contiaining the resampling rates
+    rates: A tensor of shape `[batch_size]` containing the resampling rates
        for each input.
     scope: Scope for the op.
     seed: Random seed to use.
diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py
index ba888f87dc..7140f2a46d 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops.py
@@ -123,7 +123,7 @@ def rejection_sample(tensors,
         batch_size=batch_size,
         num_threads=queue_threads)
 
-    # Queues return a single tensor if the list of enqued tensors is one. Since
+    # Queues return a single tensor if the list of enqueued tensors is one. Since
     # we want the type to always be the same, always return a list.
     if isinstance(minibatch, ops.Tensor):
       minibatch = [minibatch]
@@ -312,7 +312,7 @@ def _verify_input(tensor_list, labels, probs_list):
   """Verify that batched inputs are well-formed."""
   checked_probs_list = []
   for probs in probs_list:
-    # Since number of classes shouldn't change at runtime, probalities shape
+    # Since number of classes shouldn't change at runtime, probabilities shape
     # should be fully defined.
     probs.get_shape().assert_is_fully_defined()
 
@@ -407,7 +407,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs):
   ```
 
 
-  A solution for a_i in terms of the other variabes is the following:
+  A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
   """
   # Make list of t_i / p_i.
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 99d486b183..39d75a0806 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -876,7 +876,7 @@ class SequenceQueueingStateSaver(object):
         ]):
           self._length = array_ops.identity(self._length)
 
-        # Only create barrier; enqueu and dequeue operations happen when you
+        # Only create barrier; enqueue and dequeue operations happen when you
         # access prefetch_op and next_batch.
         self._create_barrier()
         self._scope = scope
@@ -1637,7 +1637,7 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
 
   For `key, value` pairs in `input_context` with `SparseTensor` `value` removes
   them from `input_context` and transforms the `value` into a sequence and
-  then adding `key`, transformed `value` into `input_seuqences`.
+  then adding `key`, transformed `value` into `input_sequences`.
   The transformation is done by adding a new first dimension of `value_length`
   equal to that of the other values in input_sequences` and tiling the `value`
   every `num_unroll` steps.
-- 
GitLab


From f1b23c8e5cdbf119b66a01c04fff78c201659064 Mon Sep 17 00:00:00 2001
From: Rui Zhao <rzhao@google.com>
Date: Sat, 7 Apr 2018 14:43:08 -0700
Subject: [PATCH 0427/1262] Save some useful TPU estimator's ops into
 collections for performance measurement.

PiperOrigin-RevId: 192016099
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 6834600b79..47365b78a2 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -38,6 +38,7 @@ from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
@@ -53,6 +54,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -73,6 +75,8 @@ _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
 _ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
 
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
@@ -85,6 +89,13 @@ _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -2006,6 +2017,13 @@ class TPUEstimator(estimator_lib.Estimator):
         enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
         if mode == model_fn_lib.ModeKeys.TRAIN:
           loss, host_call, scaffold = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
@@ -2036,11 +2054,14 @@ class TPUEstimator(estimator_lib.Estimator):
           # Validate the TPU training graph to catch basic errors
           _validate_tpu_training_graph()
 
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
               training_hooks=hooks,
-              train_op=control_flow_ops.group(*update_ops),
+              train_op=train_op,
               scaffold=scaffold)
 
         if mode == model_fn_lib.ModeKeys.EVAL:
-- 
GitLab


From 4947ccc20b291dc317da7971fdad1b91c7f553b7 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Sat, 7 Apr 2018 15:01:08 -0700
Subject: [PATCH 0428/1262] Fix batch_norm_benchmark.py to work with the C API.

PiperOrigin-RevId: 192016546
---
 tensorflow/python/ops/batch_norm_benchmark.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/batch_norm_benchmark.py b/tensorflow/python/ops/batch_norm_benchmark.py
index 5d68b47aea..d83b819097 100644
--- a/tensorflow/python/ops/batch_norm_benchmark.py
+++ b/tensorflow/python/ops/batch_norm_benchmark.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradients_impl
@@ -39,7 +40,7 @@ from tensorflow.python.platform import test
 def batch_norm_op(tensor, mean, variance, beta, gamma, scale):
   """Fused kernel for batch normalization."""
   # _batch_norm_with_global_normalization is deprecated in v9
-  ops.get_default_graph().graph_def_versions.producer = 8
+  test_util.set_producer_version(ops.get_default_graph(), 8)
   # pylint: disable=protected-access
   return gen_nn_ops._batch_norm_with_global_normalization(
       tensor, mean, variance, beta, gamma, 0.001, scale)
-- 
GitLab


From 22cb902924b3b6e243af6ee386e660c826d60d12 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Sat, 7 Apr 2018 15:56:24 -0700
Subject: [PATCH 0429/1262] Revamp a few ScratchAllocator classes in
 cudnn_rnn_ops

Prepare for RNN autotune.
* The scratch allocator classes are renamed s.t. they're named by the duration of memory allocated.
  * CudnnReservespaceAllocator ==> CudnnRnnAllocatorInOutput.
  * CudnnWorkspaceAllocator ==> CudnnRnnAllocatorInTemp

* The old CudnnWorkspaceAllocator (new CudnnRnnAllocatorInTemp) is made a template s.t. it works with different tensor dtypes, which is used later in autotune, during which both workspace (uint8) and reserve space (input_dtype) are temp-allocated.

* Change CudnnModelShapes  ==> CudnnRnnModelShapes

PiperOrigin-RevId: 192018334
---
 tensorflow/core/kernels/cudnn_rnn_ops.cc    | 106 +++++++++++++-------
 tensorflow/stream_executor/cuda/cuda_dnn.cc |   1 -
 2 files changed, 67 insertions(+), 40 deletions(-)

diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index 07dc786d9b..e4036ddaa9 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -227,22 +227,43 @@ inline perftools::gputools::port::Status ToExecutorStatus(const Status& s) {
                       s.error_message());
 }
 
-// A helper to allocate temporary scratch memory for Cudnn RNN models. It takes
-// the ownership of the underlying memory. The expectation is that the memory
-// should be alive for the span of the Cudnn RNN itself.
-class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
+template <typename>
+struct ToTFDataType;
+
+template <>
+struct ToTFDataType<Eigen::half> : std::integral_constant<DataType, DT_HALF> {};
+
+template <>
+struct ToTFDataType<float> : std::integral_constant<DataType, DT_FLOAT> {};
+
+template <>
+struct ToTFDataType<double> : std::integral_constant<DataType, DT_DOUBLE> {};
+
+template <>
+struct ToTFDataType<uint8> : std::integral_constant<DataType, DT_UINT8> {};
+
+// A helper to allocate temporary scratch memory for Cudnn RNN models. It
+// takes the ownership of the underlying memory. The expectation is that the
+// memory should be alive for the span of the Cudnn RNN itself.
+template <typename T>
+class CudnnRnnAllocatorInTemp : public ScratchAllocator {
  public:
-  ~CudnnRNNWorkspaceAllocator() override {}
-  explicit CudnnRNNWorkspaceAllocator(OpKernelContext* context)
+  ~CudnnRnnAllocatorInTemp() = default;
+
+  explicit CudnnRnnAllocatorInTemp(OpKernelContext* context)
       : context_(context) {}
   int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return std::numeric_limits<int64>::max();
   }
+
   StatusOr<DeviceMemory<uint8>> AllocateBytes(
       perftools::gputools::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
+    const DataType tf_data_type = ToTFDataType<T>::value;
+    int64 allocate_count =
+        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
     Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
+        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
     if (!allocation_status.ok()) {
       return ToExecutorStatus(allocation_status);
     }
@@ -250,10 +271,16 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return StatusOr<DeviceMemory<uint8>>(
-        AsDeviceMemory<uint8>(&temporary_memory));
+    return DeviceMemory<uint8>::MakeFromByteSize(
+        temporary_memory.template flat<T>().data(),
+        temporary_memory.template flat<T>().size() * sizeof(T));
+  }
+
+  int64 TotalByteSize() const { return total_byte_size_; }
+
+  Tensor get_allocated_tensor(int index) const {
+    return allocated_tensors_[index];
   }
-  int64 TotalByteSize() { return total_byte_size_; }
 
  private:
   int64 total_byte_size_ = 0;
@@ -261,15 +288,15 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
   std::vector<Tensor> allocated_tensors_;
 };
 
-// A helper to allocate reserve-space memory for Cudnn RNN models. The tensors
-// are allocated as a kernel output, and will be fed into the backward pass.
+// A helper to allocate memory for Cudnn RNN models as a kernel output. It is
+// used by forward pass kernel to feed the output to the backward pass.
 // The memory is expected to live long enough after the backward pass is
 // finished.
 template <typename T>
-class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
+class CudnnRnnAllocatorInOutput : public ScratchAllocator {
  public:
-  ~CudnnRNNReserveSpaceAllocator() override {}
-  CudnnRNNReserveSpaceAllocator(OpKernelContext* context, int output_index)
+  ~CudnnRnnAllocatorInOutput() override {}
+  CudnnRnnAllocatorInOutput(OpKernelContext* context, int output_index)
       : context_(context), output_index_(output_index) {}
   int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return std::numeric_limits<int64>::max();
@@ -343,13 +370,14 @@ struct CudnnModelTypes {
   TFRNNInputMode rnn_input_mode;
   RnnDirectionMode rnn_direction_mode;
   bool HasInputC() const {
-    // For Cudnn 5.0, only LSTM has input-c. All other models use only input-h.
+    // For Cudnn 5.0, only LSTM has input-c. All other models use only
+    // input-h.
     return rnn_mode == RnnMode::kRnnLstm;
   }
 };
 
 // A helper class that collects the shapes to describe a RNN model.
-struct CudnnModelShapes {
+struct CudnnRnnModelShapes {
   int num_layers;
   int input_size;
   int num_units;
@@ -360,7 +388,7 @@ struct CudnnModelShapes {
   TensorShape output_shape;
   TensorShape hidden_state_shape;
   // At present only fields related to cached RnnDescriptor are concerned.
-  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
+  bool IsCompatibleWith(const CudnnRnnModelShapes& rhs) const {
     return num_layers == rhs.num_layers && input_size == rhs.input_size &&
            num_units == rhs.num_units && dir_count == rhs.dir_count;
   }
@@ -371,9 +399,9 @@ struct CudnnModelShapes {
   }
 };
 
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesHasher {
-  uint64 operator()(const CudnnModelShapes& to_hash) const {
+// Utility class for using CudnnRnnModelShapes as a hash table key.
+struct CudnnRnnModelShapesHasher {
+  uint64 operator()(const CudnnRnnModelShapes& to_hash) const {
     uint64 hash = static_cast<uint64>(to_hash.num_layers);
     hash = tensorflow::FingerprintCat64(
         hash, static_cast<uint64>(to_hash.input_size));
@@ -384,21 +412,21 @@ struct CudnnModelShapesHasher {
   }
 };
 
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesComparator {
-  bool operator()(const CudnnModelShapes& first,
-                  const CudnnModelShapes& second) const {
+// Utility class for using CudnnRnnModelShapes as a hash table key.
+struct CudnnRnnModelShapesComparator {
+  bool operator()(const CudnnRnnModelShapes& first,
+                  const CudnnRnnModelShapes& second) const {
     return first.IsCompatibleWith(second);
   }
 };
 
-// Extract and checks the forward input tensors, parameters, and shapes from the
-// OpKernelContext.
+// Extract and checks the forward input tensors, parameters, and shapes from
+// the OpKernelContext.
 Status ExtractForwardInput(OpKernelContext* context,
                            const CudnnModelTypes& model_types,
                            const Tensor** input, const Tensor** input_h,
                            const Tensor** input_c, const Tensor** params,
-                           CudnnModelShapes* model_shapes) {
+                           CudnnRnnModelShapes* model_shapes) {
   TF_RETURN_IF_ERROR(context->input("input", input));
   TF_RETURN_IF_ERROR(context->input("input_h", input_h));
   if (model_types.HasInputC()) {
@@ -810,7 +838,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
+    CudnnRnnModelShapes model_shapes;
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
@@ -876,7 +904,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the reserve_space. The memory lives in the
     // output of this kernel. And it will be fed into the backward pass when
     // needed.
-    CudnnRNNReserveSpaceAllocator<T> reserve_space_allocator(context, 3);
+    CudnnRnnAllocatorInOutput<T> reserve_space_allocator(context, 3);
     if (!is_training_) {
       Tensor* dummy_reserve_space = nullptr;
       OP_REQUIRES_OK(context,
@@ -884,7 +912,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     }
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
     bool launch_status = false;
     {
       mutex_lock l(mu_);
@@ -910,7 +938,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
                   input_c_data, params_data, *output_desc, &output_data,
                   *hidden_state_desc, &output_h_data, *hidden_state_desc,
                   &output_c_data, is_training_, &reserve_space_allocator,
-                  &workspace_allocator, /* output_result_profile */ nullptr)
+                  &workspace_allocator, /*output_result_profile=*/nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -920,8 +948,8 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
  private:
   mutex mu_;
   bool is_training_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
+  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
+                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
       rnn_state_cache_ GUARDED_BY(mu_);
 };
 
@@ -949,7 +977,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
+    CudnnRnnModelShapes model_shapes;
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
@@ -1090,7 +1118,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     auto reserve_space_uint8 = CastDeviceMemory<uint8, T>(reserve_space);
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
     bool launch_status = false;
     {
       mutex_lock l(mu_);
@@ -1119,7 +1147,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
                   output_c_backprop_data, &input_backprop_data,
                   &input_h_backprop_data, &input_c_backprop_data,
                   &params_backprop_data, &reserve_space_uint8,
-                  &workspace_allocator, /* output_result_profile */ nullptr)
+                  &workspace_allocator, /*output_result_profile=*/nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -1128,8 +1156,8 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
 
  private:
   mutex mu_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
+  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
+                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
       rnn_state_cache_ GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index fa5b90c945..1dc7f991b3 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -297,7 +297,6 @@ CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 namespace {
 
-// Forward declaration.
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
 
 cudnnHandle_t ToHandle(void* opaque_handle) {
-- 
GitLab


From 8fd805fc79ca585fe90ec9fd7c9e0feef89f798e Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Sun, 8 Apr 2018 15:37:26 -0700
Subject: [PATCH 0430/1262] [XLA] Parallelize HloEvaluator::HandleConvolution

This adds a parallel version of Literal::Populate, and uses it in the embarrassingly parallel convolution computation.

PiperOrigin-RevId: 192065277
---
 tensorflow/compiler/xla/literal_util.h        | 40 ++++++--
 tensorflow/compiler/xla/literal_util_test.cc  | 42 ++++++++
 .../compiler/xla/service/hlo_evaluator.cc     | 35 +++----
 tensorflow/compiler/xla/shape_util.h          | 99 +++++++++++++------
 tensorflow/compiler/xla/shape_util_test.cc    | 18 ++++
 5 files changed, 183 insertions(+), 51 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index a96a76fbb4..33abbdb813 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -587,6 +587,12 @@ class Literal {
   template <typename NativeT, typename FnType>
   Status Populate(const FnType& generator);
 
+  // A parallel version of Populate(). This can be used if the generator is
+  // thread-safe and the values for the shape's different elements are
+  // independent.
+  template <typename NativeT, typename FnType>
+  Status PopulateParallel(const FnType& generator);
+
   // Fills this literal with the given value.
   template <typename NativeT>
   void PopulateWithValue(NativeT value);
@@ -785,6 +791,10 @@ class Literal {
   // buffer).
   void DeallocateBuffers();
 
+  // Implementation details shared between Populate() and PopulateParallel()
+  template <typename NativeT, typename FnType>
+  Status PopulateInternal(const FnType& generator, bool parallel);
+
   Shape shape_;
   ShapeTree<Piece> pieces_;
 
@@ -1276,7 +1286,7 @@ void Literal::PopulateSparse(SparseIndexArray indices,
 }
 
 template <typename NativeT, typename FnType>
-Status Literal::Populate(const FnType& generator) {
+Status Literal::PopulateInternal(const FnType& generator, bool parallel) {
   const Shape& this_shape = shape();
   const int64 rank = ShapeUtil::Rank(this_shape);
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
@@ -1286,11 +1296,11 @@ Status Literal::Populate(const FnType& generator) {
   if (rank > 0) {
     StrideConfig stride_config(this_shape, this_shape,
                                AsInt64Slice(this_shape.dimensions()));
-    DimensionVector minor_scan_indexes(rank, 0);
     int64 minor_dimension_size =
         ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension);
 
     auto init_function = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+      DimensionVector minor_scan_indexes(rank, 0);
       const int64 index =
           IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes);
       std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
@@ -1298,17 +1308,35 @@ Status Literal::Populate(const FnType& generator) {
         minor_scan_indexes[stride_config.minor_dimension] = i;
         literal_data.at(index + i) = generator(minor_scan_indexes);
       }
-      return true;
     };
-    ShapeUtil::ForEachIndex(this_shape, stride_config.base,
-                            stride_config.dimensions, stride_config.step,
-                            init_function);
+    if (parallel) {
+      ShapeUtil::ForEachIndexParallel(this_shape, stride_config.base,
+                                      stride_config.dimensions,
+                                      stride_config.step, init_function);
+    } else {
+      ShapeUtil::ForEachIndex(
+          this_shape, stride_config.base, stride_config.dimensions,
+          stride_config.step,
+          [&init_function](tensorflow::gtl::ArraySlice<int64> indexes) {
+            init_function(indexes);
+            return true;
+          });
+    }
   } else {
     // For scalars.
     literal_data.at(0) = generator({});
   }
   return Status::OK();
 }
+template <typename NativeT, typename FnType>
+Status Literal::Populate(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/false);
+}
+
+template <typename NativeT, typename FnType>
+Status Literal::PopulateParallel(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/true);
+}
 
 template <typename NativeT>
 void Literal::PopulateWithValue(NativeT value) {
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 7627762074..8b000f44f7 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -1090,6 +1090,48 @@ TEST_F(LiteralUtilTest, Populate) {
   }
 }
 
+TEST_F(LiteralUtilTest, PopulateParallel) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{0}, {0}},
+      {{16}, {0}},
+      {{2, 0}, {1, 0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = Literal::CreateFromShape(shape);
+    auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(),
+                                                           indexes) +
+             17;
+    };
+    TF_EXPECT_OK(literal->PopulateParallel<uint32>(generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](ArraySlice<int64> indexes) {
+      auto value = literal->Get<uint32>(indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
 TEST_F(LiteralUtilTest, ConvertR4) {
   // clang-format off
   auto original = Literal::CreateR4WithLayout<int8>({{
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 53ad8909c5..b24757c33c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -998,18 +998,6 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
-    // Dimension number applicable for input (lhs).
-    const int64 input_batch_dim = dnums.input_batch_dimension();
-    const int64 input_z_dim = dnums.input_feature_dimension();
-    // Dimension number applicable for kernel (rhs).
-    const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
-    const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
-    // Dimension number applicable for output.
-    const int64 output_batch_dim = dnums.output_batch_dimension();
-    const int64 output_z_dim = dnums.output_feature_dimension();
-
-    const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
-
     std::vector<int64> window_dimension_sizes;
     for (auto i : dnums.kernel_spatial_dimensions()) {
       window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i));
@@ -1021,14 +1009,27 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
     DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
 
-    DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
-
     auto lhs_literal_data = lhs_literal.data<ReturnT>();
     auto rhs_literal_data = rhs_literal.data<ReturnT>();
 
-    auto func = [&](ArraySlice<int64> out_index) {
+    auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
+                 &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
+                 rhs_literal_data](ArraySlice<int64> out_index) {
+      // Dimension number applicable for input (lhs).
+      const int64 input_batch_dim = dnums.input_batch_dimension();
+      const int64 input_z_dim = dnums.input_feature_dimension();
+      // Dimension number applicable for kernel (rhs).
+      const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
+      const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
+      // Dimension number applicable for output.
+      const int64 output_batch_dim = dnums.output_batch_dimension();
+      const int64 output_z_dim = dnums.output_feature_dimension();
+
+      const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
+
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
-      std::fill(rhs_spatial_index.begin(), rhs_spatial_index.end(), 0);
+      DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
+                                        0);
 
       // Convolve input feature with kernel.
       do {
@@ -1100,7 +1101,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     };
 
     auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(func));
+    TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
 
     parent_->evaluated_[conv] = std::move(result);
     return Status::OK();
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 3e130a02e2..b9becf6452 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -28,8 +28,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -583,34 +585,7 @@ class ShapeUtil {
                                        tensorflow::gtl::ArraySlice<int64> count,
                                        tensorflow::gtl::ArraySlice<int64> incr,
                                        const FnType& visitor_function) {
-    if (ShapeUtil::HasZeroElements(shape)) {
-      return Status::OK();
-    }
-    CHECK_EQ(Rank(shape), base.size());
-    CHECK_EQ(incr.size(), base.size());
-    CHECK_EQ(count.size(), base.size());
-    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
-    // Allows handling R0 arrays, such that the visitor function will be called
-    // once with the proper empty indexes.
-    int64 n = -1;
-    std::vector<int64> indexes(base.begin(), base.end());
-    while (n < rank) {
-      TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
-      if (!should_continue) {
-        break;
-      }
-      // Increments dimensions in minor to major order.
-      for (n = 0; n < rank; ++n) {
-        int64 dim = LayoutUtil::Minor(shape.layout(), n);
-        indexes[dim] += incr[dim];
-        if (indexes[dim] < base[dim] + count[dim]) {
-          break;
-        }
-        indexes[dim] = base[dim];
-      }
-    }
-
-    return Status::OK();
+    return ForEachIndexInternal(shape, base, count, incr, visitor_function);
   }
 
   // Simple ergonomic wrapper around ShapeUtil::ForEachIndexWithStatus.
@@ -642,11 +617,79 @@ class ShapeUtil {
         .IgnoreError();
   }
 
+  // A parallel version of ForEachIndex(WithStatus). This can only be used if
+  // the visitor_function is thread-safe and the order of iteration does not
+  // matter.
+  //
+  // visitor_function must be a callable of type
+  // void(ArraySlice<int64>) or compatible.
+  template <typename FnType>
+  static void ForEachIndexParallel(const Shape& shape,
+                                   tensorflow::gtl::ArraySlice<int64> base,
+                                   tensorflow::gtl::ArraySlice<int64> count,
+                                   tensorflow::gtl::ArraySlice<int64> incr,
+                                   const FnType& visitor_function) {
+    const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
+    tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "test",
+                                        kNumThreads);
+    // If a pool is provided, ForEachIndexInternal can never fail.
+    CHECK(ForEachIndexInternal(
+              shape, base, count, incr,
+              [&visitor_function](tensorflow::gtl::ArraySlice<int64> indexes)
+                  -> StatusOr<bool> {
+                visitor_function(indexes);
+                return true;
+              },
+              &pool)
+              .ok());
+  }
+
  private:
   // Validates all of the non-layout properties of the shape -- this is a helper
   // used by both the layout-optional and layout-required public method.
   static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
 
+  template <typename FnType>
+  static Status ForEachIndexInternal(
+      const Shape& shape, tensorflow::gtl::ArraySlice<int64> base,
+      tensorflow::gtl::ArraySlice<int64> count,
+      tensorflow::gtl::ArraySlice<int64> incr, const FnType& visitor_function,
+      tensorflow::thread::ThreadPool* pool = nullptr) {
+    if (ShapeUtil::HasZeroElements(shape)) {
+      return Status::OK();
+    }
+    CHECK_EQ(Rank(shape), base.size());
+    CHECK_EQ(incr.size(), base.size());
+    CHECK_EQ(count.size(), base.size());
+    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
+    // Allows handling R0 arrays, such that the visitor function will be called
+    // once with the proper empty indexes.
+    int64 n = -1;
+    std::vector<int64> indexes(base.begin(), base.end());
+    while (n < rank) {
+      if (pool != nullptr) {
+        pool->Schedule(
+            [indexes, visitor_function] { visitor_function(indexes); });
+      } else {
+        TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
+        if (!should_continue) {
+          break;
+        }
+      }
+      // Increments dimensions in minor to major order.
+      for (n = 0; n < rank; ++n) {
+        int64 dim = LayoutUtil::Minor(shape.layout(), n);
+        indexes[dim] += incr[dim];
+        if (indexes[dim] < base[dim] + count[dim]) {
+          break;
+        }
+        indexes[dim] = base[dim];
+      }
+    }
+
+    return Status::OK();
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
 
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 424cfe37ea..13582a2a26 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -624,6 +624,24 @@ TEST(ShapeUtilTest, ForEachIndexWithStatus) {
   EXPECT_EQ(invocations, 5);
 }
 
+TEST(ShapeUtilTest, ForEachIndexParallel) {
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 10});
+  int64 output[10][10];
+  int init = 5;
+  auto set_func = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+    output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
+  };
+
+  ShapeUtil::ForEachIndexParallel(shape, /*base=*/{0, 0}, /*count=*/{10, 10},
+                                  /*incr=*/{1, 1}, set_func);
+
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(output[i][j], init + i + j);
+    }
+  }
+}
+
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) {
   // All output dimensions should be unmodified. One of the input dimensions is
   // modified because the input rank is larger by one.
-- 
GitLab


From 1eea5ad3f9a622411117f7208d308055b0707d0f Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Sun, 8 Apr 2018 15:42:16 -0700
Subject: [PATCH 0431/1262] Automated g4 rollback of changelist 191360905

PiperOrigin-RevId: 192065431
---
 tensorflow/BUILD                              |   8 --
 tensorflow/__init__.py                        |   7 +-
 tensorflow/contrib/cmake/python_modules.txt   |   2 +
 tensorflow/contrib/cmake/tf_python.cmake      |  91 ++++++++++---
 tensorflow/experimental_api.py                |  38 ------
 tensorflow/python/framework/dtypes.py         |   2 +-
 tensorflow/python/framework/versions.py       |  12 +-
 tensorflow/tools/api/generator/BUILD          |   2 +
 .../tools/api/generator/create_python_api.py  | 124 ++++++++++++------
 .../api/generator/create_python_api_test.py   |   6 +-
 tensorflow/tools/api/tests/BUILD              |   1 -
 .../tools/api/tests/api_compatibility_test.py |  58 +-------
 .../ci_build/windows/cpu/cmake/run_py.bat     |   6 +-
 13 files changed, 183 insertions(+), 174 deletions(-)
 delete mode 100644 tensorflow/experimental_api.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 3d5737a9d7..cfafffdd13 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -540,14 +540,6 @@ py_library(
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
-
-py_library(
-    name = "experimental_tensorflow_py",
-    srcs = ["experimental_api.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow/tools/api/tests:__subpackages__"],
     deps = [
         "//tensorflow/python",
         "//tensorflow/tools/api/generator:python_api",
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 78ad6aec19..c8683e3976 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -20,14 +20,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # pylint: disable=wildcard-import
-from tensorflow.python import *  # pylint: disable=redefined-builtin
+from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 8e83b4e176..b786c6d5cb 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -104,6 +104,8 @@ tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
 tensorflow/tools
+tensorflow/tools/api
+tensorflow/tools/api/generator
 tensorflow/tools/graph_transforms
 tensorflow/contrib
 tensorflow/contrib/all_reduce
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 1a5ec34844..ded15b4b66 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -689,6 +689,77 @@ AddUserOps(TARGET _beam_search_ops
     DEPENDS pywrap_tensorflow_internal tf_python_ops
     DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
+else()
+  add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
+endif()
+
+
+########################################################
+# Generate API __init__.py files.
+########################################################
+
+# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
+
+set(api_init_files "")
+foreach(api_init_file ${api_init_files_list})
+    string(STRIP "${api_init_file}" api_init_file)
+    if(api_init_file)
+        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
+    endif()
+endforeach(api_init_file)
+set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
+file(WRITE "${api_init_list_file}" "${api_init_files}")
+
+# Run create_python_api.py to generate __init__.py files.
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+      # this step is running since the files aren't there yet.
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
+
+      # Re-add tensorflow/__init__.py back.
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+)
+
+add_custom_target(tf_python_api SOURCES ${api_init_files})
+add_dependencies(tf_python_api tf_python_ops)
+
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -698,6 +769,7 @@ add_dependencies(tf_python_build_pip_package
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
+    tf_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
@@ -710,25 +782,6 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  else()
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  endif()
-else()
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
-endif()
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/experimental_api.py b/tensorflow/experimental_api.py
deleted file mode 100644
index 63a8aa9cb1..0000000000
--- a/tensorflow/experimental_api.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# Bring in all of the public TensorFlow interface into this
-# module.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-# pylint: disable=wildcard-import
-from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
-# pylint: enable=wildcard-import
-
-from tensorflow.python.util.lazy_loader import LazyLoader
-contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
-del LazyLoader
-
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
-
-del absolute_import
-del division
-del print_function
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 0edae92fd4..a31c424263 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -345,7 +345,7 @@ tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 tf_export("uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
-tf_export("uint64").export_constant(__name__, "uint32")
+tf_export("uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
 tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index d08b4bf48a..472ccbcac7 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -31,13 +31,17 @@ __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 VERSION = __version__
 tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
-tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
+tf_export("GIT_VERSION", "__git_version__").export_constant(
+    __name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
-tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
+tf_export("COMPILER_VERSION", "__compiler_version__").export_constant(
+    __name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
-tf_export("CXX11_ABI_FLAG").export_constant(__name__, "CXX11_ABI_FLAG")
+tf_export("CXX11_ABI_FLAG", "__cxx11_abi_flag__").export_constant(
+    __name__, "CXX11_ABI_FLAG")
 MONOLITHIC_BUILD = __monolithic_build__
-tf_export("MONOLITHIC_BUILD").export_constant(__name__, "MONOLITHIC_BUILD")
+tf_export("MONOLITHIC_BUILD", "__monolithic_build__").export_constant(
+    __name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 9f1bdd8aae..a1c569951e 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -32,6 +32,7 @@ genrule(
     # api/module1/module2/__init__.py and api/module3/__init__.py.
     # keep sorted
     outs = [
+        # BEGIN GENERATED FILES
         "api/__init__.py",
         "api/app/__init__.py",
         "api/bitwise/__init__.py",
@@ -117,6 +118,7 @@ genrule(
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
         "api/user_ops/__init__.py",
+        # END GENERATED FILES
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 183c4731b8..6fa48cd70c 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -67,18 +67,23 @@ def format_import(source_module_name, source_name, dest_name):
       return 'import %s as %s' % (source_name, dest_name)
 
 
-class _ModuleImportsBuilder(object):
+class _ModuleInitCodeBuilder(object):
   """Builds a map from module name to imports included in that module."""
 
   def __init__(self):
-    self.module_imports = collections.defaultdict(list)
-    self._seen_api_names = set()
+    self.module_imports = collections.defaultdict(
+        lambda: collections.defaultdict(set))
+    self._dest_import_to_id = collections.defaultdict(int)
+    # Names that start with underscore in the root module.
+    self._underscore_names_in_root = []
 
   def add_import(
-      self, dest_module_name, source_module_name, source_name, dest_name):
+      self, symbol_id, dest_module_name, source_module_name, source_name,
+      dest_name):
     """Adds this import to module_imports.
 
     Args:
+      symbol_id: (number) Unique identifier of the symbol to import.
       dest_module_name: (string) Module name to add import to.
       source_module_name: (string) Module to import from.
       source_name: (string) Name of the symbol to import.
@@ -89,34 +94,67 @@ class _ModuleImportsBuilder(object):
         dest_name has already been added to dest_module_name.
     """
     import_str = format_import(source_module_name, source_name, dest_name)
-    if import_str in self.module_imports[dest_module_name]:
-      return
 
     # Check if we are trying to expose two different symbols with same name.
     full_api_name = dest_name
     if dest_module_name:
       full_api_name = dest_module_name + '.' + full_api_name
-    if full_api_name in self._seen_api_names:
+    if (full_api_name in self._dest_import_to_id and
+        symbol_id != self._dest_import_to_id[full_api_name] and
+        symbol_id != -1):
       raise SymbolExposedTwiceError(
           'Trying to export multiple symbols with same name: %s.' %
           full_api_name)
-    self._seen_api_names.add(full_api_name)
+    self._dest_import_to_id[full_api_name] = symbol_id
 
-    self.module_imports[dest_module_name].append(import_str)
+    if not dest_module_name and dest_name.startswith('_'):
+      self._underscore_names_in_root.append(dest_name)
 
+    # The same symbol can be available in multiple modules.
+    # We store all possible ways of importing this symbol and later pick just
+    # one.
+    self.module_imports[dest_module_name][full_api_name].add(import_str)
 
-def get_api_imports():
-  """Get a map from destination module to formatted imports.
+  def build(self):
+    """Get a map from destination module to __init__.py code for that module.
+
+    Returns:
+      A dictionary where
+        key: (string) destination module (for e.g. tf or tf.consts).
+        value: (string) text that should be in __init__.py files for
+          corresponding modules.
+    """
+    module_text_map = {}
+    for dest_module, dest_name_to_imports in self.module_imports.items():
+      # Sort all possible imports for a symbol and pick the first one.
+      imports_list = [
+          sorted(imports)[0]
+          for _, imports in dest_name_to_imports.items()]
+      module_text_map[dest_module] = '\n'.join(sorted(imports_list))
+
+    # Expose exported symbols with underscores in root module
+    # since we import from it using * import.
+    underscore_names_str = ', '.join(
+        '\'%s\'' % name for name in self._underscore_names_in_root)
+    module_text_map[''] += '''
+_names_with_underscore = [%s]
+__all__ = [s for s in dir() if not s.startswith('_')]
+__all__.extend([s for s in _names_with_underscore])
+''' % underscore_names_str
+
+    return module_text_map
+
+
+def get_api_init_text():
+  """Get a map from destination module to __init__.py code for that module.
 
   Returns:
     A dictionary where
       key: (string) destination module (for e.g. tf or tf.consts).
-      value: List of strings representing module imports
-          (for e.g. 'from foo import bar') and constant
-          assignments (for e.g. 'FOO = 123').
+      value: (string) text that should be in __init__.py files for
+        corresponding modules.
   """
-  module_imports_builder = _ModuleImportsBuilder()
-  visited_symbols = set()
+  module_code_builder = _ModuleInitCodeBuilder()
 
   # Traverse over everything imported above. Specifically,
   # we want to traverse over TensorFlow Python modules.
@@ -130,8 +168,6 @@ def get_api_imports():
 
     for module_contents_name in dir(module):
       attr = getattr(module, module_contents_name)
-      if id(attr) in visited_symbols:
-        continue
 
       # If attr is _tf_api_constants attribute, then add the constants.
       if module_contents_name == _API_CONSTANTS_ATTR:
@@ -139,30 +175,25 @@ def get_api_imports():
           for export in exports:
             names = export.split('.')
             dest_module = '.'.join(names[:-1])
-            module_imports_builder.add_import(
-                dest_module, module.__name__, value, names[-1])
+            module_code_builder.add_import(
+                -1, dest_module, module.__name__, value, names[-1])
         continue
 
       _, attr = tf_decorator.unwrap(attr)
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
       if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
-        # If the same symbol is available using multiple names, only create
-        # imports for it once.
-        if id(attr) in visited_symbols:
-          continue
-        visited_symbols.add(id(attr))
-
         for export in attr._tf_api_names:  # pylint: disable=protected-access
           names = export.split('.')
           dest_module = '.'.join(names[:-1])
-          module_imports_builder.add_import(
-              dest_module, module.__name__, module_contents_name, names[-1])
+          module_code_builder.add_import(
+              id(attr), dest_module, module.__name__, module_contents_name,
+              names[-1])
 
   # Import all required modules in their parent modules.
   # For e.g. if we import 'foo.bar.Value'. Then, we also
   # import 'bar' in 'foo'.
-  imported_modules = set(module_imports_builder.module_imports.keys())
+  imported_modules = set(module_code_builder.module_imports.keys())
   for module in imported_modules:
     if not module:
       continue
@@ -175,11 +206,11 @@ def get_api_imports():
         parent_module += ('.' + module_split[submodule_index-1] if parent_module
                           else module_split[submodule_index-1])
         import_from += '.' + parent_module
-      module_imports_builder.add_import(
-          parent_module, import_from, module_split[submodule_index],
-          module_split[submodule_index])
+      module_code_builder.add_import(
+          -1, parent_module, import_from,
+          module_split[submodule_index], module_split[submodule_index])
 
-  return module_imports_builder.module_imports
+  return module_code_builder.build()
 
 
 def create_api_files(output_files):
@@ -195,16 +226,19 @@ def create_api_files(output_files):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
+    # Convert path separators to '/' for easier parsing below.
+    normalized_output_file = output_file.replace(os.sep, '/')
     if _API_DIR not in output_file:
       raise ValueError(
           'Output files must be in api/ directory, found %s.' % output_file)
     # Get the module name that corresponds to output_file.
     # First get module directory under _API_DIR.
     module_dir = os.path.dirname(
-        output_file[output_file.rfind(_API_DIR)+len(_API_DIR):])
+        normalized_output_file[
+            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
     # Convert / to .
     module_name = module_dir.replace('/', '.').strip('.')
-    module_name_to_file_path[module_name] = output_file
+    module_name_to_file_path[module_name] = os.path.normpath(output_file)
 
   # Create file for each expected output in genrule.
   for module, file_path in module_name_to_file_path.items():
@@ -212,11 +246,11 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_imports = get_api_imports()
+  module_text_map = get_api_init_text()
 
   # Add imports to output files.
   missing_output_files = []
-  for module, exports in module_imports.items():
+  for module, text in module_text_map.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
       module_file_path = '"api/%s/__init__.py"' %  (
@@ -224,7 +258,7 @@ def create_api_files(output_files):
       missing_output_files.append(module_file_path)
       continue
     with open(module_name_to_file_path[module], 'w') as fp:
-      fp.write(_GENERATED_FILE_HEADER + '\n'.join(exports))
+      fp.write(_GENERATED_FILE_HEADER + text)
 
   if missing_output_files:
     raise ValueError(
@@ -241,6 +275,16 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
-      help='Python files that we expect this script to output.')
+      help='If a single file is passed in, then we we assume it contains a '
+      'semicolon-separated list of Python files that we expect this script to '
+      'output. If multiple files are passed in, then we assume output files '
+      'are listed directly as arguments.')
   args = parser.parse_args()
-  main(args.outputs)
+  if len(args.outputs) == 1:
+    # If we only get a single argument, then it must be a file containing
+    # list of outputs.
+    with open(args.outputs[0]) as output_list_file:
+      outputs = [line.strip() for line in output_list_file.read().split(';')]
+  else:
+    outputs = args.outputs
+  main(outputs)
diff --git a/tensorflow/tools/api/generator/create_python_api_test.py b/tensorflow/tools/api/generator/create_python_api_test.py
index 2760779e6e..218c812045 100644
--- a/tensorflow/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/tools/api/generator/create_python_api_test.py
@@ -56,7 +56,7 @@ class CreatePythonApiTest(test.TestCase):
     del sys.modules[_MODULE_NAME]
 
   def testFunctionImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = (
         'from test.tensorflow.test_module import test_op as test_op1')
     self.assertTrue(
@@ -69,14 +69,14 @@ class CreatePythonApiTest(test.TestCase):
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testClassImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = 'from test.tensorflow.test_module import TestClass'
     self.assertTrue(
         'TestClass' in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testConstantIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected = 'from test.tensorflow.test_module import _TEST_CONSTANT'
     self.assertTrue(expected in str(imports),
                     msg='%s not in %s' % (expected, str(imports)))
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 0dc154b6d2..724b12cd47 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -23,7 +23,6 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:experimental_tensorflow_py",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:lib",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 7eeae05847..26d5bca637 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -34,7 +34,6 @@ import sys
 import unittest
 
 import tensorflow as tf
-from tensorflow import experimental_api as api
 
 from google.protobuf import text_format
 
@@ -47,8 +46,6 @@ from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 
-if hasattr(tf, 'experimental_api'):
-  del tf.experimental_api
 
 # FLAGS defined at the bottom:
 FLAGS = None
@@ -145,9 +142,6 @@ class ApiCompatibilityTest(test.TestCase):
       verbose_diff_message = ''
       # First check if the key is not found in one or the other.
       if key in only_in_expected:
-        # TODO(annarev): remove once we switch to using tf_export decorators.
-        if key == 'tensorflow.math':
-          continue
         diff_message = 'Object %s expected but not found (removed). %s' % (
             key, additional_missing_object_message)
         verbose_diff_message = diff_message
@@ -208,58 +202,12 @@ class ApiCompatibilityTest(test.TestCase):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
-    public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    traverse.traverse(tf, public_api_visitor)
-
-    proto_dict = visitor.GetProtos()
-
-    # Read all golden files.
-    expression = os.path.join(
-        resource_loader.get_root_dir_with_all_resources(),
-        _KeyToFilePath('*'))
-    golden_file_list = file_io.get_matching_files(expression)
-
-    def _ReadFileToProto(filename):
-      """Read a filename, create a protobuf from its contents."""
-      ret_val = api_objects_pb2.TFAPIObject()
-      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
-      return ret_val
-
-    golden_proto_dict = {
-        _FileNameToKey(filename): _ReadFileToProto(filename)
-        for filename in golden_file_list
-    }
-
-    # TODO(annarev): remove once we switch to using tf_export decorators.
-    tf_module = golden_proto_dict['tensorflow'].tf_module
-    for i in range(len(tf_module.member)):
-      if tf_module.member[i].name == 'math':
-        del tf_module.member[i]
-        break
-
-    # Diff them. Do not fail if called with update.
-    # If the test is run to update goldens, only report diffs but do not fail.
-    self._AssertProtoDictEquals(
-        golden_proto_dict,
-        proto_dict,
-        verbose=FLAGS.verbose_diffs,
-        update_goldens=FLAGS.update_goldens)
-
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
-  def testNewAPIBackwardsCompatibility(self):
-    # Extract all API stuff.
-    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
-
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     # TODO(annarev): Make slide_dataset available in API.
     public_api_visitor.private_map['tf'] = ['slide_dataset']
-    traverse.traverse(api, public_api_visitor)
+    traverse.traverse(tf, public_api_visitor)
 
     proto_dict = visitor.GetProtos()
 
@@ -286,9 +234,7 @@ class ApiCompatibilityTest(test.TestCase):
         golden_proto_dict,
         proto_dict,
         verbose=FLAGS.verbose_diffs,
-        update_goldens=False,
-        additional_missing_object_message=
-        'Check if tf_export decorator/call is missing for this symbol.')
+        update_goldens=FLAGS.update_goldens)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 3c3b223a00..30554a084c 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -37,9 +40,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
-- 
GitLab


From 91f243fecee7382a969dc830c74c3f17b4dec11d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 9 Apr 2018 02:00:47 +0000
Subject: [PATCH 0432/1262] Fix broken link in doc for install_c.md

This fix adds `https://` to stackoverflow link. Without
`https://` the link is rendered as:
```
https://www.tensorflow.org/install/www.stackoverflow.com/questions/tagged/tensorflow
```
in the current page and is broken.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/install/install_c.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index a3eca4bf37..274413e294 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -113,6 +113,6 @@ If executing `a.out` fails, ask yourself the following questions:
   * Did you export those environment variables?
 
 If you are still seeing build or execution error messages, search (or post to)
-[StackOverflow](www.stackoverflow.com/questions/tagged/tensorflow) for
+[StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow) for
 possible solutions.
 
-- 
GitLab


From bbc1ce5b1397041d12d90502c08997de03f798b3 Mon Sep 17 00:00:00 2001
From: DosLin <doslino@gmail.com>
Date: Mon, 9 Apr 2018 13:14:56 +0800
Subject: [PATCH 0433/1262] Docs: Fix 'Unable to find source java class'

---
 tensorflow/docs_src/mobile/android_build.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 08a5fbe41c..0cd0a98be4 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,7 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
 
-4. Click the Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Running "Build -> Rebuild Project" from Android Studio menu and click the 
+    Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
 
     If it asks you to use Instant Run, click **Proceed Without Instant Run**.
 
-- 
GitLab


From 5c469e6bafb479ef110b2f02f070507a3711664d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 01:59:50 -0700
Subject: [PATCH 0434/1262] Enabling fp16 for NCCL 1 and 2.

PiperOrigin-RevId: 192096789
---
 .../contrib/nccl/kernels/nccl_manager.cc      |   2 +
 .../contrib/nccl/kernels/nccl_manager_test.cc | 214 ++++++++++--------
 tensorflow/contrib/nccl/ops/nccl_ops.cc       |  14 +-
 .../contrib/nccl/python/ops/nccl_ops_test.py  |   2 +-
 4 files changed, 127 insertions(+), 105 deletions(-)

diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 913935b382..b9b482a698 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -76,6 +76,8 @@ struct NcclManager::Communicator {
 namespace {
 ncclDataType_t ToNcclType(DataType t) {
   switch (t) {
+    case DT_HALF:
+      return ncclHalf;
     case DT_FLOAT:
       return ncclFloat;
     case DT_DOUBLE:
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 985b2bae25..06ca65e33a 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -48,35 +48,9 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   return gpus;
 }
 
+template <typename Scalar>
 class NcclManagerTest : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices->empty());
-    LOG(ERROR) << "Running test with " << devices->size() << " gpus";
-  }
-  static void TearDownTestCase() {
-    for (auto device : *devices) delete device;
-    delete devices;
-  }
-
-  static Allocator* gpu_allocator(BaseGPUDevice* device) {
-    return device->GetStepAllocator(AllocatorAttributes(),
-                                    nullptr /* step_resource_manager */);
-  }
-
-  static std::vector<BaseGPUDevice*>* devices;
-
-  template <typename Scalar>
-  perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-      const Scalar* cuda_memory) {
-    perftools::gputools::DeviceMemoryBase wrapped(
-        const_cast<Scalar*>(cuda_memory));
-    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
-    return typed;
-  }
-
+ public:
   // A single all-reduce to apply.
   struct TestCase {
     string key;
@@ -89,42 +63,52 @@ class NcclManagerTest : public ::testing::Test {
     int num_completed = 0;
   };
 
+  static void SetUpTestCase() {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
+    CHECK(!devices_->empty());
+    LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
+  }
+
+  static void TearDownTestCase() {
+    for (auto device : *devices_) delete device;
+    delete devices_;
+  }
+
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
     TestCase* test_case = new TestCase();
-    test_case->expected = Tensor(DT_FLOAT, shape);
+    test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 1; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(1); });
     } else if (reduction_op == ncclSum) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 0; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(0); });
     } else if (reduction_op == ncclMax) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return -1 * std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return -max_; });
     } else if (reduction_op == ncclMin) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return max_; });
     } else {
       LOG(FATAL) << "Invalid reduction_op " << reduction_op;
     }
 
-    int mult = 1;
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
 
-      Tensor in_cpu(DT_FLOAT, shape);
-      test::FillFn<float>(&in_cpu, [mult, value_offset](int index) {
-        return value_offset + (index + 1) * mult;
+      Tensor in_cpu(data_type_, shape);
+      test::FillFn<Scalar>(&in_cpu, [&](int index) {
+        return static_cast<Scalar>((index + 1) * value_scale + value_offset);
       });
       for (int j = 0; j < shape.num_elements(); ++j) {
-        auto in_val = in_cpu.flat<float>()(j);
-        auto out_expr = test_case->expected.flat<float>();
+        auto in_val = in_cpu.flat<Scalar>()(j);
+        auto out_expr = test_case->expected.template flat<Scalar>();
         if (reduction_op == ncclProd) {
-          out_expr(j) *= in_val;
+          out_expr(j) = out_expr(j) * in_val;
         } else if (reduction_op == ncclSum) {
-          out_expr(j) += in_val;
+          out_expr(j) = out_expr(j) + in_val;
         } else if (reduction_op == ncclMax) {
           if (in_val > out_expr(j)) {
             out_expr(j) = in_val;
@@ -136,26 +120,18 @@ class NcclManagerTest : public ::testing::Test {
         }
       }
 
-      mult *= 10;
-      test_case->ins.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
-      test_case->outs.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
+      value_scale *= 10;
+      test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape);
+      test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape);
 
       const Tensor& in_gpu = test_case->ins.back();
-      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<float>().data());
-      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<float>().data(),
+      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
                          in_cpu.TotalBytes());
     }
     return test_case;
   }
 
-  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
-    return [this, test_case](Status s) {
-      mutex_lock l(test_case->mu);
-      ++test_case->num_completed;
-      test_case->final_status.Update(s);
-    };
-  }
-
   void VerifyResults(const string& case_label, TestCase* test_case) {
     // Wait for the done callback to be called.
     {
@@ -168,41 +144,84 @@ class NcclManagerTest : public ::testing::Test {
       test_case->mu.unlock();
     }
     // Copy memory to host and verify.
-    for (int i = 0; i < test_case->outs.size(); ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < test_case->outs.size(); ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
-      const Tensor& out_gpu = test_case->outs[i];
-      Tensor out_cpu(DT_FLOAT, out_gpu.shape());
-      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<float>().data());
-      stream->ThenMemcpy(out_cpu.flat<float>().data(), out_gpu_mem,
+      const Tensor& out_gpu = test_case->outs[rank];
+      Tensor out_cpu(data_type_, out_gpu.shape());
+      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorEqual<float>(test_case->expected, out_cpu);
+      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
     }
   }
+
+  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
+    return [this, test_case](Status s) {
+      mutex_lock l(test_case->mu);
+      ++test_case->num_completed;
+      test_case->final_status.Update(s);
+    };
+  }
+
+  static BaseGPUDevice* GetDevice(size_t rank) {
+    return devices_->at(rank % devices_->size());
+  }
+
+ private:
+  static Allocator* GpuAllocator(BaseGPUDevice* device) {
+    return device->GetStepAllocator(AllocatorAttributes(),
+                                    nullptr /* step_resource_manager */);
+  }
+
+  static perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
+      const Scalar* cuda_memory) {
+    perftools::gputools::DeviceMemoryBase wrapped(
+        const_cast<Scalar*>(cuda_memory));
+    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+    return typed;
+  }
+
+ private:
+  static std::vector<BaseGPUDevice*>* devices_;
+  static const DataType data_type_;
+  static const Scalar max_;
 };
-std::vector<BaseGPUDevice*>* NcclManagerTest::devices = nullptr;
+
+template <typename Scalar>
+std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+template <typename Scalar>
+const DataType NcclManagerTest<Scalar>::data_type_ =
+    DataTypeToEnum<Scalar>::value;
+template <typename Scalar>
+const Scalar NcclManagerTest<Scalar>::max_ =
+    Eigen::NumTraits<Scalar>::highest();
+
+// Instantiate tests for float and half.
+using TypeList = ::testing::Types<float, Eigen::half>;
+TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
-TEST_F(NcclManagerTest, BasicSumReduction) {
+TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   const int num_ranks = 3;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
-    std::unique_ptr<TestCase> test_case(
-        MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0));
-    for (int device_num = 0; device_num < num_ranks; ++device_num) {
-      auto* device = devices->at(device_num % devices->size());
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
           num_ranks, "allreduce", reduction_op, device->executor(),
-          device->gpu_id(), event_mgr, stream, &test_case->ins[device_num],
-          &test_case->outs[device_num], CreateDoneCallback(test_case.get()));
+          device->gpu_id(), event_mgr, stream, &test_case->ins[rank],
+          &test_case->outs[rank], this->CreateDoneCallback(test_case.get()));
     }
 
     LOG(ERROR) << "Verifying results";
-    VerifyResults("test_case", test_case.get());
+    this->VerifyResults("test_case", test_case.get());
   }
 }
 
@@ -213,7 +232,7 @@ TEST_F(NcclManagerTest, BasicSumReduction) {
 // with num_ranks > devices->size(), for some GPUs (e.g. K20m).
 // To test the higher settings, increase num_ranks,
 // num_collectives_per_iteration and time_limit_micros.
-TEST_F(NcclManagerTest, MultipleCallers) {
+TYPED_TEST(NcclManagerTest, MultipleCallers) {
   const int num_ranks = 1;                      // 2;
   const int num_collectives_per_iteration = 1;  // 1000;
   const int num_threads = 3;
@@ -223,49 +242,49 @@ TEST_F(NcclManagerTest, MultipleCallers) {
   srand(Env::Default()->NowMicros());
 
   for (;;) {
-    std::vector<std::pair<int, int>> case_and_device_num;
-    std::vector<std::unique_ptr<TestCase>> test_cases;
+    std::vector<std::pair<int, int>> case_and_rank;
+    std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(
-          MakeTestCase(num_ranks, ncclSum,
-                       TensorShape({100, i % 5 + 1, i % 3 + 1}), i + 0.1 * i));
+      test_cases.emplace_back(this->MakeTestCase(
+          num_ranks, ncclSum, TensorShape({100, i % 5 + 1, i % 3 + 1}),
+          1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
-        case_and_device_num.emplace_back(i, j);
+        case_and_rank.emplace_back(i, j);
       }
     }
 
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       SE_ASSERT_OK(stream->BlockHostUntilDone());
     }
 
-    std::shuffle(case_and_device_num.begin(), case_and_device_num.end(),
+    std::shuffle(case_and_rank.begin(), case_and_rank.end(),
                  std::mt19937(std::random_device()()));
 
-    mutex mu;  // guards case_and_device_num.
+    mutex mu;  // guards case_and_rank.
     std::unique_ptr<thread::ThreadPool> pool(
         new thread::ThreadPool(Env::Default(), "test", num_threads));
-    const int to_schedule = case_and_device_num.size();
+    const int to_schedule = case_and_rank.size();
     for (int i = 0; i < to_schedule; ++i) {
       auto fn = [&]() {
-        int device_num;
+        int rank;
         int test_num;
         {
           mutex_lock l(mu);
-          test_num = case_and_device_num.back().first;
-          device_num = case_and_device_num.back().second;
-          case_and_device_num.pop_back();
+          test_num = case_and_rank.back().first;
+          rank = case_and_rank.back().second;
+          case_and_rank.pop_back();
         }
-        auto* device = devices->at(device_num % devices->size());
+        auto* device = this->GetDevice(rank);
         auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
         auto* stream = device->tensorflow_gpu_device_info()->stream;
-        TestCase* test_case = test_cases[test_num].get();
+        typename TestFixture::TestCase* test_case = test_cases[test_num].get();
         NcclManager::instance()->AddToAllReduce(
             num_ranks, strings::StrCat("allreduce", test_num), ncclSum,
             device->executor(), device->gpu_id(), event_mgr, stream,
-            &test_case->ins[device_num], &test_case->outs[device_num],
-            CreateDoneCallback(test_case));
+            &test_case->ins[rank], &test_case->outs[rank],
+            this->CreateDoneCallback(test_case));
       };
       pool->Schedule(fn);
     }
@@ -274,7 +293,8 @@ TEST_F(NcclManagerTest, MultipleCallers) {
     LOG(ERROR) << "Verifying results for " << num_collectives_per_iteration
                << " collectives";
     for (int i = 0; i < test_cases.size(); ++i) {
-      VerifyResults(strings::StrCat("collective", i), test_cases[i].get());
+      this->VerifyResults(strings::StrCat("collective", i),
+                          test_cases[i].get());
     }
 
     int64 delta = Env::Default()->NowMicros() - start;
diff --git a/tensorflow/contrib/nccl/ops/nccl_ops.cc b/tensorflow/contrib/nccl/ops/nccl_ops.cc
index 8eb804c2e9..a353a34b80 100644
--- a/tensorflow/contrib/nccl/ops/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/ops/nccl_ops.cc
@@ -25,7 +25,7 @@ REGISTER_OP("NcclAllReduce")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -51,7 +51,7 @@ REGISTER_OP("NcclReduce")
     .Input("input: num_devices * T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -69,7 +69,7 @@ reduction: the reduction operation to perform.
 REGISTER_OP("_NcclReduceSend")
     .Input("input: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -92,7 +92,7 @@ REGISTER_OP("_NcclReduceRecv")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -118,7 +118,7 @@ shared_name: Identifier that is shared between ops of the same reduce.
 REGISTER_OP("NcclBroadcast")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("shape: shape")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -135,7 +135,7 @@ shape: The shape of the input tensor.
 
 REGISTER_OP("_NcclBroadcastSend")
     .Input("input: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -157,7 +157,7 @@ shared_name: Identifier that is shared between ops of the same broadcast.
 REGISTER_OP("_NcclBroadcastRecv")
     .Input("shape: int32")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index 98fe394c5b..423a8689ae 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -72,7 +72,7 @@ class NcclTestCase(test.TestCase):
           two.
       device_sets: Tuple of virtual devices to run test on.
     """
-    for dtype in [np.float32, np.int32, np.int64, np.float64]:
+    for dtype in [np.float16, np.float32, np.int32, np.int64, np.float64]:
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
-- 
GitLab


From c2013e7151b480d75b5d51417e06935faa6b53d5 Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Mon, 9 Apr 2018 07:11:32 -0700
Subject: [PATCH 0435/1262] Make grpcio pip dependency conditional on little
 endian

grpcio doesn't build correctly on big-endian machines due to
lack of BoringSSL support.

Fixes: #17882
PiperOrigin-RevId: 192122728
---
 tensorflow/tools/pip_package/setup.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 365e8d6b08..cfad0f70c9 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -35,7 +35,6 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'grpcio >= 1.8.6',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
@@ -43,6 +42,12 @@ REQUIRED_PACKAGES = [
     'termcolor >= 1.1.0',
 ]
 
+if sys.byteorder == 'little':
+  # grpcio does not build correctly on big-endian machines due to lack of
+  # BoringSSL support.
+  # See https://github.com/tensorflow/tensorflow/issues/17882.
+  REQUIRED_PACKAGES.append('grpcio >= 1.8.6')
+
 project_name = 'tensorflow'
 if '--project_name' in sys.argv:
   project_name_idx = sys.argv.index('--project_name')
-- 
GitLab


From c2c4f669be8f342b5a41778c840188469ef62e72 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 07:21:54 -0700
Subject: [PATCH 0436/1262] Suppress -Wself-assign in self-assignment tests,
 which triggers in newer clang revisions.

PiperOrigin-RevId: 192123736
---
 tensorflow/core/lib/gtl/flatmap_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc
index bb65e5357a..0901eba926 100644
--- a/tensorflow/core/lib/gtl/flatmap_test.cc
+++ b/tensorflow/core/lib/gtl/flatmap_test.cc
@@ -321,7 +321,7 @@ TEST(FlatMap, Copy) {
     NumMap copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
-- 
GitLab


From 610ebcf075333d88db9d19503495d935135d9262 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 07:37:51 -0700
Subject: [PATCH 0437/1262] Pass allow_custom_ops to toco from the python api.

PiperOrigin-RevId: 192125160
---
 tensorflow/contrib/lite/toco/python/toco_from_protos_test.py | 1 +
 tensorflow/contrib/lite/toco/python/toco_python_api.cc       | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
index c35b6f9925..3761e0095e 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -50,6 +50,7 @@ class TocoFromProtosTest(googletest.TestCase):
     toco_flags.output_format = toco_flags_pb2.TFLITE
     toco_flags.inference_input_type = types_pb2.FLOAT
     toco_flags.inference_type = types_pb2.FLOAT
+    toco_flags.allow_custom_ops = True;
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
index 8a5e483f3f..153c117d17 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -75,7 +75,8 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
       toco::Import(toco_flags, model_flags, input_contents_txt);
   toco::Transform(toco_flags, model.get());
   string output_file_contents_txt;
-  Export(toco_flags, *model, &output_file_contents_txt);
+  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+         &output_file_contents_txt);
 
   // Convert arguments back to byte (py3) or str (py2)
   return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-- 
GitLab


From 9e3a08b4f9e87d0886e69e4ca7928a1647dda062 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 9 Apr 2018 08:19:52 -0700
Subject: [PATCH 0438/1262] Fixes to source_writer.cc.

- Fix memory leak in source_writer constructor.
- FIx test data having .java extension causing issues with internal
  linters. Changing to .txt extension.
---
 tensorflow/java/BUILD                                    | 2 +-
 tensorflow/java/src/gen/cc/source_writer.cc              | 9 ++++++++-
 tensorflow/java/src/gen/cc/source_writer.h               | 2 +-
 tensorflow/java/src/gen/cc/source_writer_test.cc         | 4 +++-
 .../{test.snippet.java => test.snippet.java.txt}         | 0
 5 files changed, 13 insertions(+), 4 deletions(-)
 rename tensorflow/java/src/gen/resources/{test.snippet.java => test.snippet.java.txt} (100%)

diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 1be4c838f3..4b558af2ac 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -312,7 +312,7 @@ tf_cc_test(
         "src/gen/cc/source_writer_test.cc",
     ],
     data = [
-        "src/gen/resources/test.snippet.java",
+        "src/gen/resources/test.snippet.java.txt",
     ],
     deps = [
         ":java_op_gen_lib",
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index 214999af9a..c57389f6c5 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -23,10 +23,17 @@ namespace tensorflow {
 namespace java {
 
 SourceWriter::SourceWriter() {
-  // push an empty generic namespace at start, for simplification
+  // Push an empty generic namespace at start, for simplification.
   generic_namespaces_.push(new GenericNamespace());
 }
 
+SourceWriter::~SourceWriter() {
+  // Remove empty generic namespace added at start.
+  GenericNamespace* generic_namespace = generic_namespaces_.top();
+  generic_namespaces_.pop();
+  delete generic_namespace;
+}
+
 SourceWriter& SourceWriter::Indent(int tab) {
   left_margin_.resize(
       std::max(static_cast<int>(left_margin_.size() + tab), 0), ' ');
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index 6abe13b5d2..cb0e9270d3 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -32,7 +32,7 @@ namespace java {
 class SourceWriter {
  public:
   SourceWriter();
-  virtual ~SourceWriter() = default;
+  virtual ~SourceWriter();
 
   // Indents following lines with white spaces.
   //
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index 6926a5a411..cbde64683b 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -259,7 +259,9 @@ TEST(StreamTest, Types) {
 
 TEST(StreamTest, FileSnippet) {
   SourceBufferWriter writer;
-  const string& fname = "tensorflow/java/src/gen/resources/test.snippet.java";
+  const string fname = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(),
+      "java/src/gen/resources/test.snippet.java.txt");
 
   writer.WriteFromFile(fname)
         .BeginBlock()
diff --git a/tensorflow/java/src/gen/resources/test.snippet.java b/tensorflow/java/src/gen/resources/test.snippet.java.txt
similarity index 100%
rename from tensorflow/java/src/gen/resources/test.snippet.java
rename to tensorflow/java/src/gen/resources/test.snippet.java.txt
-- 
GitLab


From bbb644e69d38722fd398f18ef8f20b05810d97c6 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 9 Apr 2018 08:24:38 -0700
Subject: [PATCH 0439/1262] Begin switching to use Kokoro build badges on TF
 GitHub README.md.

Moving away from Jenkins builds. Would like to switch to use build badges
for the equivalent builds we have set up on Kokoro.

PiperOrigin-RevId: 192130083
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3cdb6e478d..177265500f 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 
 | **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.cloud.google.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.cloud.google.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.cloud.google.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
-- 
GitLab


From dfbacef32d1fdafe71caad4c99ce8c4b648e6397 Mon Sep 17 00:00:00 2001
From: Ilya Polenov <daioptych@gmail.com>
Date: Mon, 9 Apr 2018 15:45:48 +0000
Subject: [PATCH 0440/1262] Allow ComplexAbs Op on mobile platforms (#18113)

Seems like it was disabled long time ago before open-sourcing Tensorflow.
I think disabling it is no longer necessary.
Works now on Android. Could anyone check on iOS?

Somewhat related issue: #11804
---
 tensorflow/core/kernels/cwise_op_abs.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 1466f24202..1920c54e80 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -18,9 +18,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
           int64);
-#if !defined(IS_MOBILE_PLATFORM)
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
-#endif
 
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
-- 
GitLab


From 90a3db9ff995634314227f0aacf4984d1eee752a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 09:10:31 -0700
Subject: [PATCH 0441/1262] Adding support for the standalone log operator.
 Basic import/export only, No run time support.

PiperOrigin-RevId: 192135843
---
 tensorflow/contrib/lite/toco/export_tensorflow.cc  | 12 ++++++++++++
 .../graph_transformations/propagate_fixed_sizes.cc |  1 +
 .../resolve_constant_unary.cc                      |  4 ++++
 tensorflow/contrib/lite/toco/import_tensorflow.cc  | 14 ++++++++++++++
 tensorflow/contrib/lite/toco/model.h               | 12 ++++++++++++
 tensorflow/contrib/lite/toco/tooling_util.cc       |  1 +
 6 files changed, 44 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 4a77196aab..4a85f3c5a4 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -704,6 +704,15 @@ void ConvertRelu6Operator(const Relu6Operator& src_op,
   (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertLogOperator(const LogOperator& src_op, GraphDef* tensorflow_graph) {
+  auto* op = tensorflow_graph->add_node();
+  op->set_op("Log");
+  op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *op->add_input() = src_op.inputs[0];
+  (*op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertLogisticOperator(const LogisticOperator& src_op,
                              GraphDef* tensorflow_graph) {
   auto* relu_op = tensorflow_graph->add_node();
@@ -1703,6 +1712,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kRelu6) {
     ConvertRelu6Operator(static_cast<const Relu6Operator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLog) {
+    ConvertLogOperator(static_cast<const LogOperator&>(src_op),
+                       tensorflow_graph);
   } else if (src_op.type == OperatorType::kLogistic) {
     ConvertLogisticOperator(static_cast<const LogisticOperator&>(src_op),
                             tensorflow_graph);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 68d6f21cf8..a648b770f8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1479,6 +1479,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
+    case OperatorType::kLog:
     case OperatorType::kLogistic:
     case OperatorType::kTanh:
     case OperatorType::kLocalResponseNormalization:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index d4db6f1c00..f6c8f79d8d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -51,6 +51,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   // Test for unary ops of types that we know how to resolve.
   switch (unary_op->type) {
     case OperatorType::kCast:
+    case OperatorType::kLog:
     case OperatorType::kNeg:
     case OperatorType::kTensorFlowRsqrt:
     case OperatorType::kTensorFlowSqrt:
@@ -218,6 +219,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
     output_float_data[0] = max;
   } else if (unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kLog ||
              unary_op->type == OperatorType::kTensorFlowRsqrt ||
              unary_op->type == OperatorType::kTensorFlowSqrt ||
              unary_op->type == OperatorType::kTensorFlowSquare) {
@@ -231,6 +233,8 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
       float outval = 0.f;
       if (unary_op->type == OperatorType::kNeg) {
         outval = -val;
+      } else if (unary_op->type == OperatorType::kLog) {
+        outval = std::log(val);
       } else if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
         outval = 1.0f / std::sqrt(val);
       } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 876479079b..6b62eeb638 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -611,6 +611,18 @@ void ConvertRelu6Operator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertLogOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Log");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  auto op = absl::make_unique<LogOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertLogisticOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
@@ -2091,6 +2103,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
       ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Log") {
+      ConvertLogOperator(node, tf_import_flags, model);
     } else if (node.op() == "LogSoftmax") {
       ConvertLogSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 9bd72e7de1..56ef9fe2a8 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -56,6 +56,7 @@ enum class OperatorType {
   kL2Pool,
   kLstmCell,
   kLocalResponseNormalization,
+  kLog,
   kLogistic,
   kMaxPool,
   kFakeQuant,
@@ -591,6 +592,17 @@ struct LogisticOperator : Operator {
   LogisticOperator() : Operator(OperatorType::kLogistic) {}
 };
 
+// Element-wise natural log operator:
+//   x -> ln(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Log
+struct LogOperator : Operator {
+  LogOperator() : Operator(OperatorType::kLog) {}
+};
+
 // Element-wise Tanh operator:
 //   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 //
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index b72f5fa2a7..bd2d5f7df0 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -291,6 +291,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Dequantize)
     HANDLE_OPERATORTYPENAME_CASE(L2Normalization)
     HANDLE_OPERATORTYPENAME_CASE(LocalResponseNormalization)
+    HANDLE_OPERATORTYPENAME_CASE(Log)
     HANDLE_OPERATORTYPENAME_CASE(Logistic)
     HANDLE_OPERATORTYPENAME_CASE(LstmCell)
     HANDLE_OPERATORTYPENAME_CASE(MaxPool)
-- 
GitLab


From e5097b40784d8b697b30ee2cf99c0b9c2e743ca0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 09:11:01 -0700
Subject: [PATCH 0442/1262] Initial Python API for specifying
 outside_compilation blocks that call out from a TPU computation.

For now outside_compilation cannot occur inside any compiled control flow (while loop or conditional). If the computation is replicated, the outside_compilation ops are also replicated. Both of these restrictions will be lifted in followup CLs.

PiperOrigin-RevId: 192135901
---
 .../tf2xla/functionalize_control_flow.cc      |   8 +
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 211 +++++++++++++++++-
 tensorflow/contrib/tpu/python/tpu/tpu_test.py |   2 +-
 tensorflow/python/eager/function.py           |  10 +
 tensorflow/python/framework/ops.py            |  24 +-
 tensorflow/python/ops/control_flow_ops.py     |  10 +
 tensorflow/python/ops/gradients_impl.py       |  48 +++-
 7 files changed, 292 insertions(+), 21 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 8b7beef83e..16b9142cbf 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -901,6 +901,14 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       int src_depth = switch_depth[src_id];
       if (!e->IsControlEdge() || new_switch_depth == src_depth) {
         if (src_depth != new_switch_depth) {
+          // TODO(b/77601805) remove this when outside_compilation supports
+          // control flow.
+          if (str_util::StrContains(src->name(), "outside_compilation") ||
+              str_util::StrContains(n->name(), "outside_compilation")) {
+            return errors::InvalidArgument(
+                "outside_compilation is not yet supported within TensorFlow "
+                "control flow constructs b/77601805");
+          }
           return errors::InvalidArgument(
               "Unable to functionalize control flow in graph: Operand ('",
               src->name(), "') and operator ('", n->name(),
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 3f2db548ac..a1690dadff 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -25,6 +25,8 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -56,6 +58,7 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
 def _tpu_system_device_name(job):
@@ -121,8 +124,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name):
+  def __init__(self, name, num_replicas):
     super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
 
@@ -136,6 +147,143 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
         logging.warning("... and %d more" %
                         (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an
+          # outside_compilation cluster C in a forward computation we
+          # would like to put the ops corresponding to the gradient of
+          # X into a new outside_compilation cluster C'. However, if
+          # we take the gradient of X twice, the second one should get
+          # yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is
+          # the cluster that X was in before we took gradients, and a
+          # 'gradient_uid' which is different for every invocation of
+          # gradients, and put the gradient of X in cluster
+          # 'root_cluster.gradient_uid'.
+          #
+          # When the gradient code adds multiple Ops, it asks them to
+          # be colocated either with the original Op X, or with one of
+          # the preceding Ops that was added to the gradient. In other
+          # words, we want to detect the case where we are colocating
+          # with an Op that is in cluster root_cluster.gradient_uid
+          # and put the new Op in that same cluster if the
+          # gradient_uid is the same (the case that we are in the same
+          # invocation of gradients, and just adding new Ops to the
+          # cluster); and in a different cluster if the gradient_uids
+          # are different (the case that we are in a new invocation of
+          # gradients, taking the gradient of a previously-computed
+          # gradient).
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          if len(parts) > 1:
+            uid = parts[-1]
+            if uid == gradient_uid:
+              # Keep using the same cluster
+              cluster = outside_attr
+            else:
+              # We're taking the gradient of a gradient so make a new
+              # cluster attr, adding a new '.uid' on the end to
+              # preserve the invariant that the gradient_uid is the
+              # suffix after the last '.' in the attr.
+              cluster = outside_attr + "." + gradient_uid
+          else:
+            # We're taking the gradient of an Op in the forward pass, so
+            # make a new cluster combining the Op's cluster and the
+            # gradient id.
+            cluster = outside_attr + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        self._device = device.to_string()
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def Exit(self):
+    super(TPUReplicateContext, self).Exit()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
   def AddOp(self, op):
     self._AddOpInternal(op)
 
@@ -157,9 +305,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise ValueError("TPU computations cannot be nested")
     op._set_attr(_TPU_REPLICATE_ATTR,
                  attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
-    # pylint: enable=protected-access
-    op.graph.prevent_feeding(op)
-    op.graph.prevent_fetching(op)
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
 
   def AddValue(self, val):
     result = val
@@ -181,6 +336,45 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
+def outside_compilation(computation, args=None):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    args: Inputs to pass to computation.
+  Returns:
+    The Tensors returned by computation.
+  """
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
@@ -280,7 +474,8 @@ def replicate(computation,
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
-  context = TPUReplicateContext(name=graph.unique_name("cluster"))
+  context = TPUReplicateContext(
+      name=graph.unique_name("cluster"), num_replicas=num_replicas)
   try:
     context.Enter()
 
@@ -361,6 +556,12 @@ def replicate(computation,
   finally:
     context.report_unsupported_operations()
     context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
 
   # Fan-out: Builds a TPUReplicatedOutput node for each output.
   outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index 336d8260c3..c3882b8a27 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -37,7 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context")
+    context = tpu.TPUReplicateContext(b"context", 1)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 61859d6be3..5168ad3b18 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -223,6 +223,16 @@ class HelperContext(object):
     else:
       return val
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def __enter__(self):
     # pylint: disable=protected-access
     self._g = ops.get_default_graph()
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 2574fa57a4..e3ca5a4977 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -4179,6 +4179,19 @@ class Graph(object):
     """
     return self._name_stack
 
+  @tf_contextlib.contextmanager
+  def _colocate_with_for_gradient(self, op, gradient_uid,
+                                  ignore_existing=False):
+    with self.colocate_with(op, ignore_existing):
+      if gradient_uid is not None and self._control_flow_context is not None:
+        try:
+          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
+          yield
+        finally:
+          self._control_flow_context.ExitGradientColocation(op, gradient_uid)
+      else:
+        yield
+
   @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
@@ -4958,8 +4971,7 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
-@tf_export("colocate_with")
-def colocate_with(op, ignore_existing=False):
+def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
   if context.executing_eagerly():
     if op is not None:
       return device(op.device)
@@ -4973,7 +4985,13 @@ def colocate_with(op, ignore_existing=False):
       else:
         raise ValueError("Encountered an Eager-defined Tensor during graph "
                          "construction, but a function was not being built.")
-    return default_graph.colocate_with(op, ignore_existing)
+    return default_graph._colocate_with_for_gradient(
+        op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
+
+
+@tf_export("colocate_with")
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
 
 
 @tf_export("control_dependencies")
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index e56ab93666..7be8628073 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1595,6 +1595,16 @@ class ControlFlowContext(object):
     last_context = self._context_stack.pop()
     graph._set_control_flow_context(last_context)
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 44473ec69c..13420b7f0e 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -208,7 +208,10 @@ def _AsList(x):
   return x if isinstance(x, (list, tuple)) else [x]
 
 
-def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
   """Fill in default values for grad_ys.
 
   Args:
@@ -216,6 +219,9 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
     ys: List of tensors.
     colocate_gradients_with_ops: If True, try colocating gradients with
       the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
 
   Returns:
     A list of gradients to use, without None.
@@ -231,7 +237,7 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
   for i in xrange(len(grad_ys)):
     grad_y = grad_ys[i]
     y = ys[i]
-    with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
       if grad_y is None:
         if y.dtype.is_complex:
           raise TypeError(
@@ -338,10 +344,10 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
 
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, colocate_gradients_with_ops):
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
   """Context to colocate with `op` if `colocate_gradients_with_ops`."""
   if colocate_gradients_with_ops:
-    with ops.colocate_with(op):
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
       yield
   else:
     yield
@@ -506,6 +512,9 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
   with ops.name_scope(
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = [
         x.handle if resource_variable_ops.is_resource_variable(x) else x
@@ -513,7 +522,8 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     ]
     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
         xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
 
     # The approach we take here is as follows: Create a list of all ops in the
     # subgraph between the ys and xs.  Visit these ops in reverse order of ids
@@ -570,10 +580,11 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     while queue:
       # generate gradient subgraph for op.
       op = queue.popleft()
-      with _maybe_colocate_with(op, colocate_gradients_with_ops):
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=True)
 
@@ -633,7 +644,10 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
               if gate_gradients and len([x for x in in_grads
                                          if x is not None]) > 1:
                 with ops.device(None):
-                  with ops.colocate_with(None, ignore_existing=True):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
@@ -789,7 +803,7 @@ def _LogOpGradients(op, out_grads, in_grads):
                ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
 
 
-def _MultiDeviceAddN(tensor_list):
+def _MultiDeviceAddN(tensor_list, gradient_uid):
   """Adds tensors from potentially multiple devices."""
   # Basic function structure comes from control_flow_ops.group().
   # Sort tensors according to their devices.
@@ -808,7 +822,10 @@ def _MultiDeviceAddN(tensor_list):
 
   for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
     tensors = tensors_on_device[dev]
-    with ops.colocate_with(tensors[0].op, ignore_existing=True):
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
       summands.append(math_ops.add_n(tensors))
 
   return math_ops.add_n(summands)
@@ -834,12 +851,19 @@ class AggregationMethod(object):
   EXPERIMENTAL_ACCUMULATE_N = 2
 
 
-def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
   """Get the aggregated gradients for op.
 
   Args:
     grads: The map of memoized gradients.
     op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
     loop_state: An object for maintaining the state of the while loops in the
                 graph. It is of type ControlFlowState. None if the graph
                 contains no while loops.
@@ -916,7 +940,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
             out_grads[i] = running_sum
         else:
           used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad)
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
         logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                      tensor_shape, used)
       else:
-- 
GitLab


From 57b491744fa685cffc27b0dc73647fa2f05c9b68 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 09:58:33 -0700
Subject: [PATCH 0443/1262] Small reorganization of core/BUILD

PiperOrigin-RevId: 192142333
---
 tensorflow/core/BUILD                         | 134 ++++++++++++++----
 .../core/platform/default/build_config/BUILD  |  13 ++
 2 files changed, 117 insertions(+), 30 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1eebeb3995..6f2391c991 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -256,7 +256,7 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
-# Minimal lib to detect plafrom
+# Minimal lib to detect platform
 cc_library(
     name = "lib_platform",
     hdrs = [
@@ -264,6 +264,55 @@ cc_library(
     ],
 )
 
+PLATFORM_BASE_HDRS = [
+    "platform/logging.h",
+    "platform/macros.h",
+    "platform/types.h",
+    "platform/cpu_info.h",
+]
+
+PLATFORM_OTHER_HDRS = [
+    "platform/abi.h",
+    "platform/stacktrace.h",
+    "platform/stacktrace_handler.h",
+    "platform/context.h",
+    "platform/cpu_feature_guard.h",
+    "platform/dynamic_annotations.h",
+    "platform/env.h",
+    "platform/env_time.h",
+    "platform/file_system.h",
+    "platform/file_system_helper.h",
+    "platform/fingerprint.h",
+    "platform/init_main.h",
+    "platform/mem.h",
+    "platform/mutex.h",
+    "platform/net.h",
+    "platform/notification.h",
+    "platform/null_file_system.h",
+    "platform/prefetch.h",
+    "platform/profile_utils/clock_cycle_profiler.h",
+    "platform/profile_utils/cpu_utils.h",
+    "platform/protobuf.h",
+    "platform/strong_hash.h",
+    "platform/subprocess.h",
+    "platform/thread_annotations.h",
+]
+
+# Smaller platform libraries that don't depend on "lib" or "lib_internal".
+cc_library(
+    name = "platform_base",
+    srcs = glob([
+        "platform/*/integral_types.h",
+        "platform/*/logging.h",
+        "platform/*/cpu_info.h",
+    ]),
+    hdrs = PLATFORM_BASE_HDRS,
+    deps = [
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:base",
+    ],
+)
+
 # Minimal lib so that tools used for mobile compilation
 # don't have to depend on lib/platformlib.
 cc_library(
@@ -294,7 +343,8 @@ cc_library(
 # tf_cc_test and tf_cc_binary will include the necessary symbols.
 cc_library(
     name = "lib",
-    hdrs = [
+    hdrs = PLATFORM_BASE_HDRS +
+           PLATFORM_OTHER_HDRS + [
         "lib/bfloat16/bfloat16.h",
         "lib/core/arena.h",
         "lib/core/bitmap.h",
@@ -341,34 +391,6 @@ cc_library(
         "lib/strings/str_util.h",
         "lib/strings/strcat.h",
         "lib/strings/stringprintf.h",
-        "platform/abi.h",
-        "platform/context.h",
-        "platform/cpu_feature_guard.h",
-        "platform/cpu_info.h",
-        "platform/dynamic_annotations.h",
-        "platform/env.h",
-        "platform/env_time.h",
-        "platform/file_system.h",
-        "platform/file_system_helper.h",
-        "platform/fingerprint.h",
-        "platform/init_main.h",
-        "platform/logging.h",
-        "platform/macros.h",
-        "platform/mem.h",
-        "platform/mutex.h",
-        "platform/net.h",
-        "platform/notification.h",
-        "platform/null_file_system.h",
-        "platform/prefetch.h",
-        "platform/profile_utils/clock_cycle_profiler.h",
-        "platform/profile_utils/cpu_utils.h",
-        "platform/protobuf.h",
-        "platform/stacktrace.h",
-        "platform/strong_hash.h",
-        "platform/subprocess.h",
-        "platform/thread_annotations.h",
-        "platform/types.h",
-        "platform/windows/cpu_info.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -415,6 +437,17 @@ cc_library(
     ],
 )
 
+# Libraries that will eventually be moved into lib/core
+# Note that stringpiece_test can't be place here yet, because we are
+# required to use tf_cc_test, and that rule will change / into _
+cc_library(
+    name = "core_stringpiece",
+    srcs = ["lib/core/stringpiece.cc"],
+    hdrs = ["lib/core/stringpiece.h"],
+    copts = tf_copts(),
+    deps = [":platform_base"],
+)
+
 # Test support library needed for all tests
 # This is currently public, but may be made internal in the
 # future.  Try to avoid depending on it.
@@ -442,6 +475,27 @@ cc_library(
     ] + tf_additional_test_deps(),
 )
 
+# Testing libraries - lite versions that don't depend on all of "lib" or
+# "lib_internal". Instead, they only need a much smaller set of support
+# libraries such as ":platform_base" and ":core_stringpiece".
+cc_library(
+    name = "test_lite",
+    testonly = 1,
+    srcs = [
+        "platform/test.cc",
+    ],
+    hdrs = [
+        "platform/test.h",
+        "platform/test_benchmark.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        ":lib_platform",
+        ":platform_base",
+        "//tensorflow/core/platform/default/build_config:gtest",
+    ],
+)
+
 # This build rule (along with :framework_internal, :lib, and :lib_internal)
 # purposefully omits the definitions of many declared symbols, which are
 # included in //tensorflow:libtensorflow_framework.so. Using tf_cc_test and tf_cc_binary
@@ -1650,6 +1704,7 @@ cc_library(
         exclude = [
             "**/*test*",
             "framework/variant.cc",
+            "lib/core/stringpiece.cc",
             "lib/hash/crc32c_accelerate.cc",
             "lib/gif/**/*",
             "lib/jpeg/**/*",
@@ -1663,6 +1718,7 @@ cc_library(
     ) + tf_additional_lib_srcs(
         exclude = [
             "**/*test*",
+            "lib/core/stringpiece.cc",
             "platform/**/cuda.h",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
@@ -1683,6 +1739,7 @@ cc_library(
         ":lib_hash_crc32c_accelerate_internal",
         ":lib_proto_parsing",
         ":abi",
+        ":core_stringpiece",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
@@ -2626,6 +2683,23 @@ cc_library(
     alwayslink = 1,
 )
 
+# This is the lite version of a main() for tests. It does not include any
+# support for reporting benchmark results when running on TPUs.
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    srcs = ["platform/test_main.cc"],
+    copts = tf_copts(),
+    deps = [
+        ":core_stringpiece",
+        ":lib_platform",
+        ":stacktrace_handler",
+        ":test_lite",
+        "//tensorflow/core/platform/default/build_config:test_lite_main",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_tests(
     name = "low_level_library_tests",
     size = "small",
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 447056eb4b..44a89c3a96 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -113,6 +113,12 @@ cc_library(
     copts = tf_copts(),
 )
 
+cc_library(
+    name = "base",
+    srcs = [],
+    copts = tf_copts(),
+)
+
 cc_library(
     name = "platformlib",
     copts = tf_copts(),
@@ -165,6 +171,13 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    linkstatic = 1,
+    deps = [],
+)
+
 cc_library(
     name = "test_main",
     testonly = 1,
-- 
GitLab


From aed12f35e29924e43f191d42fdcc6f9e025a3a3e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 10:13:28 -0700
Subject: [PATCH 0444/1262] Minimize broadcasts by rewriting a sub-tree of
 binary associative ops (Add, Mul).

PiperOrigin-RevId: 192145052
---
 .../optimizers/arithmetic_optimizer.cc        | 561 ++++++++++++------
 .../optimizers/arithmetic_optimizer.h         |   5 +-
 .../optimizers/arithmetic_optimizer_test.cc   | 161 +++++
 .../optimizers/graph_optimizer_stage.h        |  12 +
 4 files changed, 568 insertions(+), 171 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index da8d677737..fa0f7c1c6e 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -279,6 +279,7 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
         ctx_ext_(ctx_ext) {}
   virtual ~ArithmeticOptimizerStage() = default;
 
+ protected:
   // Simplification graph rewrite can create additional nodes that are inputs
   // to final simplified node, they can be also added to the arithmetic
   // optimizer queue for further optimization.
@@ -304,10 +305,176 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
   }
 
  private:
-  // extened context required for ArithmeticOptimizer
+  // Extended context required for ArithmeticOptimizer.
   const ArithmeticOptimizerContext ctx_ext_;
 };
 
+// Subtype of ArithmeticOptimizerStage that does optimization by rewriting a
+// group of nodes from the optimized graph.
+//
+// * AddOpsRewrite:
+//   Rewrite a group of Add/AddN with compact Add/AddN tree
+//
+// * MinimizeBroadcasts:
+//   Rewrite a group of binary associative ops, reordering
+//   inputs, to minimize the cost of broadcast
+class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
+ public:
+  explicit ArithmeticNodesGroupOptimizerStage(
+      const string& name, const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext ctx_ext)
+      : ArithmeticOptimizerStage(name, ctx, ctx_ext), optimized_nodes_{} {}
+  ~ArithmeticNodesGroupOptimizerStage() override = default;
+
+  // Input name with a statically inferred shape from GraphProperties
+  struct InputAndShape {
+    InputAndShape(const string& input, const TensorShapeProto& shape)
+        : input(input), shape(shape) {}
+    string input;
+    TensorShapeProto shape;
+  };
+
+  // Subgraph (subtree) of nodes, that we want to optimize in "one shot" (e.g.
+  // all the Add nodes that we plan to rewrite with a single AddN). Subgraph is
+  // obtained by graph traversal, starting from a root node.
+  struct OptimizedNodesGroup {
+    NodeDef* root_node;
+    TensorShapeProto root_shape;
+    // Optimized nodes that will be updated or removed by rewrite
+    std::vector<NodeDef*> optimized_nodes;
+    // Inputs to optimized nodes
+    std::vector<InputAndShape> inputs;
+  };
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    OptimizedNodesGroup group;
+    TF_RETURN_IF_ERROR(CreateOptimizedNodesGroup(node, &group));
+
+    if (!group.optimized_nodes.empty()) {
+      *simplified_node_name = RewriteOptimizedNodesGroup(group);
+    }
+
+    return Status::OK();
+  }
+
+ protected:
+  // Modify the optimized graph after nodes group was successfully identified
+  virtual string RewriteOptimizedNodesGroup(
+      const OptimizedNodesGroup& group) = 0;
+
+  // Check if input can become a part of current optimized nodes group.
+  virtual bool IsAbsorbableByOptimizedNodesGroup(
+      const OptimizedNodesGroup& group, const string& input) const = 0;
+
+  Status AbsorbInputByOptimizedNodesGroup(const string& input,
+                                          OptimizedNodesGroup* group) const {
+    NodeDef* node;
+    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
+
+    if (IsAbsorbableByOptimizedNodesGroup(*group, input)) {
+      for (int i = 0; i < node->input_size(); ++i) {
+        const string& input_i = node->input(i);
+        if (!IsControlInput(input)) {
+          TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+        }
+      }
+      group->optimized_nodes.push_back(node);
+    } else {
+      // If node can't be absorbed, add it to OptimizedNodesGroup input
+      OpInfo::TensorProperties properties;
+      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
+      group->inputs.emplace_back(input, properties.shape());
+    }
+    return Status::OK();
+  }
+
+  Status CreateOptimizedNodesGroup(NodeDef* root_node,
+                                   OptimizedNodesGroup* group) const {
+    OpInfo::TensorProperties root_node_output_properties;
+    TF_RETURN_IF_ERROR(
+        GetTensorProperties(root_node->name(), &root_node_output_properties));
+
+    group->root_node = root_node;
+    group->root_shape = root_node_output_properties.shape();
+
+    group->optimized_nodes.reserve(root_node->input_size());
+    for (int i = 0; i < root_node->input_size(); ++i) {
+      const string& input_i = root_node->input(i);
+      if (!IsControlInput(input_i)) {
+        TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Check if all inputs can be broadcasted to the same shape
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool HasAllInputsBroadcastableToShape(
+      const NodeDef& node, const OpInfo::TensorProperties& properties) const {
+    auto is_broadcastable = [this, &properties](const string& input) {
+      OpInfo::TensorProperties input_props;
+      Status has_input_properties = GetTensorProperties(input, &input_props);
+      return has_input_properties.ok() &&
+             ShapesBroadcastable(properties, input_props);
+    };
+    return std::all_of(node.input().begin(), node.input().end(),
+                       is_broadcastable);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool IsDrivenByControlDependency(const NodeDef& node) const {
+    return std::any_of(node.input().begin(), node.input().end(),
+                       IsControlInput);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool DrivesControlDependency(const NodeDef& node) const {
+    int position;
+    for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
+      for (int i = 0; i < output->input_size(); ++i) {
+        auto input = output->input(i);
+        string name = ParseNodeName(input, &position);
+        if (name == node.name() && /*control input*/ position < 0) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  string ShapeSignature(const TensorShapeProto& shape) const {
+    string signature = strings::StrCat("rank:", shape.dim_size(), ":dim");
+    for (int i = 0; i < shape.dim_size(); ++i)
+      strings::StrAppend(&signature, ":", shape.dim(i).size());
+    return signature;
+  }
+
+  void AddToOptimizedNodes(const NodeDef* node) {
+    optimized_nodes_.insert(node->name());
+  }
+
+  bool IsOnTheSameDevice(const OptimizedNodesGroup& group,
+                         const NodeDef& node) const {
+    return group.root_node->device() == node.device();
+  }
+
+  bool IsInPreserveSet(const NodeDef& node) const {
+    return ctx_.nodes_to_preserve->find(node.name()) !=
+           ctx_.nodes_to_preserve->end();
+  }
+
+  bool IsAlreadyOptimized(const NodeDef& node) const {
+    return optimized_nodes_.find(node.name()) != optimized_nodes_.end();
+  }
+
+ private:
+  // set of nodes already processed by this optimizer stage
+  std::unordered_set<string> optimized_nodes_;
+};
+
 // Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
 // original inputs of absorbed nodes.
 //
@@ -335,110 +502,33 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 //         x    y      w    Add_3      AddN(x, y, q, e)  z
 //                          / \
 //                         q   e
-class AddOpsRewriteStage : public ArithmeticOptimizerStage {
+class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
  public:
   explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
                               const ArithmeticOptimizerContext& ctx_ext)
-      : ArithmeticOptimizerStage("AddOpsRewrite", ctx, ctx_ext),
-        rewritten_nodes_() {}
-
+      : ArithmeticNodesGroupOptimizerStage("AddOpsRewrite", ctx, ctx_ext) {}
   ~AddOpsRewriteStage() override = default;
 
   // Check if a node can become a root of AddOpsGroup
   bool IsSupported(const NodeDef* node) const override {
-    // check basic preconditions
-    if (!IsRewritable(node)) {
-      return false;
-    }
+    if (!CanOptimize(node)) return false;
 
     // shape must be symbolically defined and all inputs compatible with it
     OpInfo::TensorProperties properties;
     Status has_properties = GetTensorProperties(node->name(), &properties);
     return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
-           HasAllInputsOfBroadcastableShape(*node, properties);
-  }
-
-  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
-    AddOpsGroup group;
-    TF_RETURN_IF_ERROR(CreateAddOpsGroup(node, &group));
-
-    if (!group.absorbed_nodes.empty()) {
-      *simplified_node_name = RewriteAddOpsGroup(group);
-    }
-
-    return Status::OK();
-  }
-
- private:
-  // Input name with a statically inferred shape from GraphProperties
-  struct InputAndShape {
-    InputAndShape(const string& input, const TensorShapeProto& shape)
-        : input(input), shape(shape) {}
-    string input;
-    TensorShapeProto shape;
-  };
-
-  // Holds together an add ops subgraph that we want to rewrite together.
-  //
-  // For the graph above the AddOpsGroup will be:
-  //   root_node: AddN_1
-  //   absorbed_nodes: [Add_1, Add_2]
-  //   input_nodes: [x, y, z, w, q, e]
-  struct AddOpsGroup {
-    const NodeDef* root_node;
-    TensorShapeProto root_shape;
-    // Add/AddN operations below the root level that were absorbed by this group
-    std::vector<NodeDef*> absorbed_nodes;
-    // Inputs of absorbed nodes that will be forwarded to optimized AddN ops
-    std::vector<InputAndShape> inputs;
-  };
-
-  // Check if all inputs can be broadcasted to the same shape
-  bool HasAllInputsOfBroadcastableShape(
-      const NodeDef& node, const OpInfo::TensorProperties& properties) const {
-    const AddOpsRewriteStage* self = this;
-    return std::all_of(
-        node.input().begin(), node.input().end(),
-        [self, &properties](const string& input) {
-          OpInfo::TensorProperties input_properties;
-          Status has_input_properties =
-              self->GetTensorProperties(input, &input_properties);
-          return has_input_properties.ok() &&
-                 ShapesBroadcastable(properties, input_properties);
-        });
-  }
-
-  // TODO(ezhulenev): use GraphRewriter?
-  bool IsDrivenByControlDependency(const NodeDef& node) const {
-    return std::any_of(node.input().begin(), node.input().end(),
-                       IsControlInput);
-  }
-
-  // TODO(ezhulenev): use GraphRewriter?
-  bool DrivesControlDependency(const NodeDef& node) const {
-    int position;
-    for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
-      for (int i = 0; i < output->input_size(); ++i) {
-        auto input = output->input(i);
-        string name = ParseNodeName(input, &position);
-        if (name == node.name() && /*control input*/ position < 0) {
-          return true;
-        }
-      }
-    }
-    return false;
+           HasAllInputsBroadcastableToShape(*node, properties);
   }
 
-  // Check if a node can be absorbed by current AddOpsGroup
-  bool IsAbsorbableByAddOpsGroup(const string& name, const AddOpsGroup& group) {
+ protected:
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const string& input) const override {
     NodeDef* node;
-    Status node_status = GetInputNode(name, &node);
-    if (!node_status.ok()) {
-      return false;
-    }
-    // check basic preconditions
-    if (!IsRewritable(node)) {
+    Status node_status = GetInputNode(input, &node);
+    if (!node_status.ok() || !CanOptimize(node)) return false;
+
+    if (!IsOnTheSameDevice(group, *node)) {
       return false;
     }
     // with a single output data consumer (presumably if we reach this node from
@@ -447,102 +537,42 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (NumNonControlDataOutputs(*node, *ctx_.node_map) != 1) {
       return false;
     }
-    // must be on the same device as a root node
-    if (node->device() != group.root_node->device()) {
-      return false;
-    }
     // All input shapes must be broadcastable to the node shape
     OpInfo::TensorProperties properties;
-    Status has_properties = GetTensorProperties(name, &properties);
+    Status has_properties = GetTensorProperties(input, &properties);
     return has_properties.ok() &&
-           HasAllInputsOfBroadcastableShape(*node, properties);
+           HasAllInputsBroadcastableToShape(*node, properties);
   }
 
   // Node requirements both for a root node and an absorbed node
-  bool IsRewritable(const NodeDef* node) const {
-    // only Add or AddN can be a root node
+  bool CanOptimize(const NodeDef* node) const {
     // TODO(ezhulenev): check if AccumulateNV2 can be supported too
     if (!IsAdd(*node) && !IsAddN(*node)) {
       return false;
     }
-    // it must not be in a preserve set
-    if (ctx_.nodes_to_preserve->find(node->name()) !=
-        ctx_.nodes_to_preserve->end()) {
-      return false;
-    }
-    // it must not be a node created or absorbed by previous iteration
-    if (rewritten_nodes_.find(node->name()) != rewritten_nodes_.end()) {
+    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
       return false;
     }
     // it must not be created by this stage at any of previous optimization runs
     if (str_util::StrContains(node->name(), stage_name_)) {
       return false;
     }
-    // should not drive or be driven by control dependency
     // TODO(ezhulenev): relax this condition for root node
     return !(IsDrivenByControlDependency(*node) ||
              DrivesControlDependency(*node));
   }
 
-  // Create an AddOpsGroup with a root in a given node
-  Status CreateAddOpsGroup(const NodeDef* root_node, AddOpsGroup* group) {
-    OpInfo::TensorProperties root_node_output_properties;
-    TF_RETURN_IF_ERROR(
-        GetTensorProperties(root_node->name(), &root_node_output_properties));
-
-    group->root_node = root_node;
-    group->root_shape = root_node_output_properties.shape();
-
-    group->absorbed_nodes.reserve(root_node->input_size());
-    for (int i = 0; i < root_node->input_size(); ++i) {
-      const string& input_i = root_node->input(i);
-      if (!IsControlInput(input_i)) {
-        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
-      }
-    }
-
-    return Status::OK();
-  }
-
-  Status AbsorbInputByAddOpsGroup(const string& input, AddOpsGroup* group) {
-    NodeDef* node;
-    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
-
-    if (IsAbsorbableByAddOpsGroup(input, *group)) {
-      group->absorbed_nodes.push_back(node);
-      for (int i = 0; i < node->input_size(); ++i) {
-        const string& input_i = node->input(i);
-        if (!IsControlInput(input)) {
-          TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
-        }
-      }
-    } else {
-      // If node can't be absorbed, add it to AddOpsGroup input
-      OpInfo::TensorProperties properties;
-      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
-      group->inputs.emplace_back(input, properties.shape());
-    }
-    return Status::OK();
-  }
-
-  // Rewrite an add ops group into a single AddN if all input shapes are
+  // Rewrite a group of add ops into a single AddN if all input shapes are
   // symbolically equal. If not, create AddN for equal shapes first, and then
   // build an Add tree, minimizing the cost of broadcasts.
-  string RewriteAddOpsGroup(const AddOpsGroup& group) {
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
     // all new nodes will be placed under the scope of a root node
     auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
 
-    auto shape_sig = [](const TensorShapeProto& shape) {
-      string name = strings::StrCat("r:", shape.dim_size(), ":d");
-      for (int i = 0; i < shape.dim_size(); ++i)
-        strings::StrAppend(&name, ":", shape.dim(i).size());
-      return name;
-    };
-
     // Find what shapes are present in the inputs of absorbed nodes
     std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
     for (const auto& input : group.inputs) {
-      shape_sig_to_inputs[shape_sig(input.shape)].push_back(input);
+      shape_sig_to_inputs[ShapeSignature(input.shape)].push_back(input);
     }
 
     // Collect all the shapes from representative elements
@@ -556,8 +586,6 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
       string node_name = OptimizedNodeName(root_scope_and_name);
       AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
                                         group.inputs);
-      // keep track of nodes that were created or absorbed as a part of rewrite
-      rewritten_nodes_.insert(node_name);
       return node_name;
     }
 
@@ -586,7 +614,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     // Prepare leaf AddN nodes for inputs of equal shape
     for (int i = 0; i < shapes.size(); ++i) {
       const auto node_name = leaf_node_name(i);
-      const auto& inputs = shape_sig_to_inputs[shape_sig(shapes[i])];
+      const auto& inputs = shape_sig_to_inputs[ShapeSignature(shapes[i])];
       add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node,
                                                           node_name, inputs));
     }
@@ -637,7 +665,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
       node->add_input(inputAndShape.input);
     }
 
-    rewritten_nodes_.insert(node_name);
+    AddToOptimizedNodes(node);
     return InputAndShape(node_name, shape);
   }
 
@@ -661,13 +689,10 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     node->add_input(left.input);
     node->add_input(right.input);
 
-    rewritten_nodes_.insert(node_name);
+    AddToOptimizedNodes(node);
     return InputAndShape(
         node_name, TensorShapeProto());  // shape is not important at this point
   }
-
-  // keep nodes that were added or absorbed as a part of AddOpsGroup rewrite
-  std::unordered_set<string> rewritten_nodes_;
 };
 
 // Use the commutativity and (left- and right-) distributive property of
@@ -693,7 +718,7 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     std::set<string> common_factors;
     std::vector<string> ctrl_deps;
@@ -839,6 +864,201 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   std::unordered_set<string> rewritten_nodes_;
 };
 
+// Binary associative ops can be re-ordered to minimize the number of broadcasts
+// and the size of a temporary tensors.
+//
+// Example: [a, c] - scalars, [b, d] - matrices
+//   @ - binary associative op (Add or Mul)
+//   @* - broadcast
+//
+//           @                      @*
+//        /     \                /      \
+//      @*       @*      ->     @        @
+//    /   \    /   \          /   \    /   \
+//   a     b  c     d        a     c  b     d
+class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
+ public:
+  explicit MinimizeBroadcasts(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticNodesGroupOptimizerStage("MinimizeBroadcasts", ctx, ctx_ext) {
+  }
+  ~MinimizeBroadcasts() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsBinaryAssociative(*node)) return false;
+
+    // has a symbolically defined shape with broadcastable inputs
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+ protected:
+  bool IsBinaryAssociative(const NodeDef& node) const {
+    return IsMul(node) || IsAdd(node);
+  }
+
+  bool IsSameOp(const OptimizedNodesGroup& group, const NodeDef& node) const {
+    return group.root_node->op() == node.op();
+  }
+
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const string& input) const override {
+    NodeDef* node;
+    Status node_status = GetInputNode(input, &node);
+    if (!node_status.ok()) return false;
+
+    if (!IsSameOp(group, *node)) {
+      return false;
+    }
+    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
+      return false;
+    }
+    if (IsDrivenByControlDependency(*node) || DrivesControlDependency(*node)) {
+      return false;
+    }
+    if (!IsOnTheSameDevice(group, *node)) {
+      return false;
+    }
+    // Optimized nodes updated in place, and that would break the graph, if the
+    // node has multiple output consumers
+    if (NumNonControlOutputs(*node, *ctx_.node_map) != 1) {
+      return false;
+    }
+    // All input shapes must be broadcastable to the node shape
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(input, &properties);
+    return has_properties.ok() &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+  std::size_t CountUniqueShapes(const std::vector<InputAndShape>& inputs) {
+    std::set<string> sigs;
+    for (const auto& ias : inputs) {
+      sigs.insert(ShapeSignature(ias.shape));
+    }
+    return sigs.size();
+  }
+
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    if (CountUniqueShapes(group.inputs) <= 1) {
+      // nothing to optimize when all shapes are the same
+      return group.root_node->name();
+    }
+
+    auto num_nodes = /*root*/ 1 + group.optimized_nodes.size();
+    auto num_inputs = group.inputs.size();
+    CHECK_EQ(num_nodes, num_inputs - 1)
+        << "Can't build a tree with " << num_inputs << " inputs, using "
+        << num_nodes << "binary op nodes.";
+
+    std::deque<InputAndShape> add_ops(group.inputs.begin(), group.inputs.end());
+    std::deque<NodeDef*> optimized_nodes(group.optimized_nodes.begin(),
+                                         group.optimized_nodes.end());
+
+    // sort inputs by it's shape from smallest to largest
+    std::stable_sort(add_ops.begin(), add_ops.end(),
+                     [](const InputAndShape& lhs, const InputAndShape& rhs) {
+                       return CompareSymbolicallyShapedTensorSizes(lhs.shape,
+                                                                   rhs.shape);
+                     });
+
+    // If there is an odd number of inputs, last one is the largest, and we want
+    // to attach it to the root node, to build a well balanced tree.
+    std::deque<InputAndShape> add_ops_leftover;
+    if (add_ops.size() % 2 != 0) {
+      add_ops_leftover.push_back(add_ops.back());
+      add_ops.pop_back();
+    }
+
+    // At this point it's guaranteed that add_ops have even number of inputs.
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+
+      NodeDef* node;
+      if (!optimized_nodes.empty()) {
+        // re-purpose optimized nodes to build a new tree
+        node = optimized_nodes.front();
+        optimized_nodes.pop_front();
+      } else {
+        // or use root node if none optimized nodes left
+        node = group.root_node;
+      }
+      InputAndShape updated_node = UpdateInputs(lhs.input, rhs.input, node);
+
+      // Pushing updated node to the back of a deque will create a wide and
+      // short tree, pushing to the front will create a tall tree. We prefer to
+      // get a wide tree, it minimizes the potential number of temporary tensors
+      // required to keep in memory, though sometimes we can go up to prevent
+      // propagating a brodcast from leaves to the root. Example:
+      //
+      // inputs: [s, s, s, M] (s - scalar, M - matrix)
+      // @* - op with broadcast
+      //
+      //  (only push_back)           @*     (push_front first op)
+      //                            /  \
+      //       @*                  @    M
+      //     /   \                / \
+      //    @     @*      ->     @   s
+      //   / \   / \            / \
+      //  s   s s   M          s   s
+      if (add_ops.size() >= 2 &&
+          CompareSymbolicallyShapedTensorSizes(add_ops.at(0).shape,
+                                               add_ops.at(1).shape)) {
+        add_ops.push_front(updated_node);
+      } else {
+        add_ops.push_back(updated_node);
+      }
+    } while (add_ops.size() > 1);
+    CHECK_EQ(1, add_ops.size());
+
+    // attach the largest tensor to the root op
+    if (!add_ops_leftover.empty()) {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops_leftover.front();
+      InputAndShape updated_node =
+          UpdateInputs(lhs.input, rhs.input, group.root_node);
+      add_ops.push_back(updated_node);
+    }
+
+    return add_ops.front().input;
+  }
+
+  InputAndShape UpdateInputs(const string& input_0, const string& input_1,
+                             NodeDef* node) {
+    string old_input_0 = node->input(0);
+    string old_input_1 = node->input(1);
+
+    // Update inputs only if they changed
+    if (old_input_0 != input_0 || old_input_1 != input_1) {
+      node->set_input(0, input_0);
+      node->set_input(1, input_1);
+      // Invalidate node properties (shape)
+      ctx_.graph_properties->ClearOutputProperties(node->name());
+      ctx_.graph_properties->ClearInputProperties(node->name());
+      // Update the node map
+      ctx_.node_map->RemoveOutput(NodeName(old_input_0), node->name());
+      ctx_.node_map->RemoveOutput(NodeName(old_input_1), node->name());
+      ctx_.node_map->AddOutput(NodeName(input_0), node->name());
+      ctx_.node_map->AddOutput(NodeName(input_1), node->name());
+      // Add updated node to optimization queue
+      AddToOptimizationQueue(node);
+    }
+
+    // Do not add updated node to any other group
+    AddToOptimizedNodes(node);
+
+    TensorShapeProto shape;  // shape is not important at this point
+    return InputAndShape(node->name(), shape);
+  }
+};
+
 // Removes inverse transpose nodes
 class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
  public:
@@ -854,7 +1074,7 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
   // TODO(rmlarsen): Forward control dependencies on the bypassed
   // transpose nodes.
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     NodeDef* input;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
@@ -943,7 +1163,7 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     // Bypass Bitcast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
@@ -981,7 +1201,8 @@ class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
   bool IsSupported(const NodeDef* node) const override { return IsCast(*node); }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
     // Bypass Cast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
       *simplified_node_name = node->input(0);
@@ -1678,6 +1899,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
   if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
     pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
+  if (options_.minimize_broadcasts && can_use_shapes)
+    pipeline.AddStage<MinimizeBroadcasts>(ctx, ctx_ext);
   if (options_.remove_identity_transpose && can_use_shapes)
     pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
   if (options_.remove_redundant_bitcast)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 39b89dedba..c0fe8839ca 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -59,6 +59,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool enable_try_simplify_and_replace = true;
     bool combine_add_to_addn = false;
     bool hoist_common_factor_out_of_aggregation = true;
+    bool minimize_broadcasts = false;
     bool remove_identity_transpose = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
@@ -69,10 +70,10 @@ class ArithmeticOptimizer : public GraphOptimizer {
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
-      // TODO(ezhulenev): enable combine_add_to_addn by default after 1.8
-      // release cut
+      // TODO(ezhulenev): enable by default after 1.8 release cut
       if (opt_level == RewriterConfig::AGGRESSIVE) {
         options.combine_add_to_addn = true;
+        options.minimize_broadcasts = true;
       }
       return options;
     }
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index e117341ba3..9677175d2e 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -93,6 +93,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.hoist_common_factor_out_of_aggregation = false;
+    options.minimize_broadcasts = false;
     options.remove_identity_transpose = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
@@ -113,6 +114,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.hoist_common_factor_out_of_aggregation = true;
   }
 
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
   void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.remove_identity_transpose = true;
@@ -1841,5 +1847,160 @@ TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
   EXPECT_EQ(5, found);
 }
 
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul2);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     *                  *
+  //    / \                / \
+  //   *   c      -->     *   b
+  //  / \                / \
+  // a   b              a   c
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("b", mul2_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("d"), {32}, DT_FLOAT);
+  auto e = ops::Variable(s.WithOpName("e"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul2, d);
+  auto mul4 = ops::Mul(s.WithOpName("mul4"), mul3, e);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul4);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: Graph is "flattened" and
+  // largest shape pushed to the top.
+  //
+  //          *
+  //        /   \
+  //       *     e                *
+  //      /  \                  /   \
+  //     *    d               *      b
+  //    / \                 /  \
+  //   *   c      -->     *      *
+  //  / \                / \    / \
+  // a   b              a   c  d   e
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("d", mul2_node->input(0));
+  EXPECT_EQ("e", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("mul1", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+
+  const NodeDef* mul4_node = node_map.GetNode("mul4");
+  ASSERT_NE(mul4_node, nullptr);
+  EXPECT_EQ("mul3", mul4_node->input(0));
+  EXPECT_EQ("b", mul4_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // [a, b, c] - scalars, [d] - matrix
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("D"), {32, 32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), c, d);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul1, mul2);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul3);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //                              *
+  //                            /  \
+  //       *                   *    D
+  //     /   \                / \
+  //    *     *      ->      *   c
+  //   / \   / \            / \
+  //  a   b c   D          a   b
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("b", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("c", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("D", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 7ed0474861..072f772946 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -134,6 +134,18 @@ class GraphOptimizerStage {
   // and remove template parameter.
   virtual Status TrySimplify(NodeDef* node, Result* result) = 0;
 
+  // Return InvalidArgumentError if node is not supported by the optimizer
+  // stage.
+  // TODO(ezhulenev): make this check part of non-virtual public API
+  // (TrySimplify), and make virtual implementation protected.
+  Status EnsureNodeIsSupported(const NodeDef* node) const {
+    return IsSupported(node)
+               ? Status::OK()
+               : errors::InvalidArgument(
+                     "Node ", node->name(), " is not supported by optimizer ",
+                     optimizer_name_, " and stage ", stage_name_);
+  }
+
   // Get a name for a new node, created by this stage, based on one or multiple
   // nodes of an original graph.
   const string OptimizedNodeName(const NodeScopeAndName& node) const {
-- 
GitLab


From c89ab82a82585cdaa90bf4911980e9e845909e78 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 10:38:58 -0700
Subject: [PATCH 0445/1262] Add validation for output_index of Feed.id. Return
 error instead of crash if output_index is not less than the output number of
 the operation.

PiperOrigin-RevId: 192148911
---
 tensorflow/compiler/tf2xla/tf2xla_test.cc |  5 +++++
 tensorflow/compiler/tf2xla/tf2xla_util.cc | 11 +++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index a9978e697b..b813668a9e 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -90,6 +90,11 @@ TEST(ConvertGraphDefToXla, Sum) {
   TF_EXPECT_OK(result_or.status());
   std::unique_ptr<xla::Literal> result = std::move(result_or.ValueOrDie());
   EXPECT_EQ("(s32[]) (\n42\n)", result->ToString());
+
+  config.mutable_feed(0)->mutable_id()->set_output_index(
+      123); /* invalid output_index */
+  EXPECT_TRUE(errors::IsInvalidArgument(
+      ConvertGraphDefToXla(graph_def, config, client, &computation)));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index f428a19432..2fc77cc4bc 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -151,8 +151,15 @@ Status AddPlaceholdersForFeeds(
       Status status;
       Node* feed_node = g.AddNode(gd.node(0), &status);
       TF_RETURN_IF_ERROR(status);
-      info.data_type =
-          BaseType(feed_node->output_type(info.feed->id().output_index()));
+
+      if (info.feed->id().output_index() < feed_node->num_outputs()) {
+        info.data_type =
+            BaseType(feed_node->output_type(info.feed->id().output_index()));
+      } else {
+        return errors::InvalidArgument(
+            "Invalid output_index ", info.feed->id().output_index(),
+            " for feed node ", info.feed->id().node_name());
+      }
     }
   }
 
-- 
GitLab


From 3660d06a39faaa9cd02f2ba73deb12647f853c1d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 10:42:45 -0700
Subject: [PATCH 0446/1262] Internal Change.

PiperOrigin-RevId: 192149558
---
 tensorflow/compiler/tests/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index e345c1266a..db93d6e76f 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -124,6 +124,7 @@ tf_xla_py_test(
     name = "categorical_op_test",
     size = "small",
     srcs = ["categorical_op_test.py"],
+    tags = ["optonly"],
     deps = [
         ":xla_test",
         "//tensorflow/python:framework_for_generated_wrappers",
-- 
GitLab


From 1ad181b6334ec339ab823cd122e19b7a1ad1a6f7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 10:46:59 -0700
Subject: [PATCH 0447/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 192150230
---
 .../core/common_runtime/direct_session_test.cc       | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index fbe7b7daaf..8ddc9958b2 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -254,7 +254,7 @@ TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
     Status s = session->MakeCallable(callable_options, &handle);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
     EXPECT_TRUE(
-        StringPiece(s.error_message()).contains("would create a cycle"));
+        str_util::StrContains(s.error_message(), "would create a cycle"));
   }
 
   {
@@ -268,7 +268,7 @@ TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
     Session::CallableHandle handle;
     Status s = session->MakeCallable(callable_options, &handle);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message()).contains("unknown node"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown node"));
   }
 
   {
@@ -283,7 +283,7 @@ TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
     Session::CallableHandle handle;
     Status s = session->MakeCallable(callable_options, &handle);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message()).contains("unknown edge"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown edge"));
   }
 
   {
@@ -298,7 +298,7 @@ TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
     Status s = session->MakeCallable(callable_options, &handle);
     EXPECT_TRUE(errors::IsNotFound(s));
     EXPECT_TRUE(
-        StringPiece(s.error_message()).contains("unable to find feed output"));
+        str_util::StrContains(s.error_message(), "unable to find feed output"));
   }
 
   {
@@ -315,7 +315,7 @@ TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
     Session::CallableHandle handle;
     Status s = session->MakeCallable(callable_options, &handle);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
   }
 
   {
@@ -330,7 +330,7 @@ TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
     Session::CallableHandle handle;
     Status s = session->MakeCallable(callable_options, &handle);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
   }
 }
 
-- 
GitLab


From 7576a99c49679dc17ff806acb1a5150f5d16ee58 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Mon, 9 Apr 2018 10:48:10 -0700
Subject: [PATCH 0448/1262] Add `scope` parameter in experimental Quantization
 API.

This enables quantizing subgraphs of the entire graph. It's useful for networks
like Inception since we don't want to quantize the AuxLogits scope.

PiperOrigin-RevId: 192150416
---
 .../contrib/quantize/python/quantize.py       |  70 +++++++++--
 .../contrib/quantize/python/quantize_graph.py |  26 +++--
 .../quantize/python/quantize_graph_test.py    | 110 ++++++++++++++++--
 .../contrib/quantize/python/quantize_test.py  |  30 ++++-
 4 files changed, 208 insertions(+), 28 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index d53d4d7b10..d2d0426d23 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
 
 # Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
@@ -41,9 +42,16 @@ def Quantize(graph,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
-             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES):
+             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
+             scope=None):
   """Updates graph with quantization operations.
 
+  Currently we quantize the following tensors:
+  * Conv/MatMul: Quantize the weights if it matches.
+  * Activation: Quantize the output if it matches.
+  * Bypass/Post-activation Bypass: Quantize both input and output
+    if it matches.
+
   Args:
     graph: Graph to modify.
     is_training: Whether quantizing training graph or eval graph.
@@ -57,13 +65,21 @@ def Quantize(graph,
       training.
     vars_collection: (Optional) Collection where to store the variables for
       quantization interval ends.
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
   Raises:
     ValueError: When quantization fails.
   """
+  if scope and not scope.endswith('/'):
+    scope += '/'
+
   input_to_ops_map = input_to_ops.InputToOps(graph)
   for layer_match in _FindLayersToQuantize(graph):
     # Quantize the weights.
     context = _GetContextFromOp(layer_match.layer_op)
+
+    # If `scope` is given, only quantize it if the consumer of weights
+    # (the layer op) is in the right scope.
     _InsertQuantOp(
         context,
         'weights_quant',
@@ -74,7 +90,8 @@ def Quantize(graph,
         quant_delay=quant_delay,
         narrow_range=True,
         vars_collection=vars_collection,
-        bits=weight_bits)
+        bits=weight_bits,
+        consumer_scope=scope)
 
     # Quantize the activations.
     consumer_ops = input_to_ops_map.ConsumerOperations(
@@ -82,6 +99,9 @@ def Quantize(graph,
     add_context = context
     if layer_match.bypass_op:
       add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
+
+    # If `scope` is given, only quantize it if the producer of weights
+    # (usually it's the layer op) is in the right scope.
     _InsertQuantOp(
         add_context,
         'act_quant',
@@ -93,11 +113,14 @@ def Quantize(graph,
         quant_delay=quant_delay,
         vars_collection=vars_collection,
         bits=activation_bits,
-        init_min=0.0)
+        init_min=0.0,
+        producer_scope=scope)
 
     # Quantize the inputs and output to the bypass (if it exists). The input to
     # the bypass is the bias add, and the output is the activation.
     if layer_match.bypass_op is not None:
+      # If `scope` is given, only quantize it if the both the producer and the
+      # consumer are in the right scope.
       _InsertQuantOp(
           context,
           'conv_quant',
@@ -107,7 +130,9 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
       _InsertQuantOp(
           add_context,
           'add_quant',
@@ -118,12 +143,16 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
 
     # Quantize bypass ops that occur after the activation.
     if layer_match.post_activation_bypass_op is not None:
       post_activation_bypass_context = re.search(
           r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
+      # If `scope` is given, only quantize it if the producer is in the right
+      # scope.
       _InsertQuantOp(
           post_activation_bypass_context,
           'post_activation_bypass_quant',
@@ -135,7 +164,8 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope)
 
 
 def _FindLayersToQuantize(graph):
@@ -382,7 +412,9 @@ def _InsertQuantOp(context,
                    ema_decay=0.999,
                    quant_delay=None,
                    vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
-                   narrow_range=False):
+                   narrow_range=False,
+                   producer_scope=None,
+                   consumer_scope=None):
   """Inserts a quant op between a producer op and (multiple) consumer ops.
 
   Args:
@@ -407,10 +439,34 @@ def _InsertQuantOp(context,
       quantization interval ends.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^bits - 1] or wide range [0; 2^bits - 1].
+    producer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when the producer is in this scope.
+    consumer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when all the consumers are in this scope.
   Raises:
     ValueError: When producer operation is not directly connected to the
       consumer operation.
   """
+  if producer_scope and not producer.name.startswith(producer_scope):
+    logging.info(
+        '_InsertQuantOp ignores context="%s" name="%s" '
+        'because producer "%s" is not in scope "%s"',
+        context, name, producer.name, producer_scope)
+    return
+
+  if consumer_scope:
+    consumers_in_scope = []
+    for consumer in consumers:
+      if consumer.name.startswith(consumer_scope):
+        consumers_in_scope.append(consumer)
+      else:
+        logging.info(
+            '_InsertQuantOp context="%s" name="%s" ignores '
+            'consumer "%s" because it is not in scope "%s"',
+            context, name, consumer.name, consumer_scope)
+        return
+    consumers = consumers_in_scope
+
   name_prefix = _AddContextToName(context, name)
   # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
   # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 0b74b438ac..11d052d7f4 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -28,7 +28,8 @@ def _create_graph(input_graph=None,
                   weight_bits=8,
                   activation_bits=8,
                   quant_delay=None,
-                  freeze_bn_delay=None):
+                  freeze_bn_delay=None,
+                  scope=None):
   """Rewrites an input_graph in place for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
@@ -48,6 +49,8 @@ def _create_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to the number of steps when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -66,7 +69,8 @@ def _create_graph(input_graph=None,
         is_training,
         quant_delay=quant_delay,
         weight_bits=weight_bits,
-        activation_bits=activation_bits)
+        activation_bits=activation_bits,
+        scope=scope)
 
 
 def create_training_graph(input_graph=None, quant_delay=0):
@@ -133,7 +137,8 @@ def experimental_create_training_graph(input_graph=None,
                                        weight_bits=8,
                                        activation_bits=8,
                                        quant_delay=0,
-                                       freeze_bn_delay=None):
+                                       freeze_bn_delay=None,
+                                       scope=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
   Variables added by the rewrite get added to the global variables collection.
@@ -165,6 +170,8 @@ def experimental_create_training_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -177,12 +184,14 @@ def experimental_create_training_graph(input_graph=None,
       weight_bits=weight_bits,
       activation_bits=activation_bits,
       quant_delay=quant_delay,
-      freeze_bn_delay=freeze_bn_delay)
+      freeze_bn_delay=freeze_bn_delay,
+      scope=scope)
 
 
 def experimental_create_eval_graph(input_graph=None,
                                    weight_bits=8,
-                                   activation_bits=8):
+                                   activation_bits=8,
+                                   scope=None):
   """Rewrites an eval input_graph in place for simulated quantization.
 
   Variables added by the rewrite get added to the global variables collection.
@@ -200,8 +209,8 @@ def experimental_create_eval_graph(input_graph=None,
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
-
-
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -211,4 +220,5 @@ def experimental_create_eval_graph(input_graph=None,
       input_graph=input_graph,
       is_training=False,
       weight_bits=weight_bits,
-      activation_bits=activation_bits)
+      activation_bits=activation_bits,
+      scope=scope)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index b9d03c1bc0..caf8ff28d5 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -66,6 +66,20 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     for fn in rewrite_fns:
       test_fn(fn)
 
+  def _RunTestOverExperimentalRewritesWithScope(self, test_fn, scope):
+    def with_absent_scope(fn):
+      def fn_with_absent_scope(*args):
+        fn(*args, scope=scope)
+      return fn_with_absent_scope
+    rewrite_fns = [
+        with_absent_scope(
+            quantize_graph.experimental_create_training_graph),
+        with_absent_scope(
+            quantize_graph.experimental_create_eval_graph),
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
   def testRewrite(self):
     self._RunTestOverAllRewrites(self._TestRewrite)
 
@@ -99,6 +113,34 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       # Ensure that variables were added.
       self.assertTrue(len(orig_variable_names) < len(q_variables))
 
+  def testWithPreActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPreActivationBypass)
+
+  def _TestWithPreActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(pre_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(
+          any('scope1/add_quant/' in name for name in op_names))
+
+  def testWithPostActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPostActivationBypass)
+
+  def _TestWithPostActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(post_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(any(
+          'scope1/post_activation_bypass_quant/' in name for name in op_names))
+
   def testQuantDelay(self):
     self._RunTestOverTrainingRewrites(self._TestQuantDelay)
 
@@ -224,20 +266,66 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       graph_def_after = str(g.as_graph_def())
       self.assertEqual(graph_def_before, graph_def_after)
 
-  def _ConvLayer(self):
+  def testRewriteWithScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithScope, 'scope1')
+
+  def _TestRewriteWithScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      scope1_output = self._ConvLayer(scope='scope1')
+      self._ConvLayer(input_tensor=scope1_output, scope='scope2')
+
+    rewrite_fn(graph)
+
+    op_names = [op.name for op in graph.get_operations()]
+    # The weights and activation of scope1 is quantized, but not scope2.
+    self.assertTrue(
+        any('scope1/Conv/act_quant' in name for name in op_names))
+    self.assertTrue(
+        any('scope1/Conv/weights_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/act_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/weights_quant' in name for name in op_names))
+
+  def testRewriteWithNonMatchingScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithNonMatchingScope, 'NonExistingScope')
+
+  def _TestRewriteWithNonMatchingScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      self._ConvLayer()
+
+    op_names_before_rewrite = set([op.name for op in graph.get_operations()])
+    rewrite_fn(graph)
+    op_names_after_rewrite = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_rewrite, op_names_after_rewrite)
+
+  def _ConvLayer(
+      self, input_tensor=None, scope='test', pre_activation_bypass=False,
+      post_activation_bypass=False):
     """Add a basic convolution layer to the default graph."""
     batch_size, height, width, depth = 5, 128, 128, 3
-    inputs = array_ops.zeros((batch_size, height, width, depth))
+    if input_tensor is None:
+      input_tensor = array_ops.zeros((batch_size, height, width, depth))
     weight_init = init_ops.truncated_normal_initializer
-    conv = layers.conv2d(
-        inputs,
-        32, [5, 5],
-        stride=2,
-        padding='SAME',
-        weights_initializer=weight_init(0.09),
-        activation_fn=None,
-        scope='test')
-    _ = nn_ops.relu6(conv)
+    with ops.name_scope(scope):
+      output = layers.conv2d(
+          input_tensor,
+          depth, [5, 5],
+          padding='SAME',
+          weights_initializer=weight_init(0.09),
+          activation_fn=None)
+      if pre_activation_bypass:
+        output += input_tensor
+      output = nn_ops.relu6(output)
+      if post_activation_bypass:
+        output += input_tensor
+    return output
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 8d057d3710..d37c83d683 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -254,12 +254,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     graph = ops.Graph()
     with graph.as_default():
       with graph.name_scope(None):
-        batch_size, height, width, depth = 5, 128, 128, 3
+        batch_size, height, width, depth = 5, 128, 128, 32
         input1 = array_ops.zeros((batch_size, height, width, depth))
         _ = conv2d(
             input1,
             32, [5, 5],
-            stride=2,
             padding='SAME',
             weights_initializer=self._WeightInit(0.09),
             activation_fn=None,
@@ -268,6 +267,33 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
         # Passes if Quantize() does not crash.
 
+  def testWithNonMatchingNameScope(self):
+    self._RunTestOverParameters(self._testWithNonMatchingNameScope)
+
+  def _testWithNonMatchingNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope('name_scope'):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+    op_names_before_quantize = set([op.name for op in graph.get_operations()])
+    quantize.Quantize(
+        graph, is_training, weight_bits=8, activation_bits=8,
+        scope='NonExisting/')
+    op_names_after_quantize = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_quantize, op_names_after_quantize)
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
-- 
GitLab


From 6594b9f530ee0a82b61a4b0d2b80c3ced1464fb7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 10:56:29 -0700
Subject: [PATCH 0449/1262] Collective Ops Part 2

Kernel/Op defs for reduction and broadcast.

Note that kernels just set up CollectiveParams and don't
define detailed algorithms.

This change is part of a series of changes introducing infrastructure
for collective ops and initial implementations of reduction and broadcast.

PiperOrigin-RevId: 192151715
---
 tensorflow/core/BUILD                         |  11 +-
 .../api_def_CollectiveBcastRecv.pbtxt         |   5 +
 .../api_def_CollectiveBcastSend.pbtxt         |   5 +
 .../base_api/api_def_CollectiveReduce.pbtxt   |   5 +
 tensorflow/core/framework/collective.h        |   3 -
 tensorflow/core/kernels/BUILD                 |  11 +
 tensorflow/core/kernels/collective_ops.cc     | 266 ++++++++++++++++++
 tensorflow/core/ops/collective_ops.cc         |  55 ++++
 8 files changed, 354 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
 create mode 100644 tensorflow/core/kernels/collective_ops.cc
 create mode 100644 tensorflow/core/ops/collective_ops.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 6f2391c991..5a0535fc86 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -687,6 +687,7 @@ tf_gen_op_libs(
         "boosted_trees_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
+        "collective_ops",
         "control_flow_ops",
         "ctc_ops",
         "data_flow_ops",
@@ -803,6 +804,7 @@ cc_library(
         ":boosted_trees_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
+        ":collective_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
         ":cudnn_rnn_ops_op_lib",
@@ -948,6 +950,7 @@ cc_library(
         "//tensorflow/core/kernels:boosted_trees_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
+        "//tensorflow/core/kernels:collective_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
@@ -2249,17 +2252,17 @@ tf_cuda_library(
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/buf_rendezvous.h",
+    "common_runtime/build_graph_options.h",
     "common_runtime/collective_executor_mgr.h",
     "common_runtime/collective_param_resolver_local.h",
     "common_runtime/collective_rma_local.h",
-    "common_runtime/device_resolver_local.h",
-    "common_runtime/buf_rendezvous.h",
-    "common_runtime/build_graph_options.h",
     "common_runtime/constant_folding.h",
     "common_runtime/copy_tensor.h",
     "common_runtime/costmodel_manager.h",
     "common_runtime/debugger_state_interface.h",
     "common_runtime/device_factory.h",
+    "common_runtime/device_resolver_local.h",
     "common_runtime/device_set.h",
     "common_runtime/dma_helper.h",
     "common_runtime/eigen_thread_pool.h",
@@ -2270,6 +2273,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
+    "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
     "common_runtime/renamed_device.h",
@@ -2278,7 +2282,6 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/scoped_allocator.h",
     "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
-    "common_runtime/placer.h",
     "common_runtime/stats_publisher_interface.h",
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
new file mode 100644
index 0000000000..88049bca36
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastRecv"
+  visibility: SKIP
+  summary: "Receives a tensor value broadcast from another device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
new file mode 100644
index 0000000000..7ff70f5b17
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastSend"
+  visibility: SKIP
+  summary: "Broadcasts a tensor value to one or more other devices."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
new file mode 100644
index 0000000000..10d9771d46
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveReduce"
+  visibility: SKIP
+  summary: "Mutually reduces multiple tensors of identical type and shape."
+}
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 362d345133..5810c7fa54 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -103,11 +103,8 @@ struct CollectiveParams {
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
   std::vector<int> subdiv_source_rank;
-  const Tensor* in_tensor;             // kernel input
-  Tensor* out_tensor;                  // kernel output
   std::unique_ptr<OpKernel> merge_op;  // reduction only
   std::unique_ptr<OpKernel> final_op;  // reduction only
-  OpKernelContext* op_context;
   string ToString() const;
 };
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b931f79b72..1018e8d25c 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -131,6 +131,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "collective_ops",
+    prefix = "collective_ops",
+    deps = [
+        "//tensorflow/core:collective_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_kernel_library(
     name = "concat_lib",
     srcs = [
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
new file mode 100644
index 0000000000..5de41bac72
--- /dev/null
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -0,0 +1,266 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace {
+class CollectiveOpKernel : public AsyncOpKernel {
+ public:
+  explicit CollectiveOpKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {}
+
+  // A string encoding instance, frame and iter to be handed off to
+  // the implementation for use in generating RecvBuf keys.
+  string GetCollectiveKey(OpKernelContext* c) {
+    return strings::StrCat(col_params_.instance.instance_key, ":",
+                           c->frame_iter().frame_id, ":",
+                           c->frame_iter().iter_id);
+  }
+
+  // Returns false if calling invocation of ComputeAsync should return
+  // immediately.
+  bool CanProceedWithCompute(OpKernelContext* c, CollectiveExecutor* col_exec,
+                             const DoneCallback& done) {
+    if (col_params_.group.group_size >
+        col_params_.instance.device_names.size()) {
+      // This is the first invocation: Finish initializing col_params_.
+      // Call in a blockable thread because it's not guaranteed that
+      // this call cannot block.
+      c->env()->SchedClosure([this, c, done, col_exec]() {
+        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
+                                      c->cancellation_manager(),
+                                      [this, c, done](const Status& s) {
+                                        if (s.ok()) {
+                                          ComputeAsync(c, done);
+                                        } else {
+                                          c->SetStatus(s);
+                                          done();
+                                        }
+                                      });
+      });
+      return false;
+    }
+    return true;
+  }
+
+  CollectiveParams col_params_;
+};
+
+class CollectiveReduceOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("subdiv_offsets",
+                      &col_params_.instance.impl_details.subdiv_offsets));
+    string merge_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("merge_op", &merge_op_name));
+    OP_REQUIRES(c, merge_op_name == "Add" || merge_op_name == "Mul",
+                errors::InvalidArgument(
+                    "merge_op must be one of {\"Add\", \"Mul\"} but got ",
+                    merge_op_name));
+    string final_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("final_op", &final_op_name));
+    OP_REQUIRES(c, final_op_name == "Id" || final_op_name == "Div",
+                errors::InvalidArgument(
+                    "final_op must be one of {\"Id\", \"Div\"} but got ",
+                    final_op_name));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
+                                       merge_op_name, ",", final_op_name, ")");
+    col_params_.group.device_type = c->device_type();
+
+    // Find the OpKernels by name, type and device type.
+    NodeDef sub_node;
+    // The merge_op takes two inputs
+    sub_node.add_input(real_node.input(0));
+    sub_node.add_input(real_node.input(0));
+    sub_node.set_device(real_node.device());
+    SetAttrValue(col_params_.instance.data_type,
+                 &(*sub_node.mutable_attr())["T"]);
+    col_params_.merge_op = BuildOpKernel(c, merge_op_name, &sub_node);
+    col_params_.final_op = BuildOpKernel(c, final_op_name, &sub_node);
+  }
+
+  std::unique_ptr<OpKernel> BuildOpKernel(OpKernelConstruction* c,
+                                          const string& name,
+                                          NodeDef* sub_node) {
+    std::unique_ptr<OpKernel> k;
+    if (name.empty() || name == "Id") return k;
+    sub_node->set_name(name);
+    sub_node->set_op(name);
+    Status status;
+    k = CreateOpKernel(c->device_type(), c->device(),
+                       c->device()->GetAllocator(AllocatorAttributes()),
+                       *sub_node, c->graph_def_version(), &status);
+    if (!status.ok()) {
+      c->CtxFailureWithWarning(errors::Internal("Failed to build OpKernel for ",
+                                                name, " : ",
+                                                status.error_message()));
+    }
+    return k;
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // Allocate the output tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->forward_input_or_allocate_output(
+                             {0}, 0, c->input(0).shape(), &output),
+                         done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveReduceOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_CPU),
+                        CollectiveReduceOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_GPU),
+                        CollectiveReduceOpKernel);
+
+class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastSendOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = true;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    OP_REQUIRES_ASYNC(
+        c, shape_.IsSameSize(c->input(0).shape()),
+        errors::Internal("Declared shape of op ", col_params_.name,
+                         " does not match shape of input"),
+        done);
+    // Allocate the output Tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(
+        c, c->forward_input_or_allocate_output({0}, 0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastSendOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_CPU),
+                        CollectiveBcastSendOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_GPU),
+                        CollectiveBcastSendOpKernel);
+
+class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastRecvOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = false;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // No input, so must allocate output.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastRecvOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_CPU),
+                        CollectiveBcastRecvOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_GPU),
+                        CollectiveBcastRecvOpKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
new file mode 100644
index 0000000000..d6157a69df
--- /dev/null
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("CollectiveReduce")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("merge_op: {'Min', 'Max', 'Mul', 'Add'}")
+    .Attr("final_op: {'Id', 'Div'}")
+    .Attr("subdiv_offsets: list(int)")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("CollectiveBcastSend")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+REGISTER_OP("CollectiveBcastRecv")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+}  // namespace tensorflow
-- 
GitLab


From 9d1bf2bd4723fd3d0a012891bc54cc9db54bd9cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 10:59:46 -0700
Subject: [PATCH 0450/1262] Rewrite a fast GEMV path for two goals: 1. Avoid
 cache aliasing issues on CPUs with 4-way set associative L1 cache.    That
 includes Cortex-A53. 2. Be a good basis to port to assembly.

PiperOrigin-RevId: 192152277
---
 .../internal/optimized/optimized_ops.h        | 293 ++++++++++++++----
 1 file changed, 233 insertions(+), 60 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 9a274612ad..5acf1eaede 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -554,88 +554,261 @@ inline void GEMVForLstmCellWithSymmetricRange(
   // exercises it). We just guard our assumptions about size evenness with
   // the following assertions.
   TFLITE_DCHECK(!(output_size % 4));
-  TFLITE_DCHECK(!(input_size % 8));
+  TFLITE_DCHECK(!(input_size % 64));
   const int32* bias_ptr = bias_data;
   int16* output_ptr = output_data;
   const uint8x16_t signbit = vdupq_n_u8(0x80);
   for (int in = 0; in < input_size; in += 32) {
     optimized_ops_preload_l1_keep(input_data + in);
   }
+  const int left_shift = accum_shift > 0 ? accum_shift : 0;
+  const int right_shift = accum_shift > 0 ? 0 : -accum_shift;
   for (int out = 0; out < output_size; out += 4) {
-    const uint8* weights_ptr_0 = weights_data + out * input_size;
-    const uint8* weights_ptr_1 = weights_ptr_0 + 1 * input_size;
-    const uint8* weights_ptr_2 = weights_ptr_0 + 2 * input_size;
-    const uint8* weights_ptr_3 = weights_ptr_0 + 3 * input_size;
+    // Load the bias values
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
 
-    int32x4_t acc_0 = vdupq_n_s32(0);
-    int32x4_t acc_1 = vdupq_n_s32(0);
-    int32x4_t acc_2 = vdupq_n_s32(0);
-    int32x4_t acc_3 = vdupq_n_s32(0);
-    int in = 0;
-    const int kReadAhead = 256;
-    // Handle 16 levels of depth at a time.
-    for (; in < input_size; in += 16) {
-      int8x16_t weights_val_0 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_0)));
-      int8x16_t weights_val_1 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_1)));
-      int8x16_t weights_val_2 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_2)));
-      int8x16_t weights_val_3 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_3)));
-      int8x16_t input_val =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + in)));
-      int16x8_t acc16_0 =
-          vmull_s8(vget_low_s8(weights_val_0), vget_low_s8(input_val));
-      int16x8_t acc16_1 =
-          vmull_s8(vget_low_s8(weights_val_1), vget_low_s8(input_val));
-      int16x8_t acc16_2 =
-          vmull_s8(vget_low_s8(weights_val_2), vget_low_s8(input_val));
-      int16x8_t acc16_3 =
-          vmull_s8(vget_low_s8(weights_val_3), vget_low_s8(input_val));
-      acc16_0 = vmlal_s8(acc16_0, vget_high_s8(weights_val_0),
-                         vget_high_s8(input_val));
-      acc16_1 = vmlal_s8(acc16_1, vget_high_s8(weights_val_1),
-                         vget_high_s8(input_val));
-      acc16_2 = vmlal_s8(acc16_2, vget_high_s8(weights_val_2),
-                         vget_high_s8(input_val));
-      acc16_3 = vmlal_s8(acc16_3, vget_high_s8(weights_val_3),
-                         vget_high_s8(input_val));
-      acc_0 = vpadalq_s16(acc_0, acc16_0);
-      acc_1 = vpadalq_s16(acc_1, acc16_1);
-      acc_2 = vpadalq_s16(acc_2, acc16_2);
-      acc_3 = vpadalq_s16(acc_3, acc16_3);
-      weights_ptr_0 += 16;
-      weights_ptr_1 += 16;
-      weights_ptr_2 += 16;
-      weights_ptr_3 += 16;
-      optimized_ops_preload_l1_stream(weights_ptr_0 + kReadAhead);
-      optimized_ops_preload_l1_stream(weights_ptr_1 + kReadAhead);
-      optimized_ops_preload_l1_stream(weights_ptr_2 + kReadAhead);
-      optimized_ops_preload_l1_stream(weights_ptr_3 + kReadAhead);
+    // Clear accumulators. We use 2 accumulator registers per row,
+    // for 4 rows. row_accumRN is the N-th accumulator for row R.
+    int32x4_t row_accum00 = vdupq_n_s32(0);
+    int32x4_t row_accum01 = vdupq_n_s32(0);
+    int32x4_t row_accum10 = vdupq_n_s32(0);
+    int32x4_t row_accum11 = vdupq_n_s32(0);
+    int32x4_t row_accum20 = vdupq_n_s32(0);
+    int32x4_t row_accum21 = vdupq_n_s32(0);
+    int32x4_t row_accum30 = vdupq_n_s32(0);
+    int32x4_t row_accum31 = vdupq_n_s32(0);
+
+    // kReadAhead parametrizes how far ahead we prefetch weights into L1 cache.
+    const int kReadAhead = 512;
+    // Prefetch the first weights values.
+    for (int k = 0; k < kReadAhead; k += 64) {
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      k);
+    }
+    // Loop along the rows, handling 64 bytes per iteration because that's
+    // cache line size on most current ARM-architecture CPUs.
+    for (int in = 0; in < input_size; in += 64) {
+      // Prefetch some future weights values.
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      in + kReadAhead);
+
+      // We will use 2 local 16-bit accumulators per row, for 2 rows.
+      // See below (*) for the rationale of processing only 2 rows at a time.
+      // local_accumRN is the N-th local accumulator for row R.
+      int16x8_t local_accum00;
+      int16x8_t local_accum01;
+      int16x8_t local_accum10;
+      int16x8_t local_accum11;
+
+      // Load 64 bytes of input activations values. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t input0 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 0)));
+      int8x16_t input1 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 1)));
+      int8x16_t input2 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 2)));
+      int8x16_t input3 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 3)));
+
+      // Beginning of the core accumulation. Notice how while we have 4
+      // rows to process, this code is taking care of only 2 rows at a time,
+      // thus being divided into two parts looking similar ("Rows 0 and 1" and
+      // "Rows 2 and 3").
+      //
+      // (*) The rationale for handling only 2 rows at a time is to avoid
+      // cache aliasing issues on 4-way set-associative L1-cache CPUs, such
+      // as Cortex-A53. With sufficiently large, power-of-two matrix dimensions,
+      // we may find ourselves in a situation where rows alias each other in
+      // the L1 cache, and moreover may also mutually alias with the input
+      // activations. If we try to load 4 rows at a time, together with the
+      // input activations, that may be 5 mutually-aliasing vectors, resulting
+      // in constant mutual eviction from L1 cache. Handling 2 rows at a time
+      // here largely mitigates these issues, and seems at least to be very
+      // effective on Cortex-A53:
+      //                          Before       After
+      // big (Cortex-A73)         2.85 ms      2.85 ms
+      // little (Cortex-A53)      11.0 ms      5.16 ms
+
+      // Rows 0 and 1:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 0)));
+      int8x16_t weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 1)));
+      int8x16_t weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 2)));
+      int8x16_t weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 3)));
+      int8x16_t weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 0)));
+      int8x16_t weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 1)));
+      int8x16_t weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 2)));
+      int8x16_t weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+
+      // Rows 2 and 3:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 0)));
+      weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 1)));
+      weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 2)));
+      weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 3)));
+      weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 0)));
+      weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 1)));
+      weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 2)));
+      weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
     }
+
+    row_accum00 = vaddq_s32(row_accum00, row_accum01);
+    row_accum10 = vaddq_s32(row_accum10, row_accum11);
+    row_accum20 = vaddq_s32(row_accum20, row_accum21);
+    row_accum30 = vaddq_s32(row_accum30, row_accum31);
     // Horizontally reduce accumulators
     int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
         pairwise_reduced_acc_2, pairwise_reduced_acc_3;
     pairwise_reduced_acc_0 =
-        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
+        vpadd_s32(vget_low_s32(row_accum00), vget_high_s32(row_accum00));
     pairwise_reduced_acc_1 =
-        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
+        vpadd_s32(vget_low_s32(row_accum10), vget_high_s32(row_accum10));
     pairwise_reduced_acc_2 =
-        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
+        vpadd_s32(vget_low_s32(row_accum20), vget_high_s32(row_accum20));
     pairwise_reduced_acc_3 =
-        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
+        vpadd_s32(vget_low_s32(row_accum30), vget_high_s32(row_accum30));
     const int32x2_t reduced_lo =
         vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
     const int32x2_t reduced_hi =
         vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
     int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
     // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
     reduced = vaddq_s32(reduced, bias_vec);
-    int left_shift = accum_shift > 0 ? accum_shift : 0;
-    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
     reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
     // Multiply by the fixed-point multiplier.
     reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
@@ -962,7 +1135,7 @@ inline void FullyConnected(
 #ifdef GEMMLOWP_NEON
   if (batches == 1 && input_offset == -128 && output_activation_min == -32768 &&
       output_activation_max == 32767) {
-    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 16)) {
+    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) {
       GEMVForLstmCellWithSymmetricRange(input_data, input_dims, filter_data,
                                         filter_dims, bias_data_int32, bias_dims,
                                         output_multiplier, -output_shift,
-- 
GitLab


From 2138a691abfa726b0b6ef28d7f3482e94ada38aa Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Mon, 9 Apr 2018 11:08:08 -0700
Subject: [PATCH 0451/1262] Adds complex64/128 Fill kernel registrations for
 GPU.

PiperOrigin-RevId: 192153935
---
 tensorflow/core/kernels/constant_op.cc             | 4 +++-
 tensorflow/core/kernels/fill_functor.cu.cc         | 2 +-
 tensorflow/python/kernel_tests/constant_op_test.py | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 312c1a41d3..fe1a1ba5a3 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -258,13 +258,15 @@ REGISTER_KERNEL(GPU, Eigen::half);
 REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
 REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, complex64);
+REGISTER_KERNEL(GPU, complex128);
 REGISTER_KERNEL(GPU, uint8);
 REGISTER_KERNEL(GPU, int8);
 REGISTER_KERNEL(GPU, uint16);
 REGISTER_KERNEL(GPU, int16);
 REGISTER_KERNEL(GPU, int64);
 REGISTER_KERNEL(GPU, bool);
-// Currently we do not support filling strings and complex64 on GPU
+// Currently we do not support filling strings on GPU
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 3487606778..050c95cf40 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -76,7 +76,7 @@ struct FillFunctor<GPUDevice, T> {
 };
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
-TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
+TF_CALL_NUMBER_TYPES(DEFINE_FILL_GPU);
 TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 18796f7095..749313b00d 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -653,12 +653,12 @@ class FillTest(test.TestCase):
     self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex64(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex64)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex64)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex128(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex128)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex128)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillString(self):
     np_ans = np.array([[b"yolo"] * 3] * 2)
-- 
GitLab


From 1b97ea722cea69a8d9a6fe3bd515e22f356d40ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 11:11:10 -0700
Subject: [PATCH 0452/1262] Adds support for tf.HParams to TPUEstimator.

PiperOrigin-RevId: 192154504
---
 tensorflow/contrib/tpu/BUILD                       |  1 +
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 4de09dd988..2f4a76720d 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -47,6 +47,7 @@ py_library(
         ":tpu_lib",
         ":tpu_py",
         "//tensorflow/contrib/summary:summary_ops",
+        "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 47365b78a2..1332108d04 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -38,6 +38,7 @@ from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.contrib.training.python.training import hparam
 from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
@@ -1308,7 +1309,10 @@ class _ModelFnWrapper(object):
       batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
 
     if batch_size_for_model_fn is not None:
-      params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
+      if isinstance(params, hparam.HParams):
+        params.add_hparam(_BATCH_SIZE_KEY, batch_size_for_model_fn)
+      else:
+        params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
     estimator_spec = self._model_fn(features=features, **kwargs)
     if (self._ctx.is_running_on_cpu(is_export_mode) and
@@ -1947,7 +1951,10 @@ class TPUEstimator(estimator_lib.Estimator):
       # input_fn for use_tpu=True/False.
       batch_size_for_input_fn = ctx.batch_size_for_input_fn
       if batch_size_for_input_fn is not None:
-        kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
+        if isinstance(kwargs['params'], hparam.HParams):
+          kwargs['params'].add_hparam(_BATCH_SIZE_KEY, batch_size_for_input_fn)
+        else:
+          kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
 
       # For export_savedmodel, input_fn is never passed to Estimator. So,
       # `is_export_mode` must be False.
-- 
GitLab


From 20e1f3a852c83d9369c7d56a943fe6b8f9b88644 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 11:14:25 -0700
Subject: [PATCH 0453/1262] Register tf.concat with uint8 data type.

PiperOrigin-RevId: 192154998
---
 tensorflow/core/kernels/concat_lib_gpu.cc         | 1 +
 tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | 4 ++++
 tensorflow/core/kernels/concat_op.cc              | 1 +
 tensorflow/core/kernels/concat_op_test.cc         | 4 ++++
 4 files changed, 10 insertions(+)

diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index d8643c0b2f..93e392d303 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -118,6 +118,7 @@ TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
+TF_CALL_uint8(REGISTER);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 0f7adaf24a..a561d918bd 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -202,6 +202,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
 
@@ -209,6 +210,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
 
@@ -216,6 +218,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
 
@@ -223,6 +226,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
 
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index f16766315f..a87b63f913 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -212,6 +212,7 @@ REGISTER_CONCAT(qint32);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
+TF_CALL_uint8(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index e3ba8ae9f6..39b44b2fcc 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -78,6 +78,9 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
 BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 
+static void BM_ConcatDim1uint8(int iters, int dim2) {
+  ConcatHelper<uint8>(iters, 1, dim2);
+}
 static void BM_ConcatDim1int16(int iters, int dim2) {
   ConcatHelper<int16>(iters, 1, dim2);
 }
@@ -85,6 +88,7 @@ static void BM_ConcatDim1bfloat16(int iters, int dim2) {
   ConcatHelper<bfloat16>(iters, 1, dim2);
 }
 
+BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
 
-- 
GitLab


From 800f1db6f082ec25b0f82f847fd6b41ebc33e929 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 11:16:17 -0700
Subject: [PATCH 0454/1262] Documentation fix.

PiperOrigin-RevId: 192155305
---
 tensorflow/python/data/ops/dataset_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 8729e085a3..c28de3d054 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -121,7 +121,7 @@ class Dataset(object):
       An `Iterator` over the elements of this dataset.
 
     Raises:
-      RuntimeError: If eager execution is enabled.
+      RuntimeError: If eager execution is not enabled.
     """
     if context.executing_eagerly():
       return iterator_ops.EagerIterator(self)
-- 
GitLab


From 568f05c7937c327941a0f301be08cb842c88a851 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 11:19:41 -0700
Subject: [PATCH 0455/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192155883
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 141 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 141 ++++++++++++++++++
 2 files changed, 282 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 81546d52f2..026bfa89cf 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -11645,6 +11645,147 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0cf66d2bd6..b61a3b0e64 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4729,6 +4729,147 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
-- 
GitLab


From e4d1a92e7cf9cdd44b84dc1dba1411301f6f04ab Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 9 Apr 2018 11:45:13 -0700
Subject: [PATCH 0456/1262] Allow creating tensors from numpy arrays, and other
 various constants.

Allow type-inference from a different input tensor, similar to args_to_matching_eager.

- Update TFE_Py_TensorShapeSlice to take tuples.
- Update int values to allow int/long in py2

PiperOrigin-RevId: 192160407
---
 tensorflow/python/eager/pywrap_tensor.cc  | 123 ++++++------
 tensorflow/python/eager/pywrap_tensor.h   |   4 +
 tensorflow/python/eager/pywrap_tfe.h      |  12 +-
 tensorflow/python/eager/pywrap_tfe_src.cc | 220 +++++++++++++++++++---
 tensorflow/python/eager/tensor_test.py    |   7 +-
 tensorflow/python/framework/ops.py        |  16 ++
 6 files changed, 286 insertions(+), 96 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 519814b979..bc509be312 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -161,6 +161,64 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 }  // namespace
 
+namespace tensorflow {
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
+  int desired_dtype = -1;
+  if (dtype != Py_None) {
+    if (!PyIntToDataType(dtype, &desired_dtype)) {
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expecting a DataType value for dtype. Got ",
+                          Py_TYPE(dtype)->tp_name)
+                          .c_str());
+      return nullptr;
+    }
+  }
+  if (PyArray_Check(value)) {
+    int desired_np_dtype = -1;
+    if (desired_dtype >= 0) {
+      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
+               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
+               .ok()) {
+        PyErr_SetString(PyExc_TypeError,
+                        tensorflow::strings::StrCat(
+                            "Invalid dtype argument value ", desired_dtype)
+                            .c_str());
+        return nullptr;
+      }
+    }
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+    int current_np_dtype = PyArray_TYPE(array);
+    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
+    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
+        !PyArray_ISCARRAY(array)) {
+      int new_dtype =
+          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
+      safe_value = tensorflow::make_safe(
+          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
+                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
+      if (PyErr_Occurred()) return nullptr;
+      if (safe_value == nullptr) {
+        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
+        return nullptr;
+      }
+      value = safe_value.get();
+    }
+    return NumpyToTensorHandle(value);
+  } else {
+    tensorflow::Tensor t;
+    // TODO(josh11b): Have PySeqToTensor set python errors instead of
+    // returning Status.
+    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
+    if (!cppstatus.ok()) {
+      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
+      return nullptr;
+    }
+    return TFE_NewTensorHandle(t);
+  }
+}
+}  // namespace tensorflow
+
 extern "C" {
 
 static const int kMaxEagerTensorParentSize = 64;
@@ -230,56 +288,11 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       return -1;
     }
   }
-  tensorflow::Safe_TFE_TensorHandlePtr handle =
-      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(nullptr));
   PyErr_Clear();
-  if (PyArray_Check(value)) {
-    int desired_np_dtype = -1;
-    if (desired_dtype >= 0) {
-      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
-               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
-               .ok()) {
-        PyErr_SetString(PyExc_TypeError,
-                        tensorflow::strings::StrCat(
-                            "Invalid dtype argument value ", desired_dtype)
-                            .c_str());
-        return -1;
-      }
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
-    int current_np_dtype = PyArray_TYPE(array);
-    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
-    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
-        !PyArray_ISCARRAY(array)) {
-      int new_dtype =
-          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
-      safe_value = tensorflow::make_safe(
-          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
-                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
-      if (PyErr_Occurred()) return -1;
-      if (safe_value == nullptr) {
-        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
-        return -1;
-      }
-      value = safe_value.get();
-    }
-    handle = tensorflow::make_safe(NumpyToTensorHandle(value));
-  } else {
-    tensorflow::Tensor t;
-    // TODO(josh11b): Have PySeqToTensor set python errors instead of
-    // returning Status.
-    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
-    if (!cppstatus.ok()) {
-      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
-      return -1;
-    }
-    handle = tensorflow::make_safe(TFE_NewTensorHandle(t));
-  }
-  if (PyErr_Occurred()) return -1;
-  if (handle == nullptr) {
-    PyErr_SetString(PyExc_ValueError, "Error while creating an EagerTensor");
-    return -1;
-  }
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(value, dtype)));
+  if (handle == nullptr) return -1;
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
     handle = tensorflow::make_safe(
@@ -701,12 +714,12 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
-  if (!PyList_Check(tensor_list)) {
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
+  if (!PyList_Check(tensors) && !PyTuple_Check(tensors)) {
     PyErr_SetString(PyExc_TypeError,
                     tensorflow::strings::StrCat(
-                        "tensor_list argument must be a list. Got \"",
-                        Py_TYPE(tensor_list)->tp_name, "\"")
+                        "tensors argument must be a list or a tuple. Got \"",
+                        Py_TYPE(tensors)->tp_name, "\"")
                         .c_str());
     return nullptr;
   }
@@ -720,14 +733,14 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
     return nullptr;
   }
 
-  Py_ssize_t num_tensors = PyList_Size(tensor_list);
+  Py_ssize_t num_tensors = PySequence_Fast_GET_SIZE(tensors);
   int64_t num_tensors_int = static_cast<int64_t>(num_tensors);
   auto tensor = tensorflow::make_safe(TF_AllocateTensor(
       TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int));
   int32_t* data = reinterpret_cast<int32_t*>(TF_TensorData(tensor.get()));
   auto status = tensorflow::make_safe(TF_NewStatus());
   for (Py_ssize_t i = 0; i < num_tensors; ++i) {
-    PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i);
+    PyObject* tensor_obj = PySequence_Fast_GET_ITEM(tensors, i);
     if (!EagerTensor_CheckExact(tensor_obj)) {
       PyErr_SetString(PyExc_TypeError,
                       tensorflow::strings::StrCat(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index aa1efdd1b8..5b330432bd 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -22,4 +22,8 @@ limitations under the License.
 bool EagerTensor_CheckExact(const PyObject* o);
 tensorflow::int64 EagerTensor_id(const PyObject* tensor);
 
+namespace tensorflow {
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
+}
+
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 32d731d0f6..691b613e48 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -186,16 +186,16 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
 // Returns the set of variables watched by the given tape.
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
 
-// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
-// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
+// Returns an EagerTensor of dimension [len(`tensors`)] containing
+// the `slice_dim`'th dimension of each tensor in `tensors`. In other words,
 // TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in
-// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes
+// `tensors`. For example, if `tensors` contains tensors of with shapes
 // [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with
 // `slice_dim` equal to 1 will return [2, 5, 7].
 // On error, returns nullptr and sets python exception.
-// REQUIRES: `tensor_list` is a python list of EagerTensors
+// REQUIRES: `tensors` is a python list/tuple of EagerTensors
 // REQUIRES: `slice_dim` is non-negative and smaller than the rank of all
-//   tensors in `tensor_list`.
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim);
+//   tensors in `tensors`.
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index d99bd0b0ff..ab7f251515 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -38,6 +38,13 @@ using tensorflow::strings::Printf;
 
 namespace {
 
+struct InputInfo {
+  InputInfo(PyObject* item, bool is_list) : item(item), is_list(is_list) {}
+  PyObject* item = nullptr;
+
+  bool is_list = false;
+};
+
 struct FastPathOpExecInfo {
   TFE_Context* ctx;
   const char* device_name;
@@ -53,6 +60,12 @@ struct FastPathOpExecInfo {
   // The op type name of the main op being executed.
   PyObject* op_name;
   PyObject* callbacks;
+
+  // DTypes can come from another input that has the same attr. So build that
+  // map.
+  tensorflow::gtl::FlatMap<string, tensorflow::gtl::InlinedVector<InputInfo, 4>>
+      attr_to_inputs;
+  tensorflow::gtl::FlatMap<string, tensorflow::DataType> cached_dtypes;
 };
 
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)                       \
@@ -76,12 +89,29 @@ PARSE_VALUE(ParseIntValue, int, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+#if PY_MAJOR_VERSION < 3
+bool ParseInt64Value(const string& key, PyObject* py_value, TF_Status* status,
+                     int64_t* value) {
+  if (PyInt_Check(py_value)) {
+    *value = static_cast<int64_t>(PyInt_AsLong(py_value));
+    return true;
+  } else if (PyLong_Check(py_value)) {
+    *value = static_cast<int64_t>(PyLong_AsLong(py_value));
+    return true;
+  }
+  TF_SetStatus(
+      status, TF_INVALID_ARGUMENT,
+      tensorflow::strings::StrCat("Expecting int or long value for attr ", key,
+                                  ", got ", py_value->ob_type->tp_name)
+          .c_str());
+  return false;
+}
+#endif
+
 Py_ssize_t TensorShapeNumDims(PyObject* value) {
   const auto size = PySequence_Size(value);
   if (size == -1) {
@@ -234,7 +264,7 @@ bool SetOpAttrList(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -296,7 +326,7 @@ void SetOpAttrListDefault(
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     int num_values = attr.default_value().list().s_size();
-    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    std::unique_ptr<const char*[]> values(new const char*[num_values]);
     (*attr_list_sizes)[key] = num_values;
     for (int i = 0; i < num_values; i++) {
       values[i] = attr.default_value().list().s(i).data();
@@ -349,7 +379,7 @@ void SetOpAttrListDefault(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -369,7 +399,7 @@ void SetOpAttrListDefault(
   } else if (type == TF_ATTR_FUNC) {
     int num_values = attr.default_value().list().func_size();
     (*attr_list_sizes)[key] = num_values;
-    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
     for (int i = 0; i < num_values; i++) {
       funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
     }
@@ -1399,10 +1429,39 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
+PyObject* GetPythonObjectFromInt(int num) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_FromLong(num);
+#else
+  return PyInt_FromLong(num);
+#endif
+}
+
 bool CheckResourceVariable(PyObject* item) {
   return PyObject_TypeCheck(item, resource_variable_type);
 }
 
+bool IsNumberType(PyObject* item) {
+#if PY_MAJOR_VERSION >= 3
+  return PyFloat_Check(item) || PyLong_Check(item);
+#else
+  return PyFloat_Check(item) || PyInt_Check(item) || PyLong_Check(item);
+#endif
+}
+
+bool CheckOneInput(PyObject* item) {
+  if (EagerTensor_CheckExact(item) || CheckResourceVariable(item) ||
+      PyArray_Check(item) || IsNumberType(item)) {
+    return true;
+  }
+
+  // Sequences are not properly handled. Sequences with purely python numeric
+  // types work, but sequences with mixes of EagerTensors and python numeric
+  // types don't work.
+  // TODO(nareshmodi): fix
+  return false;
+}
+
 bool CheckInputsOk(PyObject* seq, int start_index,
                    const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
@@ -1419,8 +1478,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
       }
       for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
         PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
-        if (!EagerTensor_CheckExact(inner_item) &&
-            !CheckResourceVariable(inner_item)) {
+        if (!CheckOneInput(inner_item)) {
           VLOG(1)
               << "Falling back to slow path for Op \"" << op_def.name()
               << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
@@ -1430,7 +1488,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
           return false;
         }
       }
-    } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
+    } else if (!CheckOneInput(item)) {
       VLOG(1)
           << "Falling back to slow path for Op \"" << op_def.name()
           << "\", Input \"" << op_def.input_arg(i).name()
@@ -1443,6 +1501,51 @@ bool CheckInputsOk(PyObject* seq, int start_index,
   return true;
 }
 
+PyObject* MaybeGetDType(PyObject* item) {
+  if (EagerTensor_CheckExact(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  if (CheckResourceVariable(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "_dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  return nullptr;
+}
+
+PyObject* MaybeGetDTypeForAttr(const string& attr,
+                               FastPathOpExecInfo* op_exec_info) {
+  auto cached_it = op_exec_info->cached_dtypes.find(attr);
+  if (cached_it != op_exec_info->cached_dtypes.end()) {
+    return GetPythonObjectFromInt(cached_it->second);
+  }
+
+  auto it = op_exec_info->attr_to_inputs.find(attr);
+  if (it == op_exec_info->attr_to_inputs.end()) {
+    // No other inputs - this should never happen.
+    Py_RETURN_NONE;
+  }
+
+  for (const auto& input_info : it->second) {
+    if (input_info.is_list) {
+      for (int i = 0; i < PySequence_Fast_GET_SIZE(input_info.item); i++) {
+        auto* dtype =
+            MaybeGetDType(PySequence_Fast_GET_ITEM(input_info.item, i));
+        if (dtype != nullptr) return dtype;
+      }
+    } else {
+      auto* dtype = MaybeGetDType(input_info.item);
+      if (dtype != nullptr) return dtype;
+    }
+  }
+
+  Py_RETURN_NONE;
+}
+
 bool OpDoesntRequireOutput(const string& op_name) {
   static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
       new tensorflow::gtl::FlatSet<string>({
@@ -1668,23 +1771,52 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
 //  i) input is an EagerTensor
 //  ii) input is a ResourceVariable - in this case, the is_variable param is set
 //  to true.
-bool ConvertToTensor(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                     tensorflow::Safe_PyObjectPtr* output_handle,
-                     TF_Status* status) {
-  if (CheckResourceVariable(input)) {
+//
+//  NOTE: dtype_hint_getter must *always* return a PyObject that can be
+//  decref'd. So if no hint is found, Py_RETURN_NONE (which correctly
+//  increfs Py_None).
+bool ConvertToTensor(
+    const FastPathOpExecInfo& op_exec_info, PyObject* input,
+    tensorflow::Safe_PyObjectPtr* output_handle,
+    // This gets a hint for this particular input.
+    const std::function<PyObject*()>& dtype_hint_getter,
+    // This sets the dtype after conversion is complete.
+    const std::function<void(const TF_DataType& dtype)>& dtype_setter,
+    TF_Status* status) {
+  if (EagerTensor_CheckExact(input)) {
+    Py_INCREF(input);
+    output_handle->reset(input);
+    return true;
+  } else if (CheckResourceVariable(input)) {
     return ReadVariableOp(op_exec_info, input, output_handle, status);
   }
 
-  Py_INCREF(input);
-  output_handle->reset(input);
+  // The hint comes from a supposedly similarly typed tensor.
+  tensorflow::Safe_PyObjectPtr dtype_hint(dtype_hint_getter());
+  if (PyErr_Occurred()) {
+    return false;
+  }
+
+  auto* handle = tensorflow::ConvertToEagerTensor(input, dtype_hint.get());
+  if (handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Unable to convert value to tensor");
+    return false;
+  }
+
+  output_handle->reset(EagerTensorFromHandle(handle));
+
+  auto dtype_actual = TFE_TensorHandleDataType(handle);
+  dtype_setter(dtype_actual);
 
   return true;
 }
 
 // Adds input and type attr to the op, and to the list of flattened
 // inputs/attrs.
-bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                  const tensorflow::OpDef::ArgDef* input_arg,
+bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
+                  const bool add_type_attr,
+                  const tensorflow::OpDef::ArgDef& input_arg,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_attrs,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_inputs,
                   TFE_Op* op, TF_Status* status) {
@@ -1693,18 +1825,30 @@ bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
   // out of scope in this function.
   tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
 
-  if (!ConvertToTensor(op_exec_info, input, &py_eager_tensor, status)) {
+  if (!ConvertToTensor(
+          *op_exec_info, input, &py_eager_tensor,
+          [&]() {
+            if (input_arg.type() != tensorflow::DataType::DT_INVALID) {
+              return GetPythonObjectFromInt(input_arg.type());
+            }
+            return MaybeGetDTypeForAttr(input_arg.type_attr(), op_exec_info);
+          },
+          [&](const TF_DataType dtype) {
+            op_exec_info->cached_dtypes[input_arg.type_attr()] =
+                static_cast<tensorflow::DataType>(dtype);
+          },
+          status)) {
     return false;
   }
 
   TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
 
-  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+  if (add_type_attr && !input_arg.type_attr().empty()) {
     auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    TFE_OpSetAttrType(op, input_arg.type_attr().data(), dtype);
     if (flattened_attrs != nullptr) {
       flattened_attrs->emplace_back(
-          GetPythonObjectFromString(input_arg->type_attr().data()));
+          GetPythonObjectFromString(input_arg.type_attr().data()));
       flattened_attrs->emplace_back(PyLong_FromLong(dtype));
     }
   }
@@ -1892,6 +2036,23 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
+  // This can be cached somewhere.
+  // Store a list of InputIndex -> List of corresponding inputs.
+  for (int i = 0; i < op_def->input_arg_size(); i++) {
+    if (!op_def->input_arg(i).type_attr().empty()) {
+      auto it =
+          op_exec_info.attr_to_inputs.find(op_def->input_arg(i).type_attr());
+      if (it == op_exec_info.attr_to_inputs.end()) {
+        it = op_exec_info.attr_to_inputs
+                 .insert({op_def->input_arg(i).type_attr(), {}})
+                 .first;
+      }
+      it->second.emplace_back(
+          PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i),
+          !op_def->input_arg(i).number_attr().empty());
+    }
+  }
+
   TF_Status* status = TF_NewStatus();
   TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
@@ -1986,17 +2147,16 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
-                          &input_arg, flattened_attrs.get(),
+        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          true, input_arg, flattened_attrs.get(),
                           flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
-                            nullptr /* input_arg */,
-                            nullptr /* flattened_attrs */,
+          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+                            false, input_arg, nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
           }
@@ -2018,7 +2178,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             status)) {
+                             []() { Py_RETURN_NONE; },
+                             [](const TF_DataType& dtype) {}, status)) {
           return nullptr;
         }
 
@@ -2048,8 +2209,9 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
-      if (!AddInputToOp(op_exec_info, input, &input_arg, flattened_attrs.get(),
-                        flattened_inputs.get(), op, status)) {
+      if (!AddInputToOp(&op_exec_info, input, true, input_arg,
+                        flattened_attrs.get(), flattened_inputs.get(), op,
+                        status)) {
         return nullptr;
       }
     }
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0bd5a5dbaf..b044b30231 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -278,14 +278,9 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
     with self.assertRaisesRegexp(
         TypeError,
-        r"tensor_list argument must be a list. Got \"EagerTensor\""):
+        r"tensors argument must be a list or a tuple. Got \"EagerTensor\""):
       pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2)
 
-    with self.assertRaisesRegexp(
-        TypeError,
-        r"tensor_list argument must be a list. Got \"tuple\""):
-      pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2)
-
   def testNegativeSliceDim(self):
     t1 = _create_tensor([1, 2], dtype=dtypes.int32)
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e3ca5a4977..a0f0b289df 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1384,6 +1384,22 @@ def register_tensor_conversion_function(base_type,
     if not callable(conversion_func):
       raise TypeError("conversion_func must be callable.")
 
+    # context._context is checked so that we don't inadvertently create it.
+    # This is because enable_eager_execution will fail when called from the main
+    # function if the context._context is already created, and the
+    # register_tensor_conversion_function calls happen when the module is
+    # imported.
+    if context._context is not None and context.executing_eagerly(
+    ) and isinstance(base_type, six.integer_types + (
+        float,
+        np.ndarray,
+    )):
+      # TODO(nareshmodi): consider setting a context variable which disables the
+      # fastpath instead.
+      raise TypeError(
+          "Cannot register conversions for numpy arrays, python number types "
+          "when executing eagerly.")
+
     try:
       funcs_at_priority = _tensor_conversion_func_registry[priority]
     except KeyError:
-- 
GitLab


From 6be585730bee4e33c2a9b51dc9485ec147a8b6cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 11:46:26 -0700
Subject: [PATCH 0457/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 192160587

---
 tensorflow/go/op/wrappers.go | 300 +++++++++++++++++------------------
 1 file changed, 150 insertions(+), 150 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 53959d4956..3d261c9d0a 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2101,6 +2101,30 @@ func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
+// Checks a tensor for NaN and Inf values.
+//
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
+//
+// Arguments:
+//
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"message": message}
+	opspec := tf.OpSpec{
+		Type: "CheckNumerics",
+		Input: []tf.Input{
+			tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the complex conjugate of a complex number.
 //
 // Given a tensor `input` of complex numbers, this operation returns a tensor of
@@ -2354,6 +2378,68 @@ func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Ou
 	return op.Output(0)
 }
 
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
+//
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// An identity op that triggers an error if a gradient is requested.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PreventGradient",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseToDenseAttr is an optional argument to SparseToDense.
 type SparseToDenseAttr func(optionalAttr)
 
@@ -17071,6 +17157,70 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
+// Stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Eagerly executes a python function to compute func(input)->output. The
+//
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
+}
+
 // Adds sparse updates to the variable referenced by `resource`.
 //
 // This operation computes
@@ -27737,153 +27887,3 @@ func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Outp
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Eagerly executes a python function to compute func(input)->output. The
-//
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
-	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
-}
-
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
-//
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
-//
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StopGradient",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
-
-// PreventGradientMessage sets the optional message attribute to value.
-//
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// An identity op that triggers an error if a gradient is requested.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
-//
-// Arguments:
-//	input: any tensor.
-//
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "PreventGradient",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Checks a tensor for NaN and Inf values.
-//
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
-//
-// Arguments:
-//
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"message": message}
-	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-- 
GitLab


From 790e8ae587d636ad34c2e06c4dac6cc4dbdad00e Mon Sep 17 00:00:00 2001
From: Ayush Dubey <ayushd@google.com>
Date: Mon, 9 Apr 2018 11:51:38 -0700
Subject: [PATCH 0458/1262] Automated g4 rollback of changelist 186518037

PiperOrigin-RevId: 192161449
---
 tensorflow/core/BUILD                         |  3 +-
 .../core/common_runtime/bfc_allocator.h       |  2 +-
 .../gpu/gpu_cudamalloc_allocator.h            |  2 +-
 .../common_runtime/gpu/gpu_debug_allocator.h  |  2 +-
 .../core/common_runtime/gpu/pool_allocator.h  |  2 +-
 .../core/common_runtime/mkl_cpu_allocator.h   |  2 +-
 .../visitable_allocator.h                     |  6 +-
 tensorflow/core/framework/allocator.cc        | 63 ++-----------------
 8 files changed, 14 insertions(+), 68 deletions(-)
 rename tensorflow/core/{framework => common_runtime}/visitable_allocator.h (94%)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 5a0535fc86..13b74b852a 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -553,7 +553,6 @@ tf_cuda_library(
         "framework/type_index.h",
         "framework/type_traits.h",
         "framework/types.h",
-        "framework/visitable_allocator.h",
         "public/version.h",
         "util/activation_mode.h",
         "util/bcast.h",
@@ -1974,7 +1973,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/tracking_allocator.h",  # only needed for tests
     "framework/unique_tensor_references.h",
     "framework/variant.h",
-    "framework/visitable_allocator.h",
     "platform/variant_coding.h",
     "util/command_line_flags.h",
     "util/env_var.h",
@@ -2285,6 +2283,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/stats_publisher_interface.h",
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
+    "common_runtime/visitable_allocator.h",
     "graph/gradients.h",
     "graph/quantize_training.h",
 ] + if_mkl(["graph/mkl_graph_util.h"])
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index e34945dd48..b8e773503c 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 0a586344cc..208697361d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0db08dc975..adce3a8436 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 38d669ea07..91ce830df8 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 #include <vector>
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 55c8411ad0..b2ef51d10b 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include <cstdlib>
 #include <string>
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/framework/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
similarity index 94%
rename from tensorflow/core/framework/visitable_allocator.h
rename to tensorflow/core/common_runtime/visitable_allocator.h
index ed41b05531..8edf922d11 100644
--- a/tensorflow/core/framework/visitable_allocator.h
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
+#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
 
 #include <functional>
 #include "tensorflow/core/framework/allocator.h"
@@ -76,4 +76,4 @@ class TrackingVisitableAllocator : public TrackingAllocator,
   VisitableAllocator* allocator_;
 };
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 6182f95f28..1a7e5219cd 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -88,20 +88,15 @@ void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
 
-class CPUAllocator : public VisitableAllocator {
+class CPUAllocator : public Allocator {
  public:
-  CPUAllocator()
-      : total_allocation_warning_triggered_(false), allocation_begun_(false) {}
+  CPUAllocator() : total_allocation_warning_triggered_(false) {}
 
   ~CPUAllocator() override {}
 
   string Name() override { return "cpu"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    if (!allocation_begun_) {
-      allocation_begun_ = true;
-    }
-
     if (num_bytes > LargeAllocationWarningBytes()) {
       LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
                    << 100 * kLargeAllocationWarningThreshold
@@ -127,38 +122,16 @@ class CPUAllocator : public VisitableAllocator {
         total_allocation_warning_triggered_ = true;
       }
     }
-
-    // visit each Visitor in alloc_visitors_
-    if (p != nullptr) {
-      for (const Visitor& v : alloc_visitors_) {
-        v(p, num_bytes);
-      }
-    }
-
     return p;
   }
 
   void DeallocateRaw(void* ptr) override {
-    std::size_t alloc_size;
-    bool init_alloc_size = false;
     if (cpu_allocator_collect_stats) {
-      alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
-      init_alloc_size = true;
+      const std::size_t alloc_size =
+          port::MallocExtension_GetAllocatedSize(ptr);
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
-
-    // visit each Visitor in free_visitors_
-    if (ptr != nullptr) {
-      if (!init_alloc_size) {
-        alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
-        init_alloc_size = true;
-      }
-      for (const Visitor& v : free_visitors_) {
-        v(ptr, alloc_size);
-      }
-    }
-
     port::AlignedFree(ptr);
   }
 
@@ -178,37 +151,11 @@ class CPUAllocator : public VisitableAllocator {
     return port::MallocExtension_GetAllocatedSize(ptr);
   }
 
-  // REQUIRES: can only add visitors before the first Allocate call
-
-  void AddAllocVisitor(Visitor visitor) override {
-    mutex_lock lock(visitor_mutex_);
-    CHECK(!allocation_begun_)
-        << "AddAllocVisitor may not be called after allocation has begun.";
-    alloc_visitors_.push_back(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    mutex_lock lock(visitor_mutex_);
-    CHECK(!allocation_begun_)
-        << "AddFreeVisitor may not be called after allocation has begun.";
-    free_visitors_.push_back(visitor);
-  }
-
  private:
   mutex mu_;
   AllocatorStats stats_ GUARDED_BY(mu_);
   bool total_allocation_warning_triggered_ GUARDED_BY(mu_);
 
-  // visitor_mutex_ protects write access to alloc_visitors_ and free_visitors_.
-  // While write access is mutually exclusive, reads may happen concurrently.
-  // This is okay because we may only append to alloc_visitors_ and
-  // free_visitors_ before first allocation, and subsequently we only read these
-  // vectors.
-  mutex visitor_mutex_;
-  std::vector<Visitor> alloc_visitors_;
-  std::vector<Visitor> free_visitors_;
-  std::atomic<bool> allocation_begun_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
 };
 
-- 
GitLab


From c887859438122f68f33f9342f297cd3088f9acf3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 12:02:38 -0700
Subject: [PATCH 0459/1262] Internal fix.

PiperOrigin-RevId: 192163466
---
 tensorflow/contrib/lite/java/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index b14230acd7..1dda55b8ed 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -117,6 +117,7 @@ java_test(
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
         "src/testdata/uint8.bin",
+        "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
-- 
GitLab


From e606e9133e96caf00d60e2ac0eb3f308fd0a4758 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Mon, 9 Apr 2018 12:09:34 -0700
Subject: [PATCH 0460/1262] Only set session in model_to_estimator if _SESSION
 has not been set. Fix #18193.

PiperOrigin-RevId: 192164669
---
 .../python/keras/_impl/keras/estimator.py     | 13 +++++++++++-
 .../keras/_impl/keras/estimator_test.py       | 20 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 5d370ebbb5..8043242b70 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -26,6 +26,7 @@ from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -465,11 +466,21 @@ def model_to_estimator(keras_model=None,
   estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
 
+  old_session = K._SESSION
   # Pass the config into keras backend's default session.
   sess = session.Session(config=estimator._session_config)
   K.set_session(sess)
+  try:
+    keras_weights = keras_model.get_weights()
+  except errors.FailedPreconditionError as e:
+    if old_session is None:
+      raise e
+    logging.warning(
+        'The Keras backend session has already been '
+        'set. The _session_config passed to model_to_estimator is not used.')
+    K.set_session(old_session)
+    keras_weights = keras_model.get_weights()
 
-  keras_weights = keras_model.get_weights()
   if keras_model._is_graph_network:
     # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
     _save_first_checkpoint(keras_model,
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index e076dc25b1..27b7ec7dd4 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -512,6 +512,26 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
                      ._config.gpu_options.per_process_gpu_memory_fraction,
                      gpu_options.per_process_gpu_memory_fraction)
 
+  def test_pretrained_weights(self):
+    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    keras_model.train_on_batch(
+        np.random.random((10,) + _INPUT_SIZE), np.random.random((10,
+                                                                 _NUM_CLASS)))
+    weights = keras_model.get_weights()
+    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+    keras_model.set_weights(weights)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+    keras.estimator.model_to_estimator(
+        keras_model=keras_model, config=self._config)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 2e6f8b3f05fe2d212c19b9598f93f4e6ee07675f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 12:22:37 -0700
Subject: [PATCH 0461/1262] Provide a hint about the number of iterations to
 while_loop in the case of for loops over tensors of known size. This allows
 using this type of for loops on TPU.

PiperOrigin-RevId: 192166460
---
 .../autograph/operators/control_flow.py       | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
index 5b8cb2d63c..81ae64f110 100644
--- a/tensorflow/contrib/autograph/operators/control_flow.py
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -83,7 +83,8 @@ def _known_len_for_loop(iterated, extra_cond, loop_body, init_state):
       while_cond,
       while_body,
       init_state=(0,) + init_state,
-      extra_deps=(iterated,))
+      extra_deps=(iterated,),
+      opts=dict(maximum_iterations=n))
   # Dropping the iteration index because it's not syntactically visible.
   results = results[1:]
 
@@ -136,7 +137,7 @@ def _dataset_for_loop(ds, extra_cond, loop_body, init_state):
   return results
 
 
-def while_loop(loop_cond, loop_body, init_state, extra_deps):
+def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None):
   """Functional form of a while statement.
 
   The loop operates on a so-called state, which includes all symbols that are
@@ -153,6 +154,7 @@ def while_loop(loop_cond, loop_body, init_state, extra_deps):
     extra_deps: Tuple containing additional entities on which the loop may
         depend, such as loop invariants referenced by loop_cond. Used
         exclusively for dispatch control.
+    opts: Optional dict of extra loop parameters.
 
   Returns:
     Tuple containing the final state.
@@ -161,18 +163,21 @@ def while_loop(loop_cond, loop_body, init_state, extra_deps):
   # That could be somethins as simple as a collection of dispatch rules, with
   # some prioritization.
   if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
-    return _tf_while_loop(loop_cond, loop_body, init_state)
+    return _tf_while_loop(loop_cond, loop_body, init_state, opts)
   else:
-    return _py_while_loop(loop_cond, loop_body, init_state)
+    return _py_while_loop(loop_cond, loop_body, init_state, opts)
 
 
-def _tf_while_loop(loop_cond, loop_body, init_state):
+def _tf_while_loop(loop_cond, loop_body, init_state, opts):
   """Overload of while_loop that stages a TF while_loop."""
-  return control_flow_ops.while_loop(loop_cond, loop_body, init_state)
+  if opts is None:
+    opts = {}
+  return control_flow_ops.while_loop(loop_cond, loop_body, init_state, **opts)
 
 
-def _py_while_loop(loop_cond, loop_body, init_state):
+def _py_while_loop(loop_cond, loop_body, init_state, opts):
   """Overload of while_loop that executes a Python while loop."""
+  del opts
   state = init_state
   while loop_cond(*state):
     state = loop_body(*state)
-- 
GitLab


From 970baf64cffe9de0b124b5eea53b1ee1d5158506 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 12:37:33 -0700
Subject: [PATCH 0462/1262] Renames exported signature names in MultiHead so
 head_name comes first.

PiperOrigin-RevId: 192168628
---
 .../estimator/python/estimator/multi_head.py     |  2 +-
 .../python/estimator/multi_head_test.py          | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index bbbc19cc4d..ce75899214 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -345,7 +345,7 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         if k == _DEFAULT_SERVING_KEY:
           key = head_name
         else:
-          key = '%s/%s' % (k, head_name)
+          key = '%s/%s' % (head_name, k)
         export_outputs[key] = v
         if (k == head_lib._PREDICT_SERVING_KEY and  # pylint:disable=protected-access
             isinstance(v, export_output_lib.PredictOutput)):
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index d9e5aca295..3d6fccb118 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -127,8 +127,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
-         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -169,11 +169,11 @@ class MultiHeadTest(test.TestCase):
       self.assertAllClose(
           expected_probabilities['head1'],
           sess.run(
-              spec.export_outputs['predict/head1'].outputs['probabilities']))
+              spec.export_outputs['head1/predict'].outputs['probabilities']))
       self.assertAllClose(
           expected_probabilities['head2'],
           sess.run(
-              spec.export_outputs['predict/head2'].outputs['probabilities']))
+              spec.export_outputs['head2/predict'].outputs['probabilities']))
 
   def test_predict_two_heads_logits_tensor(self):
     """Tests predict with logits as Tensor."""
@@ -197,8 +197,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
-         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -254,8 +254,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'regression/head1',
-         'predict/head1', 'head2', 'regression/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/regression',
+         'head1/predict', 'head2', 'head2/regression', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
-- 
GitLab


From 32e0db3e7a085ff2473a53b9401686544b4442aa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 12:41:03 -0700
Subject: [PATCH 0463/1262] Implement DFS using a loop instead of recursion to
 avoid stack overflow.

PiperOrigin-RevId: 192169242
---
 tensorflow/core/grappler/utils.cc | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 5893f286ed..534fe670e0 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -430,18 +430,28 @@ Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
 }
 
 void SimpleGraphView::DepthFirstSearch(
-    const std::unordered_set<string>& op_types_to_traverse, int node_idx,
+    const std::unordered_set<string>& op_types_to_traverse, int root_node,
     std::set<int>* nodes_found) const {
-  if (nodes_found->find(node_idx) != nodes_found->end()) {
-    return;
-  }
-  nodes_found->insert(node_idx);
-  const string& op_type = graph_->node(node_idx).op();
+  nodes_found->clear();
+  const string& op_type = graph_->node(root_node).op();
   if (op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
     return;
   }
-  for (auto output_idx : this->outputs(node_idx)) {
-    DepthFirstSearch(op_types_to_traverse, output_idx, nodes_found);
+  std::vector<int> stack;
+  stack.reserve(32);
+  stack.push_back(root_node);
+  while (!stack.empty()) {
+    const int node_idx = stack.back();
+    stack.pop_back();
+    nodes_found->insert(node_idx);
+    const string& op_type = graph_->node(node_idx).op();
+    if (op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
+      for (auto output_idx : this->outputs(node_idx)) {
+        if (nodes_found->find(output_idx) == nodes_found->end()) {
+          stack.push_back(output_idx);
+        }
+      }
+    }
   }
 }
 
-- 
GitLab


From 7aa1f1f57fa6851529c471da78b2e91e0aaab5c3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 13:21:34 -0700
Subject: [PATCH 0464/1262] Adds a within_ops_fn parameter to
 get_forward_walk_ops and get_backward_walk_ops that allows setting a
 condition on ops that are within or not within.

Also adds tests for these methods that were missing.

PiperOrigin-RevId: 192176693
---
 tensorflow/contrib/graph_editor/select.py     |  26 ++-
 .../contrib/graph_editor/tests/select_test.py | 155 +++++++++++++++++-
 2 files changed, 172 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/graph_editor/select.py b/tensorflow/contrib/graph_editor/select.py
index 3ea6ff4d61..d700e6e1a7 100644
--- a/tensorflow/contrib/graph_editor/select.py
+++ b/tensorflow/contrib/graph_editor/select.py
@@ -383,6 +383,7 @@ def get_within_boundary_ops(ops,
 def get_forward_walk_ops(seed_ops,
                          inclusive=True,
                          within_ops=None,
+                         within_ops_fn=None,
                          stop_at_ts=(),
                          control_outputs=None):
   """Do a forward graph walk and return all the visited ops.
@@ -395,6 +396,9 @@ def get_forward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_outputs: a `util.ControlOutputs` instance or None.
       If not `None`, it will be used while walking the graph forward.
@@ -423,7 +427,8 @@ def get_forward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -450,6 +455,7 @@ def get_forward_walk_ops(seed_ops,
 def get_backward_walk_ops(seed_ops,
                           inclusive=True,
                           within_ops=None,
+                          within_ops_fn=None,
                           stop_at_ts=(),
                           control_inputs=False):
   """Do a backward graph walk and return all the visited ops.
@@ -462,6 +468,9 @@ def get_backward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_inputs: if True, control inputs will be used while moving backward.
   Returns:
@@ -488,7 +497,8 @@ def get_backward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -516,6 +526,7 @@ def get_walks_intersection_ops(forward_seed_ops,
                                forward_inclusive=True,
                                backward_inclusive=True,
                                within_ops=None,
+                               within_ops_fn=None,
                                control_inputs=False,
                                control_outputs=None,
                                control_ios=None):
@@ -535,6 +546,9 @@ def get_walks_intersection_ops(forward_seed_ops,
     within_ops: an iterable of tf.Operation within which the search is
       restricted. If within_ops is None, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -555,11 +569,13 @@ def get_walks_intersection_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return [op for op in forward_ops if op in backward_ops]
 
@@ -569,6 +585,7 @@ def get_walks_union_ops(forward_seed_ops,
                         forward_inclusive=True,
                         backward_inclusive=True,
                         within_ops=None,
+                        within_ops_fn=None,
                         control_inputs=False,
                         control_outputs=None,
                         control_ios=None):
@@ -587,6 +604,9 @@ def get_walks_union_ops(forward_seed_ops,
       resulting set.
     within_ops: restrict the search within those operations. If within_ops is
       None, the search is done within the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -607,11 +627,13 @@ def get_walks_union_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return util.concatenate_unique(forward_ops, backward_ops)
 
diff --git a/tensorflow/contrib/graph_editor/tests/select_test.py b/tensorflow/contrib/graph_editor/tests/select_test.py
index 82f999637d..d12c6d3cbd 100644
--- a/tensorflow/contrib/graph_editor/tests/select_test.py
+++ b/tensorflow/contrib/graph_editor/tests/select_test.py
@@ -77,12 +77,10 @@ class SelectTest(test.TestCase):
     """Test for ge.get_ops_ios."""
     control_outputs = ge.util.ControlOutputs(self.graph)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.h.op, control_ios=control_outputs)), 3)
+        len(ge.get_ops_ios(self.h.op, control_ios=control_outputs)), 3)
     self.assertEqual(len(ge.get_ops_ios(self.h.op)), 2)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.c.op, control_ios=control_outputs)), 6)
+        len(ge.get_ops_ios(self.c.op, control_ios=control_outputs)), 6)
     self.assertEqual(len(ge.get_ops_ios(self.c.op)), 5)
 
   def test_compute_boundary_ts_0(self):
@@ -135,16 +133,49 @@ class SelectTest(test.TestCase):
     ops = ge.get_walks_intersection_ops([self.c.op], [self.g.op])
     self.assertEqual(len(ops), 2)
 
+    ops = ge.get_walks_intersection_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.a.op in ops)
+    self.assertTrue(self.c.op in ops)
+    self.assertTrue(self.f.op in ops)
+
+    within_ops = [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops=within_ops)
+    self.assertEqual(len(ops), 0)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 0)
+
   def test_get_walks_union(self):
     """Test for ge.get_walks_union_ops."""
     ops = ge.get_walks_union_ops([self.f.op], [self.g.op])
     self.assertEqual(len(ops), 6)
 
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 8)
+
+    within_ops = [self.a.op, self.c.op, self.d.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops=within_ops)
+    self.assertEqual(len(ops), 4)
+    self.assertTrue(self.b.op not in ops)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.c.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.b.op not in ops)
+    self.assertTrue(self.d.op not in ops)
+
   def test_select_ops(self):
     parameters = (
         (("^foo/",), 7),
         (("^foo/bar/",), 4),
-        (("^foo/bar/", "a"), 5),)
+        (("^foo/bar/", "a"), 5),
+    )
     for param, length in parameters:
       ops = ge.select_ops(*param, graph=self.graph)
       self.assertEqual(len(ops), length)
@@ -152,7 +183,8 @@ class SelectTest(test.TestCase):
   def test_select_ts(self):
     parameters = (
         (".*:0", 8),
-        (r".*/bar/\w+:0", 4),)
+        (r".*/bar/\w+:0", 4),
+    )
     for regex, length in parameters:
       ts = ge.select_ts(regex, graph=self.graph)
       self.assertEqual(len(ts), length)
@@ -160,12 +192,121 @@ class SelectTest(test.TestCase):
   def test_select_ops_and_ts(self):
     parameters = (
         (("^foo/.*",), 7, 0),
-        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),)
+        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),
+    )
     for param, l0, l1 in parameters:
       ops, ts = ge.select_ops_and_ts(*param, graph=self.graph)
       self.assertEqual(len(ops), l0)
       self.assertEqual(len(ts), l1)
 
+  def test_forward_walk_ops(self):
+    seed_ops = [self.a.op, self.d.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.e.op.
+    within_ops_fn = lambda op: op not in (self.e.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # No b.op since it's an independent source node.
+      # No g.op from within_ops.
+      # No e.op from within_ops_fn.
+      # No h.op from stop_at_ts and within_ops.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(
+          set(ops), set([self.a.op, self.c.op, self.d.op, self.f.op]))
+
+      # Also no a.op and d.op when inclusive=False
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.f.op]))
+
+      # Not using within_ops_fn adds e.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.e.op, self.f.op]))
+
+      # Not using stop_at_ts adds back h.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops, inclusive=False, within_ops=within_ops)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.h.op]))
+
+      # Starting just form a (the tensor, not op) omits a, b, d.
+      ops = ge.select.get_forward_walk_ops([self.a], inclusive=True)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.g.op,
+                         self.h.op]))
+
+  def test_backward_walk_ops(self):
+    seed_ops = [self.h.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.c.op.
+    within_ops_fn = lambda op: op not in (self.c.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # Backward walk only includes h since we stop at f and g is not within.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.h.op]))
+
+      # If we do inclusive=False, the result is empty.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set())
+
+      # Removing stop_at_fs adds f.op, d.op.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn)
+      self.assertEqual(set(ops), set([self.d.op, self.f.op, self.h.op]))
+
+      # Not using within_ops_fn adds back ops for a, b, c.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops, inclusive=True, within_ops=within_ops)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.h.op
+          ]))
+
+      # Vanially backward search via self.h.op includes everything excpet e.op.
+      ops = ge.select.get_backward_walk_ops(seed_ops, inclusive=True)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.g.op,
+              self.h.op
+          ]))
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 36882e882c3de9be4381c266af6049b08fe2326c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 13:24:03 -0700
Subject: [PATCH 0465/1262] Add a utility that can detect the class that
 defined a method. This is useful when converting a e.g. a custom Keras model,
 to avoid recompiling the base model methods.

PiperOrigin-RevId: 192177117
---
 .../contrib/autograph/pyct/inspect_utils.py   | 12 ++++++++++
 .../autograph/pyct/inspect_utils_test.py      | 24 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index 30a5961821..386a6d21ec 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -50,6 +50,18 @@ def getnamespace(f):
   return namespace
 
 
+def getdefiningclass(m, owner_class):
+  """Resolves the class (e.g. one of the superclasses) that defined a method."""
+  m = six.get_unbound_function(m)
+  last_defining = owner_class
+  for superclass in tf_inspect.getmro(owner_class):
+    if hasattr(superclass, m.__name__):
+      superclass_m = getattr(superclass, m.__name__)
+      if six.get_unbound_function(superclass_m) == m:
+        last_defining = superclass
+  return last_defining
+
+
 def getmethodclass(m):
   """Resolves a function's owner, e.g. a method's class.
 
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index eda3fc13fd..58f827b79a 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -234,6 +234,30 @@ class InspectUtilsTest(test.TestCase):
     c = TestCallable()
     self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
 
+  def test_getdefiningclass(self):
+    class Superclass(object):
+
+      def foo(self):
+        pass
+
+      def bar(self):
+        pass
+
+    class Subclass(Superclass):
+
+      def foo(self):
+        pass
+
+      def baz(self):
+        pass
+
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.foo, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 07e28a70874eef87772549c505d241eea2e6b9e9 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Mon, 9 Apr 2018 13:42:35 -0700
Subject: [PATCH 0466/1262] Add opcode for new instruction that broadcasts
 degenerate dimensions. Implicit broadcasts can be translated to the new
 instruction instead of a reshape-and-broadcast. Follow-up CLs will add
 support in UserComputation and the various backends.

PiperOrigin-RevId: 192180356
---
 .../xla/service/algebraic_simplifier.cc       |  1 +
 .../compiler/xla/service/dfs_hlo_visitor.h    |  1 +
 .../service/dfs_hlo_visitor_with_default.h    |  3 +++
 .../xla/service/hlo_constant_folding.cc       |  3 ++-
 .../compiler/xla/service/hlo_cost_analysis.cc |  5 ++++
 .../compiler/xla/service/hlo_cost_analysis.h  |  1 +
 .../compiler/xla/service/hlo_graph_dumper.cc  |  1 +
 .../compiler/xla/service/hlo_instruction.cc   | 19 +++++++++++--
 .../compiler/xla/service/hlo_instruction.h    |  4 +++
 tensorflow/compiler/xla/service/hlo_opcode.h  |  1 +
 .../compiler/xla/service/hlo_verifier.cc      | 27 +++++++++++++++----
 .../compiler/xla/service/hlo_verifier.h       |  1 +
 .../xla/service/instruction_fusion.cc         |  7 +++--
 .../compiler/xla/tools/parser/hlo_parser.cc   |  9 +++++++
 .../xla/tools/parser/hlo_parser_test.cc       | 12 +++++++++
 15 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 0e4624fd69..6cb1bd5669 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1424,6 +1424,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   return Status::OK();
 }
 
+// TODO(b/74536353): do this simplification for BroadcastDimOne as well.
 StatusOr<bool> AlgebraicSimplifierVisitor::
     TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
         HloInstruction* reshape_or_broadcast) {
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 56723e7650..3f7089d6ca 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -199,6 +199,7 @@ class DfsHloVisitorBase {
   virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
   virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0;
+  virtual Status HandleBroadcastDimOne(HloInstructionPtr hlo) = 0;
   virtual Status HandleReshape(HloInstructionPtr hlo) = 0;
   virtual Status HandleTranspose(HloInstructionPtr hlo) = 0;
   virtual Status HandleParameter(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 240faebe62..e6680ee9b8 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -158,6 +158,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleBroadcast(HloInstructionPtr broadcast) override {
     return DefaultAction(broadcast);
   }
+  Status HandleBroadcastDimOne(HloInstructionPtr broadcastDimOne) override {
+    return DefaultAction(broadcastDimOne);
+  }
   Status HandlePad(HloInstructionPtr pad) override {
     return DefaultAction(pad);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 35ecd4428d..7aa38c6b79 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -69,7 +69,8 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
-      if (instruction->opcode() == HloOpcode::kBroadcast) {
+      if (instruction->opcode() == HloOpcode::kBroadcast ||
+          instruction->opcode() == HloOpcode::kBroadcastDimOne) {
         continue;
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 44e4f75f75..ea4dd62fdb 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -336,6 +336,11 @@ Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleBroadcastDimOne(
+    const HloInstruction* broadcastDimOne) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandlePad(const HloInstruction*) {
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index d17678d20f..a9f6845747 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -95,6 +95,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleSelectAndScatter(const HloInstruction* instruction) override;
   Status HandleBitcast(const HloInstruction* bitcast) override;
   Status HandleBroadcast(const HloInstruction* broadcast) override;
+  Status HandleBroadcastDimOne(const HloInstruction* broadcastDimOne) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 25702dc65e..c35783c456 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -956,6 +956,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
+    case HloOpcode::kBroadcastDimOne:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
       // these are essentially free.
       if (instr->IsFused() &&
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index fcf9ebf5f7..8149e47cb5 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -683,6 +683,15 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBroadcastDimOne(const Shape& shape,
+                                      HloInstruction* operand) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBroadcastDimOne, shape));
+  instruction->AppendOperand(operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -1275,6 +1284,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateBroadcast(shape, new_operands[0], dimensions_);
       break;
+    case HloOpcode::kBroadcastDimOne:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateBroadcastDimOne(shape, new_operands[0]);
+      break;
     case HloOpcode::kCall:
       clone = CreateCall(shape, new_operands, to_apply());
       break;
@@ -1826,6 +1839,8 @@ bool HloInstruction::IdenticalSlowPath(
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
+    case HloOpcode::kBroadcastDimOne:
+    case HloOpcode::kDynamicUpdateSlice:
       return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
       return eq_shapes(shape(), other.shape()) &&
@@ -1844,8 +1859,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDynamicSlice:
       return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
-    case HloOpcode::kDynamicUpdateSlice:
-      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -2646,6 +2659,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleBitcast(this);
     case HloOpcode::kBroadcast:
       return visitor->HandleBroadcast(this);
+    case HloOpcode::kBroadcastDimOne:
+      return visitor->HandleBroadcastDimOne(this);
     case HloOpcode::kPad:
       return visitor->HandlePad(this);
     case HloOpcode::kReshape:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 80f8408244..a6cb19f331 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -401,6 +401,10 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
+  // Creates a broadcast-size-one-dimensions instruction.
+  static std::unique_ptr<HloInstruction> CreateBroadcastDimOne(
+      const Shape& shape, HloInstruction* operand);
+
   // Creates a sequence of instructions that performs an explicit broadcast of
   // the operand to the target shape.
   //
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index af24604c39..dddc72480f 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -54,6 +54,7 @@ namespace xla {
   V(kBitcast, "bitcast")                                     \
   V(kBitcastConvert, "bitcast-convert")                      \
   V(kBroadcast, "broadcast")                                 \
+  V(kBroadcastDimOne, "broadcast-dim-one")                   \
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8c875698eb..63ec5964eb 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -174,17 +174,34 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
   TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
   TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
                broadcast->dimensions().size());
-  for (int64 operand_dimension = 0;
-       operand_dimension < ShapeUtil::Rank(operand_shape);
-       ++operand_dimension) {
-    int64 output_dimension = broadcast->dimensions()[operand_dimension];
+  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+    int64 output_dimension = broadcast->dimensions()[i];
     TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(operand_dimension))
+                 operand_shape.dimensions(i))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return tensorflow::Status::OK();
 }
 
+Status ShapeVerifier::HandleBroadcastDimOne(HloInstruction* broadcastDimOne) {
+  const Shape& operand_shape = broadcastDimOne->operand(0)->shape();
+  int64 operand_rank = ShapeUtil::Rank(operand_shape);
+  const Shape& output_shape = broadcastDimOne->shape();
+  // Check for mixed precision.
+  TF_RETURN_IF_ERROR(CheckShape(broadcastDimOne, output_shape));
+  TF_RET_CHECK(operand_rank == ShapeUtil::Rank(output_shape));
+  for (int64 i = 0; i < operand_rank; ++i) {
+    int64 operand_dimension = operand_shape.dimensions(i);
+    int64 output_dimension = output_shape.dimensions(i);
+    TF_RET_CHECK(operand_dimension == 1 ||
+                 operand_dimension == output_dimension)
+        << "Dimension " << i << " of broadcastDimOne "
+        << broadcastDimOne->ToString() << " is " << operand_dimension
+        << ", expected 1 or " << output_dimension;
+  }
+  return tensorflow::Status::OK();
+}
+
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
   // Check for mixed precision.
   TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1dd7ec3c51..a4dff977ba 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -54,6 +54,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleBroadcast(HloInstruction* broadcast) override;
+  Status HandleBroadcastDimOne(HloInstruction* broadcastDimOne) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
   Status HandleParameter(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index d69ad80bdb..3f4dbf897d 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -37,6 +37,7 @@ namespace xla {
     case HloOpcode::kBitcast:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
+    case HloOpcode::kBroadcastDimOne:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -142,7 +143,8 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
       });
   return std::count_if(hlo->operands().begin(), hlo->operands().end(),
                        [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast) {
+                         if (operand->opcode() == HloOpcode::kBroadcast ||
+                             operand->opcode() == HloOpcode::kBroadcastDimOne) {
                            return false;
                          }
                          if (operand->opcode() == HloOpcode::kConstant &&
@@ -247,7 +249,8 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     auto reachability = computation->ComputeReachability();
 
     auto cheap_to_duplicate = [this](HloInstruction* producer) {
-      if (producer->opcode() == HloOpcode::kBroadcast) {
+      if (producer->opcode() == HloOpcode::kBroadcast ||
+          producer->opcode() == HloOpcode::kBroadcastDimOne) {
         return true;
       }
       if (producer->opcode() == HloOpcode::kConstant &&
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index e60a5a4919..b2f122982a 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -724,6 +724,15 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, operands[0], *broadcast_dimensions));
       break;
     }
+    case HloOpcode::kBroadcastDimOne: {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateBroadcastDimOne(shape, operands[0]));
+      break;
+    }
     case HloOpcode::kConcatenate: {
       optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index adc8b1d620..57684b5834 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -57,6 +57,18 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
+)"
+},
+// broadcast size-one dimensions
+{
+"BroadcastDimOne",
+R"(HloModule broadcast_dim_one_module
+
+ENTRY %broadcast-dim-one () -> f32[2,2] {
+  %constant = f32[1,2]{1,0} constant(f32[1,2] { { 1.1, 2.2 } })
+  ROOT %broadcast-dim-one = f32[2,2]{1,0} broadcast-dim-one(f32[1,2]{1,0} %constant)
+}
+
 )"
 },
 // pred constant
-- 
GitLab


From a4a74a3c24f0ba2f702641565299ac46cce3768c Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Mon, 9 Apr 2018 13:57:12 -0700
Subject: [PATCH 0467/1262] Make node and registration getter const.

PiperOrigin-RevId: 192183067
---
 tensorflow/contrib/lite/interpreter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 77db178783..a6d582a813 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -208,7 +208,7 @@ class Interpreter {
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
-      int node_index) {
+      int node_index) const {
     if (node_index >= nodes_and_registration_.size() || node_index < 0)
       return nullptr;
     return &nodes_and_registration_[node_index];
-- 
GitLab


From b3bf89690799d2a1f52cb6fad6df4ab6be22a2d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 14:02:30 -0700
Subject: [PATCH 0468/1262] Adds __repr__ method to HParams.

PiperOrigin-RevId: 192184000
---
 tensorflow/contrib/training/python/training/hparam.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 95e051e3b5..185f70a86d 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -630,6 +630,9 @@ class HParams(object):
   def __str__(self):
     return str(sorted(self.values().items()))
 
+  def __repr__(self):
+    return '%s(%s)' % (type(self).__name__, self.__str__())
+
   @staticmethod
   def _get_kind_name(param_type, is_list):
     """Returns the field name given parameter type and is_list.
-- 
GitLab


From eafa5561cd7565547cfcf087fc1be5db006c607c Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 9 Apr 2018 14:07:12 -0700
Subject: [PATCH 0469/1262] Automated g4 rollback of changelist 192160407

PiperOrigin-RevId: 192184809
---
 tensorflow/python/eager/pywrap_tensor.cc  | 123 ++++++------
 tensorflow/python/eager/pywrap_tensor.h   |   4 -
 tensorflow/python/eager/pywrap_tfe.h      |  12 +-
 tensorflow/python/eager/pywrap_tfe_src.cc | 220 +++-------------------
 tensorflow/python/eager/tensor_test.py    |   7 +-
 tensorflow/python/framework/ops.py        |  16 --
 6 files changed, 96 insertions(+), 286 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index bc509be312..519814b979 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -161,64 +161,6 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 }  // namespace
 
-namespace tensorflow {
-TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
-  int desired_dtype = -1;
-  if (dtype != Py_None) {
-    if (!PyIntToDataType(dtype, &desired_dtype)) {
-      PyErr_SetString(PyExc_TypeError,
-                      tensorflow::strings::StrCat(
-                          "Expecting a DataType value for dtype. Got ",
-                          Py_TYPE(dtype)->tp_name)
-                          .c_str());
-      return nullptr;
-    }
-  }
-  if (PyArray_Check(value)) {
-    int desired_np_dtype = -1;
-    if (desired_dtype >= 0) {
-      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
-               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
-               .ok()) {
-        PyErr_SetString(PyExc_TypeError,
-                        tensorflow::strings::StrCat(
-                            "Invalid dtype argument value ", desired_dtype)
-                            .c_str());
-        return nullptr;
-      }
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
-    int current_np_dtype = PyArray_TYPE(array);
-    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
-    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
-        !PyArray_ISCARRAY(array)) {
-      int new_dtype =
-          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
-      safe_value = tensorflow::make_safe(
-          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
-                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
-      if (PyErr_Occurred()) return nullptr;
-      if (safe_value == nullptr) {
-        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
-        return nullptr;
-      }
-      value = safe_value.get();
-    }
-    return NumpyToTensorHandle(value);
-  } else {
-    tensorflow::Tensor t;
-    // TODO(josh11b): Have PySeqToTensor set python errors instead of
-    // returning Status.
-    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
-    if (!cppstatus.ok()) {
-      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
-      return nullptr;
-    }
-    return TFE_NewTensorHandle(t);
-  }
-}
-}  // namespace tensorflow
-
 extern "C" {
 
 static const int kMaxEagerTensorParentSize = 64;
@@ -288,11 +230,56 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       return -1;
     }
   }
-  PyErr_Clear();
   tensorflow::Safe_TFE_TensorHandlePtr handle =
-      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
-          tensorflow::ConvertToEagerTensor(value, dtype)));
-  if (handle == nullptr) return -1;
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(nullptr));
+  PyErr_Clear();
+  if (PyArray_Check(value)) {
+    int desired_np_dtype = -1;
+    if (desired_dtype >= 0) {
+      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
+               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
+               .ok()) {
+        PyErr_SetString(PyExc_TypeError,
+                        tensorflow::strings::StrCat(
+                            "Invalid dtype argument value ", desired_dtype)
+                            .c_str());
+        return -1;
+      }
+    }
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+    int current_np_dtype = PyArray_TYPE(array);
+    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
+    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
+        !PyArray_ISCARRAY(array)) {
+      int new_dtype =
+          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
+      safe_value = tensorflow::make_safe(
+          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
+                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
+      if (PyErr_Occurred()) return -1;
+      if (safe_value == nullptr) {
+        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
+        return -1;
+      }
+      value = safe_value.get();
+    }
+    handle = tensorflow::make_safe(NumpyToTensorHandle(value));
+  } else {
+    tensorflow::Tensor t;
+    // TODO(josh11b): Have PySeqToTensor set python errors instead of
+    // returning Status.
+    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
+    if (!cppstatus.ok()) {
+      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
+      return -1;
+    }
+    handle = tensorflow::make_safe(TFE_NewTensorHandle(t));
+  }
+  if (PyErr_Occurred()) return -1;
+  if (handle == nullptr) {
+    PyErr_SetString(PyExc_ValueError, "Error while creating an EagerTensor");
+    return -1;
+  }
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
     handle = tensorflow::make_safe(
@@ -714,12 +701,12 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
-  if (!PyList_Check(tensors) && !PyTuple_Check(tensors)) {
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
+  if (!PyList_Check(tensor_list)) {
     PyErr_SetString(PyExc_TypeError,
                     tensorflow::strings::StrCat(
-                        "tensors argument must be a list or a tuple. Got \"",
-                        Py_TYPE(tensors)->tp_name, "\"")
+                        "tensor_list argument must be a list. Got \"",
+                        Py_TYPE(tensor_list)->tp_name, "\"")
                         .c_str());
     return nullptr;
   }
@@ -733,14 +720,14 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
     return nullptr;
   }
 
-  Py_ssize_t num_tensors = PySequence_Fast_GET_SIZE(tensors);
+  Py_ssize_t num_tensors = PyList_Size(tensor_list);
   int64_t num_tensors_int = static_cast<int64_t>(num_tensors);
   auto tensor = tensorflow::make_safe(TF_AllocateTensor(
       TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int));
   int32_t* data = reinterpret_cast<int32_t*>(TF_TensorData(tensor.get()));
   auto status = tensorflow::make_safe(TF_NewStatus());
   for (Py_ssize_t i = 0; i < num_tensors; ++i) {
-    PyObject* tensor_obj = PySequence_Fast_GET_ITEM(tensors, i);
+    PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i);
     if (!EagerTensor_CheckExact(tensor_obj)) {
       PyErr_SetString(PyExc_TypeError,
                       tensorflow::strings::StrCat(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index 5b330432bd..aa1efdd1b8 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -22,8 +22,4 @@ limitations under the License.
 bool EagerTensor_CheckExact(const PyObject* o);
 tensorflow::int64 EagerTensor_id(const PyObject* tensor);
 
-namespace tensorflow {
-TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
-}
-
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 691b613e48..32d731d0f6 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -186,16 +186,16 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
 // Returns the set of variables watched by the given tape.
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
 
-// Returns an EagerTensor of dimension [len(`tensors`)] containing
-// the `slice_dim`'th dimension of each tensor in `tensors`. In other words,
+// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
+// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
 // TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in
-// `tensors`. For example, if `tensors` contains tensors of with shapes
+// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes
 // [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with
 // `slice_dim` equal to 1 will return [2, 5, 7].
 // On error, returns nullptr and sets python exception.
-// REQUIRES: `tensors` is a python list/tuple of EagerTensors
+// REQUIRES: `tensor_list` is a python list of EagerTensors
 // REQUIRES: `slice_dim` is non-negative and smaller than the rank of all
-//   tensors in `tensors`.
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
+//   tensors in `tensor_list`.
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index ab7f251515..d99bd0b0ff 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -38,13 +38,6 @@ using tensorflow::strings::Printf;
 
 namespace {
 
-struct InputInfo {
-  InputInfo(PyObject* item, bool is_list) : item(item), is_list(is_list) {}
-  PyObject* item = nullptr;
-
-  bool is_list = false;
-};
-
 struct FastPathOpExecInfo {
   TFE_Context* ctx;
   const char* device_name;
@@ -60,12 +53,6 @@ struct FastPathOpExecInfo {
   // The op type name of the main op being executed.
   PyObject* op_name;
   PyObject* callbacks;
-
-  // DTypes can come from another input that has the same attr. So build that
-  // map.
-  tensorflow::gtl::FlatMap<string, tensorflow::gtl::InlinedVector<InputInfo, 4>>
-      attr_to_inputs;
-  tensorflow::gtl::FlatMap<string, tensorflow::DataType> cached_dtypes;
 };
 
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)                       \
@@ -89,29 +76,12 @@ PARSE_VALUE(ParseIntValue, int, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
+PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
+PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
-#if PY_MAJOR_VERSION < 3
-bool ParseInt64Value(const string& key, PyObject* py_value, TF_Status* status,
-                     int64_t* value) {
-  if (PyInt_Check(py_value)) {
-    *value = static_cast<int64_t>(PyInt_AsLong(py_value));
-    return true;
-  } else if (PyLong_Check(py_value)) {
-    *value = static_cast<int64_t>(PyLong_AsLong(py_value));
-    return true;
-  }
-  TF_SetStatus(
-      status, TF_INVALID_ARGUMENT,
-      tensorflow::strings::StrCat("Expecting int or long value for attr ", key,
-                                  ", got ", py_value->ob_type->tp_name)
-          .c_str());
-  return false;
-}
-#endif
-
 Py_ssize_t TensorShapeNumDims(PyObject* value) {
   const auto size = PySequence_Size(value);
   if (size == -1) {
@@ -264,7 +234,7 @@ bool SetOpAttrList(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -326,7 +296,7 @@ void SetOpAttrListDefault(
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     int num_values = attr.default_value().list().s_size();
-    std::unique_ptr<const char*[]> values(new const char*[num_values]);
+    std::unique_ptr<const char* []> values(new const char*[num_values]);
     (*attr_list_sizes)[key] = num_values;
     for (int i = 0; i < num_values; i++) {
       values[i] = attr.default_value().list().s(i).data();
@@ -379,7 +349,7 @@ void SetOpAttrListDefault(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -399,7 +369,7 @@ void SetOpAttrListDefault(
   } else if (type == TF_ATTR_FUNC) {
     int num_values = attr.default_value().list().func_size();
     (*attr_list_sizes)[key] = num_values;
-    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
+    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
     for (int i = 0; i < num_values; i++) {
       funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
     }
@@ -1429,39 +1399,10 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
-PyObject* GetPythonObjectFromInt(int num) {
-#if PY_MAJOR_VERSION >= 3
-  return PyLong_FromLong(num);
-#else
-  return PyInt_FromLong(num);
-#endif
-}
-
 bool CheckResourceVariable(PyObject* item) {
   return PyObject_TypeCheck(item, resource_variable_type);
 }
 
-bool IsNumberType(PyObject* item) {
-#if PY_MAJOR_VERSION >= 3
-  return PyFloat_Check(item) || PyLong_Check(item);
-#else
-  return PyFloat_Check(item) || PyInt_Check(item) || PyLong_Check(item);
-#endif
-}
-
-bool CheckOneInput(PyObject* item) {
-  if (EagerTensor_CheckExact(item) || CheckResourceVariable(item) ||
-      PyArray_Check(item) || IsNumberType(item)) {
-    return true;
-  }
-
-  // Sequences are not properly handled. Sequences with purely python numeric
-  // types work, but sequences with mixes of EagerTensors and python numeric
-  // types don't work.
-  // TODO(nareshmodi): fix
-  return false;
-}
-
 bool CheckInputsOk(PyObject* seq, int start_index,
                    const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
@@ -1478,7 +1419,8 @@ bool CheckInputsOk(PyObject* seq, int start_index,
       }
       for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
         PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
-        if (!CheckOneInput(inner_item)) {
+        if (!EagerTensor_CheckExact(inner_item) &&
+            !CheckResourceVariable(inner_item)) {
           VLOG(1)
               << "Falling back to slow path for Op \"" << op_def.name()
               << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
@@ -1488,7 +1430,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
           return false;
         }
       }
-    } else if (!CheckOneInput(item)) {
+    } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
       VLOG(1)
           << "Falling back to slow path for Op \"" << op_def.name()
           << "\", Input \"" << op_def.input_arg(i).name()
@@ -1501,51 +1443,6 @@ bool CheckInputsOk(PyObject* seq, int start_index,
   return true;
 }
 
-PyObject* MaybeGetDType(PyObject* item) {
-  if (EagerTensor_CheckExact(item)) {
-    tensorflow::Safe_PyObjectPtr py_dtype(
-        PyObject_GetAttrString(item, "dtype"));
-    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
-  }
-
-  if (CheckResourceVariable(item)) {
-    tensorflow::Safe_PyObjectPtr py_dtype(
-        PyObject_GetAttrString(item, "_dtype"));
-    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
-  }
-
-  return nullptr;
-}
-
-PyObject* MaybeGetDTypeForAttr(const string& attr,
-                               FastPathOpExecInfo* op_exec_info) {
-  auto cached_it = op_exec_info->cached_dtypes.find(attr);
-  if (cached_it != op_exec_info->cached_dtypes.end()) {
-    return GetPythonObjectFromInt(cached_it->second);
-  }
-
-  auto it = op_exec_info->attr_to_inputs.find(attr);
-  if (it == op_exec_info->attr_to_inputs.end()) {
-    // No other inputs - this should never happen.
-    Py_RETURN_NONE;
-  }
-
-  for (const auto& input_info : it->second) {
-    if (input_info.is_list) {
-      for (int i = 0; i < PySequence_Fast_GET_SIZE(input_info.item); i++) {
-        auto* dtype =
-            MaybeGetDType(PySequence_Fast_GET_ITEM(input_info.item, i));
-        if (dtype != nullptr) return dtype;
-      }
-    } else {
-      auto* dtype = MaybeGetDType(input_info.item);
-      if (dtype != nullptr) return dtype;
-    }
-  }
-
-  Py_RETURN_NONE;
-}
-
 bool OpDoesntRequireOutput(const string& op_name) {
   static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
       new tensorflow::gtl::FlatSet<string>({
@@ -1771,52 +1668,23 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
 //  i) input is an EagerTensor
 //  ii) input is a ResourceVariable - in this case, the is_variable param is set
 //  to true.
-//
-//  NOTE: dtype_hint_getter must *always* return a PyObject that can be
-//  decref'd. So if no hint is found, Py_RETURN_NONE (which correctly
-//  increfs Py_None).
-bool ConvertToTensor(
-    const FastPathOpExecInfo& op_exec_info, PyObject* input,
-    tensorflow::Safe_PyObjectPtr* output_handle,
-    // This gets a hint for this particular input.
-    const std::function<PyObject*()>& dtype_hint_getter,
-    // This sets the dtype after conversion is complete.
-    const std::function<void(const TF_DataType& dtype)>& dtype_setter,
-    TF_Status* status) {
-  if (EagerTensor_CheckExact(input)) {
-    Py_INCREF(input);
-    output_handle->reset(input);
-    return true;
-  } else if (CheckResourceVariable(input)) {
+bool ConvertToTensor(const FastPathOpExecInfo& op_exec_info, PyObject* input,
+                     tensorflow::Safe_PyObjectPtr* output_handle,
+                     TF_Status* status) {
+  if (CheckResourceVariable(input)) {
     return ReadVariableOp(op_exec_info, input, output_handle, status);
   }
 
-  // The hint comes from a supposedly similarly typed tensor.
-  tensorflow::Safe_PyObjectPtr dtype_hint(dtype_hint_getter());
-  if (PyErr_Occurred()) {
-    return false;
-  }
-
-  auto* handle = tensorflow::ConvertToEagerTensor(input, dtype_hint.get());
-  if (handle == nullptr) {
-    status->status = tensorflow::errors::InvalidArgument(
-        "Unable to convert value to tensor");
-    return false;
-  }
-
-  output_handle->reset(EagerTensorFromHandle(handle));
-
-  auto dtype_actual = TFE_TensorHandleDataType(handle);
-  dtype_setter(dtype_actual);
+  Py_INCREF(input);
+  output_handle->reset(input);
 
   return true;
 }
 
 // Adds input and type attr to the op, and to the list of flattened
 // inputs/attrs.
-bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
-                  const bool add_type_attr,
-                  const tensorflow::OpDef::ArgDef& input_arg,
+bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
+                  const tensorflow::OpDef::ArgDef* input_arg,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_attrs,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_inputs,
                   TFE_Op* op, TF_Status* status) {
@@ -1825,30 +1693,18 @@ bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
   // out of scope in this function.
   tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
 
-  if (!ConvertToTensor(
-          *op_exec_info, input, &py_eager_tensor,
-          [&]() {
-            if (input_arg.type() != tensorflow::DataType::DT_INVALID) {
-              return GetPythonObjectFromInt(input_arg.type());
-            }
-            return MaybeGetDTypeForAttr(input_arg.type_attr(), op_exec_info);
-          },
-          [&](const TF_DataType dtype) {
-            op_exec_info->cached_dtypes[input_arg.type_attr()] =
-                static_cast<tensorflow::DataType>(dtype);
-          },
-          status)) {
+  if (!ConvertToTensor(op_exec_info, input, &py_eager_tensor, status)) {
     return false;
   }
 
   TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
 
-  if (add_type_attr && !input_arg.type_attr().empty()) {
+  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
     auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg.type_attr().data(), dtype);
+    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
     if (flattened_attrs != nullptr) {
       flattened_attrs->emplace_back(
-          GetPythonObjectFromString(input_arg.type_attr().data()));
+          GetPythonObjectFromString(input_arg->type_attr().data()));
       flattened_attrs->emplace_back(PyLong_FromLong(dtype));
     }
   }
@@ -2036,23 +1892,6 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  // This can be cached somewhere.
-  // Store a list of InputIndex -> List of corresponding inputs.
-  for (int i = 0; i < op_def->input_arg_size(); i++) {
-    if (!op_def->input_arg(i).type_attr().empty()) {
-      auto it =
-          op_exec_info.attr_to_inputs.find(op_def->input_arg(i).type_attr());
-      if (it == op_exec_info.attr_to_inputs.end()) {
-        it = op_exec_info.attr_to_inputs
-                 .insert({op_def->input_arg(i).type_attr(), {}})
-                 .first;
-      }
-      it->second.emplace_back(
-          PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i),
-          !op_def->input_arg(i).number_attr().empty());
-    }
-  }
-
   TF_Status* status = TF_NewStatus();
   TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
@@ -2147,16 +1986,17 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
-                          true, input_arg, flattened_attrs.get(),
+        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          &input_arg, flattened_attrs.get(),
                           flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
-                            false, input_arg, nullptr /* flattened_attrs */,
+          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+                            nullptr /* input_arg */,
+                            nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
           }
@@ -2178,8 +2018,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             []() { Py_RETURN_NONE; },
-                             [](const TF_DataType& dtype) {}, status)) {
+                             status)) {
           return nullptr;
         }
 
@@ -2209,9 +2048,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
-      if (!AddInputToOp(&op_exec_info, input, true, input_arg,
-                        flattened_attrs.get(), flattened_inputs.get(), op,
-                        status)) {
+      if (!AddInputToOp(op_exec_info, input, &input_arg, flattened_attrs.get(),
+                        flattened_inputs.get(), op, status)) {
         return nullptr;
       }
     }
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index b044b30231..0bd5a5dbaf 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -278,9 +278,14 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
     with self.assertRaisesRegexp(
         TypeError,
-        r"tensors argument must be a list or a tuple. Got \"EagerTensor\""):
+        r"tensor_list argument must be a list. Got \"EagerTensor\""):
       pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2)
 
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"tensor_list argument must be a list. Got \"tuple\""):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2)
+
   def testNegativeSliceDim(self):
     t1 = _create_tensor([1, 2], dtype=dtypes.int32)
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index a0f0b289df..e3ca5a4977 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1384,22 +1384,6 @@ def register_tensor_conversion_function(base_type,
     if not callable(conversion_func):
       raise TypeError("conversion_func must be callable.")
 
-    # context._context is checked so that we don't inadvertently create it.
-    # This is because enable_eager_execution will fail when called from the main
-    # function if the context._context is already created, and the
-    # register_tensor_conversion_function calls happen when the module is
-    # imported.
-    if context._context is not None and context.executing_eagerly(
-    ) and isinstance(base_type, six.integer_types + (
-        float,
-        np.ndarray,
-    )):
-      # TODO(nareshmodi): consider setting a context variable which disables the
-      # fastpath instead.
-      raise TypeError(
-          "Cannot register conversions for numpy arrays, python number types "
-          "when executing eagerly.")
-
     try:
       funcs_at_priority = _tensor_conversion_func_registry[priority]
     except KeyError:
-- 
GitLab


From 06b6a5dc4db1f5fe68dc40e015d5812280856a55 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 14:08:50 -0700
Subject: [PATCH 0470/1262] Fix bug in loop optimizer. Reuse existing constant
 nodes.

PiperOrigin-RevId: 192185091
---
 .../grappler/optimizers/loop_optimizer.cc     | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 28ce2c7a55..fff06dd2ac 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -106,7 +106,7 @@ Status LoopInvariantNodeMotionOptimizer::HandleInvariantEnter(
 Status LoopInvariantNodeMotionOptimizer::HandleConst(NodeDef* node,
                                                      const int num_outputs,
                                                      const int frame_id) {
-  NodeDef* const_node;
+  NodeDef* const_node = nullptr;
   if (num_outputs == 0) {
     // all successor nodes are invariant
     // Remove the control inputs from this frame to the const node,
@@ -118,12 +118,17 @@ Status LoopInvariantNodeMotionOptimizer::HandleConst(NodeDef* node,
     // some successor nodes are variant
     // Have to keep the const node in the frame,
     // so create a new one outside the frame (in parent frame)
-    const_node = optimized_graph_->add_node();
-    const_node->set_name(AddPrefixToNodeName(node->name(), kLoopOptimizer));
-    const_node->set_op("Const");
-    const_node->set_device(node->device());
-    *const_node->mutable_attr() = node->attr();
-    node_map_->AddNode(const_node->name(), const_node);
+    const string const_node_name =
+        AddPrefixToNodeName(node->name(), kLoopOptimizer);
+    const_node = node_map_->GetNode(const_node_name);
+    if (const_node == nullptr) {
+      const_node = optimized_graph_->add_node();
+      const_node->set_name(const_node_name);
+      const_node->set_op("Const");
+      const_node->set_device(node->device());
+      *const_node->mutable_attr() = node->attr();
+      node_map_->AddNode(const_node->name(), const_node);
+    }
     auto consumers = node_map_->GetOutputs(node->name());
     for (auto* consumer : consumers) {
       if (invariant_nodes_.count(consumer)) {
-- 
GitLab


From 0bf3476f025184aeca747bd4d799b20419a14c87 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 14:12:05 -0700
Subject: [PATCH 0471/1262] Doccumentation fix for LossSpec.

PiperOrigin-RevId: 192185646
---
 tensorflow/python/estimator/canned/head.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index bb033d3495..189b81aeea 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -57,8 +57,8 @@ _PREDICT_SERVING_KEY = 'predict'
 
 # A LossSpec contains
 # * a scalar `Tensor` representing reduced weighted training loss
-# * a scalar `Tensor` representing the unreduced unweighted loss
-# * a scalar `Tensor` representing the example weights
+# * a `Tensor` representing the unreduced unweighted loss
+# * a `Tensor` representing the example weights
 # * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
 LossSpec = collections.namedtuple(
     'LossSpec', ['training_loss', 'unreduced_loss', 'weights',
@@ -163,8 +163,8 @@ class _Head(object):
     Returns:
       A LossSpec that contains
       * the scalar `Tensor` representing reduced weighted training loss
-      * the scalar `Tensor` representing the unreduced unweighted loss
-      * the scalar `Tensor` representing the example weights
+      * the `Tensor` representing the unreduced unweighted loss
+      * the `Tensor` representing the example weights
       * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
         etc.)
 
-- 
GitLab


From e60c87c978f7fbb848bc66ca3caa90ccdab8a9b9 Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Mon, 9 Apr 2018 14:15:41 -0700
Subject: [PATCH 0472/1262] Fix GPUDebugAllocator and GPUNanResetAllocator: 1.
 Eliminate the checks when the pointer is nullptr. 2. Fix nan array size. Some
 application can produce a zero size which would    cause out-of-range access
 error.

PiperOrigin-RevId: 192186224
---
 .../common_runtime/gpu/gpu_debug_allocator.cc | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 63ed0b8be1..b0ca7e3109 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -85,8 +85,8 @@ GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
 
 void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   num_bytes += (2 * MASK_BYTES);
-
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Return the pointer after the header
   void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;
@@ -102,11 +102,13 @@ void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return rv;
 }
 void GPUDebugAllocator::DeallocateRaw(void* ptr) {
-  CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
-  CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
+  if (ptr != nullptr) {
+    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
+    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
 
-  // Backtrack to the beginning of the header.
-  ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+    // Backtrack to the beginning of the header.
+    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+  }
   // Deallocate the memory
   base_allocator_->DeallocateRaw(ptr);
 }
@@ -168,10 +170,12 @@ GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
 
 void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Initialize the buffer to Nans
   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
+  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                          std::nanf(""));
   gpu::DeviceMemory<float> nan_ptr{
       gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
@@ -182,13 +186,16 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return allocated_ptr;
 }
 void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
-  // Reset the buffer to Nans
-  size_t req_size = base_allocator_->RequestedSize(ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
-  gpu::DeviceMemory<float> nan_ptr{
-      gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  if (ptr != nullptr) {
+    // Reset the buffer to Nans
+    size_t req_size = base_allocator_->RequestedSize(ptr);
+    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                            std::nanf(""));
+    gpu::DeviceMemory<float> nan_ptr{
+        gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
+      LOG(ERROR) << "Could not initialize to NaNs";
+    }
   }
 
   // Deallocate the memory
-- 
GitLab


From 9b18bd70b5739d646b21b7d45de0e5c96b8cc2a1 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 9 Apr 2018 14:26:55 -0700
Subject: [PATCH 0473/1262] Don't initialize global threadpool in GraphRunner.

TF_Graph creates a ShapeRefiner, which in
turn creates a GraphRunner, which prior to this change would eventually create a
LocalDevice that initialized the global eigen threadpool. This prevents
users from specifying a custom number of threads for the pool via a
ConfigProto.

This change introduces a new device class, SingleThreadedCpuDevice, that can
be used for light-weight computations without initializing the threadpool.

Addresses #18300.

PiperOrigin-RevId: 192188031
---
 tensorflow/core/BUILD                         |  1 +
 .../core/common_runtime/eigen_thread_pool.h   |  2 +
 .../core/common_runtime/graph_runner.cc       | 21 ++---
 .../single_threaded_cpu_device.h              | 82 +++++++++++++++++++
 4 files changed, 93 insertions(+), 13 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/single_threaded_cpu_device.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 13b74b852a..c5ca421ced 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2280,6 +2280,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/scoped_allocator.h",
     "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
+    "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
diff --git a/tensorflow/core/common_runtime/eigen_thread_pool.h b/tensorflow/core/common_runtime/eigen_thread_pool.h
index c6f13c6a11..ddd627fb20 100644
--- a/tensorflow/core/common_runtime/eigen_thread_pool.h
+++ b/tensorflow/core/common_runtime/eigen_thread_pool.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_
 #define TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_
 
+#define EIGEN_USE_THREADS
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/core/threadpool.h"
 
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index 1125d2a34a..790f2eaa1e 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// TODO(skyewm): this is necessary to make the single_threaded_cpu_device.h
+// include work. Some other include must be including eigen without defining
+// this. Consider defining in this in a BUILD rule.
+#define EIGEN_USE_THREADS
+
 #include "tensorflow/core/common_runtime/graph_runner.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -20,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_util.h"
@@ -36,18 +42,6 @@ namespace tensorflow {
 
 namespace {
 
-std::unique_ptr<Device> GetCPUDevice(Env* env) {
-  std::vector<Device*> devices;
-  SessionOptions session_options;
-  session_options.env = env;
-  Status s = DeviceFactory::GetFactory(DEVICE_CPU)
-                 ->CreateDevices(session_options, "", &devices);
-  if (s.ok() && !devices.empty()) {
-    return std::unique_ptr<Device>(devices[0]);
-  }
-  return nullptr;
-}
-
 // A simple rendezvous class.
 // Assumes a single sender and a single receiver, no duplicate sends, and no
 // sends of dead tensors.
@@ -98,7 +92,8 @@ class SimpleRendezvous : public Rendezvous {
 }  // namespace
 
 GraphRunner::GraphRunner(Env* env)
-    : device_deleter_(GetCPUDevice(env)), device_(device_deleter_.get()) {}
+    : device_deleter_(new SingleThreadedCpuDevice(env)),
+      device_(device_deleter_.get()) {}
 GraphRunner::GraphRunner(Device* device) : device_(device) {}
 
 GraphRunner::~GraphRunner() {}
diff --git a/tensorflow/core/common_runtime/single_threaded_cpu_device.h b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
new file mode 100644
index 0000000000..04d5af9087
--- /dev/null
+++ b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+
+class Env;
+
+// A simple single-threaded CPU device. This can be used to run inexpensive
+// computations. In particular, using this avoids initializing the global thread
+// pools in LocalDevice.
+class SingleThreadedCpuDevice : public Device {
+ public:
+  SingleThreadedCpuDevice(Env* env)
+      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
+                                                  Bytes(256 << 20),
+                                                  DeviceLocality())) {
+    eigen_worker_threads_.num_threads = 1;
+    eigen_worker_threads_.workers = new thread::ThreadPool(
+        env, "graph_runner", eigen_worker_threads_.num_threads);
+    eigen_threadpool_wrapper_.reset(
+        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    eigen_device_.reset(new Eigen::ThreadPoolDevice(
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
+    set_eigen_cpu_device(eigen_device_.get());
+  }
+
+  ~SingleThreadedCpuDevice() override {
+    eigen_threadpool_wrapper_.reset();
+    eigen_device_.reset();
+    delete eigen_worker_threads_.workers;
+  }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    Tensor parsed(tensor_proto.dtype());
+    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
+      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
+    }
+    *tensor = parsed;
+    return Status::OK();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return cpu_allocator();
+  }
+
+ private:
+  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
+  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
-- 
GitLab


From 265099d262225a4b54619ee591d261e8146051e4 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 9 Apr 2018 14:52:53 -0700
Subject: [PATCH 0474/1262] Tweak the context stack so init_scope works with
 eager Graphs

Previously breaking out into Graphs created with eager execution enabled would
enter the graph but not re-enable eager execution.

PiperOrigin-RevId: 192192109
---
 tensorflow/python/framework/ops.py      | 26 +++++++++++++++++++++++--
 tensorflow/python/framework/ops_test.py |  7 +++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e3ca5a4977..662cda2a7d 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import copy
+import functools
 import linecache
 import os
 import re
@@ -5244,14 +5245,35 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     try:
-      context.context().context_switches.push(default.building_function,
-                                              default.as_default)
+      if context.executing_eagerly():
+        # A Graph alone on the context stack would keep init_scope-wrapped
+        # operations graph building when entered (assuming init_scope is called
+        # in a graph building context). Instead, we push a context which first
+        # enables eager execution and then re-enters the Graph.
+        context.context().context_switches.push(
+            default.building_function,
+            functools.partial(
+                _enter_context_and_graph,
+                context.eager_mode,
+                default.as_default))
+      else:
+        # This Graph is being used from a graph building context. A lack of
+        # context switch implies that the context is graph building.
+        context.context().context_switches.push(default.building_function,
+                                                default.as_default)
       with super(_DefaultGraphStack, self).get_controller(default) as g:
         yield g
     finally:
       context.context().context_switches.pop()
 
 
+@tf_contextlib.contextmanager
+def _enter_context_and_graph(context_fn, graph_fn):
+  """Combines two context managers."""
+  with context_fn(), graph_fn():
+    yield
+
+
 _default_graph_stack = _DefaultGraphStack()
 
 
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 58bead91ed..c9c1a3d66b 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2305,6 +2305,13 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           self.assertEqual(ops.get_name_scope(), "inner")
       self.assertEqual(ops.get_name_scope(), "")
 
+  def testEagerGraphContextsExecuteEagerly(self):
+    with context.eager_mode():
+      with ops.Graph().as_default():
+        with context.graph_mode():
+          with ops.init_scope():
+            self.assertTrue(context.executing_eagerly())
+
   def testPreservesNameScopeInEagerExecution(self):
     with context.eager_mode():
       def foo():
-- 
GitLab


From ef3d7c93ee438ce943347ac7ae913680df23f5d3 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 9 Apr 2018 14:56:18 -0700
Subject: [PATCH 0475/1262] Acknowledges tape in graph mode doesn't like
 unknown shapes

PiperOrigin-RevId: 192192757
---
 tensorflow/python/kernel_tests/list_ops_test.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index d969f0e03a..6173a1def3 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -266,13 +265,10 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
       l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-      c2 = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      c2 = list_ops.tensor_list_stack(
+          l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
-    if context.in_eager_mode():
-      # TODO(b/77609620): Fix this in graph mode.
-      grad = tape.gradient(result, [c])[0]
-    else:
-      grad = gradients.gradients(result, [c])[0]
+    grad = tape.gradient(result, [c])[0]
     self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
 
   @test_util.run_in_graph_and_eager_modes()
-- 
GitLab


From b59d7b52bf119e07a1682e898e06d20356936a5d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 15:00:17 -0700
Subject: [PATCH 0476/1262] Don't run test flaky under TSAN.

PiperOrigin-RevId: 192193350
---
 tensorflow/compiler/tests/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index db93d6e76f..a7a8d2d1ff 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -200,6 +200,10 @@ tf_xla_py_test(
         "cpu",
         "cpu_ondemand",
     ],
+    tags = [
+        # Allocates very large amounts of memory and does not work under TSAN.
+        "notsan",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
-- 
GitLab


From a068b21fa50454009b9f5b69565f128875e57129 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Mon, 9 Apr 2018 15:02:21 -0700
Subject: [PATCH 0477/1262] [XLA] Use ThreadPool in a safer way.

ThreadPool joins its threads when it is destroyed, and there's no way to explicitly join. This means passing a ThreadPool and then scheduling in the callee is risky.

PiperOrigin-RevId: 192193752
---
 tensorflow/compiler/xla/shape_util.h | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index b9becf6452..1375f981a8 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -629,10 +629,7 @@ class ShapeUtil {
                                    tensorflow::gtl::ArraySlice<int64> count,
                                    tensorflow::gtl::ArraySlice<int64> incr,
                                    const FnType& visitor_function) {
-    const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
-    tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "test",
-                                        kNumThreads);
-    // If a pool is provided, ForEachIndexInternal can never fail.
+    // The parallel version of ForEachIndexInternal can never fail.
     CHECK(ForEachIndexInternal(
               shape, base, count, incr,
               [&visitor_function](tensorflow::gtl::ArraySlice<int64> indexes)
@@ -640,7 +637,7 @@ class ShapeUtil {
                 visitor_function(indexes);
                 return true;
               },
-              &pool)
+              /*parallel=*/true)
               .ok());
   }
 
@@ -650,11 +647,12 @@ class ShapeUtil {
   static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
 
   template <typename FnType>
-  static Status ForEachIndexInternal(
-      const Shape& shape, tensorflow::gtl::ArraySlice<int64> base,
-      tensorflow::gtl::ArraySlice<int64> count,
-      tensorflow::gtl::ArraySlice<int64> incr, const FnType& visitor_function,
-      tensorflow::thread::ThreadPool* pool = nullptr) {
+  static Status ForEachIndexInternal(const Shape& shape,
+                                     tensorflow::gtl::ArraySlice<int64> base,
+                                     tensorflow::gtl::ArraySlice<int64> count,
+                                     tensorflow::gtl::ArraySlice<int64> incr,
+                                     const FnType& visitor_function,
+                                     bool parallel = false) {
     if (ShapeUtil::HasZeroElements(shape)) {
       return Status::OK();
     }
@@ -666,10 +664,16 @@ class ShapeUtil {
     // once with the proper empty indexes.
     int64 n = -1;
     std::vector<int64> indexes(base.begin(), base.end());
+    const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
+    tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
+    if (parallel) {
+      pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
+    }
+
     while (n < rank) {
-      if (pool != nullptr) {
+      if (pool != tensorflow::gtl::nullopt) {
         pool->Schedule(
-            [indexes, visitor_function] { visitor_function(indexes); });
+            [indexes, &visitor_function] { visitor_function(indexes); });
       } else {
         TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
         if (!should_continue) {
-- 
GitLab


From fe0fc9c596594f87a1b46a65a8a4f469bc180e29 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Mon, 9 Apr 2018 15:22:18 -0700
Subject: [PATCH 0478/1262] Add bitcast-convert support to the evaluator and as
 a method on Literal.

PiperOrigin-RevId: 192197163
---
 tensorflow/compiler/xla/literal_util.cc       | 120 +++++++++++++-----
 tensorflow/compiler/xla/literal_util.h        |  12 +-
 tensorflow/compiler/xla/literal_util_test.cc  |  20 +++
 .../compiler/xla/service/hlo_evaluator.cc     |  16 +++
 tensorflow/compiler/xla/tests/BUILD           |   3 +
 5 files changed, 139 insertions(+), 32 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 13675b7d00..c2950c1faa 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -1409,6 +1409,28 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
       src_literal, converter);
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  auto converter = [](NativeSrcT src) {
+    return tensorflow::bit_cast<NativeDestT>(src);
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+// This template specialization is here to make the compiler happy. bit_cast has
+// a static check that the types are the same size. This specialization should
+// never be used because the source and destination types are checked for
+// identical sizes higher up.
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  LOG(FATAL) << "Invalid bitcast between types of different sizes.";
+}
+
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
@@ -1428,21 +1450,33 @@ std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
 }
 
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
+std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal,
+                                             bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-  return ConvertBetweenNativeTypes<
-      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
-      typename primitive_util::PrimitiveTypeToNative<
-          primitive_dest_type>::type>(src_literal);
+  if (bitcast) {
+    return BitcastBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  } else {
+    return ConvertBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  }
 }
 
 template <PrimitiveType primitive_src_type>
 StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+    const Literal& src_literal, PrimitiveType primitive_dest_type,
+    bool bitcast) {
   switch (primitive_dest_type) {
-#define CONVERT_IF_TYPES_MATCH(type) \
-  case (type):                       \
-    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
+#define CONVERT_IF_TYPES_MATCH(type)                                    \
+  case (type):                                                          \
+    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal, \
+                                                           bitcast);
     CONVERT_IF_TYPES_MATCH(PRED)
     CONVERT_IF_TYPES_MATCH(S8)
     CONVERT_IF_TYPES_MATCH(S32)
@@ -1456,28 +1490,31 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
     CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
-      return ConvertToC64<primitive_src_type>(src_literal);
+      if (!bitcast) {
+        return ConvertToC64<primitive_src_type>(src_literal);
+      }
+      break;
     // Other types are not yet supported.
     default:
-      return Unimplemented(
-          "Converting from type %s to type %s is not implemented.",
-          PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
-          PrimitiveType_Name(primitive_dest_type).c_str());
-  }
-}
-
-}  // namespace
-
-StatusOr<std::unique_ptr<Literal>> Literal::Convert(
-    PrimitiveType primitive_dest_type) const {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape()));
-  if (shape().element_type() == primitive_dest_type) {
-    return CloneToUnique();
+      break;
   }
-  switch (shape().element_type()) {
-#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
-  case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(*this, primitive_dest_type);
+  return Unimplemented(
+      "Converting from type %s to type %s is not implemented.",
+      PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
+      PrimitiveType_Name(primitive_dest_type).c_str());
+}
+
+StatusOr<std::unique_ptr<Literal>> ConvertSwitch(
+    const Literal& literal, PrimitiveType primitive_dest_type, bool bitcast) {
+  TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
+  if (literal.shape().element_type() == primitive_dest_type) {
+    return literal.CloneToUnique();
+  }
+  switch (literal.shape().element_type()) {
+#define CONVERT_IF_DEST_TYPE_MATCHES(type)                                \
+  case (type):                                                            \
+    return ConvertIfDestTypeMatches<(type)>(literal, primitive_dest_type, \
+                                            bitcast);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
@@ -1493,12 +1530,35 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
       // Other types are not yet supported.
     default:
       return Unimplemented(
-          "Converting from type %s to type %s is not implemented.",
-          PrimitiveType_Name(shape().element_type()).c_str(),
+          "%s from type %s to type %s is not implemented.",
+          (bitcast ? "Bitcast converting" : "Converting"),
+          PrimitiveType_Name(literal.shape().element_type()).c_str(),
           PrimitiveType_Name(primitive_dest_type).c_str());
   }
 }
 
+}  // namespace
+
+StatusOr<std::unique_ptr<Literal>> Literal::Convert(
+    PrimitiveType primitive_dest_type) const {
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false);
+}
+
+StatusOr<std::unique_ptr<Literal>> Literal::BitcastConvert(
+    PrimitiveType primitive_dest_type) const {
+  if (primitive_util::BitWidth(shape().element_type()) !=
+      primitive_util::BitWidth(primitive_dest_type)) {
+    return InvalidArgument(
+        "Cannot bitcast convert from %s to %s, bit widths are different: %d != "
+        "%d",
+        PrimitiveType_Name(shape().element_type()).c_str(),
+        PrimitiveType_Name(primitive_dest_type).c_str(),
+        primitive_util::BitWidth(shape().element_type()),
+        primitive_util::BitWidth(primitive_dest_type));
+  }
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true);
+}
+
 StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
     const Shape& dest_shape, bool round_f32_to_bf16) const {
   if (!ShapeUtil::IsTuple(dest_shape)) {
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 33abbdb813..66ff39ecbb 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -333,11 +333,19 @@ class Literal {
   template <typename NativeT>
   std::unique_ptr<Literal> Replicate(int64 times) const;
 
-  // Converts this literal to another primitive type. Returns an error if the
-  // conversion is not possible. This literal must be array-shaped.
+  // Converts this literal to another primitive type using
+  // static_cast<>. Returns an error if the conversion is not possible. This
+  // literal must be array-shaped.
   StatusOr<std::unique_ptr<Literal>> Convert(
       PrimitiveType primitive_dest_type) const;
 
+  // Converts this literal to another primitive type using a bitcast
+  // conversion. The to and from primitive types must have the same bit
+  // width. Returns an error if the conversion is not possible. This literal
+  // must be array-shaped.
+  StatusOr<std::unique_ptr<Literal>> BitcastConvert(
+      PrimitiveType primitive_dest_type) const;
+
   // Converts this literal to the given shape. Returns an error is the
   // conversion is not possible.
   //
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 8b000f44f7..be4f2bc5ce 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -1285,6 +1286,25 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
             tensorflow::error::UNIMPLEMENTED);
 }
 
+TEST_F(LiteralUtilTest, BitcastConvert) {
+  auto original =
+      Literal::CreateR1<uint32>({tensorflow::bit_cast<uint32>(2.5f),
+                                 tensorflow::bit_cast<uint32>(-42.25f),
+                                 tensorflow::bit_cast<uint32>(100.f), 0xbeef});
+  auto expected = Literal::CreateR1<float>(
+      {2.5f, -42.25f, 100.0f, tensorflow::bit_cast<float>(0xbeef)});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> converted,
+                          original->BitcastConvert(F32));
+}
+
+TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
+  auto literal = Literal::CreateR0<uint32>(1234);
+  Status status = literal->BitcastConvert(F64).status();
+  EXPECT_NE(Status::OK(), status);
+  EXPECT_TRUE(tensorflow::str_util::StrContains(status.error_message(),
+                                                "bit widths are different"));
+}
+
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   LiteralProto p;
   p.mutable_shape()->set_element_type(PRED);
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index b24757c33c..b4f9a9db9c 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -399,6 +399,22 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).BitcastConvert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
+    }
+    return Status::OK();
+  }
+
   Status HandleExp(HloInstruction* exp) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
                         ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 0276db9925..218345772f 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1476,6 +1476,9 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-- 
GitLab


From 130f44932fbfb3bef20911931de1eb263d55e992 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 9 Apr 2018 16:09:09 -0700
Subject: [PATCH 0479/1262] Hide slide_dataset from the new API.

PiperOrigin-RevId: 192204209
---
 tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt | 4 ++++
 tensorflow/tools/api/tests/api_compatibility_test.py          | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt

diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000..867116c5da
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SlideDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 26d5bca637..1ad6b6d1c0 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -205,8 +205,6 @@ class ApiCompatibilityTest(test.TestCase):
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    # TODO(annarev): Make slide_dataset available in API.
-    public_api_visitor.private_map['tf'] = ['slide_dataset']
     traverse.traverse(tf, public_api_visitor)
 
     proto_dict = visitor.GetProtos()
-- 
GitLab


From e346ac4faec2246c2d3972f158dea6aec858b904 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 16:24:56 -0700
Subject: [PATCH 0480/1262] [XLA] Redesign: implement infeed and outfeed.

- XlaBuilder::Infeed is basically ComputationBuilder::Infeed + UserComputation::AddInfeedInstruction + ComputationLowerer::Visit + HloInstruction::CreateInfeed.
- Similar for Outfeed.

PiperOrigin-RevId: 192206502
---
 .../xla/client/xla_client/xla_builder.cc      | 33 +++++++++++++++++--
 .../xla/tests/client_library_test_base.cc     | 11 +++++--
 .../xla/tests/client_library_test_base.h      |  4 +--
 3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 3d0cb35b48..ed9f994d39 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -781,12 +781,41 @@ XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
 }
 
 XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (!LayoutUtil::HasLayout(shape)) {
+      return InvalidArgument("Given shape to Infeed must have a layout");
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_infeed_config(config);
+    return AddInstruction(std::move(instr), HloOpcode::kInfeed);
+  });
 }
 
 void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
                          const string& outfeed_config) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+
+    // Check and set outfeed shape.
+    if (!LayoutUtil::HasLayout(shape_with_layout)) {
+      return InvalidArgument("Given shape to Outfeed must have a layout");
+    }
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) {
+      return InvalidArgument(
+          "Outfeed shape %s must be compatible with operand shape %s",
+          ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(),
+          ShapeUtil::HumanStringWithLayout(operand_shape).c_str());
+    }
+    *instr.mutable_outfeed_shape() = shape_with_layout;
+
+    instr.set_outfeed_config(outfeed_config);
+
+    return AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand});
+  });
 }
 
 XlaOp XlaBuilder::CustomCall(const string& call_target_name,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 9124ccdb46..c2e3cd2350 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -74,9 +74,9 @@ string ClientLibraryTestBase::TestName() const {
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
 
+template <typename BuilderT>
 StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
-    ComputationBuilder* builder,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   // Build the computation, as a convenience.
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
   return client_->Execute(computation, arguments, &execution_options_);
@@ -651,4 +651,11 @@ template void ClientLibraryTestBase::ComputeAndCompareTuple(
     XlaBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    ComputationBuilder* builder,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 80e1bbbae8..0572acff88 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -92,9 +92,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Convenience methods for building and running a computation with the member
   // execution options. Modify execution_options_ in your test if you want to
   // customize the options.
+  template <typename BuilderT>
   StatusOr<std::unique_ptr<GlobalData>> Execute(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+      BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // TODO(b/74197823): Remove the template type 'BuilderT' in all methods once
   // the migration to XlaBuilder is complete.
-- 
GitLab


From 66a601eece46e91c7c19cb22ebe526cf8b2253d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 16:43:02 -0700
Subject: [PATCH 0481/1262] Internal Change

PiperOrigin-RevId: 192209093
---
 tensorflow/python/keras/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 57f5097639..f6e1d0eec3 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -609,6 +609,7 @@ py_test(
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_windows",
         "noasan",  # times out
         "notsan",
-- 
GitLab


From 26e36ec2c9fb061e7349b2259bc69b2140d18819 Mon Sep 17 00:00:00 2001
From: Patrick Nguyen <drpng@google.com>
Date: Mon, 9 Apr 2018 16:55:06 -0700
Subject: [PATCH 0482/1262] Export recurrent and its RNN implementation in
 tf.contrib.

PiperOrigin-RevId: 192210794
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/python_modules.txt   |   4 +
 tensorflow/contrib/recurrent/BUILD            | 106 +++
 tensorflow/contrib/recurrent/README.md        |  13 +
 .../kernel_tests/functional_rnn_test.py       | 163 ++++
 .../python/kernel_tests/recurrent_test.py     | 192 +++++
 .../recurrent/python/ops/functional_rnn.py    | 396 ++++++++++
 .../contrib/recurrent/python/ops/recurrent.py | 720 ++++++++++++++++++
 .../contrib/recurrent/python/recurrent_api.py |  29 +
 10 files changed, 1625 insertions(+)
 create mode 100644 tensorflow/contrib/recurrent/BUILD
 create mode 100644 tensorflow/contrib/recurrent/README.md
 create mode 100644 tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
 create mode 100644 tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
 create mode 100644 tensorflow/contrib/recurrent/python/ops/functional_rnn.py
 create mode 100644 tensorflow/contrib/recurrent/python/ops/recurrent.py
 create mode 100644 tensorflow/contrib/recurrent/python/recurrent_api.py

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index bf69144ad8..9bef0d8b61 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -81,6 +81,7 @@ py_library(
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
+        "//tensorflow/contrib/recurrent:recurrent_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 1c5b00f92e..aaddb06fa0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -66,6 +66,7 @@ from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
+from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index b786c6d5cb..340be61971 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -367,6 +367,10 @@ tensorflow/contrib/receptive_field
 tensorflow/contrib/receptive_field/python
 tensorflow/contrib/receptive_field/python/util
 tensorflow/contrib/receptive_field/python/util/examples
+tensorflow/contrib/recurrent
+tensorflow/contrib/recurrent/python
+tensorflow/contrib/recurrent/python/ops
+tensorflow/contrib/recurrent/python/kernel_tests
 tensorflow/contrib/reduce_slice_ops
 tensorflow/contrib/reduce_slice_ops/kernels
 tensorflow/contrib/reduce_slice_ops/ops
diff --git a/tensorflow/contrib/recurrent/BUILD b/tensorflow/contrib/recurrent/BUILD
new file mode 100644
index 0000000000..b3cb04ce26
--- /dev/null
+++ b/tensorflow/contrib/recurrent/BUILD
@@ -0,0 +1,106 @@
+# Recurrent library.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "recurrent_py",
+    srcs = ["python/recurrent_api.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":functional_rnn_ops_py",
+        ":recurrent_ops_py",
+    ],
+)
+
+py_library(
+    name = "recurrent_ops_py",
+    srcs = ["python/ops/recurrent.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "functional_rnn_ops_py",
+    srcs = ["python/ops/functional_rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":recurrent_ops_py",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+cuda_py_tests(
+    name = "recurrent_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/recurrent_test.py"],
+    additional_deps = [
+        ":recurrent_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
+
+cuda_py_tests(
+    name = "functional_rnn_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/functional_rnn_test.py"],
+    additional_deps = [
+        ":functional_rnn_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/tpu:tpu",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
diff --git a/tensorflow/contrib/recurrent/README.md b/tensorflow/contrib/recurrent/README.md
new file mode 100644
index 0000000000..86e10eee51
--- /dev/null
+++ b/tensorflow/contrib/recurrent/README.md
@@ -0,0 +1,13 @@
+# Recurrent computation library
+
+The recurrent computation library contains code to perform recurrent
+computations.
+
+Its chief application is to implement recurrent neural networks (RNNs, LSTMs,
+etc), which is implemented in `functional_rnn.py`. Similar techniques may be
+used to implement deep networks.
+
+The computation saves the activations in the forward pass, and computes the
+gradients in the backward pass using a single accumulator.
+
+The `functional_rnn` interface is compatible with the `dynamic_rnn` API.
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
new file mode 100644
index 0000000000..0f19ac7dbe
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
@@ -0,0 +1,163 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Functional RNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import rnn as rnn_lib
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variables
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+def _CreateStackedLstmCell(*cell_sizes):
+  subcells = [rnn_cell_impl.LSTMCell(cell_size) for cell_size in cell_sizes]
+  return rnn_cell_impl.MultiRNNCell(subcells)
+
+
+class FunctionalRnnTest(test_util.TensorFlowTestCase):
+
+  _BATCH_SIZE = 3
+  _TOTAL_TIME = 5
+  _INPUT_SIZE = 11
+  _NUM_UNITS = 7
+
+  # Set this to some output if you want to use it.
+  _LSTM_GRAPH_DEF_FILEPATH = None
+
+  _CELLDEFS = {
+      'gru': (rnn_cell_impl.GRUCell, [_NUM_UNITS]),
+      'lstm': (rnn_cell_impl.LSTMCell, [_NUM_UNITS]),
+      'stacked_lstm': (_CreateStackedLstmCell, [_NUM_UNITS] * 3)
+  }
+
+  def _CreateCell(self, celldef_name):
+    func, args = self._CELLDEFS[celldef_name]
+    return func(*args)
+
+  def _CreateInputs(self):
+    inputs = np.random.random([FunctionalRnnTest._BATCH_SIZE,
+                               FunctionalRnnTest._TOTAL_TIME,
+                               FunctionalRnnTest._INPUT_SIZE])
+    # Always leave one time slot empty, to check max_length behavior.
+    sequence_length = np.random.randint(
+        0, high=FunctionalRnnTest._TOTAL_TIME - 1,
+        size=FunctionalRnnTest._BATCH_SIZE,
+        dtype=np.int)
+    return (inputs, sequence_length)
+
+  def _CreateRnnGraph(self, create_rnn_computation_func, cell, tf_inputs,
+                      tf_sequence_length, initial_state=None,
+                      time_major=None, scope=None):
+    tf_result = create_rnn_computation_func(cell=cell, inputs=tf_inputs,
+                                            sequence_length=tf_sequence_length,
+                                            initial_state=initial_state,
+                                            dtype=dtypes.float32,
+                                            time_major=time_major,
+                                            scope=scope)
+    grad = gradients_impl.gradients(tf_result, variables.trainable_variables())
+    return {'inference': tf_result, 'grad': grad}
+
+  def _MaybeResetVariables(self, variable_cache, sess, var_list):
+    """Possibly resets the variables to a previously seen value."""
+    reset_ops = []
+    fetches = []
+    for var in var_list:
+      if var.name in variable_cache:
+        reset_ops += [var.assign(variable_cache[var.name])]
+      else:
+        fetches += [(var.name, var)]
+    if reset_ops:
+      sess.run(reset_ops)
+    if fetches:
+      val = sess.run(dict(fetches))
+      for n, v in val.items():
+        assert n not in variable_cache
+        variable_cache[n] = v
+
+  def _RunRnn(self, numpy_inputs, numpy_slen, cell_name, variable_cache,
+              is_dynamic):
+    with ops.Graph().as_default() as graph:
+      tf_inputs = array_ops.placeholder(
+          dtypes.float32, shape=numpy_inputs.shape)
+      tf_slen = array_ops.placeholder(dtypes.int32)
+      feeds = {tf_inputs: numpy_inputs, tf_slen: numpy_slen}
+      cell = self._CreateCell(cell_name)
+      fn = rnn_lib.dynamic_rnn if is_dynamic else functional_rnn.functional_rnn
+      fetches = self._CreateRnnGraph(fn, cell, tf_inputs, tf_slen)
+      with self.test_session(graph=graph) as sess:
+        sess.run(variables.global_variables_initializer())
+        # Note that cell.trainable_variables it not always set.
+        self._MaybeResetVariables(variable_cache, sess,
+                                  variables.trainable_variables())
+        val = sess.run(fetches, feed_dict=feeds)
+      graph_def = graph.as_graph_def()
+      return graph_def, val
+
+  def testRunLstm(self):
+    """Runs a simple LSTM. Does not check output."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    graphdef, _ = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    logging.info('graphdef: %s', graphdef)
+    if self._LSTM_GRAPH_DEF_FILEPATH:
+      with open(self._LSTM_GRAPH_DEF_FILEPATH, 'w') as f:
+        f.write(str(graphdef))
+
+  def testLstm(self):
+    """Checks an LSTM against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testGru(self):
+    """Checks a GRU cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testStackedLstm(self):
+    """Checks a stacked LSTM cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    args = [np_inputs, np_slen, 'stacked_lstm', var_cache]
+    _, func_rnn = self._RunRnn(*(args + [False]))
+    _, dyn_rnn = self._RunRnn(*(args + [True]))
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
new file mode 100644
index 0000000000..00fbd4fbb8
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Recurrent ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+_ElmanState = collections.namedtuple('ElmanState', ('h'))
+_ElmanTheta = collections.namedtuple('ElmanTheta', ('w', 'b'))
+_ElmanInputs = collections.namedtuple('ElmanInputs', ('x'))
+
+
+# TODO(drpng): add test for max length computation.
+class RecurrentTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    # pylint:disable=invalid-name
+    _PolyState = collections.namedtuple('PolyState', ('value', 'x_power'))
+    _PolyTheta = collections.namedtuple('PolyTheta', ('x'))
+    _PolyInputs = collections.namedtuple('PolyInputs', ('coeff'))
+    # pylint:enable=invalid-name
+
+    def Poly(theta, state, inputs):
+      next_state = _PolyState(
+          value=state.value + inputs.coeff * state.x_power,
+          x_power=state.x_power * theta.x)
+      return next_state, []
+
+    with self.test_session() as sess:
+      theta = _PolyTheta(x=array_ops.constant(2.0))
+      state = _PolyState(
+          value=array_ops.constant(0.0),
+          x_power=array_ops.constant(1.0))
+      inputs = _PolyInputs(coeff=array_ops.constant([1., 2., 3.]))
+
+      # x = 2
+      # 1 + 2*x + 3*x^2
+      ret = recurrent.Recurrent(theta, state, inputs, Poly)
+
+      acc, state = sess.run(ret)
+      self.assertAllClose(acc.value, [1., 5., 17.])
+      self.assertAllClose(acc.x_power, [2., 4., 8.])
+      self.assertAllClose(state.value, 17.)
+      self.assertAllClose(state.x_power, 8.)
+
+      y = ret[1].value
+      dx, d_coeff = gradients_impl.gradients(ys=[y], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+
+      # 2 + 6*x
+      self.assertAllClose(dx_val, 14.)
+      self.assertAllClose(d_coeff_val, [1., 2., 4.])
+
+      # acc = [1, 1+2x, 1+2x+3x^2]
+      # sum(acc) = 3 + 4x + 3x^2
+      acc = ret[0].value
+      dx, d_coeff = gradients_impl.gradients(
+          ys=[math_ops.reduce_sum(acc)], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+      # 4 + 6*x
+      self.assertAllClose(dx_val, 16.)
+      self.assertAllClose(d_coeff_val, [3., 4., 4.])
+
+  @staticmethod
+  def Rand(shape):
+    return random_ops.random_uniform(
+        shape, minval=-0.2, maxval=0.2, dtype=dtypes.float64)
+
+  @staticmethod
+  def Elman(theta, state0, inputs):
+    h0, w, b, x = state0.h, theta.w, theta.b, inputs.x
+    xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+    h1 = math_ops.sigmoid(xw + b)
+    state1 = _ElmanState(h=h1)
+    return (state1, state1)
+
+  @staticmethod
+  def ElmanGrad(theta, state0, inputs, extras, dstate1):
+
+    @function.Defun()
+    def Grad(h0, w, b, x, h1, dh1):
+      del b
+      # We hand-roll the gradient for the 2nd half of the cell as a demo.
+      dxwb = (dh1 * (1 - h1) * h1)
+      dxw, db = dxwb, math_ops.reduce_sum(dxwb, axis=0)
+
+      # Uses tf.gradient for the 1nd half of the cell as a demo.
+      xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+      dh0, dx, dw = gradients_impl.gradients(
+          ys=[xw], xs=[h0, x, w], grad_ys=[dxw])
+
+      return dh0, dx, dw, db
+
+    dh0, dx, dw, db = Grad(state0.h, theta.w, theta.b, inputs.x,
+                           extras.h, dstate1.h)
+    dstate0 = _ElmanState(h=dh0)
+    dinputs = _ElmanInputs(x=dx)
+    return (_ElmanTheta(w=dw, b=db), dstate0, dinputs)
+
+  @staticmethod
+  def ElmanOut(state1):
+    return _ElmanState(x=state1.h)
+
+  @staticmethod
+  def ElmanOutGrad(dout):
+    return _ElmanState(h=dout.x)
+
+  def testElman(self):
+    for seqlen, use_grad in [(1, False), (1, True), (7, False), (7, True)]:
+      logging.info('== Elman: seqlen=%s, use_grad=%s', seqlen, use_grad)
+      self._ParameterizedTestElman(seqlen, use_grad)
+
+  def _ParameterizedTestElman(self, seqlen, use_grad):
+
+    with self.test_session() as sess:
+      random_seed.set_random_seed(342462)
+
+      batch = 3
+      dims = 4
+      theta = _ElmanTheta(w=RecurrentTest.Rand([2 * dims, dims]),
+                          b=RecurrentTest.Rand([dims]))
+      state0 = _ElmanState(h=RecurrentTest.Rand([batch, dims]))
+      inputs = _ElmanInputs(x=RecurrentTest.Rand([seqlen, batch, dims]))
+
+      # Statically unrolled.
+      s = state0
+      out = []
+      for i in xrange(seqlen):
+        inp = _ElmanInputs(x=inputs.x[i, :])
+        s, _ = RecurrentTest.Elman(theta, s, inp)
+        out += [s.h]
+      acc0, final0 = array_ops.stack(out), s.h
+      loss0 = math_ops.reduce_sum(acc0) + math_ops.reduce_sum(final0)
+      (dw0, db0, dh0, di0) = gradients_impl.gradients(
+          loss0, [theta.w, theta.b, state0.h, inputs.x])
+
+      acc1, final1 = recurrent.Recurrent(
+          theta=theta,
+          state0=state0,
+          inputs=inputs,
+          cell_fn=RecurrentTest.Elman,
+          cell_grad=RecurrentTest.ElmanGrad if use_grad else None)
+      assert isinstance(acc1, _ElmanState)
+      assert isinstance(final1, _ElmanState)
+      acc1, final1 = acc1.h, final1.h
+      loss1 = math_ops.reduce_sum(acc1) + math_ops.reduce_sum(final1)
+      (dw1, db1, dh1, di1) = gradients_impl.gradients(
+          loss1, [theta.w, theta.b, state0.h, inputs.x])
+
+      # Fetches a few values and compare them.
+      (acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0,
+       di1) = sess.run(
+           [acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0, di1])
+      self.assertAllClose(acc0, acc1)
+      self.assertAllClose(final0, final1)
+      self.assertAllClose(dw0, dw1)
+      self.assertAllClose(db0, db1)
+      self.assertAllClose(dh0, dh1)
+      self.assertAllClose(di0, di1)
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
new file mode 100644
index 0000000000..a085474c1b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
@@ -0,0 +1,396 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tf.nn.dynamic_rnn variant, built on the Recurrent class.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def _GetDTypesFromStructure(struct):
+  dtypes_list = []
+  for x in nest.flatten(struct):
+    x = ops.convert_to_tensor(x)
+    dtypes_list.append(x.dtype)
+  return dtypes_list
+
+
+def _SetShapeFromTemplate(struct, struct_template):
+  as_list = nest.flatten(struct)
+  template_as_list = nest.flatten(struct_template)
+  for element, template in zip(as_list, template_as_list):
+    element.set_shape(template.shape)
+
+
+class _FunctionalRnnCell(object):
+  """Wrapper around RNNCell which separates state from computation.
+
+  This class accomplishes the following:
+  * Turn the cell's `__call__` function into a pure function. The global
+    side effects are separated as `theta`. They are the variables created
+    for the weights of the computation.
+  * Unless the output is aliased as part of the state, extend the state to
+    contain the output so that we store the history in `Recurrent`.
+  * Set static shapes as required.
+  """
+
+  def __init__(self, rnn_cell, seq_inputs, initial_state):
+    assert initial_state is not None
+
+    # TODO(drpng): Dtype needs to be configurable.
+    input_dtypes = [dtypes.float32] + _GetDTypesFromStructure(initial_state)
+    # See _index.
+    like_inputs_t = nest.map_structure(
+        lambda x: array_ops.stop_gradient(array_ops.gather(x, 0)), seq_inputs)
+    input_structure = (like_inputs_t, initial_state)
+
+    @function.Defun(*input_dtypes)
+    def FlatCellStep(*flat_inputs):
+      """The flattened version of `rnn_cell`."""
+      inputs_t, state0 = nest.pack_sequence_as(input_structure, flat_inputs)
+      _SetShapeFromTemplate(state0, initial_state)
+      _SetShapeFromTemplate(inputs_t, like_inputs_t)
+      outputs_t, state1 = rnn_cell(inputs_t, state0)
+      state_list = nest.flatten(state1)
+      self._output_shape = outputs_t.shape
+
+      if outputs_t in state_list:
+        output_index_in_state = state_list.index(outputs_t)
+      else:
+        output_index_in_state = None
+
+      if output_index_in_state is None:
+        self._prepend_output = True
+        self._output_state_idx = 0
+        return [outputs_t] + state_list
+      else:
+        self._output_state_idx = output_index_in_state
+        self._prepend_output = False
+        # To save memory, we don't store return the output separately
+        # from the state list, since we know it's the same.
+        return state_list
+
+    def _ToPureFunction(func):
+      # NOTE: This forces the creating of the function.
+      if func.captured_inputs:
+        pure_func = copy.copy(func)
+        # pylint: disable=protected-access
+        pure_func._extra_inputs = []
+        return pure_func
+      return func
+
+    pure_flat_cell_step = _ToPureFunction(FlatCellStep)
+
+    def CellStep(theta, extended_state0, inputs_t):
+      """Performs one time steps on structured inputs.
+
+      The purpose of this function is to turn the parameters into flattened
+      versions, and to resolve the parameter order difference between
+      `Recurrent` and `RNNCell`.
+
+      In the event the cell returns a transformed output that is not aliased
+      within its state, the `extended_state0` also contains the output as its
+      first element.
+
+      Args:
+        theta: Weights required for the computation. A structure of tensors.
+        extended_state0: the state0, and possibly the output at the previous
+          time step. A structure of tensors.
+        inputs_t: the inputs at time t.
+
+      Returns:
+        A pair of the next state (inclusive of the output), and an empty list
+        (unused `extras`).
+        The next state is congruent to state0.
+      """
+      extended_state0_flat = nest.flatten(extended_state0)
+      state0_flat = self.MaybeRemoveOutputFromState(extended_state0_flat)
+      full_inputs = [inputs_t] + state0_flat + theta
+      # Note that the thetas are additional inputs appeneded as extra
+      # parameters.
+      cell_out = pure_flat_cell_step(*full_inputs)
+      return cell_out, []
+
+    self._cell_step = CellStep
+    self._theta = FlatCellStep.captured_inputs
+    self._zero_state = rnn_cell.zero_state
+    self._state_template = initial_state
+    self._output_size = rnn_cell.output_size
+
+  @property
+  def extended_initial_state(self):
+    if self._prepend_output:
+      return [array_ops.zeros(self._output_shape), self._state_template]
+    else:
+      # The base case, where the output is just the hidden state.
+      return self._state_template
+
+  @property
+  def cell_step(self):
+    return self._cell_step
+
+  @property
+  def theta(self):
+    return self._theta
+
+  @property
+  def state_template(self):
+    return self._state_template
+
+  @property
+  def output_shape(self):
+    return self._output_shape
+
+  def GetOutputFromState(self, state):
+    return nest.flatten(state)[self._output_state_idx]
+
+  def MaybeRemoveOutputFromState(self, flat_state):
+    if self._prepend_output:
+      return flat_state[1:]
+    return flat_state
+
+
+def _ApplyLengthsToBatch(sequence_lengths, tf_output):
+  # TODO(drpng): just use Update so that we don't carry over the gradients?
+  """Sets the output to be zero at the end of the sequence."""
+  # output is batch major.
+  batch_size, max_time, vector_size = tf_output.shape
+  output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+  output_time = array_ops.reshape(output_time, [batch_size, max_time])
+  lengths = array_ops.tile(
+      array_ops.reshape(sequence_lengths, [-1, 1]), [1, max_time])
+  is_less = math_ops.cast(
+      math_ops.less(output_time, lengths), dtype=dtypes.float32)
+  keep_mask = array_ops.tile(
+      array_ops.expand_dims(is_less, -1),
+      [1, 1, vector_size])
+  final_output = keep_mask * tf_output
+  return final_output
+
+
+def _PickFinalStateFromHistory(acc_state, sequence_length):
+  """Implements acc_state[sequence_length - 1]."""
+  # This will work on all platforms, unlike the regular slice.
+  last_value = []
+  for state_var in nest.flatten(acc_state):
+    # We compute the following with matrix operations:
+    # last_var = state_var[sequence_length - 1]
+    shape = array_ops.shape(state_var)
+    max_time, batch_size = shape[0], shape[1]
+    output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+    output_time = array_ops.reshape(output_time, [batch_size, max_time])
+    lengths = array_ops.tile(array_ops.reshape(sequence_length,
+                                               [-1, 1]), [1, max_time])
+    last_idx = math_ops.cast(math_ops.equal(output_time, lengths - 1),
+                             dtype=dtypes.float32)
+    last_idx = array_ops.transpose(last_idx)
+    last_idx_for_bcast = array_ops.expand_dims(last_idx, -1)
+    sliced = math_ops.multiply(last_idx_for_bcast, state_var)
+    last_var = math_ops.reduce_sum(sliced, 0)
+    last_value += [last_var]
+  return nest.pack_sequence_as(acc_state, last_value)
+
+
+def _PostProcessOutput(extended_acc_state, extended_final_state, func_cell,
+                       total_time, inputs_lengths):
+  """Post-process output of recurrent.
+
+  This function takes the accumulated extended state and extracts the requested
+  state and output.
+
+  When `inputs_lengths` has been set, it extracts the output from the
+  accumulated state. It also sets outputs past.
+
+  It also sets the static shape information.
+
+  Args:
+    extended_acc_state: A structure containing the accumulated state at each
+      time. It may contain the output at each time as well.
+    extended_final_state: A structure containing the final state. It may
+      contain the output at the final time.
+    func_cell: The functional wrapper around the cell.
+    total_time: A scalar integer tensor.
+    inputs_lengths: An integer tensor with one entry per input.
+
+  Returns:
+    A tuple with the outputs at each time, and the final state.
+  """
+  if inputs_lengths is None:
+    flat_final_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_final_state))
+    tf_state = nest.pack_sequence_as(func_cell.state_template, flat_final_state)
+  else:
+    # The accumulated state is over the entire sequence, so we pick it
+    # out from the acc_state sequence.
+    flat_acc_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_acc_state))
+    acc_state = nest.pack_sequence_as(
+        func_cell.state_template, flat_acc_state)
+    tf_state = _PickFinalStateFromHistory(acc_state, inputs_lengths)
+
+  output_from_state = func_cell.GetOutputFromState(extended_acc_state)
+  tf_output = array_ops.transpose(output_from_state, [1, 0, 2])
+  tf_output.set_shape(
+      [func_cell.output_shape[0], total_time, func_cell.output_shape[1]])
+  if inputs_lengths is not None:
+    # Need set the outputs to zero.
+    tf_output = _ApplyLengthsToBatch(inputs_lengths, tf_output)
+    # tf_output = array_ops.zeros([4, 3, 5])
+  _SetShapeFromTemplate(tf_state, func_cell.state_template)
+  return tf_output, tf_state
+
+
+# pylint: disable=invalid-name
+def functional_rnn(cell, inputs, sequence_length=None,
+                   initial_state=None, dtype=None, time_major=False,
+                   scope=None, use_tpu=False):
+  """Same interface as `tf.nn.dynamic_rnn`."""
+  with variable_scope.variable_scope(scope or 'rnn'):
+    if not time_major:
+      inputs = nest.map_structure(
+          lambda t: array_ops.transpose(t, [1, 0, 2]), inputs)
+    inputs_flat = nest.flatten(inputs)
+    batch_size = array_ops.shape(inputs_flat[0])[1]
+    if initial_state is None:
+      initial_state = cell.zero_state(batch_size, dtype)
+    func_cell = _FunctionalRnnCell(cell, inputs, initial_state)
+  extended_acc_state, extended_final_state = recurrent.Recurrent(
+      theta=func_cell.theta,
+      state0=func_cell.extended_initial_state,
+      inputs=inputs,
+      cell_fn=func_cell.cell_step,
+      use_tpu=use_tpu)
+  return _PostProcessOutput(extended_acc_state, extended_final_state,
+                            func_cell, inputs_flat[0].shape[0], sequence_length)
+
+
+def bidirectional_functional_rnn(
+    cell_fw,
+    cell_bw,
+    inputs,
+    initial_state_fw=None,
+    initial_state_bw=None,
+    dtype=None,
+    sequence_length=None,
+    time_major=False,
+    use_tpu=False,
+    scope=None):
+  """Creates a bidirectional recurrent neural network.
+
+  Performs fully dynamic unrolling of inputs in both directions. Built to be API
+  compatible with `tf.nn.bidirectional_dynamic_rnn`, but implemented with
+  functional control flow for TPU compatibility.
+
+  Args:
+    cell_fw: An instance of `tf.contrib.rnn.RNNCell`.
+    cell_bw: An instance of `tf.contrib.rnn.RNNCell`.
+    inputs: The RNN inputs. If time_major == False (default), this must be a
+      Tensor (or hierarchical structure of Tensors) of shape
+      [batch_size, max_time, ...]. If time_major == True, this must be a Tensor
+      (or hierarchical structure of Tensors) of shape:
+      [max_time, batch_size, ...]. The first two dimensions must match across
+      all the inputs, but otherwise the ranks and other shape components may
+      differ.
+    initial_state_fw: An optional initial state for `cell_fw`. Should match
+      `cell_fw.zero_state` in structure and type.
+    initial_state_bw: An optional initial state for `cell_bw`. Should match
+      `cell_bw.zero_state` in structure and type.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_states are not provided or RNN state has a
+      heterogeneous dtype.
+    sequence_length: An optional int32/int64 vector sized [batch_size]. Used to
+      copy-through state and zero-out outputs when past a batch element's
+      sequence length. So it's more for correctness than performance.
+    time_major: Whether the `inputs` tensor is in "time major" format.
+    use_tpu: Whether to enable TPU-compatible operation. If True, does not truly
+      reverse `inputs` in the backwards RNN. Once b/69305369 is fixed, we can
+      remove this flag.
+    scope: An optional scope name for the dynamic RNN.
+
+  Returns:
+    outputs: A tuple of `(output_fw, output_bw)`. The output of the forward and
+      backward RNN. If time_major == False (default), these will
+      be Tensors shaped: [batch_size, max_time, cell.output_size]. If
+      time_major == True, these will be Tensors shaped:
+      [max_time, batch_size, cell.output_size]. Note, if cell.output_size is a
+      (possibly nested) tuple of integers or TensorShape objects, then the
+      output for that direction will be a tuple having the same structure as
+      cell.output_size, containing Tensors having shapes corresponding to the
+      shape data in cell.output_size.
+    final_states: A tuple of `(final_state_fw, final_state_bw)`. A Tensor or
+      hierarchical structure of Tensors indicating the final cell state in each
+      direction. Must have the same structure and shape as cell.zero_state.
+
+  Raises:
+    ValueError: If `initial_state_fw` is None or `initial_state_bw` is None and
+      `dtype` is not provided.
+  """
+  # Keep this code in sync with tf.nn.dynamic_rnn for compatibility.
+  with variable_scope.variable_scope(scope or 'bidirectional_rnn'):
+    # Forward direction
+    with variable_scope.variable_scope('fw') as fw_scope:
+      output_fw, output_state_fw = functional_rnn(
+          cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
+          initial_state=initial_state_fw, dtype=dtype,
+          time_major=time_major, scope=fw_scope, use_tpu=use_tpu)
+    # Backward direction
+    if not time_major:
+      time_dim = 1
+      batch_dim = 0
+    else:
+      time_dim = 0
+      batch_dim = 1
+
+    def _reverse(input_, seq_lengths, seq_dim, batch_dim):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_, seq_lengths=seq_lengths,
+            seq_dim=seq_dim, batch_dim=batch_dim)
+      else:
+        # See b/69305369.
+        assert not use_tpu, (
+            'Bidirectional with variable sequence lengths unsupported on TPU')
+        return array_ops.reverse(input_, axis=[seq_dim])
+
+    with variable_scope.variable_scope('bw') as bw_scope:
+      inputs_reverse = _reverse(
+          inputs, seq_lengths=sequence_length,
+          seq_dim=time_dim, batch_dim=batch_dim)
+      tmp, output_state_bw = functional_rnn(
+          cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
+          initial_state=initial_state_bw, dtype=dtype,
+          time_major=time_major, scope=bw_scope, use_tpu=use_tpu)
+
+  output_bw = _reverse(
+      tmp, seq_lengths=sequence_length,
+      seq_dim=time_dim, batch_dim=batch_dim)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
+# pylint: enable=invalid-name
diff --git a/tensorflow/contrib/recurrent/python/ops/recurrent.py b/tensorflow/contrib/recurrent/python/ops/recurrent.py
new file mode 100644
index 0000000000..fa16b82ab6
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/recurrent.py
@@ -0,0 +1,720 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent computation.
+
+The main interface of this module is Recurrent().
+A recurrent computation describes an auto-regressive process, where outputs
+of one time step are fed to the output of the next time step.
+
+This module uses:
+  theta: the "weights" each RNN uses.
+  state0: the initial state of each RNN.
+  cell_fn: A python function describing RNN cell. It must has the following
+    signature:
+         cell_fn: (theta, state0, inputs) -> (state1, extras)
+    state1 is the next RNN state, extras are computed by cell_fn
+    and the library forwards extras to cell_fn's gradient function.
+  cell_grad: A python function describing the backprop gradient function
+    for the RNN cell. It must has the following signature:
+         cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+    dstate1 is what the backprop algorithm provides representing
+    gradients of state1 w.r.t. the final loss.
+
+In this module, we handle structures of tensors for theta, state0, inputs,
+and extras. The structure is an arbitrarily nested python structure, such
+as a dictionary of named tuples.
+
+Because the computation is a left-to-right chain, a single in-place accumulator
+can be used rather than a stack. Thus a special gradient was written to reduce
+unnecessary memory usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.inplace_ops import alias_inplace_update
+from tensorflow.python.util import nest
+
+
+def _AssertIsCompatible(a, b):
+  """Checks that `a` and `b` are nested structures of the same type."""
+  # TODO(drpng): implement.
+  del a
+  del b
+
+
+def _Index(struct, index):
+  """Returns a structure with `x[index]` for each tensor `x` in the structure.
+
+  Args:
+    struct: A structure of tensors.
+    index: A scalar integer tensor. Performance is better if `index` is
+      on the host memory.
+
+  Returns:
+    A structure of tensors congruent to `struct`.
+    For each key in `ret`, `rets[key] = struct[key][index]`.
+  """
+  index = ops.convert_to_tensor(index)
+  index.get_shape().assert_has_rank(0)
+  return nest.map_structure(lambda x: x[index], struct)
+
+
+def _Update(struct_acc, struct_x, t):
+  """Updates t-th row in accumulators.
+
+  Args:
+    struct_acc: The accumulators. A structure of tensors.
+    struct_x: The new values. A structure of tensors congruent to `struct_acc`.
+    t: A scalar integer. Performance is better if `t` is on the device
+      memory.
+
+  Returns:
+    A structure of tensors. Say, ret is a returned dictionary. Then, for
+    each key, we have:
+      ret[key] = struct_acc[key];
+      ret[key][t, :] = struct_x[key]
+  """
+  to_skip_update = set()
+  acc_lst = nest.flatten(struct_acc)
+  x_lst = nest.flatten(struct_x)
+  t = math_ops.to_int32([t])  # tf.to_int32 casts on-device tensors.
+  lst = []
+  for acc, x in zip(acc_lst, x_lst):
+    if acc in to_skip_update:
+      # Until b/62105730 is fixed, we need to avoid inplace update for tensors
+      # of rank 1.  could reshape to handle it, but we don't really need the
+      # values applied to these, so just skip their modification.
+      lst += [acc]
+    else:
+      lst += [alias_inplace_update(acc, t, array_ops.expand_dims(x, 0))]
+  return nest.pack_sequence_as(struct_acc, lst)
+
+
+def _SeqLenDim(struct):
+  """Returns the 0-th dim size of tensors in a structure of tensors.
+
+  This is the max sequence length according to the shape of the inputs.
+
+  Args:
+    struct: A structure of tensors. Every tensor's 0-th dim has the same size.
+
+  Returns:
+    A scalar tensor which is the size of 0-th dim of every tensors in struct.
+  """
+  xs = nest.flatten(struct)
+  assert xs
+  dim0 = array_ops.shape(xs[0])[0]
+  return dim0
+
+
+def _Flatten(struct):
+  """Flattens a structure."""
+  return nest.flatten(struct)
+
+
+def _Pack(elements, struct_template):
+  """Packs the list of tensors according to the structure.
+
+  In the event that `elements` should be a scalar, `struct_template` must
+  contain exactly one non-trivial element (for instance, `[[], {'x':elt}]`).
+
+  Args:
+    elements: Elements to be packed. A list of tensor, or a single tensor.
+    struct_template: The container structure in which to pack them.
+  Returns:
+    A python structure of the same type as `struct_template`, containing
+    `elements` as its contained elements.
+  """
+  if not nest.is_sequence(elements):
+    return nest.pack_sequence_as(struct_template, [elements])
+  return nest.pack_sequence_as(struct_template, elements)
+
+
+def _EmptyAcc(slen, struct_template):
+  """Creates a set of accumulators for tensors in structure.
+
+  Args:
+    slen: The sequence length. A scalar tensor.
+    struct_template: A structure of tensors.
+
+  Returns:
+    A structure congruent to `struct_template`. Say ret is a returned
+    dictionary. Then, `ret.key`, a tensor, has the same dtype as
+    `struct_template.key`. The tensor's shape has 1 more dimension
+    than the tensor `struct_template.key`. The extra 0-th dimension is of size
+    `slen`. E.g., if `slen=10` and `struct_template.key`'s shape is `[3, 5]`,
+    then, `ret.key`'s shape is `[10, 3, 5]`.
+  """
+
+  def _EmptyAccForTensor(tensor):
+    return inplace_ops.empty(
+        array_ops.concat([[slen], array_ops.shape(tensor)], axis=0),
+        tensor.dtype,
+        init=True)
+
+  return nest.map_structure(_EmptyAccForTensor, struct_template)
+
+
+def _EmptyLike(struct):
+  """Creates a set of empty initialized tensors.
+
+  Args:
+    struct: A structure of tensors.
+
+  Returns:
+    A struct of tensors. Each tensor has the same shape and dtype as
+    its corresponding tensor in `struct`. And each tensor is initialized.
+  """
+  return nest.map_structure(
+      lambda x: inplace_ops.empty_like(x, init=True), struct)
+
+
+def _Add(struct_x, struct_y):
+  """Adds tensors in `struct_x` with respective tensors in `struct_y`.
+
+  Args:
+    struct_x: A struct of tensors.
+    struct_y: A struct of tensors congruent to `struct_x`.
+
+  Returns:
+    A struct of tensors. Each element of the returned value
+  equals `x + y`, with corresponding values in `struct_x` and `struct_y`.
+  """
+  list_x = nest.flatten(struct_x)
+  list_y = nest.flatten(struct_y)
+  z = []
+  for x, y in zip(list_x, list_y):
+    z += [math_ops.add(x, y)]
+  return nest.pack_sequence_as(struct_x, z)
+
+
+def _Dtypes(struct):
+  """Returns all tensors' data types in a list."""
+  return [x.dtype for x in nest.flatten(struct)]
+
+
+def _ConvertNoneGradientToZeros(xs, dxs):
+  """Sanitize dxs so that None becomes zeros appropriately.
+
+  Args:
+    xs: A list of tensors.
+    dxs: A list of tensors. dxs[i] corresponds to xs[i]'s gradient.
+
+  Returns:
+    A structure same as `dxs` with `None` replaced by a zero tensor.
+  """
+  list_xs = nest.flatten(xs)
+  list_dxs = nest.flatten(dxs)
+
+  # If x does not get any backprop-ed gradient, propagate zeros.
+  rets = []
+  for (x, dx) in zip(list_xs, list_dxs):
+    if dx is None:
+      rets.append(array_ops.zeros_like(x))
+    else:
+      rets.append(dx)
+
+  return nest.pack_sequence_as(dxs, rets)
+
+
+# All structures are flattened for use internally. This is for simplicity
+# and also to use the Defun construct.
+# In the forward pass (inference), the computation is structured as follows.
+# Forward: [gradient = _Recurrent.Grad]
+#   Flatten structures, create accumulators.
+#   for t = 0..max_input_length:
+#     Defun ForwardLoopBody:
+#       Defun Fwd: flatten/pack around cell_fn
+#       state1 = Fwd(inputs[t], state0)
+#       acc_state += [state1]
+#   Pack structures.
+# During the backward pass (backpropping the gradient from the last time
+# step to the first, through the structure), the computation is structured
+# as follows.
+# Grad:
+#   Flatten structures.
+#   Defun Backward:
+#     Create create accumulated derivatives: d_theta, d_inputs, d_acc_state.
+#     Regarding the note at the top of the file, there is only one accumulator
+#     for d_theta accumulated over the whole sequence.
+#     for t = max_input_length -1..0:
+#       Defun BackwardLoopBody:
+#         Retrieve acc_state[t] computed in the forward pass.
+#         Defun Bak: flatten/back around cell_fn_grad.
+#         d_state1 is d_state0 from previous step (ie next time).
+#         d_acc_state[dev_t] += d_state1
+#         d_theta_t, d_state0, d_inputs_t, = Bak()
+#         d_inputs[dev_t] += d_inputs
+#         d_theta += d_theta_t
+#         d_acc_state[t] += d_state1
+#   Pack structures and return.
+class _Recurrent(object):
+  """A helper class to construct a recurrent neural net."""
+
+  def __init__(self, cell_fn, cell_grad, theta, state0, inputs,
+               max_input_length, extras, use_tpu):
+    """RNN helper class.
+
+    Args:
+      cell_fn: A python function, which computes:
+         state1, extras = cell_fn(theta, state0, inputs[t, :])
+      cell_grad: A python function which computes:
+         dtheta, dstate0, dinputs[t, :] = cell_grad(
+           theta, state0, inputs[t, :], extras, dstate1)
+      theta: weights. A structure of tensors.
+      state0: initial state. A structure of tensors.
+      inputs: inputs. A structure of tensors.
+      max_input_length: None, or the maximum effective length of the input over
+        all batches. A scalar tensor.
+      extras: A structure of tensors. The 2nd return value of every
+        invocation of cell_fn is a structure of tensors with matching keys
+        and shapes of this `extras`.
+      use_tpu: A boolean indicating whether the computation is mean to
+        run on a TPU.
+    """
+    self._theta = theta
+    self._state = state0
+    self._inputs = inputs
+    self._max_input_length = self._MaybeComputeMaxInputLength(
+        inputs, max_input_length)
+    self._cell_fn = cell_fn
+    self._cell_grad = cell_grad
+    self._extras = extras
+
+    # pylint: disable=unbalanced-tuple-unpacking
+
+    # NOTE: TF Function (Fwd, Bak, ForwardLoopBody, BackwardLoopBody,
+    # Forward and Backward defined below) simply takes a list of
+    # Tensors and returns a list of Tensors. When we pass in a
+    # structure (a list of structures of Tensors), we use _Flatten to
+    # convert the structure into a list of tensor. Conversely, the
+    # following code often uses _Pack to formulate a structure from a
+    # list of tensors based on a "template".
+
+    # Wraps cell_fn in a TF Function:
+    #    state1 = cell_fn(theta, state0, inputs)
+    fwd_sig = [self._theta, self._state, self._inputs]
+
+    compiled = use_tpu
+    noinline = not compiled
+    dev_t_type = dtypes.int32 if use_tpu else dtypes.int64
+
+    @function.Defun(*_Dtypes(fwd_sig))
+    def Fwd(*args):
+      (theta, state0, inputs) = _Pack(args, fwd_sig)
+      state1, extras = self._cell_fn(theta, state0, inputs)
+      assert not function.get_extra_args(), (
+          'cell_fn is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(state1, self._state)
+      _AssertIsCompatible(extras, self._extras)
+      return _Flatten([state1, extras])
+
+    # Wraps cell_fn in a TF Function as a for-loop's body.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  state0: the previous recurrent state.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state is also stashed into
+    #    acc_state.
+    #  acc_extras: Each timestep's computed extras is stashed into acc_extras
+    fwdloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(fwdloop_sig))
+    def ForwardLoopBody(*args):
+      """The body of forward loop."""
+      t, dev_t = args[0], args[1]
+      (theta, state0, inputs, acc_state, acc_extras) = _Pack(
+          args[2:], fwdloop_sig)
+      inputs_t = _Index(inputs, t)  # external input at time step t.
+      fwd = Fwd(*_Flatten([theta, state0, inputs_t]))
+      state1, extras = _Pack(fwd, [self._state, self._extras])
+      # Saves state1 and extras in their accumulators.
+      acc_state = _Update(acc_state, state1, dev_t)
+      acc_extras = _Update(acc_extras, extras, dev_t)
+
+      return [math_ops.add(dev_t, 1)] + _Flatten(
+          [theta, state1, inputs, acc_state, acc_extras])
+
+    def Grad(op, *args):
+      """The python grad function for the Forward function."""
+
+      # NOTE: tf.gradient backprops None for int32/int64 while zeros
+      # for float32/float64. For consistency, we always backprop
+      # zeros.
+      args = list(args)
+      for i, dy in enumerate(args):
+        if dy is None:
+          args[i] = array_ops.zeros_like(op.outputs[i])
+      # TODO(drpng): getting the extra state here?
+      op_inputs = [x for x in op.inputs]
+      op_struct = [
+          self._theta, self._state, self._inputs, self._max_input_length,
+          self._extras
+      ]
+      (theta, state0, inputs, max_input_length, _) = _Pack(op_inputs, op_struct)
+      # acc_state and acc_extras are computed by the Forward pass and
+      # needed by the Backward pass.
+      acc_state, _, acc_extras = _Pack([x for x in op.outputs],
+                                       [self._state, self._state, self._extras])
+
+      # Forward computes acc_state, the final state and
+      # acc_extras. tf.gradients gives us their gradients w.r.t. the
+      # final loss. Because acc_extras are not exposed by Compute(),
+      # it has no gradients w.r.t. the final loss (i.e., by
+      # construction, it must be zeros).
+      d_acc_state, d_state1, _ = _Pack(args,
+                                       [self._state, self._state, self._extras])
+      return Backward(*_Flatten([
+          theta, state0, inputs, max_input_length, acc_state, acc_extras,
+          d_acc_state, d_state1
+      ]))
+
+    # Forward calls ForwardLoopBody n times. Each time computes one
+    # time step of the recurrent net.
+    forward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._extras
+    ]
+
+    @function.Defun(
+        *_Dtypes(forward_sig), python_grad_func=Grad, noinline=noinline)
+    def Forward(*args):
+      """Forward pass of the recurrent net."""
+      theta, state0, inputs, max_input_length, extras = _Pack(args, forward_sig)
+
+      slen_dim = _SeqLenDim(inputs)
+
+      # Creates accumulators for state0 and extras.
+      acc_state = _EmptyAcc(slen_dim, state0)
+      acc_extras = _EmptyAcc(slen_dim, extras)
+
+      dev_t = array_ops.constant(0, dtype=dev_t_type)
+      run = functional_ops.For(
+          start=0,
+          limit=max_input_length,
+          delta=1,
+          inputs=[dev_t] + _Flatten(
+              [theta, state0, inputs, acc_state, acc_extras]),
+          body=ForwardLoopBody,
+          rewrite_with_while=compiled)
+      _, state1, _, acc_state, acc_extras = _Pack(
+          run[1:],
+          [self._theta, self._state, self._inputs, self._state, self._extras])
+
+      return _Flatten([acc_state, state1, acc_extras])
+
+    # The per-step backward computes:
+    #    d_theta, d_state0, d_inputs = cell_grad(
+    #        theta, state0, inputs, extras, d_state1)
+    # where d_state1 is the backprop-ed gradient for state1, and
+    # extras is the computed by the forward step to facilitate the
+    # backward step.
+    bak_sig = [
+        self._theta, self._state, self._inputs, self._extras, self._state
+    ]
+
+    @function.Defun(*_Dtypes(bak_sig))
+    def Bak(*args):
+      """Backward step."""
+      (theta, state0, inputs, extras, d_state1) = _Pack(args, bak_sig)
+      (dtheta, dstate0, dinputs) = self._cell_grad(theta, state0, inputs,
+                                                   extras, d_state1)
+      assert not function.get_extra_args(), (
+          'cell_grad is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(dtheta, self._theta)
+      _AssertIsCompatible(dstate0, self._state)
+      _AssertIsCompatible(dinputs, self._inputs)
+      return _Flatten(
+          _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                      [dtheta, dstate0, dinputs]))
+
+    # Define defuns used by a functional_ops.If in BackwardLoopBody.
+    state_if_sig = [self._state, self._state]
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnOrigState0(*args):
+      """Returns original state0 from inputs."""
+      (_, orig_state0) = _Pack(args, state_if_sig)
+      return nest.flatten(orig_state0)
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnAccState(*args):
+      """Returns acc_state[t-1] from inputs."""
+      (acc_state, _) = _Pack(args, state_if_sig)
+      return nest.flatten(acc_state)
+
+    # Wraps cell_grad gradient function in a TF Function as a
+    # for-loop's body for the Backward pass.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  state0: the initial state for the entire backward loop.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state was stashed into
+    #    acc_state by the Forward pass.
+    #  acc_extras: Each timestep's computed extras was stashed into
+    #    acc_extras by the Forward pass.
+    #  d_theta: All timestep's gradient for theta is accumulated (added) into
+    #      d_theta.
+    #  d_state1: The backprop-ed gradient for the new stated computed by
+    #      timestep t.
+    #  d_inputs: d_inputs[t, :] is populated by the backward time step t.
+    #  d_acc_state: The backprop-ed gradient for acc_state.
+    bakloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras,
+        self._theta, self._state, self._inputs, self._state
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(bakloop_sig))
+    def BackwardLoopBody(*args):
+      """Backward loop body function."""
+      t, dev_t = args[0], args[1]
+      (theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+       d_inputs, d_acc_state) = _Pack(args[2:], bakloop_sig)
+
+      # The input recurrent state for time step t is previous time step's
+      # output, or the original state0 when on time step 0.
+      state_from_acc = _Index(acc_state, math_ops.maximum(0, t - 1))
+      state0 = functional_ops.If(
+          math_ops.equal(t, array_ops.constant(0, dtypes.int32)),
+          _Flatten([state_from_acc, orig_state0]), ReturnOrigState0,
+          ReturnAccState)
+      state0 = nest.pack_sequence_as(orig_state0, state0)
+
+      # The external inputs for time step t.
+      inputs_t = _Index(inputs, t)
+      # The extras for time step t.
+      extras_t = _Index(acc_extras, t)
+
+      d_state1 = _Add(_Index(d_acc_state, t), d_state1)
+      (d_theta_t, d_state0, d_inputs_t) = _Pack(
+          Bak(*_Flatten([theta, state0, inputs_t, extras_t, d_state1])),
+          [self._theta, self._state, self._inputs])
+      d_theta = _Add(d_theta, d_theta_t)
+      d_inputs = _Update(d_inputs, d_inputs_t, dev_t)
+      return [math_ops.subtract(dev_t, 1)] + _Flatten([
+          theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+          d_inputs, d_acc_state
+      ])
+
+    # Backward calls BackwardLoopBody n times.  Each time computes the backprop
+    # for one time step of the recurrent net.
+    backward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._state, self._extras, self._state, self._state
+    ]
+
+    @function.Defun(*_Dtypes(backward_sig), noinline=noinline)
+    def Backward(*args):
+      """Backward pass for the recurrent net."""
+      # theta, state0, inputs are Forward's inputs.
+      # acc_state is the accumulated 1st output of Forward.
+      # acc_extras is the accumulated 2nd output of Forward.
+      # d_acc_state is the gradient for acc_state.
+      # d_state1 is the gradient for the final state computed by Forward.
+      (theta, state0, inputs, max_input_length, acc_state, acc_extras,
+       d_acc_state, d_state1) = _Pack(args, backward_sig)
+
+      # Accumulators for gradients.
+      d_theta = _EmptyLike(theta)
+      d_inputs = _EmptyLike(inputs)
+
+      # Loop backwards. Note the loop's limit is open-ended, so goes through
+      # t=0.
+      t = max_input_length - 1
+      dev_t = math_ops.to_int32(t) if use_tpu else math_ops.to_int64(t)
+      run = functional_ops.For(
+          start=t,
+          limit=-1,
+          delta=-1,
+          inputs=[dev_t] + _Flatten([
+              theta, state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+              d_inputs, d_acc_state
+          ]),
+          body=BackwardLoopBody,
+          rewrite_with_while=compiled)
+
+      (theta, state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+       d_inputs, d_acc_state) = _Pack(run[1:], bakloop_sig)
+
+      d_max_input_length = array_ops.constant(0, dtype=max_input_length.dtype)
+      return _Flatten(
+          [d_theta, d_state0, d_inputs, d_max_input_length, acc_extras])
+
+    self._forward = Forward
+
+  def _MaybeComputeMaxInputLength(self, inputs, max_input_length):
+    if max_input_length is not None:
+      return max_input_length
+    return math_ops.reduce_max(array_ops.shape(nest.flatten(inputs)[0])[0])
+
+  def Compute(self):
+    return _Pack(
+        self._forward(*_Flatten([
+            self._theta, self._state, self._inputs, self._max_input_length,
+            self._extras
+        ])), [self._state, self._state, self._extras])[:2]
+
+
+def _GetCellGrad(cell_fn, cell_grad):
+  """Returns the gradient function for cell_fn.
+
+  Args:
+    cell_fn: The recurrent neural net's cell function.
+    cell_grad: If not None, cell_fn's gradient function.
+
+  Returns:
+    Returns cell_grad if not None. Otherwise, assume cell_fn is a python
+    function representing the recurrent neural net's cell function, i.e.,
+      cell_fn: (theta, state0, inputs) -> (state1, extra)
+    returns its default gradient python function, i.e.,
+      cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+  """
+
+  if cell_grad:
+    return cell_grad
+
+  def CellGrad(theta, state0, inputs, extras, dstate1):
+    """Default gradient function for cell_fn."""
+    # NOTE: The default grad function recomputes the forward
+    # function and does not take advantage of 'extras' returned by
+    # the forward function.
+    del extras
+    state1, extras = cell_fn(theta, state0, inputs)
+    ys = _Flatten([state1])
+    xs = _Flatten([theta, state0, inputs])
+    grad_ys = _Flatten([dstate1])
+    grads = gradients_impl.gradients(ys=ys, xs=xs, grad_ys=grad_ys)
+    return _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                       _Pack(grads, [theta, state0, inputs]))
+
+  return CellGrad
+
+
+def _IsSingleTimeStep(inputs, max_input_length):
+  """Returns True only if the time dimension of inputs is 1."""
+  if not isinstance(max_input_length, ops.Tensor):
+    return max_input_length == 1
+  for x in nest.flatten(inputs):
+    if x.shape.dims is None or x.shape[0].value != 1:
+      return False
+  return True
+
+
+def Recurrent(theta,
+              state0,
+              inputs,
+              cell_fn,
+              cell_grad=None,
+              extras=None,
+              max_input_length=None,
+              use_tpu=False):
+  """Compute a recurrent neural net.
+
+  Roughly, Recurrent() computes the following:
+    state = state0
+    for t in inputs' sequence length:
+      state = cell_fn(theta, state, inputs[t, :])
+      accumulate_state[t, :] = state
+    return accumulate_state, state
+
+  theta, state, inputs are all structures of tensors.
+
+  inputs[t, :] means taking a slice out from every tensor in the inputs.
+
+  accumulate_state[t, :] = state means that we stash every tensor in
+  'state' into a slice of the corresponding tensor in
+  accumulate_state.
+
+  cell_fn is a python callable computing (building up a TensorFlow
+  graph) the recurrent neural network's one forward step. Two calls of
+  cell_fn must describe two identical computations.
+
+  By construction, Recurrent()'s backward computation does not access
+  any intermediate values computed by cell_fn during forward
+  computation. We may extend Recurrent() to support that by taking a
+  customized backward function of cell_fn.
+
+  Args:
+    theta: weights. A structure of tensors.
+    state0: initial state. A structure of tensors.
+    inputs: inputs. A structure of tensors.
+    cell_fn: A python function, which computes:
+      state1, extras = cell_fn(theta, state0, inputs[t, :])
+    cell_grad: A python function which computes:
+      dtheta, dstate0, dinputs[t, :] = cell_grad(
+        theta, state0, inputs[t, :], extras, dstate1)
+    extras: A structure of tensors. The 2nd return value of every
+      invocation of cell_fn is a structure of tensors with matching keys
+      and shapes of  this `extras`.
+    max_input_length: maximum length of effective input. This is used to
+      truncate the computation if the inputs have been allocated to a
+      larger size. A scalar tensor.
+    use_tpu: whether or not we are on TPU.
+
+  Returns:
+    accumulate_state and the final state.
+  """
+  if cell_grad is None and _IsSingleTimeStep(inputs, max_input_length):
+    # The seqlen length is staticly known as 1. Hence, we just need to
+    # call cell_fn once without putting it into a loop.
+    inputs = nest.map_structure(lambda x: array_ops.squeeze(x, axis=0), inputs)
+    state1, _ = cell_fn(theta, state0, inputs)
+    acc_state = nest.map_structure(lambda x: array_ops.expand_dims(x, axis=0),
+                                   state1)
+    return acc_state, state1
+
+  # If cell_grad is not given, derives the gradient function from
+  # cell_fn.
+  cell_grad = _GetCellGrad(cell_fn, cell_grad)
+
+  if extras is None:
+    # Derives 'extras' so that we can allocate extras' accumulator.
+    _, extras = cell_fn(theta, state0, _Index(inputs, 0))
+    extras = nest.map_structure(array_ops.zeros_like, extras)
+  else:
+    _, actual = cell_fn(theta, state0, _Index(inputs, 0))
+    _AssertIsCompatible(extras, actual)
+
+  return _Recurrent(
+      cell_fn=cell_fn,
+      cell_grad=cell_grad,
+      theta=theta,
+      state0=state0,
+      inputs=inputs,
+      max_input_length=max_input_length,
+      extras=extras,
+      use_tpu=use_tpu).Compute()
diff --git a/tensorflow/contrib/recurrent/python/recurrent_api.py b/tensorflow/contrib/recurrent/python/recurrent_api.py
new file mode 100644
index 0000000000..ffe1dcf7dc
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/recurrent_api.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent computations library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.recurrent.python.ops import functional_bidirectional_rnn
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.contrib.recurrent.python.ops import Recurrent
+# pylint: enable=unused-import
+
+del absolute_import
+del division
+del print_function
-- 
GitLab


From 0a0312473c2c5179d05bb716586e796daa3a0252 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 9 Apr 2018 17:30:27 -0700
Subject: [PATCH 0483/1262] Don't fail when running shape inference on a graph
 that contains functions.

PiperOrigin-RevId: 192215493
---
 tensorflow/core/graph/graph_constructor.cc    |  7 +----
 .../core/grappler/costs/graph_properties.cc   |  4 +--
 .../grappler/costs/graph_properties_test.cc   | 29 +++++++++----------
 3 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index f15e2ce9fa..250992fb7a 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -1019,12 +1019,7 @@ Status GraphConstructor::Convert() {
       }
     }
 
-    // Function shape inference is supported on an opt-in basis per
-    // ShapeRefiner.
-    if (refiner_->function_shape_inference_supported() ||
-        g_->flib_def().Find(node_def->name()) == nullptr) {
-      TF_RETURN_IF_ERROR(ValidateShape(node));
-    }
+    TF_RETURN_IF_ERROR(ValidateShape(node));
 
     // Update pending_count_ for outputs.
     UpdatePendingCountAndReady(outputs_, o, &pending_count_, &ready_);
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 8fe154dbf3..9fa2b7a259 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -920,9 +920,9 @@ Status GraphProperties::UpdateResource(
 }
 
 Status GraphProperties::InferStatically(bool assume_valid_feeds) {
-  Graph graph(OpRegistry::Global());
-  FunctionLibraryDefinition function_library(graph.op_registry(),
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
+  Graph graph(function_library);
   ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   shape_refiner.set_require_shape_inference_fns(false);
   shape_refiner.set_disable_constant_propagation(true);
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index db4dae96de..d3d89b59af 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -742,8 +742,6 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   EXPECT_EQ("float: [128,256]", PropToString(prop));
 }
 
-#if 0
-// Disabled for now since this doesnt' seem to work when functions are instantiated inside while loops. It's also unclear whether it's correct when the same function is instantiated twice.
 TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   // Test graph produced in python using:
   /*
@@ -757,27 +755,26 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
       z = MyAdd(x, y)
       z = MyAdd(x, z)
   */
-  // Check that the shape of the second MyAdd node propagates
-  // correctly.
+  // Check that the shape inference code infers what it can.
   GrapplerItem item;
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
-  const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
-  const OpInfo::TensorProperties& prop = props[0];
-  EXPECT_EQ(DT_FLOAT, prop.dtype());
-  EXPECT_FALSE(prop.shape().unknown_rank());
-  EXPECT_EQ(2, prop.shape().dim_size());
-  EXPECT_EQ(1, prop.shape().dim(0).size());
-  EXPECT_EQ(2, prop.shape().dim(1).size());
-
-  PartialTensorShape shape(prop.shape());
-  EXPECT_TRUE(shape.IsFullyDefined());
-  EXPECT_FALSE(shape.unknown_rank());
+  const auto out_props = properties.GetOutputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& out_prop = out_props[0];
+  EXPECT_EQ(DT_FLOAT, out_prop.dtype());
+  EXPECT_TRUE(out_prop.shape().unknown_rank());
+
+  const auto in_props = properties.GetInputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& in_prop = in_props[0];
+  EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+  EXPECT_FALSE(in_prop.shape().unknown_rank());
+  EXPECT_EQ(2, in_prop.shape().dim_size());
+  EXPECT_EQ(1, in_prop.shape().dim(0).size());
+  EXPECT_EQ(2, in_prop.shape().dim(1).size());
 }
-#endif
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
   // Build a simple graph with placeholders of unknown dimensions. These
-- 
GitLab


From fccc96cbf48537fca49b61f94147d1c8e299fea4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 17:36:41 -0700
Subject: [PATCH 0484/1262] [slim] Allow passing timeout_fn to
 'evaluation_loop' so that caller can have more control over what to do if
 checkpoints_iterator times out.

PiperOrigin-RevId: 192216302
---
 tensorflow/contrib/slim/python/slim/evaluation.py     |  7 ++++++-
 .../contrib/slim/python/slim/evaluation_test.py       | 11 +++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 3caf4e02da..5cfd5ee82e 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -230,6 +230,7 @@ def evaluation_loop(master,
                     max_number_of_evaluations=None,
                     session_config=None,
                     timeout=None,
+                    timeout_fn=None,
                     hooks=None):
   """Runs TF-Slim's Evaluation Loop.
 
@@ -261,6 +262,9 @@ def evaluation_loop(master,
       configure the `Session`. If left as `None`, the default will be used.
     timeout: The maximum amount of time to wait between checkpoints. If left as
       `None`, then the process will wait indefinitely.
+    timeout_fn: Optional function to call after a timeout.  If the function
+      returns True, then it means that no new checkpoints will be generated and
+      the iterator will exit.  The function is called with no arguments.
     hooks: A list of additional `SessionRunHook` objects to pass during
       repeated evaluations.
 
@@ -298,4 +302,5 @@ def evaluation_loop(master,
       hooks=all_hooks,
       config=session_config,
       max_number_of_evaluations=max_number_of_evaluations,
-      timeout=timeout)
+      timeout=timeout,
+      timeout_fn=timeout_fn)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index c24bd04851..94fc12ca81 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -177,6 +177,17 @@ class EvaluationTest(test.TestCase):
     # The timeout kicked in.
     self.assertLess(end, start + 1.1)
 
+  def testTimeoutFnOnEvaluationLoop(self):
+    # We require a mutable object (e.g. list but not an int) to maintain state
+    # across calls of a nested function.
+    timeout_fn_calls = [0]
+    def _TimeoutFn():
+      timeout_fn_calls[0] += 1
+      return timeout_fn_calls[0] >= 3
+    # Need not do any evaluation, but should just call timeout_fn repeatedly.
+    evaluation.evaluation_loop('', '', '', timeout=0, timeout_fn=_TimeoutFn)
+    self.assertEqual(timeout_fn_calls[0], 3)
+
   def testMonitorCheckpointsLoopTimeout(self):
     ret = list(
         evaluation_lib.checkpoints_iterator(
-- 
GitLab


From 01417f6fe7c28ec530154f63efac333d19ba7632 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 9 Apr 2018 17:40:46 -0700
Subject: [PATCH 0485/1262] Remove extra #define EIGEN_USE_THREADS.

It breaks some builds with
 error: "EIGEN_USE_THREADS" redefined [-Werror]

PiperOrigin-RevId: 192216796
---
 tensorflow/core/common_runtime/eigen_thread_pool.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/core/common_runtime/eigen_thread_pool.h b/tensorflow/core/common_runtime/eigen_thread_pool.h
index ddd627fb20..c6f13c6a11 100644
--- a/tensorflow/core/common_runtime/eigen_thread_pool.h
+++ b/tensorflow/core/common_runtime/eigen_thread_pool.h
@@ -16,8 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_
 #define TENSORFLOW_COMMON_RUNTIME_EIGEN_THREAD_POOL_H_
 
-#define EIGEN_USE_THREADS
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/core/threadpool.h"
 
-- 
GitLab


From 0295c3c212546147e57609a791f126aaca44ba44 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 9 Apr 2018 17:45:13 -0700
Subject: [PATCH 0486/1262] boosted_trees: early stop hooks are fixed to stop
 at the right moment  by reading tensor values in a separate session after
 train_op run. PiperOrigin-RevId: 192217338

---
 .../python/estimator/boosted_trees_test.py    | 97 +++++++------------
 .../python/estimator/canned/boosted_trees.py  | 33 +++----
 .../estimator/canned/boosted_trees_test.py    | 63 +++++-------
 3 files changed, 71 insertions(+), 122 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index e99a87f3b3..eee5910687 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -69,10 +70,18 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateEstimator(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -88,9 +97,10 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferEstimator(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -108,31 +118,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
-
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
-
-
-class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
   def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=True)
@@ -145,36 +137,16 @@ class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
-
-    # Check predict that all labels are correct.
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -187,20 +159,17 @@ class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.2136638)
-
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 500ea03ea7..c5d5455b1a 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -209,8 +209,8 @@ class _CacheTrainingStatesUsingVariables(object):
         name='cache_insert')
 
 
-class StopAtAttemptsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
+class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of attempts."""
 
   def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
                max_trees, max_depth):
@@ -224,25 +224,17 @@ class StopAtAttemptsHook(session_run_hook.SessionRunHook):
         [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
 
   def after_run(self, run_context, run_values):
+    # num_* tensors should be retrieved by a separate session than the training
+    # one, in order to read the values after growing.
+    # So, if it's approaching to the limit, get the actual value by additional
+    # session.
     num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees - 1 or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
+      num_finalized_trees, num_attempted_layers = run_context.session.run(
+          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
     if (num_finalized_trees >= self._max_trees or
-        1.0 * num_attempted_layers / self._max_depth > 2 * self._max_trees):
-      run_context.request_stop()
-
-
-class StopAtNumTreesHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
-
-  def __init__(self, num_trees_tensor, max_trees):
-    self._num_trees_tensor = num_trees_tensor
-    self._max_trees = max_trees
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._num_trees_tensor)
-
-  def after_run(self, run_context, run_values):
-    num_trees = run_values.results
-    if num_trees > self._max_trees:
+        num_attempted_layers > 2 * self._max_trees * self._max_depth):
       run_context.request_stop()
 
 
@@ -468,7 +460,8 @@ def _bt_model_fn(
     # Add an early stop hook.
     estimator_spec = estimator_spec._replace(
         training_hooks=estimator_spec.training_hooks +
-        (StopAtNumTreesHook(num_trees, tree_hparams.n_trees),))
+        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                             tree_hparams.n_trees, tree_hparams.max_depth),))
   return estimator_spec
 
 
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 01e5cc7a5d..625745a3f9 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -69,7 +69,7 @@ def _make_train_input_fn(is_classification):
   return _input_fn
 
 
-class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self._feature_columns = {
@@ -79,10 +79,18 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateBinaryClassifier(self):
     input_fn = _make_train_input_fn(is_classification=True)
@@ -97,7 +105,8 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
 
@@ -118,29 +127,9 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
     # All labels are correct.
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testTrainAndEvaluateRegressor(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -155,9 +144,10 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferRegressor(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -174,16 +164,13 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 class ModelFnTests(test_util.TensorFlowTestCase):
-- 
GitLab


From 77bd95ab15bdfa6a821540ce1c826c7eb9c9f0a8 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Mon, 9 Apr 2018 18:03:57 -0700
Subject: [PATCH 0487/1262] [tf.data] Add parameter for select_cols in
 tf.decode_csv for option to parse only a subset of CSV columns; added same
 parameter to make_csv_dataset function in tf.contrib.data.

PiperOrigin-RevId: 192219509
---
 .../kernel_tests/reader_dataset_ops_test.py   | 197 +++++++++++++-----
 tensorflow/contrib/data/python/ops/readers.py |  76 ++++++-
 tensorflow/core/kernels/decode_csv_op.cc      |  41 +++-
 tensorflow/core/ops/parsing_ops.cc            |   1 +
 .../python/kernel_tests/decode_csv_op_test.py |  90 ++++++--
 tensorflow/python/ops/parsing_ops.py          |  26 ++-
 tensorflow/tools/api/golden/tensorflow.pbtxt  |   2 +-
 7 files changed, 351 insertions(+), 82 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index f3e9302409..1075302bae 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -295,18 +295,20 @@ class ReadBatchFeaturesTest(test.TestCase):
         ).get_next()
 
   def _record(self, f, r):
-    example = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            "file":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[f])),
-            "record":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[r])),
-            "keywords":
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=self._get_keywords(f, r)))
-        }))
+    example = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                "file":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[f])),
+                "record":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[r])),
+                "keywords":
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=self._get_keywords(f, r)))
+            }))
     return example.SerializeToString()
 
   def _get_keywords(self, f, r):
@@ -374,8 +376,8 @@ class ReadBatchFeaturesTest(test.TestCase):
         record_batch.append(r)
         keywords = self._get_keywords(f, r)
         keywords_batch_values.extend(keywords)
-        keywords_batch_indices.extend([[batch_index, i]
-                                       for i in range(len(keywords))])
+        keywords_batch_indices.extend(
+            [[batch_index, i] for i in range(len(keywords))])
         batch_index += 1
         keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
         if len(file_batch) == batch_size:
@@ -475,9 +477,10 @@ class ReadBatchFeaturesTest(test.TestCase):
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
         "record": parsing_ops.FixedLenFeature([], dtypes.int64),
     }
-    dataset = (core_readers.TFRecordDataset(self.test_filenames)
-               .map(lambda x: parsing_ops.parse_single_example(x, features))
-               .repeat(10).batch(2))
+    dataset = (
+        core_readers.TFRecordDataset(self.test_filenames)
+        .map(lambda x: parsing_ops.parse_single_example(x, features))
+        .repeat(10).batch(2))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     next_element = iterator.get_next()
@@ -607,20 +610,25 @@ class MakeCsvDatasetTest(test.TestCase):
         "record %d" % recordno if recordno % 2 == 1 else "",
     ]
 
-  def _csv_record(self, fileno, recordno):
-    return ",".join(str(v) for v in self._csv_values(fileno, recordno))
+  def _write_file(self, filename, rows):
+    for i in range(len(rows)):
+      if isinstance(rows[i], list):
+        rows[i] = ",".join(str(v) if v is not None else "" for v in rows[i])
+    fn = os.path.join(self.get_temp_dir(), filename)
+    f = open(fn, "w")
+    f.write("\n".join(rows))
+    f.close()
+    return fn
 
   def _create_file(self, fileno, header=True, comment=True):
-    fn = os.path.join(self.get_temp_dir(), "csv_file%d.csv" % fileno)
-    f = open(fn, "w")
+    rows = []
     if header:
-      f.write(",".join(self.COLUMNS) + "\n")
+      rows.append(self.COLUMNS)
     for recno in range(self._num_records):
-      f.write(self._csv_record(fileno, recno) + "\n")
+      rows.append(self._csv_values(fileno, recno))
       if comment:
-        f.write("# Some comment goes here. Should be ignored!\n")
-    f.close()
-    return fn
+        rows.append("# Some comment goes here. Ignore me.")
+    return self._write_file("csv_file%d.csv" % fileno, rows)
 
   def _create_files(self):
     filenames = []
@@ -634,6 +642,7 @@ class MakeCsvDatasetTest(test.TestCase):
       defaults,
       column_names=COLUMNS,
       label_name=LABEL,
+      select_cols=None,
       batch_size=1,
       num_epochs=1,
       shuffle=False,
@@ -656,6 +665,7 @@ class MakeCsvDatasetTest(test.TestCase):
         comment=comment,
         na_value=na_value,
         default_float_type=default_float_type,
+        select_columns=select_cols,
     )
 
   def _next_actual_batch(self, file_indices, batch_size, num_epochs, defaults):
@@ -712,7 +722,7 @@ class MakeCsvDatasetTest(test.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
-  def test_make_csv_dataset(self):
+  def testMakeCSVDataset(self):
     defaults = self.DEFAULTS
 
     with ops.Graph().as_default() as g:
@@ -739,7 +749,7 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
-  def test_make_csv_dataset_with_bad_columns(self):
+  def testMakeCSVDataset_withBadColumns(self):
     """Tests that exception is raised when input is malformed.
     """
     dupe_columns = self.COLUMNS[:-1] + self.COLUMNS[:1]
@@ -755,7 +765,7 @@ class MakeCsvDatasetTest(test.TestCase):
       self._make_csv_dataset(
           self._test_filenames, defaults, label_name="not_a_real_label")
 
-  def test_make_csv_dataset_with_no_label(self):
+  def testMakeCSVDataset_withNoLabel(self):
     """Tests that CSV datasets can be created when no label is specified.
     """
     defaults = self.DEFAULTS
@@ -776,7 +786,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
             label_name=None)
 
-  def test_make_csv_dataset_with_no_comments(self):
+  def testMakeCSVDataset_withNoComments(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
     defaults = self.DEFAULTS
@@ -799,7 +809,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
         )
 
-  def test_make_csv_dataset_with_no_header(self):
+  def testMakeCSVDataset_withNoHeader(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
     defaults = self.DEFAULTS
@@ -822,7 +832,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
         )
 
-  def test_make_csv_dataset_with_types(self):
+  def testMakeCSVDataset_withTypes(self):
     """Tests that defaults can be a dtype instead of a Tensor for required vals.
     """
     defaults = [d for d in self.COLUMN_TYPES[:-1]]
@@ -832,7 +842,7 @@ class MakeCsvDatasetTest(test.TestCase):
         dataset = self._make_csv_dataset(self._test_filenames, defaults)
         self._verify_records(sess, dataset, range(self._num_files))
 
-  def test_make_csv_dataset_with_no_col_names(self):
+  def testMakeCSVDataset_withNoColNames(self):
     """Tests that datasets can be created when column names are not specified.
 
     In that case, we should infer the column names from the header lines.
@@ -851,7 +861,17 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
-  def test_make_csv_dataset_type_inference(self):
+  def testMakeCSVDataset_withTypeInferenceMismatch(self):
+    # Test that error is thrown when num fields doesn't match columns
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames,
+          column_names=self.COLUMNS + ["extra_name"],
+          defaults=None,
+          batch_size=2,
+          num_epochs=10)
+
+  def testMakeCSVDataset_withTypeInference(self):
     """Tests that datasets can be created when no defaults are specified.
 
     In that case, we should infer the types from the first N records.
@@ -875,19 +895,16 @@ class MakeCsvDatasetTest(test.TestCase):
         dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32,
         dtypes.string, dtypes.string
     ]
-    rows = [[0, 0, 0, "NAN", "", "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
+    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
+    rows = [[None, None, None, "NAN", "",
+             "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
             ['"123"', 2, 2**64, 123.4, "NAN", '"cd,efg"']]
     expected = [[0, 0, 0, 0, "", "a"], [1, 2**31 + 1, 2**64, 123, "", ""],
                 [123, 2, 2**64, 123.4, "", "cd,efg"]]
     for row in expected:
       row[-1] = row[-1].encode("utf-8")  # py3 expects byte strings
       row[-2] = row[-2].encode("utf-8")  # py3 expects byte strings
-    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
-    with open(fn, "w") as f:
-      f.write(",".join(col_names))
-      f.write("\n")
-      for row in rows:
-        f.write(",".join([str(v) if v else "" for v in row]) + "\n")
+    self._write_file("file.csv", [col_names] + rows)
 
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
@@ -895,8 +912,6 @@ class MakeCsvDatasetTest(test.TestCase):
             fn,
             defaults=None,
             column_names=None,
-            batch_size=1,
-            num_epochs=1,
             label_name=None,
             na_value="NAN",
             default_float_type=dtypes.float32,
@@ -919,8 +934,6 @@ class MakeCsvDatasetTest(test.TestCase):
             fn,
             defaults=None,
             column_names=None,
-            batch_size=1,
-            num_epochs=1,
             label_name=None,
             na_value="NAN",
             default_float_type=dtypes.float64,
@@ -928,11 +941,99 @@ class MakeCsvDatasetTest(test.TestCase):
         features = dataset.make_one_shot_iterator().get_next()
         # Check that types match
         for i in range(len(expected_dtypes)):
-          assert features["col%d" % i].dtype == expected_dtypes[i]
+          self.assertAllEqual(features["col%d" % i].dtype, expected_dtypes[i])
         for i in range(len(rows)):
-          assert sess.run(features) == dict(zip(col_names, expected[i]))
+          self.assertAllEqual(
+              sess.run(features), dict(zip(col_names, expected[i])))
+
+  def testMakeCSVDataset_withSelectColsError(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    with self.assertRaises(ValueError):
+      # Mismatch in number of defaults and number of columns selected,
+      # should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]] * 5,
+          column_names=col_names,
+          label_name=None,
+          select_cols=[1, 3])
+    with self.assertRaises(ValueError):
+      # Invalid column name should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]],
+          column_names=col_names,
+          label_name=None,
+          select_cols=["invalid_col_name"])
+
+  def testMakeCSVDataset_withSelectCols(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    # If select_cols is specified, should only yield a subset of columns
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=[[0], [0]],
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do default inference with select_cols
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do column name inference
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can specify column names instead of indices
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[col_names[1], col_names[3]])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
 
-  def test_make_csv_dataset_with_shuffle(self):
+  def testMakeCSVDataset_withShuffle(self):
     total_records = self._num_files * self._num_records
     defaults = self.DEFAULTS
     for batch_size in [1, 2]:
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index b8eb09978e..4ec8ae1c79 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -124,18 +124,21 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
 
 def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
                            na_value, header, comment, float_dtype,
-                           rows_for_inference):
+                           num_rows_for_inference, select_columns):
   """Infers column types from the first N valid CSV records of files."""
-  inferred_types = [None] * num_cols
+  if select_columns is None:
+    select_columns = range(num_cols)
+  inferred_types = [None] * len(select_columns)
 
-  for rows_read, csv_row in enumerate(
+  for i, csv_row in enumerate(
       _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
                     comment)):
-    if rows_for_inference is not None and rows_read >= rows_for_inference:
+    if num_rows_for_inference is not None and i >= num_rows_for_inference:
       break
-    for i, str_val in enumerate(csv_row):
-      inferred_types[i] = _infer_type(str_val, na_value, inferred_types[i],
-                                      float_dtype)
+
+    for j, col_index in enumerate(select_columns):
+      inferred_types[j] = _infer_type(csv_row[col_index], na_value,
+                                      inferred_types[j], float_dtype)
 
   # Replace None's with a default type
   inferred_types = [t or dtypes.string for t in inferred_types]
@@ -162,12 +165,37 @@ def _infer_column_names(filenames, field_delim, use_quote_delim):
   return column_names
 
 
+def _get_sorted_col_indices(select_columns, column_names):
+  """Transforms select_columns argument into sorted column indices."""
+  names_to_indices = {n: i for i, n in enumerate(column_names)}
+  num_cols = len(column_names)
+  for i, v in enumerate(select_columns):
+    if isinstance(v, int):
+      if v < 0 or v >= num_cols:
+        raise ValueError(
+            "Column index %d specified in select_columns out of valid range." %
+            v)
+      continue
+    if v not in names_to_indices:
+      raise ValueError(
+          "Value '%s' specified in select_columns not a valid column index or "
+          "name." % v)
+    select_columns[i] = names_to_indices[v]
+
+  # Sort and ensure there are no duplicates
+  result = sorted(set(select_columns))
+  if len(result) != len(select_columns):
+    raise ValueError("select_columns contains duplicate columns")
+  return result
+
+
 def make_csv_dataset(
     file_pattern,
     batch_size,
     column_names=None,
     column_defaults=None,
     label_name=None,
+    select_columns=None,
     field_delim=",",
     use_quote_delim=True,
     na_value="",
@@ -201,20 +229,32 @@ def make_csv_dataset(
       provided, infers the column names from the first row of the records.
       These names will be the keys of the features dict of each dataset element.
     column_defaults: A optional list of default values for the CSV fields. One
-      item per column of the input record. Each item in the list is either a
-      valid CSV dtype (float32, float64, int32, int64, or string), or a
+      item per selected column of the input record. Each item in the list is
+      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
       `Tensor` with one of the aforementioned types. The tensor can either be
       a scalar default value (if the column is optional), or an empty tensor (if
       the column is required). If a dtype is provided instead of a tensor, the
       column is also treated as required. If this list is not provided, tries
       to infer types based on reading the first num_rows_for_inference rows of
       files specified, and assumes all columns are optional, defaulting to `0`
-      for numeric values and `""` for string values.
+      for numeric values and `""` for string values. If both this and
+      `select_columns` are specified, these must have the same lengths, and
+      `column_defaults` is assumed to be sorted in order of increasing column
+      index.
     label_name: A optional string corresponding to the label column. If
       provided, the data for this column is returned as a separate `Tensor` from
       the features dictionary, so that the dataset complies with the format
       expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
       function.
+    select_columns: An optional list of integer indices or string column
+      names, that specifies a subset of columns of CSV data to select. If
+      column names are provided, these must correspond to names provided in
+      `column_names` or inferred from the file header lines. When this argument
+      is specified, only a subset of CSV columns will be parsed and returned,
+      corresponding to the columns specified. Using this results in faster
+      parsing and lower memory usage. If both this and `column_defaults` are
+      specified, these must have the same lengths, and `column_defaults` is
+      assumed to be sorted in order of increasing column index.
     field_delim: An optional `string`. Defaults to `","`. Char delimiter to
       separate fields in a record.
     use_quote_delim: An optional bool. Defaults to `True`. If false, treats
@@ -279,6 +319,9 @@ def make_csv_dataset(
   if len(column_names) != len(set(column_names)):
     raise ValueError("Cannot have duplicate column names.")
 
+  if select_columns is not None:
+    select_columns = _get_sorted_col_indices(select_columns, column_names)
+
   if column_defaults is not None:
     column_defaults = [
         constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
@@ -289,7 +332,17 @@ def make_csv_dataset(
     # construction time
     column_defaults = _infer_column_defaults(
         filenames, len(column_names), field_delim, use_quote_delim, na_value,
-        header, comment, default_float_type, num_rows_for_inference)
+        header, comment, default_float_type, num_rows_for_inference,
+        select_columns)
+
+  if select_columns is not None and len(column_defaults) != len(select_columns):
+    raise ValueError(
+        "If specified, column_defaults and select_columns must have same "
+        "length."
+    )
+  if select_columns is not None and len(column_names) > len(select_columns):
+    # Pick the relevant subset of column names
+    column_names = [column_names[i] for i in select_columns]
 
   if label_name is not None and label_name not in column_names:
     raise ValueError("`label_name` provided must be one of the columns.")
@@ -322,6 +375,7 @@ def make_csv_dataset(
         field_delim=field_delim,
         use_quote_delim=use_quote_delim,
         na_value=na_value,
+        select_cols=select_columns,
     )
     features = dict(zip(column_names, columns))
     if label_name is not None:
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 0c42f63252..3eed847c16 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -34,6 +34,19 @@ class DecodeCSVOp : public OpKernel {
                 errors::InvalidArgument("Out type too large"));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_quote_delim", &use_quote_delim_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("select_cols", &select_cols_));
+    OP_REQUIRES(
+        ctx, out_type_.size() == select_cols_.size() || select_cols_.empty(),
+        errors::InvalidArgument("select_cols should match output size"));
+    select_all_cols_ = select_cols_.empty();
+    for (int i = 1; i < select_cols_.size(); i++) {
+      OP_REQUIRES(ctx, select_cols_[i - 1] < select_cols_[i],
+                  errors::InvalidArgument(
+                      "select_cols should be strictly increasing indices"));
+    }
+    OP_REQUIRES(
+        ctx, select_cols_.empty() || select_cols_.front() >= 0,
+        errors::InvalidArgument("select_cols should be non-negative indices"));
     OP_REQUIRES(ctx, delim.size() == 1,
                 errors::InvalidArgument("field_delim should be only 1 char"));
     delim_ = delim[0];
@@ -183,13 +196,18 @@ class DecodeCSVOp : public OpKernel {
 
  private:
   std::vector<DataType> out_type_;
+  std::vector<int64> select_cols_;
   char delim_;
   bool use_quote_delim_;
+  bool select_all_cols_;
   string na_value_;
 
   void ExtractFields(OpKernelContext* ctx, StringPiece input,
                      std::vector<string>* result) {
     int64 current_idx = 0;
+    int64 num_fields_parsed = 0;
+    int64 selector_idx = 0;  // Keep track of index into select_cols
+
     if (!input.empty()) {
       while (static_cast<size_t>(current_idx) < input.size()) {
         if (input[current_idx] == '\n' || input[current_idx] == '\r') {
@@ -198,6 +216,10 @@ class DecodeCSVOp : public OpKernel {
         }
 
         bool quoted = false;
+        bool include =
+            (select_all_cols_ || select_cols_[selector_idx] ==
+                                     static_cast<size_t>(num_fields_parsed));
+
         if (use_quote_delim_ && input[current_idx] == '"') {
           quoted = true;
           current_idx++;
@@ -214,7 +236,7 @@ class DecodeCSVOp : public OpKernel {
                             input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
-            field += input[current_idx];
+            if (include) field += input[current_idx];
             current_idx++;
           }
 
@@ -226,14 +248,14 @@ class DecodeCSVOp : public OpKernel {
               (static_cast<size_t>(current_idx) < input.size() - 1) &&
               (input[current_idx] != '"' || input[current_idx + 1] != delim_)) {
             if (input[current_idx] != '"') {
-              field += input[current_idx];
+              if (include) field += input[current_idx];
               current_idx++;
             } else {
               OP_REQUIRES(
                   ctx, input[current_idx + 1] == '"',
                   errors::InvalidArgument("Quote inside a string has to be "
                                           "escaped by another quote"));
-              field += '"';
+              if (include) field += '"';
               current_idx += 2;
             }
           }
@@ -250,11 +272,20 @@ class DecodeCSVOp : public OpKernel {
           current_idx += 2;
         }
 
-        result->push_back(field);
+        num_fields_parsed++;
+        if (include) {
+          result->push_back(field);
+          selector_idx++;
+          if (selector_idx == select_cols_.size()) return;
+        }
       }
 
+      bool include =
+          (select_all_cols_ || select_cols_[selector_idx] ==
+                                   static_cast<size_t>(num_fields_parsed));
       // Check if the last field is missing
-      if (input[input.size() - 1] == delim_) result->push_back(string());
+      if (include && input[input.size() - 1] == delim_)
+        result->push_back(string());
     }
   }
 };
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index ddd2aa9274..ddb714b4e9 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -245,6 +245,7 @@ REGISTER_OP("DecodeCSV")
     .Attr("field_delim: string = ','")
     .Attr("use_quote_delim: bool = true")
     .Attr("na_value: string = ''")
+    .Attr("select_cols: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       // Validate the record_defaults inputs.
       for (int i = 1; i < c->num_inputs(); ++i) {
diff --git a/tensorflow/python/kernel_tests/decode_csv_op_test.py b/tensorflow/python/kernel_tests/decode_csv_op_test.py
index fec52fa9cc..4f49d72676 100644
--- a/tensorflow/python/kernel_tests/decode_csv_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_csv_op_test.py
@@ -78,9 +78,11 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(args, expected_out)
 
   def test2DNoQuoteDelimiter(self):
-    args = {"records": [["1", "2"], ['""', '"']],
-            "record_defaults": [[""]],
-            "use_quote_delim": False}
+    args = {
+        "records": [["1", "2"], ['""', '"']],
+        "record_defaults": [[""]],
+        "use_quote_delim": False
+    }
     expected_out = [[[b"1", b"2"], [b'""', b'"']]]
 
     self._test(args, expected_out)
@@ -88,8 +90,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testDouble(self):
     args = {
         "records": ["1.0", "-1.79e+308", '"1.79e+308"'],
-        "record_defaults": [np.array(
-            [], dtype=np.double)],
+        "record_defaults": [np.array([], dtype=np.double)],
     }
 
     expected_out = [[1.0, -1.79e+308, 1.79e+308]]
@@ -99,8 +100,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testInt64(self):
     args = {
         "records": ["1", "2", '"2147483648"'],
-        "record_defaults": [np.array(
-            [], dtype=np.int64)],
+        "record_defaults": [np.array([], dtype=np.int64)],
     }
 
     expected_out = [[1, 2, 2147483648]]
@@ -173,8 +173,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWithoutDefaultsError(self):
     args = {
         "records": [",1", "0.2,3", "3.0,"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -183,8 +182,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldIntError(self):
     args = {
         "records": [",1", "0.2,234a", "3.0,2"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -202,8 +200,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldFloatError(self):
     args = {
         "records": [",1", "0.2,2", "3.0adf,3"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -229,6 +226,73 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(
         args, expected_err_re="Quoted field has to end with quote followed.*")
 
+  def testSelectCols(self):
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[1], [2]],
+        "select_cols": [0, 1]
+    }
+    expected_out = [[1, 4], [2, 5]]
+    self._test(args, expected_out)
+
+  def testSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 2]
+    }
+    expected_out = [[0, 4], [1, 5], [2, 6]]
+    self._test(args, expected_out)
+
+  def testWrongSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 3]
+    }
+    self._test(args, expected_err_re="Expect 3 fields but have 2 in record 0")
+
+  def testWrongSelectColsLen(self):
+    args = {
+        "records": ["1,2,3", "4,5,6"],
+        "record_defaults": [[0], [0], [0]],
+        "select_cols": [0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "Length of select_cols and record_defaults do not match."):
+      self._test(args)
+
+  def testWrongSelectColsSorting(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [1, 0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols is not strictly increasing."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesNegative(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [-1, 0]  # -1 is not a valid index
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols contains negative values."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesTooHigh(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [0, 3]  # 3 is not a valid index
+    }
+    # Only successfully parses one of the columns
+    self._test(args, expected_err_re="Expect 2 fields but have 1 in record 0")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 075b38d743..d8d9af545f 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -1176,8 +1176,13 @@ def _parse_single_sequence_example_raw(serialized,
 
 # Swap `name` and `na_value` for backward compatibility.
 @tf_export("decode_csv")
-def decode_csv(records, record_defaults, field_delim=",",
-               use_quote_delim=True, name=None, na_value=""):
+def decode_csv(records,
+               record_defaults,
+               field_delim=",",
+               use_quote_delim=True,
+               name=None,
+               na_value="",
+               select_cols=None):
   """Convert CSV records to tensors. Each column maps to one tensor.
 
   RFC 4180 format is expected for the CSV records.
@@ -1200,19 +1205,32 @@ def decode_csv(records, record_defaults, field_delim=",",
       Bullet 5).
     name: A name for the operation (optional).
     na_value: Additional string to recognize as NA/NaN.
+    select_cols: Optional sorted list of column indices to select. If specified,
+      only this subset of columns will be parsed and returned.
 
   Returns:
     A list of `Tensor` objects. Has the same type as `record_defaults`.
     Each tensor will have the same shape as records.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
   """
-  # TODO(martinwicke), remove the wrapper when new Python API generator is done.
+  if select_cols is not None and any(select_cols[i] >= select_cols[i + 1]
+                                     for i in range(len(select_cols) - 1)):
+    raise ValueError("select_cols is not strictly increasing.")
+  if select_cols is not None and select_cols[0] < 0:
+    raise ValueError("select_cols contains negative values.")
+  if select_cols is not None and len(select_cols) != len(record_defaults):
+    raise ValueError("Length of select_cols and record_defaults do not match.")
   return gen_parsing_ops.decode_csv(
       records=records,
       record_defaults=record_defaults,
       field_delim=field_delim,
       use_quote_delim=use_quote_delim,
       na_value=na_value,
-      name=name)
+      name=name,
+      select_cols=select_cols,
+  )
 
 
 # TODO(b/70890287): Combine the implementation of this op and
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index afa3b78eb7..be64fd19d8 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -914,7 +914,7 @@ tf_module {
   }
   member_method {
     name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\'], "
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
   }
   member_method {
     name: "decode_json_example"
-- 
GitLab


From 2782350e7ec81867dc68b0fb8bf0a6ca3dde5c12 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 18:08:13 -0700
Subject: [PATCH 0488/1262] [XLA] Redesign: implement fft.

PiperOrigin-RevId: 192220070
---
 .../compiler/xla/client/xla_client/xla_builder.cc | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index ed9f994d39..170dd59c79 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -777,7 +777,20 @@ XlaOp XlaBuilder::ConvGeneralDilated(
 
 XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
                       const tensorflow::gtl::ArraySlice<int64> fft_length) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
+
+    instr.set_fft_type(fft_type);
+    for (int64 i : fft_length) {
+      instr.add_fft_length(i);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kFft, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
-- 
GitLab


From 0166491e89487ce6cd6e10bfa77ba82ea42d8a59 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 18:46:09 -0700
Subject: [PATCH 0489/1262]   Add the total device time and total host time to
 tf_op_stats.

PiperOrigin-RevId: 192223560
---
 tensorflow/contrib/tpu/profiler/tf_op_stats.proto | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 2a15875627..63955d1806 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -66,6 +66,10 @@ message OpMetricsDbResult {
   // The total of the difference between the start times of two
   // consecutive infeed-enqueues (per host) in picoseconds.
   optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
+  // The total device time in microseconds.
+  optional double total_device_time_in_us = 4;
+  // The total host time in microseconds.
+  optional double total_host_time_in_us = 5;
 }
 
 // Result proto for StepInfo.
-- 
GitLab


From a45caf444228dbb57343092dc3054f85ff081ef6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 18:53:22 -0700
Subject: [PATCH 0490/1262] Include standard headers when libc++ requires them.

PiperOrigin-RevId: 192224104
---
 tensorflow/contrib/lite/allocation.cc             | 1 +
 tensorflow/contrib/lite/arena_planner.cc          | 1 +
 tensorflow/core/framework/attr_value_util_test.cc | 1 +
 tensorflow/core/grappler/costs/robust_stats.cc    | 1 +
 4 files changed, 4 insertions(+)

diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
index 4b322e027d..a4772731ec 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include <utility>
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index 8e47e2375e..4f836d3677 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/arena_planner.h"
+#include <utility>
 
 namespace tflite {
 
diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc
index e4fad917ff..1a3994736c 100644
--- a/tensorflow/core/framework/attr_value_util_test.cc
+++ b/tensorflow/core/framework/attr_value_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 
+#include <numeric>
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
index 9866bc8688..5151b87c59 100644
--- a/tensorflow/core/grappler/costs/robust_stats.cc
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include <algorithm>
 #include <cmath>
+#include <utility>
 
 namespace tensorflow {
 namespace grappler {
-- 
GitLab


From d09bd1b06c4f4c8efad15c1e77c8c54710ccf077 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 19:06:05 -0700
Subject: [PATCH 0491/1262] [XLA] Attach a reference client to the
 client_library_test_base, and implement ComputeAndCompare for XlaBuilder. -
 In ComputeAndCompare(XlaBuilder..), compute the reference result by executing
 on the interpreter backend. - Also migrate some tests to use the new
 ComputeAndCompare(XlaBuilder..).

PiperOrigin-RevId: 192225152
---
 tensorflow/compiler/xla/tests/BUILD           |  4 +
 .../xla/tests/client_library_test_base.cc     | 95 +++++++++++++++++++
 .../xla/tests/client_library_test_base.h      | 26 ++++-
 tensorflow/compiler/xla/tests/reduce_test.cc  | 19 ++--
 4 files changed, 133 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 218345772f..8ecb421780 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -191,6 +191,8 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -1063,6 +1065,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index c2e3cd2350..312d8f284d 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,6 +36,10 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 namespace {
+
+// Name of the interpreter backend.
+constexpr char kInterpreter[] = "interpreter";
+
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
 Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
@@ -43,6 +48,14 @@ Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
   TF_CHECK_OK(result.status()) << " could not create local client for testing";
   return result.ValueOrDie();
 }
+
+// Helper functions to get the reference platform.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
 }  // namespace
 
 ClientLibraryTestBase::ClientLibraryTestBase(
@@ -66,6 +79,11 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   client_ = GetOrCreateLocalClientOrDie(default_options);
+
+  LocalClientOptions ref_options;
+  ref_options.set_platform(GetReferencePlatform());
+  ref_client_ = GetOrCreateLocalClientOrDie(ref_options);
+
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
 }
@@ -127,6 +145,20 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
   return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>>
+ClientLibraryTestBase::ExecuteAndTransferReference(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  ExecutionOptions execution_options = execution_options_;
+  if (shape_with_output_layout != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *shape_with_output_layout;
+  }
+  return ref_client_->ExecuteAndTransfer(computation, arguments,
+                                         &execution_options);
+}
+
 std::unique_ptr<GlobalData> ClientLibraryTestBase::ExecuteOrDie(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
@@ -521,6 +553,69 @@ ClientLibraryTestBase::ComputeValueAndReference(
   return std::make_pair(std::move(reference), std::move(result));
 }
 
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*reference, *result);
+}
+
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments,
+    ErrorSpec error) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectNear(*reference, *result, error);
+}
+
+StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+ClientLibraryTestBase::ComputeValueAndReference(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  // Transfer the arguments to the executor service. We put the unique_ptr's
+  // into a vector to keep the data alive on the service until the end of this
+  // function.
+  std::vector<std::unique_ptr<GlobalData>> argument_data;
+  std::vector<std::unique_ptr<GlobalData>> ref_argument_data;
+  for (const auto& arg : arguments) {
+    TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone()));
+    TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg));
+    argument_data.push_back(std::move(data));
+    ref_argument_data.push_back(std::move(ref_data));
+  }
+
+  // Create raw pointers to the GlobalData for the rest of the call stack.
+  std::vector<GlobalData*> argument_data_ptr;
+  std::transform(
+      argument_data.begin(), argument_data.end(),
+      std::back_inserter(argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+  std::vector<GlobalData*> ref_argument_data_ptr;
+  std::transform(
+      ref_argument_data.begin(), ref_argument_data.end(),
+      std::back_inserter(ref_argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+
+  TF_ASSIGN_OR_RETURN(auto result,
+                      ExecuteAndTransfer(computation, argument_data_ptr));
+
+  TF_ASSIGN_OR_RETURN(auto reference, ExecuteAndTransferReference(
+                                          computation, ref_argument_data_ptr));
+
+  return std::make_pair(std::move(reference), std::move(result));
+}
+
 Computation ClientLibraryTestBase::CreateScalarRelu() {
   ComputationBuilder builder(client_, "relu");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 0572acff88..b3212dd228 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -114,6 +114,14 @@ class ClientLibraryTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
 
+  // This executes the computation via the reference client (which connects a
+  // interpreter backend). The result is used as the expected values of the
+  // computation.
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransferReference(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
+
   // Convenience OrDie variants of above methods.
   std::unique_ptr<GlobalData> ExecuteOrDie(
       ComputationBuilder* builder,
@@ -236,6 +244,14 @@ class ClientLibraryTestBase : public ::testing::Test {
                          tensorflow::gtl::ArraySlice<Literal> arguments,
                          ErrorSpec error);
 
+  // Convenience method for running a built computation and comparing the result
+  // with the reference result.
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments);
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments,
+                         ErrorSpec error);
+
   // Create scalar operations for use in reductions.
   Computation CreateScalarRelu();
   Computation CreateScalarMax();
@@ -413,6 +429,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
 
   Client* client_;
+  Client* ref_client_;  // To compute reference result.
   ExecutionOptions execution_options_;
 
  private:
@@ -444,12 +461,19 @@ class ClientLibraryTestBase : public ::testing::Test {
       const Shape* output_with_layout = nullptr);
 
   // Executes the computation and calculates the expected reference value using
-  // the HloEvaluator. Returns two literal in the order of (expected, actual).
+  // the HloEvaluator. Returns two literals in the order of (expected, actual).
   StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
   ComputeValueAndReference(ComputationBuilder* builder,
                            const ComputationDataHandle& operand,
                            tensorflow::gtl::ArraySlice<Literal> arguments);
 
+  // Executes the computation and calculates the expected reference value using
+  // the reference client. Returns two literals in the order of (expected,
+  // actual).
+  StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+  ComputeValueAndReference(XlaBuilder* builder,
+                           tensorflow::gtl::ArraySlice<Literal> arguments);
+
   // Whether to run tests with all float-type input/output converted to
   // bfloat16.
   bool use_bfloat16_ = false;
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index d24927d22b..768beec15e 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -39,6 +39,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -502,21 +504,18 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
 // Test that algebraic simplifier does not incorrectly fold a transpose into a
 // reduction operation.
 XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
-  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
-  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
-  ComputationDataHandle transpose =
-      builder.Transpose(input, /*permutation=*/{1, 0, 2});
-  ComputationDataHandle reduce =
-      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+  XlaOp input = builder.Parameter(0, input_shape, "input");
+  XlaOp zero = builder.ConstantR0<float>(0.0);
+  XlaOp transpose = builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
                           MakeFakeLiteral(input_shape));
 
-  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
-                    ErrorSpec(0.01, 1e-4));
+  ComputeAndCompare(&builder, {std::move(*input_data)}, ErrorSpec(0.01, 1e-4));
 }
 
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
-- 
GitLab


From e3e0af4bd9b1d7a4628a5a4d6901a2d8529cfda5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 19:19:38 -0700
Subject: [PATCH 0492/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192226063
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 59 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  8 +++
 2 files changed, 67 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 026bfa89cf..fe4b7a7be0 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -16407,6 +16407,65 @@ op {
     }
   }
 }
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
 op {
   name: "DecodeCompressed"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index b61a3b0e64..9950388357 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7361,6 +7361,14 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
   name: "DecodeCompressed"
-- 
GitLab


From a356b2128a9bdbc33eceeff4b058f4d5d2e97738 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 19:46:01 -0700
Subject: [PATCH 0493/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 192227995

---
 tensorflow/go/op/wrappers.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 3d261c9d0a..09da8c1892 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -13882,6 +13882,14 @@ func DecodeCSVNaValue(value string) DecodeCSVAttr {
 	}
 }
 
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
 // Convert CSV records to tensors. Each column maps to one tensor.
 //
 // RFC 4180 format is expected for the CSV records.
-- 
GitLab


From 21820d31f24f666d2ae642c633a19aed17a4f477 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 9 Apr 2018 20:09:19 -0700
Subject: [PATCH 0494/1262] Addressed comments.

- Free all un-popped GenericNamespace objects in destructor (not
  just the one garunteed to be there).
- Renamed test file to test.java.snippet.
---
 tensorflow/java/BUILD                                 |  2 +-
 tensorflow/java/src/gen/cc/source_writer.cc           | 11 +++++++----
 tensorflow/java/src/gen/cc/source_writer_test.cc      |  2 +-
 .../{test.snippet.java.txt => test.java.snippet}      |  0
 4 files changed, 9 insertions(+), 6 deletions(-)
 rename tensorflow/java/src/gen/resources/{test.snippet.java.txt => test.java.snippet} (100%)

diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 4b558af2ac..ab7d698a45 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -312,7 +312,7 @@ tf_cc_test(
         "src/gen/cc/source_writer_test.cc",
     ],
     data = [
-        "src/gen/resources/test.snippet.java.txt",
+        "src/gen/resources/test.java.snippet",
     ],
     deps = [
         ":java_op_gen_lib",
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index c57389f6c5..a02f75ad6e 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -28,10 +28,13 @@ SourceWriter::SourceWriter() {
 }
 
 SourceWriter::~SourceWriter() {
-  // Remove empty generic namespace added at start.
-  GenericNamespace* generic_namespace = generic_namespaces_.top();
-  generic_namespaces_.pop();
-  delete generic_namespace;
+  // Remove empty generic namespace added at start as well as any other
+  // namespace objects that haven't been removed.
+  while (!generic_namespaces_.empty()) {
+    GenericNamespace* generic_namespace = generic_namespaces_.top();
+    generic_namespaces_.pop();
+    delete generic_namespace;
+  }
 }
 
 SourceWriter& SourceWriter::Indent(int tab) {
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index cbde64683b..4bce2fea70 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -261,7 +261,7 @@ TEST(StreamTest, FileSnippet) {
   SourceBufferWriter writer;
   const string fname = tensorflow::io::JoinPath(
       tensorflow::testing::TensorFlowSrcRoot(),
-      "java/src/gen/resources/test.snippet.java.txt");
+      "java/src/gen/resources/test.java.snippet");
 
   writer.WriteFromFile(fname)
         .BeginBlock()
diff --git a/tensorflow/java/src/gen/resources/test.snippet.java.txt b/tensorflow/java/src/gen/resources/test.java.snippet
similarity index 100%
rename from tensorflow/java/src/gen/resources/test.snippet.java.txt
rename to tensorflow/java/src/gen/resources/test.java.snippet
-- 
GitLab


From 405553005f9742203d1f0ac0c0a1740fe19766bd Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 9 Apr 2018 20:26:16 -0700
Subject: [PATCH 0495/1262] Fix the GCS Kokoro build badge links.

The original badges I added were redirected/authenticated links (I think). This
resulted in broken images on the README. These new links should not be
redirected and should just link to the badge images.

PiperOrigin-RevId: 192230689
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 177265500f..a69cf1ffea 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 
 | **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.cloud.google.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.cloud.google.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.cloud.google.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
-- 
GitLab


From 1c8f3c81698b67b8fffce86c97df27d392b84cb8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 21:36:36 -0700
Subject: [PATCH 0496/1262] Updating three tests in constant_folding_test.cc
 with PlaceHolders Nodes to check EvaluateNodes returns the same output for
 the original and optimized graph

PiperOrigin-RevId: 192235310
---
 .../optimizers/constant_folding_test.cc       | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 08c92687e3..31abe43846 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1925,6 +1925,14 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch.push_back("reshape");
 
+  auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  Tensor indices_t(DT_INT32, TensorShape({2}));
+  indices_t.flat<int>()(0) = 0;
+  indices_t.flat<int>()(1) = 1;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -1951,6 +1959,11 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, LargeConstant) {
@@ -2047,6 +2060,23 @@ TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  // Evaluate id_true when input tensor x is true.
+  Tensor x_t(DT_BOOL, TensorShape({}));
+  x_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(item.graph, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
+
+  // Evalute id_false when input tensor is false.
+  x_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  tensors = EvaluateNodes(output, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
@@ -2288,6 +2318,15 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
       EXPECT_EQ("^id_n", node.input(0));
     }
   }
+
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+  }
 }
 
 TEST_F(ConstantFoldingTest, TrivialPack) {
-- 
GitLab


From 3f3a6e6685449130389e0b8d76f6ba0fc457bcfe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 22:01:59 -0700
Subject: [PATCH 0497/1262] Implementation of ArgMax

PiperOrigin-RevId: 192236845
---
 tensorflow/contrib/lite/builtin_op_data.h     |   4 +
 tensorflow/contrib/lite/builtin_ops.h         |   1 +
 tensorflow/contrib/lite/kernels/BUILD         |  17 ++
 tensorflow/contrib/lite/kernels/arg_max.cc    | 178 ++++++++++++++++++
 .../contrib/lite/kernels/arg_max_test.cc      | 107 +++++++++++
 tensorflow/contrib/lite/kernels/register.cc   |   2 +
 tensorflow/contrib/lite/model.cc              |   9 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |   6 +
 .../contrib/lite/schema/schema_generated.h    | 141 +++++++++++++-
 tensorflow/contrib/lite/testing/BUILD         |   1 +
 .../contrib/lite/testing/generate_examples.py |  38 +++-
 .../testing/generated_examples_zip_test.cc    |   6 +
 .../contrib/lite/toco/tflite/operator.cc      |  19 ++
 .../contrib/lite/toco/tflite/operator_test.cc |   7 +
 15 files changed, 529 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/arg_max.cc
 create mode 100644 tensorflow/contrib/lite/kernels/arg_max_test.cc

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 2b6c24768c..f5fb2f15e3 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -221,6 +221,10 @@ typedef struct {
   int shrink_axis_mask;
 } TfLiteStridedSliceParams;
 
+typedef struct {
+  TfLiteType output_type;
+} TfLiteArgMaxParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 17b791e4e2..e11c7fb2e4 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -80,6 +80,7 @@ typedef enum {
   kTfLiteBuiltinCast = 53,
   kTfLiteBuiltinPrelu = 54,
   kTfLiteBuiltinMaximum = 55,
+  kTfLiteBuiltinArgMax = 56,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index df0f3cbeb0..b79900623e 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -135,6 +135,7 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "arg_max.cc",
         "audio_spectrogram.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
@@ -270,6 +271,22 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "arg_max_test",
+    size = "small",
+    srcs = ["arg_max_test.cc"],
+    tags = [
+        "tflite_not_portable_ios_arm64",
+        "tflite_not_portable_ios_x86_64",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "div_test",
     size = "small",
diff --git a/tensorflow/contrib/lite/kernels/arg_max.cc b/tensorflow/contrib/lite/kernels/arg_max.cc
new file mode 100644
index 0000000000..a2c5e4cead
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max.cc
@@ -0,0 +1,178 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace arg_max {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxis = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  // Make sure the axis is only 1 dimension.
+  TF_LITE_ENSURE_EQ(context, NumElements(axis), 1);
+
+  // Make sure the axis is only either int32 or int64.
+  TF_LITE_ENSURE(context,
+                 axis->type == kTfLiteInt32 || axis->type == kTfLiteInt64);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  auto* params = reinterpret_cast<TfLiteArgMaxParams*>(node->builtin_data);
+  switch (params->output_type) {
+    case kTfLiteInt32:
+      output->type = kTfLiteInt32;
+      break;
+    case kTfLiteInt64:
+      output->type = kTfLiteInt64;
+      break;
+    default:
+      context->ReportError(context, "Unknown index output data type");
+      return kTfLiteError;
+  }
+
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt32:
+      break;
+
+    default:
+      context->ReportError(context, "Only float32 and int types are supported");
+      return kTfLiteError;
+  }
+
+  // Copy the input dimensions to output except make the last dimension 1.
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  output_size->data[NumDimensions(input) - 1] = 1;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// The current impl actually ignores the axis argument.
+// Only determine the index of the maximum value in the last dimension.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                     \
+  TF_LITE_ENSURE_EQ(context, GetTensorData<axis_type>(axis)[0], 3);            \
+  optimized_ops::ArgMax(GetTensorData<axis_type>(axis),                        \
+                        GetTensorData<data_type>(input), GetTensorDims(input), \
+                        GetTensorData<output_type>(output),                    \
+                        GetTensorDims(output))
+  if (axis->type == kTfLiteInt32) {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  } else {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_ARG_MAX
+
+  return kTfLiteOk;
+}
+
+}  // namespace arg_max
+
+TfLiteRegistration* Register_ARG_MAX() {
+  static TfLiteRegistration r = {nullptr, nullptr, arg_max::Prepare,
+                                 arg_max::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/arg_max_test.cc b/tensorflow/contrib/lite/kernels/arg_max_test.cc
new file mode 100644
index 0000000000..f4e1da3a6e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max_test.cc
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ArgMaxOpModel : public SingleOpModel {
+ public:
+  ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+                TensorType output_type, TensorType index_output_type) {
+    input_ = AddInput(input_type);
+    axis_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions,
+                 CreateArgMaxOptions(builder_, index_output_type).Union());
+    BuildInterpreter({input_shape, {1, 1, 1, 1}});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, GetMaxArgFloat) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32,
+                               TensorType_INT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
+  ArgMaxOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgOutput64) {
+  ArgMaxOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64,
+                               TensorType_INT64);
+  model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 0f98154b90..384e1afaa4 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -77,6 +77,7 @@ TfLiteRegistration* Register_CAST();
 TfLiteRegistration* Register_DEQUANTIZE();
 TfLiteRegistration* Register_PRELU();
 TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_ARG_MAX();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -135,6 +136,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 3448de68e8..921c139e30 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -653,6 +653,15 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_MAXIMUM: {
       break;
     }
+    case BuiltinOperator_ARG_MAX: {
+      auto* params = MallocPOD<TfLiteArgMaxParams>();
+      if (auto* schema_params = op->builtin_options_as_ArgMaxOptions()) {
+        ConvertTensorType(schema_params->output_type(), &params->output_type,
+                          error_reporter);
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index bc13444dc7..04d53d955a 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -351,6 +351,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_CAST:
       case tflite::BuiltinOperator_PRELU:
       case tflite::BuiltinOperator_MAXIMUM:
+      case tflite::BuiltinOperator_ARG_MAX:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index c63bfb28cc..238a406af5 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -132,6 +132,7 @@ enum BuiltinOperator : byte {
   CAST = 53,
   PRELU = 54,
   MAXIMUM = 55,
+  ARG_MAX = 56,
 }
 
 // Options for the builtin operators.
@@ -175,6 +176,7 @@ union BuiltinOptions {
   CastOptions,
   DequantizeOptions,
   MaximumOptions,
+  ArgMaxOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -391,6 +393,10 @@ table DequantizeOptions {
 table MaximumOptions {
 }
 
+table ArgMaxOptions {
+  output_type : TensorType;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 0735be5c8f..8b355b0dc6 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -148,6 +148,9 @@ struct DequantizeOptionsT;
 struct MaximumOptions;
 struct MaximumOptionsT;
 
+struct ArgMaxOptions;
+struct ArgMaxOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -259,11 +262,12 @@ enum BuiltinOperator {
   BuiltinOperator_CAST = 53,
   BuiltinOperator_PRELU = 54,
   BuiltinOperator_MAXIMUM = 55,
+  BuiltinOperator_ARG_MAX = 56,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_MAXIMUM
+  BuiltinOperator_MAX = BuiltinOperator_ARG_MAX
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[55] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -318,7 +322,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
     BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
     BuiltinOperator_CAST,
     BuiltinOperator_PRELU,
-    BuiltinOperator_MAXIMUM
+    BuiltinOperator_MAXIMUM,
+    BuiltinOperator_ARG_MAX
   };
   return values;
 }
@@ -381,6 +386,7 @@ inline const char **EnumNamesBuiltinOperator() {
     "CAST",
     "PRELU",
     "MAXIMUM",
+    "ARG_MAX",
     nullptr
   };
   return names;
@@ -432,11 +438,12 @@ enum BuiltinOptions {
   BuiltinOptions_CastOptions = 37,
   BuiltinOptions_DequantizeOptions = 38,
   BuiltinOptions_MaximumOptions = 39,
+  BuiltinOptions_ArgMaxOptions = 40,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_MaximumOptions
+  BuiltinOptions_MAX = BuiltinOptions_ArgMaxOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[41] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -477,7 +484,8 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
     BuiltinOptions_LogSoftmaxOptions,
     BuiltinOptions_CastOptions,
     BuiltinOptions_DequantizeOptions,
-    BuiltinOptions_MaximumOptions
+    BuiltinOptions_MaximumOptions,
+    BuiltinOptions_ArgMaxOptions
   };
   return values;
 }
@@ -524,6 +532,7 @@ inline const char **EnumNamesBuiltinOptions() {
     "CastOptions",
     "DequantizeOptions",
     "MaximumOptions",
+    "ArgMaxOptions",
     nullptr
   };
   return names;
@@ -694,6 +703,10 @@ template<> struct BuiltinOptionsTraits<MaximumOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_MaximumOptions;
 };
 
+template<> struct BuiltinOptionsTraits<ArgMaxOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1037,6 +1050,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_MaximumOptions ?
       reinterpret_cast<const MaximumOptionsT *>(value) : nullptr;
   }
+  ArgMaxOptionsT *AsArgMaxOptions() {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<ArgMaxOptionsT *>(value) : nullptr;
+  }
+  const ArgMaxOptionsT *AsArgMaxOptions() const {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<const ArgMaxOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -3846,6 +3867,60 @@ inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(
 
 flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct ArgMaxOptionsT : public flatbuffers::NativeTable {
+  typedef ArgMaxOptions TableType;
+  TensorType output_type;
+  ArgMaxOptionsT()
+      : output_type(TensorType_FLOAT32) {
+  }
+};
+
+struct ArgMaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ArgMaxOptionsT NativeTableType;
+  enum {
+    VT_OUTPUT_TYPE = 4
+  };
+  TensorType output_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUTPUT_TYPE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_OUTPUT_TYPE) &&
+           verifier.EndTable();
+  }
+  ArgMaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ArgMaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ArgMaxOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_output_type(TensorType output_type) {
+    fbb_.AddElement<int8_t>(ArgMaxOptions::VT_OUTPUT_TYPE, static_cast<int8_t>(output_type), 0);
+  }
+  explicit ArgMaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ArgMaxOptionsBuilder &operator=(const ArgMaxOptionsBuilder &);
+  flatbuffers::Offset<ArgMaxOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ArgMaxOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType output_type = TensorType_FLOAT32) {
+  ArgMaxOptionsBuilder builder_(_fbb);
+  builder_.add_output_type(output_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -4080,6 +4155,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const MaximumOptions *builtin_options_as_MaximumOptions() const {
     return builtin_options_type() == BuiltinOptions_MaximumOptions ? static_cast<const MaximumOptions *>(builtin_options()) : nullptr;
   }
+  const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const {
+    return builtin_options_type() == BuiltinOptions_ArgMaxOptions ? static_cast<const ArgMaxOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -4262,6 +4340,10 @@ template<> inline const MaximumOptions *Operator::builtin_options_as<MaximumOpti
   return builtin_options_as_MaximumOptions();
 }
 
+template<> inline const ArgMaxOptions *Operator::builtin_options_as<ArgMaxOptions>() const {
+  return builtin_options_as_ArgMaxOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -5819,6 +5901,32 @@ inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::Fla
       _fbb);
 }
 
+inline ArgMaxOptionsT *ArgMaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ArgMaxOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ArgMaxOptions::UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = output_type(); _o->output_type = _e; };
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> ArgMaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateArgMaxOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ArgMaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _output_type = _o->output_type;
+  return tflite::CreateArgMaxOptions(
+      _fbb,
+      _output_type);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -6155,6 +6263,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -6329,6 +6441,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -6491,6 +6607,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const MaximumOptionsT *>(value);
       return CreateMaximumOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptionsT *>(value);
+      return CreateArgMaxOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -6653,6 +6773,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new MaximumOptionsT(*reinterpret_cast<MaximumOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_ArgMaxOptions: {
+      value = new ArgMaxOptionsT(*reinterpret_cast<ArgMaxOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -6855,6 +6979,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<ArgMaxOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 62f20638ba..386cfdb524 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -18,6 +18,7 @@ gen_zipped_test_files(
     name = "optest",
     files = [
         "add.zip",
+        "arg_max.zip",
         "avg_pool.zip",
         "batch_to_space_nd.zip",
         "concat.zip",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index f919517e93..42aa92c1bb 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -104,6 +104,10 @@ KNOWN_BUGS = {
     r"strided_slice.*begin=\[0\].*end=\[1\].*": "73170889",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
+    # Needs support for dimensions other than the last one in argmax.
+    r"arg_max.*axis=0.*": "77546240",
+    r"arg_max.*axis=1.*": "77546240",
+    r"arg_max.*axis=2.*": "77546240",
 }
 
 
@@ -1954,7 +1958,7 @@ def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
 
 
 def make_topk_tests(zip_path):
-  """Make a set of tests to do gather."""
+  """Make a set of tests to do topk."""
 
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
@@ -1962,7 +1966,7 @@ def make_topk_tests(zip_path):
   }]
 
   def build_graph(parameters):
-    """Build the gather op testing graph."""
+    """Build the topk op testing graph."""
     input_value = tf.placeholder(
         dtype=parameters["input_dtype"],
         name="input",
@@ -1979,6 +1983,36 @@ def make_topk_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+
+def make_arg_max_tests(zip_path):
+  """Make a set of tests to do arg_max."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
+      "axis": [0, 1, 2, 3],
+      "output_type": [tf.int32, tf.int64],
+  }]
+
+  def build_graph(parameters):
+    """Build the topk op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    axis = tf.constant(parameters["axis"], name="axis")
+    out = tf.arg_max(input_value, axis, output_type=parameters["output_type"])
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 6697b86e79..291c974545 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -94,6 +94,11 @@ std::map<string, string> kBrokenTests = {
 
     // No support for axis!=0 in GatherV2.
     {R"(^\/gather.*axis=1)", "76910444"},
+
+    // No support for arbitrary dimensions in ArgMax.
+    {R"(^\/arg_max.*axis=0)", "77546240"},
+    {R"(^\/arg_max.*axis=1)", "77546240"},
+    {R"(^\/arg_max.*axis=2)", "77546240"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -236,6 +241,7 @@ TEST_P(OpsTest, RunStuff) {
       ::testing::ValuesIn(UnarchiveZipAndFindTestNames(#zip_base ".zip")));
 
 INSTANTIATE_TESTS(add)
+INSTANTIATE_TESTS(arg_max)
 INSTANTIATE_TESTS(avg_pool)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index f991529569..4df16827b4 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -662,6 +662,23 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
                    TocoOperator* op) const override {}
 };
 
+class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
+                                      ::tflite::BuiltinOptions_ArgMaxOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateArgMaxOptions(
+        *builder, DataType::Serialize(op.output_data_type));
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->output_data_type = DataType::Deserialize(options.output_type());
+  }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -834,6 +851,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
   ops.emplace_back(
       new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
+  ops.emplace_back(
+      new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
 
   // Custom Operators.
   ops.emplace_back(
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 4783843b7f..5546bda696 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -391,6 +391,13 @@ TEST_F(OperatorTest, BuiltinTopKV2) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinArgMax) {
+  ArgMaxOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("ARG_MAX", OperatorType::kArgMax), op);
+  EXPECT_EQ(op.output_data_type, output_toco_op->output_data_type);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
-- 
GitLab


From 022d1fe0ecb46a2a7b77f8b99de8c273ef804c82 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 22:04:04 -0700
Subject: [PATCH 0498/1262] [XLA] Redesign: implement XlaBuilder::IsConstant,
 XlaBuidler::BuildConstantSubGraph, and
 Client::ComputeConstant(XlaComputation...). - Since the builder no longer
 holds a client, we moved the ComputeConstant to the client side so that it
 can communicate with the service side. Now we add
 XlaBuilder::BuildConstantSubGraph, which is only responsible for building a
 subgraph that is compile-time constant. - Before this change, every
 XlaBuilder has a unique id. Now since it also builds constant subgraph, we
 give every XlaComputation being built a global unique id, and uniquify
 instruction names when actually building the XlaComputation.

PiperOrigin-RevId: 192236997
---
 tensorflow/compiler/xla/client/client.cc      |  28 +++
 tensorflow/compiler/xla/client/client.h       |  21 ++
 .../compiler/xla/client/xla_client/BUILD      |   1 +
 .../xla/client/xla_client/xla_builder.cc      | 183 +++++++++++++++---
 .../xla/client/xla_client/xla_builder.h       |  69 +++----
 tensorflow/compiler/xla/service/service.cc    |  44 +++++
 tensorflow/compiler/xla/service/service.h     |   3 +
 tensorflow/compiler/xla/service_interface.h   |   4 +
 tensorflow/compiler/xla/tests/BUILD           |   2 +
 .../xla/tests/compute_constant_test.cc        | 100 +++++-----
 tensorflow/compiler/xla/xla.proto             |   5 +
 11 files changed, 345 insertions(+), 115 deletions(-)

diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 3f45167fcb..f0f94298a0 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -193,6 +193,34 @@ StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
   return Transfer(*data, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>> Client::ComputeConstant(
+    const XlaComputation& computation, const Layout* output_layout) const {
+  ComputeConstantGraphRequest request;
+  *request.mutable_computation() = computation.proto();
+  if (output_layout != nullptr) {
+    *request.mutable_output_layout() = *output_layout;
+  }
+
+  ComputeConstantResponse response;
+
+  VLOG(2) << "making compute-constant-graph request";
+  Status s = stub_->ComputeConstantGraph(&request, &response);
+  VLOG(2) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  VLOG(3) << "ComputeConstant: {" << response.DebugString() << "}";
+
+  if (!response.has_literal()) {
+    return InternalError(
+        "no computed literal in the provided response in ComputeConstantGraph "
+        "request");
+  }
+  return Literal::CreateFromProto(response.literal());
+}
+
 StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   LoadComputationSnapshotRequest request;
   *request.mutable_module() = module;
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index 05d707dab1..14c685d94e 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -194,6 +194,27 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Computes the value of the given computation using a non-optimized
+  // interpreter on the host.
+  //
+  // The computation must not depend on any parameters, or on stateful operators
+  // such as `RngNormal` or `Infeed`.
+  //
+  // This functionality can be useful when translating a computation into XLA
+  // where something that looked dynamic is required by XLA to be specified as a
+  // constant. E.g. the source computation (outside of XLA) may include a
+  // dynamic computation of the shape of something and ComputeConstant lets you
+  // determine what the value of that computation is in the case where the value
+  // can be determined at compile time.
+  //
+  // If output_layout is non-null, then the output of the computation will be
+  // stored using that layout.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
+      const XlaComputation& computation,
+      const Layout* output_layout = nullptr) const;
+
   // Unregister the memory for the given GlobalData on the device.
   Status Unregister(const GlobalData& data);
 
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index b1dba16856..31fa1241ee 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -44,6 +44,7 @@ cc_library(
     hdrs = ["xla_builder.h"],
     deps = [
         ":xla_computation",
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 170dd59c79..a01be28881 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -17,12 +17,15 @@ limitations under the License.
 
 #include <functional>
 #include <numeric>
+#include <queue>
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -82,7 +85,7 @@ StatusOr<Shape> XlaOp::GetShape() const {
 }
 
 XlaBuilder::XlaBuilder(const string& computation_name)
-    : name_(computation_name), unique_id_(GetUniqueId()) {}
+    : name_(computation_name) {}
 
 XlaBuilder::~XlaBuilder() {}
 
@@ -111,10 +114,11 @@ XlaOp XlaBuilder::NoteErrorOrReturn(
   return op.ConsumeValueOrDie();
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) const {
   TF_RETURN_IF_ERROR(first_error_);
 
   TF_RET_CHECK(root_id != nullptr);
+
   ProgramShape program_shape;
 
   // Not all instructions can be roots. Walk backwards from the last added
@@ -155,9 +159,56 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
   return program_shape;
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape() {
-  int64 root_id;
-  return GetProgramShape(&root_id);
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
+  int64 root;
+  return GetProgramShape(&root);
+}
+
+void XlaBuilder::IsConstantVisitor(const int64 op_handle,
+                                   std::set<int64>* visited,
+                                   bool* is_constant) const {
+  if (visited->count(op_handle) != 0 || !*is_constant) {
+    return;
+  }
+
+  CHECK(op_handle < instructions_.size() && op_handle >= 0);
+
+  const HloInstructionProto& instr = instructions_[op_handle];
+  const HloOpcode opcode = StringToHloOpcode(instr.opcode()).ValueOrDie();
+  switch (opcode) {
+    default:
+      for (const int64 operand_id : instr.operand_ids()) {
+        IsConstantVisitor(operand_id, visited, is_constant);
+      }
+      // TODO(b/32495713): We aren't checking the called computations.
+      break;
+
+    // Non functional ops.
+    case HloOpcode::kRng:
+    case HloOpcode::kCrossReplicaSum:
+      // TODO(b/33009255): Implmement constant folding for cross replica sum.
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kHostCompute:
+    case HloOpcode::kCall:
+      // TODO(b/32495713): We aren't checking the to_apply computation itself,
+      // so we conservatively say that computations containing the Call op
+      // cannot be constant.  We cannot set is_functional=false in other similar
+      // cases since we're already relying on IsConstant to return true.
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kWhile:
+      // TODO(b/32495713): We aren't checking the condition and body
+      // computations themselves.
+    case HloOpcode::kSend:
+    case HloOpcode::kRecv:
+    case HloOpcode::kParameter:
+      *is_constant = false;
+      break;
+  }
+  if (!*is_constant) {
+    VLOG(1) << "Non-constant: " << instr.name();
+  }
+  visited->insert(op_handle);
 }
 
 XlaComputation XlaBuilder::BuildAndNoteError() {
@@ -180,21 +231,24 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
   }
 
   HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
 
   {
     int64 root_id;
-    ProgramShape program_shape;
-    TF_ASSIGN_OR_RETURN(program_shape, GetProgramShape(&root_id));
-    entry.mutable_program_shape()->Swap(&program_shape);
+    TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(),
+                        GetProgramShape(&root_id));
     entry.set_root_id(root_id);
   }
 
   for (auto& instruction : instructions_) {
+    // Ensures that the instruction names are unique among the whole graph.
+    const string& new_name =
+        StrCat(instruction.name(), ".", entry.id(), ".", instruction.id());
+    instruction.set_name(new_name);
     entry.add_instructions()->Swap(&instruction);
   }
 
-  entry.set_id(unique_id_);
-  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
   XlaComputation computation(entry.id());
   HloModuleProto* module = computation.mutable_proto();
   module->set_name(entry.name());
@@ -417,11 +471,10 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
                             const string& name) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    if (parameter_numbers_.find(parameter_number) != parameter_numbers_.end()) {
+    if (!parameter_numbers_.insert(parameter_number).second) {
       return InvalidArgument("parameter %lld already registered",
                              parameter_number);
     }
-    parameter_numbers_.insert(parameter_number);
     instr.set_parameter_number(parameter_number);
     instr.set_name(name);
     *instr.mutable_shape() = shape;
@@ -1262,15 +1315,98 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
   });
 }
 
-StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand,
-                                      int64 num_parameters) {
-  return Unimplemented("IsConstant is not implemented.");
+StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  // Verify that the handle is valid.
+  TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
+
+  bool is_constant = true;
+  std::set<int64> visited;
+  IsConstantVisitor(operand.handle(), &visited, &is_constant);
+  return is_constant;
 }
 
-StatusOr<std::unique_ptr<Literal>> XlaBuilder::ComputeConstant(
-    const XlaOp& operand, const Layout* output_layout,
-    tensorflow::gtl::ArraySlice<Literal> parameters) {
-  return Unimplemented("ComputeConstant is not implemented");
+StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
+    const XlaOp& root_op) const {
+  TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op));
+  if (!is_constant) {
+    auto op_status = LookUpInstruction(root_op);
+    string op_string =
+        op_status.ok() ? op_status.ValueOrDie()->name() : "<unknown operation>";
+    return InvalidArgument(
+        "Operand to BuildConstantSubGraph depends on a parameter.\n\n"
+        "  op requested for constant subgraph: %s\n\n"
+        "This is an internal error that typically happens when the XLA user "
+        "(e.g. TensorFlow) is attempting to determine a value that must be a "
+        "compile-time constant (e.g. an array dimension) but it is not capable "
+        "of being evaluated at XLA compile time.\n\n"
+        "Please file a usability bug with the framework being used (e.g. "
+        "TensorFlow).",
+        op_string.c_str());
+  }
+
+  TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
+                      LookUpInstruction(root_op));
+  TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode()));
+  if (!CanBeRoot(opcode)) {
+    return InvalidArgument("the operand with opcode %s cannot be root",
+                           root->opcode().c_str());
+  }
+
+  HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id(), "_compute_constant"));
+  entry.set_root_id(root->id());
+  ProgramShape* program_shape = entry.mutable_program_shape();
+  *program_shape->mutable_result() = root->shape();
+
+  // We use std::set to keep the instruction ids in ascending order (which is
+  // also a valid denpendency order). The related ops will be added to the
+  // subgraph in the same order.
+  std::set<int64> related_ops;
+  tensorflow::gtl::FlatSet<int64> related_calls;  // Related computations.
+  std::queue<int64> worklist;
+  worklist.push(root->id());
+  related_ops.insert(root->id());
+  while (!worklist.empty()) {
+    int64 node = worklist.front();
+    worklist.pop();
+    for (int64 id : instructions_[node].operand_ids()) {
+      if (related_ops.insert(id).second) {
+        worklist.push(id);
+      }
+    }
+    for (int64 called_id : instructions_[node].called_computation_ids()) {
+      related_calls.insert(called_id);
+    }
+  }
+
+  // Add related ops to the computation.
+  for (int64 id : related_ops) {
+    auto* instr = entry.add_instructions();
+    *instr = instructions_[id];
+    // Ensures that the instruction names are unique among the graph.
+    const string& new_name =
+        StrCat(instr->name(), ".", entry.id(), ".", instr->id());
+    instr->set_name(new_name);
+  }
+
+  XlaComputation computation(entry.id());
+  HloModuleProto* module = computation.mutable_proto();
+  module->set_name(entry.name());
+  module->set_id(entry.id());
+  module->set_entry_computation_name(entry.name());
+  module->set_entry_computation_id(entry.id());
+  *module->mutable_program_shape() = *program_shape;
+  for (auto& e : embedded_) {
+    if (related_calls.find(e.second.id()) != related_calls.end()) {
+      *module->add_computations() = e.second;
+    }
+  }
+  *module->add_computations() = std::move(entry);
+
+  return std::move(computation);
 }
 
 std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
@@ -1281,10 +1417,6 @@ std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
   return sub_builder;
 }
 
-Status XlaBuilder::SetReturnValue(const XlaOp& operand) {
-  return Unimplemented("SetReturnValue is not implemented.");
-}
-
 /* static */ ConvolutionDimensionNumbers
 XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
   ConvolutionDimensionNumbers dimension_numbers;
@@ -1364,10 +1496,7 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
   instr.set_id(handle);
   instr.set_opcode(HloOpcodeString(opcode));
   if (instr.name().empty()) {
-    instr.set_name(StrCat(instr.opcode(), ".", unique_id_, ".", handle));
-  } else {
-    // Append the handle to make sure the name is unique.
-    instr.set_name(StrCat(instr.name(), ".", unique_id_, ".", handle));
+    instr.set_name(StrCat(instr.opcode()));
   }
   for (const auto& operand : operands) {
     if (operand.builder_ == nullptr) {
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 0673b86646..d747691f16 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -687,11 +687,12 @@ class XlaBuilder {
   XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
 
   // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on parameters with index greater than or equal to
-  // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
-  // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
-  // compile-time constant without evaluating the computation.
-  StatusOr<bool> IsConstant(const XlaOp& operand, int64 num_parameters = 0);
+  // constant does not depend on any parameters, or on stateful operators such
+  // as `RngNormal` or `Infeed`.
+  //
+  // This tests whether a computation is a compile-time constant without
+  // evaluating the computation.
+  StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
   // Normalizes operand across spatial and batch dimensions for each feature.
   //
@@ -731,47 +732,14 @@ class XlaBuilder {
                       const XlaOp& grad_output, float epsilon,
                       int64 feature_index);
 
-  // Computes the value of a constant indicated by a XlaOp using a non-optimized
-  // interpreter on the host.
-  //
-  // The operand must represent a constant value, which in this case
-  // means that it must not statically depend on any parameter of the
-  // computation that is being built other then the ones specified on the
-  // parameter list. The parameters in the list will be indexed by their
-  // parameter id property so the number of parameters specified should be at
-  // least as many as the largest used parameter index.
-  //
-  // `IsConstant` can be used to test whether a computation is a compile-time
-  // constant without evaluation it. `ComputeConstant` only succeeds for
-  // computations where `IsConstant` returns true.
-  //
-  // This functionality can be useful when translating a computation
-  // into XLA where something that looked dynamic is required by
-  // XLA to be specified as a constant. E.g. the source
-  // computation (outside of XLA) may include a dynamic
-  // computation of the shape of something and ComputeConstant lets
-  // you determine what the value of that computation is in the case
-  // where the value can be determined at compile time.
-  //
-  // If output_layout is non-null, then the output of the computation
-  // will be stored using that layout.
-  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
-      const XlaOp& operand, const Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {});
-
   // Returns a new XlaBuilder whose resultant Computation is used only by this
   // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
   // behavior as the parent.
   std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
 
-  // Modifies the computation being built so that executions of it will return
-  // the value associated with operand, rather than the last expression enqueued
-  // on the XlaBuilder. Any subsequent operations added to the XlaBuilder will
-  // not have any effect unless SetReturnValue is called again.
-  Status SetReturnValue(const XlaOp& operand);
-
   // Builds the computation with the requested operations, or returns a non-ok
-  // status.
+  // status. Note that all ops that have been enqueued will be moved to the
+  // computation being returned.
   StatusOr<XlaComputation> Build();
 
   // Builds the computation with the requested operations, or notes an error in
@@ -784,6 +752,12 @@ class XlaBuilder {
   // instead.
   XlaComputation BuildAndNoteError();
 
+  // Returns a subgraph that roots on the given root. If the root is not a
+  // compile-time constant (see `IsConstant`), returns an error.
+  //
+  // This will copy the needed ops/computations to the subgraph.
+  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+
   // Returns the first error that was encountered while building the
   // computation. When an error is encountered, by default we return a vacuous
   // XlaOp and inform the user of the error that occurred while
@@ -796,7 +770,7 @@ class XlaBuilder {
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
   // Returns the (inferred) result for the current computation's shape.
-  StatusOr<ProgramShape> GetProgramShape();
+  StatusOr<ProgramShape> GetProgramShape() const;
 
  private:
   StatusOr<XlaOp> AddInstruction(
@@ -851,10 +825,17 @@ class XlaBuilder {
 
   // Returns the (inferred) result for the program shape for the current
   // computation and fills the root_id in the pointer.
-  StatusOr<ProgramShape> GetProgramShape(int64* root_id);
+  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+
+  // A visitor which checks whether an operation is a compile-time constant,
+  // meaning that it doesn't depend on any parameters, or on any stateful
+  // operation such as `RngNormal` or `Infeed`. The visitor walks the
+  // computation starting at a given operation and sets is_constant to false iff
+  // a parameter or stateful operation is encountered.
+  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+                         bool* is_constant) const;
 
-  string name_;      // Name to use for the built computation.
-  int64 unique_id_;  // The unique id for the built computation.
+  string name_;  // Name to use for the built computation.
 
   // The first error encountered while building the computation.
   // This is OK until the first error is encountered.
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index ec883a6cf3..70af1c44ea 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -1544,6 +1544,50 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
+  if (arg->has_output_layout()) {
+    result_literal = result_literal->Relayout(arg->output_layout());
+  }
+  *result->mutable_literal() = result_literal->ToProto();
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Service::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) {
+  if (!arg->has_computation()) {
+    return InvalidArgument("computations may not be empty");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("program shape may not be empty");
+  }
+  if (arg->computation().program_shape().parameters_size() != 0) {
+    return InvalidArgument(
+        "constant computation may not depend on any parameters.");
+  }
+
+  ProgramShape program_shape = arg->computation().program_shape();
+  TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
+  if (arg->has_output_layout()) {
+    TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
+        arg->output_layout(), program_shape.result()));
+  }
+
+  HloModuleConfig config(program_shape);
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  HloEvaluator evaluator;
+  TF_ASSIGN_OR_RETURN(auto result_literal,
+                      evaluator.Evaluate<std::unique_ptr<Literal>>(
+                          *module, /*arg_literals=*/{}));
+
+  // Since the result layout is non-effective to the Evaluator results, explicit
+  // relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
   if (arg->has_output_layout()) {
     result_literal = result_literal->Relayout(arg->output_layout());
   }
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 9fa72c1b8c..e399f1ac19 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -206,6 +206,9 @@ class Service : public ServiceInterface {
   // Computes the value of a constant expression.
   tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
                                      ComputeConstantResponse* result) override;
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
 
   // Returns the shape (with layout) of an array associated with a given data
   // handle.
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 32aae64973..5b44c26b7c 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -112,6 +112,10 @@ class ServiceInterface {
   virtual tensorflow::Status ComputeConstant(
       const ComputeConstantRequest* arg, ComputeConstantResponse* result) = 0;
 
+  virtual tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) = 0;
+
   // Methods used by Computation.
   virtual tensorflow::Status SnapshotComputation(
       const SnapshotComputationRequest* ag,
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 8ecb421780..6c43014b33 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1551,6 +1551,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index e5a03b49ad..c15d808f1d 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -71,28 +74,35 @@ class ComputeConstantTest : public ::testing::Test {
   }
 
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
-      Client* client, const ComputationDataHandle& operand,
-      ComputationBuilder* builder, Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(auto computed, builder->ComputeConstant(
-                                           operand, output_layout, parameters));
+      Client* client, const XlaOp& operand, XlaBuilder* builder,
+      Layout* output_layout = nullptr) {
+    TF_ASSIGN_OR_RETURN(auto subgraph, builder->BuildConstantSubGraph(operand));
+    TF_ASSIGN_OR_RETURN(auto computed,
+                        client->ComputeConstant(subgraph, output_layout));
     return std::move(computed);
   }
 
+  template <class Scalar>
+  StatusOr<Scalar> ComputeConstantScalar(Client* client, const XlaOp& operand,
+                                         XlaBuilder* builder) {
+    TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(client, operand,
+                                                             builder, nullptr));
+    return literal->Get<Scalar>({});
+  }
+
   template <class Scalar>
   StatusOr<Scalar> ComputeConstantScalar(
       Client* client, const ComputationDataHandle& operand,
       ComputationBuilder* builder,
       tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(
-        auto literal,
-        ComputeConstantLiteral(client, operand, builder, nullptr, parameters));
+    TF_ASSIGN_OR_RETURN(auto literal,
+                        builder->ComputeConstant(
+                            operand, /*output_layout=*/nullptr, parameters));
     return literal->Get<Scalar>({});
   }
 
-  bool IsConstant(const ComputationDataHandle& operand,
-                  ComputationBuilder* builder, int64 num_parameters = 0) {
-    StatusOr<bool> result = builder->IsConstant(operand, num_parameters);
+  bool IsConstant(const XlaOp& operand, XlaBuilder* builder) {
+    StatusOr<bool> result = builder->IsConstant(operand);
     EXPECT_TRUE(result.ok()) << result.status();
     return result.ok() ? result.ValueOrDie() : false;
   }
@@ -103,7 +113,7 @@ class ComputeConstantTest : public ::testing::Test {
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.ConstantR0<int32>(42);
     EXPECT_TRUE(IsConstant(computation, &b));
 
@@ -116,7 +126,7 @@ TEST_F(ComputeConstantTest, ScalarInt32Literal) {
 TEST_F(ComputeConstantTest, ScalarFloatAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
     EXPECT_TRUE(IsConstant(computation, &b));
@@ -130,7 +140,7 @@ TEST_F(ComputeConstantTest, ScalarFloatAdd) {
 TEST_F(ComputeConstantTest, ScalarRng) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
                      ShapeUtil::MakeShape(F32, {}));
@@ -151,19 +161,21 @@ TEST_F(ComputeConstantTest, Param) {
 
     std::vector<Literal> arguments;
     arguments.push_back(std::move(*Literal::CreateR0(42.5f)));
-    EXPECT_TRUE(IsConstant(computation, &b, arguments.size()));
-
-    auto value =
-        ComputeConstantScalar<float>(client, computation, &b, arguments);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 44.0f);
+    TF_ASSERT_OK_AND_ASSIGN(bool is_constant,
+                            b.IsConstant(computation, arguments.size()));
+    EXPECT_TRUE(is_constant);
+
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value,
+        ComputeConstantScalar<float>(client, computation, &b, arguments));
+    EXPECT_EQ(value, 44.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, DirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
     EXPECT_FALSE(IsConstant(computation, &b));
 
@@ -177,7 +189,7 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
 TEST_F(ComputeConstantTest, IndirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(1.0f),
               b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
@@ -195,7 +207,7 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
 TEST_F(ComputeConstantTest, UnrelatedParam) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
     auto constant_4 =
@@ -212,64 +224,64 @@ TEST_F(ComputeConstantTest, UnrelatedParam) {
 
     EXPECT_TRUE(IsConstant(constant_13, &b));
 
-    auto value = ComputeConstantScalar<float>(client, constant_13, &b);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 13.0f);
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value, ComputeConstantScalar<float>(client, constant_13, &b));
+    EXPECT_EQ(value, 13.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, NonScalarAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto computation =
         b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal =
         Literal::CreateR1<int32>({4, 6});
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 TEST_F(ComputeConstantTest, IntegerDivide) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal = Literal::CreateR0<int32>(5);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 XLA_TEST_F(ComputeConstantTest, Layout) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
     for (const std::vector<int64>& layout : layouts) {
       auto layout_proto = LayoutUtil::MakeLayout(layout);
-      auto computed = ComputeConstantLiteral(
-          client,
-          b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
-                b.ConstantR2<int32>({{10, 20}, {30, 40}})),
-          &b, &layout_proto);
-      ASSERT_TRUE(computed.ok()) << computed.status();
+      TF_ASSERT_OK_AND_ASSIGN(
+          auto computed, ComputeConstantLiteral(
+                             client,
+                             b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
+                                   b.ConstantR2<int32>({{10, 20}, {30, 40}})),
+                             &b, &layout_proto));
 
       std::unique_ptr<Literal> expected_literal =
           Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
                                              LayoutUtil::MakeLayout(layout));
-      LiteralTestUtil::AssertEqualShapesAndLayouts(
-          expected_literal->shape(), computed.ValueOrDie()->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
+                                                   computed->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
     }
   }
 }
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index f9943f71d3..b4cbdf3773 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -417,6 +417,11 @@ message ComputeConstantRequest {
   repeated LiteralProto parameters = 4;
 }
 
+message ComputeConstantGraphRequest {
+  HloModuleProto computation = 1;
+  Layout output_layout = 2;
+}
+
 message ComputeConstantResponse {
   // A LiteralProto is returned directly for this request, instead of a
   // ComputationDataHandle.
-- 
GitLab


From 6e027ee0cbb0e389b912306fd88ebefa470d6065 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 9 Apr 2018 23:13:11 -0700
Subject: [PATCH 0499/1262] [XLA] Redesign: implement and test custom call.

PiperOrigin-RevId: 192241311
---
 .../compiler/xla/client/xla_client/xla_builder.cc   | 13 ++++++++++++-
 tensorflow/compiler/xla/tests/custom_call_test.cc   |  6 +++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index a01be28881..74d48635eb 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -887,7 +887,18 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
 XlaOp XlaBuilder::CustomCall(const string& call_target_name,
                              tensorflow::gtl::ArraySlice<XlaOp> operands,
                              const Shape& shape) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (tensorflow::str_util::StartsWith(call_target_name, "$")) {
+      return InvalidArgument(
+          "Invalid custom_call_target \"%s\": Call targets that start with '$' "
+          "are reserved for internal use.",
+          call_target_name.c_str());
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_custom_call_target(call_target_name);
+    return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+  });
 }
 
 XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 2d847a66b0..b43d5c9ff5 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -134,9 +134,9 @@ class CustomCallClientAPITest : public ClientLibraryTestBase {};
 // When using the client API, CustomCall targets can't begin with '$' -- these
 // are reserved for internal use.
 XLA_TEST_F(CustomCallClientAPITest, IllegalCustomCallTarget) {
-  ComputationBuilder builder(client_, TestName());
-  auto call = builder.CustomCall("$illegal", /*operands=*/{},
-                                 ShapeUtil::MakeShape(F32, {1}));
+  XlaBuilder builder(TestName());
+  builder.CustomCall("$illegal", /*operands=*/{},
+                     ShapeUtil::MakeShape(F32, {1}));
 
   StatusOr<std::unique_ptr<GlobalData>> result =
       Execute(&builder, /*arguments=*/{});
-- 
GitLab


From 61994c21f5ddee273e0d79b08444b48858e11bfd Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 10 Apr 2018 20:00:22 +0800
Subject: [PATCH 0500/1262] Remove breaking ``` for math equations

---
 tensorflow/contrib/optimizer_v2/adam.py | 4 ----
 tensorflow/python/training/adam.py      | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 9bc160c0b9..a38c98f471 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,23 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     Initialization:
 
-    ```
     \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
     \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
     \\(t <- 0\\) (Initialize timestep)
-    ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    ```
     $$t <- t + 1$$
     $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
     $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
     $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
     $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-    ```
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 1f2c40f18e..dc0f1aba09 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,23 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    ```
     \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
     \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
     \\(t <- 0\\) (Initialize timestep)
-    ```
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    ```
     $$t <- t + 1$$
     $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
     $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
     $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
     $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-    ```
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From c6a6253b5d3cd53409e3ae2636c8cb2597353d12 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 07:47:14 -0700
Subject: [PATCH 0501/1262] Suppress -Wself-assign in self-assignment tests,
 which triggers in newer clang revisions.

PiperOrigin-RevId: 192284946
---
 tensorflow/core/lib/gtl/flatset_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc
index 09fbbb1fb6..010b4bb5df 100644
--- a/tensorflow/core/lib/gtl/flatset_test.cc
+++ b/tensorflow/core/lib/gtl/flatset_test.cc
@@ -252,7 +252,7 @@ TEST(FlatSet, Copy) {
     NumSet copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
-- 
GitLab


From 7a8fb5b32e797565d5ffa8d6f250a33e2210f423 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 08:51:18 -0700
Subject: [PATCH 0502/1262] Update document

PiperOrigin-RevId: 192292160
---
 tensorflow/docs_src/programmers_guide/graphs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index e69b717432..aa72cae766 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -96,7 +96,7 @@ to all API functions in the same context.  For example:
   (See @{$programmers_guide/variables} for more information about variables.)
 
 * Calling @{tf.train.Optimizer.minimize} will add operations and tensors to the
-  default graph that calculate gradients, and return a @{tf.Operation} that,
+  default graph that calculates gradients, and return a @{tf.Operation} that,
   when run, will apply those gradients to a set of variables.
 
 Most programs rely solely on the default graph. However,
-- 
GitLab


From d7e4458c3ca839fd8f5a86b4342905ce511a47eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 08:55:24 -0700
Subject: [PATCH 0503/1262] Added minimum op, better type support in maximum.

PiperOrigin-RevId: 192292693
---
 tensorflow/contrib/lite/builtin_ops.h         |   1 +
 tensorflow/contrib/lite/kernels/BUILD         |  18 ++-
 .../internal/reference/reference_ops.h        |  11 +-
 .../{maximum.cc => maximum_minimum.cc}        |  77 +++++++---
 .../lite/kernels/maximum_minimum_test.cc      | 143 ++++++++++++++++++
 .../contrib/lite/kernels/maximum_test.cc      |  95 ------------
 tensorflow/contrib/lite/kernels/register.cc   |   2 +
 tensorflow/contrib/lite/model.cc              |   3 +-
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |   5 +-
 .../contrib/lite/schema/schema_generated.h    | 113 +++++++-------
 tensorflow/contrib/lite/testing/BUILD         |   1 +
 .../contrib/lite/testing/generate_examples.py |  35 +++++
 .../testing/generated_examples_zip_test.cc    |   1 +
 .../contrib/lite/toco/tflite/operator.cc      |   2 +
 .../contrib/lite/toco/tflite/operator_test.cc |   2 +
 16 files changed, 328 insertions(+), 182 deletions(-)
 rename tensorflow/contrib/lite/kernels/{maximum.cc => maximum_minimum.cc} (59%)
 create mode 100644 tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
 delete mode 100644 tensorflow/contrib/lite/kernels/maximum_test.cc

diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index e11c7fb2e4..1ceefafc56 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -81,6 +81,7 @@ typedef enum {
   kTfLiteBuiltinPrelu = 54,
   kTfLiteBuiltinMaximum = 55,
   kTfLiteBuiltinArgMax = 56,
+  kTfLiteBuiltinMinimum = 57,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index b79900623e..f07eca0ba9 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -157,7 +157,7 @@ cc_library(
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
-        "maximum.cc",
+        "maximum_minimum.cc",
         "mean.cc",
         "mfcc.cc",
         "mul.cc",
@@ -555,9 +555,9 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "maximum_test",
+    name = "maximum_minimum_test",
     size = "small",
-    srcs = ["maximum_test.cc"],
+    srcs = ["maximum_minimum_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -941,4 +941,16 @@ tf_cc_test(
     ],
 )
 
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 31e190e248..410688411e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3419,10 +3419,11 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, const Dims<4>& input2_dims,
-                       T* output_data, const Dims<4>& output_dims) {
+template <typename T, typename Op>
+void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                              const T* input2_data, const Dims<4>& input2_dims,
+                              T* output_data, const Dims<4>& output_dims,
+                              Op op) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
@@ -3436,7 +3437,7 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
           auto in2_idx = SubscriptToIndex(desc2, c, x, y, b);
           auto in1_val = input1_data[in1_idx];
           auto in2_val = input2_data[in2_idx];
-          output_data[out_idx] = in1_val > in2_val ? in1_val : in2_val;
+          output_data[out_idx] = op(in1_val, in2_val);
         }
       }
     }
diff --git a/tensorflow/contrib/lite/kernels/maximum.cc b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
similarity index 59%
rename from tensorflow/contrib/lite/kernels/maximum.cc
rename to tensorflow/contrib/lite/kernels/maximum_minimum.cc
index 13c40603ce..5a28d663c9 100644
--- a/tensorflow/contrib/lite/kernels/maximum.cc
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
@@ -24,9 +24,9 @@ limitations under the License.
 namespace tflite {
 namespace ops {
 namespace builtin {
-namespace maximum {
+namespace maximum_minimum {
 
-// This file has a reference implemenation of TFMaximum.
+// This file has a reference implemenation of TFMaximum/TFMinimum.
 enum KernelType {
   kReference,
 };
@@ -35,8 +35,8 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
-struct MaximumContext {
-  MaximumContext(TfLiteContext* context, TfLiteNode* node) {
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
     input1 = GetInput(context, node, kInputTensor1);
     input2 = GetInput(context, node, kInputTensor2);
     output = GetOutput(context, node, kOutputTensor);
@@ -50,7 +50,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  MaximumContext op_context(context, node);
+  OpContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type);
   op_context.output->type = op_context.input1->type;
 
@@ -69,23 +69,49 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, op_context.output, output_size);
 }
 
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  MaximumContext op_context(context, node);
+struct MaximumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 > el2 ? el1 : el2;
+  }
+};
+
+struct MinimumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 < el2 ? el1 : el2;
+  }
+};
+
+template <typename data_type, typename op_type>
+void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
+                      const OpContext& op_context) {
+  reference_ops::TensorFlowMaximumMinimum<data_type>(
+      GetTensorData<data_type>(op_context.input1),
+      GetTensorDims(op_context.input1),
+      GetTensorData<data_type>(op_context.input2),
+      GetTensorDims(op_context.input2),
+      GetTensorData<data_type>(op_context.output),
+      GetTensorDims(op_context.output), op_type::template op<data_type>);
+}
 
-#define TF_LITE_MAXIMUM(kernel_type, data_type)    \
-  kernel_type::TensorFlowMaximum<data_type>(       \
-      GetTensorData<data_type>(op_context.input1), \
-      GetTensorDims(op_context.input1),            \
-      GetTensorData<data_type>(op_context.input2), \
-      GetTensorDims(op_context.input2),            \
-      GetTensorData<data_type>(op_context.output), \
-      GetTensorDims(op_context.output))
+template <KernelType kernel_type, typename OpType>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
 
   if (kernel_type == kReference) {
     switch (op_context.output->type) {
       case kTfLiteFloat32:
-        TF_LITE_MAXIMUM(reference_ops, float);
+        TFLiteOperation<float, OpType>(context, node, op_context);
+        break;
+      case kTfLiteUInt8:
+        TFLiteOperation<uint8_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt32:
+       TFLiteOperation<int32_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt64:
+        TFLiteOperation<int64_t, OpType>(context, node, op_context);
         break;
       default:
         context->ReportError(context,
@@ -99,19 +125,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                          op_context.output->type);
     return kTfLiteError;
   }
-#undef TF_LITE_MAXIMUM
   return kTfLiteOk;
 }
 
-}  // namespace maximum
+}  // namespace maximum_minimum
 
 TfLiteRegistration* Register_MAXIMUM_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, maximum::Prepare,
-                                 maximum::Eval<maximum::kReference>};
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MaximumOp>};
   return &r;
 }
 
+TfLiteRegistration* Register_MINIMUM_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MinimumOp>};
+  return &r;
+}
 TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_REF(); }
+TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); }
 
 }  // namespace builtin
 }  // namespace ops
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
new file mode 100644
index 0000000000..0752aa1804
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class MaxMinOpModel : public SingleOpModel {
+ public:
+  MaxMinOpModel(tflite::BuiltinOperator op, const TensorData& input1,
+                const TensorData& input2, const TensorType& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions,
+                 CreateMaximumMinimumOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor(input2_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+template <typename data_type>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<data_type> input1_values,
+               std::initializer_list<data_type> input2_values,
+               std::initializer_list<data_type> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<data_type>(input1_values);
+  m.SetInput2<data_type>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<data_type>(), ElementsAreArray(output_values));
+}
+
+template <>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<float> input1_values,
+               std::initializer_list<float> input2_values,
+               std::initializer_list<float> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<float>(input1_values);
+  m.SetInput2<float>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear(output_values)));
+}
+
+TEST(MaximumOpTest, FloatTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {1.0, 0.0, 1.0, 12.0, -2.0, -1.43});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {-1.0, 0.0, -1.0, 11.0, -3.0, -1.44});
+}
+
+TEST(MaxMinOpTest, Uint8Test) {
+  std::initializer_list<uint8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<uint8_t> data2 = {0, 0, 1, 12, 255, 1};
+  TestModel<uint8_t>(BuiltinOperator_MAXIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {1, 0, 2, 12, 255, 23});
+  TestModel<uint8_t>(BuiltinOperator_MINIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {0, 0, 1, 11, 2, 1});
+}
+
+TEST(MaximumOpTest, FloatWithBroadcastTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
+  std::initializer_list<float> data2 = {0.5, 2.0};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {1.0, 2.0, 0.5, 2.0, 0.5, 11.0});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {0.5, 0.0, -1.0, -2.0, -1.44, 2.0});
+}
+
+TEST(MaximumOpTest, Int32WithBroadcastTest) {
+  std::initializer_list<int32_t> data1 = {1, 0, -1, -2, 3, 11};
+  std::initializer_list<int32_t> data2 = {2};
+  TestModel<int32>(BuiltinOperator_MAXIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {2, 2, 2, 2, 3, 11});
+  TestModel<int32>(BuiltinOperator_MINIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {1, 0, -1, -2, 2, 2});
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/maximum_test.cc b/tensorflow/contrib/lite/kernels/maximum_test.cc
deleted file mode 100644
index df2bf29c20..0000000000
--- a/tensorflow/contrib/lite/kernels/maximum_test.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class MaximumOpModel : public SingleOpModel {
- public:
-  MaximumOpModel(const TensorData& input1, const TensorData& input2,
-                 const TensorType& output) {
-    input1_ = AddInput(input1);
-    input2_ = AddInput(input2);
-    output_ = AddOutput(output);
-    SetBuiltinOp(BuiltinOperator_MAXIMUM, BuiltinOptions_MaximumOptions,
-                 CreateMaximumOptions(builder_).Union());
-    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
-  }
-
-  template <class T>
-  void SetInput1(std::initializer_list<T> data) {
-    PopulateTensor(input1_, data);
-  }
-
-  template <class T>
-  void SetInput2(std::initializer_list<T> data) {
-    PopulateTensor(input2_, data);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput() {
-    return ExtractVector<T>(output_);
-  }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- protected:
-  int input1_;
-  int input2_;
-  int output_;
-};
-
-TEST(MaximumOpTest, FloatTest) {
-  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
-  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
-  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}},
-                   {TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32);
-  m.SetInput1<float>(data1);
-  m.SetInput2<float>(data2);
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
-  EXPECT_THAT(
-      m.GetOutput<float>(),
-      ElementsAreArray(ArrayFloatNear({1.0, 0.0, 1.0, 12.0, -2.0, -1.43})));
-}
-
-TEST(MaximumOpTest, FloatWithBroadcastTest) {
-  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
-  std::initializer_list<float> data2 = {0.5, 2.0};
-  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}}, {TensorType_FLOAT32, {2}},
-                   TensorType_FLOAT32);
-  m.SetInput1<float>(data1);
-  m.SetInput2<float>(data2);
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
-  EXPECT_THAT(
-      m.GetOutput<float>(),
-      ElementsAreArray(ArrayFloatNear({1.0, 2.0, 0.5, 2.0, 0.5, 11.0})));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 384e1afaa4..67ba8d0f39 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -77,6 +77,7 @@ TfLiteRegistration* Register_CAST();
 TfLiteRegistration* Register_DEQUANTIZE();
 TfLiteRegistration* Register_PRELU();
 TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_MINIMUM();
 TfLiteRegistration* Register_ARG_MAX();
 
 BuiltinOpResolver::BuiltinOpResolver() {
@@ -136,6 +137,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
   AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 921c139e30..13e5532909 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -650,7 +650,8 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_MAXIMUM: {
+    case BuiltinOperator_MAXIMUM:
+    case BuiltinOperator_MINIMUM: {
       break;
     }
     case BuiltinOperator_ARG_MAX: {
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 04d53d955a..08fb820767 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -351,6 +351,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_CAST:
       case tflite::BuiltinOperator_PRELU:
       case tflite::BuiltinOperator_MAXIMUM:
+      case tflite::BuiltinOperator_MINIMUM:
       case tflite::BuiltinOperator_ARG_MAX:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 238a406af5..357493755d 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -133,6 +133,7 @@ enum BuiltinOperator : byte {
   PRELU = 54,
   MAXIMUM = 55,
   ARG_MAX = 56,
+  MINIMUM = 57,
 }
 
 // Options for the builtin operators.
@@ -175,7 +176,7 @@ union BuiltinOptions {
   LogSoftmaxOptions,
   CastOptions,
   DequantizeOptions,
-  MaximumOptions,
+  MaximumMinimumOptions,
   ArgMaxOptions,
 }
 
@@ -390,7 +391,7 @@ table CastOptions {
 table DequantizeOptions {
 }
 
-table MaximumOptions {
+table MaximumMinimumOptions {
 }
 
 table ArgMaxOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 8b355b0dc6..c638daf66e 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -145,8 +145,8 @@ struct CastOptionsT;
 struct DequantizeOptions;
 struct DequantizeOptionsT;
 
-struct MaximumOptions;
-struct MaximumOptionsT;
+struct MaximumMinimumOptions;
+struct MaximumMinimumOptionsT;
 
 struct ArgMaxOptions;
 struct ArgMaxOptionsT;
@@ -263,11 +263,12 @@ enum BuiltinOperator {
   BuiltinOperator_PRELU = 54,
   BuiltinOperator_MAXIMUM = 55,
   BuiltinOperator_ARG_MAX = 56,
+  BuiltinOperator_MINIMUM = 57,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_ARG_MAX
+  BuiltinOperator_MAX = BuiltinOperator_MINIMUM
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[55] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[56] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -323,7 +324,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[55] {
     BuiltinOperator_CAST,
     BuiltinOperator_PRELU,
     BuiltinOperator_MAXIMUM,
-    BuiltinOperator_ARG_MAX
+    BuiltinOperator_ARG_MAX,
+    BuiltinOperator_MINIMUM
   };
   return values;
 }
@@ -387,6 +389,7 @@ inline const char **EnumNamesBuiltinOperator() {
     "PRELU",
     "MAXIMUM",
     "ARG_MAX",
+    "MINIMUM",
     nullptr
   };
   return names;
@@ -437,7 +440,7 @@ enum BuiltinOptions {
   BuiltinOptions_LogSoftmaxOptions = 36,
   BuiltinOptions_CastOptions = 37,
   BuiltinOptions_DequantizeOptions = 38,
-  BuiltinOptions_MaximumOptions = 39,
+  BuiltinOptions_MaximumMinimumOptions = 39,
   BuiltinOptions_ArgMaxOptions = 40,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
   BuiltinOptions_MAX = BuiltinOptions_ArgMaxOptions
@@ -484,7 +487,7 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[41] {
     BuiltinOptions_LogSoftmaxOptions,
     BuiltinOptions_CastOptions,
     BuiltinOptions_DequantizeOptions,
-    BuiltinOptions_MaximumOptions,
+    BuiltinOptions_MaximumMinimumOptions,
     BuiltinOptions_ArgMaxOptions
   };
   return values;
@@ -531,7 +534,7 @@ inline const char **EnumNamesBuiltinOptions() {
     "LogSoftmaxOptions",
     "CastOptions",
     "DequantizeOptions",
-    "MaximumOptions",
+    "MaximumMinimumOptions",
     "ArgMaxOptions",
     nullptr
   };
@@ -699,8 +702,8 @@ template<> struct BuiltinOptionsTraits<DequantizeOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions;
 };
 
-template<> struct BuiltinOptionsTraits<MaximumOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_MaximumOptions;
+template<> struct BuiltinOptionsTraits<MaximumMinimumOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MaximumMinimumOptions;
 };
 
 template<> struct BuiltinOptionsTraits<ArgMaxOptions> {
@@ -1042,13 +1045,13 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_DequantizeOptions ?
       reinterpret_cast<const DequantizeOptionsT *>(value) : nullptr;
   }
-  MaximumOptionsT *AsMaximumOptions() {
-    return type == BuiltinOptions_MaximumOptions ?
-      reinterpret_cast<MaximumOptionsT *>(value) : nullptr;
+  MaximumMinimumOptionsT *AsMaximumMinimumOptions() {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<MaximumMinimumOptionsT *>(value) : nullptr;
   }
-  const MaximumOptionsT *AsMaximumOptions() const {
-    return type == BuiltinOptions_MaximumOptions ?
-      reinterpret_cast<const MaximumOptionsT *>(value) : nullptr;
+  const MaximumMinimumOptionsT *AsMaximumMinimumOptions() const {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<const MaximumMinimumOptionsT *>(value) : nullptr;
   }
   ArgMaxOptionsT *AsArgMaxOptions() {
     return type == BuiltinOptions_ArgMaxOptions ?
@@ -3827,45 +3830,45 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(
 
 flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
-struct MaximumOptionsT : public flatbuffers::NativeTable {
-  typedef MaximumOptions TableType;
-  MaximumOptionsT() {
+struct MaximumMinimumOptionsT : public flatbuffers::NativeTable {
+  typedef MaximumMinimumOptions TableType;
+  MaximumMinimumOptionsT() {
   }
 };
 
-struct MaximumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef MaximumOptionsT NativeTableType;
+struct MaximumMinimumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MaximumMinimumOptionsT NativeTableType;
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            verifier.EndTable();
   }
-  MaximumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<MaximumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  MaximumMinimumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MaximumMinimumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
-struct MaximumOptionsBuilder {
+struct MaximumMinimumOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  explicit MaximumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+  explicit MaximumMinimumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  MaximumOptionsBuilder &operator=(const MaximumOptionsBuilder &);
-  flatbuffers::Offset<MaximumOptions> Finish() {
+  MaximumMinimumOptionsBuilder &operator=(const MaximumMinimumOptionsBuilder &);
+  flatbuffers::Offset<MaximumMinimumOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<MaximumOptions>(end);
+    auto o = flatbuffers::Offset<MaximumMinimumOptions>(end);
     return o;
   }
 };
 
-inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(
     flatbuffers::FlatBufferBuilder &_fbb) {
-  MaximumOptionsBuilder builder_(_fbb);
+  MaximumMinimumOptionsBuilder builder_(_fbb);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ArgMaxOptionsT : public flatbuffers::NativeTable {
   typedef ArgMaxOptions TableType;
@@ -4152,8 +4155,8 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const DequantizeOptions *builtin_options_as_DequantizeOptions() const {
     return builtin_options_type() == BuiltinOptions_DequantizeOptions ? static_cast<const DequantizeOptions *>(builtin_options()) : nullptr;
   }
-  const MaximumOptions *builtin_options_as_MaximumOptions() const {
-    return builtin_options_type() == BuiltinOptions_MaximumOptions ? static_cast<const MaximumOptions *>(builtin_options()) : nullptr;
+  const MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const {
+    return builtin_options_type() == BuiltinOptions_MaximumMinimumOptions ? static_cast<const MaximumMinimumOptions *>(builtin_options()) : nullptr;
   }
   const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const {
     return builtin_options_type() == BuiltinOptions_ArgMaxOptions ? static_cast<const ArgMaxOptions *>(builtin_options()) : nullptr;
@@ -4336,8 +4339,8 @@ template<> inline const DequantizeOptions *Operator::builtin_options_as<Dequanti
   return builtin_options_as_DequantizeOptions();
 }
 
-template<> inline const MaximumOptions *Operator::builtin_options_as<MaximumOptions>() const {
-  return builtin_options_as_MaximumOptions();
+template<> inline const MaximumMinimumOptions *Operator::builtin_options_as<MaximumMinimumOptions>() const {
+  return builtin_options_as_MaximumMinimumOptions();
 }
 
 template<> inline const ArgMaxOptions *Operator::builtin_options_as<ArgMaxOptions>() const {
@@ -5878,26 +5881,26 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffer
       _fbb);
 }
 
-inline MaximumOptionsT *MaximumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new MaximumOptionsT();
+inline MaximumMinimumOptionsT *MaximumMinimumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MaximumMinimumOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void MaximumOptions::UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void MaximumMinimumOptions::UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
 }
 
-inline flatbuffers::Offset<MaximumOptions> MaximumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateMaximumOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<MaximumMinimumOptions> MaximumMinimumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMaximumMinimumOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  return tflite::CreateMaximumOptions(
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumMinimumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateMaximumMinimumOptions(
       _fbb);
 }
 
@@ -6259,8 +6262,8 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
     case BuiltinOptions_ArgMaxOptions: {
@@ -6437,8 +6440,8 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return ptr->UnPack(resolver);
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
       return ptr->UnPack(resolver);
     }
     case BuiltinOptions_ArgMaxOptions: {
@@ -6603,9 +6606,9 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const DequantizeOptionsT *>(value);
       return CreateDequantizeOptions(_fbb, ptr, _rehasher).Union();
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptionsT *>(value);
-      return CreateMaximumOptions(_fbb, ptr, _rehasher).Union();
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptionsT *>(value);
+      return CreateMaximumMinimumOptions(_fbb, ptr, _rehasher).Union();
     }
     case BuiltinOptions_ArgMaxOptions: {
       auto ptr = reinterpret_cast<const ArgMaxOptionsT *>(value);
@@ -6769,8 +6772,8 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new DequantizeOptionsT(*reinterpret_cast<DequantizeOptionsT *>(u.value));
       break;
     }
-    case BuiltinOptions_MaximumOptions: {
-      value = new MaximumOptionsT(*reinterpret_cast<MaximumOptionsT *>(u.value));
+    case BuiltinOptions_MaximumMinimumOptions: {
+      value = new MaximumMinimumOptionsT(*reinterpret_cast<MaximumMinimumOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_ArgMaxOptions: {
@@ -6974,8 +6977,8 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<MaximumOptionsT *>(value);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<MaximumMinimumOptionsT *>(value);
       delete ptr;
       break;
     }
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 386cfdb524..9f0ba43252 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -39,6 +39,7 @@ gen_zipped_test_files(
         "max_pool.zip",
         "maximum.zip",
         "mean.zip",
+        "minimum.zip",
         "mul.zip",
         "pad.zip",
         "prelu.zip",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 42aa92c1bb..672158aa2f 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -910,6 +910,41 @@ def make_maximum_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_minimum_tests(zip_path):
+  """Make a set of tests to do minimum."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the minimum op testing graph."""
+    input_tensor_1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_1",
+        shape=parameters["input_shape_1"])
+    input_tensor_2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_2",
+        shape=parameters["input_shape_2"])
+
+    out = tf.minimum(input_tensor_1, input_tensor_2)
+    return [input_tensor_1, input_tensor_2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_1"]),
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_2"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_binary_op_tests_func(binary_operator):
   """Return a function that does a test on a binary operator."""
   return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 291c974545..7426ab56af 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -262,6 +262,7 @@ INSTANTIATE_TESTS(log_softmax)
 INSTANTIATE_TESTS(maximum)
 INSTANTIATE_TESTS(max_pool)
 INSTANTIATE_TESTS(mean)
+INSTANTIATE_TESTS(minimum)
 INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 4df16827b4..e015108120 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -890,6 +890,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "LOG_SOFTMAX", OperatorType::kLogSoftmax));
   ops.emplace_back(new SimpleOperator<TensorFlowMaximumOperator>(
       "MAXIMUM", OperatorType::kTensorFlowMaximum));
+  ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 5546bda696..24ba71e459 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -111,6 +111,8 @@ TEST_F(OperatorTest, SimpleOperators) {
                                           OperatorType::kLogSoftmax);
   CheckSimpleOperator<TensorFlowMaximumOperator>(
       "MAXIMUM", OperatorType::kTensorFlowMaximum);
+  CheckSimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
-- 
GitLab


From 40f85affa388288a66d5fb9b2295155216e68b94 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Tue, 10 Apr 2018 09:26:59 -0700
Subject: [PATCH 0504/1262] [XLA:GPU] Add infrastructure for unrolling kernels
 to improve bandwidth utilization.

We often have simple kernels that do very little actual work, duplicating that
can increase the used bandwidth.

This change introduces flags and infrastructure for unrolling kernels, it
doesn't include any cost heuristics and is disabled by default. Based on code
written by Bixia Zheng.

PiperOrigin-RevId: 192296781
---
 .../xla/legacy_flags/debug_options_flags.cc   |  7 ++-
 .../xla/service/cpu/parallel_loop_emitter.cc  |  5 +-
 .../xla/service/cpu/parallel_loop_emitter.h   |  2 +-
 .../xla/service/gpu/ir_emitter_unnested.cc    | 37 ++++++++++++--
 .../xla/service/gpu/ir_emitter_unnested.h     |  6 ++-
 .../compiler/xla/service/gpu/kernel_thunk.cc  |  6 ++-
 .../compiler/xla/service/gpu/kernel_thunk.h   |  8 +++-
 .../xla/service/gpu/parallel_loop_emitter.cc  | 48 ++++++++++++++-----
 .../xla/service/gpu/parallel_loop_emitter.h   | 10 ++--
 .../xla/service/gpu/partition_assignment.cc   |  6 ++-
 .../xla/service/gpu/partition_assignment.h    |  3 +-
 .../xla/service/llvm_ir/loop_emitter.cc       | 12 +++--
 .../xla/service/llvm_ir/loop_emitter.h        |  7 +--
 tensorflow/compiler/xla/xla.proto             |  3 ++
 14 files changed, 121 insertions(+), 39 deletions(-)

diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index f037663e3f..70ae95bf47 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -43,7 +43,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
 #ifdef INTEL_MKL
   flags->set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
-
+  flags->set_xla_gpu_max_kernel_unroll_factor(1);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
   flags->set_xla_gpu_use_cudnn_batchnorm(false);
@@ -223,6 +223,11 @@ void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
           flag_values->xla_gpu_disable_multi_streaming(),
           "If true, multi-streaming in the GPU backend is disabled."),
+      tensorflow::Flag(
+          "xla_gpu_max_kernel_unroll_factor",
+          int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor),
+          flag_values->xla_gpu_max_kernel_unroll_factor(),
+          "Specify the maximum kernel unroll factor for the GPU backend."),
       tensorflow::Flag(
           "xla_dump_optimized_hlo_proto_to",
           flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 1e439cde11..54af40506d 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -29,7 +29,8 @@ ParallelLoopEmitter::ParallelLoopEmitter(
     : LoopEmitter(target_element_generator, target_array, ir_builder),
       dynamic_loop_bounds_(dynamic_loop_bounds) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   CHECK(!ShapeUtil::IsTuple(shape_));
   CHECK(!ShapeUtil::IsScalar(shape_));
@@ -69,7 +70,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK(exit_bb_ != nullptr);
 
-  return array_index;
+  return {array_index};
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index ce92e36a94..755715634a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -60,7 +60,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index d29cc21ab1..26e497762f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -536,7 +536,27 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
     return Status::OK();
   }
-  thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
+
+  int max_unroll_factor = fusion->GetModule()
+                              ->config()
+                              .debug_options()
+                              .xla_gpu_max_kernel_unroll_factor();
+
+  // Find the largest possible power of two to unroll by.
+  // TODO(kramerb): Make this smarter.
+  int unroll_factor = 1;
+  if (!fusion->IsMultiOutputFusion()) {
+    CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop);
+    int64 num_elements = ShapeUtil::ElementsIn(fusion->shape());
+    for (int i = max_unroll_factor; i > 1; i /= 2) {
+      if (num_elements % i == 0) {
+        unroll_factor = i;
+        break;
+      }
+    }
+  }
+
+  thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor));
   return IrEmitter::HandleFusion(fusion);
 }
 
@@ -2021,7 +2041,7 @@ Status IrEmitterUnnested::HandleGather(HloInstruction* gather) {
 }
 
 std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
-    const HloInstruction* inst) {
+    const HloInstruction* inst, int unroll_factor) {
   const BufferAssignment& buffer_assn =
       ir_emitter_context_->buffer_assignment();
 
@@ -2113,7 +2133,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
   }
 
   return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
-                                 inst);
+                                 inst, unroll_factor);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -2485,21 +2505,28 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  int unroll_factor = thunk->unroll_factor();
   VLOG(3) << bindings_.ToString();
 
   const Shape& element_shape = hlo.IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo.shape(), {0})
                                    : hlo.shape();
+  VLOG(3) << "EmitTargetElementLoopInThunk "
+          << ShapeUtil::HumanStringWithLayout(hlo.shape())
+          << " for unroll_factor " << unroll_factor;
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      element_shape, ir_emitter_context_->device_description());
+      element_shape, ir_emitter_context_->device_description(), unroll_factor);
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
   if (!hlo.IsMultiOutputFusion()) {
     return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
-                               launch_dimensions, &ir_builder_)
+                               launch_dimensions, &ir_builder_, unroll_factor)
         .EmitLoop(IrName(&hlo));
   }
 
+  CHECK_EQ(unroll_factor, 1)
+      << "multi-output fusion does not support unrolling";
+
   // For multiple outputs fusion, we need to emit each operand and the root.
   std::vector<llvm_ir::IrArray> output_arrays;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 66c62e2d2d..b842f480c6 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -150,8 +150,10 @@ class IrEmitterUnnested : public IrEmitter {
 
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
-  // Thunk object.
-  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst);
+  // Thunk object. The kernel implementation will be unrolled if unroll_factor
+  // is greater than one.
+  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst,
+                                                int unroll_factor = 1);
 
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index c20a781a33..c24dc1457f 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -30,10 +30,12 @@ namespace gpu {
 
 KernelThunk::KernelThunk(
     tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-    const string& kernel_name, const HloInstruction* hlo_instruction)
+    const string& kernel_name, const HloInstruction* hlo_instruction,
+    int unroll_factor)
     : Thunk(Kind::kKernel, hlo_instruction),
       args_(args.begin(), args.end()),
-      kernel_name_(kernel_name) {}
+      kernel_name_(kernel_name),
+      unroll_factor_(unroll_factor) {}
 
 tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
   tensorflow::mutex_lock lock(mutex_);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index 9ae455e2fc..df8971b083 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -47,12 +47,14 @@ class KernelThunk : public Thunk {
   //
   // `hlo_instruction` is as in Thunk. Other arguments are as the class members.
   KernelThunk(tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-              const string& kernel_name, const HloInstruction* hlo_instruction);
+              const string& kernel_name, const HloInstruction* hlo_instruction,
+              int unroll_factor);
   KernelThunk(const KernelThunk&) = delete;
   KernelThunk& operator=(const KernelThunk&) = delete;
   ~KernelThunk() override = default;
 
   const string& kernel_name() const { return kernel_name_; }
+  int unroll_factor() const { return unroll_factor_; }
   void SetLaunchDimensions(const LaunchDimensions& launch_dims);
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
@@ -69,6 +71,10 @@ class KernelThunk : public Thunk {
   // Entry kernel name for the computation.
   const string kernel_name_;
 
+  // The number of times this kernel should be unrolled. This works as a
+  // multiplier on the number of elements produced by a GPU thread.
+  const int unroll_factor_;
+
   // The thread and block dimension used to launch the kernel.
   // Will be set by IrEmitterUnnested.
   LaunchDimensions launch_dimensions_;
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 388dcc008b..d8c07dc311 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -32,25 +32,32 @@ namespace gpu {
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     BodyEmitter body_emitter, const Shape& shape,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(body_emitter, shape, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_arrays, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_array, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   // Emit the following code in LLVM IR:
   //   linear_index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -63,6 +70,9 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   //   "It is guaranteed that [...] 0  <=  %ctaid.x <  %nctaid.x"
   //
   // %nctaid.x is currently specified as 2147483647.
+  VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_;
+  std::vector<llvm_ir::IrArray::Index> array_indices;
+
   llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_);
   llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(),
@@ -81,7 +91,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   thread_id = ir_builder_->CreateZExt(thread_id, ir_builder_->getInt64Ty(),
                                       "thread_id");
 
-  llvm::Value* linear_index = ir_builder_->CreateAdd(
+  llvm::Value* linear_index_base = ir_builder_->CreateAdd(
       ir_builder_->CreateMul(
           block_id,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block()), "",
@@ -99,15 +109,30 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::assume,
       {ir_builder_->CreateICmpULT(
-          linear_index,
+          linear_index_base,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block() *
                                 launch_dimensions_.block_count()),
           "linear_index_in_range")},
       {}, ir_builder_);
 
+  if (unroll_factor_ > 1) {
+    linear_index_base = ir_builder_->CreateMul(
+        linear_index_base, ir_builder_->getInt64(unroll_factor_),
+        "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true);
+  }
+
+  array_indices.emplace_back(linear_index_base, shape_, ir_builder_);
+  for (int i = 1; i < unroll_factor_; ++i) {
+    llvm::Value* linear_index = ir_builder_->CreateAdd(
+        linear_index_base, ir_builder_->getInt64(i), "linear_index",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    array_indices.emplace_back(linear_index, shape_, ir_builder_);
+  }
+
   auto if_in_bounds = llvm_ir::EmitIfThenElse(
       ir_builder_->CreateICmpULT(
-          linear_index, ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
+          linear_index_base,
+          ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
       llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false);
 
   // Set exit_bb_ to the exit block of the if structure.
@@ -116,7 +141,8 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
 
   // Set IR builder insertion point to the body of the if structure.
   llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
-  return llvm_ir::IrArray::Index(linear_index, shape_, ir_builder_);
+
+  return array_indices;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 8ed63a854a..25318b3bed 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -34,13 +34,13 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   // The meanings of other parameters are the same as LoopEmitter.
   ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
   // Constructs a ParallelLoopEmitter from an element generator that generates
   // each element of the given target array.
   ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                       const llvm_ir::IrArray& target_array,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
 
   // Constructs a loop emitter for a loop that generates on element of each of N
   // arrays on each iteration.
@@ -50,18 +50,20 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder);
+      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+      int unroll_factor = 1);
 
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
   // The thread and block dimension to parallelize the loop on.
   const LaunchDimensions launch_dimensions_;
+  const int unroll_factor_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 6cf280df05..5283d51cd1 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -44,12 +44,16 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc) {
+    const Shape& shape, const se::DeviceDescription& device_desc,
+    int unroll_factor) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
+  CHECK_EQ(num_elements % unroll_factor, 0);
+  num_elements = num_elements / unroll_factor;
+
   // Since we don't do any inter-warp communication, we're free to choose any
   // block size we want, subject to hardware constraints.  We choose the
   // smallest block size that allows the GPU to reach full occupancy (assuming
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 0bf463a6ef..42d2d2af2e 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -58,7 +58,8 @@ std::ostream& operator<<(std::ostream& out,
 
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc);
+    const perftools::gputools::DeviceDescription& device_desc,
+    int unroll_factor = 1);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index b6b918ec78..3978acc132 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -88,12 +88,12 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
   }
 }
 
-IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   if (ShapeUtil::IsScalar(shape_)) {
     // No loop needed, so set exit_bb_ to nullptr.
     exit_bb_ = nullptr;
-    return IrArray::Index();
+    return {IrArray::Index()};
   }
 
   // Create loop nest with one for-loop for each dimension of the target shape.
@@ -121,12 +121,14 @@ IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK_NOTNULL(exit_bb_);
 
-  return array_index;
+  return {array_index};
 }
 
 tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
-  IrArray::Index array_index = EmitIndexAndSetExitBasicBlock(loop_name);
-  TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  for (const IrArray::Index& array_index :
+       EmitIndexAndSetExitBasicBlock(loop_name)) {
+    TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  }
 
   // Set the insertion point of ir_builder_ to the loop exit, so that
   // code emitted for later instructions will be correctly placed.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 0fc528439a..9ff497aecd 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -63,11 +63,12 @@ class LoopEmitter {
 
   // Emits a loop nest (with a yet-to-be-filled loop body) that iterates through
   // every element in the given shape. Returns the multi-dimensional index that
-  // specifies the element.
-  IrArray::Index EmitIndexAndSetExitBasicBlock() {
+  // specifies the element, will return multiple indices if the loop is
+  // unrolled.
+  std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock() {
     return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"");
   }
-  virtual IrArray::Index EmitIndexAndSetExitBasicBlock(
+  virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name);
 
   // Emits a complete loop nest for every element in the given shape.
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index b4cbdf3773..f619b8dc24 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -192,6 +192,9 @@ message DebugOptions {
   // Generate calls to MKL-DNN in the CPU backend.
   bool xla_cpu_use_mkl_dnn = 97;
 
+  // Maximum kernel unroll factor for the GPU backend.
+  int32 xla_gpu_max_kernel_unroll_factor = 98;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
-- 
GitLab


From 934e383628a94f9994e7da2dea706a92e8eeffb3 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Tue, 10 Apr 2018 09:40:25 -0700
Subject: [PATCH 0505/1262] Fix capitalization (on master)

PiperOrigin-RevId: 192298563
---
 tensorflow/docs_src/programmers_guide/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 017db0e8cb..648d001bd3 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,7 +5,7 @@ works. The units are as follows:
 
 ## High Level APIs
 
-  * @{$programmers_guide/eager}, which is the easiest way to use tensorflow.
+  * @{$programmers_guide/eager}, which is the easiest way to use TensorFlow.
   * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
   * @{$programmers_guide/datasets}, which explains how to
-- 
GitLab


From a0410fb4c5be8fd455640507146cbcd0b0b7a2f1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 09:41:45 -0700
Subject: [PATCH 0506/1262] Remove manifest_merger from tensorflow_demo. This
 is an internal-only attribute that is being removed from bazel.

PiperOrigin-RevId: 192298746
---
 tensorflow/examples/android/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index a088d7cf2f..aa594a63c6 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -76,7 +76,6 @@ android_binary(
     custom_package = "org.tensorflow.demo",
     inline_constants = 1,
     manifest = "AndroidManifest.xml",
-    manifest_merger = "legacy",
     resource_files = glob(["res/**"]),
     tags = [
         "manual",
-- 
GitLab


From 62b6b3009b6806d27536da58b11373cf97cdab7d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 09:50:02 -0700
Subject: [PATCH 0507/1262] Fix markdown in a couple of tf.estimator
 docstrings.

PiperOrigin-RevId: 192299871
---
 tensorflow/python/estimator/run_config.py | 2 +-
 tensorflow/python/estimator/training.py   | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index f62c9cece6..dab442aeda 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -316,7 +316,7 @@ class RunConfig(object):
     a list of task addresses.
 
     `task` has two attributes: `type` and `index`, where `type` can be any of
-    the task types in `cluster`. ` When `TF_CONFIG` contains said information,
+    the task types in `cluster`. When `TF_CONFIG` contains said information,
     the following properties are set on this class:
 
     * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index e38b765da5..9d271758f6 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -137,7 +137,7 @@ class TrainSpec(
           * A tuple (features, labels): Where features is a `Tensor` or a
             dictionary of string feature name to `Tensor` and labels is a
             `Tensor` or a dictionary of string label name to `Tensor`.
-            
+
       max_steps: Int. Positive number of total steps for which to train model.
         If `None`, train forever. The training `input_fn` is not expected to
         generate `OutOfRangeError` or `StopIteration` exceptions. See the
@@ -334,7 +334,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   can read and write). The only extra work to do is setting the environment
   variable `TF_CONFIG` properly for each worker correspondingly.
 
-  Also see: https://www.tensorflow.org/deploy/distributed
+  Also see
+  [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
 
   Setting environment variable depends on the platform. For example, on Linux,
   it can be done as follows (`$` is the shell prompt):
-- 
GitLab


From 6b0ec4bab215169c8cde893b022288f8bf7c8835 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Tue, 10 Apr 2018 18:56:52 +0200
Subject: [PATCH 0508/1262] raw_input() was removed in Python 3 (#16440)

---
 .../python/keras/_impl/keras/utils/io_utils.py     | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index bbf1d2a3d9..f82e3277de 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import defaultdict
-import sys
 
 import numpy as np
+import six
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -160,13 +160,11 @@ def ask_to_proceed_with_overwrite(filepath):
   Returns:
       True if we can proceed with overwrite, False otherwise.
   """
-  get_input = input
-  if sys.version_info[:2] <= (2, 7):
-    get_input = raw_input
-  overwrite = get_input('[WARNING] %s already exists - overwrite? '
-                        '[y/n]' % (filepath))
-  while overwrite not in ['y', 'n']:
-    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
+  overwrite = six.moves.input('[WARNING] %s already exists - overwrite? '
+                              '[y/n]' % (filepath)).strip().lower()
+  while overwrite not in ('y', 'n'):
+    overwrite = six.moves.input('Enter "y" (overwrite) or "n" '
+                                '(cancel).').strip().lower()
   if overwrite == 'n':
     return False
   print('[TIP] Next time specify overwrite=True!')
-- 
GitLab


From bd718c410478d066ed1c41d5ffe31970075b808a Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Tue, 10 Apr 2018 10:07:31 -0700
Subject: [PATCH 0509/1262] Place data format op on CPU:0.

PiperOrigin-RevId: 192302833
---
 tensorflow/core/grappler/optimizers/layout_optimizer.cc      | 1 +
 tensorflow/core/grappler/optimizers/layout_optimizer_test.cc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 561226f945..8fb30d116d 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -919,6 +919,7 @@ class NodeProcessor : public GraphProcessor {
         ParseNodeName(input_name, &port);
         if (IsHostMemory(*input, port)) {
           parsed_name.type = "CPU";
+          parsed_name.id = 0;
           device = DeviceNameUtils::ParsedNameToString(parsed_name);
         }
       }
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 260347b0e8..b913f2b004 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -36,7 +36,7 @@ class LayoutOptimizerTest : public ::testing::Test {
     DeviceProperties device_properties;
     device_properties.set_type("GPU");
     device_properties.mutable_environment()->insert({"architecture", "6"});
-    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+    virtual_cluster_.reset(new VirtualCluster({{"/GPU:1", device_properties}}));
   }
 
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
-- 
GitLab


From 049dfd5e070cfa84c82eea71c6c746a70cba4a3f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 10 Apr 2018 10:34:32 -0700
Subject: [PATCH 0510/1262] Add python built-in types support for `tf.as_dtype`
 (#17652)

* Add python built-in types support for `tf.as_dtype`

This fix tries to address the issue raised in 17641 where
it was not possible to use `tf.as_dtype(float)` the same
way as numpy `np.dtype(float)`.
This fix adds the built-in types support for `tf.as_dtype`,
so that it is possible to specify:
```
dtypes.as_dtype(float)   # dtypes.float64
dtypes.as_dtype(int)     # dtypes.int32
dtypes.as_dtype(long)    # dtypes.int64
dtypes.as_dtype(complex) # dtypes.complex128
dtypes.as_dtype(bool)    # dtypes.bool
```

This fix fixes 17641.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test cases for built-in types support with `tf.as_dtype`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix failed test cases with added built-in types support of tf.as_dtype

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix python 3 build

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Restrict the changes to float and bool based on review feedback

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/framework/dtypes.py      | 9 +++++++++
 tensorflow/python/framework/dtypes_test.py | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index a31c424263..6d918f8b89 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -648,6 +648,10 @@ QUANTIZED_DTYPES = frozenset([
 ])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
+_PYTHON_TO_TF = {
+    float: float32,
+    bool: bool,
+}
 
 @tf_export("as_dtype")
 def as_dtype(type_value):
@@ -679,6 +683,11 @@ def as_dtype(type_value):
   except KeyError:
     pass
 
+  try:
+    return _PYTHON_TO_TF[type_value]
+  except KeyError:
+    pass
+
   if isinstance(type_value, np.dtype):
     # The numpy dtype for strings is variable length. We can not compare
     # dtype with a single constant (np.string does not exist) to decide
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index e49e2fda5d..478733e389 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,9 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonTypesConversion(self):
+    self.assertIs(dtypes.float32, dtypes.as_dtype(float))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
 
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From 36a07c59954b8ace54879b8732b6a7ae2dce6450 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 10 Apr 2018 10:43:14 -0700
Subject: [PATCH 0511/1262] Simplify test_util.run_in_graph_and_eager_modes

- Get rid of unnecessary options
- Update various resource variable tests so that they correctly exercise the cases where the variables are placed on GPU (these "with tf.device('/cpu:0')" blocks that were added for eager execution are no longer necessary)

PiperOrigin-RevId: 192309109
---
 .../eager/python/checkpointable_utils_test.py |  10 +-
 .../contrib/optimizer_v2/momentum_test.py     |  24 +-
 tensorflow/python/framework/test_util.py      |  90 +++--
 .../_impl/keras/layers/embeddings_test.py     |   2 +-
 .../keras/_impl/keras/layers/pooling_test.py  |  18 +-
 .../resource_variable_ops_test.py             | 340 ++++++++----------
 tensorflow/python/training/momentum_test.py   |  24 +-
 7 files changed, 256 insertions(+), 252 deletions(-)

diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index e6498ddb06..3ec5c3de39 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -719,8 +719,9 @@ class CheckpointingTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
 
     root = checkpointable.Checkpointable()
-    root.var = checkpointable_utils.add_variable(
-        root, name="var", initializer=0.)
+    with ops.device("/cpu:0"):
+      root.var = checkpointable_utils.add_variable(
+          root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
     if context.executing_eagerly():
       optimizer.minimize(root.var.read_value)
@@ -750,8 +751,9 @@ class CheckpointingTests(test.TestCase):
         new_root).restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
-        new_root, name="var", shape=[])
+    with ops.device("/cpu:0"):
+      new_root.var = checkpointable_utils.add_variable(
+          new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(new_root.var))
diff --git a/tensorflow/contrib/optimizer_v2/momentum_test.py b/tensorflow/contrib/optimizer_v2/momentum_test.py
index f37eb48181..26724f66c2 100644
--- a/tensorflow/contrib/optimizer_v2/momentum_test.py
+++ b/tensorflow/contrib/optimizer_v2/momentum_test.py
@@ -237,7 +237,17 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
 
       # pylint: disable=cell-var-from-loop
       def loss():
@@ -256,7 +266,17 @@ class MomentumOptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
-    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
 
     def loss():
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index bf00fa6439..eea27d76c6 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -615,45 +615,68 @@ def assert_no_garbage_created(f):
 
 
 def run_in_graph_and_eager_modes(__unused__=None,
-                                 graph=None,
                                  config=None,
-                                 use_gpu=False,
-                                 force_gpu=False,
+                                 use_gpu=True,
                                  reset_test=True,
                                  assert_no_eager_garbage=False):
-  """Runs the test in both graph and eager modes.
+  """Execute the decorated test with and without enabling eager execution.
+
+  This function returns a decorator intended to be applied to test methods in
+  a @{tf.test.TestCase} class. Doing so will cause the contents of the test
+  method to be executed twice - once normally, and once with eager execution
+  enabled. This allows unittests to confirm the equivalence between eager
+  and graph execution (see @{tf.enable_eager_execution}).
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(tf.test.TestCase):
+
+    @run_in_graph_and_eager_modes()
+    def test_foo(self):
+      x = tf.constant([1, 2])
+      y = tf.constant([3, 4])
+      z = tf.add(x, y)
+      self.assertAllEqual([4, 6], self.evaluate(z))
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test validates that `tf.add()` has the same behavior when computed with
+  eager execution enabled as it does when constructing a TensorFlow graph and
+  executing the `z` tensor in a session.
+
 
   Args:
     __unused__: Prevents sliently skipping tests.
-    graph: Optional graph to use during the returned session.
     config: An optional config_pb2.ConfigProto to use to configure the
-      session.
-    use_gpu: If True, attempt to run as many ops as possible on GPU.
-    force_gpu: If True, pin all ops to `/device:GPU:0`.
-    reset_test: If True, tearDown and SetUp the test case again.
+      session when executing graphs.
+    use_gpu: If True, attempt to run as many operations as possible on GPU.
+    reset_test: If True, tearDown and SetUp the test case between the two
+      executions of the test (once with and once without eager execution).
     assert_no_eager_garbage: If True, sets DEBUG_SAVEALL on the garbage
       collector and asserts that no extra garbage has been created when running
-      the test in eager mode. This will fail if there are reference cycles
-      (e.g. a = []; a.append(a)). Off by default because some tests may create
-      garbage for legitimate reasons (e.g. they define a class which inherits
-      from `object`), and because DEBUG_SAVEALL is sticky in some Python
-      interpreters (meaning that tests which rely on objects being collected
-      elsewhere in the unit test file will not work). Additionally, checks that
-      nothing still has a reference to Tensors that the test allocated.
+      the test with eager execution enabled. This will fail if there are
+      reference cycles (e.g. a = []; a.append(a)). Off by default because some
+      tests may create garbage for legitimate reasons (e.g. they define a class
+      which inherits from `object`), and because DEBUG_SAVEALL is sticky in some
+      Python interpreters (meaning that tests which rely on objects being
+      collected elsewhere in the unit test file will not work). Additionally,
+      checks that nothing still has a reference to Tensors that the test
+      allocated.
   Returns:
-    Returns a decorator that will run the decorated test function
-        using both a graph and using eager execution.
+    Returns a decorator that will run the decorated test method twice:
+    once by constructing and executing a graph in a session and once with
+    eager execution enabled.
   """
 
   assert not __unused__, "Add () after run_in_graph_and_eager_modes."
 
   def decorator(f):
-    """Test method decorator."""
-
     def decorated(self, **kwargs):
-      """Decorated the test method."""
       with context.graph_mode():
-        with self.test_session(graph, config, use_gpu, force_gpu):
+        with self.test_session(use_gpu=use_gpu):
           f(self, **kwargs)
 
       if reset_test:
@@ -663,27 +686,20 @@ def run_in_graph_and_eager_modes(__unused__=None,
         self._tempdir = None
         self.setUp()
 
-      def run_eager_mode(self, **kwargs):
-        if force_gpu:
-          gpu_name = gpu_device_name()
-          if not gpu_name:
-            gpu_name = "/device:GPU:0"
-          with context.device(gpu_name):
-            f(self)
-        elif use_gpu:
-          # TODO(xpan): Support softplacement and gpu by default when available.
-          f(self, **kwargs)
-        else:
-          with context.device("/device:CPU:0"):
+      def run_eagerly(self, **kwargs):
+        if not use_gpu:
+          with ops.device("/cpu:0"):
             f(self, **kwargs)
+        else:
+          f(self, **kwargs)
 
       if assert_no_eager_garbage:
-        run_eager_mode = assert_no_new_tensors(
-            assert_no_garbage_created(run_eager_mode))
+        run_eagerly = assert_no_new_tensors(
+            assert_no_garbage_created(run_eagerly))
 
       with context.eager_mode():
         with ops.Graph().as_default():
-          run_eager_mode(self, **kwargs)
+          run_eagerly(self, **kwargs)
 
     return decorated
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 26fd1f1c11..9f6793eac8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.platform import test
 
 class EmbeddingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=False)
   def test_embedding(self):
     testing_utils.layer_test(
         keras.layers.Embedding,
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
index bb003c1ddd..2c08b647ea 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
@@ -27,14 +27,14 @@ from tensorflow.python.platform import test
 
 class GlobalPoolingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_1d(self):
     testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
                              input_shape=(3, 4, 5))
     testing_utils.layer_test(
         keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_2d(self):
     testing_utils.layer_test(
         keras.layers.pooling.GlobalMaxPooling2D,
@@ -53,7 +53,7 @@ class GlobalPoolingTest(test.TestCase):
         kwargs={'data_format': 'channels_last'},
         input_shape=(3, 5, 6, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_3d(self):
     testing_utils.layer_test(
         keras.layers.pooling.GlobalMaxPooling3D,
@@ -75,7 +75,7 @@ class GlobalPoolingTest(test.TestCase):
 
 class Pooling2DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_2d(self):
     pool_size = (3, 3)
     for strides in [(1, 1), (2, 2)]:
@@ -88,7 +88,7 @@ class Pooling2DTest(test.TestCase):
           },
           input_shape=(3, 5, 6, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_2d(self):
     testing_utils.layer_test(
         keras.layers.AveragePooling2D,
@@ -122,7 +122,7 @@ class Pooling2DTest(test.TestCase):
 
 class Pooling3DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_3d(self):
     pool_size = (3, 3, 3)
     testing_utils.layer_test(
@@ -141,7 +141,7 @@ class Pooling3DTest(test.TestCase):
         },
         input_shape=(3, 4, 11, 12, 10))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_3d(self):
     pool_size = (3, 3, 3)
     testing_utils.layer_test(
@@ -163,7 +163,7 @@ class Pooling3DTest(test.TestCase):
 
 class Pooling1DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_1d(self):
     for padding in ['valid', 'same']:
       for stride in [1, 2]:
@@ -173,7 +173,7 @@ class Pooling1DTest(test.TestCase):
                     'padding': padding},
             input_shape=(3, 5, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_1d(self):
     for padding in ['valid', 'same']:
       for stride in [1, 2]:
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index edc63264a3..6d33086936 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -174,215 +174,161 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32))
     self.assertEqual(read, 2)
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAdd(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(resource_variable_ops.assign_variable_op(
-          handle, constant_op.constant([[1]], dtype=dtypes.int32)))
-      self.evaluate(resource_variable_ops.resource_scatter_add(
-          handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterSub(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_sub(handle, [0],
-                                                     constant_op.constant(
-                                                         [[2]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[-1]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMul(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_mul(handle, [0],
-                                                     constant_op.constant(
-                                                         [[5]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[5]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterDiv(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_div(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[2]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMin(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_min(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_min(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMax(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_max(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[6]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAddScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_add(handle, [0],
-                                                     constant_op.constant(
-                                                         2,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterSubScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_sub(handle, [0],
-                                                     constant_op.constant(
-                                                         2,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[-1]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMulScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_mul(handle, [0],
-                                                     constant_op.constant(
-                                                         5,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[5]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterDivScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_div(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[2]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMinScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_min(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_min(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMaxScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_max(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[6]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 297a8bbde5..7bd57ad3d8 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -237,7 +237,17 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
 
       # pylint: disable=cell-var-from-loop
       def loss():
@@ -256,7 +266,17 @@ class MomentumOptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
-    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
 
     def loss():
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
-- 
GitLab


From 1f9eeeb842a052326da766a626b32b2e7a50ffcc Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 10 Apr 2018 10:50:01 -0700
Subject: [PATCH 0512/1262] Adding release notes for 1.8.0rc0

---
 RELEASE.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index e845953174..6ec03f94d8 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,62 @@
+# Release 1.8.0
+
+## Major Features And Improvements
+* Can now pass `tf.contrib.distribute.MirroredStrategy()` to `tf.estimator.RunConfig()` to run an Estimator model on multiple GPUs on one machine.
+* Add `tf.contrib.data.prefetch_to_device()`, which supports prefetching to GPU memory.
+* Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
+* Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
+* `tf.contrib.bayesflow` is moving out to it's own repo.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
+  * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
+  * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
+  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators in eager mode.
+* Eager Execution:
+  * Can now naturally iterate over `tf.data.Dataset` objects without wrapping in a `tf.contrib.eager.Iterator`. For example: `for x in tf.data.Dataset.range(10): print(x)`
+  * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device(“/gpu:0”)`) (Fixes #14133)
+  * `tf.GradientTape` has moved out of contrib.
+* `tf.keras`:
+  * Added the fashion mnist dataset.
+  * New data preprocessing functions: `image/random_brightness`, `sequence/TimeseriesGenerator`, and `text/hashing_trick`.
+* Accelerated Linear Algebra (XLA):
+  * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
+* TensorFlow Debugger (tfdbg) CLI:
+ * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+ * Fix spurious background colors in some text terminals.
+* tf.contrib:
+  * Add meta-distribution BatchReshape which reshapes batch dimensions.
+  * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
+  * Add `tf.contrib.framework.argsort`.
+  * Allow `DNNBoostedTreeCombinedEstimator` to work with core versions of feature columns and losses.
+  * Add non-linear image warping ops: `tf.contrib.image.sparse_image_warp`, `tf.contrib.image.dense_image_warp`, and `tf.contrib.image.interpolate_spline`.
+  * Fix bug in `tf.contrib.opt.MultitaskOptimizerWrapper` where types of tensors were mismatched.
+* Other:
+  * Low-level graph construction now calls the TensorFlow C API. This change should be invisible to most users, but can be disabled by setting the environment variable `TF_C_API_GRAPH_CONSTRUCTION=0` in this release. Future releases will remove the ability to disable this change. Please [file a bug](https://github.com/tensorflow/tensorflow/issues/new) if you find yourself using this escape hatch.
+  * Add description of shapes and a pointer to tutorial notebook in `tf.distributions.Distribution`.
+  * Update scatter operations:
+    * Add `tf.scatter_min` and `tf.scatter_max`
+    * Extend scatter operations to work with a scalar update parameter.
+  * Move cuDNN RNN ops to core for use in TensorFlow codebase only.
+  * Add `float64` support for `Conv2d`, `Conv2dBackpropInput`, and `Conv2dBackpropFilter`.
+  * Add `float64` support for `AvgPool`/`AvgPoolGrad`.
+  * Make graph name scope thread local so that they work correctly in multi-threaded environments.
+  * Update nsync synchronization library to avoid slow primitives on Linux.
+  * Removed need to put nsync/public on C include path when building custom ops.
+  * Add `tf.image.psnr`, `tf.image.ssim`, `tf.image.ssim_multiscale`, `tf.image.image_gradients`, `tf.image.sobel_edges`.
+  * Add links to https://js.tensorflow.org.
+  * Fix non-uniformity of orthogonal matrices.
+  * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu
+
+
 # Release 1.7.0
 
 ## Major Features And Improvements
-- 
GitLab


From c276b8314cd3161c5626d845edcfb6697cefd043 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Tue, 10 Apr 2018 10:52:15 -0700
Subject: [PATCH 0513/1262] [TF:XLA] fix a segfault in MakeFakeArguments, and
 add a test case.

PiperOrigin-RevId: 192310749
---
 tensorflow/compiler/xla/tests/BUILD           | 13 +++++
 tensorflow/compiler/xla/tests/test_utils.cc   |  2 +-
 .../compiler/xla/tests/test_utils_test.cc     | 57 +++++++++++++++++++
 3 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/compiler/xla/tests/test_utils_test.cc

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6c43014b33..699b077d80 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1969,3 +1969,16 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
+
+xla_test(
+    name = "test_utils_test",
+    srcs = ["test_utils_test.cc"],
+    deps = [
+        ":local_client_test_base",
+        ":test_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 68f75d50cb..e30d115fae 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -165,7 +165,7 @@ enum class ConstantType { kUnknown, kZero, kOne };
 // Return the constant type required by this computation, if known.
 ConstantType GetInitValue(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
-  if (computation.num_parameters() != 2 ||
+  if (computation.num_parameters() != 2 || root->operand_count() != 2 ||
       root->operand(0)->opcode() != HloOpcode::kParameter ||
       root->operand(1)->opcode() != HloOpcode::kParameter ||
       root->operand(0) == root->operand(1)) {
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
new file mode 100644
index 0000000000..e8efc6e2a8
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+// A test fixture is used because we need a client for our computation builder.
+class TestUtilsTest : public LocalClientTestBase {};
+
+XLA_TEST_F(TestUtilsTest, UnusedParam) {
+  ComputationBuilder builder(local_client_, TestName());
+  // Make the reduction lambda.
+  Shape single_float = ShapeUtil::MakeShape(F32, {});
+  builder.Parameter(0, single_float, "unused");
+  builder.Parameter(1, single_float, "used");
+  auto computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  // Make the reduction.
+  Shape pair_float = ShapeUtil::MakeShape(F32, {2});
+  builder.Reduce(builder.Parameter(0, pair_float, "operand"),
+                 builder.Parameter(1, single_float, "init"),
+                 computation_status.ValueOrDie(), {0});
+  computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  auto executable_status = local_client_->Compile(
+      computation_status.ValueOrDie(), {&pair_float, &single_float},
+      ExecutableBuildOptions());
+  TF_ASSERT_OK(executable_status.status());
+  HloModule& module = const_cast<HloModule&>(
+      executable_status.ValueOrDie()->executable()->module());
+  TF_ASSERT_OK(MakeFakeArguments(&module).status());
+}
+
+}  // namespace
+}  // namespace xla
-- 
GitLab


From c2582d40474211877764b5ac24d412384d20bd25 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 10 Apr 2018 11:04:32 -0700
Subject: [PATCH 0514/1262] Update a few release notes

---
 RELEASE.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 6ec03f94d8..83c14200ec 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -13,9 +13,8 @@
   * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
   * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
   * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
-  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators in eager mode.
 * Eager Execution:
-  * Can now naturally iterate over `tf.data.Dataset` objects without wrapping in a `tf.contrib.eager.Iterator`. For example: `for x in tf.data.Dataset.range(10): print(x)`
+  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators when eager execution is enabled.
   * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device(“/gpu:0”)`) (Fixes #14133)
   * `tf.GradientTape` has moved out of contrib.
 * `tf.keras`:
@@ -24,8 +23,8 @@
 * Accelerated Linear Algebra (XLA):
   * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
 * TensorFlow Debugger (tfdbg) CLI:
- * During tensor-filter operations, allow exclusion of nodes by regular expressions.
- * Fix spurious background colors in some text terminals.
+  * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+  * Fix spurious background colors in some text terminals.
 * tf.contrib:
   * Add meta-distribution BatchReshape which reshapes batch dimensions.
   * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
-- 
GitLab


From e5d12651d3ff1accab74c79a9905e7ec3a05bfc2 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 10 Apr 2018 11:06:26 -0700
Subject: [PATCH 0515/1262] Formatting fix

---
 RELEASE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASE.md b/RELEASE.md
index 83c14200ec..2717c75740 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -25,7 +25,7 @@
 * TensorFlow Debugger (tfdbg) CLI:
   * During tensor-filter operations, allow exclusion of nodes by regular expressions.
   * Fix spurious background colors in some text terminals.
-* tf.contrib:
+* `tf.contrib`:
   * Add meta-distribution BatchReshape which reshapes batch dimensions.
   * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
   * Add `tf.contrib.framework.argsort`.
-- 
GitLab


From afa17984849881f39fb56c6e3500d866539924d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 11:09:37 -0700
Subject: [PATCH 0516/1262] Adds support for hoisting out common denominator in
 arithmetic_optimizer

PiperOrigin-RevId: 192314177
---
 .../optimizers/arithmetic_optimizer.cc        | 103 +++++++++++++-----
 .../optimizers/arithmetic_optimizer_test.cc   |  85 ++++++++++++++-
 2 files changed, 161 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index fa0f7c1c6e..463c332858 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -695,15 +695,20 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
   }
 };
 
-// Use the commutativity and (left- and right-) distributive property of
-// multiplication over addition to hoist common factors out of aggregate nodes
-// where all the inputs are Mul nodes. This pattern occurs frequently in
-// regularization terms for the gradients during training.
+// Use the distributive property of multiplication and division over addition,
+// along with commutativity of the former, to hoist common factors/denominators
+// out of aggregate nodes where ALL the inputs are Mul/Div nodes.
+// This pattern occurs frequently in regularization terms for the gradients
+// during training.
 //
 // For example, we can rewrite an expression of the form:
 //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
 // to the following:
 //   Mul(x, AddN(y1, y2, y3, ... yn))
+// For division, we can rewrite
+//   AddN(Div(y1, x), Div(y2, x), Div(y3, x), ... Div(yn, x))
+// to:
+//   Div(AddN(y1, y2, y3, ... yn), x)
 class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
  public:
   explicit HoistCommonFactorOutOfAggregation(
@@ -720,9 +725,11 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
+    bool common_factor_is_denominator = false;
     std::set<string> common_factors;
     std::vector<string> ctrl_deps;
-    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors, &ctrl_deps));
+    TF_RETURN_IF_ERROR(GetCommonFactors(
+        node, &common_factors, &common_factor_is_denominator, &ctrl_deps));
 
     if (common_factors.size() == 1) {
       const string& common_factor = *common_factors.begin();
@@ -730,24 +737,31 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       // Gather up the non-shared factors
       bool shapes_match = true;
       std::vector<string> unique_factors;
-      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor, &shapes_match,
-                                          &unique_factors));
+      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor,
+                                          common_factor_is_denominator,
+                                          &shapes_match, &unique_factors));
 
       if (shapes_match) {
         NodeDef* input_0;
         TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input_0));
 
-        // Use a copy of the first Mul node for the outer multiplication.
-        NodeDef* new_mul_node = AddCopyNode(OuterMulNodeName(node), input_0);
+        // Use a copy of the first node for the outer multiplication/division.
+        NodeDef* new_outer_node = AddCopyNode(
+            OuterNodeName(node, common_factor_is_denominator), input_0);
         // And a copy of aggregation node as one of the inner operands
         NodeDef* new_add_node = AddCopyNode(InnerAddNodeName(node), node);
 
-        new_mul_node->set_device(node->device());
-        new_mul_node->set_input(0, common_factor);
-        new_mul_node->set_input(1, new_add_node->name());
+        new_outer_node->set_device(node->device());
+        if (common_factor_is_denominator) {
+          new_outer_node->set_input(0, new_add_node->name());
+          new_outer_node->set_input(1, common_factor);
+        } else {
+          new_outer_node->set_input(0, common_factor);
+          new_outer_node->set_input(1, new_add_node->name());
+        }
 
-        ctx_.node_map->AddOutput(common_factor, new_mul_node->name());
-        ctx_.node_map->AddOutput(new_add_node->name(), new_mul_node->name());
+        ctx_.node_map->AddOutput(common_factor, new_outer_node->name());
+        ctx_.node_map->AddOutput(new_add_node->name(), new_outer_node->name());
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
@@ -766,17 +780,18 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
         AddToOptimizationQueue(new_add_node);
         // do not optimize the same node twice
         rewritten_nodes_.insert(node->name());
-        *simplified_node_name = new_mul_node->name();
+        *simplified_node_name = new_outer_node->name();
       }
     }
     return Status::OK();
   }
 
  private:
-  // Get a name for new outer Mul node
-  string OuterMulNodeName(const NodeDef* node) const {
+  // Get a name for new outer node
+  string OuterNodeName(const NodeDef* node, bool is_div) const {
     auto scope_and_name = ParseNodeScopeAndName(node->name());
-    return OptimizedNodeName(scope_and_name, "Mul");
+    return is_div ? OptimizedNodeName(scope_and_name, "Div")
+                  : OptimizedNodeName(scope_and_name, "Mul");
   }
 
   // Get a name new inner Add node
@@ -785,11 +800,17 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     return OptimizedNodeName(scope_and_name, "Add");
   }
 
-  // Determine the set of common factors if the input nodes are all Mul nodes.
+  // Determine the set of common factors if the input nodes are all Mul or
+  // Div nodes.
   Status GetCommonFactors(const NodeDef* node, std::set<string>* common_factors,
+                          bool* common_factor_is_denominator,
                           std::vector<string>* ctrl_deps) const {
     CHECK(common_factors->empty());
+    CHECK_NOTNULL(common_factor_is_denominator);
+    *common_factor_is_denominator = false;
 
+    bool has_mul = false;
+    bool has_div = false;
     for (int i = 0; i < node->input_size(); ++i) {
       if (i > 0 && common_factors->empty()) break;
       if (IsControlInput(node->input(i))) {
@@ -799,12 +820,36 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       NodeDef* input;
       TF_RETURN_IF_ERROR(GetInputNode(node->input(i), &input));
 
-      if (!IsMul(*input)) {
+      if ((!IsMul(*input) && !IsAnyDiv(*input)) || (IsMul(*input) && has_div) ||
+          (IsAnyDiv(*input) && has_mul)) {
+        // Break if input is neither a Mul or Div, or if there are both Mul &
+        // Div Ops.
         common_factors->clear();
         break;
+      } else if (IsAnyDiv(*input)) {
+        has_div = true;
+        // In case of possible common dividers, we avoid hoisting out if any
+        // input is not float/double, since integer division is not distributive
+        // over addition.
+        OpInfo::TensorProperties properties0, properties1;
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(0), &properties0));
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(1), &properties1));
+        if (properties0.dtype() != DT_FLOAT &&
+            properties0.dtype() != DT_DOUBLE &&
+            properties1.dtype() != DT_FLOAT &&
+            properties1.dtype() != DT_DOUBLE) {
+          common_factors->clear();
+          break;
+        }
+      } else if (IsMul(*input)) {
+        has_mul = true;
       }
 
-      std::set<string> factors_i{input->input(0), input->input(1)};
+      // We only focus on common factors from denominators if any Op is a
+      // Div.
+      std::set<string> factors_i =
+          has_mul ? std::set<string>{input->input(0), input->input(1)}
+                  : std::set<string>{input->input(1)};
       if (i == 0) {
         std::swap(*common_factors, factors_i);
       } else {
@@ -819,6 +864,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
         ctrl_deps->push_back(input->input(i));
       }
     }
+
+    *common_factor_is_denominator = has_div;
     return Status::OK();
   }
 
@@ -827,6 +874,7 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   // have the same shape since the other aggregation ops do not support
   // broadcasting.
   Status GetUniqueFactors(const NodeDef* node, const string& common_factor,
+                          const bool common_factor_is_denominator,
                           bool* shapes_match,
                           std::vector<string>* unique_factors) const {
     *shapes_match = true;
@@ -837,11 +885,13 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       if (IsControlInput(input)) {
         break;
       }
-      NodeDef* mul_node;
-      TF_RETURN_IF_ERROR(GetInputNode(input, &mul_node));
+      NodeDef* inner_node;
+      TF_RETURN_IF_ERROR(GetInputNode(input, &inner_node));
       const int unique_factor_index =
-          mul_node->input(0) == common_factor ? 1 : 0;
-      unique_factors->push_back(mul_node->input(unique_factor_index));
+          common_factor_is_denominator
+              ? 0
+              : (inner_node->input(0) == common_factor ? 1 : 0);
+      unique_factors->push_back(inner_node->input(unique_factor_index));
       if (i > 0 && !IsAdd(*node)) {
         OpInfo::TensorProperties lhs;
         OpInfo::TensorProperties rhs;
@@ -857,7 +907,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     // if graph rewrite happens in multiple passes without graph pruning between
     // them, it's possible that rewritten node already exists in a graph
     return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() ||
-           ctx_.node_map->NodeExists(OuterMulNodeName(node));
+           ctx_.node_map->NodeExists(OuterNodeName(node, false)) ||
+           ctx_.node_map->NodeExists(OuterNodeName(node, true));
   }
 
   // keep names of the nodes that were optimized by this stage
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 9677175d2e..e639812858 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -31,6 +31,9 @@ namespace grappler {
 
 namespace {
 
+constexpr char kHoistFactorOptimizerDiv[] =
+    "ArithmeticOptimizer/HoistCommonFactor_Div_";
+
 constexpr char kHoistFactorOptimizerMul[] =
     "ArithmeticOptimizer/HoistCommonFactor_Mul_";
 
@@ -42,6 +45,11 @@ string HoistMulName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerMul, "");
 }
 
+// Optimized name of outer Div node by HoistCommonFactorOutOfAggregation
+string HoistDivName(const string& name) {
+  return AddPrefixToNodeName(name, kHoistFactorOptimizerDiv, "");
+}
+
 // Optimized name of inner Add node by HoistCommonFactorOutOfAggregation
 string HoistAddName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerAdd, "");
@@ -558,7 +566,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ("^Placeholder", add_1_const_node->input(0));
 }
 
-TEST_F(ArithmeticOptimizerTest, HoistFactor) {
+TEST_F(ArithmeticOptimizerTest, HoistFactorMul) {
   for (bool matching_shapes : {true, false}) {
     for (bool use_addn : {true, false}) {
       tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -625,6 +633,81 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, HoistFactorDiv) {
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      for (bool use_ints : {true, false}) {
+        tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+        Output x = use_ints
+                       ? ops::Const(s.WithOpName("x"), {1, 2}, {1, 2})
+                       : ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+        Output y1 = use_ints
+                        ? ops::Const(s.WithOpName("y1"), {3, 4}, {1, 2})
+                        : ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+        Output y2;
+        if (matching_shapes) {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5, 6}, {1, 2})
+                        : ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2});
+        } else {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5}, {1, 1})
+                        : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+        }
+        Output div1 = ops::Div(s.WithOpName("div1"), y1, x);
+        Output div2 = ops::Div(s.WithOpName("div2"), y2, x);
+        Output id =
+            use_addn
+                ? ops::Identity(s.WithOpName("id"),
+                                ops::AddN(s.WithOpName("add"), {div1, div2}))
+                : ops::Identity(s.WithOpName("id"),
+                                ops::Add(s.WithOpName("add"), div1, div2));
+
+        GrapplerItem item;
+        item.fetch = {"id"};
+        TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+        ArithmeticOptimizer optimizer;
+        EnableOnlyHoistCommonFactor(&optimizer);
+
+        GraphDef output;
+        OptimizeTwice(&optimizer, &item, &output);
+
+        // We expect the following rewrite(s) to occur:
+        //
+        //        Add                 Div
+        //      /    \               /   \
+        //    Div    Div       ->  Add    x
+        //    / \    / \           / \
+        //   y1  x  y2  x         y1  y2
+        //
+        // If "root" op is AddN and shapes does not match, this rewrite is not
+        // possible and graph should stay intact.
+        NodeMap node_map(&output);
+
+        if ((use_addn && !matching_shapes) || use_ints) {
+          VerifyGraphsMatch(item.graph, output, __LINE__);
+        } else {
+          EXPECT_EQ(9, output.node_size());
+
+          const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
+          ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+          EXPECT_EQ("y1", new_add_node->input(0));
+          EXPECT_EQ("y2", new_add_node->input(1));
+
+          const NodeDef* new_div_node = node_map.GetNode(HoistDivName("add"));
+          ASSERT_TRUE(new_div_node != nullptr) << "Hoisted Div node not found";
+          EXPECT_EQ(new_add_node->name(), new_div_node->input(0));
+          EXPECT_EQ("x", new_div_node->input(1));
+
+          const NodeDef* id_node = node_map.GetNode("id");
+          ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+          EXPECT_EQ("id", id_node->name());
+          EXPECT_EQ(HoistDivName("add"), id_node->input(0));
+        }
+      }
+    }
+  }
+}
+
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
-- 
GitLab


From b8fe5bf30662155ae351b3dc794456d2c68b151c Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 10 Apr 2018 11:13:35 -0700
Subject: [PATCH 0517/1262] Update version for 1.8.0rc0

---
 tensorflow/core/public/version.h              |  4 ++--
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       |  9 ++++++--
 tensorflow/tools/docker/Dockerfile.devel      |  2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |  2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |  2 +-
 tensorflow/tools/pip_package/setup.py         |  2 +-
 11 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 706968d347..0ca7d8475f 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 7
+#define TF_MINOR_VERSION 8
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 274413e294..995b8ae666 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 1a0956634d..2938a8f7ee 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index cdde45a6f4..c87eacfa93 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.7.0</version>
+                 <version>1.8.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 04e4242b0f..8387289fcf 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -194,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -299,7 +299,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -485,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -659,14 +659,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -678,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -697,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -716,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index b3e9616a05..a237d1af54 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 7d7c2aa75a..677e3329b6 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.7.0 on Linux:
+for TensorFlow 1.8.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -450,6 +450,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
@@ -471,6 +473,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -486,6 +489,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 11f476d12c..0563bd4d6c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 037d13116e..c65e0b72bc 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.7
+ARG TF_BRANCH=r1.8
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1fcb6428b2..9f0cf63e7e 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 6511a50b3b..f676f040ad 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.7.0'
+_VERSION = '1.8.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 0fd3c5c450a844573f9c417994ae87035119d2b4 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 10 Apr 2018 11:19:26 -0700
Subject: [PATCH 0518/1262] Adding the python symlink command for devel
 packages too.

---
 tensorflow/tools/docker/Dockerfile.devel     | 2 ++
 tensorflow/tools/docker/Dockerfile.devel-gpu | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 11f476d12c..c4f6b24e5c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1fcb6428b2..5aea47e582 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
-- 
GitLab


From 17a320fa107905b335a6fb944eaf323e868a2470 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 11:20:38 -0700
Subject: [PATCH 0519/1262] [XLA] Fix the size of the data returned from
 Literal for sparse literals.

PiperOrigin-RevId: 192315888
---
 tensorflow/compiler/xla/literal_util.h       | 13 +++++++++----
 tensorflow/compiler/xla/literal_util_test.cc |  4 +---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 66ff39ecbb..a6a3dffeb7 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -741,7 +741,13 @@ class Literal {
     int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
 
     // Returns the number of elements in this piece's array.
-    int64 element_count() const { return ShapeUtil::ElementsIn(subshape()); }
+    int64 element_count() const {
+      // If this is a sparse array, use the number of elements represented by
+      // the indices in the associated SparseIndexArray.
+      return LayoutUtil::IsSparseArray(subshape())
+                 ? sparse_indices()->index_count()
+                 : ShapeUtil::ElementsIn(subshape());
+    }
 
     // Copy the data from 'src' into this piece's buffer. Shapes of this piece
     // and src must be compatible.
@@ -853,8 +859,7 @@ tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::ArraySlice<NativeT>(
-      reinterpret_cast<const NativeT*>(buffer()),
-      ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<const NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -867,7 +872,7 @@ tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::MutableArraySlice<NativeT>(
-      reinterpret_cast<NativeT*>(buffer()), ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index be4f2bc5ce..61046784e0 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -218,9 +218,7 @@ TEST_F(LiteralUtilTest, CreateSparse) {
   EXPECT_EQ(literal->sparse_indices()->data(),
             ArraySlice<int64>(expected_indices.data(),
                               expected_indices.num_elements()));
-  EXPECT_EQ(
-      ArraySlice<int64>(literal->data<int64>().data(), expected_values.size()),
-      ArraySlice<int64>(expected_values));
+  EXPECT_EQ(literal->data<int64>(), ArraySlice<int64>(expected_values));
 }
 
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
-- 
GitLab


From 1ab0cc3548f330fda61cf01c524e3f85a00d8485 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 11:33:45 -0700
Subject: [PATCH 0520/1262] Fix bug in TFLite Interpreter python interface

PiperOrigin-RevId: 192318426
---
 tensorflow/contrib/lite/python/interpreter.py                 | 4 ++--
 tensorflow/contrib/lite/python/interpreter_test.py            | 3 +++
 .../lite/python/interpreter_wrapper/interpreter_wrapper.cc    | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index b8638007f7..cb9c0d3121 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -121,8 +121,8 @@ class Interpreter(object):
     Raises:
       ValueError: If the interpreter could not resize the input tensor.
     """
-    if not self.ResizeInputTensor.SetTensor(input_index, tensor_size):
-      raise ValueError('Failed to set input')
+    if not self._interpreter.ResizeInputTensor(input_index, tensor_size):
+      raise ValueError('Failed to resize input')
 
   def get_output_details(self):
     """Gets model output details.
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
index cd2386f526..f802edf020 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -81,6 +81,9 @@ class InterpreterTest(test_util.TensorFlowTestCase):
 
     test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
     expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
+    interpreter.resize_tensor_input(input_details[0]['index'],
+                                    np.array(test_input.shape, dtype=np.int32))
+    interpreter.allocate_tensors()
     interpreter.set_tensor(input_details[0]['index'], test_input)
     interpreter.invoke()
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 35ad226b78..4b34969356 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -186,7 +186,7 @@ bool InterpreterWrapper::ResizeInputTensor(int i, PyObject* value) {
   std::vector<int> dims(PyArray_SHAPE(array)[0]);
   memcpy(dims.data(), PyArray_BYTES(array), dims.size() * sizeof(int));
 
-  return interpreter_->ResizeInputTensor(i, dims);
+  return (interpreter_->ResizeInputTensor(i, dims) == kTfLiteOk);
 }
 
 std::string InterpreterWrapper::TensorName(int i) const {
-- 
GitLab


From 2177a2306ab43b758630180ca93b84602c73dfc6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 11:57:40 -0700
Subject: [PATCH 0521/1262] Enable loop-invariant node motion in the Grappler
 loop optimizer. Thanks to the team at Alibaba, who contributed the original
 version of this code.

PiperOrigin-RevId: 192322484
---
 tensorflow/core/grappler/optimizers/loop_optimizer.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 83c499bbe7..a422505d23 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -52,14 +52,11 @@ class LoopOptimizer : public GraphOptimizer {
 
   // Granular control for loop optimizer stages.
   struct LoopOptimizerOptions {
-    bool enable_loop_invariant_node_motion = false;
+    bool enable_loop_invariant_node_motion = true;
     bool enable_stack_push_removal = true;
 
     static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) {
       LoopOptimizerOptions options;
-      if (opt_level == RewriterConfig::AGGRESSIVE) {
-        options.enable_loop_invariant_node_motion = true;
-      }
       return options;
     }
   };
-- 
GitLab


From 199b8ade22550ca3e5ccc6c744914b3ef614d232 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 12:05:06 -0700
Subject: [PATCH 0522/1262] Expand list of value-preserving ops. This will
 increase the number of graphs where we can apply the involution and mul->conv
 fusion optimizations.

PiperOrigin-RevId: 192323712
---
 tensorflow/core/grappler/op_types.cc | 33 +++++++++++++++++++++++-----
 tensorflow/core/grappler/op_types.h  |  4 ++++
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 1fb1711f54..9c45aed62f 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -456,15 +456,38 @@ bool IsInvolution(const NodeDef& node) {
   return involution_ops.count(node.op()) > 0;
 }
 
-bool IsValuePreserving(const NodeDef& node) {
+bool IsValueAndOrderPreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
+  const std::unordered_set<string> value_and_order_preserving_ops{
+      "CheckNumerics",
+      "DebugGradientIdentity",
+      "DeepCopy"
+      "Enter",
+      "Exit",
+      "ExpandDims",
+      "Identity",
+      "IdentityN",
+      "PreventGradient",
+      "Print",
+      "Reshape",
+      "Snapshot",
+      "Squeeze",
+      "StopGradient",
+  };
+  return value_and_order_preserving_ops.count(node.op()) > 0;
+}
+
+bool IsValuePreserving(const NodeDef& node) {
   const std::unordered_set<string> value_preserving_ops{
-      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
-      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
-      "ExpandDims", "Squeeze"};
-  return value_preserving_ops.count(node.op()) > 0;
+      "InvertPermutation",
+      "Reverse",
+      "Roll",
+      "Transpose",
+  };
+  return IsValueAndOrderPreserving(node) ||
+         value_preserving_ops.count(node.op()) > 0;
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index d516baebf3..79fd05e187 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -168,6 +168,10 @@ bool ModifiesInputsInPlace(const NodeDef& node);
 // own inverse such that f(f(x)) == x.
 bool IsInvolution(const NodeDef& node);
 
+// Returns true if the op preserves the order and value of elements in its
+// first input tensor and possible changes its shape.
+bool IsValueAndOrderPreserving(const NodeDef& node);
+
 // Returns true if the op in node only rearranges the order of elements in its
 // first input tensor and possible changes its shape. More precisely, this
 // function returns true if the op commutes with all element-wise operations.
-- 
GitLab


From dc7883afb6220e5a105d8fea6e0cfdaf92839db3 Mon Sep 17 00:00:00 2001
From: Noah Eisen <ncteisen@google.com>
Date: Tue, 10 Apr 2018 12:28:04 -0700
Subject: [PATCH 0523/1262] Upgrade gRPC version and fix file duplication

This bumps the gRPC version used in OSS Tensorflow to pick up grpc/grpc#14541, which exposes gRPC serialization classes which were previously hidden in an internal namespace. Using these files eliminates files duplicated from gRPC repo

PiperOrigin-RevId: 192327358
---
 tensorflow/contrib/cmake/external/grpc.cmake  |   2 +-
 tensorflow/core/distributed_runtime/rpc/BUILD |  12 -
 .../rpc/grpc_master_service_impl.h            |  10 -
 .../rpc/grpc_serialization_traits.h           | 217 ------------------
 .../rpc/grpc_worker_service_impl.h            |  28 +--
 tensorflow/workspace.bzl                      |   8 +-
 6 files changed, 14 insertions(+), 263 deletions(-)
 delete mode 100644 tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h

diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index abfc69243e..bec8177a3f 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2)
+set(GRPC_TAG 09386db3939cae1ac12e5f09b735adfa8958c68e)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index d3478dfc38..fa0f8c9b52 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -189,7 +189,6 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
@@ -235,22 +234,11 @@ cc_library(
     srcs = ["grpc_master_service_impl.cc"],
     hdrs = ["grpc_master_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:master_proto_cc",
         "@grpc//:grpc++_unsecure",
     ],
 )
 
-cc_library(
-    name = "grpc_serialization_traits",
-    srcs = [],
-    hdrs = ["grpc_serialization_traits.h"],
-    deps = [
-        "@grpc//:grpc++_unsecure",
-        "@grpc//:grpc_unsecure",
-    ],
-)
-
 cc_library(
     name = "rpc_rendezvous_mgr",
     srcs = ["rpc_rendezvous_mgr.cc"],
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 3c382738c4..8f1b589698 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -25,18 +25,8 @@ limitations under the License.
 #include "grpc++/impl/codegen/stub_options.h"
 #include "grpc++/impl/codegen/sync_stream.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::CreateSessionRequest);
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::ExtendSessionRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepResponse);
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
deleted file mode 100644
index e7f5fb0c6a..0000000000
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/support/slice.h"
-#include "grpc/grpc.h"
-
-namespace grpc {
-
-namespace tensorflow_helper {
-
-const int kGrpcBufferWriterMaxBufferLength = 8192;
-
-class GrpcBufferWriter final
-    : public ::grpc::protobuf::io::ZeroCopyOutputStream {
- public:
-  explicit GrpcBufferWriter(grpc_byte_buffer** bp, int block_size)
-      : block_size_(block_size), byte_count_(0), have_backup_(false) {
-    *bp = grpc_raw_byte_buffer_create(NULL, 0);
-    slice_buffer_ = &(*bp)->data.raw.slice_buffer;
-  }
-
-  ~GrpcBufferWriter() override {
-    if (have_backup_) {
-      grpc_slice_unref(backup_slice_);
-    }
-  }
-
-  bool Next(void** data, int* size) override {
-    if (have_backup_) {
-      slice_ = backup_slice_;
-      have_backup_ = false;
-    } else {
-      slice_ = grpc_slice_malloc(block_size_);
-    }
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    grpc_slice_buffer_add(slice_buffer_, slice_);
-    return true;
-  }
-
-  void BackUp(int count) override {
-    grpc_slice_buffer_pop(slice_buffer_);
-    if (count == block_size_) {
-      backup_slice_ = slice_;
-    } else {
-      backup_slice_ =
-          grpc_slice_split_tail(&slice_, GRPC_SLICE_LENGTH(slice_) - count);
-      grpc_slice_buffer_add(slice_buffer_, slice_);
-    }
-    // It's dangerous to keep an inlined grpc_slice as the backup slice, since
-    // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an address held by
-    // slice_buffer_.
-    have_backup_ = backup_slice_.refcount != NULL;
-    byte_count_ -= count;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override { return byte_count_; }
-
- private:
-  const int block_size_;
-  int64_t byte_count_;
-  grpc_slice_buffer* slice_buffer_;
-  bool have_backup_;
-  grpc_slice backup_slice_;
-  grpc_slice slice_;
-};
-
-class GrpcBufferReader final
-    : public ::grpc::protobuf::io::ZeroCopyInputStream {
- public:
-  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
-      : byte_count_(0), backup_count_(0) {
-    (void)grpc_byte_buffer_reader_init(&reader_, buffer);
-  }
-  ~GrpcBufferReader() override { grpc_byte_buffer_reader_destroy(&reader_); }
-
-  bool Next(const void** data, int* size) override {
-    if (backup_count_ > 0) {
-      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
-              backup_count_;
-      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
-      *size = (int)backup_count_;
-      backup_count_ = 0;
-      return true;
-    }
-    if (!grpc_byte_buffer_reader_next(&reader_, &slice_)) {
-      return false;
-    }
-    grpc_slice_unref(slice_);
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    return true;
-  }
-
-  void BackUp(int count) override { backup_count_ = count; }
-
-  bool Skip(int count) override {
-    const void* data;
-    int size;
-    while (Next(&data, &size)) {
-      if (size >= count) {
-        BackUp(size - count);
-        return true;
-      }
-      // size < count;
-      count -= size;
-    }
-    // error or we have too large count;
-    return false;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override {
-    return byte_count_ - backup_count_;
-  }
-
- private:
-  int64_t byte_count_;
-  int64_t backup_count_;
-  grpc_byte_buffer_reader reader_;
-  grpc_slice slice_;
-};
-
-}  // namespace tensorflow_helper
-
-// Defines specialized serialization/deserialization routines that
-// default to allowing a 2GB max message size.
-//
-// To instantiate this template for a particular type `T`, use
-// `TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(T)`, as defined below.
-template <typename T>
-class UnlimitedSizeProtoSerializationTraits {
- public:
-  static Status Serialize(const T& msg, grpc_byte_buffer** bp,
-                          bool* own_buffer) {
-    *own_buffer = true;
-    int byte_size = msg.ByteSize();
-    if (byte_size < 0) {
-      return Status(StatusCode::INTERNAL, "Message length was negative");
-    } else if (byte_size <=
-               tensorflow_helper::kGrpcBufferWriterMaxBufferLength) {
-      grpc_slice slice = grpc_slice_malloc(byte_size);
-      GPR_CODEGEN_ASSERT(
-          GRPC_SLICE_END_PTR(slice) ==
-          msg.SerializeWithCachedSizesToArray(GRPC_SLICE_START_PTR(slice)));
-      *bp = grpc_raw_byte_buffer_create(&slice, 1);
-      grpc_slice_unref(slice);
-      return Status::OK;
-    } else {
-      tensorflow_helper::GrpcBufferWriter writer(
-          bp, tensorflow_helper::kGrpcBufferWriterMaxBufferLength);
-      return msg.SerializeToZeroCopyStream(&writer)
-                 ? Status::OK
-                 : Status(StatusCode::INTERNAL, "Failed to serialize message");
-    }
-  }
-
-  static Status Deserialize(grpc_byte_buffer* buffer, T* msg,
-                            int max_message_size = INT_MAX) {
-    if (buffer == nullptr) {
-      return Status(StatusCode::INTERNAL, "No payload");
-    }
-    Status result = Status::OK;
-    {
-      tensorflow_helper::GrpcBufferReader reader(buffer);
-      ::grpc::protobuf::io::CodedInputStream decoder(&reader);
-      if (max_message_size == 0) {
-        // NOTE(mrry): Override maximum message size to 2GB.
-        decoder.SetTotalBytesLimit(INT_MAX, INT_MAX);
-      } else {
-        decoder.SetTotalBytesLimit(max_message_size, max_message_size);
-      }
-      if (!msg->ParseFromCodedStream(&decoder)) {
-        result = Status(StatusCode::INTERNAL, msg->InitializationErrorString());
-      }
-      if (!decoder.ConsumedEntireMessage()) {
-        result = Status(StatusCode::INTERNAL, "Did not read entire message");
-      }
-    }
-    grpc_byte_buffer_destroy(buffer);
-    return result;
-  }
-};
-
-}  // namespace grpc
-
-// For the given protobuf message type `MessageType`, specializes the
-// gRPC serialization and deserialization such that the default
-// maximum message size is 2GB.
-#define TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(MessageType)             \
-  namespace grpc {                                                    \
-  template <>                                                         \
-  class SerializationTraits<MessageType>                              \
-      : public UnlimitedSizeProtoSerializationTraits<MessageType> {}; \
-  }  // namespace grpc
-
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 2a2f7e3ffb..62b299d5c2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,24 +26,16 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RegisterGraphRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphResponse);
-
 namespace tensorflow {
 class GrpcByteSource : public TensorResponse::Source {
  public:
-  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
+  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
   ~GrpcByteSource() override { DeleteStream(); }
 
-  typedef ::grpc::tensorflow_helper::GrpcBufferReader Reader;
+  typedef ::grpc::GrpcProtoBufferReader Reader;
 
   protobuf::io::ZeroCopyInputStream* contents() override {
     DeleteStream();
@@ -58,7 +50,7 @@ class GrpcByteSource : public TensorResponse::Source {
     }
   }
 
-  grpc_byte_buffer* buffer_;  // Not owned
+  ::grpc::ByteBuffer* buffer_;  // Not owned
   Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
   char space_[sizeof(Reader)];
 };
@@ -74,17 +66,15 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::TensorResponse.
 // Wire-format is identical to RecvTensorResponse.
 template <>
-class SerializationTraits<tensorflow::TensorResponse>
-    : public UnlimitedSizeProtoSerializationTraits<tensorflow::TensorResponse> {
+class SerializationTraits<tensorflow::TensorResponse> {
  public:
-  static Status Serialize(const tensorflow::TensorResponse& msg,
-                          grpc_byte_buffer** bp, bool* own_buffer) {
+  static Status Serialize(const tensorflow::TensorResponse& msg, ByteBuffer* bp,
+                          bool* own_buffer) {
     LOG(FATAL) << "TODO(sanjay,jeff): Implement";
     return Status();
   }
-  static Status Deserialize(grpc_byte_buffer* buffer,
-                            tensorflow::TensorResponse* msg,
-                            int max_message_size = INT_MAX) {
+  static Status Deserialize(ByteBuffer* buffer,
+                            tensorflow::TensorResponse* msg) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
@@ -98,7 +88,7 @@ class SerializationTraits<tensorflow::TensorResponse>
                             "TensorResponse parse error", s.ToString()));
       }
     }
-    grpc_byte_buffer_destroy(buffer);
+    buffer->Clear();
     return result;
   }
 };
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c72aa3e649..52168a89c5 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -438,11 +438,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2.tar.gz",
-          "https://github.com/grpc/grpc/archive/bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
+          "https://github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
       ],
-      sha256 = "0a05bd355e4571b01d813dddffa38e57e689ac41b264dc9b1bd6ec66463ef5d6",
-      strip_prefix = "grpc-bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2",
+      sha256 = "b857969c667c14f37faa507afc07a3f39a47fbf73203be889d55925622e7b317",
+      strip_prefix = "grpc-09386db3939cae1ac12e5f09b735adfa8958c68e",
   )
 
 
-- 
GitLab


From 22a5485a4f0db8d45efc30492499cba79cc1a47e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 12:28:56 -0700
Subject: [PATCH 0524/1262] Employ array flat sizes more directly in
 reference_ops.

PiperOrigin-RevId: 192327464
---
 .../internal/reference/reference_ops.h        | 831 ++++++------------
 .../contrib/lite/kernels/internal/types.h     | 115 ++-
 2 files changed, 401 insertions(+), 545 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 410688411e..4bbec52bf7 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -635,27 +635,14 @@ void NonGlobalBatchNormalization(
     const Dims<4>& offset_dims, float* output_data,
     const Dims<4>& output_dims) {
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
-                        offset_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
-                        offset_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
+  const int inner_size = MatchingFlatSizeSkipDim(
+      input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims);
 
   for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, x, y, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
-              offset_data[Offset(offset_dims, c, x, y, 0)]);
-        }
-      }
+    for (int i = 0; i < inner_size; ++i) {
+      output_data[b * inner_size + i] = ActivationFunction<Ac>(
+          (input_data[b * inner_size + i] - mean_data[i]) * multiplier_data[i] +
+          offset_data[i]);
     }
   }
 }
@@ -669,87 +656,52 @@ void GlobalBatchNormalization(const float* input_data,
                               const float* offset_data,
                               const Dims<4>& offset_dims, float* output_data,
                               const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth =
       MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                         offset_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
-              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = ActivationFunction<Ac>(
+          (input_data[depth * i + c] - mean_data[c]) * multiplier_data[c] +
+          offset_data[c]);
     }
   }
 }
 
 inline void Relu(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float lower = 0;
-          float clamped = val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float lower = 0;
+    const float clamped = val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 1;
-          const float lower = -1;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 1;
+    const float lower = -1;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 6;
-          const float lower = 0;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 6;
+    const float lower = 0;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
@@ -757,24 +709,17 @@ template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   static_assert(Ac == FusedActivationFunctionType::kNone, "");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        float squared_l2_norm = 0;
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          squared_l2_norm += val * val;
-        }
-        float l2_norm = std::sqrt(squared_l2_norm);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[depth * i + c];
+      squared_l2_norm += val * val;
+    }
+    const float l2_norm = std::sqrt(squared_l2_norm);
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
     }
   }
 }
@@ -859,26 +804,11 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] +
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] + input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1141,26 +1071,11 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] *
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] * input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1384,26 +1299,11 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] /
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] / input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1411,26 +1311,11 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1812,15 +1697,9 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
   (void)gemm_context;  // only used in optimized code.
 
   // Gather dimensions information, and perform consistency checks.
-  const int batches =
-      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
-                        output_state_dims, 3, output_activ_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
-                        output_state_dims, 2, output_activ_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
-                        output_state_dims, 1, output_activ_dims, 1);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims,
+                              output_state_dims, output_activ_dims);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -1836,9 +1715,7 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                         output_state_dims, 0, output_activ_dims, 0);
   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
-  const int fc_batches = ArraySize(activ_temp_dims, 1) *
-                         ArraySize(activ_temp_dims, 2) *
-                         ArraySize(activ_temp_dims, 3);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0);
   const int fc_output_depth =
       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
   const int fc_accum_depth = ArraySize(weights_dims, 0);
@@ -1883,7 +1760,6 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   // and muls, all done in 16-bit fixed-point.
-  const int outer_size = batches * width * height;
   for (int b = 0; b < outer_size; ++b) {
     for (int c = 0; c < output_depth; ++c) {
       // Define the fixed-point data types that we will use here. All use
@@ -2418,28 +2294,20 @@ inline void LocalResponseNormalization(const float* input_data,
                                        float bias, float alpha, float beta,
                                        float* output_data,
                                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const int begin_input_c = std::max(0, c - range);
-          const int end_input_c = std::min(depth, c + range);
-          float accum = 0.f;
-          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
-            const float input_val =
-                input_data[Offset(input_dims, input_c, x, y, b)];
-            accum += input_val * input_val;
-          }
-          const float multiplier = std::pow(bias + alpha * accum, -beta);
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      const int begin_input_c = std::max(0, c - range);
+      const int end_input_c = std::min(depth, c + range);
+      float accum = 0.f;
+      for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
+        const float input_val = input_data[i * depth + input_c];
+        accum += input_val * input_val;
       }
+      const float multiplier = std::pow(bias + alpha * accum, -beta);
+      output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
     }
   }
 }
@@ -2447,37 +2315,28 @@ inline void LocalResponseNormalization(const float* input_data,
 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                     float beta, float* output_data,
                     const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                          beta);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp((input_data[i * depth + c] - max) * beta);
+    }
 
-        // Compute result.
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                       beta) /
-              sum;
-        }
-      }
+    // Compute result.
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] =
+          std::exp((input_data[i * depth + c] - max) * beta) / sum;
     }
   }
 }
@@ -2498,73 +2357,63 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int x = 0; x < width; ++x) {
-      for (int y = 0; y < height; ++y) {
-        uint8 max_in_row = 0;
-        for (int c = 0; c < depth; ++c) {
-          max_in_row =
-              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int32 fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one =
+        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
+    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+    int32 shifted_sum_minus_one = static_cast<int32>(
+        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+        (static_cast<uint32>(1) << 31));
+
+    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+        FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
 
-        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-            sum_of_exps =
-                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
-                                  exp_on_negative_values(scaled_diff_f8));
-          }
-        }
-
-        int32 fixed_sum_of_exps = sum_of_exps.raw();
-        int headroom_plus_one =
-            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-        // This is the number of bits to the left of the binary point above 1.0.
-        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
-        // no later adjustment will be needed.
-        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-        int32 shifted_sum_minus_one = static_cast<int32>(
-            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-            (static_cast<uint32>(1) << 31));
-
-        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-            FixedPoint0::FromRaw(shifted_sum_minus_one));
-
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
-            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
-            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
-                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
-            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
-                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
-
-          } else {
-            output_data[Offset(output_dims, c, x, y, b)] = 0;
-          }
-        }
+      } else {
+        output_data[i * depth + c] = 0;
       }
     }
   }
@@ -2572,55 +2421,40 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp(input_data[Offset(input_dims, c, x, y, b)] - max);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp(input_data[i * depth + c] - max);
+    }
 
-        // Compute result.
-        const float log_sum = std::log(sum);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] - max - log_sum;
-        }
-      }
+    // Compute result.
+    const float log_sum = std::log(sum);
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
     }
   }
 }
 
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = 1.f / (1.f + std::exp(-val));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = 1.f / (1.f + std::exp(-val));
+    output_data[i] = result;
   }
 }
 
@@ -2628,53 +2462,43 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
-            // Convert from Q0.31 to Q23.8.
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            // Reinterpret as U0.8.
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+      // Convert from Q0.31 to Q23.8.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as U0.8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
   }
 }
 
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -2692,20 +2516,12 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
 
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = std::tanh(val);
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = std::tanh(val);
+    output_data[i] = result;
   }
 }
 
@@ -2714,47 +2530,38 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_multiplier, int input_left_shift,
                  uint8* output_data, const Dims<4>& output_dims) {
   const int32 output_zero_point = 128;
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
-            // Convert from Q0.31 to Q24.7.
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
-            output_val_s32 += output_zero_point;
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            // Reinterpret as Q0.7, encoded in uint8.
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      // Convert from Q0.31 to Q24.7.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as Q0.7, encoded in uint8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
   }
 }
 
@@ -2766,8 +2573,7 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK_GE(input_left_shift, 0);
   TFLITE_DCHECK_LE(input_left_shift, 1);
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   // F0 uses 0 integer bits, range [-1, 1].
   // This is the return type of math functions such as tanh, logistic,
@@ -2795,20 +2601,12 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int32 val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = static_cast<float>(scale * (val - zero_point));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int32 val = input_data[i];
+    float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
   }
 }
 
@@ -2872,61 +2670,37 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
     TFLITE_DCHECK_LE(zero_point, qmax);
   }
 
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
-          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const float src_val = input_data[i];
+    const float unclamped_quantized_val =
+        TfLiteRound(zero_point + src_val / scale);
+    const float quantized_val =
+        std::min(qmax_float, std::max(qmin_float, unclamped_quantized_val));
+    const float dst_val = scale * (quantized_val - zero_point);
+    output_data[i] = dst_val;
   }
 }
 
 template <typename SrcT, typename DstT>
 inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                  DstT* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = static_cast<DstT>(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = static_cast<DstT>(input_data[offset]);
   }
 }
 
 inline void Floor(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = std::floor(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::floor(input_data[offset]);
   }
 }
 
@@ -3375,23 +3149,11 @@ template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto min_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] > min_value ? min_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
   }
 }
 
@@ -3399,23 +3161,11 @@ template <typename T>
 void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto max_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] < max_value ? max_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
   }
 }
 
@@ -3456,25 +3206,20 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   // input dimensions here. We enforce the constraint that the last dimension
   // must always be 1.
   TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = ArraySize(input_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
-        int max_index = 0;
-        for (int d = 1; d < depth; ++d) {
-          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
-          if (curr_value > max_value) {
-            max_value = curr_value;
-            max_index = d;
-          }
-        }
-        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+
+  for (int i = 0; i < outer_size; ++i) {
+    auto max_value = input_data[i * depth];
+    int max_index = 0;
+    for (int d = 1; d < depth; ++d) {
+      const auto& curr_value = input_data[i * depth + d];
+      if (curr_value > max_value) {
+        max_value = curr_value;
+        max_index = d;
       }
     }
+    output_data[i] = max_index;
   }
 }
 
@@ -3524,11 +3269,11 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
 
   // Although transpose convolution simplifies to convolution with transposed
   // weights for strides of 1, non-unitary striding complicates matters. To
-  // keep this reference implementation as clear as possible, we use a "scatter"
-  // access pattern, where we loop through all the input elements, computing
-  // their influence on the output, rather than looping through the output
-  // elements in the typical "gather" access pattern of a conv. We therefore
-  // must initialize the output array to zero.
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
   for (int i = 0; i < RequiredBufferSizeForDims(output_dims); i++) {
     output_data[i] = 0.0f;
   }
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 293538fcbb..3290c364c1 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -130,14 +130,125 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
   return MatchingArraySize(array1, index1, args...);
 }
 
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+template <int N>
+inline int FlatSize(const Dims<N>& dims) {
   int max_offset = 0;
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < N; i++) {
     max_offset += (dims.sizes[i] - 1) * dims.strides[i];
   }
   return max_offset + 1;
 }
 
+// Deprecated. Prefer FlatSize.
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  return FlatSize(dims);
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2,
+                            const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
+}
+
+// Data is required to be contiguous, and so many operators can use either the
+// full array flat size or the flat size with one dimension skipped (commonly
+// the depth).
+template <int N>
+inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
+  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
+  int flat_size = 1;
+  for (int i = 0; i < N; i++) {
+    flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
+  }
+  return flat_size;
+}
+
+// A combination of MatchingFlatSize() and FlatSizeSkipDim().
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return FlatSizeSkipDim(dims, skip_dim);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2,
+                                   const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2,
+                                 check_dims_3);
+}
+
 template <int N>
 bool IsPackedWithoutStrides(const Dims<N>& dims) {
   int expected_stride = 1;
-- 
GitLab


From fe3f9dddb39171dd7cd9fbb9e044a40e08072c50 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenlavoie@gmail.com>
Date: Tue, 10 Apr 2018 12:41:29 -0700
Subject: [PATCH 0525/1262] Make custom_graph_optimizer_registry header-only
 (#18387)

Adds it as a dependency to libtensorflow_framework.so so its symbols are
available to shared objects which want to register optimizers. No other rules
include it, so shared objects won't accidentally get their own version of the
registry.
---
 tensorflow/BUILD                          |  5 +++--
 tensorflow/core/grappler/optimizers/BUILD | 23 ++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index cfafffdd13..f2ad16fa04 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -450,11 +450,12 @@ tf_cc_shared_object(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:framework_internal_impl",
+        "//tensorflow/core:gpu_runtime_impl",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
         "//tensorflow/core:lib_internal_impl",
-        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/stream_executor:stream_executor_impl",
-        "//tensorflow/core:gpu_runtime_impl",
     ] + tf_additional_binary_deps(),
 )
 
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index e4bc030885..696cbd6d79 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -11,6 +11,10 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_protos_grappler",
 )
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
+)
 
 cc_library(
     name = "static_schedule",
@@ -532,11 +536,28 @@ tf_cuda_cc_test(
     ],
 )
 
+# This rule is header-only unless the build is static (--config=monolithic). Its
+# implementation is included directly in the framework shared object.
 cc_library(
     name = "custom_graph_optimizer_registry",
-    srcs = ["custom_graph_optimizer_registry.cc"],
     hdrs = ["custom_graph_optimizer_registry.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":custom_graph_optimizer",
+        "//tensorflow/core:lib",
+    ] + if_static(
+        [":custom_graph_optimizer_registry_impl"],
+    ),
+)
+
+# This rule contains static variables for the optimizer registry. Do not depend
+# on it directly; use :custom_graph_optimizer_registry, and link against
+# libtensorflow_framework.so for the registry symbols.
+cc_library(
+    name = "custom_graph_optimizer_registry_impl",
+    srcs = ["custom_graph_optimizer_registry.cc"],
+    hdrs = ["custom_graph_optimizer_registry.h"],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":custom_graph_optimizer",
         "//tensorflow/core:lib",
-- 
GitLab


From 4bf8270ed534c4cd37160e757d7b8a3dc765d1f0 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 10 Apr 2018 12:54:03 -0700
Subject: [PATCH 0526/1262] Checkpointable: wrap restore ops in init_scope

This should make restore() work with defun-wrapped code, when variables are
created inside the function. Just lifts the restore code into the outer
context. Adds a test for it.

PiperOrigin-RevId: 192331065
---
 .../eager/python/checkpointable_utils_test.py | 45 ++++++++++++++
 .../optimizer_v2/checkpointable_utils_test.py | 45 ++++++++++++++
 tensorflow/python/training/checkpointable.py  | 58 ++++++++++---------
 3 files changed, 120 insertions(+), 28 deletions(-)

diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 3ec5c3de39..688befa772 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -580,6 +581,50 @@ class CheckpointingTests(test.TestCase):
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
   def _get_checkpoint_name(self, name):
     root = checkpointable.Checkpointable()
     checkpointable_utils.add_variable(
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 08f9699e85..abcffeb618 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.optimizer_v2 import adam
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -372,6 +373,50 @@ class CheckpointingTests(test.TestCase):
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
   def _get_checkpoint_name(self, name):
     root = checkpointable.Checkpointable()
     checkpointable_utils.add_variable(
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index bbbe1e8ac5..9bf48df22e 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -94,12 +94,13 @@ class _CheckpointPosition(object):
 
   def restore(self, checkpointable):
     """Restore this value into `checkpointable`."""
-    if self.bind_object(checkpointable):
-      # This object's correspondence with a checkpointed object is new, so
-      # process deferred restorations for it and its dependencies.
-      restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
-      if restore_ops:
-        self._checkpoint.restore_ops.extend(restore_ops)
+    with ops.init_scope():
+      if self.bind_object(checkpointable):
+        # This object's correspondence with a checkpointed object is new, so
+        # process deferred restorations for it and its dependencies.
+        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        if restore_ops:
+          self._checkpoint.restore_ops.extend(restore_ops)
 
   def bind_object(self, checkpointable):
     """Set a checkpoint<->object correspondence and process slot variables.
@@ -409,28 +410,29 @@ class CheckpointableBase(object):
            "Checkpointable._add_variable called to create another with "
            "that name. Variable names must be unique within a Checkpointable "
            "object.") % (name,))
-    if context.executing_eagerly():
-      # If this is a variable with a single Tensor stored in the checkpoint, we
-      # can set that value as an initializer rather than initializing and then
-      # assigning (when executing eagerly). This call returns None if there is
-      # nothing to restore.
-      checkpoint_initializer = self._preload_simple_restoration(
-          name=name, shape=shape)
-    else:
-      checkpoint_initializer = None
-    if (checkpoint_initializer is not None
-        and not (
-            isinstance(initializer, CheckpointInitialValue)
-            and initializer.restore_uid > checkpoint_initializer.restore_uid)):
-      # If multiple Checkpointable objects are "creating" the same variable via
-      # the magic of custom getters, the one with the highest restore UID (the
-      # one called last) has to make the final initializer. If another custom
-      # getter interrupts this process by overwriting the initializer, then
-      # we'll catch that when we call _track_checkpointable. So this is "best
-      # effort" to set the initializer with the highest restore UID.
-      initializer = checkpoint_initializer
-      shape = None
-
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # If this is a variable with a single Tensor stored in the checkpoint,
+        # we can set that value as an initializer rather than initializing and
+        # then assigning (when executing eagerly). This call returns None if
+        # there is nothing to restore.
+        checkpoint_initializer = self._preload_simple_restoration(
+            name=name, shape=shape)
+      else:
+        checkpoint_initializer = None
+      if (checkpoint_initializer is not None
+          and not (
+              isinstance(initializer, CheckpointInitialValue)
+              and (initializer.restore_uid
+                   > checkpoint_initializer.restore_uid))):
+        # If multiple Checkpointable objects are "creating" the same variable
+        # via the magic of custom getters, the one with the highest restore UID
+        # (the one called last) has to make the final initializer. If another
+        # custom getter interrupts this process by overwriting the initializer,
+        # then we'll catch that when we call _track_checkpointable. So this is
+        # "best effort" to set the initializer with the highest restore UID.
+        initializer = checkpoint_initializer
+        shape = None
     new_variable = getter(
         name=name, shape=shape, dtype=dtype, initializer=initializer,
         **kwargs_for_getter)
-- 
GitLab


From 6b593d329005ffb1a10b1c9cd1374d2cdb620b21 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Tue, 10 Apr 2018 13:32:38 -0700
Subject: [PATCH 0527/1262] Update declaration order in staging ops Buffer
 class according to C++ style guide

PiperOrigin-RevId: 192336966
---
 tensorflow/core/kernels/stage_op.cc | 83 +++++++++++++----------------
 1 file changed, 38 insertions(+), 45 deletions(-)

diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 03fc4467a1..73a02a34cf 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -32,53 +32,8 @@ namespace {
 
 class Buffer : public ResourceBase {
  public:
-  // public types
   using Tuple = std::vector<Tensor>;
 
- private:
-  // private variables
-  std::size_t capacity_;
-  std::size_t memory_limit_;
-  std::size_t current_bytes_;
-  std::mutex mu_;
-  std::condition_variable non_empty_cond_var_;
-  std::condition_variable full_cond_var_;
-  std::deque<Tuple> buf_;
-
- private:
-  // private methods
-
-  // If the buffer is configured for bounded capacity, notify
-  // waiting inserters that space is now available
-  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
-    if (IsBounded()) {
-      lock->unlock();
-      // Notify all inserters. The removal of an element
-      // may make memory available for many inserters
-      // to insert new elements
-      full_cond_var_.notify_all();
-    }
-  }
-
-  // Are there a limit number of elements or a memory limit
-  // configued on this buffer?
-  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
-
-  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
-
-  bool WouldExceedMemoryLimit(std::size_t bytes) const {
-    return bytes + current_bytes_ > memory_limit_;
-  }
-
-  std::size_t GetTupleBytes(const Tuple& tuple) {
-    return std::accumulate(tuple.begin(), tuple.end(), 0,
-                           [](const std::size_t& lhs, const Tensor& rhs) {
-                             return lhs + rhs.TotalBytes();
-                           });
-  }
-
- public:
-  // public methods
   explicit Buffer(std::size_t capacity, std::size_t memory_limit)
       : capacity_(capacity), memory_limit_(memory_limit), current_bytes_(0) {}
 
@@ -181,6 +136,44 @@ class Buffer : public ResourceBase {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
+
+ private:
+  // If the buffer is configured for bounded capacity, notify
+  // waiting inserters that space is now available
+  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
+    if (IsBounded()) {
+      lock->unlock();
+      // Notify all inserters. The removal of an element
+      // may make memory available for many inserters
+      // to insert new elements
+      full_cond_var_.notify_all();
+    }
+  }
+
+  // Are there a limit number of elements or a memory limit
+  // configued on this buffer?
+  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
+
+  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
+
+  bool WouldExceedMemoryLimit(std::size_t bytes) const {
+    return bytes + current_bytes_ > memory_limit_;
+  }
+
+  std::size_t GetTupleBytes(const Tuple& tuple) {
+    return std::accumulate(tuple.begin(), tuple.end(), 0,
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
+  }
+
+  std::size_t capacity_;
+  std::size_t memory_limit_;
+  std::size_t current_bytes_;
+  std::mutex mu_;
+  std::condition_variable non_empty_cond_var_;
+  std::condition_variable full_cond_var_;
+  std::deque<Tuple> buf_;
 };
 
 Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
-- 
GitLab


From 6f6f913bc2e9866d70e0615fcae22371d32eee86 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 10 Apr 2018 11:19:26 -0700
Subject: [PATCH 0528/1262] Adding the python symlink command for devel
 packages too.

---
 tensorflow/tools/docker/Dockerfile.devel     | 2 ++
 tensorflow/tools/docker/Dockerfile.devel-gpu | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 0563bd4d6c..f2415930d5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 9f0cf63e7e..1d19821968 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
-- 
GitLab


From 693b339ab2f062ec5bbb29f976c5d1fd94fbffa5 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 10 Apr 2018 13:49:37 -0700
Subject: [PATCH 0529/1262] Refactor layers: - tf.layers layers now subclasses
 tf.keras.layers layers. - tf.keras.layers is now agnostic to variable scopes
 and global collections (future-proof). It also uses ResourceVariable
 everywhere by default. - As a result tf.keras.layers is in general
 lower-complexity, with fewer hacks and workarounds. However some of current
 code is temporary (variable creation should be moved to Checkpointable,
 arguably, and there are some dependency issues that will require later
 refactors). - The legacy tf.layers layers behavior is kept, with references
 to variable scopes and global collections injected in the subclassed
 tf.layers.base.Layer class (the content of tf.layers.base.Layer is the
 complexity differential between the old implementation and the new one).

Note: this refactor does slightly change the behavior of tf.layers.base.Layer, by disabling extreme edge-case behavior that either has long been invalid, or is dangerous and should most definitely be disabled. This will not affect any users since such behaviors only existed in the base Layer unit tests. The behaviors disabled are:
- Option to create reusable variables in `call` (already invalid for some time).
- Option to use a variable scope to create layer variables outside of the layer while not having the layer track such variables locally.
PiperOrigin-RevId: 192339798
---
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |   18 +-
 .../eager/python/checkpointable_utils_test.py |    2 +-
 tensorflow/contrib/eager/python/network.py    |    5 +-
 .../contrib/eager/python/network_test.py      |   32 -
 .../optimizer_v2/checkpointable_utils_test.py |    2 +-
 tensorflow/python/BUILD                       |    9 +-
 tensorflow/python/__init__.py                 |   10 +
 .../python/feature_column/feature_column.py   |   35 +-
 tensorflow/python/keras/BUILD                 |  138 +-
 .../python/keras/_impl/keras/activations.py   |    8 -
 .../python/keras/_impl/keras/backend.py       |   24 +-
 .../keras/_impl/keras/engine/base_layer.py    | 1736 +++++++++++++++--
 .../keras/_impl/keras/engine/input_layer.py   |    5 +-
 .../keras/_impl/keras/engine/network.py       |   46 +-
 .../keras/_impl/keras/engine/saving_test.py   |    2 +-
 .../keras/_impl/keras/engine/sequential.py    |    2 +-
 .../_impl/keras/engine/sequential_test.py     |    1 +
 .../keras/_impl/keras/engine/topology_test.py |   61 +-
 .../keras/_impl/keras/engine/training.py      |   19 +-
 .../python/keras/_impl/keras/initializers.py  |    2 +
 .../keras/_impl/keras/integration_test.py     |  139 +-
 .../keras/_impl/keras/layers/convolutional.py |  937 +++++++--
 .../python/keras/_impl/keras/layers/core.py   |  166 +-
 .../keras/_impl/keras/layers/core_test.py     |    1 -
 .../keras/_impl/keras/layers/embeddings.py    |    6 +-
 .../keras/_impl/keras/layers/normalization.py |  653 ++++++-
 .../_impl/keras/layers/normalization_test.py  |   20 +
 .../keras/_impl/keras/layers/pooling.py       |  411 +++-
 .../_impl/keras/layers/recurrent_test.py      |    4 +-
 .../keras/_impl/keras/layers/wrappers.py      |    4 +-
 .../_impl/keras/model_subclassing_test.py     |   35 +-
 .../keras/_impl/keras/utils/conv_utils.py     |  143 +-
 .../keras/_impl/keras/utils/tf_utils.py       |   74 +
 tensorflow/python/layers/base.py              | 1443 ++------------
 tensorflow/python/layers/base_test.py         |   94 +-
 tensorflow/python/layers/convolutional.py     |  702 +------
 tensorflow/python/layers/core.py              |  142 +-
 tensorflow/python/layers/normalization.py     |  516 +----
 tensorflow/python/layers/pooling.py           |  258 +--
 tensorflow/python/layers/utils_test.py        |   29 -
 tensorflow/python/ops/nn.py                   |    2 -
 .../api/golden/tensorflow.keras.-model.pbtxt  |   11 +-
 .../golden/tensorflow.keras.-sequential.pbtxt |   11 +-
 .../tensorflow.keras.layers.-activation.pbtxt |   13 +-
 ...eras.layers.-activity-regularization.pbtxt |   13 +-
 .../golden/tensorflow.keras.layers.-add.pbtxt |   13 +-
 ...nsorflow.keras.layers.-alpha-dropout.pbtxt |   13 +-
 ...low.keras.layers.-average-pooling1-d.pbtxt |   18 +-
 ...low.keras.layers.-average-pooling2-d.pbtxt |   16 +-
 ...low.keras.layers.-average-pooling3-d.pbtxt |   16 +-
 .../tensorflow.keras.layers.-average.pbtxt    |   13 +-
 ...tensorflow.keras.layers.-avg-pool1-d.pbtxt |   18 +-
 ...tensorflow.keras.layers.-avg-pool2-d.pbtxt |   16 +-
 ...tensorflow.keras.layers.-avg-pool3-d.pbtxt |   16 +-
 ...ow.keras.layers.-batch-normalization.pbtxt |   16 +-
 ...nsorflow.keras.layers.-bidirectional.pbtxt |   13 +-
 ...tensorflow.keras.layers.-concatenate.pbtxt |   13 +-
 ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt |   13 +-
 .../tensorflow.keras.layers.-conv1-d.pbtxt    |   18 +-
 ...flow.keras.layers.-conv2-d-transpose.pbtxt |   18 +-
 .../tensorflow.keras.layers.-conv2-d.pbtxt    |   16 +-
 ...flow.keras.layers.-conv3-d-transpose.pbtxt |   18 +-
 .../tensorflow.keras.layers.-conv3-d.pbtxt    |   16 +-
 ...sorflow.keras.layers.-convolution1-d.pbtxt |   18 +-
 ...ras.layers.-convolution2-d-transpose.pbtxt |   18 +-
 ...sorflow.keras.layers.-convolution2-d.pbtxt |   16 +-
 ...ras.layers.-convolution3-d-transpose.pbtxt |   18 +-
 ...sorflow.keras.layers.-convolution3-d.pbtxt |   16 +-
 ...tensorflow.keras.layers.-cropping1-d.pbtxt |   13 +-
 ...tensorflow.keras.layers.-cropping2-d.pbtxt |   13 +-
 ...tensorflow.keras.layers.-cropping3-d.pbtxt |   13 +-
 .../tensorflow.keras.layers.-dense.pbtxt      |   14 +-
 ...flow.keras.layers.-depthwise-conv2-d.pbtxt |   16 +-
 .../golden/tensorflow.keras.layers.-dot.pbtxt |   13 +-
 .../tensorflow.keras.layers.-dropout.pbtxt    |   14 +-
 .../tensorflow.keras.layers.-e-l-u.pbtxt      |   13 +-
 .../tensorflow.keras.layers.-embedding.pbtxt  |   13 +-
 .../tensorflow.keras.layers.-flatten.pbtxt    |   14 +-
 .../tensorflow.keras.layers.-g-r-u-cell.pbtxt |   13 +-
 .../tensorflow.keras.layers.-g-r-u.pbtxt      |   13 +-
 ...rflow.keras.layers.-gaussian-dropout.pbtxt |   13 +-
 ...sorflow.keras.layers.-gaussian-noise.pbtxt |   13 +-
 ...as.layers.-global-average-pooling1-d.pbtxt |   15 +-
 ...as.layers.-global-average-pooling2-d.pbtxt |   15 +-
 ...as.layers.-global-average-pooling3-d.pbtxt |   15 +-
 ...low.keras.layers.-global-avg-pool1-d.pbtxt |   15 +-
 ...low.keras.layers.-global-avg-pool2-d.pbtxt |   15 +-
 ...low.keras.layers.-global-avg-pool3-d.pbtxt |   15 +-
 ...low.keras.layers.-global-max-pool1-d.pbtxt |   15 +-
 ...low.keras.layers.-global-max-pool2-d.pbtxt |   15 +-
 ...low.keras.layers.-global-max-pool3-d.pbtxt |   15 +-
 ....keras.layers.-global-max-pooling1-d.pbtxt |   15 +-
 ....keras.layers.-global-max-pooling2-d.pbtxt |   15 +-
 ....keras.layers.-global-max-pooling3-d.pbtxt |   15 +-
 ...tensorflow.keras.layers.-input-layer.pbtxt |   13 +-
 .../tensorflow.keras.layers.-input-spec.pbtxt |    2 +-
 ...ensorflow.keras.layers.-l-s-t-m-cell.pbtxt |   13 +-
 .../tensorflow.keras.layers.-l-s-t-m.pbtxt    |   13 +-
 .../tensorflow.keras.layers.-lambda.pbtxt     |   13 +-
 .../tensorflow.keras.layers.-layer.pbtxt      |   15 +-
 ...ensorflow.keras.layers.-leaky-re-l-u.pbtxt |   13 +-
 ...w.keras.layers.-locally-connected1-d.pbtxt |   13 +-
 ...w.keras.layers.-locally-connected2-d.pbtxt |   13 +-
 .../tensorflow.keras.layers.-masking.pbtxt    |   13 +-
 ...tensorflow.keras.layers.-max-pool1-d.pbtxt |   18 +-
 ...tensorflow.keras.layers.-max-pool2-d.pbtxt |   16 +-
 ...tensorflow.keras.layers.-max-pool3-d.pbtxt |   16 +-
 ...sorflow.keras.layers.-max-pooling1-d.pbtxt |   18 +-
 ...sorflow.keras.layers.-max-pooling2-d.pbtxt |   16 +-
 ...sorflow.keras.layers.-max-pooling3-d.pbtxt |   16 +-
 .../tensorflow.keras.layers.-maximum.pbtxt    |   13 +-
 .../tensorflow.keras.layers.-multiply.pbtxt   |   13 +-
 .../tensorflow.keras.layers.-p-re-l-u.pbtxt   |   13 +-
 .../tensorflow.keras.layers.-permute.pbtxt    |   13 +-
 .../tensorflow.keras.layers.-r-n-n.pbtxt      |   13 +-
 ...nsorflow.keras.layers.-repeat-vector.pbtxt |   13 +-
 .../tensorflow.keras.layers.-reshape.pbtxt    |   13 +-
 ...flow.keras.layers.-separable-conv1-d.pbtxt |   18 +-
 ...flow.keras.layers.-separable-conv2-d.pbtxt |   18 +-
 ...ras.layers.-separable-convolution1-d.pbtxt |   18 +-
 ...ras.layers.-separable-convolution2-d.pbtxt |   18 +-
 ...flow.keras.layers.-simple-r-n-n-cell.pbtxt |   13 +-
 ...ensorflow.keras.layers.-simple-r-n-n.pbtxt |   13 +-
 .../tensorflow.keras.layers.-softmax.pbtxt    |   13 +-
 ...low.keras.layers.-spatial-dropout1-d.pbtxt |   14 +-
 ...low.keras.layers.-spatial-dropout2-d.pbtxt |   14 +-
 ...low.keras.layers.-spatial-dropout3-d.pbtxt |   14 +-
 ...ow.keras.layers.-stacked-r-n-n-cells.pbtxt |   13 +-
 ...low.keras.layers.-thresholded-re-l-u.pbtxt |   13 +-
 ...rflow.keras.layers.-time-distributed.pbtxt |   13 +-
 ...sorflow.keras.layers.-up-sampling1-d.pbtxt |   13 +-
 ...sorflow.keras.layers.-up-sampling2-d.pbtxt |   13 +-
 ...sorflow.keras.layers.-up-sampling3-d.pbtxt |   13 +-
 .../tensorflow.keras.layers.-wrapper.pbtxt    |   13 +-
 ...orflow.keras.layers.-zero-padding1-d.pbtxt |   13 +-
 ...orflow.keras.layers.-zero-padding2-d.pbtxt |   13 +-
 ...orflow.keras.layers.-zero-padding3-d.pbtxt |   13 +-
 .../tensorflow.keras.models.-model.pbtxt      |   11 +-
 .../tensorflow.keras.models.-sequential.pbtxt |   11 +-
 ...ensorflow.layers.-average-pooling1-d.pbtxt |   46 +-
 ...ensorflow.layers.-average-pooling2-d.pbtxt |   46 +-
 ...ensorflow.layers.-average-pooling3-d.pbtxt |   46 +-
 ...nsorflow.layers.-batch-normalization.pbtxt |   44 +-
 .../golden/tensorflow.layers.-conv1-d.pbtxt   |   46 +-
 ...tensorflow.layers.-conv2-d-transpose.pbtxt |   48 +-
 .../golden/tensorflow.layers.-conv2-d.pbtxt   |   46 +-
 ...tensorflow.layers.-conv3-d-transpose.pbtxt |   48 +-
 .../golden/tensorflow.layers.-conv3-d.pbtxt   |   46 +-
 .../api/golden/tensorflow.layers.-dense.pbtxt |   44 +-
 .../golden/tensorflow.layers.-dropout.pbtxt   |   44 +-
 .../golden/tensorflow.layers.-flatten.pbtxt   |   44 +-
 .../tensorflow.layers.-input-spec.pbtxt       |    2 +-
 .../api/golden/tensorflow.layers.-layer.pbtxt |   45 +-
 .../tensorflow.layers.-max-pooling1-d.pbtxt   |   46 +-
 .../tensorflow.layers.-max-pooling2-d.pbtxt   |   46 +-
 .../tensorflow.layers.-max-pooling3-d.pbtxt   |   46 +-
 ...tensorflow.layers.-separable-conv1-d.pbtxt |   48 +-
 ...tensorflow.layers.-separable-conv2-d.pbtxt |   48 +-
 ...flow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt |   43 +-
 ...orflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt |   43 +-
 ...nsorflow.nn.rnn_cell.-device-wrapper.pbtxt |   43 +-
 ...sorflow.nn.rnn_cell.-dropout-wrapper.pbtxt |   43 +-
 .../tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt  |   43 +-
 ...tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt |   43 +-
 ...orflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt |   43 +-
 .../tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt  |   45 +-
 ...orflow.nn.rnn_cell.-residual-wrapper.pbtxt |   43 +-
 167 files changed, 5493 insertions(+), 5060 deletions(-)
 create mode 100644 tensorflow/python/keras/_impl/keras/utils/tf_utils.py

diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 588a5e705d..1dd490b386 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -23,7 +23,7 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import init_ops
@@ -520,10 +520,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_LSTM
   _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleLSTMCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
 
   def _cudnn_to_tf_gate_params(self, *cu_gate_order):
     i_g, f_g, c_g, o_g = cu_gate_order
@@ -644,10 +641,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_GRU
   _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleGRUCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -726,11 +720,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
 class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
   """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(
-      rnn_cell_impl.BasicRNNCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 688befa772..36670aa210 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl.keras.engine import sequential
 from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.layers import core
+from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index e55a9276ab..2f8721324f 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -25,6 +25,7 @@ import weakref
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl.keras.engine import base_layer as keras_base_layer
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpoint_utils
@@ -176,7 +177,7 @@ class Network(base.Layer):
         avoid_names = parent_network._owned_layers
         name_uid_map = parent_network._sub_layer_name_uids
       else:
-        name_uid_map = base._get_default_graph_uid_map()
+        name_uid_map = keras_base_layer.get_default_graph_uid_map()
         # Figure out which names we have to avoid based on which variable scope
         # we're nested in.
         strip_name = self._default_parent_variable_scope.name
@@ -326,6 +327,8 @@ class Network(base.Layer):
       raise TypeError(
           "Network.track_layer() passed type %s, not a tf.layers.Layer" %
           (type(layer),))
+    # Always use `ResourceVariable` with legacy layers.
+    layer._use_resource_variables = True
     if isinstance(layer, Network):
       layer._finalize_name(parent_network=self)
     else:
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 3329fc6c51..f43376d5d7 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -20,12 +20,10 @@ import gc
 
 from tensorflow.contrib.eager.python import network
 from tensorflow.contrib.layers.python.layers import regularizers
-from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import math_ops
@@ -469,36 +467,6 @@ class NetworkTest(test.TestCase):
     self.assertIsInstance(net.trainable_weights[0],
                           resource_variable_ops.ResourceVariable)
 
-  def testGraphOpNames(self):
-    """Network operation names should match variable naming."""
-
-    def _check_op_prefixes(expected_prefix, checked_ops):
-      for operation in ops.get_default_graph().get_operations():
-        if operation.name == "ignore":
-          continue
-        if operation.name in checked_ops:
-          continue
-        checked_ops.add(operation.name)
-        self.assertStartsWith(expected_start=expected_prefix,
-                              actual=operation.name)
-        self.assertNotIn("my_network", operation.name[len(expected_prefix):])
-        self.assertNotIn("dense", operation.name[len(expected_prefix):])
-
-    with context.graph_mode():
-      net = MyNetwork()
-      zero = constant_op.constant([[0.]], name="ignore")
-      net(zero)
-      checked_ops = set()
-      _check_op_prefixes(expected_prefix="my_network/dense/",
-                         checked_ops=checked_ops)
-      net.net2 = net.track_layer(MyNetwork())
-      net.net2(zero)
-      _check_op_prefixes(expected_prefix="my_network/my_network/dense/",
-                         checked_ops=checked_ops)
-      MyNetwork()(zero)
-      _check_op_prefixes(expected_prefix="my_network_1/dense/",
-                         checked_ops=checked_ops)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableRegularizers(self):
     net = RegularizedNetwork()
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index abcffeb618..54bc23cdef 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -36,7 +36,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.layers import core
+from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a22b9f40b1..7b548d2c70 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2273,7 +2273,6 @@ py_library(
         ":clip_ops",
         ":framework_for_generated_wrappers",
         ":init_ops",
-        ":layers_base",
         ":math_ops",
         ":nn_ops",
         ":partitioned_variables",
@@ -2949,11 +2948,13 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
+        # `layers` dependency only exists due to the use of a small utility.
+        "//tensorflow/python/keras:layers",
     ],
 )
 
@@ -4310,6 +4311,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:engine",
         "//third_party/py/numpy",
     ],
 )
@@ -4346,6 +4348,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:layers",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index ab1d01a835..da836aca6f 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -149,6 +149,16 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.eager.context import executing_eagerly
 from tensorflow.python.framework.ops import enable_eager_execution
 
+# Necessary for the symbols in this module to be taken into account by
+# the namespace management system (API decorators).
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+
+# Required due to `rnn` and `rnn_cell` not being imported in `nn` directly
+# (due to a circular dependency issue: rnn depends on layers).
+nn.dynamic_rnn = rnn.dynamic_rnn
+nn.rnn_cell = rnn_cell
+
 # Symbols whitelisted for export without documentation.
 # TODO(cwhipkey): review these and move to contrib, expose through
 # documentation, or remove.
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 3a315e5c2e..7a104fa4ac 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -581,24 +581,25 @@ class _LinearModel(training.Model):
         **kwargs)
 
   def call(self, features):
-    for column in self._feature_columns:
-      if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
-        raise ValueError(
-            'Items of feature_columns must be either a '
-            '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
-    weighted_sums = []
-    ordered_columns = []
-    builder = _LazyBuilder(features)
-    for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
-      ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
-      weighted_sum = layer(builder)
-      weighted_sums.append(weighted_sum)
+    with variable_scope.variable_scope(self.name):
+      for column in self._feature_columns:
+        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+          raise ValueError(
+              'Items of feature_columns must be either a '
+              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+      weighted_sums = []
+      ordered_columns = []
+      builder = _LazyBuilder(features)
+      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
+        ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+        weighted_sum = layer(builder)
+        weighted_sums.append(weighted_sum)
 
-    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
-    predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
-    predictions = nn_ops.bias_add(
-        predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
     return predictions
 
   def _add_layers(self, layers):
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index f6e1d0eec3..da5bc3e6f1 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -20,7 +20,6 @@ py_library(
     srcs = [
         "__init__.py",
         "_impl/keras/__init__.py",
-        "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
         "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
@@ -32,9 +31,6 @@ py_library(
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
         "_impl/keras/applications/xception.py",
-        "_impl/keras/backend.py",
-        "_impl/keras/callbacks.py",
-        "_impl/keras/constraints.py",
         "_impl/keras/datasets/__init__.py",
         "_impl/keras/datasets/boston_housing.py",
         "_impl/keras/datasets/cifar.py",
@@ -44,49 +40,13 @@ py_library(
         "_impl/keras/datasets/imdb.py",
         "_impl/keras/datasets/mnist.py",
         "_impl/keras/datasets/reuters.py",
-        "_impl/keras/engine/__init__.py",
-        "_impl/keras/engine/base_layer.py",
-        "_impl/keras/engine/input_layer.py",
-        "_impl/keras/engine/network.py",
-        "_impl/keras/engine/saving.py",
-        "_impl/keras/engine/sequential.py",
-        "_impl/keras/engine/training.py",
-        "_impl/keras/engine/training_arrays.py",
-        "_impl/keras/engine/training_eager.py",
-        "_impl/keras/engine/training_generator.py",
-        "_impl/keras/engine/training_utils.py",
         "_impl/keras/estimator.py",
-        "_impl/keras/initializers.py",
-        "_impl/keras/layers/__init__.py",
-        "_impl/keras/layers/advanced_activations.py",
-        "_impl/keras/layers/convolutional.py",
-        "_impl/keras/layers/convolutional_recurrent.py",
-        "_impl/keras/layers/core.py",
-        "_impl/keras/layers/embeddings.py",
-        "_impl/keras/layers/local.py",
-        "_impl/keras/layers/merge.py",
-        "_impl/keras/layers/noise.py",
-        "_impl/keras/layers/normalization.py",
-        "_impl/keras/layers/pooling.py",
-        "_impl/keras/layers/recurrent.py",
-        "_impl/keras/layers/serialization.py",
-        "_impl/keras/layers/wrappers.py",
-        "_impl/keras/losses.py",
-        "_impl/keras/metrics.py",
-        "_impl/keras/models.py",
-        "_impl/keras/optimizers.py",
         "_impl/keras/preprocessing/__init__.py",
         "_impl/keras/preprocessing/image.py",
         "_impl/keras/preprocessing/sequence.py",
         "_impl/keras/preprocessing/text.py",
-        "_impl/keras/regularizers.py",
         "_impl/keras/testing_utils.py",
         "_impl/keras/utils/__init__.py",
-        "_impl/keras/utils/conv_utils.py",
-        "_impl/keras/utils/data_utils.py",
-        "_impl/keras/utils/generic_utils.py",
-        "_impl/keras/utils/io_utils.py",
-        "_impl/keras/utils/layer_utils.py",
         "_impl/keras/utils/multi_gpu_utils.py",
         "_impl/keras/utils/np_utils.py",
         "_impl/keras/utils/vis_utils.py",
@@ -136,7 +96,21 @@ py_library(
         ":empty_condition": [],
         "//conditions:default": [],
     }) + [
-        "@six_archive//:six",
+        ":backend",
+        ":engine",
+        ":layers",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "backend",
+    srcs = ["_impl/keras/backend.py"],
+    srcs_version = "PY2AND3",
+    deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -152,8 +126,6 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
@@ -168,13 +140,83 @@ py_library(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_library(
+    name = "engine",
+    srcs = [
+        "_impl/keras/activations.py",
+        "_impl/keras/callbacks.py",
+        "_impl/keras/constraints.py",
+        "_impl/keras/engine/__init__.py",
+        "_impl/keras/engine/base_layer.py",
+        "_impl/keras/engine/input_layer.py",
+        "_impl/keras/engine/network.py",
+        "_impl/keras/engine/saving.py",
+        "_impl/keras/engine/sequential.py",
+        "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_arrays.py",
+        "_impl/keras/engine/training_eager.py",
+        "_impl/keras/engine/training_generator.py",
+        "_impl/keras/engine/training_utils.py",
+        "_impl/keras/initializers.py",
+        "_impl/keras/losses.py",
+        "_impl/keras/metrics.py",
+        "_impl/keras/models.py",
+        "_impl/keras/optimizers.py",
+        "_impl/keras/regularizers.py",
+        "_impl/keras/utils/data_utils.py",
+        "_impl/keras/utils/io_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "layers",
+    srcs = [
+        "_impl/keras/layers/__init__.py",
+        "_impl/keras/layers/advanced_activations.py",
+        "_impl/keras/layers/convolutional.py",
+        "_impl/keras/layers/convolutional_recurrent.py",
+        "_impl/keras/layers/core.py",
+        "_impl/keras/layers/embeddings.py",
+        "_impl/keras/layers/local.py",
+        "_impl/keras/layers/merge.py",
+        "_impl/keras/layers/noise.py",
+        "_impl/keras/layers/normalization.py",
+        "_impl/keras/layers/pooling.py",
+        "_impl/keras/layers/recurrent.py",
+        "_impl/keras/layers/serialization.py",
+        "_impl/keras/layers/wrappers.py",
+        "_impl/keras/utils/conv_utils.py",
+        "_impl/keras/utils/generic_utils.py",
+        "_impl/keras/utils/layer_utils.py",
+        "_impl/keras/utils/tf_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":engine",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -605,7 +647,7 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index b518898ad8..8def7ec493 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -22,10 +22,8 @@ import six
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.layers.base import Layer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -136,12 +134,6 @@ def get(identifier):
     identifier = str(identifier)
     return deserialize(identifier)
   elif callable(identifier):
-    if isinstance(identifier, Layer):
-      logging.warning(
-          'Do not pass a layer instance (such as {identifier}) as the '
-          'activation argument of another layer. Instead, advanced '
-          'activation layers should be used just like any other '
-          'layer in a model.'.format(identifier=identifier.__class__.__name__))
     return identifier
   else:
     raise ValueError('Could not interpret '
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 3aac6a9065..096db8db32 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import collections
 import json
 import os
+import weakref
 
 import numpy as np
 
@@ -35,7 +36,6 @@ from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -55,7 +55,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
-from tensorflow.python.training import moving_averages
+
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -263,6 +263,12 @@ def set_image_data_format(data_format):
   _IMAGE_DATA_FORMAT = str(data_format)
 
 
+# A global dictionary mapping graph objects to an index of counters used
+# for various layer names in each graph.
+# Allows to give unique autogenerated names to layers, in a graph-specific way.
+PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
+
+
 @tf_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
@@ -283,17 +289,16 @@ def get_uid(prefix=''):
   ```
   """
   graph = ops.get_default_graph()
-  if graph not in tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS:
-    tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(
-        int)
-  layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
+  if graph not in PER_GRAPH_LAYER_NAME_UIDS:
+    PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(int)
+  layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
   layer_name_uids[prefix] += 1
   return layer_name_uids[prefix]
 
 
 @tf_export('keras.backend.reset_uids')
 def reset_uids():
-  per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
+  per_graph_layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
   for key in keys:
     del per_graph_layer_name_uids[key]
@@ -1276,6 +1281,11 @@ def moving_average_update(x, value, momentum):
   Returns:
       An Operation to update the variable.
   """
+  # `training` is higher-up than the Keras backend in the abstraction hierarchy.
+  # In particular, `training` depends on layers, and thus on Keras.
+  # moving_averages, being low-level ops, should not be part of the training
+  # module.
+  from tensorflow.python.training import moving_averages  # pylint: disable=g-import-not-at-top
   return moving_averages.assign_moving_average(
       x, value, momentum, zero_debias=True)
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index 755607aafb..3b3af7d092 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -13,143 +13,145 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Base layer code (`Layer`).
-"""
+"""Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+import re
 
+import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
+from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-# pylint: disable=invalid-name
-InputSpec = tf_base_layers.InputSpec
-Node = tf_base_layers.Node
-TFBaseLayer = tf_base_layers.Layer
-# pylint: enable=invalid-name
+@tf_export('keras.layers.Layer')
+class Layer(checkpointable.CheckpointableBase):
+  """Base layer class.
 
+  This is the class from which all layers inherit.
 
-@tf_export('keras.layers.Layer')
-class Layer(tf_base_layers.Layer):
-  """Abstract base layer class.
-
-  # Properties
-      name: String, must be unique within a model.
-      input_spec: List of InputSpec class instances
-          each entry describes one required input:
-              - ndim
-              - dtype
-          A layer with `n` input tensors must have
-          an `input_spec` of length `n`.
-      trainable: Boolean, whether the layer weights
-          will be updated during training.
-      uses_learning_phase: Whether any operation
-          of the layer uses `K.in_training_phase()`
-          or `K.in_test_phase()`.
-      input_shape: Shape tuple. Provided for convenience,
-          but note that there may be cases in which this
-          attribute is ill-defined (e.g. a shared layer
-          with multiple input shapes), in which case
-          requesting `input_shape` will raise an Exception.
-          Prefer using `layer.get_input_shape_for(input_shape)`,
-          or `layer.get_input_shape_at(node_index)`.
-      output_shape: Shape tuple. See above.
-      inbound_nodes: List of nodes.
-      outbound_nodes: List of nodes.
-      input, output: Input/output tensor(s). Note that if the layer is used
-          more than once (shared layer), this is ill-defined
-          and will raise an exception. In such cases, use
-          `layer.get_input_at(node_index)`.
-      input_mask, output_mask: Same as above, for masks.
-      trainable_weights: List of variables.
-      non_trainable_weights: List of variables.
-      weights: The concatenation of the lists trainable_weights and
-          non_trainable_weights (in this order).
-
-  # Methods
-      call(x, mask=None): Where the layer's logic lives.
-      __call__(x, mask=None): Wrapper around the layer logic (`call`).
-          If x is a Keras tensor:
-              - Connect current layer with last layer from tensor:
-                  `self._add_inbound_node(last_layer)`
-              - Add layer to tensor history
-          If layer is not built:
-              - Build from inputs shape
-      get_weights()
-      set_weights(weights)
-      get_config()
-      count_params()
-      compute_output_shape(input_shape)
-      compute_mask(x, mask)
-      get_input_at(node_index)
-      get_output_at(node_index)
-      get_input_shape_at(node_index)
-      get_output_shape_at(node_index)
-      get_input_mask_at(node_index)
-      get_output_mask_at(node_index)
-
-  # Class Methods
-      from_config(config)
-
-  # Internal methods:
-      build(input_shape)
-      _add_inbound_node(layer, index=0)
+  A layer is a class implementing common neural networks operations, such
+  as convolution, batch norm, etc. These operations require managing weights,
+  losses, updates, and inter-layer connectivity.
+
+  Users will just instantiate a layer and then treat it as a callable.
+
+  We recommend that descendants of `Layer` implement the following methods:
+  * `__init__()`: Save configuration in member variables
+  * `build()`: Called once from `__call__`, when we know the shapes of inputs
+    and `dtype`. Should have the calls to `add_weight()`, and then
+    call the super's `build()` (which sets `self.built = True`, which is
+    nice in case the user wants to call `build()` manually before the
+    first `__call__`).
+  * `call()`: Called in `__call__` after making sure `build()` has been called
+    once. Should actually perform the logic of applying the layer to the
+    input tensors (which should be passed in as the first argument).
+
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+
+  Read-only properties:
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
+      non-trainable.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
+
+  Mutable properties:
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
+      constraints on inputs that can be accepted by the layer.
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
     # to non-input layers.
     allowed_kwargs = {
-        'activity_regularizer',
         'input_shape',
         'batch_input_shape',
         'batch_size',
-        'dtype',
-        'name',
-        'trainable',
         'weights',
+        'activity_regularizer',
     }
     # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
 
-    # Get layer name.
-    name = kwargs.get('name')
-
-    # Get `trainable` status.
-    trainable = kwargs.get('trainable', True)
-
-    # Get `dtype`.
-    dtype = kwargs.get('dtype')
-    if dtype is None:
-      dtype = K.floatx()
-
-    # Call super, which will set all properties common to Keras layers
-    # and core TF layers.
-    super(Layer, self).__init__(
-        name=name, dtype=dtype, trainable=trainable,
-        activity_regularizer=kwargs.get('activity_regularizer'))
+    # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
+    self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
+    self.built = False
+    # Provides information about which inputs are compatible with the layer.
+    self.input_spec = None
+
+    self._init_set_name(name)
+
+    activity_regularizer = kwargs.pop('activity_regularizer', None)
+    if activity_regularizer and context.executing_eagerly():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
+    self._activity_regularizer = activity_regularizer
+    self._trainable_weights = []
+    self._non_trainable_weights = []
+    self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
+    self._losses = []
+    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
+    self._call_fn_args = estimator_util.fn_args(self.call)
+    self._compute_previous_mask = ('mask' in self._call_fn_args or
+                                   hasattr(self, 'compute_mask'))
     self._uses_inputs_arg = True
 
-    # Add properties that are Keras-only for now.
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self._inbound_nodes = []
+    self._outbound_nodes = []
+
     self.supports_masking = False
 
     # Manage input shape information if passed.
@@ -172,39 +174,404 @@ class Layer(tf_base_layers.Layer):
     else:
       self._initial_weights = None
 
-  def add_weight(self,
-                 name,
-                 shape,
+  def _init_set_name(self, name, zero_based=True):
+    if not name:
+      self._name = unique_layer_name(
+          to_snake_case(self.__class__.__name__), zero_based=zero_based)
+    else:
+      self._name = name
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def activity_regularizer(self):
+    """Optional regularizer function for the output of this layer."""
+    return self._activity_regularizer
+
+  @activity_regularizer.setter
+  def activity_regularizer(self, regularizer):
+    """Optional regularizer function for the output of this layer."""
+    self._activity_regularizer = regularizer
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
+
+  @property
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
+  @property
+  def weights(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
+
+  @property
+  def variables(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.weights
+
+  @property
+  def updates(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
+    return self._updates
+
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing the same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    This call is ignored when eager execution is enabled (in that case, variable
+    updates are run on the fly and thus do not need to be tracked for later
+    execution).
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
+    """
+    if context.executing_eagerly():
+      return  # Updates already applied when in eager mode.
+
+    def process_update(x):
+      if isinstance(x, ops.Operation):
+        return x
+      elif hasattr(x, 'op'):
+        return x.op
+      else:
+        return ops.convert_to_tensor(x)
+
+    updates = generic_utils.to_list(updates)
+    updates = [process_update(x) for x in updates]
+    self._updates += updates
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
+    else:
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
+
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
+
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
+
+  @property
+  def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
+    if context.executing_eagerly():
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
+
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
+
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing the same
+    layer on different inputs `a` and `b`, some entries in `layer.losses` may
+    be dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
+
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors.
+      inputs: If anything other than None is passed, it signals the losses
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for activity regularization losses, for instance.
+        If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      # TODO(fchollet): it should be possible (and highly desirable) to support
+      # `add_loss` in eager mode. This allows great convenience and flexibility
+      # in defining custom losses on the fly (e.g. in VAEs).
+      # Simply appending the loss value to `self._losses`
+      # is the correct behavior.
+      # The only caveat is that we need to force the user to only call
+      # `add_loss` from inside a model or Layer's `call` method
+      # (otherwise the loss computation cannot be backproped through).
+      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
+
+    losses = generic_utils.to_list(losses)
+    self._losses += losses
+    if inputs is None:
+      for loss in losses:
+        loss._unconditional_loss = True  # pylint: disable=protected-access
+    else:
+      for loss in losses:
+        loss._unconditional_loss = False  # pylint: disable=protected-access
+
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of loss tensors of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
+
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
+
+  def _name_scope(self):
+    return self.name
+
+  def build(self, _):
+    """Creates the variables of the layer."""
+    self.built = True
+
+  def add_variable(self, *args, **kwargs):
+    """Alias for `add_weight`."""
+    return self.add_weight(*args, **kwargs)
+
+  def add_weight(self, name, shape,
                  dtype=None,
                  initializer=None,
                  regularizer=None,
                  trainable=True,
-                 constraint=None):
-    """Adds a weight variable to the layer.
+                 constraint=None,
+                 partitioner=None,
+                 use_resource=None,
+                 getter=None):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
-        name: String, the name for the weight variable.
-        shape: The shape tuple of the weight.
-        dtype: The dtype of the weight.
-        initializer: An Initializer instance (callable).
-        regularizer: An optional Regularizer instance.
-        trainable: A boolean, whether the weight should
-            be trained via backprop or not (assuming
-            that the layer itself is also trainable).
-        constraint: An optional Constraint instance.
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      initializer: initializer instance (callable).
+      regularizer: regularizer instance (callable).
+      trainable: whether the variable should be part of the layer's
+        "trainable_variables" (e.g. variables, biases)
+        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable.
+      constraint: constraint instance (callable).
+      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      use_resource: Whether to use `ResourceVariable`.
+      getter: Variable getter argument to be passed to the `Checkpointable` API.
 
     Returns:
-        The created weight variable.
+      The created variable.  Usually either a `Variable` or `ResourceVariable`
+      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
+      instance is returned.
+
+    Raises:
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
     """
     if dtype is None:
-      dtype = K.floatx()
-    weight = self.add_variable(name, shape,
-                               dtype=dtype,
-                               initializer=initializers.get(initializer),
-                               regularizer=regularizers.get(regularizer),
-                               constraint=constraints.get(constraint),
-                               trainable=trainable)
-    return weight
+      dtype = self.dtype or backend.floatx()
+    initializer = initializers.get(initializer)
+    if initializer is None:
+      # Default TensorFlow initializer.
+      initializer = initializers.glorot_uniform()
+    regularizer = regularizers.get(regularizer)
+    constraint = constraints.get(constraint)
+
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        # TODO(allenl): a `make_variable` equivalent should be added as a
+        # `Checkpointable` method.
+        getter=getter or make_variable,
+        # Manage errors in Layer rather than Checkpointable.
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtypes.as_dtype(dtype),
+        constraint=constraint,
+        trainable=trainable and self.trainable,
+        partitioner=partitioner,
+        use_resource=use_resource)
+
+    if regularizer is not None:
+      # TODO(fchollet): in the future, this should be handled at the
+      # level of variable creation, and weight regularization losses
+      # should be variable attributes.
+      self._handle_weight_regularization(name, variable, regularizer)
+
+    if trainable:
+      self._trainable_weights.append(variable)
+    else:
+      self._non_trainable_weights.append(variable)
+    return variable
+
+  def _handle_weight_regularization(self, name, variable, regularizer):
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
+    if not context.executing_eagerly():
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if not context.executing_eagerly():
+            init_graph = ops.get_default_graph()
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+
+    if init_graph is not None:  # pylint: disable=protected-access
+      # The variable was created and initialized in a graph.
+      if regularizer:
+        if isinstance(variable, tf_variables.PartitionedVariable):
+          for v in variable:
+            with ops.colocate_with(v.op):
+              with ops.name_scope(name + '/Regularizer'):
+                regularization = regularizer(v)
+            if regularization is not None:
+              self.add_loss(regularization)
+        else:
+          with ops.colocate_with(variable.op):
+            with ops.name_scope(name + '/Regularizer'):
+              regularization = regularizer(variable)
+          if regularization is not None:
+            self.add_loss(regularization)
+    elif regularizer:  # initialization took place in an eager context
+      if isinstance(variable, tf_variables.PartitionedVariable):
+        raise RuntimeError(
+            'Partitioned variable regularization is not yet '
+            'supported when executing eagerly. File a feature request'
+            'if this is important to you.')
+      # Save a zero-argument lambda which runs the regularizer on the
+      # variable, to be executed when `Layer.losses` is requested.
+      # This makes losses responsive to variable updates when executing
+      # eagerly.
+      #
+      # TODO(akshayka): Do the same for graphs as well, so that losses
+      # collected in a while_loop can be run outside its control flow
+      # context and so that losses won't be swallowed up by graph functions
+      # (i.e., `.losses()` should always create regularizers).
+      self._losses.append(lambda: regularizer(variable))
+
+  def _handle_activity_regularization(self, inputs, outputs):
+    # Apply activity regularization.
+    # Note that it should be applied every time the layer creates a new
+    # output, since it is output-specific.
+    if self._activity_regularizer:
+      output_list = nest.flatten(outputs)
+      for output in output_list:
+        with ops.name_scope('ActivityRegularizer'):
+          activity_regularization = self._activity_regularizer(output)
+        self.add_loss(activity_regularization, inputs=inputs)
 
   def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
     """This is where the layer's logic lives.
@@ -218,6 +585,215 @@ class Layer(tf_base_layers.Layer):
     """
     return inputs
 
+  def __call__(self, inputs, *args, **kwargs):
+    """Wraps `call`, applying pre- and post-processing steps.
+
+    Arguments:
+      inputs: input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+
+    Note:
+      - The following optional keyword arguments are reserved for specific uses:
+        * `training`: Boolean scalar tensor of Python boolean indicating
+          whether the `call` is meant for training or inference.
+        * `mask`: Boolean input mask.
+      - If the layer's `call` method takes a `mask` argument (as some Keras
+        layers do), its default value will be set to the mask generated
+        for `inputs` by the previous layer (if `input` did come from
+        a layer that generated a corresponding mask, i.e. if it came from
+        a Keras layer with masking support.
+
+    Raises:
+      ValueError: if the layer's `call` method returns None (an invalid value).
+    """
+    input_list = nest.flatten(inputs)
+
+    build_graph = not context.executing_eagerly()
+    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
+    # which don't use an "inputs" argument.
+    in_deferred_mode = isinstance(input_list[0], DeferredTensor)
+
+    # Handle Keras mask propagation from previous layer to current layer.
+    previous_mask = None
+    if (not hasattr(self, '_compute_previous_mask') or
+        self._compute_previous_mask):
+      previous_mask = collect_previous_mask(inputs)
+      if not hasattr(self, '_call_fn_args'):
+        self._call_fn_args = estimator_util.fn_args(self.call)
+      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
+          not is_all_none(previous_mask)):
+        # The previous layer generated a mask, and mask was not explicitly pass
+        # to __call__, hence we set previous_mask as the default value.
+        kwargs['mask'] = previous_mask
+
+    input_shapes = None
+
+    with ops.name_scope(self._name_scope()):
+      if not self.built:
+        if not build_graph:
+          # Activity regularization is currently unsupported in Eager mode.
+          if self._activity_regularizer:
+            raise ValueError(
+                'activity_regularizer currently unsupported with '
+                'eager execution enabled. Found an activity_regularizer in '
+                '%s(%s).' % (self.__class__.__name__, self))
+        if not build_graph and not in_deferred_mode:
+          for x in input_list:
+            if hasattr(x, '_keras_history'):
+              raise ValueError('_keras_history currently unsupported in '
+                               'Eager mode. Found _keras_history in %s while '
+                               'executing __call__ for %s(%s)' %
+                               (x, self.__class_.__name__, self))
+
+        # Check input assumptions set before layer building, e.g. input rank.
+        self._assert_input_compatibility(inputs)
+        if input_list and self._dtype is None:
+          try:
+            self._dtype = input_list[0].dtype.base_dtype.name
+          except AttributeError:
+            pass
+        if all(hasattr(x, 'get_shape') for x in input_list):
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+        self.build(input_shapes)
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if build_graph or in_deferred_mode:
+        self._assert_input_compatibility(inputs)
+
+      if not in_deferred_mode:
+        outputs = self.call(inputs, *args, **kwargs)
+        if outputs is None:
+          raise ValueError('A layer\'s `call` method should return a Tensor '
+                           'or a list of Tensors, not None (layer: ' +
+                           self.name + ').')
+      else:
+        # Deferred mode behavior: use `compute_output_shape` to
+        # infer the number of outputs of the layer and their shapes.
+        if input_shapes is None:
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
+        output_shapes = self.compute_output_shape(input_shapes)
+        output_shapes = nest.flatten(output_shapes)
+        outputs = [
+            # TODO(fchollet): name the deferred tensors?
+            DeferredTensor(shape=shape, dtype=self._dtype)
+            for shape in output_shapes
+        ]
+        if len(outputs) == 1:
+          outputs = outputs[0]
+
+      if build_graph:
+        self._handle_activity_regularization(inputs, outputs)
+        # TODO(fchollet): consider enabling masking for Eager mode.
+        self._set_mask_metadata(inputs, outputs, previous_mask)
+
+      if in_deferred_mode or build_graph and have_all_keras_metadata(inputs):
+        inputs, outputs = self._set_connectivity_metadata_(
+            inputs, outputs, args, kwargs)
+
+      self.built = True
+      if context.executing_eagerly():
+        return outputs
+
+      if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
+        # Subclassed network: explicitly set metadata normally set by a call to
+        # self._set_inputs(). This is not relevant in eager execution.
+        self._symbolic_set_inputs(inputs, outputs)
+
+      if in_deferred_mode or build_graph:
+        self._set_learning_phase_metadata(inputs, outputs)
+
+    # Optionally load weight values that were specified at layer instantiation.
+    # TODO(fchollet): consider enabling this with eager execution too.
+    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
+      self.set_weights(self._initial_weights)
+      del self._initial_weights
+    return outputs
+
+  def apply(self, inputs, *args, **kwargs):
+    """Apply the layer on a input.
+
+    This simply wraps `self.__call__`.
+
+    Arguments:
+      inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+    """
+    return self.__call__(inputs, *args, **kwargs)
+
+  def _set_learning_phase_metadata(self, inputs, outputs):
+    # Update learning phase info. To work with subclassed models,
+    # this should be done even if Keras metadata is absent.
+    output_tensors = generic_utils.to_list(outputs)
+    uses_lp = any(
+        [getattr(x, '_uses_learning_phase', False)
+         for x in generic_utils.to_list(inputs)])
+    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
+    for i in range(len(output_tensors)):
+      try:
+        output_tensors[i]._uses_learning_phase = getattr(
+            output_tensors[i], '_uses_learning_phase', False) or uses_lp
+      except AttributeError:
+        # An output element happens to be a C type (such as tuple or dict).
+        # We don't track learning phase info in such edge cases.
+        pass
+
+  def _set_mask_metadata(self, inputs, outputs, previous_mask):
+    if hasattr(self, 'compute_mask'):
+      output_mask = self.compute_mask(inputs, previous_mask)
+      if isinstance(outputs, (list, tuple)):
+        if output_mask is None:
+          output_mask = [None for _ in range(len(outputs))]
+        for x, m in zip(outputs, output_mask):
+          try:
+            x._keras_mask = m  # pylint: disable=protected-access
+          except AttributeError:
+            pass  # C type such as dict. Masking not supported in this case.
+      else:
+        try:
+          outputs._keras_mask = output_mask  # pylint: disable=protected-access
+        except AttributeError:
+          pass  # C type such as dict. Masking not supported in this case.
+
+  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
+    if args and getattr(self, '_uses_inputs_arg', True):
+      raise TypeError(
+          'This Layer takes an `inputs` argument to call(), and only the '
+          '`inputs` argument may be specified as a positional argument. '
+          'Pass everything else as a keyword argument (those arguments will'
+          ' not be tracked as inputs to the Layer).')
+
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = nest.flatten(outputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in nest.flatten(inputs):
+        with ops.name_scope(self.name):
+          x = array_ops.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      outputs = output_ls_copy[0]
+    else:
+      outputs = output_ls_copy
+
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+    # Add an inbound node to the layer, so it can keep track of this call.
+    # This updates the layer history of the output tensor(s).
+    kwargs.pop('mask', None)  # `mask` should not be serialized.
+    self._add_inbound_node(
+        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
+    return inputs, outputs
+
   def _inputs_from_call_args(self, call_args, call_kwargs):
     """Get Layer inputs from __call__ *args and **kwargs.
 
@@ -282,71 +858,6 @@ class Layer(tf_base_layers.Layer):
         input_arg_values.extend(bound_args[call_arg_spec.varargs])
       return input_arg_values, non_input_arg_values
 
-  def __call__(self, inputs, *args, **kwargs):
-    """Wrapper around self.call(), for handling internal references.
-
-    If a Keras tensor is passed:
-        - We call self._add_inbound_node().
-        - If necessary, we `build` the layer to match
-            the shape of the input(s).
-        - We update the _keras_history of the output tensor(s)
-            with the current layer.
-            This is done as part of _add_inbound_node().
-
-    Arguments:
-        inputs: Can be a tensor or list/tuple of tensors.
-        *args: Additional positional arguments to be passed to `call()`. Only
-          allowed in subclassed Models with custom call() signatures. In other
-          cases, `Layer` inputs must be passed using the `inputs` argument and
-          non-inputs must be keyword arguments.
-        **kwargs: Additional keyword arguments to be passed to `call()`.
-
-    Returns:
-        Output of the layer's `call` method.
-
-    Raises:
-        ValueError: in case the layer is missing shape information
-            for its `build` call.
-        TypeError: If positional arguments are passed and this `Layer` is not a
-            subclassed `Model`.
-    """
-    # Actually call the layer (optionally building it).
-    output = super(Layer, self).__call__(inputs, *args, **kwargs)
-
-    if args and getattr(self, '_uses_inputs_arg', True):
-      raise TypeError(
-          'This Layer takes an `inputs` argument to call(), and only the '
-          '`inputs` argument may be specified as a positional argument. Pass '
-          'everything else as a keyword argument (those arguments will not be '
-          'tracked as inputs to the Layer).')
-
-    if context.executing_eagerly():
-      return output
-
-    inputs, kwargs = self._inputs_from_call_args(
-        call_args=(inputs,) + args, call_kwargs=kwargs)
-
-    if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
-      # Subclassed network: explicitly set metadata normally set by a call to
-      # self._set_inputs().
-      self._symbolic_set_inputs(inputs, output)
-
-    # Update learning phase info.
-    output_tensors = generic_utils.to_list(output)
-    uses_lp = any(
-        [getattr(x, '_uses_learning_phase', False)
-         for x in generic_utils.to_list(inputs)])
-    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
-    for i in range(len(output_tensors)):
-      output_tensors[i]._uses_learning_phase = getattr(
-          output_tensors[i], '_uses_learning_phase', False) or uses_lp
-
-    # Optionally load weight values that were specified at layer instantiation.
-    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
-      self.set_weights(self._initial_weights)
-      del self._initial_weights
-    return output
-
   def compute_output_shape(self, input_shape):
     """Computes the output shape of the layer.
 
@@ -362,13 +873,7 @@ class Layer(tf_base_layers.Layer):
     Returns:
         An input shape tuple.
     """
-    logging.warning(
-        'All custom layers should implement the '
-        '`compute_output_shape` method. This layer (' + self.name + ') '
-        'is relying on the base `Layer.compute_output_shape` implementation, '
-        'which will start raising a `NotImplementedError` '
-        'as of July 1st, 2018.')
-    return input_shape
+    raise NotImplementedError
 
   def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
     """Computes an output mask tensor.
@@ -396,6 +901,87 @@ class Layer(tf_base_layers.Layer):
     # carry over the input mask
     return mask
 
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
+
+    Arguments:
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
+    """
+    input_tensors = nest.flatten(input_tensors)
+    output_tensors = nest.flatten(output_tensors)
+
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      assert hasattr(x, '_keras_history')
+      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      inbound_layers.append(inbound_layer)
+      node_indices.append(node_index)
+      tensor_indices.append(tensor_index)
+
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        arguments=arguments)
+
+    # Update tensor history metadata.
+    for i in range(len(output_tensors)):
+      # The metadata attribute consists of 1) a layer instance
+      # 2) a node index for the layer, 3) a tensor index for the node.
+      # The allows layer reuse (multiple nodes per layer) and multi-output
+      # or multi-input layers (e.g. a layer can return multiple tensors,
+      # and each can be sent to a different layer).
+      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Private utility to retrieves an attribute (e.g. inputs) from a node.
+
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
+
+    Arguments:
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
+
+    Returns:
+        The layer's attribute `attr` at the node of index `node_index`.
+
+    Raises:
+        RuntimeError: If the layer has no inbound nodes, or if called in Eager
+        mode.
+        ValueError: If the index provided does not match any node.
+    """
+    if not self._inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self._inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self._inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self._inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
+
   def get_input_mask_at(self, node_index):
     """Retrieves the input mask tensor(s) of a layer at a given node.
 
@@ -476,6 +1062,325 @@ class Layer(tf_base_layers.Layer):
     else:
       return getattr(output, '_keras_mask', None)
 
+  def get_input_shape_at(self, node_index):
+    """Retrieves the input shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_shapes',
+                                             'input shape')
+
+  def get_output_shape_at(self, node_index):
+    """Retrieves the output shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_shapes',
+                                             'output shape')
+
+  def get_input_at(self, node_index):
+    """Retrieves the input tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_tensors',
+                                             'input')
+
+  def get_output_at(self, node_index):
+    """Retrieves the output tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_tensors',
+                                             'output')
+
+  @property
+  def input(self):
+    """Retrieves the input tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input tensor or list of input tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+      AttributeError: If no inbound nodes are found.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name +
+                           ' is not connected, no input to return.')
+    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
+
+  @property
+  def output(self):
+    """Retrieves the output tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one output,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+      Output tensor or list of output tensors.
+
+    Raises:
+      AttributeError: if the layer is connected to more than one incoming
+        layers.
+      RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
+    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
+
+  @property
+  def input_shape(self):
+    """Retrieves the input shape(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer, or if all inputs
+    have the same shape.
+
+    Returns:
+        Input shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per input tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined input_shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined input shape.')
+    all_input_shapes = set(
+        [str(node.input_shapes) for node in self._inbound_nodes])
+    if len(all_input_shapes) == 1:
+      input_shapes = self._inbound_nodes[0].input_shapes
+      if len(input_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in input_shapes
+        ]
+    else:
+      raise AttributeError('The layer "' + str(self.name) +
+                           ' has multiple inbound nodes, '
+                           'with different input shapes. Hence '
+                           'the notion of "input shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_input_shape_at(node_index)` '
+                           'instead.')
+
+  def count_params(self):
+    """Count the total number of scalars composing the weights.
+
+    Returns:
+        An integer count.
+
+    Raises:
+        ValueError: if the layer isn't yet built
+          (in which case its weights aren't yet defined).
+    """
+    if not self.built:
+      if self.__class__.__name__ == 'Sequential':
+        self.build()  # pylint: disable=no-value-for-parameter
+      else:
+        raise ValueError('You tried to call `count_params` on ' + self.name +
+                         ', but the layer isn\'t built. '
+                         'You can build it manually via: `' + self.name +
+                         '.build(batch_input_shape)`.')
+    weight_shapes = [w.get_shape().as_list() for w in self.weights]
+    return int(sum([np.prod(w) for w in weight_shapes]))
+
+  @property
+  def output_shape(self):
+    """Retrieves the output shape(s) of a layer.
+
+    Only applicable if the layer has one output,
+    or if all outputs have the same shape.
+
+    Returns:
+        Output shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per output tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined output shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined output shape.')
+    all_output_shapes = set(
+        [str(node.output_shapes) for node in self._inbound_nodes])
+    if len(all_output_shapes) == 1:
+      output_shapes = self._inbound_nodes[0].output_shapes
+      if len(output_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in output_shapes
+        ]
+    else:
+      raise AttributeError('The layer "%s"'
+                           ' has multiple inbound nodes, '
+                           'with different output shapes. Hence '
+                           'the notion of "output shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_output_shape_at(node_index)` '
+                           'instead.' % self.name)
+
+  @property
+  def inbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._inbound_nodes
+
+  @property
+  def outbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._outbound_nodes
+
+  def _assert_input_compatibility(self, inputs):
+    """Checks compatibility between the layer and provided inputs.
+
+    This checks that the tensor(s) `inputs` verify the input assumptions
+    of the layer (if any). If not, a clear and actional exception gets raised.
+
+    Arguments:
+        inputs: input tensor or list of input tensors.
+
+    Raises:
+        ValueError: in case of mismatch between
+            the provided inputs and the expectations of the layer.
+    """
+    if not self.input_spec:
+      return
+    if not isinstance(self.input_spec, (list, tuple)):
+      input_spec = nest.flatten(self.input_spec)
+    else:
+      input_spec = self.input_spec
+    inputs = nest.flatten(inputs)
+    if len(inputs) != len(input_spec):
+      raise ValueError('Layer ' + self.name + ' expects ' +
+                       str(len(input_spec)) + ' inputs, '
+                       'but it received ' + str(len(inputs)) +
+                       ' input tensors. Inputs received: ' + str(inputs))
+    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+      if spec is None:
+        continue
+
+      if (spec.ndim is not None or
+          spec.min_ndim is not None or
+          spec.max_ndim is not None):
+        if x.get_shape().ndims is None:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'its rank is undefined, but the layer requires a '
+                           'defined rank.')
+
+      # Check ndim.
+      if spec.ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim != spec.ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                           str(ndim) + '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      if spec.max_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim > spec.max_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected max_ndim=' + str(spec.max_ndim) +
+                           ', found ndim=' + str(ndim))
+      if spec.min_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim < spec.min_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           ': expected min_ndim=' + str(spec.min_ndim) +
+                           ', found ndim=' + str(ndim) +
+                           '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      # Check dtype.
+      if spec.dtype is not None:
+        if x.dtype != spec.dtype:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected dtype=' + str(spec.dtype) +
+                           ', found dtype=' + str(x.dtype))
+      # Check specific shape axes.
+      if spec.axes:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for axis, value in spec.axes.items():
+            if hasattr(value, 'value'):
+              value = value.value
+            if value is not None and shape[int(axis)] not in {value, None}:
+              raise ValueError(
+                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
+                  ' incompatible with the layer: expected axis ' + str(axis) +
+                  ' of input shape to have value ' + str(value) +
+                  ' but received input with shape ' + str(shape))
+      # Check shape.
+      if spec.shape is not None:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for spec_dim, dim in zip(spec.shape, shape):
+            if spec_dim is not None and dim is not None:
+              if spec_dim != dim:
+                raise ValueError('Input ' + str(input_index) +
+                                 ' is incompatible with layer ' + self.name +
+                                 ': expected shape=' + str(spec.shape) +
+                                 ', found shape=' + str(shape))
+
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
 
@@ -500,14 +1405,14 @@ class Layer(tf_base_layers.Layer):
     if not params:
       return
     weight_value_tuples = []
-    param_values = K.batch_get_value(params)
+    param_values = backend.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
       if pv.shape != w.shape:
         raise ValueError('Layer weight shape ' + str(pv.shape) +
                          ' not compatible with '
                          'provided weight shape ' + str(w.shape))
       weight_value_tuples.append((p, w))
-    K.batch_set_value(weight_value_tuples)
+    backend.batch_set_value(weight_value_tuples)
 
   def get_weights(self):
     """Returns the current weights of the layer.
@@ -516,7 +1421,7 @@ class Layer(tf_base_layers.Layer):
         Weights values as a list of numpy arrays.
     """
     params = self.weights
-    return K.batch_get_value(params)
+    return backend.batch_get_value(params)
 
   def get_config(self):
     """Returns the config of the layer.
@@ -558,9 +1463,196 @@ class Layer(tf_base_layers.Layer):
     """
     return cls(**config)
 
-  @tf_base_layers.Layer.activity_regularizer.setter
-  def activity_regularizer(self, activity_regularizer):
-    self._activity_regularizer = activity_regularizer
+
+@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+class Node(object):
+  """A `Node` describes the connectivity between two layers.
+
+  Each time a layer is connected to some new input,
+  a node is added to `layer._inbound_nodes`.
+  Each time the output of a layer is used by another layer,
+  a node is added to `layer._outbound_nodes`.
+
+  Arguments:
+      outbound_layer: the layer that takes
+          `input_tensors` and turns them into `output_tensors`
+          (the node gets created when the `call`
+          method of the layer was called).
+      inbound_layers: a list of layers, the same length as `input_tensors`,
+          the layers from where `input_tensors` originate.
+      node_indices: a list of integers, the same length as `inbound_layers`.
+          `node_indices[i]` is the origin node of `input_tensors[i]`
+          (necessary since each inbound layer might have several nodes,
+          e.g. if the layer is being shared with a different data stream).
+      tensor_indices: a list of integers,
+          the same length as `inbound_layers`.
+          `tensor_indices[i]` is the index of `input_tensors[i]` within the
+          output of the inbound layer
+          (necessary since each inbound layer might
+          have multiple tensor outputs, with each one being
+          independently manipulable).
+      input_tensors: list of input tensors.
+      output_tensors: list of output tensors.
+      arguments: dictionary of keyword arguments that were passed to the
+          `call` method of the layer at the call that created the node.
+
+  `node_indices` and `tensor_indices` are basically fine-grained coordinates
+  describing the origin of the `input_tensors`.
+
+  A node from layer A to layer B is added to:
+    - A._outbound_nodes
+    - B._inbound_nodes
+  """
+
+  def __init__(self,
+               outbound_layer,
+               inbound_layers,
+               node_indices,
+               tensor_indices,
+               input_tensors,
+               output_tensors,
+               arguments=None):
+    # Layer instance (NOT a list).
+    if isinstance(outbound_layer, list):
+      raise ValueError(
+          '`outbound_layer` should be a layer instance, not a list.')
+    # this is the layer that takes a list of input tensors
+    # and turns them into a list of output tensors.
+    # the current node will be added to
+    # the inbound_nodes of outbound_layer.
+    self.outbound_layer = outbound_layer
+
+    # The following 3 properties describe where
+    # the input tensors come from: which layers,
+    # and for each layer, which node and which
+    # tensor output of each node.
+
+    # List of layer instances.
+    self.inbound_layers = inbound_layers
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.node_indices = node_indices
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.tensor_indices = tensor_indices
+
+    # Following 2 properties:
+    # tensor inputs and outputs of outbound_layer.
+
+    # List of tensors. 1:1 mapping with inbound_layers.
+    self.input_tensors = input_tensors
+    # List of tensors, created by outbound_layer.call().
+    self.output_tensors = output_tensors
+
+    # Following 2 properties: input and output shapes.
+
+    # List of shape tuples, shapes of input_tensors.
+    self.input_shapes = [static_shape(x) for x in input_tensors]
+    # List of shape tuples, shapes of output_tensors.
+    self.output_shapes = [static_shape(x) for x in output_tensors]
+
+    # Optional keyword arguments to layer's `call`.
+    self.arguments = arguments
+
+    # Add nodes to all layers involved.
+    for layer in inbound_layers:
+      if layer is not None:
+        # For compatibility with external Keras, we use the deprecated
+        # accessor here.
+        layer.outbound_nodes.append(self)
+    # For compatibility with external Keras, we use the deprecated
+    # accessor here.
+    outbound_layer.inbound_nodes.append(self)
+
+  def get_config(self):
+    inbound_names = []
+    for layer in self.inbound_layers:
+      if layer:
+        inbound_names.append(layer.name)
+      else:
+        inbound_names.append(None)
+    return {
+        'outbound_layer': self.outbound_layer.name,
+        'inbound_layers': inbound_names,
+        'node_indices': self.node_indices,
+        'tensor_indices': self.tensor_indices
+    }
+
+
+class DeferredTensor(object):
+  """Tensor-like object used to build graphs of layers in Eager mode.
+
+  When calling a layer on a DeferredTensor, the layer will not perform any
+  computation and will simply perfom shape inference to return new
+  DeferredTensors with appropriate shape information. Thus DeferredTensor
+  behaves like a graph-mode Tensor when manipulated by layers.
+  """
+
+  def __init__(self, shape, dtype, name=None):
+    self.shape = tensor_shape.TensorShape(shape)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
+    self.name = name
+
+  def get_shape(self):
+    return self.shape
+
+  def __str__(self):
+    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
+                                                         self.get_shape(),
+                                                         self.dtype.name)
+
+  def __repr__(self):
+    return "<DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
+                                                        self.get_shape(),
+                                                        self.dtype.name)
 
 
 def shape_type_conversion(fn):
@@ -589,3 +1681,251 @@ def shape_type_conversion(fn):
       return tensor_shape.TensorShape(output_shape)
 
   return wrapper
+
+
+def object_list_uid(object_list):
+  """Creates a single string from object ids."""
+  object_list = nest.flatten(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def static_shape(x):
+  """Get the static shape of a Tensor, or None if it is unavailable."""
+  if x is None:
+    return None
+  try:
+    return tuple(x.get_shape().as_list())
+  except ValueError:
+    return None
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors/ops reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    if isinstance(x, ops.Operation):
+      outputs = x.outputs[:] or []
+      outputs += x._control_outputs
+    elif isinstance(x, ops.Tensor):
+      outputs = x.consumers()
+    elif isinstance(x, tf_variables.Variable):
+      outputs = [x.op]
+    else:
+      raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
+
+
+def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                      zero_based=False):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+    name_uid_map: An optional defaultdict(int) to use when creating unique
+      names. If None (default), uses a per-Graph dictionary.
+    avoid_names: An optional set or dict with names which should not be used. If
+      None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
+  ```
+  """
+  if name_uid_map is None:
+    name_uid_map = get_default_graph_uid_map()
+  if avoid_names is None:
+    avoid_names = set()
+  proposed_name = None
+  while proposed_name is None or proposed_name in avoid_names:
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
+  return proposed_name
+
+
+def to_snake_case(name):
+  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
+  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
+  # If the class is private the name starts with "_" which is not secure
+  # for creating scopes. We prefix the name with "private" in this case.
+  if insecure[0] != '_':
+    return insecure
+  return 'private' + insecure
+
+
+def is_all_none(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  # We cannot use Python's `any` because the iterable may return Tensors.
+  for element in iterable:
+    if element is not None:
+      return False
+  return True
+
+
+def have_all_keras_metadata(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  return all([hasattr(x, '_keras_history') for x in iterable])
+
+
+def collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = nest.flatten(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_mask'):
+      mask = x._keras_mask  # pylint: disable=protected-access
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def is_tensor_or_tensor_list(v):
+  v = nest.flatten(v)
+  if v and isinstance(v[0], ops.Tensor):
+    return True
+  else:
+    return False
+
+
+def get_default_graph_uid_map():
+  # TODO(fchollet): refactor this into backend.
+  graph = ops.get_default_graph()
+  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
+  if name_uid_map is None:
+    name_uid_map = collections.defaultdict(int)
+    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
+  return name_uid_map
+
+
+def make_variable(name,
+                  shape=None,
+                  dtype=dtypes.float32,
+                  initializer=None,
+                  partition_info=None,
+                  trainable=True,
+                  caching_device=None,
+                  validate_shape=True,
+                  constraint=None,
+                  use_resource=None,
+                  partitioner=None):  # pylint: disable=unused-argument
+  """Temporary util to create a variable (relies on `variable_scope.variable`).
+
+  Some reuse-related technicalities prevent us from using
+  `variable_scope.get_variable()` directly, so we use a subcomponent
+  that has fewer constraints (`variable_scope.variable()`).
+
+  In the longer term, it seems like a similar "default variable creator" method
+  should exist in `CheckpointableBase` instead. When this happens, we can get
+  rid of this temporary solution.
+
+  TODO(fchollet): remove this method when no longer needed.
+  TODO(fchollet): handle `partitioner` argument.
+
+  Arguments:
+    name: Variable name.
+    shape: Variable shape.
+    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+    initializer: Initializer instance (callable).
+    partition_info: Not handled at this time.
+    trainable: Whether the variable should be part of the layer's
+      "trainable_variables" (e.g. variables, biases)
+      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+      Note, if the current variable scope is marked as non-trainable
+      then this parameter is ignored and any added variables are also
+      marked as non-trainable.
+    caching_device: Passed to `vs.variable`.
+    validate_shape: Passed to `vs.variable`.
+    constraint: Constraint instance (callable).
+    use_resource: Whether to use a `ResourceVariable`.
+    partitioner: Not handled at this time.
+
+  Returns:
+    Variable instance.
+  """
+  initializing_from_value = False
+  if initializer is not None and not callable(initializer):
+    initializing_from_value = True
+
+  with ops.init_scope():
+    if initializing_from_value:
+      init_val = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+          shape, dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+  if use_resource is None:
+    use_resource = True
+
+  v = vs.variable(
+      initial_value=init_val,
+      name=name,
+      trainable=trainable,
+      caching_device=caching_device,
+      dtype=variable_dtype,
+      validate_shape=validate_shape,
+      constraint=constraint,
+      use_resource=use_resource)
+  return v
diff --git a/tensorflow/python/keras/_impl/keras/engine/input_layer.py b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
index b51dd8a218..bd9dcbe3c5 100644
--- a/tensorflow/python/keras/_impl/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
@@ -23,7 +23,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import base_layer
-from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -95,7 +94,7 @@ class InputLayer(base_layer.Layer):
 
       if context.executing_eagerly():
         # In eager mode, create a temporary placeholder to call the layer on.
-        input_tensor = tf_base_layers._DeferredTensor(  # pylint: disable=protected-access
+        input_tensor = base_layer.DeferredTensor(  # pylint: disable=protected-access
             shape=batch_input_shape,
             dtype=dtype,
             name=self.name)
@@ -123,7 +122,7 @@ class InputLayer(base_layer.Layer):
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
     input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
-    tf_base_layers.Node(
+    base_layer.Node(
         self,
         inbound_layers=[],
         node_indices=[],
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index 9f1c7de115..cc177c14a8 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -35,8 +35,6 @@ from tensorflow.python.keras._impl.keras.engine import saving
 from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
@@ -82,7 +80,7 @@ class Network(base_layer.Layer):
     # self.losses
     # self.updates
 
-    self._init_set_name(name)
+    self._init_set_name(name, zero_based=True)
     self._activity_regularizer = None
     # This acts just like the `trainable` attribute of any layer instance.
     # It does not affect users of the underlying layers, only users of the
@@ -132,14 +130,14 @@ class Network(base_layer.Layer):
     if context.executing_eagerly():
       # Check that all inputs/outputs are DeferredTensors.
       for tensor in self.inputs:
-        if not isinstance(tensor, tf_base_layers._DeferredTensor):  # pylint: disable=protected-access
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'inputs must come from a call to '
                           '`tf.keras.Input` (called after '
                           'tfe.enable_eager_execution()). '
                           'Received invalid input: ' + str(tensor))
       for tensor in self.outputs:
-        if not isinstance(tensor, tf_base_layers._DeferredTensor):  # pylint: disable=protected-access
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'outputs must come from a call to '
                           'a layer (called after '
@@ -230,7 +228,7 @@ class Network(base_layer.Layer):
     self._layers_by_depth = layers_by_depth
 
     # Create the node linking internal inputs to internal outputs.
-    tf_base_layers.Node(
+    base_layer.Node(
         outbound_layer=self,
         inbound_layers=[],
         node_indices=[],
@@ -243,8 +241,8 @@ class Network(base_layer.Layer):
     for x in self.inputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
       masks.append(mask)
-    mask_cache_key = (tf_layers_util.object_list_uid(self.inputs) + '_' +
-                      tf_layers_util.object_list_uid(masks))
+    mask_cache_key = (base_layer.object_list_uid(self.inputs) + '_' +
+                      base_layer.object_list_uid(masks))
     masks = []
     for x in self.outputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
@@ -289,7 +287,7 @@ class Network(base_layer.Layer):
     self.built = False
 
   def __setattr__(self, name, value):
-    if isinstance(value, (tf_base_layers.Layer, Network)):
+    if isinstance(value, (base_layer.Layer, Network)):
       try:
         is_graph_network = self._is_graph_network
       except AttributeError:
@@ -299,6 +297,10 @@ class Network(base_layer.Layer):
       if not is_graph_network:
         if value not in self._layers:
           self._layers.append(value)
+          if hasattr(value, '_use_resource_variables'):
+            # In subclassed models, legacy layers (tf.layers) must always use
+            # resource variables.
+            value._use_resource_variables = True
     if isinstance(value, checkpointable.CheckpointableBase):
       # Layer (and therefore Network/Model) inherit from CheckpointableBase
       # rather than Checkpointable, which means there is no Checkpointable
@@ -387,8 +389,8 @@ class Network(base_layer.Layer):
       masks = [None for _ in range(len(inputs))]
     else:
       masks = generic_utils.to_list(mask)
-    cache_key = (tf_layers_util.object_list_uid(inputs)
-                 + '_' + tf_layers_util.object_list_uid(masks))
+    cache_key = (base_layer.object_list_uid(inputs)
+                 + '_' + base_layer.object_list_uid(masks))
     if cache_key in self._output_mask_cache:
       return self._output_mask_cache[cache_key]
     else:
@@ -502,8 +504,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = tf_layers_util.get_reachable_from_inputs(relevant_inputs,
-                                                         updates)
+    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
     unconditional_updates = [
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
@@ -540,8 +541,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = tf_layers_util.get_reachable_from_inputs(relevant_inputs,
-                                                         losses)
+    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, losses)
     relevant_conditional_losses = [x for x in losses if x in reachable]
     unconditional_losses = [
         x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
@@ -623,8 +623,8 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Try to retrieve cached outputs if the layer has already been called
       # on these exact inputs.
-      cache_key = (tf_layers_util.object_list_uid(inputs)
-                   + '_' + tf_layers_util.object_list_uid(masks))
+      cache_key = (base_layer.object_list_uid(inputs)
+                   + '_' + base_layer.object_list_uid(masks))
       if cache_key in self._output_tensor_cache:
         # Cache hit.
         return self._output_tensor_cache[cache_key]
@@ -656,7 +656,7 @@ class Network(base_layer.Layer):
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = tf_layers_util.object_list_uid(input_shapes)
+    cache_key = base_layer.object_list_uid(input_shapes)
     if cache_key not in self._output_shape_cache:
       # Cache miss. We have to run the network graph manually (recursive calls
       # to `compute_output_shape`).
@@ -845,7 +845,7 @@ class Network(base_layer.Layer):
     for x in self.outputs:
       assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
       tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(tf_layers_util.static_shape(x))
+      output_shapes.append(base_layer.static_shape(x))
       output_tensors.append(tensor)
       output_masks.append(mask)
 
@@ -859,14 +859,14 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Update cache;
       # keys are based on ids on input tensors and inputs masks.
-      cache_key = (tf_layers_util.object_list_uid(inputs)
-                   + '_' + tf_layers_util.object_list_uid(masks))
+      cache_key = (base_layer.object_list_uid(inputs)
+                   + '_' + base_layer.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
       self._output_mask_cache[cache_key] = output_masks
 
       if output_shapes is not None:
-        input_shapes = [tf_layers_util.static_shape(x) for x in inputs]
-        cache_key = tf_layers_util.object_list_uid(input_shapes)
+        input_shapes = [base_layer.static_shape(x) for x in inputs]
+        cache_key = base_layer.object_list_uid(input_shapes)
         self._output_shape_cache[cache_key] = output_shapes
 
     return output_tensors, output_masks
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
index dde0901204..3b1578cddf 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -422,7 +422,7 @@ class TestWholeModelSaving(test.TestCase):
         f = keras.layers.Dense(2, name='nested_model_dense_%d' % (i,))(f)
       # This layer name will make the `weights_name`
       # HDF5 attribute blow out of proportion.
-      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**15)))(f)
+      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**14)))(f)
       nested_model = keras.Model(inputs=[x], outputs=[f], name='nested_model')
 
       x = keras.Input(shape=(2,), name='outer_model_input')
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/_impl/keras/engine/sequential.py
index 2ef99d5ab3..bd13ca6713 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential.py
@@ -123,7 +123,7 @@ class Sequential(Model):
             multiple output tensors, or is already connected
             somewhere else (forbidden in `Sequential` models).
     """
-    if not isinstance(layer, (base_layer.Layer, base_layer.TFBaseLayer)):
+    if not isinstance(layer, base_layer.Layer):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
index c9a47581df..8aba16aef3 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
@@ -151,6 +151,7 @@ class TestSequential(test.TestCase):
     with self.test_session():
       model = keras.models.Sequential()
       model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+      assert model.updates
 
       model.trainable = False
       assert not model.updates
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 9ab4b6fdcf..49cc1cd3b3 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
-from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -52,11 +52,13 @@ class TopologyConstructionTest(test.TestCase):
                                    (1, 1),
                                    'float32',
                                    trainable=False)
-        self.add_update(state_ops.assign_add(self.a, [[1.]]))
+        self.add_update(state_ops.assign_add(self.a, [[1.]],
+                                             name='unconditional_update'))
         self.built = True
 
       def call(self, inputs):
-        self.add_update(state_ops.assign_add(self.a, inputs),
+        self.add_update(state_ops.assign_add(self.b, inputs,
+                                             name='conditional_update'),
                         inputs=True)
         return inputs + 1
 
@@ -97,10 +99,20 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 4)
     self.assertEqual(len(network.get_updates_for(None)), 2)
 
-    network.add_update(state_ops.assign_add(layer.a, x4), inputs=True)
+    network.add_update(state_ops.assign_add(layer.b, x4), inputs=True)
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
+  def test_get_updates_bn(self):
+    x1 = keras.Input(shape=(1,))
+    layer = keras.layers.BatchNormalization()
+    _ = layer.apply(x1)
+
+    print('BN updates', layer._updates)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 2)
+    self.assertEqual(len(layer.get_updates_for(None)), 0)
+
   def test_get_losses(self):
 
     class MyLayer(keras.layers.Layer):
@@ -875,25 +887,25 @@ class TopologyConstructionTest(test.TestCase):
 class DeferredModeTest(test.TestCase):
 
   def testDeferredTensorAttributes(self):
-    x = tf_base_layers._DeferredTensor(shape=(None, 2),
-                                       dtype='float32',
-                                       name='x')
+    x = base_layer.DeferredTensor(shape=(None, 2),
+                                  dtype='float32',
+                                  name='x')
     self.assertEqual(str(x),
                      'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
     self.assertEqual(repr(x),
-                     '<_DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
+                     '<DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
 
   @test_util.run_in_graph_and_eager_modes()
   def testSimpleNetworkBuilding(self):
     inputs = keras.engine.Input(shape=(32,))
     if context.executing_eagerly():
-      self.assertIsInstance(inputs, tf_base_layers._DeferredTensor)
+      self.assertIsInstance(inputs, base_layer.DeferredTensor)
       self.assertEqual(inputs.dtype.name, 'float32')
       self.assertEqual(inputs.shape.as_list(), [None, 32])
 
     x = keras.layers.Dense(2)(inputs)
     if context.executing_eagerly():
-      self.assertIsInstance(x, tf_base_layers._DeferredTensor)
+      self.assertIsInstance(x, base_layer.DeferredTensor)
       self.assertEqual(x.dtype.name, 'float32')
       self.assertEqual(x.shape.as_list(), [None, 2])
 
@@ -936,5 +948,34 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs[0].shape.as_list(), [10, 16])
       self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
+
+class GraphUtilsTest(test.TestCase):
+
+  def testGetReachableFromInputs(self):
+
+    with self.test_session():
+      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+      x_1 = pl_1 + pl_2
+      x_2 = pl_2 * 2
+      x_3 = pl_3 + 1
+      x_4 = x_1 + x_2
+      x_5 = x_3 * pl_1
+
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_1]),
+          {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_1, pl_2]),
+          {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_3]),
+          {pl_3, x_3, x_5, x_3.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([x_3]),
+          {x_3, x_5, x_5.op})
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 71de657da8..7c46743814 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -31,10 +31,10 @@ from tensorflow.python.keras._impl.keras.engine import training_arrays
 from tensorflow.python.keras._impl.keras.engine import training_eager
 from tensorflow.python.keras._impl.keras.engine import training_generator
 from tensorflow.python.keras._impl.keras.engine import training_utils
+from tensorflow.python.keras._impl.keras.engine.base_layer import DeferredTensor
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.layers.base import _DeferredTensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
@@ -891,15 +891,6 @@ class Model(Network):
     else:
       self._symbolic_set_inputs(inputs, training=training)
 
-  def _set_scope(self, scope=None):
-    """Modify the Layer scope creation logic to create ResourceVariables."""
-    super(Model, self)._set_scope(scope=scope)
-    # Subclassed Models create ResourceVariables by default. This makes it
-    # easier to use Models in an eager/graph agnostic way (since eager execution
-    # always uses ResourceVariables).
-    if not self._is_graph_network:
-      self._scope.set_use_resource(True)
-
   def _eager_set_inputs(self, inputs):
     """Set model's input and output specs based on the input data received.
 
@@ -933,11 +924,11 @@ class Model(Network):
     else:
       dummy_output_values = [dummy_output_values]
     self.outputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_output_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_output_values]
     self.inputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_input_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_input_values]
     self.input_names = [
         'input_%d' % (i + 1) for i in range(len(dummy_input_values))]
     self.output_names = [
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/_impl/keras/initializers.py
index 300bed5e14..ecb71d00e2 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/_impl/keras/initializers.py
@@ -201,6 +201,8 @@ def deserialize(config, custom_objects=None):
 
 @tf_export('keras.initializers.get')
 def get(identifier):
+  if identifier is None:
+    return None
   if isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 280f7ed1b1..c44808421f 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -29,16 +29,15 @@ from tensorflow.python.platform import test
 
 class KerasIntegrationTest(test.TestCase):
 
-  def test_vector_classification_declarative(self):
+  def test_vector_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -48,23 +47,22 @@ class KerasIntegrationTest(test.TestCase):
           keras.layers.Dense(y_train.shape[-1], activation='softmax')
       ])
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(10,),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(20,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.Dense(16, activation='relu')(inputs)
@@ -73,77 +71,78 @@ class KerasIntegrationTest(test.TestCase):
 
       model = keras.models.Model(inputs, outputs)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_temporal_classification_declarative(self):
+  def test_temporal_classification_sequential(self):
     with self.test_session():
-      np.random.seed(1336)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(4, 8),
+      np.random.seed(1337)
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(4, 10),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.LSTM(5, return_sequences=True,
                                   input_shape=x_train.shape[1:]))
       model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_image_classification_declarative(self):
+  def test_image_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(8, 8, 3),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(12, 12, 3),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.Conv2D(
-          8, 3,
+          4, 3,
+          padding='same',
           activation='relu',
           input_shape=x_train.shape[1:]))
-      model.add(keras.layers.BatchNormalization())
       model.add(keras.layers.Conv2D(
           8, 3,
           padding='same',
           activation='relu'))
-      model.add(keras.layers.GlobalMaxPooling2D())
+      model.add(keras.layers.Conv2D(
+          16, 3,
+          padding='same',
+          activation='relu'))
+      model.add(keras.layers.Flatten())
       model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_video_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(4, 8, 8, 3),
           num_classes=3)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.TimeDistributed(
@@ -159,22 +158,21 @@ class KerasIntegrationTest(test.TestCase):
                     optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.70)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       base_model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -189,27 +187,26 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.84)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(x_train.shape[1:])
       x = keras.layers.Dense(16,
@@ -225,12 +222,12 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_embedding_with_clipnorm(self):
     with self.test_session():
@@ -242,9 +239,9 @@ class KerasIntegrationTest(test.TestCase):
   def test_using_tf_layers_in_keras_sequential_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
 
@@ -254,25 +251,23 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_using_tf_layers_in_keras_functional_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.Input(shape=(10,))
       x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
@@ -281,12 +276,12 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 7cdebc6aa4..d202b6551d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -19,9 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
@@ -38,12 +39,232 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
 from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import convolutional as tf_convolutional_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Conv(Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    kernel_constraint: Optional projection function to be applied to the
+        kernel after being updated by an `Optimizer` (e.g. used to implement
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(Conv, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(
+        kernel_size, rank, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, rank, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(
+        dilation_rate, rank, 'dilation_rate')
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=self.rank + 2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis].value
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    self._convolution_op = nn_ops.Convolution(
+        input_shape,
+        filter_shape=self.kernel.get_shape(),
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format,
+                                                   self.rank + 2))
+    self.built = True
+
+  def call(self, inputs):
+    outputs = self._convolution_op(inputs, self.kernel)
+
+    if self.use_bias:
+      if self.data_format == 'channels_first':
+        if self.rank == 1:
+          # nn.bias_add does not accept a 1D input tensor.
+          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
+          outputs += bias
+        if self.rank == 2:
+          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
+        if self.rank == 3:
+          # As of Mar 2017, direct addition is significantly slower than
+          # bias_add when computing gradients. To use bias_add, we collapse Z
+          # and Y into a single dimension to obtain a 4D input tensor.
+          outputs_shape = outputs.shape.as_list()
+          if outputs_shape[0] is None:
+            outputs_shape[0] = -1
+          outputs_4d = array_ops.reshape(outputs,
+                                         [outputs_shape[0], outputs_shape[1],
+                                          outputs_shape[2] * outputs_shape[3],
+                                          outputs_shape[4]])
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
+          outputs = array_ops.reshape(outputs_4d, outputs_shape)
+      else:
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
-class Conv1D(tf_convolutional_layers.Conv1D, Layer):
+class Conv1D(Conv):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -74,6 +295,8 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
           where the model should not violate the temporal order.
           See [WaveNet: A Generative Model for Raw Audio, section
             2.1](https://arxiv.org/abs/1609.03499).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
       dilation_rate: an integer or tuple/list of a single integer, specifying
           the dilation rate to use for dilated convolution.
           Currently, specifying any `dilation_rate` value != 1 is
@@ -105,6 +328,7 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                kernel_size,
                strides=1,
                padding='valid',
+               data_format='channels_last',
                dilation_rate=1,
                activation=None,
                use_bias=True,
@@ -117,11 +341,12 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                bias_constraint=None,
                **kwargs):
     super(Conv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
-        data_format='channels_last',
+        data_format=data_format,
         dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
@@ -134,30 +359,9 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
-class Conv2D(tf_convolutional_layers.Conv2D, Layer):
+class Conv2D(Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -247,9 +451,8 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -267,31 +470,9 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
-class Conv3D(tf_convolutional_layers.Conv3D, Layer):
+class Conv3D(Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -388,9 +569,8 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3D, self).__init__(
+        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -408,32 +588,10 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2DTranspose',
            'keras.layers.Convolution2DTranspose')
-class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
+class Conv2DTranspose(Conv2D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -529,8 +687,6 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -548,31 +704,123 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2DTranspose, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def build(self, input_shape):
+    if len(input_shape) != 4:
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis]
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_height, out_width)
+      strides = (1, 1, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_height, out_width, self.filters)
+      strides = (1, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv2d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.Conv3DTranspose',
            'keras.layers.Convolution3DTranspose')
-class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
+class Conv3DTranspose(Conv3D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -679,8 +927,6 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -698,6 +944,313 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+  def build(self, input_shape):
+    if len(input_shape) != 5:
+      raise ValueError('Inputs should have rank 5, received input shape:',
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined, found None: ' + str(input_shape))
+    input_dim = input_shape[channel_axis]
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+    self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim})
+
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(
+          'bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    self.input_spec = InputSpec(ndim=5, axes={c_axis: inputs_shape[c_axis]})
+
+    depth = inputs_shape[d_axis]
+    height = inputs_shape[h_axis]
+    width = inputs_shape[w_axis]
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_depth = conv_utils.deconv_output_length(depth,
+                                                kernel_d,
+                                                self.padding,
+                                                stride_d)
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_depth, out_height,
+                      out_width)
+      strides = (1, 1, stride_d, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_depth, out_height, out_width,
+                      self.filters)
+      strides = (1, stride_d, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv3d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=5),
+        padding=self.padding.upper())
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[d_axis] = conv_utils.deconv_output_length(out_shape[d_axis],
+                                                          kernel_d,
+                                                          self.padding,
+                                                          stride_d)
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
+      if self.data_format == 'channels_first':
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1],
+            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+        ])
+      else:
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
+            outputs_shape[3], outputs_shape[4]
+        ])
+      outputs_4d = nn.bias_add(
+          outputs_4d,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+      outputs = array_ops.reshape(outputs_4d, outputs_shape)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[d_axis] = conv_utils.deconv_output_length(
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
+
+
+class SeparableConv(Conv):
+  """Abstract base layer for separable nD convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A tuple or list of integers specifying the spatial
+      dimensions of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: A tuple or list of integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(SeparableConv, self).__init__(
+        rank=rank,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        bias_constraint=bias_constraint,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.pointwise_initializer = initializers.get(pointwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.pointwise_regularizer = regularizers.get(pointwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.pointwise_constraint = constraints.get(pointwise_constraint)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis].value
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    depthwise_kernel_shape = self.kernel_size + (input_dim,
+                                                 self.depth_multiplier)
+    pointwise_kernel_shape = (
+        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
+
+    self.depthwise_kernel = self.add_variable(
+        name='depthwise_kernel',
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    self.pointwise_kernel = self.add_variable(
+        name='pointwise_kernel',
+        shape=pointwise_kernel_shape,
+        initializer=self.pointwise_initializer,
+        regularizer=self.pointwise_regularizer,
+        constraint=self.pointwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    raise NotImplementedError
+
   def get_config(self):
     config = {
         'filters': self.filters,
@@ -705,24 +1258,34 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         'strides': self.strides,
         'padding': self.padding,
         'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
         'activation': activations.serialize(self.activation),
         'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
         'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
         'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
         'bias_constraint': constraints.serialize(self.bias_constraint)
     }
-    base_config = super(Conv3DTranspose, self).get_config()
+    base_config = super(SeparableConv, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.SeparableConv1D',
            'keras.layers.SeparableConvolution1D')
-class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+class SeparableConv1D(SeparableConv):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -802,15 +1365,15 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -825,44 +1388,46 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides * 2 + (1,)
+      spatial_start_dim = 1
+    else:
+      strides = (1, 1) + self.strides * 2
+      spatial_start_dim = 2
+
+    # Explicitly broadcast inputs and kernels to 4D.
+    # TODO(fchollet): refactor when a native separable_conv1d op is available.
+    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
+    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
+    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
+    dilation_rate = (1,) + self.dilation_rate
+
+    outputs = nn.separable_conv2d(
+        inputs,
+        depthwise_kernel,
+        pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
 
 
 @tf_export('keras.layers.SeparableConv2D',
            'keras.layers.SeparableConvolution2D')
-class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
+class SeparableConv2D(SeparableConv):
   """Depthwise separable 2D convolution.
 
   Separable convolutions consist in first performing
@@ -959,15 +1524,15 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -982,47 +1547,30 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'dilation_rate':
-            self.dilation_rate,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def call(self, inputs):
+    # Apply the actual ops.
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides + (1,)
+    else:
+      strides = (1, 1) + self.strides
+    outputs = nn.separable_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        self.pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=self.dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
 
 
 @tf_export('keras.layers.DepthwiseConv2D')
@@ -1162,7 +1710,7 @@ class DepthwiseConv2D(Conv2D):
     self.built = True
 
   def call(self, inputs, training=None):
-    outputs = K.depthwise_conv2d(
+    outputs = backend.depthwise_conv2d(
         inputs,
         self.depthwise_kernel,
         strides=self.strides,
@@ -1171,7 +1719,7 @@ class DepthwiseConv2D(Conv2D):
         data_format=self.data_format)
 
     if self.bias:
-      outputs = K.bias_add(
+      outputs = backend.bias_add(
           outputs,
           self.bias,
           data_format=self.data_format)
@@ -1246,7 +1794,7 @@ class UpSampling1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], size, input_shape[2]])
 
   def call(self, inputs):
-    output = K.repeat_elements(inputs, self.size, axis=1)
+    output = backend.repeat_elements(inputs, self.size, axis=1)
     return output
 
   def get_config(self):
@@ -1315,7 +1863,8 @@ class UpSampling2D(Layer):
           [input_shape[0], height, width, input_shape[3]])
 
   def call(self, inputs):
-    return K.resize_images(inputs, self.size[0], self.size[1], self.data_format)
+    return backend.resize_images(
+        inputs, self.size[0], self.size[1], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1387,8 +1936,8 @@ class UpSampling3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.resize_volumes(inputs, self.size[0], self.size[1], self.size[2],
-                            self.data_format)
+    return backend.resize_volumes(
+        inputs, self.size[0], self.size[1], self.size[2], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1429,7 +1978,7 @@ class ZeroPadding1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
   def call(self, inputs):
-    return K.temporal_padding(inputs, padding=self.padding)
+    return backend.temporal_padding(inputs, padding=self.padding)
 
   def get_config(self):
     config = {'padding': self.padding}
@@ -1530,7 +2079,7 @@ class ZeroPadding2D(Layer):
           [input_shape[0], rows, cols, input_shape[3]])
 
   def call(self, inputs):
-    return K.spatial_2d_padding(
+    return backend.spatial_2d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
@@ -1648,7 +2197,7 @@ class ZeroPadding3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.spatial_3d_padding(
+    return backend.spatial_3d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index c74fc1e4c0..87b997232e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -24,6 +24,7 @@ import types as python_types
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -32,13 +33,14 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -94,7 +96,7 @@ class Masking(Layer):
 
 
 @tf_export('keras.layers.Dropout')
-class Dropout(tf_core_layers.Dropout, Layer):
+class Dropout(Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting
@@ -113,23 +115,39 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    # Inheritance call order:
-    # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(rate=rate,
-                                  noise_shape=noise_shape,
-                                  seed=seed,
-                                  **kwargs)
+    super(Dropout, self).__init__(**kwargs)
+    self.rate = rate
+    self.noise_shape = noise_shape
+    self.seed = seed
     self.supports_masking = True
 
+  def _get_noise_shape(self, inputs):
+    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
+    # which will override `self.noise_shape`, and allows for custom noise
+    # shapes with dynamically sized inputs.
+    if self.noise_shape is None:
+      return self.noise_shape
+    return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
+
   def call(self, inputs, training=None):
     if training is None:
       training = K.learning_phase()
-    output = super(Dropout, self).call(inputs, training=training)
+
+    def dropped_inputs():
+      return nn.dropout(inputs, 1  - self.rate,
+                        noise_shape=self._get_noise_shape(inputs),
+                        seed=self.seed)
+    output = tf_utils.smart_cond(training,
+                                 dropped_inputs,
+                                 lambda: array_ops.identity(inputs))
     # EagerTensor object has no attribute _uses_learning_phase
     if not context.executing_eagerly() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {
         'rate': self.rate,
@@ -479,7 +497,7 @@ class Permute(Layer):
 
 
 @tf_export('keras.layers.Flatten')
-class Flatten(tf_core_layers.Flatten, Layer):
+class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
   Example:
@@ -495,7 +513,25 @@ class Flatten(tf_core_layers.Flatten, Layer):
       # now: model.output_shape == (None, 65536)
   ```
   """
-  pass
+
+  def __init__(self, **kwargs):
+    super(Flatten, self).__init__(**kwargs)
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def call(self, inputs):
+    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    if not context.executing_eagerly():
+      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = [input_shape[0]]
+    if all(input_shape[1:]):
+      output_shape += [np.prod(input_shape[1:])]
+    else:
+      output_shape += [None]
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.RepeatVector')
@@ -611,10 +647,12 @@ class Lambda(Layer):
                         'must be a list, a tuple, or a function.')
       self._output_shape = output_shape
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     if self._output_shape is None:
+      if context.executing_eagerly():
+        raise NotImplementedError
       x = K.placeholder(shape=input_shape)
       x = self.call(x)
       if isinstance(x, list):
@@ -640,7 +678,7 @@ class Lambda(Layer):
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
-    if has_arg(self.function, 'mask'):
+    if generic_utils.has_arg(self.function, 'mask'):
       arguments['mask'] = mask
     return self.function(inputs, **arguments)
 
@@ -651,14 +689,14 @@ class Lambda(Layer):
 
   def get_config(self):
     if isinstance(self.function, python_types.LambdaType):
-      function = func_dump(self.function)
+      function = generic_utils.func_dump(self.function)
       function_type = 'lambda'
     else:
       function = self.function.__name__
       function_type = 'function'
 
     if isinstance(self._output_shape, python_types.LambdaType):
-      output_shape = func_dump(self._output_shape)
+      output_shape = generic_utils.func_dump(self._output_shape)
       output_shape_type = 'lambda'
     elif callable(self._output_shape):
       output_shape = self._output_shape.__name__
@@ -686,26 +724,27 @@ class Lambda(Layer):
     function_type = config.pop('function_type')
     if function_type == 'function':
       # Simple lookup in custom objects
-      function = deserialize_keras_object(
+      function = generic_utils.deserialize_keras_object(
           config['function'],
           custom_objects=custom_objects,
           printable_module_name='function in Lambda layer')
     elif function_type == 'lambda':
       # Unsafe deserialization from bytecode
-      function = func_load(config['function'], globs=globs)
+      function = generic_utils.func_load(config['function'], globs=globs)
     else:
       raise TypeError('Unknown function type:', function_type)
 
     output_shape_type = config.pop('output_shape_type')
     if output_shape_type == 'function':
       # Simple lookup in custom objects
-      output_shape = deserialize_keras_object(
+      output_shape = generic_utils.deserialize_keras_object(
           config['output_shape'],
           custom_objects=custom_objects,
           printable_module_name='output_shape function in Lambda layer')
     elif output_shape_type == 'lambda':
       # Unsafe deserialization from bytecode
-      output_shape = func_load(config['output_shape'], globs=globs)
+      output_shape = generic_utils.func_load(config['output_shape'],
+                                             globs=globs)
     else:
       output_shape = config['output_shape']
 
@@ -725,7 +764,7 @@ class Lambda(Layer):
 
 
 @tf_export('keras.layers.Dense')
-class Dense(tf_core_layers.Dense, Layer):
+class Dense(Layer):
   """Just your regular densely-connected NN layer.
 
   `Dense` implements the operation:
@@ -795,21 +834,74 @@ class Dense(tf_core_layers.Dense, Layer):
     if 'input_shape' not in kwargs and 'input_dim' in kwargs:
       kwargs['input_shape'] = (kwargs.pop('input_dim'),)
 
-    # Inheritance call order:
-    # 1) tf.layers.Dense, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dense, self).__init__(
-        units,
-        activation=activations.get(activation),
-        use_bias=use_bias,
-        kernel_initializer=initializers.get(kernel_initializer),
-        bias_initializer=initializers.get(bias_initializer),
-        kernel_regularizer=regularizers.get(kernel_regularizer),
-        bias_regularizer=regularizers.get(bias_regularizer),
-        activity_regularizer=regularizers.get(activity_regularizer),
-        kernel_constraint=constraints.get(kernel_constraint),
-        bias_constraint=constraints.get(bias_constraint),
-        **kwargs)
+        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    self.units = units
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
     self.supports_masking = True
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if input_shape[-1].value is None:
+      raise ValueError('The last dimension of the inputs to `Dense` '
+                       'should be defined. Found `None`.')
+    self.input_spec = InputSpec(min_ndim=2,
+                                axes={-1: input_shape[-1].value})
+    self.kernel = self.add_variable('kernel',
+                                    shape=[input_shape[-1].value, self.units],
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    if self.use_bias:
+      self.bias = self.add_variable('bias',
+                                    shape=[self.units,],
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    shape = inputs.get_shape().as_list()
+    if len(shape) > 2:
+      # Broadcasting is required for the inputs.
+      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
+                                                             [0]])
+      # Reshape the output back to the original ndim of the input.
+      if not context.executing_eagerly():
+        output_shape = shape[:-1] + [self.units]
+        outputs.set_shape(output_shape)
+    else:
+      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    if self.use_bias:
+      outputs = nn.bias_add(outputs, self.bias)
+    if self.activation is not None:
+      return self.activation(outputs)  # pylint: disable=not-callable
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input_shape must be defined, but saw: %s'
+          % input_shape)
+    return input_shape[:-1].concatenate(self.units)
 
   def get_config(self):
     config = {
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 551d1b1c3a..d22d8d12dc 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -129,7 +129,6 @@ class CoreLayersTest(test.TestCase):
     testing_utils.layer_test(
         keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
 
-  @tf_test_util.run_in_graph_and_eager_modes()
   def test_lambda(self):
     testing_utils.layer_test(
         keras.layers.Lambda,
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 540e2d945c..591bab7cd8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -102,7 +102,8 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(**kwargs)
+    dtype = kwargs.pop('dtype', K.floatx())
+    super(Embedding, self).__init__(dtype=dtype, **kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
@@ -120,8 +121,7 @@ class Embedding(Layer):
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
-        constraint=self.embeddings_constraint,
-        dtype=self.dtype)
+        constraint=self.embeddings_constraint)
     self.built = True
 
   def compute_mask(self, inputs, mask=None):
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 3b44b20bf8..b60d864ae5 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -19,17 +19,29 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.layers import normalization as tf_normalization_layers
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.layers.BatchNormalization')
-class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
+class BatchNormalization(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -37,28 +49,63 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   close to 0 and the activation standard deviation close to 1.
 
   Arguments:
-      axis: Integer, the axis that should be normalized
-          (typically the features axis).
-          For instance, after a `Conv2D` layer with
-          `data_format="channels_first"`,
-          set `axis=1` in `BatchNormalization`.
-      momentum: Momentum for the moving average.
-      epsilon: Small float added to variance to avoid dividing by zero.
-      center: If True, add offset of `beta` to normalized tensor.
-          If False, `beta` is ignored.
-      scale: If True, multiply by `gamma`.
-          If False, `gamma` is not used.
-          When the next layer is linear (also e.g. `nn.relu`),
-          this can be disabled since the scaling
-          will be done by the next layer.
-      beta_initializer: Initializer for the beta weight.
-      gamma_initializer: Initializer for the gamma weight.
-      moving_mean_initializer: Initializer for the moving mean.
-      moving_variance_initializer: Initializer for the moving variance.
-      beta_regularizer: Optional regularizer for the beta weight.
-      gamma_regularizer: Optional regularizer for the gamma weight.
-      beta_constraint: Optional constraint for the beta weight.
-      gamma_constraint: Optional constraint for the gamma weight.
+    axis: Integer, the axis that should be normalized
+        (typically the features axis).
+        For instance, after a `Conv2D` layer with
+        `data_format="channels_first"`,
+        set `axis=1` in `BatchNormalization`.
+    momentum: Momentum for the moving average.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+        If False, `gamma` is not used.
+        When the next layer is linear (also e.g. `nn.relu`),
+        this can be disabled since the scaling
+        will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    moving_mean_initializer: Initializer for the moving mean.
+    moving_variance_initializer: Initializer for the moving variance.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_momentum: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `momentum` is still applied
+      to get the means and variances for inference.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
+      which means batch normalization is performed across the whole batch. When
+      `virtual_batch_size` is not `None`, instead perform "Ghost Batch
+      Normalization", which creates virtual sub-batches which are each
+      normalized separately (with shared gamma, beta, and moving statistics).
+      Must divide the actual batch size during execution.
+    adjustment: A function taking the `Tensor` containing the (dynamic) shape of
+      the input tensor and returning a pair (scale, bias) to apply to the
+      normalized values (before gamma and beta), only during training. For
+      example, if axis==-1,
+        `adjustment = lambda shape: (
+          tf.random_uniform(shape[-1:], 0.93, 1.07),
+          tf.random_uniform(shape[-1:], -0.1, 0.1))`
+      will scale the normalized value by up to 7% up or down, then shift the
+      result by up to 0.1 (with independent scaling and bias for each feature
+      but shared across all examples), and finally apply gamma and/or beta. If
+      `None`, no adjustment is applied. Cannot be specified if
+      virtual_batch_size is specified.
 
   Input shape:
       Arbitrary. Use the keyword argument `input_shape`
@@ -87,33 +134,537 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
                gamma_regularizer=None,
                beta_constraint=None,
                gamma_constraint=None,
+               renorm=False,
+               renorm_clipping=None,
+               renorm_momentum=0.99,
+               fused=None,
+               trainable=True,
+               virtual_batch_size=None,
+               adjustment=None,
+               name=None,
                **kwargs):
-    self.supports_masking = True
     super(BatchNormalization, self).__init__(
-        axis=axis,
-        momentum=momentum,
-        epsilon=epsilon,
-        center=center,
-        scale=scale,
-        beta_initializer=initializers.get(beta_initializer),
-        gamma_initializer=initializers.get(gamma_initializer),
-        moving_mean_initializer=initializers.get(moving_mean_initializer),
-        moving_variance_initializer=initializers.get(
-            moving_variance_initializer),
-        beta_regularizer=regularizers.get(beta_regularizer),
-        gamma_regularizer=regularizers.get(gamma_regularizer),
-        beta_constraint=constraints.get(beta_constraint),
-        gamma_constraint=constraints.get(gamma_constraint),
-        **kwargs
-    )
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(axis, list):
+      self.axis = axis[:]
+    else:
+      self.axis = axis
+    self.momentum = momentum
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.moving_mean_initializer = initializers.get(moving_mean_initializer)
+    self.moving_variance_initializer = initializers.get(
+        moving_variance_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+    self.renorm = renorm
+    self.virtual_batch_size = virtual_batch_size
+    self.adjustment = adjustment
+    if fused is None:
+      fused = True
+    self.supports_masking = True
+
+    self.fused = fused
+    self._bessels_correction_test_only = True
+    self._use_resource_variables = None
+
+    if renorm:
+      renorm_clipping = renorm_clipping or {}
+      keys = ['rmax', 'rmin', 'dmax']
+      if set(renorm_clipping) - set(keys):
+        raise ValueError('renorm_clipping %s contains keys not in %s' %
+                         (renorm_clipping, keys))
+      self.renorm_clipping = renorm_clipping
+      self.renorm_momentum = renorm_momentum
+
+  def _add_tower_local_variable(self, *args, **kwargs):
+    tower_context = distribute_lib.get_tower_context()
+    with tower_context.tower_local_var_scope('mean'):
+      return self.add_variable(*args, **kwargs)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if not input_shape.ndims:
+      raise ValueError('Input has undefined rank:', input_shape)
+    ndims = len(input_shape)
+
+    # Convert axis to list and resolve negatives
+    if isinstance(self.axis, int):
+      self.axis = [self.axis]
+
+    if not isinstance(self.axis, list):
+      raise TypeError('axis must be int or list, type given: %s'
+                      % type(self.axis))
+
+    for idx, x in enumerate(self.axis):
+      if x < 0:
+        self.axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.axis) != len(set(self.axis)):
+      raise ValueError('Duplicate axis: %s' % self.axis)
+
+    if self.virtual_batch_size is not None:
+      if self.virtual_batch_size <= 0:
+        raise ValueError('virtual_batch_size must be a positive integer that '
+                         'divides the true batch size of the input Tensor')
+      # If using virtual batches, the first dimension must be the batch
+      # dimension and cannot be the batch norm axis
+      if 0 in self.axis:
+        raise ValueError('When using virtual_batch_size, the batch dimension '
+                         'must be 0 and thus axis cannot include 0')
+      if self.adjustment is not None:
+        raise ValueError('When using virtual_batch_size, adjustment cannot '
+                         'be specified')
+
+    if self.fused:
+      # Currently fused batch norm doesn't support renorm. It also only supports
+      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
+      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
+      # output back to its original shape accordingly.
+      self.fused = (not self.renorm and
+                    ndims == 4 and
+                    self.axis in [[1], [3]] and
+                    self.virtual_batch_size is None and
+                    self.adjustment is None)
+      # TODO(chrisying): fused batch norm is currently not supported for
+      # multi-axis batch norm and by extension virtual batches. In some cases,
+      # it might be possible to use fused batch norm but would require reshaping
+      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
+      # particularly tricky. A compromise might be to just support the most
+      # common use case (turning 5D w/ virtual batch to NCHW)
+
+    if self.fused:
+      if self.axis == [1]:
+        self._data_format = 'NCHW'
+      elif self.axis == [3]:
+        self._data_format = 'NHWC'
+      else:
+        raise ValueError('Unsupported axis, fused batch norm only supports '
+                         'axis == [1] or axis == [3]')
+
+    # Raise parameters of fp16 batch norm to fp32
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
+      param_dtype = dtypes.float32
+    else:
+      param_dtype = self.dtype or dtypes.float32
+
+    axis_to_dim = {x: input_shape[x].value for x in self.axis}
+    for x in axis_to_dim:
+      if axis_to_dim[x] is None:
+        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
+                         input_shape)
+    self.input_spec = InputSpec(ndim=ndims, axes=axis_to_dim)
+
+    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
+      # Single axis batch norm (most common/default use-case)
+      param_shape = (list(axis_to_dim.values())[0],)
+    else:
+      # Parameter shape is the original shape but with 1 in all non-axis dims
+      param_shape = [axis_to_dim[i] if i in axis_to_dim
+                     else 1 for i in range(ndims)]
+      if self.virtual_batch_size is not None:
+        # When using virtual batches, add an extra dim at index 1
+        param_shape.insert(1, 1)
+        for idx, x in enumerate(self.axis):
+          self.axis[idx] = x + 1      # Account for added dimension
+
+    # BUG: when using fused BN with Resource Variables with a dynamic
+    # `training` argument in call, the cond
+    # `smart_cond(
+    #     training,
+    #     _fused_batch_norm_training,
+    #     _fused_batch_norm_inference)` triggers None gradients for the
+    # variables gamma and beta.
+    # In this case we choose to force normal variables when possible.
+    # The bug will not occur of `training` is static, or when
+    # not using fused BN, or when in eager execution.
+    # TODO(fchollet): remove code below when bug is fixed.
+    use_resource = False
+    if context.executing_eagerly():
+      use_resource = True  # Eager execution requires resource variables.
+    elif not self.fused:
+      use_resource = True  # Issue only exists with fused BN.
+    elif self._use_resource_variables is True:
+      use_resource = True  # Case of a subclassed model, always use RVs.
+    if hasattr(self, '_scope'):
+      use_resource = None  # Legacy layers, leave it to `add_weight`.
+
+    if self.scale:
+      self.gamma = self.add_variable(
+          name='gamma',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          use_resource=use_resource,
+          trainable=True)
+    else:
+      self.gamma = None
+      if self.fused:
+        self._gamma_const = array_ops.constant(
+            1.0, dtype=param_dtype, shape=param_shape)
+
+    if self.center:
+      self.beta = self.add_variable(
+          name='beta',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          use_resource=use_resource,
+          trainable=True)
+    else:
+      self.beta = None
+      if self.fused:
+        self._beta_const = array_ops.constant(
+            0.0, dtype=param_dtype, shape=param_shape)
+
+    try:
+      # Disable variable partitioning when creating the moving mean and variance
+      if hasattr(self, '_scope') and self._scope:
+        partitioner = self._scope.partitioner
+        self._scope.set_partitioner(None)
+      else:
+        partitioner = None
+      self.moving_mean = self._add_tower_local_variable(
+          name='moving_mean',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_mean_initializer,
+          trainable=False)
+
+      self.moving_variance = self._add_tower_local_variable(
+          name='moving_variance',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_variance_initializer,
+          trainable=False)
+
+      if self.renorm:
+        # Create variables to maintain the moving mean and standard deviation.
+        # These are used in training and thus are different from the moving
+        # averages above. The renorm variables are colocated with moving_mean
+        # and moving_variance.
+        # NOTE: below, the outer `with device` block causes the current device
+        # stack to be cleared. The nested ones use a `lambda` to set the desired
+        # device and ignore any devices that may be set by the custom getter.
+        def _renorm_variable(name, shape):
+          var = self._add_tower_local_variable(
+              name=name,
+              shape=shape,
+              dtype=param_dtype,
+              initializer=init_ops.zeros_initializer(),
+              trainable=False)
+          return var
+
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_mean):
+          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
+          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
+        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
+        # renorm_stddev_weight. This allows us to (1) mix the average
+        # stddev with the minibatch stddev early in training, and (2) compute
+        # the unbiased average stddev by dividing renorm_stddev by the weight.
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_variance):
+          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
+          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
+                                                       ())
+    finally:
+      if partitioner:
+        self._scope.set_partitioner(partitioner)
+    self.built = True
+
+  def _assign_moving_average(self, variable, value, momentum):
+    with ops.name_scope(None, 'AssignMovingAvg',
+                        [variable, value, momentum]) as scope:
+      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
+      if decay.dtype != variable.dtype.base_dtype:
+        decay = math_ops.cast(decay, variable.dtype.base_dtype)
+      update_delta = (variable - value) * decay
+      return state_ops.assign_sub(variable, update_delta, name=scope)
+
+  def _fused_batch_norm(self, inputs, training):
+    """Returns the output of fused batch norm."""
+    beta = self.beta if self.center else self._beta_const
+    gamma = self.gamma if self.scale else self._gamma_const
+
+    def _fused_batch_norm_training():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          epsilon=self.epsilon,
+          data_format=self._data_format)
+
+    def _fused_batch_norm_inference():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          mean=self.moving_mean,
+          variance=self.moving_variance,
+          epsilon=self.epsilon,
+          is_training=False,
+          data_format=self._data_format)
+
+    output, mean, variance = tf_utils.smart_cond(
+        training, _fused_batch_norm_training, _fused_batch_norm_inference)
+    if not self._bessels_correction_test_only:
+      # Remove Bessel's correction to be consistent with non-fused batch norm.
+      # Note that the variance computed by fused batch norm is
+      # with Bessel's correction.
+      sample_size = math_ops.cast(
+          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
+      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
+      variance *= factor
+
+    training_value = tf_utils.constant_value(training)
+    if training_value is None:
+      momentum = tf_utils.smart_cond(training,
+                                     lambda: self.momentum,
+                                     lambda: 1.0)
+    else:
+      momentum = ops.convert_to_tensor(self.momentum)
+    if training_value or training_value is None:
+      mean_update = self._assign_moving_average(self.moving_mean, mean,
+                                                momentum)
+      variance_update = self._assign_moving_average(self.moving_variance,
+                                                    variance, momentum)
+      self.add_update(mean_update, inputs=True)
+      self.add_update(variance_update, inputs=True)
+
+    return output
+
+  def _renorm_correction_and_moments(self, mean, variance, training):
+    """Returns the correction and update values for renorm."""
+    stddev = math_ops.sqrt(variance + self.epsilon)
+    # Compute the average mean and standard deviation, as if they were
+    # initialized with this batch's moments.
+    mixed_renorm_mean = (self.renorm_mean +
+                         (1. - self.renorm_mean_weight) * mean)
+    mixed_renorm_stddev = (self.renorm_stddev +
+                           (1. - self.renorm_stddev_weight) * stddev)
+    # Compute the corrections for batch renorm.
+    r = stddev / mixed_renorm_stddev
+    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
+    # Ensure the corrections use pre-update moving averages.
+    with ops.control_dependencies([r, d]):
+      mean = array_ops.identity(mean)
+      stddev = array_ops.identity(stddev)
+    rmin, rmax, dmax = [self.renorm_clipping.get(key)
+                        for key in ['rmin', 'rmax', 'dmax']]
+    if rmin is not None:
+      r = math_ops.maximum(r, rmin)
+    if rmax is not None:
+      r = math_ops.minimum(r, rmax)
+    if dmax is not None:
+      d = math_ops.maximum(d, -dmax)
+      d = math_ops.minimum(d, dmax)
+    # When not training, use r=1, d=0.
+    r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
+    d = tf_utils.smart_cond(training,
+                            lambda: d,
+                            lambda: array_ops.zeros_like(d))
+
+    def _update_renorm_variable(var, weight, value):
+      """Updates a moving average and weight, returns the unbiased value."""
+      value = array_ops.identity(value)
+      def _do_update():
+        """Updates the var and weight, returns their updated ratio."""
+        # Update the variables without zero debiasing. The debiasing will be
+        # accomplished by dividing the exponential moving average by the weight.
+        # For example, after a single update, the moving average would be
+        # (1-decay) * value. and the weight will be 1-decay, with their ratio
+        # giving the value.
+        # Make sure the weight is not updated until before r and d computation.
+        with ops.control_dependencies([value]):
+          weight_value = array_ops.constant(1., dtype=weight.dtype)
+        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
+        new_weight = self._assign_moving_average(weight, weight_value,
+                                                 self.renorm_momentum)
+        # TODO(yuefengz): the updates to var and weighted can not be batched
+        # together if we fetch their updated values here. Consider calculating
+        # new values and delaying the updates.
+        return new_var / new_weight
+
+      def _fake_update():
+        return array_ops.identity(var)
+      return tf_utils.smart_cond(training, _do_update, _fake_update)
+
+    # TODO(yuefengz): colocate the operations
+    new_mean = _update_renorm_variable(self.renorm_mean,
+                                       self.renorm_mean_weight, mean)
+    new_stddev = _update_renorm_variable(self.renorm_stddev,
+                                         self.renorm_stddev_weight, stddev)
+    # Make sqrt(moving_variance + epsilon) = new_stddev.
+    new_variance = math_ops.square(new_stddev) - self.epsilon
+
+    return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=None):
     if training is None:
       training = K.learning_phase()
-    output = super(BatchNormalization, self).call(inputs, training=training)
+
+    in_eager_mode = context.executing_eagerly()
+    if self.virtual_batch_size is not None:
+      # Virtual batches (aka ghost batches) can be simulated by reshaping the
+      # Tensor and reusing the existing batch norm implementation
+      original_shape = [-1] + inputs.shape.as_list()[1:]
+      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
+
+      # Will cause errors if virtual_batch_size does not divide the batch size
+      inputs = array_ops.reshape(inputs, expanded_shape)
+
+      def undo_virtual_batching(outputs):
+        outputs = array_ops.reshape(outputs, original_shape)
+        return outputs
+
+    # Gradient bug when using fused BN with dynamic `training` and resource
+    # variables. TODO(fchollet): remove workaround when bug fixed.
+    use_fused_bn = (
+        self.fused and
+        (tf_utils.constant_value(training) is not None or
+         not isinstance(self.gamma, resource_variable_ops.ResourceVariable)))
+    if use_fused_bn:
+      outputs = self._fused_batch_norm(inputs, training=training)
+      if self.virtual_batch_size is not None:
+        # Currently never reaches here since fused_batch_norm does not support
+        # virtual batching
+        outputs = undo_virtual_batching(outputs)
+      if not context.executing_eagerly() and training is K.learning_phase():
+        outputs._uses_learning_phase = True  # pylint: disable=protected-access
+      return outputs
+
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+    reduction_axes = [i for i in range(ndims) if i not in self.axis]
+    if self.virtual_batch_size is not None:
+      del reduction_axes[1]     # Do not reduce along virtual batch dim
+
+    # Broadcasting only necessary for single-axis batch norm where the axis is
+    # not the last dimension
+    broadcast_shape = [1] * ndims
+    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          reduction_axes != list(range(ndims - 1))):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
+
+    def _compose_transforms(scale, offset, then_scale, then_offset):
+      if then_scale is not None:
+        scale *= then_scale
+        offset *= then_scale
+      if then_offset is not None:
+        offset += then_offset
+      return (scale, offset)
+
+    # Determine a boolean value for `training`: could be True, False, or None.
+    training_value = tf_utils.constant_value(training)
+    if training_value is not False:
+      if self.adjustment:
+        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
+        # Adjust only during training.
+        adj_scale = tf_utils.smart_cond(training,
+                                        lambda: adj_scale,
+                                        lambda: array_ops.ones_like(adj_scale))
+        adj_bias = tf_utils.smart_cond(training,
+                                       lambda: adj_bias,
+                                       lambda: array_ops.zeros_like(adj_bias))
+        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
+
+      # Some of the computations here are not necessary when training==False
+      # but not a constant. However, this makes the code simpler.
+      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
+      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+
+      moving_mean = self.moving_mean
+      moving_variance = self.moving_variance
+
+      mean = tf_utils.smart_cond(training,
+                                 lambda: mean,
+                                 lambda: moving_mean)
+      variance = tf_utils.smart_cond(training,
+                                     lambda: variance,
+                                     lambda: moving_variance)
+
+      if self.renorm:
+        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
+            mean, variance, training)
+        # When training, the normalized values (say, x) will be transformed as
+        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
+        # = x * (r * gamma) + (d * gamma + beta) with renorm.
+        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
+        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
+        scale, offset = _compose_transforms(r, d, scale, offset)
+      else:
+        new_mean, new_variance = mean, variance
+
+      if self.virtual_batch_size is not None:
+        # This isn't strictly correct since in ghost batch norm, you are
+        # supposed to sequentially update the moving_mean and moving_variance
+        # with each sub-batch. However, since the moving statistics are only
+        # used during evaluation, it is more efficient to just update in one
+        # step and should not make a significant difference in the result.
+        new_mean = math_ops.reduce_mean(new_mean,
+                                        axis=1, keep_dims=True)
+        new_variance = math_ops.reduce_mean(new_variance,
+                                            axis=1, keep_dims=True)
+
+      def _do_update(var, value):
+        if in_eager_mode and not self.trainable:
+          return
+
+        return self._assign_moving_average(var, value, self.momentum)
+
+      mean_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_mean, new_mean),
+          lambda: self.moving_mean)
+      variance_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_variance, new_variance),
+          lambda: self.moving_variance)
+      if not context.executing_eagerly():
+        self.add_update(mean_update, inputs=True)
+        self.add_update(variance_update, inputs=True)
+
+    else:
+      mean, variance = self.moving_mean, self.moving_variance
+
+    outputs = nn.batch_normalization(inputs,
+                                     _broadcast(mean),
+                                     _broadcast(variance),
+                                     offset,
+                                     scale,
+                                     self.epsilon)
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
+
+    if self.virtual_batch_size is not None:
+      outputs = undo_virtual_batching(outputs)
     if not context.executing_eagerly() and training is K.learning_phase():
-      output._uses_learning_phase = True  # pylint: disable=protected-access
-    return output
+      outputs._uses_learning_phase = True  # pylint: disable=protected-access
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
   def get_config(self):
     config = {
@@ -133,5 +684,19 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
         'beta_constraint': constraints.serialize(self.beta_constraint),
         'gamma_constraint': constraints.serialize(self.gamma_constraint)
     }
+    # Only add TensorFlow-specific parameters if they are set, so as to preserve
+    # model compatibility with external Keras.
+    if self.renorm:
+      config['renorm'] = True
+      config['renorm_clipping'] = self.renorm_clipping
+      config['renorm_momentum'] = self.renorm_momentum
+    if self.virtual_batch_size is not None:
+      config['virtual_batch_size'] = self.virtual_batch_size
+    # Note: adjustment is not serializable.
+    if self.adjustment is not None:
+      logging.warning('The `adjustment` function of this `BatchNormalization` '
+                      'layer cannot be serialized and has been omitted from '
+                      'the layer config. It will not be included when '
+                      're-creating the layer from the saved config.')
     base_config = super(BatchNormalization, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
index 2b3628c3f1..fa9277e3d1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
@@ -114,6 +114,26 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
+  def test_batchnorm_convnet_channel_last(self):
+    with self.test_session():
+      # keras.backend.set_learning_phase(True)
+
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(
+          axis=-1, input_shape=(4, 4, 3), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+      np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+      np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
     """
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index 15d5337976..86bc8a680a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -19,16 +19,98 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import pooling as tf_pooling_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Pooling1D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 1D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of a single integer,
+      representing the size of the pooling window.
+    strides: An integer or tuple/list of a single integer, specifying the
+      strides of the pooling operation.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling1D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
+    if strides is None:
+      strides = pool_size
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=3)
+
+  def call(self, inputs):
+    # There is no TF op for 1D pooling, hence we make the inputs 4D.
+    if self.data_format == 'channels_last':
+      # input is NWC, make it NHWC
+      inputs = array_ops.expand_dims(inputs, 1)
+      # pool on the W dim
+      pool_shape = (1, 1) + self.pool_size + (1,)
+      strides = (1, 1) + self.strides + (1,)
+      data_format = 'NHWC'
+    else:
+      # input is NCW, make it NCHW
+      inputs = array_ops.expand_dims(inputs, 2)
+      # pool on the W dim
+      pool_shape = (1, 1, 1) + self.pool_size
+      strides = (1, 1, 1) + self.strides
+      data_format = 'NCHW'
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=data_format)
+
+    if self.data_format == 'channels_last':
+      return array_ops.squeeze(outputs, 1)
+    else:
+      return array_ops.squeeze(outputs, 2)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
+                                           self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def get_config(self):
+    config = {
+        'strides': self.strides,
+        'pool_size': self.pool_size,
+        'padding': self.padding
+    }
+    base_config = super(Pooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
-class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
+class MaxPooling1D(Pooling1D):
   """Max pooling operation for temporal data.
 
   Arguments:
@@ -45,23 +127,20 @@ class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling1D, self).__init__(pool_size, strides, padding, **kwargs)
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
 
-  def get_config(self):
-    config = {
-        'strides': self.strides,
-        'pool_size': self.pool_size,
-        'padding': self.padding
-    }
-    base_config = super(MaxPooling1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling1D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
-class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
+class AveragePooling1D(Pooling1D):
   """Average pooling for temporal data.
 
   Arguments:
@@ -78,24 +157,104 @@ class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
+    super(AveragePooling1D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
+
+
+class Pooling2D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling2D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
-                                           **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=4)
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      pool_shape = (1,) + self.pool_size + (1,)
+      strides = (1,) + self.strides + (1,)
+    else:
+      pool_shape = (1, 1) + self.pool_size
+      strides = (1, 1) + self.strides
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, 4))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
 
   def get_config(self):
     config = {
-        'strides': self.strides,
         'pool_size': self.pool_size,
-        'padding': self.padding
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
     }
-    base_config = super(AveragePooling1D, self).get_config()
+    base_config = super(Pooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
-class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
+class MaxPooling2D(Pooling2D):
   """Max pooling operation for spatial data.
 
   Arguments:
@@ -142,26 +301,14 @@ class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling2D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling2D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
-class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
+class AveragePooling2D(Pooling2D):
   """Average pooling operation for spatial data.
 
   Arguments:
@@ -208,12 +355,96 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
+    super(AveragePooling2D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
+
+
+class Pooling3D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 3D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 3 integers:
+      (pool_depth, pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)`
+      while `channels_first` corresponds to
+      inputs with shape `(batch, channels, depth, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format='channels_last',
+               name=None, **kwargs):
+    super(Pooling3D, self).__init__(name=name, **kwargs)
     if data_format is None:
-      data_format = K.image_data_format()
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 3, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=5)
+
+  def call(self, inputs):
+    pool_shape = (1,) + self.pool_size + (1,)
+    strides = (1,) + self.strides + (1,)
+
+    if self.data_format == 'channels_first':
+      # TF does not support `channels_first` with 3D pooling operations,
+      # so we must handle this case manually.
+      # TODO(fchollet): remove this when TF pooling is feature-complete.
+      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper())
+
+    if self.data_format == 'channels_first':
+      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      len_dim1 = input_shape[2]
+      len_dim2 = input_shape[3]
+      len_dim3 = input_shape[4]
+    else:
+      len_dim1 = input_shape[1]
+      len_dim2 = input_shape[2]
+      len_dim3 = input_shape[3]
+    len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0],
+                                             self.padding, self.strides[0])
+    len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1],
+                                             self.padding, self.strides[1])
+    len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2],
+                                             self.padding, self.strides[2])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
 
   def get_config(self):
     config = {
@@ -222,12 +453,12 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
         'strides': self.strides,
         'data_format': self.data_format
     }
-    base_config = super(AveragePooling2D, self).get_config()
+    base_config = super(Pooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
-class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
+class MaxPooling3D(Pooling3D):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -270,26 +501,14 @@ class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling3D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling3D, self).__init__(
+        nn.max_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
-class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
+class AveragePooling3D(Pooling3D):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -332,30 +551,18 @@ class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(AveragePooling3D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(AveragePooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(AveragePooling3D, self).__init__(
+        nn.avg_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
-class _GlobalPooling1D(Layer):
+class GlobalPooling1D(Layer):
   """Abstract class for different global pooling 1D layers.
   """
 
   def __init__(self, **kwargs):
-    super(_GlobalPooling1D, self).__init__(**kwargs)
+    super(GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
 
   def compute_output_shape(self, input_shape):
@@ -368,7 +575,7 @@ class _GlobalPooling1D(Layer):
 
 @tf_export('keras.layers.GlobalAveragePooling1D',
            'keras.layers.GlobalAvgPool1D')
-class GlobalAveragePooling1D(_GlobalPooling1D):
+class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
   Input shape:
@@ -380,11 +587,11 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.mean(inputs, axis=1)
+    return backend.mean(inputs, axis=1)
 
 
 @tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
-class GlobalMaxPooling1D(_GlobalPooling1D):
+class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
   Input shape:
@@ -396,15 +603,15 @@ class GlobalMaxPooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.max(inputs, axis=1)
+    return backend.max(inputs, axis=1)
 
 
-class _GlobalPooling2D(Layer):
+class GlobalPooling2D(Layer):
   """Abstract class for different global pooling 2D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling2D, self).__init__(**kwargs)
+    super(GlobalPooling2D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
@@ -420,13 +627,13 @@ class _GlobalPooling2D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling2D, self).get_config()
+    base_config = super(GlobalPooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling2D',
            'keras.layers.GlobalAvgPool2D')
-class GlobalAveragePooling2D(_GlobalPooling2D):
+class GlobalAveragePooling2D(GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
   Arguments:
@@ -456,13 +663,13 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2])
+      return backend.mean(inputs, axis=[1, 2])
     else:
-      return K.mean(inputs, axis=[2, 3])
+      return backend.mean(inputs, axis=[2, 3])
 
 
 @tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
-class GlobalMaxPooling2D(_GlobalPooling2D):
+class GlobalMaxPooling2D(GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
   Arguments:
@@ -492,17 +699,17 @@ class GlobalMaxPooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2])
+      return backend.max(inputs, axis=[1, 2])
     else:
-      return K.max(inputs, axis=[2, 3])
+      return backend.max(inputs, axis=[2, 3])
 
 
-class _GlobalPooling3D(Layer):
+class GlobalPooling3D(Layer):
   """Abstract class for different global pooling 3D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling3D, self).__init__(**kwargs)
+    super(GlobalPooling3D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
@@ -518,13 +725,13 @@ class _GlobalPooling3D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling3D, self).get_config()
+    base_config = super(GlobalPooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling3D',
            'keras.layers.GlobalAvgPool3D')
-class GlobalAveragePooling3D(_GlobalPooling3D):
+class GlobalAveragePooling3D(GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
   Arguments:
@@ -554,13 +761,13 @@ class GlobalAveragePooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2, 3])
+      return backend.mean(inputs, axis=[1, 2, 3])
     else:
-      return K.mean(inputs, axis=[2, 3, 4])
+      return backend.mean(inputs, axis=[2, 3, 4])
 
 
 @tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
-class GlobalMaxPooling3D(_GlobalPooling3D):
+class GlobalMaxPooling3D(GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
   Arguments:
@@ -590,9 +797,9 @@ class GlobalMaxPooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2, 3])
+      return backend.max(inputs, axis=[1, 2, 3])
     else:
-      return K.max(inputs, axis=[2, 3, 4])
+      return backend.max(inputs, axis=[2, 3, 4])
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index 641b563a25..4c68c18825 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -435,8 +435,8 @@ class RNNTest(test.TestCase):
     cells[0].add_update(update_1, inputs=x)
     cells[0].add_update(update_2)
     self.assertEqual(len(layer.updates), 2)
-    self.assertEqual(layer.get_updates_for(None), [update_2])
-    self.assertEqual(layer.get_updates_for(x), [update_1])
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    self.assertEqual(len(layer.get_updates_for(x)), 1)
 
   def test_rnn_dynamic_trainability(self):
     layer_class = keras.layers.SimpleRNN
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index c510e464ae..9aee5f03b6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -23,11 +23,11 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -213,7 +213,7 @@ class TimeDistributed(Wrapper):
         input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
-      input_uid = tf_layers_util.object_list_uid(inputs)
+      input_uid = base_layer.object_list_uid(inputs)
       inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
index 4445900330..bc8698f235 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -607,12 +607,6 @@ class CustomCallSignatureTests(test.TestCase):
     self.assertAllClose(10. * expected_output, self.evaluate(output))
     output = model(first, second=second, training=False)
     self.assertAllClose(expected_output, self.evaluate(output))
-    if not context.executing_eagerly():
-      six.assertCountEqual(self, [first, second], model.inputs)
-    with self.assertRaises(TypeError):
-      # tf.layers.Layer expects an "inputs" argument, so all-keywords doesn't
-      # work at the moment.
-      model(first=first, second=second, fiddle_with_output='yes')
 
   @test_util.run_in_graph_and_eager_modes()
   def test_inputs_in_signature(self):
@@ -622,10 +616,14 @@ class CustomCallSignatureTests(test.TestCase):
       def call(self, inputs, some_other_arg, training=False):
         return inputs
 
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
     model = HasInputsAndOtherPositional()
     with self.assertRaisesRegexp(
         TypeError, 'everything else as a keyword argument'):
-      model(array_ops.ones([]), array_ops.ones([]))
+      x1, x2 = keras.Input((1, 1)), keras.Input((1, 1))
+      model(x1, x2)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_kwargs_in_signature(self):
@@ -649,13 +647,14 @@ class CustomCallSignatureTests(test.TestCase):
       def call(self, x, *args, **kwargs):
         return [x] + list(args)
 
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
     model = HasArgs()
-    arg1 = array_ops.ones([])
-    arg2 = array_ops.ones([])
-    arg3 = array_ops.ones([])
-    model(arg1, arg2, arg3, a=3)
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    model(x1, x2, x3, a=3)
     if not context.executing_eagerly():
-      six.assertCountEqual(self, [arg1, arg2, arg3], model.inputs)
+      six.assertCountEqual(self, [x1, x2, x3], model.inputs)
 
   def test_args_and_keywords_in_signature(self):
 
@@ -666,11 +665,9 @@ class CustomCallSignatureTests(test.TestCase):
 
     with context.graph_mode():
       model = HasArgs()
-      arg1 = array_ops.ones([])
-      arg2 = array_ops.ones([])
-      arg3 = array_ops.ones([])
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
       with self.assertRaisesRegexp(TypeError, 'args and arguments with'):
-        model(arg1, arg2, arg3, a=3)
+        model(x1, x2, x3, a=3)
 
   def test_training_no_default(self):
 
@@ -694,11 +691,9 @@ class CustomCallSignatureTests(test.TestCase):
 
     with context.graph_mode():
       model = TrainingNoDefaultWithPositional()
-      arg1 = array_ops.ones([])
-      arg2 = array_ops.ones([])
-      arg3 = array_ops.ones([])
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
       with self.assertRaisesRegexp(TypeError, 'after a non-input'):
-        model(arg1, arg2, arg3)
+        model(x1, x2, x3)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
index 583079d962..8882a3a46b 100644
--- a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
@@ -21,17 +21,146 @@ from __future__ import print_function
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
-# pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.layers.utils import conv_input_length
-from tensorflow.python.layers.utils import conv_output_length
-from tensorflow.python.layers.utils import deconv_output_length as deconv_length
-from tensorflow.python.layers.utils import normalize_tuple
+from tensorflow.python.keras._impl.keras import backend
+
+
+def convert_data_format(data_format, ndim):
+  if data_format == 'channels_last':
+    if ndim == 3:
+      return 'NWC'
+    elif ndim == 4:
+      return 'NHWC'
+    elif ndim == 5:
+      return 'NDHWC'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  elif data_format == 'channels_first':
+    if ndim == 3:
+      return 'NCW'
+    elif ndim == 4:
+      return 'NCHW'
+    elif ndim == 5:
+      return 'NCDHW'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  else:
+    raise ValueError('Invalid data_format:', data_format)
+
+
+def normalize_tuple(value, n, name):
+  """Transforms a single integer or iterable of integers into an integer tuple.
+
+  Arguments:
+    value: The value to validate and convert. Could an int, or any iterable
+      of ints.
+    n: The size of the tuple to be returned.
+    name: The name of the argument being validated, e.g. "strides" or
+      "kernel_size". This is only used to format error messages.
+
+  Returns:
+    A tuple of n integers.
+
+  Raises:
+    ValueError: If something else than an int/long or iterable thereof was
+      passed.
+  """
+  if isinstance(value, int):
+    return (value,) * n
+  else:
+    try:
+      value_tuple = tuple(value)
+    except TypeError:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    if len(value_tuple) != n:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    for single_value in value_tuple:
+      try:
+        int(single_value)
+      except (ValueError, TypeError):
+        raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                         str(n) + ' integers. Received: ' + str(value) + ' '
+                         'including element ' + str(single_value) + ' of type' +
+                         ' ' + str(type(single_value)))
+    return value_tuple
+
+
+def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
+  """Determines output length of a convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+      dilation: dilation rate, integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+  if padding == 'same':
+    output_length = input_length
+  elif padding == 'valid':
+    output_length = input_length - dilated_filter_size + 1
+  elif padding == 'full':
+    output_length = input_length + dilated_filter_size - 1
+  return (output_length + stride - 1) // stride
+
+
+def conv_input_length(output_length, filter_size, padding, stride):
+  """Determines input length of a convolution given output length.
+
+  Arguments:
+      output_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The input length (integer).
+  """
+  if output_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  if padding == 'same':
+    pad = filter_size // 2
+  elif padding == 'valid':
+    pad = 0
+  elif padding == 'full':
+    pad = filter_size - 1
+  return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+def deconv_output_length(input_length, filter_size, padding, stride):
+  """Determines output length of a transposed convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  input_length *= stride
+  if padding == 'valid':
+    input_length += max(filter_size - stride, 0)
+  elif padding == 'full':
+    input_length -= (stride + filter_size - 2)
+  return input_length
 
 
 def normalize_data_format(value):
   if value is None:
-    value = K.image_data_format()
+    value = backend.image_data_format()
   data_format = value.lower()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('The `data_format` argument must be one of '
diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
new file mode 100644
index 0000000000..8da5f77777
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
@@ -0,0 +1,74 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow-related utilities."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import smart_cond as smart_module
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+
+
+def smart_cond(pred, true_fn=None, false_fn=None, name=None):
+  """Return either `true_fn()` if predicate `pred` is true else `false_fn()`.
+
+  If `pred` is a bool or has a constant value, we return either `true_fn()`
+  or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both.
+
+  Arguments:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix when using `tf.cond`.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`.
+
+  Raises:
+    TypeError: If `true_fn` or `false_fn` is not callable.
+  """
+  if isinstance(pred, variables.Variable):
+    return control_flow_ops.cond(
+        pred, true_fn=true_fn, false_fn=false_fn, name=name)
+  return smart_module.smart_cond(
+      pred, true_fn=true_fn, false_fn=false_fn, name=name)
+
+
+def constant_value(pred):
+  """Return the bool value for `pred`, or None if `pred` had a dynamic value.
+
+  Arguments:
+    pred: A scalar, either a Python bool or a TensorFlow boolean variable
+      or tensor, or the Python integer 1 or 0.
+
+  Returns:
+    True or False if `pred` has a constant boolean value, None otherwise.
+
+  Raises:
+    TypeError: If `pred` is not a Variable, Tensor or bool, or Python
+      integer 1 or 0.
+  """
+  # Allow integer booleans.
+  if isinstance(pred, int):
+    if pred == 1:
+      pred = True
+    elif pred == 0:
+      pred = False
+
+  if isinstance(pred, variables.Variable):
+    return None
+  return smart_module.smart_constant_value(pred)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index ec741d3265..64db49c900 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -12,148 +12,91 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import copy
-import re
-import weakref
 
-import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import utils as layers_util
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.Layer')
-class Layer(checkpointable.CheckpointableBase):
-  """Base layer class.
+InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
-  This is the class from which all layers inherit, implementing common
-  infrastructure functionality.
 
-  A layer is a class implementing common neural networks operations, such
-  as convolution, batch norm, etc. These operations require managing variables,
-  losses, and updates, as well as applying TensorFlow ops to input tensors.
+@tf_export('layers.Layer')
+class Layer(base_layer.Layer):
+  """Base layer class.
 
-  Users will just instantiate it and then treat it as a callable.
+  It is considered legacy, and we recommend the use of `tf.keras.layers.Layer`
+  instead.
 
-  We recommend that descendants of Layer implement the following methods:
-  * `__init__()`: Save configuration in member variables
-  * `build()`: Called once from `__call__`, when we know the shapes of inputs
-    and `dtype`. Should have the calls to `add_variable()`, and then
-    call the super's `build()` (which sets `self.built = True`, which is
-    nice in case the user wants to call `build()` manually before the
-    first `__call__`).
-  * `call()`: Called in `__call__` after making sure `build()` has been called
-    once. Should actually perform the logic of applying the layer to the
-    input tensors (which should be passed in as the first argument).
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
 
   Read-only properties:
-    `name`: The name of the layer (string).
-    `dtype`: Default dtype of the layer (default of `None` means use the
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
-    `trainable_variables`: List of trainable variables.
-    `non_trainable_variables`: List of non-trainable variables.
-    `variables`: List of all variables of this layer, trainable and
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
       non-trainable.
-    `updates`: List of update ops of this layer.
-    `losses`: List of losses added by this layer.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
 
   Mutable properties:
-    `trainable`: Whether the layer should be trained (boolean).
-    `input_spec`: Optional (list of) `InputSpec` object(s) specifying the
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
       constraints on inputs that can be accepted by the layer.
   """
 
   def __init__(self, trainable=True, name=None, dtype=None,
-               activity_regularizer=None, **kwargs):
-    # We use a kwargs dict here because these kwargs only exist
-    # for compatibility reasons.
-    # The list of kwargs is subject to changes in the future.
-    # We do not want to commit to it or to expose the list to users at all.
-    # Note this is exactly as safe as defining kwargs in the function signature,
-    # the only difference being that the list of valid kwargs is defined
-    # below rather rather in the signature, and default values are defined
-    # in calls to kwargs.get().
-    allowed_kwargs = {
-        '_scope',
-        '_reuse',
-        'input_shape',  # For compatibility with Keras `Sequential` model.
-        'batch_size',  # For compatibility with Keras `Sequential` model.
-    }
-    for kwarg in kwargs:
-      if kwarg not in allowed_kwargs:
-        raise TypeError('Keyword argument not understood:', kwarg)
-
-    # Mutable properties
-    # Indicates whether the layer's weights are updated during training
-    # and whether the layer's updates are run during training
-    self.trainable = trainable
-    # A stateful layer is a layer whose updates are run during inference too,
-    # for instance stateful RNNs.
-    self.stateful = False
-    # Indicates whether `build` needs to be called upon layer call, to create
-    # the layer's weights.
-    self.built = False
-    # Provides information about which inputs are compatible with the layer.
-    self.input_spec = None
-
-    if activity_regularizer and context.executing_eagerly():
-      raise ValueError(
-          ('Activity regularization is not supported when executing eagerly. '
-           'Got activity_regularizer=%s') % (activity_regularizer,))
-    self._activity_regularizer = activity_regularizer
+               **kwargs):
+    # For backwards compatibility, legacy layers do not use `ResourceVariable`
+    # by default.
+    self._use_resource_variables = False
+    scope = kwargs.pop('_scope', None)
+    self._reuse = kwargs.pop('_reuse', None)
+
+    # Avoid an incorrect lint error
     self._trainable_weights = []
-    self._non_trainable_weights = []
-    self._updates = []
-    # When executing eagerly, _losses is a list of zero-argument lambdas which
-    # return tensors. When using graph execution, _losses is a list of ops.
-    self._losses = []
-    self._reuse = kwargs.get('_reuse')
-    self._graph = None  # Will be set at build time.
-    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
-    self._call_fn_args = estimator_util.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in self._call_fn_args or
-                                   hasattr(self, 'compute_mask'))
-    self._call_has_scope_arg = 'scope' in self._call_fn_args
-
-    # These lists will be filled via successive calls
-    # to self._add_inbound_node().
-    self._inbound_nodes = []
-    self._outbound_nodes = []
+    self.built = False
 
-    self._init_set_name(name)
+    super(Layer, self).__init__(trainable=trainable, name=name, dtype=dtype,
+                                **kwargs)
 
-    # Determine variable scope.
-    scope = kwargs.get('_scope')
+    self._graph = None
+    self._call_has_scope_arg = 'scope' in self._call_fn_args
     if scope:
       with vs.variable_scope(scope) as captured_scope:
         self._scope = captured_scope
     else:
       self._scope = None
+    self._current_scope = None
 
-    # Set `_batch_input_shape` attribute
-    # for compatibility with Keras `Sequential` model.
-    if 'input_shape' in kwargs:
-      batch_size = kwargs.get('batch_size')
-      self._batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
+  @property
+  def graph(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.graph not supported when executing eagerly.')
+    return self._graph
 
   def _init_set_name(self, name):
     # Determine layer name (non-unique).
@@ -166,18 +109,15 @@ class Layer(checkpointable.CheckpointableBase):
       self._name, base_name = self._make_unique_name()
     self._base_name = base_name
 
-  @property
-  def dtype(self):
-    return self._dtype
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def activity_regularizer(self):
-    """Optional regularizer function for the output of this layer."""
-    return self._activity_regularizer
+  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
+                        namespace='', zero_based=False):
+    base_name = base_layer.to_snake_case(self.__class__.__name__)
+    name = base_layer.unique_layer_name(base_name,
+                                        name_uid_map=name_uid_map,
+                                        avoid_names=avoid_names,
+                                        namespace=namespace,
+                                        zero_based=zero_based)
+    return (name, base_name)
 
   @property
   def scope_name(self):
@@ -189,271 +129,16 @@ class Layer(checkpointable.CheckpointableBase):
                        'querying `scope_name`.')
     return self._scope.name
 
-  @property
-  def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
-
-  @property
-  def non_trainable_weights(self):
-    if self.trainable:
-      return self._non_trainable_weights
-    else:
-      return self._trainable_weights + self._non_trainable_weights
-
-  @property
-  def trainable_variables(self):
-    return self.trainable_weights
-
-  @property
-  def non_trainable_variables(self):
-    return self.non_trainable_weights
-
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.trainable_weights + self.non_trainable_weights
-
-  @property
-  def variables(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.weights
-
-  @property
-  def updates(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates
-
-  def add_update(self, updates, inputs=None):
-    """Add update op(s), potentially dependent on layer inputs.
-
-    Weight updates (for instance, the updates of the moving mean and variance
-    in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing the same layer on
-    different inputs `a` and `b`, some entries in `layer.updates` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_updates_for` method allows to retrieve the updates relevant to a
-    specific set of inputs.
-
-    This call is ignored in Eager mode.
-
-    Arguments:
-      updates: Update op, or list/tuple of update ops.
-      inputs: If anything other than None is passed, it signals the updates
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for BatchNormalization updates, for instance.
-        If None, the updates will be taken into account unconditionally,
-        and you are responsible for making sure that any dependency they might
-        have is available at runtime.
-        A step counter might fall into this category.
-    """
-    if context.executing_eagerly():
-      return  # Updates already applied when in eager mode.
-
-    updates = _to_list(updates)
-    updates = [x if isinstance(x, ops.Operation)
-               else ops.convert_to_tensor(x) for x in updates]
-    self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
-    else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
-
-  def get_updates_for(self, inputs):
-    """Retrieves updates relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of update ops of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
-
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
-
-    if inputs is None:
-      # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
-
-    # Requesting input-conditional updates.
-    inputs = nest.flatten(inputs)
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
-
-  @property
-  def losses(self):
-    """Losses which are associated with this `Layer`.
-
-    Note that when executing eagerly, getting this property evaluates
-    regularizers. When using graph execution, variable regularization ops have
-    already been created and are simply returned here.
-
-    Returns:
-      A list of tensors.
-    """
-    if context.executing_eagerly():
-      # _losses may only contain variable regularization losses when executing
-      # eagerly, and they have been saved as lambdas to be executed when
-      # requested.
-      return [regularizer() for regularizer in self._losses]
-    else:
-      return self._losses
-
   def add_loss(self, losses, inputs=None):
-    """Add loss tensor(s), potentially dependent on layer inputs.
-
-    Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer. Hence, when reusing the same
-    layer on different inputs `a` and `b`, some entries in `layer.losses` may
-    be dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_losses_for` method allows to retrieve the losses relevant to a
-    specific set of inputs.
-
-    Note that `add_loss` is not supported when executing eagerly. Instead,
-    variable regularizers may be added through `add_variable`. Activity
-    regularization is not supported directly (but such losses may be returned
-    from `Layer.call()`).
-
-    Arguments:
-      losses: Loss tensor, or list/tuple of tensors.
-      inputs: If anything other than None is passed, it signals the losses
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for activity regularization losses, for instance.
-        If `None` is passed, the losses are assumed
-        to be unconditional, and will apply across all dataflows of the layer
-        (e.g. weight regularization losses).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      # TODO(fchollet): it should be possible (and highly desirable) to support
-      # `add_loss` in eager mode. This allows great convenience and flexibility
-      # in defining custom losses on the fly (e.g. in VAEs).
-      # Simply appending the loss value to `self._losses`
-      # is the correct behavior.
-      # The only caveat is that we need to force the user to only call
-      # `add_loss` from inside a model or Layer's `call` method
-      # (otherwise the loss computation cannot be backproped through).
-      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
-
-    losses = _to_list(losses)
-    self._losses += losses
-    if inputs is None:
-      for loss in losses:
-        loss._unconditional_loss = True  # pylint: disable=protected-access
-    else:
-      for loss in losses:
-        loss._unconditional_loss = False  # pylint: disable=protected-access
+    previous_losses_length = len(self._losses)
+    super(Layer, self).add_loss(losses, inputs=inputs)
     # TODO(fchollet): deprecate collection below.
-    _add_elements_to_collection(losses, ops.GraphKeys.REGULARIZATION_LOSSES)
-
-  def get_losses_for(self, inputs):
-    """Retrieves losses relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of loss tensors of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
-    if inputs is None:
-      # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
-
-    # Requesting input-conditional losses.
-    inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
-
-  def build(self, _):
-    """Creates the variables of the layer."""
-    self.built = True
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """The logic of the layer lives here.
+    new_losses = self._losses[previous_losses_length:]
+    _add_elements_to_collection(new_losses, ops.GraphKeys.REGULARIZATION_LOSSES)
 
-    Arguments:
-      inputs: input tensor(s).
-      **kwargs: additional keyword arguments.
-
-    Returns:
-      Output tensor(s).
-    """
-    return inputs
-
-  def _name_scope_name(self, current_variable_scope):
+  def _name_scope(self):
     """Determines op naming for the Layer."""
-    return current_variable_scope.original_name_scope
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
-
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`.  It need not
-        be fully defined (e.g. the batch size may be unknown).
-
-    Returns:
-      A (possibly nested tuple of) `TensorShape`.
-
-    Raises:
-      TypeError: if `input_shape` is not a (possibly nested tuple of)
-        `TensorShape`.
-      ValueError: if `input_shape` is incomplete or is incompatible with the
-        the layer.
-    """
-    raise NotImplementedError
-
-  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
-                        namespace='', zero_based=False):
-    base_name = _to_snake_case(self.__class__.__name__)
-    name = _unique_layer_name(base_name, name_uid_map=name_uid_map,
-                              avoid_names=avoid_names, namespace=namespace,
-                              zero_based=zero_based)
-    return (name, base_name)
+    return self._current_scope.original_name_scope
 
   def _set_scope(self, scope=None):
     if self._scope is None:
@@ -467,10 +152,11 @@ class Layer(checkpointable.CheckpointableBase):
             scope, default_name=self._base_name) as captured_scope:
           self._scope = captured_scope
 
-  def add_variable(self, name, shape, dtype=None,
-                   initializer=None, regularizer=None,
-                   trainable=True, constraint=None,
-                   partitioner=None):
+  def add_weight(self, name, shape, dtype=None,
+                 initializer=None, regularizer=None,
+                 trainable=True, constraint=None,
+                 use_resource=None,
+                 partitioner=None):
     """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
@@ -486,6 +172,7 @@ class Layer(checkpointable.CheckpointableBase):
         then this parameter is ignored and any added variables are also
         marked as non-trainable.
       constraint: constraint instance (callable).
+      use_resource: Whether to use `ResourceVariable`.
       partitioner: (optional) partitioner instance (callable).  If
         provided, when the requested variable is created it will be split
         into multiple partitions according to `partitioner`.  In this case,
@@ -504,10 +191,6 @@ class Layer(checkpointable.CheckpointableBase):
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
     """
-
-    # `init_graph` should point to the graph in which variable initialization
-    # will occur; it should be None if and only if initialization will take
-    # place in the eager context.
     init_graph = None
     if not context.executing_eagerly():
       default_graph = ops.get_default_graph()
@@ -530,71 +213,43 @@ class Layer(checkpointable.CheckpointableBase):
 
     self._set_scope(None)
     reuse = self.built or self._reuse
+    prev_len_trainable = len(self._trainable_weights)
     with vs.variable_scope(
         self._scope, reuse=reuse, auxiliary_name_scope=False) as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        variable = self._add_variable_with_custom_getter(
-            name=name,
-            shape=shape,
-            getter=vs.get_variable,
-            # Manage errors in Layer rather than Checkpointable.
-            overwrite=True,
-            initializer=initializer,
+      self._current_scope = scope
+      with ops.name_scope(self._name_scope()):
+        use_resource = (use_resource or
+                        self._use_resource_variables or
+                        scope.use_resource)
+        variable = super(Layer, self).add_weight(
+            name,
+            shape,
             dtype=dtypes.as_dtype(dtype),
+            initializer=initializer or scope.initializer,
+            trainable=trainable,
             constraint=constraint,
-            trainable=trainable and self.trainable,
-            partitioner=partitioner)
-
-        if init_graph is not None:  # pylint: disable=protected-access
-          # The variable was created and initialized in a graph.
-
-          if variable in existing_variables:
-            # To match the behavior of tf.get_variable(), we only apply
-            # regularization if the variable is newly created.
-            return variable
-
+            partitioner=partitioner,
+            use_resource=use_resource,
+            getter=vs.get_variable)
+
+        if regularizer:
+          if context.executing_eagerly() or variable not in existing_variables:
+            self._handle_weight_regularization(name, variable, regularizer)
+
+        if init_graph is not None:
+          # Handle edge case where a custom getter has overridden `trainable`.
+          # There is one known occurrence of this, in unit test
+          # testBasicRNNCellNotTrainable in
+          # contrib.rnn.python.kernel_tests.core_rnn_cell_test
           with init_graph.as_default():
             trainable_variables = tf_variables.trainable_variables()
           if (trainable and self.trainable and
               variable not in trainable_variables):
             # A custom getter / variable scope overrode the trainable flag.
-            trainable = False
-
-          if regularizer:
-            if isinstance(variable, tf_variables.PartitionedVariable):
-              for v in variable:
-                with ops.colocate_with(v.op):
-                  with ops.name_scope(name + '/Regularizer'):
-                    regularization = regularizer(v)
-                if regularization is not None:
-                  self.add_loss(regularization)
-            else:
-              with ops.colocate_with(variable.op):
-                with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(variable)
-              if regularization is not None:
-                self.add_loss(regularization)
-        elif regularizer:  # and initialization took place in an eager context
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            raise RuntimeError(
-                'Partitioned variable regularization is not yet '
-                'supported when executing eagerly. File a feature request '
-                'if this is important to you.')
-          # Save a zero-argument lambda which runs the regularizer on the
-          # variable, to be executed when `Layer.losses` is requested.
-          # This makes losses responsive to variable updates when executing
-          # eagerly.
-          #
-          # TODO(akshayka): Do the same for graphs as well, so that losses
-          # collected in a while_loop can be run outside its control flow
-          # context and so that losses won't be swallowed up by graph functions
-          # (i.e., `.losses()` should always create regularizers).
-          self._losses.append(lambda: regularizer(variable))
-
-    if trainable:
-      self._trainable_weights.append(variable)
-    else:
-      self._non_trainable_weights.append(variable)
+            extra_trainable_vars = self._trainable_weights[prev_len_trainable:]
+            self._trainable_weights = self._trainable_weights[
+                :prev_len_trainable]
+            self._non_trainable_weights += extra_trainable_vars
     return variable
 
   def __call__(self, inputs, *args, **kwargs):
@@ -622,35 +277,14 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     self._set_scope(kwargs.pop('scope', None))
-    input_list = nest.flatten(inputs)
 
-    build_graph = not context.executing_eagerly()
-    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
-    # which don't use an "inputs" argument.
-    in_deferred_mode = isinstance(input_list[0], _DeferredTensor)
-    # Ensure the Layer, if being reused, is working with inputs from
-    # the same graph as where it was created.
-    if build_graph:
+    if not context.executing_eagerly():
       try:
         # Set layer's "graph" at build time
-        self._graph = ops._get_graph_from_inputs(input_list, graph=self._graph)  # pylint: disable=protected-access
+        self._graph = ops._get_graph_from_inputs(nest.flatten(inputs),  # pylint: disable=protected-access
+                                                 graph=self._graph)
       except ValueError as e:
         raise ValueError('Input graph and Layer graph are not the same: %s' % e)
-    if build_graph or in_deferred_mode:
-      user_kwargs = copy.copy(kwargs)
-
-    # Handle Keras mask propagation from previous layer to current layer.
-    previous_mask = None
-    if (not hasattr(self, '_compute_previous_mask') or
-        self._compute_previous_mask):
-      previous_mask = _collect_previous_mask(inputs)
-      if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = estimator_util.fn_args(self.call)
-      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
-          not _is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
-        kwargs['mask'] = previous_mask
 
     if self.built:
       try:
@@ -667,134 +301,27 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse, auxiliary_name_scope=False)
-    input_shapes = None
-    with scope_context_manager as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        if not self.built:
-          if not build_graph:
-            # Activity regularization is currently unsupported in Eager mode.
-            if self._activity_regularizer:
-              raise ValueError(
-                  'activity_regularizer currently unsupported with '
-                  'eager execution enabled. Found an activity_regularizer in '
-                  '%s(%s).' % (self.__class__.__name__, self))
-          if not build_graph and not in_deferred_mode:
-            # TODO(agarwal): support _keras_history in Eager mode.
-            for x in input_list:
-              if hasattr(x, '_keras_history'):
-                raise ValueError('_keras_history currently unsupported in '
-                                 'Eager mode. Found _keras_history in %s while '
-                                 'executing __call__ for %s(%s)' %
-                                 (x, self.__class_.__name__, self))
-
-          # Check input assumptions set before layer building, e.g. input rank.
-          self._assert_input_compatibility(inputs)
-          if input_list and self._dtype is None:
-            try:
-              self._dtype = input_list[0].dtype.base_dtype.name
-            except AttributeError:
-              pass
-          if all(hasattr(x, 'get_shape') for x in input_list):
-            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-          self.build(input_shapes)
-        try:
-          # Note: not all sub-classes of Layer call Layer.__init__ (especially
-          # the ones under tensorflow/python/keras). Hence we recompute this
-          # attribute here if it is not set.
-          # TODO(agarwal): Fix the sub-classes and avoid this complexity.
-          call_has_scope_arg = self._call_has_scope_arg
-        except AttributeError:
-          self._call_fn_args = estimator_util.fn_args(self.call)
-          self._call_has_scope_arg = 'scope' in self._call_fn_args
-          call_has_scope_arg = self._call_has_scope_arg
-        if call_has_scope_arg:
-          kwargs['scope'] = scope
-        # Check input assumptions set after layer building, e.g. input shape.
-        if build_graph or in_deferred_mode:
-          self._assert_input_compatibility(inputs)
-
-        if not in_deferred_mode:
-          outputs = self.call(inputs, *args, **kwargs)
-          if outputs is None:
-            raise ValueError('A layer\'s `call` method should return a Tensor '
-                             'or a list of Tensors, not None.')
-        else:
-          # Deferred mode behavior: use `compute_output_shape` to
-          # infer the number of outputs of the layer and their shapes.
-          if input_shapes is None:
-            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-
-          output_shapes = self.compute_output_shape(input_shapes)
-          output_shapes = nest.flatten(output_shapes)
-          outputs = [
-              # TODO(fchollet): name the deferred tensors?
-              _DeferredTensor(shape=shape, dtype=self._dtype)
-              for shape in output_shapes
-          ]
-          if len(outputs) == 1:
-            outputs = outputs[0]
 
-        if build_graph:
-          # Apply activity regularization.
-          # Note that it should be applied every time the layer creates a new
-          # output, since it is output-specific.
-          if self._activity_regularizer:
-            output_list = nest.flatten(outputs)
-            for output in output_list:
-              with ops.name_scope('ActivityRegularizer'):
-                activity_regularization = self._activity_regularizer(output)
-              self.add_loss(activity_regularization, inputs=inputs)
+    with scope_context_manager as scope:
+      self._current_scope = scope
 
-          # TODO(fchollet): consider enabling masking for Eager mode.
-          if hasattr(self, 'compute_mask'):
-            output_mask = self.compute_mask(inputs, previous_mask)
-            if isinstance(outputs, (list, tuple)):
-              if output_mask is None:
-                output_mask = [None for _ in range(len(outputs))]
-              for x, m in zip(outputs, output_mask):
-                x._keras_mask = m  # pylint: disable=protected-access
-            else:
-              outputs._keras_mask = output_mask  # pylint: disable=protected-access
+      try:
+        call_has_scope_arg = self._call_has_scope_arg
+      except AttributeError:
+        self._call_fn_args = estimator_util.fn_args(self.call)
+        self._call_has_scope_arg = 'scope' in self._call_fn_args
+        call_has_scope_arg = self._call_has_scope_arg
+      if call_has_scope_arg:
+        kwargs['scope'] = scope
 
-    if build_graph:
-      # If all input tensors have history metadata,
-      # we update the output tensors
-      # with corresponding history metadata, thus eventually allowing to use
-      # these tensors to instantiate a Network.
-      if _have_all_keras_metadata(inputs):
-        # If the layer returns tensors from its inputs, unmodified,
-        # we copy them to avoid loss of tensor metadata.
-        output_ls = nest.flatten(outputs)
-        output_ls_copy = []
-        for x in output_ls:
-          if x in input_list:
-            with ops.name_scope(scope.original_name_scope):
-              x = array_ops.identity(x)
-          output_ls_copy.append(x)
-        if len(output_ls_copy) == 1:
-          outputs = output_ls_copy[0]
-        else:
-          outputs = output_ls_copy
+      # Actually call layer
+      outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
 
+    if not context.executing_eagerly():
       # Update global default collections.
       _add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
-
-    if in_deferred_mode or build_graph:
-      if _have_all_keras_metadata(inputs):
-        # Add an inbound node to the layer, so it can keep track of this call.
-        # This updates the layer history of the output tensor(s).
-        self._add_inbound_node(
-            input_tensors=inputs, output_tensors=outputs, arguments=user_kwargs)
-
-    self.built = True
     return outputs
 
-  @property
-  def graph(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.graph not supported in Eager mode.')
-    return self._graph
-
   def __deepcopy__(self, memo):
     no_copy = set(['_graph'])
     shallow_copy = set(['_scope', '_always_reuse_variable_scope'])
@@ -806,658 +333,12 @@ class Layer(checkpointable.CheckpointableBase):
         setattr(result, k, v)
       elif k in shallow_copy:
         setattr(result, k, copy.copy(v))
-      elif _is_tensor_or_tensor_list(v):
+      elif base_layer.is_tensor_or_tensor_list(v):
         setattr(result, k, v)
       else:
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
-  def apply(self, inputs, *args, **kwargs):
-    """Apply the layer on a input.
-
-    This simply wraps `self.__call__`.
-
-    Arguments:
-      inputs: Input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-    """
-    return self.__call__(inputs, *args, **kwargs)
-
-  def _add_inbound_node(self,
-                        input_tensors,
-                        output_tensors,
-                        arguments=None):
-    """Internal method to create an inbound node for the layer.
-
-    Arguments:
-        input_tensors: list of input tensors.
-        output_tensors: list of output tensors.
-        arguments: dictionary of keyword arguments that were passed to the
-            `call` method of the layer at the call that created the node.
-    """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
-
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
-
-    # Create node, add it to inbound nodes.
-    Node(
-        self,
-        inbound_layers=inbound_layers,
-        node_indices=node_indices,
-        tensor_indices=tensor_indices,
-        input_tensors=input_tensors,
-        output_tensors=output_tensors,
-        arguments=arguments)
-
-    # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
-
-  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
-    """Private utility to retrieves an attribute (e.g. inputs) from a node.
-
-    This is used to implement the methods:
-        - get_input_shape_at
-        - get_output_shape_at
-        - get_input_at
-        etc...
-
-    Arguments:
-        node_index: Integer index of the node from which
-            to retrieve the attribute.
-        attr: Exact node attribute name.
-        attr_name: Human-readable attribute name, for error messages.
-
-    Returns:
-        The layer's attribute `attr` at the node of index `node_index`.
-
-    Raises:
-        RuntimeError: If the layer has no inbound nodes, or if called in Eager
-        mode.
-        ValueError: If the index provided does not match any node.
-    """
-    if not self._inbound_nodes:
-      raise RuntimeError('The layer has never been called '
-                         'and thus has no defined ' + attr_name + '.')
-    if not len(self._inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' +
-                       str(node_index) + ', but the layer has only ' +
-                       str(len(self._inbound_nodes)) + ' inbound nodes.')
-    values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
-      return values[0]
-    else:
-      return values
-
-  def get_input_shape_at(self, node_index):
-    """Retrieves the input shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    return self._get_node_attribute_at_index(node_index, 'input_shapes',
-                                             'input shape')
-
-  def get_output_shape_at(self, node_index):
-    """Retrieves the output shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          'Layer.get_output_shape_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'output_shapes',
-                                             'output shape')
-
-  def get_input_at(self, node_index):
-    """Retrieves the input tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_input_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'input_tensors',
-                                             'input')
-
-  def get_output_at(self, node_index):
-    """Retrieves the output tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    return self._get_node_attribute_at_index(node_index, 'output_tensors',
-                                             'output')
-
-  @property
-  def input(self):
-    """Retrieves the input tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-        Input tensor or list of input tensors.
-
-    Raises:
-        AttributeError: if the layer is connected to
-        more than one incoming layers.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-      AttributeError: If no inbound nodes are found.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name +
-                           ' is not connected, no input to return.')
-    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
-
-  @property
-  def output(self):
-    """Retrieves the output tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one output,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-      Output tensor or list of output tensors.
-
-    Raises:
-      AttributeError: if the layer is connected to more than one incoming
-        layers.
-      RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
-    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
-
-  @property
-  def input_shape(self):
-    """Retrieves the input shape(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer, or if all inputs
-    have the same shape.
-
-    Returns:
-        Input shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per input tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined input_shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined input shape.')
-    all_input_shapes = set(
-        [str(node.input_shapes) for node in self._inbound_nodes])
-    if len(all_input_shapes) == 1:
-      input_shapes = self._inbound_nodes[0].input_shapes
-      if len(input_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in input_shapes
-        ]
-    else:
-      raise AttributeError('The layer "' + str(self.name) +
-                           ' has multiple inbound nodes, '
-                           'with different input shapes. Hence '
-                           'the notion of "input shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_input_shape_at(node_index)` '
-                           'instead.')
-
-  def count_params(self):
-    """Count the total number of scalars composing the weights.
-
-    Returns:
-        An integer count.
-
-    Raises:
-        ValueError: if the layer isn't yet built
-          (in which case its weights aren't yet defined).
-    """
-    if not self.built:
-      if self.__class__.__name__ == 'Sequential':
-        self.build()  # pylint: disable=no-value-for-parameter
-      else:
-        raise ValueError('You tried to call `count_params` on ' + self.name +
-                         ', but the layer isn\'t built. '
-                         'You can build it manually via: `' + self.name +
-                         '.build(batch_input_shape)`.')
-    weight_shapes = [w.get_shape().as_list() for w in self.weights]
-    return int(sum([np.prod(w) for w in weight_shapes]))
-
-  @property
-  def output_shape(self):
-    """Retrieves the output shape(s) of a layer.
-
-    Only applicable if the layer has one output,
-    or if all outputs have the same shape.
-
-    Returns:
-        Output shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per output tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined output shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined output shape.')
-    all_output_shapes = set(
-        [str(node.output_shapes) for node in self._inbound_nodes])
-    if len(all_output_shapes) == 1:
-      output_shapes = self._inbound_nodes[0].output_shapes
-      if len(output_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in output_shapes
-        ]
-    else:
-      raise AttributeError('The layer "%s"'
-                           ' has multiple inbound nodes, '
-                           'with different output shapes. Hence '
-                           'the notion of "output shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_output_shape_at(node_index)` '
-                           'instead.' % self.name)
-
-  @property
-  def inbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._inbound_nodes
-
-  @property
-  def outbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._outbound_nodes
-
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
-
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
-
-    Arguments:
-        inputs: input tensor or list of input tensors.
-
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
-    """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
-    else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.get_shape().ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
-
-
-@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
-
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
-
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
-
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
-
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
-
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
-
-
-class Node(object):
-  """A `Node` describes the connectivity between two layers.
-
-  Each time a layer is connected to some new input,
-  a node is added to `layer._inbound_nodes`.
-  Each time the output of a layer is used by another layer,
-  a node is added to `layer._outbound_nodes`.
-
-  Arguments:
-      outbound_layer: the layer that takes
-          `input_tensors` and turns them into `output_tensors`
-          (the node gets created when the `call`
-          method of the layer was called).
-      inbound_layers: a list of layers, the same length as `input_tensors`,
-          the layers from where `input_tensors` originate.
-      node_indices: a list of integers, the same length as `inbound_layers`.
-          `node_indices[i]` is the origin node of `input_tensors[i]`
-          (necessary since each inbound layer might have several nodes,
-          e.g. if the layer is being shared with a different data stream).
-      tensor_indices: a list of integers,
-          the same length as `inbound_layers`.
-          `tensor_indices[i]` is the index of `input_tensors[i]` within the
-          output of the inbound layer
-          (necessary since each inbound layer might
-          have multiple tensor outputs, with each one being
-          independently manipulable).
-      input_tensors: list of input tensors.
-      output_tensors: list of output tensors.
-      arguments: dictionary of keyword arguments that were passed to the
-          `call` method of the layer at the call that created the node.
-
-  `node_indices` and `tensor_indices` are basically fine-grained coordinates
-  describing the origin of the `input_tensors`.
-
-  A node from layer A to layer B is added to:
-    - A._outbound_nodes
-    - B._inbound_nodes
-  """
-
-  def __init__(self,
-               outbound_layer,
-               inbound_layers,
-               node_indices,
-               tensor_indices,
-               input_tensors,
-               output_tensors,
-               arguments=None):
-    # Layer instance (NOT a list).
-    if isinstance(outbound_layer, list):
-      raise ValueError(
-          '`outbound_layer` should be a layer instance, not a list.')
-    # this is the layer that takes a list of input tensors
-    # and turns them into a list of output tensors.
-    # the current node will be added to
-    # the inbound_nodes of outbound_layer.
-    self.outbound_layer = outbound_layer
-
-    # The following 3 properties describe where
-    # the input tensors come from: which layers,
-    # and for each layer, which node and which
-    # tensor output of each node.
-
-    # List of layer instances.
-    self.inbound_layers = inbound_layers
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.node_indices = node_indices
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.tensor_indices = tensor_indices
-
-    # Following 2 properties:
-    # tensor inputs and outputs of outbound_layer.
-
-    # List of tensors. 1:1 mapping with inbound_layers.
-    self.input_tensors = input_tensors
-    # List of tensors, created by outbound_layer.call().
-    self.output_tensors = output_tensors
-
-    # Following 2 properties: input and output shapes.
-
-    # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [layers_util.static_shape(x) for x in input_tensors]
-    # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [layers_util.static_shape(x) for x in output_tensors]
-
-    # Optional keyword arguments to layer's `call`.
-    self.arguments = arguments
-
-    # Add nodes to all layers involved.
-    for layer in inbound_layers:
-      if layer is not None:
-        # For compatibility with external Keras, we use the deprecated
-        # accessor here.
-        layer.outbound_nodes.append(self)
-    # For compatibility with external Keras, we use the deprecated
-    # accessor here.
-    outbound_layer.inbound_nodes.append(self)
-
-  def get_config(self):
-    inbound_names = []
-    for layer in self.inbound_layers:
-      if layer:
-        inbound_names.append(layer.name)
-      else:
-        inbound_names.append(None)
-    return {
-        'outbound_layer': self.outbound_layer.name,
-        'inbound_layers': inbound_names,
-        'node_indices': self.node_indices,
-        'tensor_indices': self.tensor_indices
-    }
-
-
-class _DeferredTensor(object):
-  """Tensor-like object used to build graphs of layers in Eager mode.
-
-  When calling a layer on a DeferredTensor, the layer will not perform any
-  computation and will simply perfom shape inference to return new
-  DeferredTensors with appropriate shape information. Thus DeferredTensor
-  behaves like a graph-mode Tensor when manipulated by layers.
-  """
-
-  def __init__(self, shape, dtype, name=None):
-    self.shape = tensor_shape.TensorShape(shape)
-    if dtype is None:
-      self.dtype = dtypes.as_dtype(np.float32)
-    else:
-      self.dtype = dtypes.as_dtype(dtype)
-    self.name = name
-
-  def get_shape(self):
-    return self.shape
-
-  def __str__(self):
-    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-  def __repr__(self):
-    return "<_DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-
-def _is_tensor_or_tensor_list(v):
-  v = nest.flatten(v)
-  if v and isinstance(v[0], ops.Tensor):
-    return True
-  else:
-    return False
-
-
-def _to_snake_case(name):
-  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
-  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
-  # If the class is private the name starts with "_" which is not secure
-  # for creating scopes. We prefix the name with "private" in this case.
-  if insecure[0] != '_':
-    return insecure
-  return 'private' + insecure
-
-
-def _to_list(x):
-  """This normalizes a list/tuple or single element into a list.
-
-  If a single element is passed, we return
-  a list of size 1 containing the element.
-
-  Arguments:
-    x: list or tuple or single element.
-
-  Returns:
-    A list.
-  """
-  if isinstance(x, (list, tuple)):
-    return list(x)
-  return [x]
-
 
 def _add_elements_to_collection(elements, collection_list):
   if context.executing_eagerly():
@@ -1473,105 +354,3 @@ def _add_elements_to_collection(elements, collection_list):
       if element not in collection_set:
         collection.append(element)
 
-
-def _is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  # We cannot use Python's `any` because the iterable may return Tensors.
-  for element in iterable:
-    if element is not None:
-      return False
-  return True
-
-
-def _have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  return all([hasattr(x, '_keras_history') for x in iterable])
-
-
-def _collect_previous_mask(input_tensors):
-  """Retrieves the output mask(s) of the previous node.
-
-  Arguments:
-      input_tensors: A tensor or list of tensors.
-
-  Returns:
-      A mask tensor or list of mask tensors.
-  """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
-
-
-# A global dictionary mapping graph objects to an index of counters used
-# for various layer names in each graph.
-# Allows to give unique autogenerated names to layers, in a graph-specific way.
-PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
-
-
-def _get_default_graph_uid_map():
-  graph = ops.get_default_graph()
-  name_uid_map = PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
-  if name_uid_map is None:
-    name_uid_map = collections.defaultdict(int)
-    PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
-  return name_uid_map
-
-
-def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
-                       zero_based=False):
-  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
-
-  Arguments:
-    name: String name to make unique.
-    name_uid_map: An optional defaultdict(int) to use when creating unique
-      names. If None (default), uses a per-Graph dictionary.
-    avoid_names: An optional set or dict with names which should not be used. If
-      None (default) does not avoid any names.
-    namespace: Gets a name which is unique within the (graph, namespace). Layers
-      which are not Networks use a blank namespace and so get graph-global
-      names.
-    zero_based: If True, name sequences start with no suffix (e.g. "dense",
-      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
-
-  Returns:
-    Unique string name.
-
-  Example:
-
-  ```python
-  _unique_layer_name('dense')  # dense_1
-  _unique_layer_name('dense')  # dense_2
-  ```
-  """
-  if name_uid_map is None:
-    name_uid_map = _get_default_graph_uid_map()
-  if avoid_names is None:
-    avoid_names = set()
-  proposed_name = None
-  while proposed_name is None or proposed_name in avoid_names:
-    name_key = (namespace, name)
-    if zero_based:
-      number = name_uid_map[name_key]
-      if number:
-        proposed_name = name + '_' + str(number)
-      else:
-        proposed_name = name
-      name_uid_map[name_key] += 1
-    else:
-      name_uid_map[name_key] += 1
-      proposed_name = name + '_' + str(name_uid_map[name_key])
-  return proposed_name
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 9ed4afeaba..c05c675263 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -94,61 +94,6 @@ class BaseLayerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
         core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
 
-  def testGetVariable(self):
-    with self.test_session():
-
-      class MyLayer(base_layers.Layer):
-
-        def build(self, input_shape):
-          self.my_var = self.add_variable(
-              'my_var', [2, 2], initializer=init_ops.zeros_initializer())
-
-        def call(self, inputs):
-          return inputs * 2
-
-      layer = MyLayer(name='my_layer')
-      inputs = random_ops.random_uniform((5,), seed=1)
-      layer.apply(inputs)
-      layer.apply(inputs)
-      self.assertEqual([v.name for v in layer.variables],
-                       ['my_layer/my_var:0'])
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time.  It uses scope "<current scope>/base_name"
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope'):
-        with variable_scope.variable_scope('my_layer'):
-          variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope/my_layer')
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time. If 'scope' argument is passed to
-      # apply(), it uses that scope when accessing variables.
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope') as new_scope:
-        variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs, scope=new_scope)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope')
-
-      # Checking for graph equality is only done in GRAPH mode.
-      with ops.Graph().as_default():
-        inputs_ng = random_ops.random_uniform((5,), seed=1)
-        with self.assertRaisesRegexp(ValueError, r'graph are not the same'):
-          layer.apply(inputs_ng)
-
   @test_util.run_in_graph_and_eager_modes()
   def testCall(self):
 
@@ -165,38 +110,6 @@ class BaseLayerTest(test.TestCase):
       # op is only supported in GRAPH mode
       self.assertEqual(outputs.op.name, 'my_layer/Square')
 
-  def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self):
-    # Note that this test is only run in Graph mode since with EAGER mode we can
-    # still create a new variable on second call.
-
-    class MyLayer(base_layers.Layer):
-
-      def build(self, _):
-        # Do not mark the layer as built.
-        pass
-
-      def call(self, inputs):
-        self.my_var = self.add_variable('my_var', [2, 2])
-        if self.built:
-          # Skip creating on the first call; try to create after it's
-          # built.  This is expected to fail.
-          self.add_variable('this_will_break_on_second_call', [2, 2])
-        return inputs + math_ops.square(self.my_var)
-
-    layer = MyLayer(name='my_layer')
-    inputs = random_ops.random_uniform((2,), seed=1)
-    outputs = layer.apply(inputs)
-    self.assertEqual(layer.built, True)
-    self.assertEqual(outputs.op.name, 'my_layer/add')
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-    with self.assertRaisesRegexp(ValueError,
-                                 'my_layer/this_will_break_on_second_call'):
-      layer.apply(inputs)
-    # The list of variables hasn't changed.
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-
   @test_util.run_in_graph_and_eager_modes()
   def testDeepCopy(self):
 
@@ -645,13 +558,14 @@ class BaseLayerTest(test.TestCase):
 
   def testLayerGraphSetInFirstApply(self):
     with ops.Graph().as_default():
-      layer = core_layers.Dense(1)  # Graph at construction time is ignored
+      # Graph at construction time is ignored
+      layer = core_layers.Dense(1)
     with ops.Graph().as_default():
-      layer.apply(constant_op.constant([[1]]))
+      layer.apply(constant_op.constant([[1.]]))
       # layer is now bound to second Graph
     with ops.Graph().as_default(), self.assertRaisesRegexp(
         ValueError, 'Input graph and Layer graph are not the same'):
-      layer.apply(constant_op.constant([[1]]))
+      layer.apply(constant_op.constant([[1.]]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 2d99b1688f..34a1487e74 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -32,201 +33,8 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Conv(base.Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    kernel_regularizer: Optional regularizer for the convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    kernel_constraint: Optional projection function to be applied to the
-        kernel after being updated by an `Optimizer` (e.g. used to implement
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               activation=None,
-               use_bias=True,
-               kernel_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               kernel_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_Conv, self).__init__(trainable=trainable, name=name,
-                                activity_regularizer=activity_regularizer,
-                                **kwargs)
-    self.rank = rank
-    self.filters = filters
-    self.kernel_size = utils.normalize_tuple(kernel_size, rank, 'kernel_size')
-    self.strides = utils.normalize_tuple(strides, rank, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.dilation_rate = utils.normalize_tuple(
-        dilation_rate, rank, 'dilation_rate')
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    kernel_shape = self.kernel_size + (input_dim, self.filters)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    self._convolution_op = nn_ops.Convolution(
-        input_shape,
-        filter_shape=self.kernel.get_shape(),
-        dilation_rate=self.dilation_rate,
-        strides=self.strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format,
-                                              self.rank + 2))
-    self.built = True
-
-  def call(self, inputs):
-    outputs = self._convolution_op(inputs, self.kernel)
-
-    if self.use_bias:
-      if self.data_format == 'channels_first':
-        if self.rank == 1:
-          # nn.bias_add does not accept a 1D input tensor.
-          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
-          outputs += bias
-        if self.rank == 2:
-          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          if outputs_shape[0] is None:
-            outputs_shape[0] = -1
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
-      else:
-        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_last':
-      space = input_shape[1:-1]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0]] + new_space +
-                                      [self.filters])
-    else:
-      space = input_shape[2:]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0], self.filters] +
-                                      new_space)
-
-
 @tf_export('layers.Conv1D')
-class Conv1D(_Conv):
+class Conv1D(keras_layers.Conv1D, base.Layer):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -294,8 +102,7 @@ class Conv1D(_Conv):
                trainable=True,
                name=None,
                **kwargs):
-    super(Convolution1D, self).__init__(
-        rank=1,
+    super(Conv1D, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -417,7 +224,7 @@ def conv1d(inputs,
 
 
 @tf_export('layers.Conv2D')
-class Conv2D(_Conv):
+class Conv2D(keras_layers.Conv2D, base.Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -493,7 +300,6 @@ class Conv2D(_Conv):
                name=None,
                **kwargs):
     super(Conv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -622,7 +428,7 @@ def conv2d(inputs,
 
 
 @tf_export('layers.Conv3D')
-class Conv3D(_Conv):
+class Conv3D(keras_layers.Conv3D, base.Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -699,7 +505,6 @@ class Conv3D(_Conv):
                name=None,
                **kwargs):
     super(Conv3D, self).__init__(
-        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -828,169 +633,8 @@ def conv3d(inputs,
   return layer.apply(inputs)
 
 
-class _SeparableConv(_Conv):
-  """Abstract base layer for separable nD convolution.
-
-  This layer performs a depthwise convolution that acts separately on
-  channels, followed by a pointwise convolution that mixes channels.
-  If `use_bias` is True and a bias initializer is provided,
-  it adds a bias vector to the output.
-  It then optionally applies an activation function to produce the final output.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: A tuple or list of integers specifying the spatial
-      dimensions of the filters. Can be a single integer to specify the same
-      value for all spatial dimensions.
-    strides: A tuple or list of integers specifying the strides
-      of the convolution. Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any `stride` value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    depth_multiplier: The number of depthwise convolution output channels for
-      each input channel. The total number of depthwise convolution output
-      channels will be equal to `num_filters_in * depth_multiplier`.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    depthwise_initializer: An initializer for the depthwise convolution kernel.
-    pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    depthwise_regularizer: Optional regularizer for the depthwise
-      convolution kernel.
-    pointwise_regularizer: Optional regularizer for the pointwise
-      convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    depthwise_constraint: Optional projection function to be applied to the
-        depthwise kernel after being updated by an `Optimizer` (e.g. used for
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    pointwise_constraint: Optional projection function to be applied to the
-        pointwise kernel after being updated by an `Optimizer`.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self,
-               rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               depth_multiplier=1,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer=None,
-               pointwise_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               depthwise_regularizer=None,
-               pointwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               pointwise_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_SeparableConv, self).__init__(
-        rank=rank,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        trainable=trainable,
-        name=name,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = depthwise_initializer
-    self.pointwise_initializer = pointwise_initializer
-    self.depthwise_regularizer = depthwise_regularizer
-    self.pointwise_regularizer = pointwise_regularizer
-    self.depthwise_constraint = depthwise_constraint
-    self.pointwise_constraint = pointwise_constraint
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    depthwise_kernel_shape = self.kernel_size + (input_dim,
-                                                 self.depth_multiplier)
-    pointwise_kernel_shape = (
-        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
-
-    self.depthwise_kernel = self.add_variable(
-        name='depthwise_kernel',
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    self.pointwise_kernel = self.add_variable(
-        name='pointwise_kernel',
-        shape=pointwise_kernel_shape,
-        initializer=self.pointwise_initializer,
-        regularizer=self.pointwise_regularizer,
-        constraint=self.pointwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    raise NotImplementedError
-
-
 @tf_export('layers.SeparableConv1D')
-class SeparableConv1D(_SeparableConv):
+class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1072,7 +716,6 @@ class SeparableConv1D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv1D, self).__init__(
-        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1096,45 +739,9 @@ class SeparableConv1D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides * 2 + (1,)
-      spatial_start_dim = 1
-    else:
-      strides = (1, 1) + self.strides * 2
-      spatial_start_dim = 2
-
-    # Explicitly broadcast inputs and kernels to 4D.
-    # TODO(fchollet): refactor when a native separable_conv1d op is available.
-    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
-    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
-    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
-    dilation_rate = (1,) + self.dilation_rate
-
-    outputs = nn.separable_conv2d(
-        inputs,
-        depthwise_kernel,
-        pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.SeparableConv2D')
-class SeparableConv2D(_SeparableConv):
+class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
   """Depthwise separable 2D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1221,7 +828,6 @@ class SeparableConv2D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1245,31 +851,6 @@ class SeparableConv2D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    # Apply the actual ops.
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides + (1,)
-    else:
-      strides = (1, 1) + self.strides
-    outputs = nn.separable_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        self.pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=self.dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.separable_conv1d')
 def separable_conv1d(inputs,
@@ -1511,7 +1092,7 @@ def separable_conv2d(inputs,
 
 
 @tf_export('layers.Conv2DTranspose')
-class Conv2DTranspose(Conv2D):
+class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -1576,8 +1157,8 @@ class Conv2DTranspose(Conv2D):
                name=None,
                **kwargs):
     super(Conv2DTranspose, self).__init__(
-        filters,
-        kernel_size,
+        filters=filters,
+        kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
@@ -1593,120 +1174,6 @@ class Conv2DTranspose(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def build(self, input_shape):
-    if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. Received input shape: ' +
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
-    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_height, out_width)
-      strides = (1, 1, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_height, out_width, self.filters)
-      strides = (1, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv2d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if not context.executing_eagerly():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv2d_transpose')
@@ -1806,7 +1273,7 @@ def conv2d_transpose(inputs,
 
 
 @tf_export('layers.Conv3DTranspose')
-class Conv3DTranspose(Conv3D):
+class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
   Arguments:
@@ -1885,153 +1352,6 @@ class Conv3DTranspose(Conv3D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def build(self, input_shape):
-    if len(input_shape) != 5:
-      raise ValueError('Inputs should have rank 5, received input shape:',
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined, found None: ' + str(input_shape))
-    input_dim = input_shape[channel_axis]
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(
-        'kernel',
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(
-          'bias',
-          shape=(self.filters,),
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint,
-          trainable=True,
-          dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    self.input_spec = base.InputSpec(ndim=5,
-                                     axes={c_axis: inputs_shape[c_axis]})
-
-    depth = inputs_shape[d_axis]
-    height = inputs_shape[h_axis]
-    width = inputs_shape[w_axis]
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_depth = utils.deconv_output_length(depth,
-                                           kernel_d,
-                                           self.padding,
-                                           stride_d)
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_depth, out_height,
-                      out_width)
-      strides = (1, 1, stride_d, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_depth, out_height, out_width,
-                      self.filters)
-      strides = (1, stride_d, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv3d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        data_format=utils.convert_data_format(self.data_format, ndim=5),
-        padding=self.padding.upper())
-
-    if not context.executing_eagerly():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis],
-                                                     kernel_d,
-                                                     self.padding,
-                                                     stride_d)
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], kernel_d, self.padding, stride_d)
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv3d_transpose')
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e598d9f83a..6d8e9eac87 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -27,23 +27,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.Dense')
-class Dense(base.Layer):
+class Dense(keras_layers.Dense, base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
@@ -108,73 +99,19 @@ class Dense(base.Layer):
                trainable=True,
                name=None,
                **kwargs):
-    super(Dense, self).__init__(trainable=trainable, name=name,
+    super(Dense, self).__init__(units=units,
+                                activation=activation,
+                                use_bias=use_bias,
+                                kernel_initializer=kernel_initializer,
+                                bias_initializer=bias_initializer,
+                                kernel_regularizer=kernel_regularizer,
+                                bias_regularizer=bias_regularizer,
                                 activity_regularizer=activity_regularizer,
+                                kernel_constraint=kernel_constraint,
+                                bias_constraint=bias_constraint,
+                                trainable=trainable,
+                                name=name,
                                 **kwargs)
-    self.units = units
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape[-1].value is None:
-      raise ValueError('The last dimension of the inputs to `Dense` '
-                       'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(min_ndim=2,
-                                     axes={-1: input_shape[-1].value})
-    self.kernel = self.add_variable('kernel',
-                                    shape=[input_shape[-1].value, self.units],
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    if self.use_bias:
-      self.bias = self.add_variable('bias',
-                                    shape=[self.units,],
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-    shape = inputs.get_shape().as_list()
-    if len(shape) > 2:
-      # Broadcasting is required for the inputs.
-      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
-                                                             [0]])
-      # Reshape the output back to the original ndim of the input.
-      if not context.executing_eagerly():
-        output_shape = shape[:-1] + [self.units]
-        outputs.set_shape(output_shape)
-    else:
-      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
-    if self.use_bias:
-      outputs = nn.bias_add(outputs, self.bias)
-    if self.activation is not None:
-      return self.activation(outputs)  # pylint: disable=not-callable
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    input_shape = input_shape.with_rank_at_least(2)
-    if input_shape[-1].value is None:
-      raise ValueError(
-          'The innermost dimension of input_shape must be defined, but saw: %s'
-          % input_shape)
-    return input_shape[:-1].concatenate(self.units)
 
 
 @tf_export('layers.dense')
@@ -254,7 +191,7 @@ def dense(
 
 
 @tf_export('layers.Dropout')
-class Dropout(base.Layer):
+class Dropout(keras_layers.Dropout, base.Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting a fraction `rate` of input units to 0
@@ -282,31 +219,14 @@ class Dropout(base.Layer):
                seed=None,
                name=None,
                **kwargs):
-    super(Dropout, self).__init__(name=name, **kwargs)
-    self.rate = rate
-    self.noise_shape = noise_shape
-    self.seed = seed
-
-  def _get_noise_shape(self, inputs):
-    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
-    # which will override `self.noise_shape`, and allows for custom noise
-    # shapes with dynamically sized inputs.
-    if self.noise_shape is None:
-      return self.noise_shape
-    return nn_ops._get_noise_shape(inputs, self.noise_shape)
+    super(Dropout, self).__init__(rate=rate,
+                                  noise_shape=noise_shape,
+                                  seed=seed,
+                                  name=name,
+                                  **kwargs)
 
   def call(self, inputs, training=False):
-
-    def dropped_inputs():
-      return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self._get_noise_shape(inputs),
-                        seed=self.seed)
-    return utils.smart_cond(training,
-                            dropped_inputs,
-                            lambda: array_ops.identity(inputs))
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(Dropout, self).call(inputs, training=training)
 
 
 @tf_export('layers.dropout')
@@ -352,7 +272,7 @@ def dropout(inputs,
 
 
 @tf_export('layers.Flatten')
-class Flatten(base.Layer):
+class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
   Examples:
@@ -367,25 +287,7 @@ class Flatten(base.Layer):
     # now `y` has shape `(None, None)`
   ```
   """
-
-  def __init__(self, **kwargs):
-    super(Flatten, self).__init__(**kwargs)
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def call(self, inputs):
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
-    if not context.executing_eagerly():
-      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = [input_shape[0]]
-    if all(input_shape[1:]):
-      output_shape += [np.prod(input_shape[1:])]
-    else:
-      output_shape += [None]
-    return tensor_shape.TensorShape(output_shape)
+  pass
 
 
 @tf_export('layers.flatten')
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 83b201e642..33284b0d69 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -24,26 +24,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import moving_averages
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.BatchNormalization')
-class BatchNormalization(base.Layer):
+class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
   "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -143,485 +131,31 @@ class BatchNormalization(base.Layer):
                name=None,
                **kwargs):
     super(BatchNormalization, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-    if isinstance(axis, list):
-      self.axis = axis[:]
-    else:
-      self.axis = axis
-    self.momentum = momentum
-    self.epsilon = epsilon
-    self.center = center
-    self.scale = scale
-    self.beta_initializer = beta_initializer
-    self.gamma_initializer = gamma_initializer
-    self.moving_mean_initializer = moving_mean_initializer
-    self.moving_variance_initializer = moving_variance_initializer
-    self.beta_regularizer = beta_regularizer
-    self.gamma_regularizer = gamma_regularizer
-    self.beta_constraint = beta_constraint
-    self.gamma_constraint = gamma_constraint
-    self.renorm = renorm
-    self.virtual_batch_size = virtual_batch_size
-    self.adjustment = adjustment
-    if fused is None:
-      fused = True
-
-    self.fused = fused
-    self._bessels_correction_test_only = True
-
-    if renorm:
-      renorm_clipping = renorm_clipping or {}
-      keys = ['rmax', 'rmin', 'dmax']
-      if set(renorm_clipping) - set(keys):
-        raise ValueError('renorm_clipping %s contains keys not in %s' %
-                         (renorm_clipping, keys))
-      self.renorm_clipping = renorm_clipping
-      self.renorm_momentum = renorm_momentum
-
-  def _add_tower_local_variable(self, *args, **kwargs):
-    tower_context = distribute_lib.get_tower_context()
-    with tower_context.tower_local_var_scope('mean'):
-      return self.add_variable(*args, **kwargs)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if not input_shape.ndims:
-      raise ValueError('Input has undefined rank:', input_shape)
-    ndims = len(input_shape)
-
-    # Convert axis to list and resolve negatives
-    if isinstance(self.axis, int):
-      self.axis = [self.axis]
-
-    if not isinstance(self.axis, list):
-      raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
-
-    for idx, x in enumerate(self.axis):
-      if x < 0:
-        self.axis[idx] = ndims + x
-
-    # Validate axes
-    for x in self.axis:
-      if x < 0 or x >= ndims:
-        raise ValueError('Invalid axis: %d' % x)
-    if len(self.axis) != len(set(self.axis)):
-      raise ValueError('Duplicate axis: %s' % self.axis)
-
-    if self.virtual_batch_size is not None:
-      if self.virtual_batch_size <= 0:
-        raise ValueError('virtual_batch_size must be a positive integer that '
-                         'divides the true batch size of the input Tensor')
-      # If using virtual batches, the first dimension must be the batch
-      # dimension and cannot be the batch norm axis
-      if 0 in self.axis:
-        raise ValueError('When using virtual_batch_size, the batch dimension '
-                         'must be 0 and thus axis cannot include 0')
-      if self.adjustment is not None:
-        raise ValueError('When using virtual_batch_size, adjustment cannot '
-                         'be specified')
-
-    if self.fused:
-      # Currently fused batch norm doesn't support renorm. It also only supports
-      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
-      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
-      # output back to its original shape accordingly.
-      self.fused = (not self.renorm and
-                    ndims == 4 and
-                    self.axis in [[1], [3]] and
-                    self.virtual_batch_size is None and
-                    self.adjustment is None)
-      # TODO(chrisying): fused batch norm is currently not supported for
-      # multi-axis batch norm and by extension virtual batches. In some cases,
-      # it might be possible to use fused batch norm but would require reshaping
-      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
-      # particularly tricky. A compromise might be to just support the most
-      # common use case (turning 5D w/ virtual batch to NCHW)
-
-    if self.fused:
-      if self.axis == [1]:
-        self._data_format = 'NCHW'
-      elif self.axis == [3]:
-        self._data_format = 'NHWC'
-      else:
-        raise ValueError('Unsupported axis, fused batch norm only supports '
-                         'axis == [1] or axis == [3]')
-
-    # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
-      param_dtype = dtypes.float32
-    else:
-      param_dtype = self.dtype or dtypes.float32
-
-    axis_to_dim = {x: input_shape[x].value for x in self.axis}
-    for x in axis_to_dim:
-      if axis_to_dim[x] is None:
-        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
-                         input_shape)
-    self.input_spec = base.InputSpec(ndim=ndims, axes=axis_to_dim)
-
-    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
-      # Single axis batch norm (most common/default use-case)
-      param_shape = (list(axis_to_dim.values())[0],)
-    else:
-      # Parameter shape is the original shape but with 1 in all non-axis dims
-      param_shape = [axis_to_dim[i] if i in axis_to_dim
-                     else 1 for i in range(ndims)]
-      if self.virtual_batch_size is not None:
-        # When using virtual batches, add an extra dim at index 1
-        param_shape.insert(1, 1)
-        for idx, x in enumerate(self.axis):
-          self.axis[idx] = x + 1      # Account for added dimension
-
-    if self.scale:
-      self.gamma = self.add_variable(
-          name='gamma',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.gamma_initializer,
-          regularizer=self.gamma_regularizer,
-          constraint=self.gamma_constraint,
-          trainable=True)
-    else:
-      self.gamma = None
-      if self.fused:
-        self._gamma_const = array_ops.constant(
-            1.0, dtype=param_dtype, shape=param_shape)
-
-    if self.center:
-      self.beta = self.add_variable(
-          name='beta',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.beta_initializer,
-          regularizer=self.beta_regularizer,
-          constraint=self.beta_constraint,
-          trainable=True)
-    else:
-      self.beta = None
-      if self.fused:
-        self._beta_const = array_ops.constant(
-            0.0, dtype=param_dtype, shape=param_shape)
-
-    # Disable variable partitioning when creating the moving mean and variance
-    try:
-      if self._scope:
-        partitioner = self._scope.partitioner
-        self._scope.set_partitioner(None)
-      else:
-        partitioner = None
-      self.moving_mean = self._add_tower_local_variable(
-          name='moving_mean',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_mean_initializer,
-          trainable=False)
-
-      self.moving_variance = self._add_tower_local_variable(
-          name='moving_variance',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_variance_initializer,
-          trainable=False)
-
-      if self.renorm:
-        # Create variables to maintain the moving mean and standard deviation.
-        # These are used in training and thus are different from the moving
-        # averages above. The renorm variables are colocated with moving_mean
-        # and moving_variance.
-        # NOTE: below, the outer `with device` block causes the current device
-        # stack to be cleared. The nested ones use a `lambda` to set the desired
-        # device and ignore any devices that may be set by the custom getter.
-        def _renorm_variable(name, shape):
-          var = self._add_tower_local_variable(
-              name=name,
-              shape=shape,
-              dtype=param_dtype,
-              initializer=init_ops.zeros_initializer(),
-              trainable=False)
-          return var
-
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_mean):
-          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
-          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
-        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
-        # renorm_stddev_weight. This allows us to (1) mix the average
-        # stddev with the minibatch stddev early in training, and (2) compute
-        # the unbiased average stddev by dividing renorm_stddev by the weight.
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_variance):
-          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
-          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
-                                                       ())
-    finally:
-      if partitioner:
-        self._scope.set_partitioner(partitioner)
-    self.built = True
-
-  def _assign_moving_average(self, variable, value, momentum):
-    with ops.name_scope(None, 'AssignMovingAvg',
-                        [variable, value, momentum]) as scope:
-      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
-      if decay.dtype != variable.dtype.base_dtype:
-        decay = math_ops.cast(decay, variable.dtype.base_dtype)
-      update_delta = (variable - value) * decay
-      return state_ops.assign_sub(variable, update_delta, name=scope)
-
-  def _fused_batch_norm(self, inputs, training):
-    """Returns the output of fused batch norm."""
-    beta = self.beta if self.center else self._beta_const
-    gamma = self.gamma if self.scale else self._gamma_const
-
-    def _fused_batch_norm_training():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          epsilon=self.epsilon,
-          data_format=self._data_format)
-
-    def _fused_batch_norm_inference():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          mean=self.moving_mean,
-          variance=self.moving_variance,
-          epsilon=self.epsilon,
-          is_training=False,
-          data_format=self._data_format)
-
-    output, mean, variance = utils.smart_cond(
-        training, _fused_batch_norm_training, _fused_batch_norm_inference)
-    if not self._bessels_correction_test_only:
-      # Remove Bessel's correction to be consistent with non-fused batch norm.
-      # Note that the variance computed by fused batch norm is
-      # with Bessel's correction.
-      sample_size = math_ops.cast(
-          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
-      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
-      variance *= factor
-
-    training_value = utils.constant_value(training)
-    if training_value is None:
-      momentum = utils.smart_cond(training, lambda: self.momentum, lambda: 1.0)
-    else:
-      momentum = ops.convert_to_tensor(self.momentum)
-    if training_value or training_value is None:
-      mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                momentum)
-      variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, momentum)
-      self.add_update(mean_update, inputs=inputs)
-      self.add_update(variance_update, inputs=inputs)
-
-    return output
-
-  def _renorm_correction_and_moments(self, mean, variance, training):
-    """Returns the correction and update values for renorm."""
-    stddev = math_ops.sqrt(variance + self.epsilon)
-    # Compute the average mean and standard deviation, as if they were
-    # initialized with this batch's moments.
-    mixed_renorm_mean = (self.renorm_mean +
-                         (1. - self.renorm_mean_weight) * mean)
-    mixed_renorm_stddev = (self.renorm_stddev +
-                           (1. - self.renorm_stddev_weight) * stddev)
-    # Compute the corrections for batch renorm.
-    r = stddev / mixed_renorm_stddev
-    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
-    # Ensure the corrections use pre-update moving averages.
-    with ops.control_dependencies([r, d]):
-      mean = array_ops.identity(mean)
-      stddev = array_ops.identity(stddev)
-    rmin, rmax, dmax = [self.renorm_clipping.get(key)
-                        for key in ['rmin', 'rmax', 'dmax']]
-    if rmin is not None:
-      r = math_ops.maximum(r, rmin)
-    if rmax is not None:
-      r = math_ops.minimum(r, rmax)
-    if dmax is not None:
-      d = math_ops.maximum(d, -dmax)
-      d = math_ops.minimum(d, dmax)
-    # When not training, use r=1, d=0.
-    r = utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
-    d = utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d))
-
-    def _update_renorm_variable(var, weight, value):
-      """Updates a moving average and weight, returns the unbiased value."""
-      value = array_ops.identity(value)
-      def _do_update():
-        """Updates the var and weight, returns their updated ratio."""
-        # Update the variables without zero debiasing. The debiasing will be
-        # accomplished by dividing the exponential moving average by the weight.
-        # For example, after a single update, the moving average would be
-        # (1-decay) * value. and the weight will be 1-decay, with their ratio
-        # giving the value.
-        # Make sure the weight is not updated until before r and d computation.
-        with ops.control_dependencies([value]):
-          weight_value = array_ops.constant(1., dtype=weight.dtype)
-        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
-        new_weight = self._assign_moving_average(weight, weight_value,
-                                                 self.renorm_momentum)
-        # TODO(yuefengz): the updates to var and weighted can not be batched
-        # together if we fetch their updated values here. Consider calculating
-        # new values and delaying the updates.
-        return new_var / new_weight
-
-      def _fake_update():
-        return array_ops.identity(var)
-      return utils.smart_cond(training, _do_update, _fake_update)
-
-    # TODO(yuefengz): colocate the operations
-    new_mean = _update_renorm_variable(self.renorm_mean,
-                                       self.renorm_mean_weight, mean)
-    new_stddev = _update_renorm_variable(self.renorm_stddev,
-                                         self.renorm_stddev_weight, stddev)
-    # Make sqrt(moving_variance + epsilon) = new_stddev.
-    new_variance = math_ops.square(new_stddev) - self.epsilon
-
-    return (r, d, new_mean, new_variance)
+        axis=axis,
+        momentum=momentum,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=beta_initializer,
+        gamma_initializer=gamma_initializer,
+        moving_mean_initializer=moving_mean_initializer,
+        moving_variance_initializer=moving_variance_initializer,
+        beta_regularizer=beta_regularizer,
+        gamma_regularizer=gamma_regularizer,
+        beta_constraint=beta_constraint,
+        gamma_constraint=gamma_constraint,
+        renorm=renorm,
+        renorm_clipping=renorm_clipping,
+        renorm_momentum=renorm_momentum,
+        fused=fused,
+        trainable=trainable,
+        virtual_batch_size=virtual_batch_size,
+        adjustment=adjustment,
+        name=name,
+        **kwargs)
 
   def call(self, inputs, training=False):
-    in_eager_mode = context.executing_eagerly()
-    if self.virtual_batch_size is not None:
-      # Virtual batches (aka ghost batches) can be simulated by reshaping the
-      # Tensor and reusing the existing batch norm implementation
-      original_shape = [-1] + inputs.shape.as_list()[1:]
-      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
-
-      # Will cause errors if virtual_batch_size does not divide the batch size
-      inputs = array_ops.reshape(inputs, expanded_shape)
-
-      def undo_virtual_batching(outputs):
-        outputs = array_ops.reshape(outputs, original_shape)
-        return outputs
-
-    if self.fused:
-      outputs = self._fused_batch_norm(inputs, training=training)
-      if self.virtual_batch_size is not None:
-        # Currently never reaches here since fused_batch_norm does not support
-        # virtual batching
-        return undo_virtual_batching(outputs)
-      return outputs
-
-    # Compute the axes along which to reduce the mean / variance
-    input_shape = inputs.get_shape()
-    ndims = len(input_shape)
-    reduction_axes = [i for i in range(ndims) if i not in self.axis]
-    if self.virtual_batch_size is not None:
-      del reduction_axes[1]     # Do not reduce along virtual batch dim
-
-    # Broadcasting only necessary for single-axis batch norm where the axis is
-    # not the last dimension
-    broadcast_shape = [1] * ndims
-    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
-    def _broadcast(v):
-      if (v is not None and
-          len(v.get_shape()) != ndims and
-          reduction_axes != list(range(ndims - 1))):
-        return array_ops.reshape(v, broadcast_shape)
-      return v
-
-    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
-
-    def _compose_transforms(scale, offset, then_scale, then_offset):
-      if then_scale is not None:
-        scale *= then_scale
-        offset *= then_scale
-      if then_offset is not None:
-        offset += then_offset
-      return (scale, offset)
-
-    # Determine a boolean value for `training`: could be True, False, or None.
-    training_value = utils.constant_value(training)
-    if training_value is not False:
-      if self.adjustment:
-        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
-        # Adjust only during training.
-        adj_scale = utils.smart_cond(training,
-                                     lambda: adj_scale,
-                                     lambda: array_ops.ones_like(adj_scale))
-        adj_bias = utils.smart_cond(training,
-                                    lambda: adj_bias,
-                                    lambda: array_ops.zeros_like(adj_bias))
-        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
-
-      # Some of the computations here are not necessary when training==False
-      # but not a constant. However, this makes the code simpler.
-      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
-      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
-
-      moving_mean = self.moving_mean
-      moving_variance = self.moving_variance
-
-      mean = utils.smart_cond(training,
-                              lambda: mean,
-                              lambda: moving_mean)
-      variance = utils.smart_cond(training,
-                                  lambda: variance,
-                                  lambda: moving_variance)
-
-      if self.renorm:
-        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
-            mean, variance, training)
-        # When training, the normalized values (say, x) will be transformed as
-        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
-        # = x * (r * gamma) + (d * gamma + beta) with renorm.
-        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
-        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
-        scale, offset = _compose_transforms(r, d, scale, offset)
-      else:
-        new_mean, new_variance = mean, variance
-
-      if self.virtual_batch_size is not None:
-        # This isn't strictly correct since in ghost batch norm, you are
-        # supposed to sequentially update the moving_mean and moving_variance
-        # with each sub-batch. However, since the moving statistics are only
-        # used during evaluation, it is more efficient to just update in one
-        # step and should not make a significant difference in the result.
-        new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keep_dims=True)
-        new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keep_dims=True)
-
-      def _do_update(var, value):
-        if in_eager_mode and not self.trainable:
-          return
-
-        return self._assign_moving_average(var, value, self.momentum)
-
-      mean_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_mean, new_mean),
-          lambda: self.moving_mean)
-      variance_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_variance, new_variance),
-          lambda: self.moving_variance)
-      if not context.executing_eagerly():
-        self.add_update(mean_update, inputs=inputs)
-        self.add_update(variance_update, inputs=inputs)
-
-    else:
-      mean, variance = self.moving_mean, self.moving_variance
-
-    outputs = nn.batch_normalization(inputs,
-                                     _broadcast(mean),
-                                     _broadcast(variance),
-                                     offset,
-                                     scale,
-                                     self.epsilon)
-    # If some components of the shape got lost due to adjustments, fix that.
-    outputs.set_shape(input_shape)
-
-    if self.virtual_batch_size is not None:
-      return undo_virtual_batching(outputs)
-
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(BatchNormalization, self).call(inputs, training=training)
 
 
 @tf_export('layers.batch_normalization')
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 50503ce093..75abe56f51 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -13,92 +13,19 @@
 # limitations under the License.
 # =============================================================================
 
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the pooling layer classes and their functional aliases.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Pooling1D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 1D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of a single integer,
-      representing the size of the pooling window.
-    strides: An integer or tuple/list of a single integer, specifying the
-      strides of the pooling operation.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling1D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 1, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 1, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=3)
-
-  def call(self, inputs):
-    # There is no TF op for 1D pooling, hence we make the inputs 4D.
-    if self.data_format == 'channels_last':
-      # input is NWC, make it NHWC
-      inputs = array_ops.expand_dims(inputs, 1)
-      # pool on the W dim
-      pool_shape = (1, 1) + self.pool_size + (1,)
-      strides = (1, 1) + self.strides + (1,)
-      data_format = 'NHWC'
-    else:
-      # input is NCW, make it NCHW
-      inputs = array_ops.expand_dims(inputs, 2)
-      # pool on the W dim
-      pool_shape = (1, 1, 1) + self.pool_size
-      strides = (1, 1, 1) + self.strides
-      data_format = 'NCHW'
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=data_format)
-
-    if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 1)
-    else:
-      return array_ops.squeeze(outputs, 2)
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                      self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
-
-
 @tf_export('layers.AveragePooling1D')
-class AveragePooling1D(_Pooling1D):
+class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
   """Average Pooling layer for 1D inputs.
 
   Arguments:
@@ -119,8 +46,9 @@ class AveragePooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling1D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -165,7 +93,7 @@ def average_pooling1d(inputs, pool_size, strides,
 
 
 @tf_export('layers.MaxPooling1D')
-class MaxPooling1D(_Pooling1D):
+class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
   """Max Pooling layer for 1D inputs.
 
   Arguments:
@@ -186,8 +114,9 @@ class MaxPooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling1D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -231,79 +160,8 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-class _Pooling2D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling2D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 2, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      pool_shape = (1,) + self.pool_size + (1,)
-      strides = (1,) + self.strides + (1,)
-    else:
-      pool_shape = (1, 1) + self.pool_size
-      strides = (1, 1) + self.strides
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, 4))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
-    rows = utils.conv_output_length(rows, self.pool_size[0], self.padding,
-                                    self.strides[0])
-    cols = utils.conv_output_length(cols, self.pool_size[1], self.padding,
-                                    self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, input_shape[3]])
-
-
 @tf_export('layers.AveragePooling2D')
-class AveragePooling2D(_Pooling2D):
+class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
   """Average pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -328,8 +186,9 @@ class AveragePooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling2D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -373,7 +232,7 @@ def average_pooling2d(inputs,
 
 
 @tf_export('layers.MaxPooling2D')
-class MaxPooling2D(_Pooling2D):
+class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
   """Max pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -398,8 +257,9 @@ class MaxPooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling2D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -442,90 +302,8 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-class _Pooling3D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 3D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 3 integers:
-      (pool_depth, pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)`
-      while `channels_first` corresponds to
-      inputs with shape `(batch, channels, depth, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling3D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 3, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 3, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def call(self, inputs):
-    pool_shape = (1,) + self.pool_size + (1,)
-    strides = (1,) + self.strides + (1,)
-
-    if self.data_format == 'channels_first':
-      # TF does not support `channels_first` with 3D pooling operations,
-      # so we must handle this case manually.
-      # TODO(fchollet): remove this when TF pooling is feature-complete.
-      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper())
-
-    if self.data_format == 'channels_first':
-      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      len_dim1 = input_shape[2]
-      len_dim2 = input_shape[3]
-      len_dim3 = input_shape[4]
-    else:
-      len_dim1 = input_shape[1]
-      len_dim2 = input_shape[2]
-      len_dim3 = input_shape[3]
-    len_dim1 = utils.conv_output_length(len_dim1, self.pool_size[0],
-                                        self.padding, self.strides[0])
-    len_dim2 = utils.conv_output_length(len_dim2, self.pool_size[1],
-                                        self.padding, self.strides[1])
-    len_dim3 = utils.conv_output_length(len_dim3, self.pool_size[2],
-                                        self.padding, self.strides[2])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
-
-
 @tf_export('layers.AveragePooling3D')
-class AveragePooling3D(_Pooling3D):
+class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -552,8 +330,9 @@ class AveragePooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling3D, self).__init__(
-        nn.avg_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -599,7 +378,7 @@ def average_pooling3d(inputs,
 
 
 @tf_export('layers.MaxPooling3D')
-class MaxPooling3D(_Pooling3D):
+class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -626,8 +405,9 @@ class MaxPooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling3D, self).__init__(
-        nn.max_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index c941aad7bc..7e94dda648 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -89,33 +88,5 @@ class ConvUtilsTest(test.TestCase):
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
 
-class GraphUtilsTest(test.TestCase):
-
-  def testGetReachableFromInputs(self):
-
-    with self.test_session():
-      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
-      x_1 = pl_1 + pl_2
-      x_2 = pl_2 * 2
-      x_3 = pl_3 + 1
-      x_4 = x_1 + x_2
-      x_5 = x_3 * pl_1
-
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1]),
-          {pl_1, x_1, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1, pl_2]),
-          {pl_1, pl_2, x_1, x_2, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_3]),
-          {pl_3, x_3, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([x_3]),
-          {x_3, x_5})
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index ee1a00623a..244702d13b 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -126,8 +126,6 @@ from tensorflow.python.ops.nn_impl import *
 from tensorflow.python.ops.nn_ops import *
 from tensorflow.python.ops.candidate_sampling_ops import *
 from tensorflow.python.ops.embedding_ops import *
-from tensorflow.python.ops.rnn import *
-from tensorflow.python.ops import rnn_cell
 # pylint: enable=wildcard-import,unused-import
 
 
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7be2f4f61f..7713d78b8a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index bf361cf805..69b81f75fa 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,10 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -133,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
index db8f626b98..96272d1b7d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
index 809b3a5430..8fd55c8686 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index 68d41bb6cc..47d1532c3c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 970b777e51..797d422a90 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 529c64ab29..269be1455b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7e7c330d74..3448136215 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index ada8466d74..979008d0ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index 2a5c1cd530..0ffdffd4cd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 9a2cb29815..6b00f110ee 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index f5e991ea42..caff5a2f1d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 31732214a6..4a72394921 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
index 422eddf10d..9804394fa5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'trainable\', \'virtual_batch_size\', \'adjustment\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 9053a37916..5e5b04c7c6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -18,10 +17,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -70,10 +65,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -112,11 +103,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index 3d536d2182..b8eb4079b9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index a535f18170..3fdb101425 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -47,10 +46,6 @@ tf_class {
     name: "filters"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -139,10 +134,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -193,11 +184,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
index 801a033972..0be42471e3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 13352e264a..39ba31a709 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
index f400e4a15c..26d9d8c476 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b3a9f573b8..43611017fa 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
index a9be09c0ab..fa4925ab99 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
index be1ef5eb92..c5c5d5e7c0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 30034f7eaf..36dc2d2e9a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Convolution2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
index 189b38054c..23ec74370b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index a76d85c629..0e4089c578 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
index 782195d4ad..23ddbe1a92 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
index 2cb7a39ea5..e04ab6bea8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
index 8080330699..655314afff 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
index 678f40bbc2..d5215f1330 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
index fac826109b..310a3c3b91 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index b38716aa2c..2d67b5f720 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -2,10 +2,8 @@ path: "tensorflow.keras.layers.DepthwiseConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 285d544af2..0e493a7f2b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
index b77976974c..14726b4b6c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index b07714d3f2..32a50455ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index e67d4ddfc4..2f615d8112 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
index b2a668e5a8..82dc878a8c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 4274b8d425..d79d02b954 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 8d9f06083c..1d38ae64bb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -126,10 +121,6 @@ tf_class {
     name: "reset_after"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -176,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index f4f1a5d51c..135de9cd95 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index e502df5e17..5db6e433ee 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 9c8d5bfcd8..bf0dba0a92 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 8dd65f1f24..6da9803609 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 5e30571cc7..345593dec8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index ba90fa4546..5d3be9085e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 8823857758..0b79a87e05 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 500ced852b..68cdbac652 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index cf2717ed46..d5872b444f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index a86ff1a469..4b0cf9a5d3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index e01cc7c1b0..4c1adb2131 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 259c1fb37c..815f1cf580 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 0c41bf97f7..e027dd6cc2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index bec8817aa3..c647b24a23 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index 17be862229..75d70734b4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
index 3aeef347ae..29edabe048 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 6d2a8c5619..0ed383a355 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 490b5b618c..6d14c9c8f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -122,10 +117,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -176,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index 21a65b838a..ddf96aba34 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
index 127b04738e..aca282d624 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -12,10 +11,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +59,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -90,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,11 +93,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 87e49f2ed5..b9c53b43c8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 1aa3aad324..2ee566d03b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5e9dc7d477..db0d0e816a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
index 0d101e5b68..82008b89d0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
index c85cd49ac8..31a34a17d0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 4f59e330c9..70d24ac75c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
index c0ea0eb050..55b16564b3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index ca37ae5131..a230b74c38 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 3ede237834..d98f7c39f5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index d87e25a7ba..b2e96a4203 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index e4df7b48ae..0c45bbdf17 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 6bf7c77743..6423d83418 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index c14be132b7..6e17081375 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
index 72ffbceae0..d01d371da5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index d3e780c8b2..d3f5508640 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
index a27980a9d1..44e1007f54 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
index 67f991276c..8fc3ec3331 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index fccea5e8af..457d277495 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d20663bdb0..54eda8ee21 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 889fa0a1b5..7111965546 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConvolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index c850f3fedc..815e34a48d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConvolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 526d88ccba..6614760e5e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 7fddae3447..bfcfd71ecd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -114,10 +109,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -164,11 +155,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
index 5b9b62fc97..9c4618c4e9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 769da30999..9a0a19d2d5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index fca2e42a15..446f7122a6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 36e8de09a9..52a0485b5c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a96f16fae9..c82e7a192d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_size"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index e1cbd0e150..9ccf251a18 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index f0d35728fb..e080a07799 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -108,11 +99,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 74efaea6dd..5fadca0b83 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index dc5bd5fd53..2d395bf7e8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index e01ccfb74a..18d58ec3b2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 7e6f90f762..6223cb2f3c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 4d0d402dad..e71bba6a7f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index b353a529bc..aba6d8cb1f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 9fe1256e61..ce545ecc95 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 8ccf15f9ab..3ac285681f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index be12b0bd2e..51ba0c5043 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,10 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -133,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
index 1c4f550d7f..38fd78a5a8 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
index d2db095269..86a524cc91 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
index 34d9a9df28..8a811fe456 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
index 21ad0efecf..3923e706be 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
index ed38747c76..7a0a8a2a51 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
index ff453c6059..7ed3a65251 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
index 5583bd22dc..23831aa74f 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
index 63f0c32a7c..9d41a6b099 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
index b77726252c..865fe08e63 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
index 92db9f6dcd..ee164aae20 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
index 80fa846a24..8167dc79cd 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
index f63213b3dd..efa4419692 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
index 7c1d05cd2b..2ff89f0a6f 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
index 4e45b2d513..b3a6dfdffa 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -23,6 +24,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +56,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -81,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -93,7 +102,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +120,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +132,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +160,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +172,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
index 19ec33fce7..cef396489d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
index 76180c333a..565f0c7a79 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
index ded75c8ff0..595ce2eead 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
index 3dbfa5453f..ccca96f722 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
index ab171df1d1..1c99c96182 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 9c71a24d05..f909cd8756 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 9e19f96b74..173d2eae63 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 7540aa6286..3c3e382297 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index fc1ff38669..db16660f11 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -107,7 +116,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -121,6 +134,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -129,10 +146,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -145,6 +174,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -153,6 +186,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 751122cfff..d7f658aaee 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 4b6313f395..b9ab487c77 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 00e8c71140..b9e3d93475 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 3852f90dd6..75b5898c59 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.nn.rnn_cell.RNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +25,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +57,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -90,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,7 +111,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -116,6 +129,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -124,10 +141,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,6 +169,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -148,6 +181,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 8f3f0f7506..fee0dc63b9 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-- 
GitLab


From 0932d4af60cd8c9ce322a8e16c8f51d300eb4402 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 13:57:00 -0700
Subject: [PATCH 0530/1262] Handle duplicate features by coalescing them
 together into a single feature.

PiperOrigin-RevId: 192341065
---
 .../python/sdca_estimator_test.py             | 53 ++++++++++++++++---
 .../linear_optimizer/python/sdca_optimizer.py | 53 ++++++++++++-------
 2 files changed, 80 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 79a5928a21..bed3d5139f 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -30,6 +30,13 @@ from tensorflow.python.platform import test
 
 class SDCALogisticClassifierTest(test.TestCase):
 
+  def _single_threaded_test_session(self):
+    # TODO(andreasst): figure out why SDCALinearRegressor needs a single
+    # threaded session to pass in tsan mode but SDCALogisticClassifier does not.
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
+    return self.test_session(config=config)
+
   def testRealValuedFeatures(self):
     """Tests SDCALogisticClassifier works with real valued features."""
 
@@ -41,7 +48,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       maintenance_cost = feature_column_lib.real_valued_column(
           'maintenance_cost')
       sq_footage = feature_column_lib.real_valued_column('sq_footage')
@@ -66,7 +73,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[500.0, 800.0], [200.0, 600.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       dense_feature = feature_column_lib.real_valued_column(
           'dense_feature', dimension=2)
       classifier = sdca_estimator.SDCALogisticClassifier(
@@ -86,7 +93,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('price'),
           boundaries=[500.0, 700.0])
@@ -120,7 +127,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
@@ -151,7 +158,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 5])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
       country_weighted_by_price = feature_column_lib.weighted_sparse_column(
@@ -163,6 +170,38 @@ class SDCALogisticClassifierTest(test.TestCase):
       metrics = classifier.evaluate(input_fn=input_fn, steps=1)
       self.assertGreater(metrics['accuracy'], 0.9)
 
+  def testSparseFeaturesWithDuplicates(self):
+    """Tests SDCALogisticClassifier with duplicated sparse features."""
+
+    def input_fn():
+      return {
+          'example_id':
+              constant_op.constant(['1', '2']),
+          'age':
+              sparse_tensor.SparseTensor(
+                  values=['20-29'] * 5 + ['31-40'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+          'gender':
+              sparse_tensor.SparseTensor(
+                  values=['m'] * 5 + ['f'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+      }, constant_op.constant([[1], [0]])
+
+    with self._single_threaded_test_session():
+      age = feature_column_lib.sparse_column_with_hash_bucket(
+          'age', hash_bucket_size=10)
+      gender = feature_column_lib.sparse_column_with_hash_bucket(
+          'gender', hash_bucket_size=10)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id', feature_columns=[age, gender])
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertLess(metrics['loss'], 0.060)
+
   def testCrossedFeatures(self):
     """Tests SDCALogisticClassifier with crossed features."""
 
@@ -182,7 +221,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 1])
       }, constant_op.constant([[0], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       language = feature_column_lib.sparse_column_with_hash_bucket(
           'language', hash_bucket_size=5)
       country = feature_column_lib.sparse_column_with_hash_bucket(
@@ -215,7 +254,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[3.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       sq_footage_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('sq_footage'),
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index dffdddacfb..5d4572bf6c 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
 from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -181,28 +182,42 @@ class SDCAOptimizer(object):
         elif isinstance(
             column,
             (
+                layers.feature_column._WeightedSparseColumn,  # pylint: disable=protected-access
                 layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                 layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-          sparse_features.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=transformed_tensor.indices,
-                          num_or_size_splits=2,
-                          axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]), None))
-          sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-          id_tensor = column.id_tensor(transformed_tensor)
-          weight_tensor = column.weight_tensor(transformed_tensor)
+
+          if isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
+            id_tensor = column.id_tensor(transformed_tensor)
+            weight_tensor = array_ops.reshape(
+                column.weight_tensor(transformed_tensor).values, [-1])
+          else:
+            id_tensor = transformed_tensor
+            weight_tensor = array_ops.ones(
+                [array_ops.shape(id_tensor.indices)[0]], dtypes.float32)
+
+          example_ids = array_ops.reshape(id_tensor.indices[:, 0], [-1])
+
+          flat_ids = array_ops.reshape(id_tensor.values, [-1])
+          projection_length = math_ops.reduce_max(flat_ids) + 1
+          # project ids based on example ids so that we can dedup ids that
+          # occur multiple times for a single example.
+          projected_ids = projection_length * example_ids + flat_ids
+
+          # Remove any redudant ids.
+          ids, idx = array_ops.unique(projected_ids)
+          # Keep only one example id per duplicated ids.
+          example_ids_filtered = math_ops.unsorted_segment_min(
+              example_ids, idx,
+              array_ops.shape(ids)[0])
+
+          # reproject ids back feature id space.
+          reproject_ids = (ids - projection_length * example_ids_filtered)
+
+          weights = array_ops.reshape(
+              math_ops.unsorted_segment_sum(weight_tensor, idx,
+                                            array_ops.shape(ids)[0]), [-1])
           sparse_feature_with_values.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=id_tensor.indices, num_or_size_splits=2, axis=1)
-                      [0], [-1]),
-                  array_ops.reshape(id_tensor.values, [-1]),
-                  array_ops.reshape(weight_tensor.values, [-1])))
+              SparseFeatureColumn(example_ids_filtered, reproject_ids, weights))
           sparse_feature_with_values_weights.append(
               columns_to_variables[column][0])
         else:
-- 
GitLab


From 4995231f9e383b4edc222f63f546b9fa8577fb69 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 13:59:49 -0700
Subject: [PATCH 0531/1262] test previously untested eval codepaths.

PiperOrigin-RevId: 192341561
---
 tensorflow/contrib/gan/BUILD                  |  1 +
 .../eval/python/classifier_metrics_test.py    | 33 ++++++++++++++-----
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 461066bbb4..b305f37791 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -364,6 +364,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 663e49bdca..4fb8d58bc9 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -22,6 +22,7 @@ import os
 import tarfile
 import tempfile
 
+from absl.testing import parameterized
 import numpy as np
 from scipy import linalg as scp_linalg
 
@@ -182,13 +183,20 @@ def _run_with_mock(function, *args, **kwargs):
     return function(*args, **kwargs)
 
 
-class ClassifierMetricsTest(test.TestCase):
+class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
 
-  def test_run_inception_graph(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph(self, use_default_graph_def):
     """Test `run_inception` graph construction."""
     batch_size = 7
     img = array_ops.ones([batch_size, 299, 299, 3])
-    logits = _run_with_mock(classifier_metrics.run_inception, img)
+
+    if use_default_graph_def:
+      logits = _run_with_mock(classifier_metrics.run_inception, img)
+    else:
+      logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
     self.assertTrue(isinstance(logits, ops.Tensor))
     logits.shape.assert_is_compatible_with([batch_size, 1001])
@@ -196,14 +204,23 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
-  def test_run_inception_graph_pool_output(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph_pool_output(self, use_default_graph_def):
     """Test `run_inception` graph construction with pool output."""
     batch_size = 3
     img = array_ops.ones([batch_size, 299, 299, 3])
-    pool = _run_with_mock(
-        classifier_metrics.run_inception,
-        img,
-        output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+
+    if use_default_graph_def:
+      pool = _run_with_mock(
+          classifier_metrics.run_inception,
+          img,
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+    else:
+      pool = classifier_metrics.run_inception(
+          img, _get_dummy_graphdef(),
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
     self.assertTrue(isinstance(pool, ops.Tensor))
     pool.shape.assert_is_compatible_with([batch_size, 2048])
-- 
GitLab


From 9fe03a590c12b6b52cd561551c31ea2420fa39c7 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Tue, 10 Apr 2018 14:02:02 -0700
Subject: [PATCH 0532/1262] Pad support for quantized zero.

PiperOrigin-RevId: 192342172
---
 .../internal/optimized/optimized_ops.h        |  28 ++--
 .../internal/reference/reference_ops.h        |  13 +-
 tensorflow/contrib/lite/kernels/pad.cc        |  27 ++--
 tensorflow/contrib/lite/kernels/pad_test.cc   | 129 +++++++++++++++---
 4 files changed, 158 insertions(+), 39 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 5acf1eaede..e329e02273 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5240,7 +5240,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
@@ -5260,27 +5260,27 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, 0,
+    memset(output_data, pad_value,
            left_b_padding * output_height * output_width * output_depth *
                sizeof(T));
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
              left_h_padding * output_width * output_depth * sizeof(T));
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
                left_w_padding * output_depth * sizeof(T));
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
-                 left_d_padding * sizeof(T));
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+                 pad_value, left_d_padding * sizeof(T));
         }
 
         T* out = output_data +
@@ -5294,20 +5294,21 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
           memset(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              0, right_d_padding * sizeof(T));
+              pad_value, right_d_padding * sizeof(T));
         }
       }
       if (right_w_padding != 0) {
         memset(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            0, right_w_padding * output_depth * sizeof(T));
+            pad_value, right_w_padding * output_depth * sizeof(T));
       }
     }
     if (right_h_padding != 0) {
       memset(output_data + Offset(output_dims, 0, 0,
                                   output_height - right_h_padding, out_b),
-             0, right_h_padding * output_width * output_depth * sizeof(T));
+             pad_value,
+             right_h_padding * output_width * output_depth * sizeof(T));
     }
   }
   if (right_b_padding != 0) {
@@ -5319,6 +5320,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 4bbec52bf7..250a308f2a 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2860,7 +2860,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -2890,7 +2890,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = 0;
+            *out_ptr++ = static_cast<T>(pad_value);
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -2900,6 +2900,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 inline bool LoopCondition(int index, int stop, int stride) {
   return stride > 0 ? index < stop : index > stop;
 }
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index c29da3862e..4f9449a225 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -119,39 +119,46 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar)                                           \
+#define TF_LITE_PAD(type, scalar, pad_value)                                \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
             GetTensorDims(op_context.input), before_padding, after_padding, \
             GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output))
+            GetTensorDims(op_context.output), pad_value)
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float);
+        TF_LITE_PAD(reference_ops, float, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float);
+        TF_LITE_PAD(optimized_ops, float, 0);
       }
       break;
     case kTfLiteUInt8:
+      // Quantized Pad requires that 0 is represented in the quantized range.
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                  std::numeric_limits<uint8_t>::min());
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t);
+        TF_LITE_PAD(reference_ops, uint8_t,
+                    op_context.output->params.zero_point);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t);
+        TF_LITE_PAD(optimized_ops, uint8_t,
+                    op_context.output->params.zero_point);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t);
+        TF_LITE_PAD(reference_ops, int32_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t);
+        TF_LITE_PAD(optimized_ops, int32_t, 0);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t);
+        TF_LITE_PAD(reference_ops, int64_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t);
+        TF_LITE_PAD(optimized_ops, int64_t, 0);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index 28834ad071..c06237e572 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::Matcher;
 
 class PadOpModel : public SingleOpModel {
  public:
@@ -29,6 +30,10 @@ class PadOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
@@ -36,6 +41,11 @@ class PadOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
  protected:
   int input_;
   int output_;
@@ -50,16 +60,17 @@ class PadOpModel : public SingleOpModel {
 //    m.Invoke();
 class PadOpConstModel : public PadOpModel {
  public:
-  PadOpConstModel(std::initializer_list<int> input_shape,
+  PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
-                  std::initializer_list<int> paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
+                  std::initializer_list<int> paddings,
+                  const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape});
+    BuildInterpreter({input.shape});
   }
 };
 
@@ -72,40 +83,45 @@ class PadOpConstModel : public PadOpModel {
 //    m.Invoke();
 class PadOpDynamicModel : public PadOpModel {
  public:
-  PadOpDynamicModel(std::initializer_list<int> input_shape,
-                    std::initializer_list<int> paddings_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  PadOpDynamicModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape, paddings_shape});
+    BuildInterpreter({input.shape, paddings_shape});
   }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
-                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9},
+                      {TensorType_FLOAT32}),
       "dims != 4");
 }
 
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+  EXPECT_DEATH(PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
+                               {1, 1, 2, 2, 3, 3}, {TensorType_FLOAT32}),
                "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                      {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -114,7 +130,8 @@ TEST(PadOpTest, SimpleConstTest) {
 }
 
 TEST(PadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -124,7 +141,8 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -134,7 +152,8 @@ TEST(PadOpTest, AdvancedConstTest) {
 }
 
 TEST(PadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -144,6 +163,80 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+class QuantizedPadOpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
+                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                                 {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 06efb16fb0b9ef7c7ce3d4bc0d5c677b3cbd5a6f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 14:04:29 -0700
Subject: [PATCH 0533/1262] [XLA] Redesign: implement and test Rev,
 BitcastConvertType, Map, and ReducePrecision.

PiperOrigin-RevId: 192342686
---
 .../xla/client/xla_client/xla_builder.cc      |  57 +++++++-
 tensorflow/compiler/xla/tests/BUILD           |   8 +-
 .../xla/tests/bitcast_convert_test.cc         |  20 +--
 tensorflow/compiler/xla/tests/map_test.cc     | 137 +++++++++---------
 .../xla/tests/reduce_precision_test.cc        |  27 ++--
 tensorflow/compiler/xla/tests/reverse_test.cc |   4 +-
 6 files changed, 153 insertions(+), 100 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 74d48635eb..7481b357ff 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1056,7 +1056,17 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
 
 XlaOp XlaBuilder::Rev(const XlaOp& operand,
                       tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReverseShape(operand_shape, dimensions));
+    for (int64 dim : dimensions) {
+      instr.add_dimensions(dim);
+    }
+    return AddInstruction(std::move(instr), HloOpcode::kReverse, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Sort(const XlaOp& operand) {
@@ -1087,7 +1097,15 @@ XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
 
 XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
+                          {operand});
+  });
 }
 
 XlaOp XlaBuilder::SquareF32(const XlaOp& operand) {
@@ -1113,7 +1131,28 @@ XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
                       const XlaComputation& computation,
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       tensorflow::gtl::ArraySlice<XlaOp> static_operands) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (!static_operands.empty()) {
+      return Unimplemented("static_operands is not supported in Map");
+    }
+
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape,
+                                      dimensions));
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kMap, operands);
+  });
 }
 
 XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
@@ -1283,7 +1322,17 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
 
 XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                   const int mantissa_bits) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReducePrecisionShape(
+                            operand_shape, exponent_bits, mantissa_bits));
+    instr.set_exponent_bits(exponent_bits);
+    instr.set_mantissa_bits(mantissa_bits);
+    return AddInstruction(std::move(instr), HloOpcode::kReducePrecision,
+                          {operand});
+  });
 }
 
 void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 699b077d80..19fb4886db 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -415,6 +415,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -641,9 +643,9 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1397,8 +1399,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1486,8 +1488,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index 0d94d65c10..777ac167a3 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -42,7 +42,7 @@ class BitcastConvertTest : public ClientLibraryTestBase {
 };
 
 TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({42, 64});
   builder.BitcastConvertType(a, S32);
 
@@ -51,7 +51,7 @@ TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0f, 64.0f});
   builder.BitcastConvertType(a, F32);
 
@@ -60,7 +60,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
 }
 
 TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR1<int32>({0, static_cast<int32>(0x80000000), 0x3F800000,
                                  static_cast<int32>(0xBF800000), 0x3F000000,
@@ -72,7 +72,7 @@ TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
 }
 
 XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   builder.BitcastConvertType(a, F32);
 
@@ -81,7 +81,7 @@ XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.6, 64.4});
   builder.BitcastConvertType(a, S32);
 
@@ -90,7 +90,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertS32Extremes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(
       {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
   builder.BitcastConvertType(a, F32);
@@ -100,7 +100,7 @@ TEST_F(BitcastConvertTest, ConvertS32Extremes) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
   b->BitcastConvertType(param, S32);
@@ -112,7 +112,7 @@ TEST_F(BitcastConvertTest, ConvertMapToS32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
   b->BitcastConvertType(param, F32);
@@ -129,7 +129,7 @@ TEST_F(BitcastConvertTest, ConvertMapToF32) {
 //   input -> convert -> reshape
 // the new convert should have the same element type as the old convert.
 TEST_F(BitcastConvertTest, ConvertReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR1<int32>({0x42280000});
   auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
   builder.BitcastConvertType(reshape, F32);
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 0cd812fd1b..efe6cc6787 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -50,18 +52,18 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOne() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOne() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
-    auto adder_to_one = mapped_builder.Add(x, one);
+    mapped_builder.Add(x, one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
   }
 
-  Computation CreateMax() {
-    ComputationBuilder b(client_, TestName());
+  XlaComputation CreateMax() {
+    XlaBuilder b(TestName());
     auto lhs = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto rhs = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     b.Max(lhs, rhs);
@@ -73,8 +75,8 @@ class MapTest : public ClientLibraryTestBase {
   // Creates a computation that accepts an F32 and returns T(1) (ignoring the
   // argument).
   template <class T>
-  Computation CreateScalarOne() {
-    ComputationBuilder mapped_builder(client_, "scalar_one");
+  XlaComputation CreateScalarOne() {
+    XlaBuilder mapped_builder("scalar_one");
     (void)mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     mapped_builder.ConstantR0<T>(1);
     auto computation_status = mapped_builder.Build();
@@ -87,11 +89,11 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (mul)
   //                /
   // 2.0f ---------/
-  Computation CreateMulByTwo() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateMulByTwo() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto two = mapped_builder.ConstantR0<float>(2.0);
-    auto mul_by_two = mapped_builder.Mul(x, two);
+    mapped_builder.Mul(x, two);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -105,12 +107,12 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add) ----> (mul)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOneTimesItself() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOneTimesItself() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
     auto adder_to_one = mapped_builder.Add(x, one);
-    auto result = mapped_builder.Mul(x, adder_to_one);
+    mapped_builder.Mul(x, adder_to_one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -122,12 +124,13 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} -----------> (map) ----> (add)
   //                         /           /
   // embedded_computation --/       n --/
-  Computation CreateMapPlusN(const Computation& embedded_computation, float n) {
-    ComputationBuilder builder(client_, TestName());
+  XlaComputation CreateMapPlusN(const XlaComputation& embedded_computation,
+                                float n) {
+    XlaBuilder builder(TestName());
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto map = builder.Map({x}, embedded_computation, {});
     auto constant_n = builder.ConstantR0<float>(n);
-    auto add = builder.Add(map, constant_n);
+    builder.Add(map, constant_n);
     auto computation_status = builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -135,11 +138,11 @@ class MapTest : public ClientLibraryTestBase {
 
   // Creates a binary function with signature (F32, F32) -> Pred
   // defined by (x, y) -> x > y.
-  Computation CreateGt() {
-    ComputationBuilder b(client_, "Gt");
+  XlaComputation CreateGt() {
+    XlaBuilder b("Gt");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
-    auto gt = b.Gt(x, y);
+    b.Gt(x, y);
     auto computation_status = b.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -152,13 +155,13 @@ class MapTest : public ClientLibraryTestBase {
   // y {R0F32} ----> (add) ---> (add)
   //                           /
   // z {R0F32} ---------------/
-  Computation CreateTernaryAdder() {
-    ComputationBuilder mapped_builder(client_, "TernaryAdder");
+  XlaComputation CreateTernaryAdder() {
+    XlaBuilder mapped_builder("TernaryAdder");
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = mapped_builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     auto z = mapped_builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "z");
     auto xy = mapped_builder.Add(x, y);
-    auto xyz = mapped_builder.Add(xy, z);
+    mapped_builder.Add(xy, z);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -167,13 +170,13 @@ class MapTest : public ClientLibraryTestBase {
 
 TEST_F(MapTest, MapEachElemPlusOneR0) {
   // Applies lambda (x) (+ x 1)) to an input scalar.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(42.0);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {});
+  builder.Map({param}, CreateAdderToOne(), {});
 
   ComputeAndCompareR0<float>(&builder, 43.0, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -181,13 +184,13 @@ TEST_F(MapTest, MapEachElemPlusOneR0) {
 
 XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -195,55 +198,55 @@ XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
 
 TEST_F(MapTest, MapEachElemPlusOneR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {3.2f, 4.3f, 5.4f, 6.5f},
                              {param0_data.get()}, ErrorSpec(0.01f));
 }
 
 TEST_F(MapTest, MapEachF32ElementToS32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<int32>(), {0});
+  builder.Map({param}, CreateScalarOne<int32>(), {0});
 
   ComputeAndCompareR1<int32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachF32ElementToU32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<uint32>(), {0});
+  builder.Map({param}, CreateScalarOne<uint32>(), {0});
 
   ComputeAndCompareR1<uint32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachElemLongerChainR1) {
   // Maps (lambda (x) (* (+ x 1) x)) onto an input R1F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOneTimesItself(), {0});
+  builder.Map({param}, CreateAdderToOneTimesItself(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {9.36f, 20.91f, 0.11f, 0.24f, 999000.0f, 65535.75f},
@@ -253,14 +256,14 @@ TEST_F(MapTest, MapEachElemLongerChainR1) {
 XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -269,7 +272,7 @@ XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
 TEST_F(MapTest, MapMultipleMapsR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -277,7 +280,7 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {6.4f, 8.6f, 10.8f, 13.0f},
                              {param0_data.get()}, ErrorSpec(0.01f));
@@ -285,14 +288,14 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
 TEST_F(MapTest, MapEachElemPlusOneR2) {
   // Maps (lambda (x) (+ x 1)) onto an input R2F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{13.25f, 14.0f}, {-7.1f, -7.2f}, {-8.8f, 8.8f}});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0, 1});
+  builder.Map({param}, CreateAdderToOne(), {0, 1});
 
   Array2D<float> expected_array(
       {{14.25f, 15.0f}, {-6.1f, -6.2f}, {-7.8f, 9.8f}});
@@ -317,18 +320,18 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) {
   auto embed2 = CreateMapPlusN(embed1, 2.0);
   auto embed3 = CreateMapPlusN(embed1, 4.0);
 
-  ComputationBuilder embed4_builder(client_, "embed4");
+  XlaBuilder embed4_builder("embed4");
   auto embed4_param = embed4_builder.Parameter(0, scalar_shape, "x");
   auto embed4_map_lhs = embed4_builder.Map({embed4_param}, embed2, {});
   auto embed4_map_rhs = embed4_builder.Map({embed4_param}, embed3, {});
-  auto embed4_add = embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
+  embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
   auto embed4_status = embed4_builder.Build();
   ASSERT_IS_OK(embed4_status.status());
   auto embed4 = embed4_status.ConsumeValueOrDie();
 
   auto embed5 = CreateMapPlusN(embed2, 6.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto constant_42 = builder.ConstantR0<float>(42.0);
   auto constant_7 = builder.ConstantR0<float>(7.0);
   auto map_42 = builder.Map({constant_42}, embed5, {});
@@ -359,7 +362,8 @@ TEST_F(MapTest, VersionedEmbeddedComputation) {
 
   // Add another Add(1) operation to the existing embedded computation. This
   // requires using the stub interface because the ComputationBuilder does not
-  // allow modification to the Computation objects after they have been built.
+  // allow modification to the XlaComputation objects after they have been
+  // built.
   BinaryOpRequest request;
   request.set_binop(BINOP_ADD);
   *request.mutable_lhs() = adder_to_one;
@@ -381,7 +385,7 @@ TEST_F(MapTest, VersionedEmbeddedComputation) {
 
 TEST_F(MapTest, MapBinaryAdder) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -393,8 +397,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(F32, &builder), {0});
+  builder.Map({param0, param1}, CreateScalarAddComputation(F32, &builder), {0});
 
   ComputeAndCompareR1<float>(&builder, {7.3f, 7.7, 4.3f, 0},
                              {param0_data.get(), param1_data.get()},
@@ -404,7 +407,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 // Adds two rank-2 arrays with different layouts. This test exercises a path
 // for Map that used to fail in shape inference (b/28989438).
 XLA_TEST_F(MapTest, AddWithMixedLayouts) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2WithLayout(
       {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0}));
   std::unique_ptr<GlobalData> param0_data =
@@ -417,8 +420,8 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1});
 
   Array2D<int32> expected(2, 2);
   expected(0, 0) = 11;
@@ -430,7 +433,7 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 }
 
 XLA_TEST_F(MapTest, AddR3_3x0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
   std::unique_ptr<GlobalData> param0_data =
@@ -443,8 +446,8 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1, 2});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1, 2});
 
   ComputeAndCompareR3<int32>(&builder, Array3D<int32>(3, 0, 2),
                              {param0_data.get(), param1_data.get()});
@@ -452,7 +455,7 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
 TEST_F(MapTest, MapTernaryAdder) {
   // Maps (lambda (x y z) (+ x y z)) onto three R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -469,7 +472,7 @@ TEST_F(MapTest, MapTernaryAdder) {
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
   auto param2 = builder.Parameter(2, param2_literal->shape(), "param2");
-  auto map = builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
+  builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {-2.7f, -92.3f, -895.7f, -400.0f},
@@ -479,24 +482,24 @@ TEST_F(MapTest, MapTernaryAdder) {
 
 TEST_F(MapTest, MapGt) {
   // Maps (x,y) -> x > y onto two R1F32 vectors.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto gt = CreateGt();
   b.Map({b.ConstantR1<float>({1, 20}), b.ConstantR1<float>({10, 2})}, gt, {0});
   ComputeAndCompareR1<bool>(&b, {false, true}, {});
 }
 
 TEST_F(MapTest, NestedBinaryMap) {
-  Computation max_with_square;
+  XlaComputation max_with_square;
   {
     // max_with_square(x) = do max(x, x^2) via a map.
-    ComputationBuilder b(client_, "max_with_square");
+    XlaBuilder b("max_with_square");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     b.Map({x, b.Mul(x, x)}, CreateMax(), {});
     auto computation_status = b.Build();
     ASSERT_IS_OK(computation_status.status());
     max_with_square = computation_status.ConsumeValueOrDie();
   }
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = b.ConstantR1<float>({0.1f, 0.5f, -0.5f, 1.0f, 2.0f});
   b.Map({input}, max_with_square, {0});
   ComputeAndCompareR1<float>(&b, {0.1f, 0.5f, 0.25f, 1.0f, 4.0f}, {});
@@ -505,13 +508,13 @@ TEST_F(MapTest, NestedBinaryMap) {
 TEST_F(MapTest, MapOperantionWithBuildError) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors but uses an unsupported
   // type combination (F32 + U16) to test that the error is reported to the
-  // outermost ComputationBuilder.
-  ComputationBuilder builder(client_, TestName());
+  // outermost XlaBuilder.
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("ErrorAdd");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(U16, {}), "y");
-  auto adder = sub_builder->Add(x, y);
+  sub_builder->Add(x, y);
   auto error_add = sub_builder->BuildAndNoteError();
 
   std::unique_ptr<Literal> param0_literal =
@@ -525,9 +528,9 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1}, error_add, {0});
+  builder.Map({param0, param1}, error_add, {0});
 
-  StatusOr<Computation> computation_status = builder.Build();
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -545,7 +548,7 @@ using MapTestWithFullOpt = ClientLibraryTestBase;
 // to have issues with such patterns and maybe invalidate the pointer to entry
 // computation.
 TEST_F(MapTestWithFullOpt, MapScalarPower) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -572,7 +575,7 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) {
 // Regression test for b/35786417, where the inliner would not notice the change
 // of parameter order inside the map.
 TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -598,7 +601,7 @@ TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
 // Regression test for b/35786417, where the inliner would CHECK-fail due to the
 // mul inside the map having more parameters than the map does.
 TEST_F(MapTestWithFullOpt, MapSquare) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index dc7ce3253c..b311785449 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
@@ -228,15 +228,14 @@ XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
   // This is required for proper handling of NaN values.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({input_values});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
   auto a = builder.Parameter(0, a_literal->shape(), "a");
 
-  auto reduce_precision =
-      builder.ReducePrecision(a, exponent_bits, mantissa_bits);
+  builder.ReducePrecision(a, exponent_bits, mantissa_bits);
 
   ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()});
 }
@@ -252,7 +251,7 @@ class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -265,7 +264,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
   // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the
   // reduce-precision operation showed up in the correct place in the
   // graph.
-  auto log = builder.Log(abs);
+  builder.Log(abs);
 
   // Insert precision-reduction after the Abs(x) operation, rounding that
   // result to exactly 1.0f.
@@ -281,7 +280,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -290,7 +289,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kAbs operations.  This
   // should not see into the fusion nodes and thus should not affect the
@@ -307,7 +306,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -316,7 +315,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kFusion operations.
   auto reduce_precision_pass = execution_options_.mutable_debug_options()
@@ -331,7 +330,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -340,7 +339,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kCos operations.  This
   // should have no effect.
@@ -356,7 +355,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -365,7 +364,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kAbs operations.  This
   // should see the kAbs operation within the above fusion node.
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 8fc841f140..6959c95502 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -85,7 +85,7 @@ TEST_P(FloatReverseTest, Reverses) {
   auto r1_literal = Literal::CreateR1<float>(input_vector);
   auto input_literal = r1_literal->Reshape(spec.input_dims).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = AddParam(*input_literal, &builder);
   builder.Rev(a, spec.reversal);
 
-- 
GitLab


From 0b80e3dca1bf051f973212d45315c44c9c6a125d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 14:16:36 -0700
Subject: [PATCH 0534/1262] Add missing import for RNNClassifier

PiperOrigin-RevId: 192344760
---
 tensorflow/contrib/estimator/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 9a87fa915d..be20d1b777 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
 from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
+from tensorflow.contrib.estimator.python.estimator.rnn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
-- 
GitLab


From b0af2c890049a37b86f9724074570d80bb0dc14d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 14:22:33 -0700
Subject: [PATCH 0535/1262] Bug fix for statistical_testing:   - Max/Min
 computations should be done over the sample dimension.   - Change dominate
 check to be greater_equal instead of greater (for matching dimensions).

PiperOrigin-RevId: 192345809
---
 .../kernel_tests/statistical_testing_test.py  | 22 +++++-----
 .../python/ops/statistical_testing.py         | 43 +++++++++++--------
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
index 0400c80c29..c4fb669ebb 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import statistical_testing as st
-from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 
 
@@ -129,13 +128,13 @@ class StatisticalTestingTest(test.TestCase):
 
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is not 0.4.
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("Mean confidence interval too high"):
         sess.run(st.assert_true_mean_equal_by_dkwm(
             samples, 0., 1., 0.4, false_fail_rate=1e-6))
 
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is not 0.6.
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("Mean confidence interval too low"):
         sess.run(st.assert_true_mean_equal_by_dkwm(
             samples, 0., 1., 0.6, false_fail_rate=1e-6))
 
@@ -172,7 +171,7 @@ class StatisticalTestingTest(test.TestCase):
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(2, 1).
       beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32)
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("samples1 has a smaller mean"):
         sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
             samples1, 0., 1.,
             beta_high_samples, 0., 1.,
@@ -190,7 +189,7 @@ class StatisticalTestingTest(test.TestCase):
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(1, 2).
       beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32)
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("samples2 has a smaller mean"):
         sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
             samples1, 0., 1.,
             beta_low_samples, 0., 1.,
@@ -198,21 +197,22 @@ class StatisticalTestingTest(test.TestCase):
 
   def test_dkwm_argument_validity_checking(self):
     rng = np.random.RandomState(seed=0)
-    samples = rng.uniform(size=5000).astype(np.float32)
+    samples = rng.uniform(
+        low=[0., 1.], high=[1., 2.], size=(2500, 1, 2)).astype(np.float32)
 
     # Test that the test library complains if the given samples fall
     # outside the purported bounds.
     with self.test_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("maximum value exceeds expectations"):
         sess.run(st.true_mean_confidence_interval_by_dkwm(
-            samples, 0., 0.5, error_rate=0.5))
-      with self.assertRaises(errors.InvalidArgumentError):
+            samples, [[0., 1.]], [[0.5, 1.5]], error_rate=0.5))
+      with self.assertRaisesOpError("minimum value falls below expectations"):
         sess.run(st.true_mean_confidence_interval_by_dkwm(
-            samples, 0.5, 1., error_rate=0.5))
+            samples, [[0.5, 1.5]], [[1., 2.]], error_rate=0.5))
 
       # But doesn't complain if they don't.
       op = st.true_mean_confidence_interval_by_dkwm(
-          samples, 0., 1., error_rate=0.5)
+          samples, [[0., 1.]], [[1., 2.]], error_rate=0.5)
       _ = sess.run(op)
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
index 5c52015e5f..9b9fff0afa 100644
--- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -234,7 +234,7 @@ def _maximum_mean(samples, envelope, high, name=None):
     envelope = ops.convert_to_tensor(envelope, name="envelope")
     high = ops.convert_to_tensor(high, name="high")
 
-    xmax = math_ops.reduce_max(samples, axis=[-1])
+    xmax = math_ops.reduce_max(samples, axis=[0])
     msg = "Given sample maximum value exceeds expectations"
     check_op = check_ops.assert_less_equal(xmax, high, message=msg)
     with ops.control_dependencies([check_op]):
@@ -279,7 +279,7 @@ def _minimum_mean(samples, envelope, low, name=None):
     envelope = ops.convert_to_tensor(envelope, name="envelope")
     low = ops.convert_to_tensor(low, name="low")
 
-    xmin = math_ops.reduce_min(samples, axis=[-1])
+    xmin = math_ops.reduce_min(samples, axis=[0])
     msg = "Given sample minimum value falls below expectations"
     check_op = check_ops.assert_greater_equal(xmin, low, message=msg)
     with ops.control_dependencies([check_op]):
@@ -319,8 +319,8 @@ def _dkwm_cdf_envelope(n, error_rate, name=None):
     return math_ops.sqrt(-gen_math_ops.log(error_rate / 2.) / (2. * n))
 
 
-def _check_shape_dominates(tensor, tensors):
-  """Check that broadcasting `tensor` against `tensors` does not expand it.
+def _check_shape_dominates(samples, parameters):
+  """Check that broadcasting `samples` against `parameters` does not expand it.
 
   Why?  Because I want to be very sure that the samples tensor is not
   accidentally enlarged by broadcasting against tensors that are
@@ -328,24 +328,27 @@ def _check_shape_dominates(tensor, tensors):
   sample counts end up inflated.
 
   Args:
-    tensor: A Tensor whose shape is to be protected against broadcasting.
-    tensors: A list of Tensors to check
+    samples: A Tensor whose shape is to be protected against broadcasting.
+    parameters: A list of Tensors who are parameters for the statistical test.
 
   Returns:
-    tensor: `tf.identity(tensor)` with control dependencies attached;
-      be sure to use that downstream.
+    samples: Return original `samples` with control dependencies attached
+      to ensure no broadcasting.
   """
   def check(t):
-    target = array_ops.shape(tensor)[1:]
-    result = array_ops.broadcast_dynamic_shape(target, array_ops.shape(t))
+    samples_batch_shape = array_ops.shape(samples)[1:]
+    broadcasted_batch_shape = array_ops.broadcast_dynamic_shape(
+        samples_batch_shape, array_ops.shape(t))
     # This rank check ensures that I don't get a wrong answer from the
     # _shapes_ broadcasting against each other.
-    gt = check_ops.assert_greater(array_ops.rank(target), array_ops.rank(t))
-    eq = check_ops.assert_equal(target, result)
-    return gt, eq
-  checks = list(itertools.chain(*[check(t) for t in tensors]))
+    samples_batch_ndims = array_ops.size(samples_batch_shape)
+    ge = check_ops.assert_greater_equal(
+        samples_batch_ndims, array_ops.rank(t))
+    eq = check_ops.assert_equal(samples_batch_shape, broadcasted_batch_shape)
+    return ge, eq
+  checks = list(itertools.chain(*[check(t) for t in parameters]))
   with ops.control_dependencies(checks):
-    return array_ops.identity(array_ops.identity(tensor))
+    return array_ops.identity(samples)
 
 
 def true_mean_confidence_interval_by_dkwm(
@@ -684,9 +687,13 @@ def assert_true_mean_equal_by_dkwm_two_sample(
       # I want to assert
       #   not (max_mean_1 < min_mean_2 or min_mean_1 > max_mean_2),
       # but I think I only have and-combination of asserts, so use DeMorgan.
-      clause1_op = check_ops.assert_greater_equal(max_mean_1, min_mean_2)
-      with ops.control_dependencies([clause1_op]):
-        return check_ops.assert_less_equal(min_mean_1, max_mean_2)
+      check_confidence_intervals_can_intersect = check_ops.assert_greater_equal(
+          max_mean_1, min_mean_2, message="Confidence intervals do not "
+          "intersect: samples1 has a smaller mean than samples2")
+      with ops.control_dependencies([check_confidence_intervals_can_intersect]):
+        return check_ops.assert_less_equal(
+            min_mean_1, max_mean_2, message="Confidence intervals do not "
+            "intersect: samples2 has a smaller mean than samples1")
 
 
 def min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
-- 
GitLab


From 706d8d34c4db4d8568e195d2cfdd54d812ff0b12 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Tue, 10 Apr 2018 14:24:51 -0700
Subject: [PATCH 0536/1262] ParseOpData returns kTfLiteError when error
 happens.

PiperOrigin-RevId: 192346224
---
 tensorflow/contrib/lite/model.cc | 86 ++++++++++++++++----------------
 1 file changed, 44 insertions(+), 42 deletions(-)

diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 13e5532909..87af953061 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -261,13 +261,11 @@ T* MallocPOD() {
 // Parse the appropriate data out of the op.
 //
 // This handles builtin data explicitly as there are flatbuffer schemas.
-//
-// Returns memory that must be feed.
-//
-// TODO(nupurgarg): Pass in void ** and return TfLiteStatus to ensure program
-// crashes if error reporter is called.
-void* ParseOpData(const Operator* op, BuiltinOperator op_type,
-                  ErrorReporter* error_reporter) {
+// If it returns kTfLiteOk, it passes the data out with `builtin_data`, which
+// need to be released by calling `free`.`
+// If it returns kTfLiteError, `builtin_data` will be `nullptr`.
+TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
+                         ErrorReporter* error_reporter, void** builtin_data) {
   auto parse_padding = [](Padding padding) {
     switch (padding) {
       case Padding_SAME:
@@ -316,7 +314,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     }
   };
 
-  void* builtin_data = nullptr;
+  *builtin_data = nullptr;
   switch (op_type) {
     case BuiltinOperator_CALL:
       // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
@@ -333,7 +331,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(conv_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_TANH:
@@ -358,10 +356,11 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             ConvertTensorType(schema_params->out_data_type(),
                               &params->out_data_type, error_reporter);
         if (in_status != kTfLiteOk || out_status != kTfLiteOk) {
-          break;
+          free(params);
+          return kTfLiteError;
         }
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_LSH_PROJECTION: {
@@ -370,7 +369,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* lshParams = op->builtin_options_as_LSHProjectionOptions()) {
         params->type = parseLSHProjectionType(lshParams->type());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_AVERAGE_POOL_2D:
@@ -386,7 +385,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(pool_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DEPTHWISE_CONV_2D: {
@@ -400,7 +399,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(conv_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SVDF: {
@@ -410,7 +409,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(svdf_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
@@ -422,7 +421,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(sequence_rnn_params->fused_activation_function());
         params->time_major = sequence_rnn_params->time_major();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RNN: {
@@ -431,7 +430,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(rnn_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_EMBEDDING_LOOKUP:
@@ -444,7 +443,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_EmbeddingLookupSparseOptions()) {
         params->combiner = parseCombinerType(embedding_params->combiner());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_FULLY_CONNECTED: {
@@ -455,7 +454,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation = parse_activation(
             fully_connected_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_HASHTABLE_LOOKUP:
@@ -466,7 +465,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* softmax_params = op->builtin_options_as_SoftmaxOptions()) {
         params->beta = softmax_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_CONCATENATION: {
@@ -478,7 +477,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(concatenation_params->fused_activation_function());
         params->axis = concatenation_params->axis();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MUL: {
@@ -487,7 +486,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_ADD: {
@@ -496,7 +495,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DIV: {
@@ -505,7 +504,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SUB: {
@@ -514,7 +513,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_L2_NORMALIZATION: {
@@ -523,7 +522,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: {
@@ -535,7 +534,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->alpha = schema_params->alpha();
         params->beta = schema_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
@@ -548,7 +547,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->cell_clip = lstm_params->cell_clip();
         params->proj_clip = lstm_params->proj_clip();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RESIZE_BILINEAR: {
@@ -557,7 +556,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_ResizeBilinearOptions()) {
         params->align_corners = schema_params->align_corners();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_PAD: {
@@ -571,7 +570,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->shape, error_reporter);
         params->num_dimensions = new_shape->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SKIP_GRAM: {
@@ -581,7 +580,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->max_skip_size = skip_gram_params->max_skip_size();
         params->include_all_ngrams = skip_gram_params->include_all_ngrams();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_DEPTH: {
@@ -589,7 +588,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SpaceToDepthOptions()) {
         params->block_size = schema_params->block_size();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_GATHER: {
@@ -599,7 +598,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->axis = gather_params->axis();
       }
 
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_BATCH_ND: {
@@ -616,7 +615,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
         params->keep_dims = schema_params->keep_dims();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPLIT: {
@@ -624,7 +623,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SplitOptions()) {
         params->num_splits = schema_params->num_splits();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SQUEEZE: {
@@ -635,7 +634,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->squeeze_dims, error_reporter);
         params->num_squeeze_dims = squeeze_dims->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_STRIDED_SLICE: {
@@ -647,7 +646,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->new_axis_mask = schema_params->new_axis_mask();
         params->shrink_axis_mask = schema_params->shrink_axis_mask();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MAXIMUM:
@@ -660,16 +659,16 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         ConvertTensorType(schema_params->output_type(), &params->output_type,
                           error_reporter);
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
-      break;
+      return kTfLiteError;
     }
   }
-  return builtin_data;
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -709,10 +708,13 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
           reinterpret_cast<const char*>(op->custom_options()->data()),
           op->custom_options()->size(), nullptr, reg);
     } else {
+      void* builtin_data = nullptr;
+      TF_LITE_ENSURE_STATUS(
+          ParseOpData(op, op_type, error_reporter_, &builtin_data));
       interpreter->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
-          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0,
-          ParseOpData(op, op_type, error_reporter_), reg);
+          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
+          reg);
     }
   }
 
-- 
GitLab


From 02afb3d56e9270a9808693741b08c4fba997c3a2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 14:51:54 -0700
Subject: [PATCH 0537/1262] Run EvaluateNodes for
 ModelPrunerTest_StopGradientPruning. Also updated the test fixture to inherit
 from GrapplerTest.

PiperOrigin-RevId: 192350828
---
 tensorflow/core/grappler/optimizers/BUILD             |  2 ++
 .../core/grappler/optimizers/model_pruner_test.cc     | 11 ++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index e4bc030885..a4545bb8f8 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -357,9 +357,11 @@ tf_cuda_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 8480a74572..2b12eadec9 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -26,7 +28,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ModelPrunerTest : public ::testing::Test {};
+class ModelPrunerTest : public GrapplerTest {};
 
 TEST_F(ModelPrunerTest, NoPruning) {
   // This trivial graph is so basic there's nothing to prune.
@@ -86,6 +88,13 @@ TEST_F(ModelPrunerTest, StopGradientPruning) {
   EXPECT_EQ(NodeName(b.name()), new_e.input(0));
   EXPECT_EQ(1, new_d.input_size());
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, IdentityPruning) {
-- 
GitLab


From 16997696d2dec1d74bc6341d10bad17b8c830bdd Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 10 Apr 2018 14:59:23 -0700
Subject: [PATCH 0538/1262] Forcing the symlink creation.

---
 tensorflow/tools/docker/Dockerfile           | 2 +-
 tensorflow/tools/docker/Dockerfile.devel     | 2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu | 2 +-
 tensorflow/tools/docker/Dockerfile.gpu       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 024cb40eb4..78cb4d250e 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index c4f6b24e5c..b3dbe475d2 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,7 +38,7 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 5aea47e582..bfb96da58d 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 625321e123..9e1708662e 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
-- 
GitLab


From 99e198185d3a4a8bb089102b71b9fc3920427887 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 15:01:49 -0700
Subject: [PATCH 0539/1262] Add quantized LogSoftmax.

PiperOrigin-RevId: 192352432
---
 .../internal/optimized/optimized_ops.h        | 91 ++++++++++++++++++-
 .../kernels/internal/quantization_util.cc     | 16 ++++
 .../lite/kernels/internal/quantization_util.h |  7 +-
 .../internal/reference/reference_ops.h        | 86 ++++++++++++++++++
 .../toco/graph_transformations/quantize.cc    | 16 ++++
 tensorflow/contrib/lite/toco/model.h          | 11 ++-
 6 files changed, 224 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index e329e02273..22c0504ad2 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -4135,6 +4135,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
 // optimized yet.
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -4168,6 +4169,94 @@ inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Currently just a copy of the reference code.
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax/Uint8");
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic");
@@ -4181,7 +4270,7 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Logistic");
+  gemmlowp::ScopedProfilingLabel label("Logistic/Uint8");
   /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
   /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
   /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index 18be6777a5..dd86313726 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -78,6 +78,22 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
                                    quantized_multiplier, left_shift);
 }
 
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift) {
+  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
+                           quantized_multiplier, left_shift);
+
+  // Also calculate what amounts to the inverse scaling factor for the input.
+  const double real_reverse_scaling_divisor =
+      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
+  tflite::QuantizeMultiplierSmallerThanOne(real_reverse_scaling_divisor,
+                                           reverse_scaling_divisor,
+                                           reverse_scaling_right_shift);
+}
+
 int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
                                     (1ll << (31 - input_integer_bits)) /
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index 9a04b76e56..1f6f5d3b15 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -196,7 +196,12 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
 void PreprocessSoftmaxScaling(double beta, double input_scale,
                               int input_integer_bits,
                               int32_t* quantized_multiplier, int* left_shift);
-
+// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift);
 // Calculate the largest input that will result in a within-bounds intermediate
 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
 // it must not overflow before we reduce the value by multiplication by the
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 250a308f2a..93b4eb5504 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2447,6 +2447,92 @@ inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   const int flat_size = MatchingFlatSize(output_dims, input_dims);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 5b1268f9a9..f50830ae60 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -44,6 +44,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowMinimum ||
          type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
+         type == OperatorType::kLogSoftmax ||
          type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
@@ -394,6 +395,19 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
                                  *quantization_params));
     return true;
   }
+  if (op.type == OperatorType::kLogSoftmax) {
+    // LogSoftmax has range: [LogSoftmaxOperator::kOutputRangeMin, 0].
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
+    quantization_params->zero_point = qp.max_value;
+    quantization_params->scale =
+        -LogSoftmaxOperator::kOutputRangeMin / (qp.max_value + 1);
+    // While not strictly necessary, it is easier to interpret output data and
+    // quantization if the scale is similar to others (such as power of 2).
+    CHECK(IsExactlyRepresentable(LogSoftmaxOperator::kOutputRangeMin / 2,
+                                 *quantized_data_type, *quantization_params));
+    return true;
+  }
   if (op.type == OperatorType::kTanh) {
     // Tanh has the range: [-1, 1].
     *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
@@ -661,6 +675,8 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
 
       // Fix up the min/max information on the output array to match the chosen
       // quantization parameters.
+      CHECK(output_array.minmax)
+          << "Output array named " << output << " lacks minmax";
       auto& output_minmax = output_array.GetMinMax();
       FixMinMaxPostQuantization(quantized_data_type, quantization_params,
                                 &output_minmax);
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 56ef9fe2a8..54c3a59506 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -1329,6 +1329,15 @@ struct SoftmaxOperator : Operator {
 // TensorFlow equivalent: LogSoftmax
 struct LogSoftmaxOperator : Operator {
   LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+
+  // LogSoftmax can in principal have very large negative output, depending on
+  // the input size.  However, input x_i that is less than x_max-10 is
+  // accumulated as exp(x_i-x_max), which is truncated to zero.
+  //
+  // Since we effectively disregard smallish inputs in the normalizing factor,
+  // we also drop them in the output (set to minimum output), and in doing so
+  // make better use of the quantization range / resolution.
+  static constexpr float kOutputRangeMin = -16.0;
 };
 
 // Cast operator.
@@ -1522,7 +1531,7 @@ class Shape {
   int dims(int i) const {
     // Always check for out-of-bounds accesses, even in optimized builds where
     // standard assertions are disabled. Out-of-bounds access here is a common
-    // occurence.
+    // occurrence.
     CHECK_GE(i, 0);
     CHECK_GT(dims_.size(), i);
     return dims_[i];
-- 
GitLab


From 0172f3b5b86ccdf32366259a31266a988a9445d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 15:23:05 -0700
Subject: [PATCH 0540/1262] Allow negative feature values in computation for
 `sum` combiner.

PiperOrigin-RevId: 192355950
---
 .../layers/python/layers/embedding_ops.py     | 15 ++++-
 .../python/feature_column/feature_column.py   | 15 ++++-
 .../feature_column/feature_column_test.py     | 57 ++++++++++++++-----
 3 files changed, 70 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index ffa208540d..49c3faf3b7 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -140,6 +140,9 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != "sum":
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -188,13 +191,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 def scattered_embedding_lookup(params,
                                values,
                                dimension,
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 7a104fa4ac..f9201a4794 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -3148,6 +3148,9 @@ def _safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != 'sum':
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -3196,13 +3199,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                        collections.namedtuple('_IndicatorColumn',
                                               ['categorical_column'])):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 07588af37e..62718db0e5 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1511,6 +1511,28 @@ class LinearModelTest(test.TestCase):
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [5010.]], predictions.eval())
 
+  def test_sparse_combiner_with_negative_weights(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {
+          'wire_cast': wire_tensor,
+          'weights': constant_op.constant([[1., 1., -1.0]])
+      }
+      predictions = fc.linear_model(
+          features, [wire_cast_weights], sparse_combiner='sum')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
@@ -6164,14 +6186,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
-          'ids':
-              sparse_tensor.SparseTensorValue(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(0, 2, 1),
-                  dense_shape=(2, 2)),
-          'values': ((.5,), (1.,))
-      }, (column,))
+      predictions = get_keras_linear_model_predictions(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
       with _initialized_session():
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
           predictions.eval()
@@ -6255,13 +6279,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'ids': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2)),
-          'values': ((.5,), (1.,))
-      }, (column,))
+      predictions = fc.linear_model(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
       with _initialized_session():
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
           predictions.eval()
-- 
GitLab


From 9eaab27bc41b6865bc945dcbb6b75c2427826ef3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 15:39:37 -0700
Subject: [PATCH 0541/1262] [XLA] Redesign: implement and test Conv.

PiperOrigin-RevId: 192359226
---
 .../xla/client/xla_client/xla_builder.cc      | 170 +++++++++++++++++-
 .../xla/client/xla_client/xla_builder.h       |  14 ++
 tensorflow/compiler/xla/tests/BUILD           |   2 +-
 .../compiler/xla/tests/convolution_test.cc    |  61 ++++---
 4 files changed, 210 insertions(+), 37 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 7481b357ff..9e4b9ccd25 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -790,24 +790,101 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
   });
 }
 
+Status XlaBuilder::VerifyConvolution(
+    const Shape& lhs_shape, const Shape& rhs_shape,
+    const ConvolutionDimensionNumbers& dimension_numbers) const {
+  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
+    return InvalidArgument(
+        "Convolution arguments must have same number of "
+        "dimensions. Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_dims = ShapeUtil::Rank(lhs_shape);
+  if (num_dims < 2) {
+    return InvalidArgument(
+        "Convolution expects argument arrays with >= 3 dimensions. "
+        "Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_spatial_dims = num_dims - 2;
+
+  const auto check_spatial_dimensions =
+      [&](const char* const field_name,
+          const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>&
+              numbers) {
+        if (numbers.size() != num_spatial_dims) {
+          return InvalidArgument("Expected %d elements for %s, but got %d.",
+                                 num_spatial_dims, field_name, numbers.size());
+        }
+        for (int i = 0; i < numbers.size(); ++i) {
+          if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) {
+            return InvalidArgument("Convolution %s[%d] is out of bounds: %lld",
+                                   field_name, i, numbers.Get(i));
+          }
+        }
+        return Status::OK();
+      };
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("input_spatial_dimensions",
+                               dimension_numbers.input_spatial_dimensions()));
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("kernel_spatial_dimensions",
+                               dimension_numbers.kernel_spatial_dimensions()));
+  return check_spatial_dimensions(
+      "output_spatial_dimensions",
+      dimension_numbers.output_spatial_dimensions());
+}
+
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-  return UnimplementedOp();
+  return ConvWithGeneralDimensions(
+      lhs, rhs, window_strides, padding,
+      CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
 
 XlaOp XlaBuilder::ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return UnimplementedOp();
+  return ConvGeneral(lhs, rhs, window_strides, padding,
+                     CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
 
 XlaOp XlaBuilder::ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> base_area_dimensions(
+        dimension_numbers.input_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
+         ++i) {
+      base_area_dimensions[i] =
+          lhs_shape.dimensions(dimension_numbers.input_spatial_dimensions(i));
+    }
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+
+    return ConvGeneral(lhs, rhs, window_strides,
+                       MakePadding(base_area_dimensions, window_dimensions,
+                                   window_strides, padding),
+                       dimension_numbers);
+  });
 }
 
 XlaOp XlaBuilder::ConvGeneral(
@@ -815,7 +892,8 @@ XlaOp XlaBuilder::ConvGeneral(
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
+                            dimension_numbers);
 }
 
 XlaOp XlaBuilder::ConvGeneralDilated(
@@ -825,7 +903,89 @@ XlaOp XlaBuilder::ConvGeneralDilated(
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   lhs_dilation, rhs_dilation));
+
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(),
+                                           dimension_numbers));
+
+    *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
+
+    return AddInstruction(std::move(instr), HloOpcode::kConvolution,
+                          {lhs, rhs});
+  });
+}
+
+StatusOr<Window> XlaBuilder::MakeWindow(
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation) const {
+  const auto verify_size = [&](const size_t x, const char* x_name) {
+    if (x == 0 || x == window_dimensions.size()) {
+      return Status::OK();
+    } else {
+      return InvalidArgument(
+          "%s", tensorflow::strings::StrCat(
+                    "Window has different number of window dimensions than of ",
+                    x_name,
+                    "\nNumber of window dimensions: ", window_dimensions.size(),
+                    "\nNumber of ", x_name, ": ", x, "\n")
+                    .c_str());
+    }
+  };
+  TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides"));
+  TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries"));
+  TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors"));
+  TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors"));
+
+  Window window;
+  for (size_t i = 0; i < window_dimensions.size(); i++) {
+    auto dim = window.add_dimensions();
+    dim->set_size(window_dimensions[i]);
+    if (!window_strides.empty()) {
+      dim->set_stride(window_strides[i]);
+    } else {
+      dim->set_stride(1);
+    }
+    if (!padding.empty()) {
+      dim->set_padding_low(padding[i].first);
+      dim->set_padding_high(padding[i].second);
+    } else {
+      dim->set_padding_low(0);
+      dim->set_padding_high(0);
+    }
+    if (!lhs_dilation.empty()) {
+      dim->set_base_dilation(lhs_dilation[i]);
+    } else {
+      dim->set_base_dilation(1);
+    }
+    if (!rhs_dilation.empty()) {
+      dim->set_window_dilation(rhs_dilation[i]);
+    } else {
+      dim->set_window_dilation(1);
+    }
+    dim->set_window_reversal(false);
+  }
+  return window;
 }
 
 XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index d747691f16..24e0be2ac1 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -835,6 +835,20 @@ class XlaBuilder {
   void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
                          bool* is_constant) const;
 
+  // Checks bounds for convolution parameters.
+  Status VerifyConvolution(
+      const Shape& lhs_shape, const Shape& rhs_shape,
+      const ConvolutionDimensionNumbers& dimension_numbers) const;
+
+  // Helper function for creating a Window proto from user-supplied data.
+  // Returns error if the user-supplied data was invalid.
+  StatusOr<Window> MakeWindow(
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
+
   string name_;  // Name to use for the built computation.
 
   // The first error encountered while building the computation.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 19fb4886db..67c53c6ac0 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -781,10 +781,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 72715398de..5eb3136abe 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -88,12 +88,12 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     ASSERT_EQ(2, arhs->width());
     ASSERT_EQ(2, arhs->height());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto lhs = builder.ConstantR4FromArray4D<T>(*alhs);
     auto rhs = builder.ConstantR4FromArray4D<T>(*arhs);
-    auto conv = builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
+    builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
 
-    ComputeAndCompare(&builder, conv, {}, error_spec_);
+    ComputeAndCompare(&builder, {}, error_spec_);
   }
 };
 
@@ -106,12 +106,12 @@ template <typename T>
 class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 1, 2);
     input_data.FillWithYX(Array2D<T>({
@@ -122,7 +122,7 @@ class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -137,12 +137,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -156,7 +156,7 @@ class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
         {7.0f, 8.0f},
     }));
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -171,12 +171,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -191,7 +191,7 @@ class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
         {7.0f, 8.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -207,12 +207,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 3, 3});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({{1.0f, 2.0f, 3.0f, 4.0f},
@@ -223,7 +223,7 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
     filter_data.FillWithYX(Array2D<T>(
         {{5.0f, 6.0f, 7.0f}, {8.0f, 9.0f, 10.0f}, {11.0f, 12.0f, 13.0f}}));
     // clang-format on
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -234,7 +234,7 @@ TYPED_TEST_CASE(Convolve_1x1x4x4_1x1x3x3_Same, TestTypes);
 TYPED_TEST(Convolve_1x1x4x4_1x1x3x3_Same, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -264,7 +264,7 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
       Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
       Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
@@ -300,7 +300,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithRHSDilation, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithRHSDilation, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -331,7 +331,7 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
 }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -365,7 +365,7 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
       Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
       Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
@@ -402,7 +402,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<int64> input_dims = {1, 4, 2, 3, 3};
   std::vector<int64> filter_dims = {2, 2, 2, 3, 3};
   Shape input_shape = ShapeUtil::MakeShape(F32, input_dims);
@@ -469,7 +469,7 @@ template <typename T>
 class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     std::vector<int64> input_dims = {1, 3, 3, 5};
     std::vector<int64> filter_dims = {3, 3, 5, 3};
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
@@ -537,7 +537,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
     execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
         "convolution-canonicalization");
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29});
   Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10});
 
@@ -551,8 +551,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   dnums.set_kernel_output_feature_dimension(1);
   dnums.set_output_batch_dimension(0);
   dnums.set_output_feature_dimension(1);
-  auto conv = builder.ConvWithGeneralDimensions(input, filter, {},
-                                                Padding::kValid, dnums);
+  builder.ConvWithGeneralDimensions(input, filter, {}, Padding::kValid, dnums);
 
   Array2D<float> param0(4, 29);
   param0.FillUnique();
@@ -563,7 +562,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   Array2D<float> expected_result(29, 10);
   expected_result.Fill(0);
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(param0)),
                      std::move(*Literal::CreateFromArray(param1))},
                     error_spec_);
@@ -587,7 +586,7 @@ class Convolve1D1WindowTestBase
  protected:
   template <typename T>
   void TestImpl() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     int64 input_feature = GetParam().input_feature;
     int64 output_feature = GetParam().output_feature;
     int64 batch = GetParam().batch;
@@ -724,12 +723,12 @@ INSTANTIATE_TEST_CASE_P(
 #endif
 
 XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   auto input = builder.Parameter(0, input_shape, "input");
   auto filter = builder.Parameter(1, filter_shape, "filter");
-  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
   Array4D<bfloat16> input_data(1, 1, 1, 2);
   input_data.FillWithYX(Array2D<bfloat16>({
@@ -740,7 +739,7 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
       {bfloat16(5), bfloat16(6)},
   }));
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(input_data)),
                      std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
-- 
GitLab


From 15b104a047c1ec8ec07045047d46a300ebc6b2e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 15:45:37 -0700
Subject: [PATCH 0542/1262] Small changes to testing code, plus a new binary to
 check diff from command line.

PiperOrigin-RevId: 192360373
---
 tensorflow/contrib/lite/testing/BUILD         | 13 +++--
 .../contrib/lite/testing/generate_testspec.cc | 49 +++++++++++++------
 .../contrib/lite/testing/generate_testspec.h  |  2 +-
 tensorflow/contrib/lite/testing/tf_driver.cc  |  9 +++-
 .../lite/testing/tflite_diff_example_test.cc  |  7 ++-
 .../contrib/lite/testing/tflite_diff_flags.h  |  4 +-
 .../contrib/lite/testing/tflite_diff_util.cc  | 10 ++--
 .../contrib/lite/testing/tflite_driver.cc     |  1 -
 8 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 9f0ba43252..198984e7e7 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -196,7 +196,6 @@ cc_library(
 
 cc_library(
     name = "util",
-    testonly = 1,
     hdrs = ["util.h"],
 )
 
@@ -251,7 +250,6 @@ cc_test(
 
 cc_library(
     name = "generate_testspec",
-    testonly = 1,
     srcs = ["generate_testspec.cc"],
     hdrs = ["generate_testspec.h"],
     deps = [
@@ -277,7 +275,6 @@ cc_test(
 
 cc_library(
     name = "tflite_diff_util",
-    testonly = 1,
     srcs = ["tflite_diff_util.cc"],
     hdrs = ["tflite_diff_util.h"],
     deps = [
@@ -295,7 +292,6 @@ cc_library(
 
 cc_library(
     name = "tflite_diff_flags",
-    testonly = 1,
     hdrs = ["tflite_diff_flags.h"],
     deps = [
         ":split",
@@ -338,6 +334,15 @@ tf_cc_test(
     ],
 )
 
+cc_binary(
+    name = "tflite_diff",
+    srcs = ["tflite_diff_example_test.cc"],
+    deps = [
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
 tf_cc_test(
     name = "generated_examples_zip_test",
     size = "large",
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index eb3deafb69..6580845af4 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -22,7 +22,22 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-void GenerateTestSpecFromTensorflowModel(
+template <typename T>
+void GenerateCsv(const std::vector<int>& shape, float min, float max,
+                 string* out) {
+  auto random_float = [](int min, int max) {
+    static unsigned int seed;
+    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(shape, random_t);
+  *out = Join(data.data(), data.size(), ",");
+}
+
+bool GenerateTestSpecFromTensorflowModel(
     std::iostream& stream, const string& tensorflow_model_path,
     const string& tflite_model_path, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
@@ -31,12 +46,6 @@ void GenerateTestSpecFromTensorflowModel(
   CHECK_EQ(input_layer.size(), input_layer_type.size());
   CHECK_EQ(input_layer.size(), input_layer_shape.size());
 
-  // Initialize random functions.
-  static unsigned int seed = 0;
-  std::function<float(int)> float_rand = [](int idx) {
-    return static_cast<float>(rand_r(&seed)) / RAND_MAX - 0.5f;
-  };
-
   // Generate inputs.
   std::vector<string> input_values;
   input_values.resize(input_layer.size());
@@ -46,15 +55,25 @@ void GenerateTestSpecFromTensorflowModel(
     auto shape = Split<int>(input_layer_shape[i], ",");
 
     switch (type) {
-      case tensorflow::DT_FLOAT: {
-        const auto& data = GenerateRandomTensor<float>(shape, float_rand);
-        input_values[i] = Join(data.data(), data.size(), ",");
+      case tensorflow::DT_FLOAT:
+        GenerateCsv<float>(shape, -0.5, 0.5, &input_values[i]);
+        break;
+      case tensorflow::DT_UINT8:
+        GenerateCsv<uint8_t>(shape, 0, 255, &input_values[i]);
+        break;
+      case tensorflow::DT_INT32:
+        GenerateCsv<int32_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_INT64:
+        GenerateCsv<int64_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_BOOL:
+        GenerateCsv<int>(shape, 0.01, 1.99, &input_values[i]);
         break;
-      }
       default:
-
-        fprintf(stderr, "Unsupported type %d when generating testspec\n", type);
-        return;
+        fprintf(stderr, "Unsupported type %d (%s) when generating testspec.\n",
+                type, input_layer_type[i].c_str());
+        return false;
     }
   }
 
@@ -82,6 +101,8 @@ void GenerateTestSpecFromTensorflowModel(
     stream << "  output: \"" << runner.ReadOutput(i) << "\"\n";
   }
   stream << "}\n";
+
+  return true;
 }
 
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/contrib/lite/testing/generate_testspec.h
index 3529ee709b..6e31a853c3 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.h
+++ b/tensorflow/contrib/lite/testing/generate_testspec.h
@@ -34,7 +34,7 @@ namespace testing {
 //   input_layer_type: datatypes of input tensors. Example: float
 //   input_layer_shape: shapes of input tensors, separated by comma. example:
 //   1,3,4 output_layer: names of output tensors. Example: output
-void GenerateTestSpecFromTensorflowModel(
+bool GenerateTestSpecFromTensorflowModel(
     std::iostream& stream, const string& tensorflow_model_path,
     const string& tflite_model_path, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 2c253bb198..7b295875aa 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -87,10 +87,9 @@ TfDriver::TfDriver(const std::vector<string>& input_layer,
 
 void TfDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
   std::ifstream model(bin_file_path);
   if (model.fail()) {
-    Invalidate("Failed to find the model");
+    Invalidate("Failed to find the model " + bin_file_path);
     return;
   }
 
@@ -121,6 +120,10 @@ void TfDriver::SetInput(int id, const string& csv_values) {
       FillTensorWithData<int32_t>(&tensor, csv_values);
       break;
     }
+    case tensorflow::DT_UINT8: {
+      FillTensorWithData<uint8_t>(&tensor, csv_values);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
@@ -162,6 +165,8 @@ string TfDriver::ReadOutput(int id) {
       return TensorDataToCsvString<float>(output_tensors_[id]);
     case tensorflow::DT_INT32:
       return TensorDataToCsvString<int32_t>(output_tensors_[id]);
+    case tensorflow::DT_UINT8:
+      return TensorDataToCsvString<uint8_t>(output_tensors_[id]);
     default:
       fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
index 3817e68111..5afa0f800c 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
@@ -19,10 +19,13 @@ limitations under the License.
 int main(int argc, char** argv) {
   ::tflite::testing::DiffOptions options =
       ::tflite::testing::ParseTfliteDiffFlags(&argc, argv);
+  if (options.tensorflow_model.empty()) return 1;
+  int failure_count = 0;
   for (int i = 0; i < 100; i++) {
     if (!tflite::testing::RunDiffTest(options)) {
-      return 1;
+      ++failure_count;
     }
   }
-  return 0;
+  fprintf(stderr, "Num errors: %d\n", failure_count);
+  return failure_count != 0 ? 1 : 0;
 }
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 5f1129d501..706108ed73 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -51,9 +51,11 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
                        "output_1,output_2"),
   };
 
+  bool no_inputs = *argc == 1;
   bool success = tensorflow::Flags::Parse(argc, argv, flags);
-  if (!success || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
+  if (!success || no_inputs || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
   }
 
   return {values.tensorflow_model,
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
index 9ef4e1f66c..f601d3752d 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
@@ -27,13 +27,13 @@ namespace testing {
 
 bool RunDiffTest(const DiffOptions& options) {
   std::stringstream tflite_stream;
-  GenerateTestSpecFromTensorflowModel(
-      tflite_stream, options.tensorflow_model, options.tflite_model,
-      options.input_layer, options.input_layer_type, options.input_layer_shape,
-      options.output_layer);
+  if (!GenerateTestSpecFromTensorflowModel(
+          tflite_stream, options.tensorflow_model, options.tflite_model,
+          options.input_layer, options.input_layer_type,
+          options.input_layer_shape, options.output_layer))
+    return false;
   TfLiteDriver tflite_driver(/*use_nnapi=*/true);
   tflite_driver.LoadModel(options.tflite_model);
-  std::cout << tflite_stream.str();
   return tflite::testing::ParseAndRunTests(&tflite_stream, &tflite_driver);
 }
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index c399f4f2b7..3764bab035 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -143,7 +143,6 @@ void TfLiteDriver::AllocateTensors() {
 
 void TfLiteDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
 
   model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
   if (!model_) {
-- 
GitLab


From 21e1bd6fcd671f41858fca47306e07c76ada7e9a Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Tue, 10 Apr 2018 15:48:15 -0700
Subject: [PATCH 0543/1262] In `get_variable`, nest the choice to use
 `ResourceVariable` under an `init_scope`.

This makes sure that, when executing eagerly, calls to `get_variable` in a
`defun`-compiled function retrieve `ResourceVariable`s instead of `Variables`.

PiperOrigin-RevId: 192360775
---
 tensorflow/python/kernel_tests/BUILD          |   2 +
 .../kernel_tests/variable_scope_test.py       | 118 +++++++++++-------
 tensorflow/python/ops/variable_scope.py       |  12 +-
 3 files changed, 89 insertions(+), 43 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3033b48977..1827a26902 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1029,12 +1029,14 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 86ab9fbb70..51aa671098 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -24,11 +24,13 @@ import threading
 import numpy
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
@@ -118,6 +120,16 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.dtype.base_dtype, dtypes.float16)
 
+  def testGetVariableInGraphNestedUnderEagerContext(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        v = variable_scope.get_variable("should_be_resource", [])
+        self.assertEqual(type(v), resource_variable_ops.ResourceVariable)
+
+      f()
+
   def testEagerVariableStore(self):
     with context.eager_mode():
       store = variable_scope.EagerVariableStore()
@@ -156,6 +168,28 @@ class VariableScopeTest(test.TestCase):
       for v in new_store.variables():
         self.assertEqual(v.numpy(), 1)
 
+  def testEagerVariableStoreWithEagerDefun(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        x = constant_op.constant([[2.0]])
+        d1 = core_layers.Dense(
+            1, name="my_dense", kernel_initializer=init_ops.ones_initializer())
+        _ = d1(x)  # create variables
+        self.assertEqual(len(d1.variables), 2)
+        v1, v2 = d1.variables
+        d2 = core_layers.Dense(
+            1,
+            name="my_dense",
+            kernel_initializer=init_ops.ones_initializer(),
+            _reuse=True)
+        _ = d2(x)
+        self.assertEqual(len(d2.variables), 2)
+        v3, v4 = d2.variables
+        self.assertAllEqual([v1, v2], [v3, v4])
+      f()
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -209,15 +243,15 @@ class VariableScopeTest(test.TestCase):
 
           with variable_scope.variable_scope("not_cached", caching_device=""):
             v2_not_cached = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_not_cached.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_not_cached.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope(
               "not_cached_identity_device",
               caching_device=lambda op: op.device):
             v2_identity_device = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_identity_device.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_identity_device.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope("we_will_do_it_live") as vs_live:
             vs_live.set_caching_device("/job:live")
@@ -484,15 +518,19 @@ class VariableScopeTest(test.TestCase):
 
   def testVarScopeGetOrCreateReuse(self):
     with self.test_session():
+
       def test_value(value):
         x = constant_op.constant(value)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = state_ops.assign(variable_scope.get_variable("var", []), x)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
         self.assertEqual(value, x.eval())
+
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
       test_value(17.)
@@ -551,19 +589,16 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("default") as default:
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer/w:0")
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_1/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_1/w:0")
         with variable_scope.variable_scope(default):
           pass
         # No matter the jump in the middle, unique numbering continues.
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_2/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_2/w:0")
 
   def testVarOpScopeReuse(self):
     with self.test_session():
@@ -935,12 +970,12 @@ class VariableScopeTest(test.TestCase):
   def testGetCollection(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetCollection_a", [])
-      _ = variable_scope.get_variable("testGetCollection_b", [],
-                                      trainable=False)
+      _ = variable_scope.get_variable(
+          "testGetCollection_b", [], trainable=False)
       with variable_scope.variable_scope("testGetCollection_foo_") as scope1:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope1.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -954,8 +989,8 @@ class VariableScopeTest(test.TestCase):
         ])
       with variable_scope.variable_scope("testGetCollection_foo") as scope2:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope2.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -992,22 +1027,22 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope(
           "testGetTrainableVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetTrainableVariables_b", [])
-        _ = variable_scope.get_variable("testGetTrainableVariables_c", [],
-                                        trainable=False)
-        self.assertEqual([v.name
-                          for v in scope.trainable_variables()],
-                         ["testGetTrainableVariables_foo/"
-                          "testGetTrainableVariables_b:0"])
+        _ = variable_scope.get_variable(
+            "testGetTrainableVariables_c", [], trainable=False)
+        self.assertEqual(
+            [v.name for v in scope.trainable_variables()],
+            ["testGetTrainableVariables_foo/"
+             "testGetTrainableVariables_b:0"])
 
   def testGetGlobalVariables(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetGlobalVariables_a", [])
       with variable_scope.variable_scope("testGetGlobalVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetGlobalVariables_b", [])
-        self.assertEqual([v.name
-                          for v in scope.global_variables()],
-                         ["testGetGlobalVariables_foo/"
-                          "testGetGlobalVariables_b:0"])
+        self.assertEqual(
+            [v.name for v in scope.global_variables()],
+            ["testGetGlobalVariables_foo/"
+             "testGetGlobalVariables_b:0"])
 
   def testGetLocalVariables(self):
     with self.test_session():
@@ -1016,10 +1051,8 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("foo") as scope:
         _ = variable_scope.get_variable(
             "b", [], collections=[ops.GraphKeys.LOCAL_VARIABLES])
-        _ = variable_scope.get_variable(
-            "c", [])
-        self.assertEqual([v.name
-                          for v in scope.local_variables()], ["foo/b:0"])
+        _ = variable_scope.get_variable("c", [])
+        self.assertEqual([v.name for v in scope.local_variables()], ["foo/b:0"])
 
   def testGetVariableWithRefDtype(self):
     v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
@@ -1242,10 +1275,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with ops.name_scope("prod_getter"):
         return g_0 * g_1
 
-    with variable_scope.variable_scope(
-        "prod_scope", custom_getter=prod_getter):
-      with variable_scope.variable_scope(
-          "sum_scope", custom_getter=sum_getter):
+    with variable_scope.variable_scope("prod_scope", custom_getter=prod_getter):
+      with variable_scope.variable_scope("sum_scope", custom_getter=sum_getter):
         with variable_scope.variable_scope(
             "inner_sum_scope", custom_getter=sum_getter):
           # take sums of sums of products
@@ -1270,9 +1301,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       np_vars, np_v = sess.run([true_vars, v])
       # take products of sums of products
       self.assertAllClose(
-          np_v,
-          (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3]))
-           + ((np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
+          np_v, (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3])) + (
+              (np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
 
   def testVariableCreator(self):
 
@@ -1368,7 +1398,11 @@ class VariableScopeMultithreadedTest(test.TestCase):
 
     graph = ops.get_default_graph()
     threads = [
-        threading.Thread(target=thread_fn, args=(i, graph,)) for i in range(2)]
+        threading.Thread(target=thread_fn, args=(
+            i,
+            graph,
+        )) for i in range(2)
+    ]
 
     threads[0].start()
     # Allow thread 0 to finish before starting thread 1.
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index e33085ba62..ba213ef884 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -307,6 +307,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # Variable creation and initialization takes place in `init_scope`s;
+        # as such, if an `init_scope` lifts us into the eager context, then we
+        # need to use `ResourceVariable`s.
+        use_resource = True
+
+    # Note that it's fine to reuse eager variables whose initialization was
+    # lifted from a function-building graph into the eager context (that's why
+    # the following clause is not wrapped in an `init_scope`); lifted variables
+    # are tracked by the graph's `VariableStore`.
     if context.executing_eagerly():
       if not self._store_eager_variables and reuse:
         raise RuntimeError(
@@ -315,7 +326,6 @@ class _VariableStore(object):
             " EagerVariableStore for example usage.")
       if self._store_eager_variables:
         reuse = AUTO_REUSE
-      use_resource = True
 
     # If a *_ref type is passed in an error would be triggered further down the
     # stack. We prevent this using base_dtype to get a non-ref version of the
-- 
GitLab


From 4a2420589da03ed8d1af9fa92073d2973d315ee4 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 10 Apr 2018 15:49:03 -0700
Subject: [PATCH 0544/1262] Cleaning up _distributed_apply now the device
 policy is unnecessary

PiperOrigin-RevId: 192360913
---
 tensorflow/python/training/optimizer.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 75665fc284..46a58a9adf 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -689,9 +689,7 @@ class Optimizer(
       # device_policy is set because non-mirrored tensors will be read in
       # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
       # is an example.
-      with ops.name_scope(
-          "update_" + scope_name), context.context().device_policy(
-              context.DEVICE_PLACEMENT_SILENT):
+      with ops.name_scope("update_" + scope_name):
         return p.update_op(self, g)
 
     with ops.name_scope(name, self._name) as name:
@@ -707,11 +705,8 @@ class Optimizer(
         return self._finish(update_ops, "update")
 
       non_slot_devices = distribution.non_slot_devices(var_list)
-      # Device policy is needed because hyperparameter tensors (such as
-      # AdamOptimizer's beta1_t) need to be copied across devices in Eager.
-      with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-        finish_updates = distribution.update_non_slot(
-            non_slot_devices, finish, self, update_ops)
+      finish_updates = distribution.update_non_slot(
+          non_slot_devices, finish, self, update_ops)
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
-- 
GitLab


From 47d72205f3c58d31bfec52eb331e89edc562106c Mon Sep 17 00:00:00 2001
From: Mingxing Tan <tanmingxing@google.com>
Date: Tue, 10 Apr 2018 15:59:39 -0700
Subject: [PATCH 0545/1262] Allow passing allow_custom_ops for toco_convert.

PiperOrigin-RevId: 192362688
---
 tensorflow/contrib/lite/python/lite.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index ed6dd036f9..cf50f9d4d6 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -145,7 +145,8 @@ def toco_convert(input_data,
                  input_format=TENSORFLOW_GRAPHDEF,
                  output_format=TFLITE,
                  quantized_input_stats=None,
-                 drop_control_dependency=True):
+                 drop_control_dependency=True,
+                 allow_custom_ops=None):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -178,9 +179,12 @@ def toco_convert(input_data,
   toco = _toco_flags_pb2.TocoFlags()
   toco.input_format = input_format
   toco.output_format = output_format
+  toco.inference_type = inference_type
   toco.drop_control_dependency = drop_control_dependency
+  if allow_custom_ops is not None:
+    toco.allow_custom_ops = allow_custom_ops
+
   model = _model_flags_pb2.ModelFlags()
-  toco.inference_type = inference_type
   for idx, input_tensor in enumerate(input_tensors):
     if input_tensor.dtype == _dtypes.float32:
       tflite_input_type = FLOAT
-- 
GitLab


From fd75fb4b7740c1a1b82d2252f33c4b22f1f47e0f Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 10 Apr 2018 14:59:23 -0700
Subject: [PATCH 0546/1262] Forcing the symlink creation.

---
 tensorflow/tools/docker/Dockerfile           | 2 +-
 tensorflow/tools/docker/Dockerfile.devel     | 2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu | 2 +-
 tensorflow/tools/docker/Dockerfile.gpu       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 024cb40eb4..78cb4d250e 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index f2415930d5..390d7442c3 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,7 +38,7 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1d19821968..293028d229 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 625321e123..9e1708662e 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
-- 
GitLab


From 9846c26ddd2b163ead837b0e1150ab385f2e20b6 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 10 Apr 2018 16:10:13 -0700
Subject: [PATCH 0547/1262] Updating the sed command for docker parameterized
 build.

---
 tensorflow/tools/docker/parameterized_docker_build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b4fba5b8f5..05de25f2cb 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@@ -306,7 +306,7 @@ else
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
     else
-- 
GitLab


From dc8aa019ba27d65789bcecbc776d1ccc9359c011 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 10 Apr 2018 16:11:38 -0700
Subject: [PATCH 0548/1262] Fix `nn` module RNN namespace issues.

PiperOrigin-RevId: 192364808
---
 tensorflow/python/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index da836aca6f..13f8420a67 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -157,6 +157,9 @@ from tensorflow.python.ops import rnn_cell
 # Required due to `rnn` and `rnn_cell` not being imported in `nn` directly
 # (due to a circular dependency issue: rnn depends on layers).
 nn.dynamic_rnn = rnn.dynamic_rnn
+nn.static_rnn = rnn.static_rnn
+nn.raw_rnn = rnn.raw_rnn
+nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
 nn.rnn_cell = rnn_cell
 
 # Symbols whitelisted for export without documentation.
-- 
GitLab


From 2891e0930eba15c7f27b0ab5732554e6b2c474d5 Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Tue, 10 Apr 2018 16:12:19 -0700
Subject: [PATCH 0549/1262] [XLA] GRPC service definition.

PiperOrigin-RevId: 192364932
---
 tensorflow/compiler/xla/rpc/BUILD             |  79 ++++++
 .../compiler/xla/rpc/grpc_client_test.cc      | 109 ++++++++
 tensorflow/compiler/xla/rpc/grpc_service.cc   | 192 ++++++++++++++
 tensorflow/compiler/xla/rpc/grpc_service.h    | 126 +++++++++
 .../compiler/xla/rpc/grpc_service_main.cc     |  62 +++++
 tensorflow/compiler/xla/rpc/grpc_stub.cc      | 244 ++++++++++++++++++
 tensorflow/compiler/xla/rpc/grpc_stub.h       | 141 ++++++++++
 tensorflow/compiler/xla/rpc/xla_service.proto | 225 ++++++++++++++++
 tensorflow/compiler/xla/xla.bzl               |  13 +-
 .../core/platform/default/build_config.bzl    |   5 +
 10 files changed, 1194 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/compiler/xla/rpc/BUILD
 create mode 100644 tensorflow/compiler/xla/rpc/grpc_client_test.cc
 create mode 100644 tensorflow/compiler/xla/rpc/grpc_service.cc
 create mode 100644 tensorflow/compiler/xla/rpc/grpc_service.h
 create mode 100644 tensorflow/compiler/xla/rpc/grpc_service_main.cc
 create mode 100644 tensorflow/compiler/xla/rpc/grpc_stub.cc
 create mode 100644 tensorflow/compiler/xla/rpc/grpc_stub.h
 create mode 100644 tensorflow/compiler/xla/rpc/xla_service.proto

diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
new file mode 100644
index 0000000000..977f863787
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -0,0 +1,79 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/compiler/xla:xla.bzl",
+    "xla_proto_library",
+    "xla_py_grpc_library",
+)
+
+xla_proto_library(
+    name = "xla_service_proto",
+    srcs = ["xla_service.proto"],
+    use_grpc_plugin = True,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
+    ],
+)
+
+cc_library(
+    name = "grpc_stub",
+    srcs = ["grpc_stub.cc"],
+    hdrs = ["grpc_stub.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla:service_interface",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+    ],
+)
+
+tf_cc_binary(
+    name = "grpc_service_main_cpu",
+    srcs = ["grpc_service_main.cc"],
+    deps = [
+        ":grpc_service",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+tf_cc_test(
+    name = "grpc_client_test",
+    srcs = ["grpc_client_test.cc"],
+    data = [
+        "//tensorflow/compiler/xla/rpc:grpc_service_main_cpu",
+    ],
+    deps = [
+        ":grpc_stub",
+        "//tensorflow/compiler/xla/client",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_service",
+    srcs = ["grpc_service.cc"],
+    hdrs = ["grpc_service.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
new file mode 100644
index 0000000000..b559ee4b5a
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -0,0 +1,109 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Simple C++ test to exercise the GRPC capabilities of XLA.
+//
+// Launches an RPC service in a subprocess and connects to it over a socket
+// using an RPCStub.
+#include <memory>
+#include <vector>
+
+#include "grpc++/create_channel.h"
+#include "grpc++/security/credentials.h"
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class GRPCClientTestBase : public ::testing::Test {
+ protected:
+  GRPCClientTestBase() {
+    string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+    string service_main_path = tensorflow::io::JoinPath(
+        test_srcdir, "compiler/xla/rpc/grpc_service_main_cpu");
+    int port = tensorflow::internal::PickUnusedPortOrDie();
+    subprocess_.SetProgram(
+        service_main_path,
+        {service_main_path, tensorflow::strings::Printf("--port=%d", port)});
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDOUT,
+                                 tensorflow::ACTION_DUPPARENT);
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDERR,
+                                 tensorflow::ACTION_DUPPARENT);
+    CHECK(subprocess_.Start());
+    LOG(INFO) << "Launched subprocess";
+
+    auto channel =
+        ::grpc::CreateChannel(tensorflow::strings::Printf("localhost:%d", port),
+                              ::grpc::InsecureChannelCredentials());
+    channel->WaitForConnected(gpr_time_add(
+        gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN)));
+    LOG(INFO) << "Channel to server is connected on port " << port;
+
+    xla_service_ = grpc::XlaService::NewStub(channel);
+    stub_.reset(new GRPCStub(xla_service_.get()));
+    client_.reset(new Client(stub_.get()));
+  }
+
+  ~GRPCClientTestBase() override {
+    LOG(INFO) << "Killing subprocess";
+    subprocess_.Kill(SIGKILL);
+  }
+
+  tensorflow::SubProcess subprocess_;
+  std::unique_ptr<grpc::XlaService::Stub> xla_service_;
+  std::unique_ptr<GRPCStub> stub_;
+  std::unique_ptr<Client> client_;
+};
+
+TEST_F(GRPCClientTestBase, ItsAlive) {
+  ASSERT_NE(xla_service_, nullptr);
+  ASSERT_NE(stub_, nullptr);
+  ASSERT_NE(client_, nullptr);
+}
+
+TEST_F(GRPCClientTestBase, AxpyTenValues) {
+  ComputationBuilder builder(client_.get(), "axpy_10");
+  auto alpha = builder.ConstantR0<float>(3.1415926535);
+  auto x = builder.ConstantR1<float>(
+      {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
+  auto y = builder.ConstantR1<float>(
+      {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0});
+  auto ax = builder.Mul(alpha, x);
+  auto axpy = builder.Add(ax, y);
+
+  std::vector<float> expected = {
+      1.85840735, -1.85840735, 2.28318531,   -2.28318531,  -6.42477796,
+      6.42477796, 10.56637061, -10.56637061, -14.70796327, 14.70796327};
+  std::unique_ptr<Literal> expected_literal =
+      Literal::CreateR1<float>(expected);
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer(
+                                                   computation, {}, nullptr));
+  LiteralTestUtil::ExpectNear(*expected_literal, *result_literal,
+                              ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
new file mode 100644
index 0000000000..414829d6e7
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
+    perftools::gputools::Platform* platform) {
+  std::unique_ptr<GRPCService> grpc_service(new GRPCService());
+  TF_ASSIGN_OR_RETURN(grpc_service->service_,
+                      ::xla::Service::NewService(platform));
+  return std::move(grpc_service);
+}
+
+::grpc::Status DelegateRPC(std::function<tensorflow::Status()> op) {
+  tensorflow::Status s = op();
+  return tensorflow::ToGrpcStatus(s);
+}
+
+::grpc::Status GRPCService::Computation(::grpc::ServerContext* context,
+                                        const ComputationRequest* arg,
+                                        ComputationResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Computation(arg, result); });
+}
+
+::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context,
+                                     const OpRequest* arg, OpResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Op(arg, result); });
+}
+
+::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context,
+                                       const UnregisterRequest* arg,
+                                       UnregisterResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Unregister(arg, result); });
+}
+
+::grpc::Status GRPCService::DeconstructTuple(::grpc::ServerContext* context,
+                                             const DeconstructTupleRequest* arg,
+                                             DeconstructTupleResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->DeconstructTuple(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context,
+                                           const SetReturnValueRequest* arg,
+                                           SetReturnValueResponse* results) {
+  return DelegateRPC([this, arg, results]() {
+    return service_->SetReturnValue(arg, results);
+  });
+}
+
+::grpc::Status GRPCService::Execute(::grpc::ServerContext* context,
+                                    const ExecuteRequest* arg,
+                                    ExecuteResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Execute(arg, result); });
+}
+
+::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context,
+                                         const ExecuteAsyncRequest* arg,
+                                         ExecuteAsyncResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ExecuteAsync(arg, result); });
+}
+
+::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
+                                             const WaitForExecutionRequest* arg,
+                                             WaitForExecutionResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->WaitForExecution(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToClient(::grpc::ServerContext* context,
+                                             const TransferToClientRequest* arg,
+                                             TransferToClientResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToClient(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToServer(::grpc::ServerContext* context,
+                                             const TransferToServerRequest* arg,
+                                             TransferToServerResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToServer(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToInfeed(::grpc::ServerContext* context,
+                                             const TransferToInfeedRequest* arg,
+                                             TransferToInfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToInfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferFromOutfeed(
+    ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+    TransferFromOutfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferFromOutfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::ResetDevice(::grpc::ServerContext* context,
+                                        const ResetDeviceRequest* arg,
+                                        ResetDeviceResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ResetDevice(arg, result); });
+}
+
+::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* context,
+                                       const IsConstantRequest* arg,
+                                       IsConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->IsConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context,
+                                            const ComputeConstantRequest* arg,
+                                            ComputeConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ComputeConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context,
+                                     const GetShapeRequest* arg,
+                                     GetShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationShape(
+    ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+    GetComputationShapeResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationShape(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context,
+                                          const GetLocalShapeRequest* arg,
+                                          GetLocalShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetLocalShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationStats(
+    ::grpc::ServerContext* context, const ComputationStatsRequest* arg,
+    ComputationStatsResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationStats(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SnapshotComputation(
+    ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+    SnapshotComputationResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->SnapshotComputation(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::LoadComputationSnapshot(
+    ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+    LoadComputationSnapshotResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->LoadComputationSnapshot(arg, result);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
new file mode 100644
index 0000000000..7c9e484517
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+
+#include "grpc++/server_context.h"
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service/service.h"
+
+namespace xla {
+
+// Service implementation which wraps a XLA Service with a GRPC interface.
+class GRPCService : public grpc::XlaService::Service {
+ public:
+  // Factory for creating a RPCService. The parameter platform is the platform
+  // that the service should target. If platform is null then the default
+  // platform is used.
+  static StatusOr<std::unique_ptr<GRPCService>> NewService(
+      perftools::gputools::Platform* platform = nullptr);
+
+  ::grpc::Status Computation(::grpc::ServerContext* context,
+                             const ComputationRequest* arg,
+                             ComputationResponse* result) override;
+
+  ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg,
+                          OpResponse* result) override;
+
+  ::grpc::Status Unregister(::grpc::ServerContext* context,
+                            const UnregisterRequest* arg,
+                            UnregisterResponse* result) override;
+
+  ::grpc::Status DeconstructTuple(::grpc::ServerContext* context,
+                                  const DeconstructTupleRequest* arg,
+                                  DeconstructTupleResponse* result) override;
+
+  ::grpc::Status SetReturnValue(::grpc::ServerContext* context,
+                                const SetReturnValueRequest* arg,
+                                SetReturnValueResponse* results) override;
+
+  ::grpc::Status Execute(::grpc::ServerContext* context,
+                         const ExecuteRequest* arg,
+                         ExecuteResponse* result) override;
+
+  ::grpc::Status ExecuteAsync(::grpc::ServerContext* context,
+                              const ExecuteAsyncRequest* arg,
+                              ExecuteAsyncResponse* result) override;
+
+  ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
+                                  const WaitForExecutionRequest* arg,
+                                  WaitForExecutionResponse* result) override;
+
+  ::grpc::Status TransferToClient(::grpc::ServerContext* context,
+                                  const TransferToClientRequest* arg,
+                                  TransferToClientResponse* result) override;
+
+  ::grpc::Status TransferToServer(::grpc::ServerContext* context,
+                                  const TransferToServerRequest* arg,
+                                  TransferToServerResponse* result) override;
+
+  ::grpc::Status TransferToInfeed(::grpc::ServerContext* context,
+                                  const TransferToInfeedRequest* arg,
+                                  TransferToInfeedResponse* result) override;
+
+  ::grpc::Status TransferFromOutfeed(
+      ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  ::grpc::Status ResetDevice(::grpc::ServerContext* context,
+                             const ResetDeviceRequest* arg,
+                             ResetDeviceResponse* result) override;
+
+  ::grpc::Status IsConstant(::grpc::ServerContext* context,
+                            const IsConstantRequest* arg,
+                            IsConstantResponse* result) override;
+
+  ::grpc::Status ComputeConstant(::grpc::ServerContext* context,
+                                 const ComputeConstantRequest* arg,
+                                 ComputeConstantResponse* result) override;
+
+  ::grpc::Status GetShape(::grpc::ServerContext* context,
+                          const GetShapeRequest* arg,
+                          GetShapeResponse* result) override;
+
+  ::grpc::Status GetComputationShape(
+      ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  ::grpc::Status GetLocalShape(::grpc::ServerContext* context,
+                               const GetLocalShapeRequest* arg,
+                               GetLocalShapeResponse* result) override;
+
+  ::grpc::Status GetComputationStats(::grpc::ServerContext* context,
+                                     const ComputationStatsRequest* arg,
+                                     ComputationStatsResponse* result) override;
+
+  ::grpc::Status SnapshotComputation(
+      ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+      SnapshotComputationResponse* result) override;
+
+  ::grpc::Status LoadComputationSnapshot(
+      ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+      LoadComputationSnapshotResponse* result) override;
+
+ private:
+  std::unique_ptr<::xla::Service> service_;
+
+  GRPCService() {}
+  GRPCService(const GRPCService&) = delete;
+  void operator=(const GRPCService&) = delete;
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
new file mode 100644
index 0000000000..e29908ccec
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic server binary that exposes a xla::Service through a GRPC interface
+// on a configurable port.
+#include "grpc++/security/server_credentials.h"
+#include "grpc++/server.h"
+#include "grpc++/server_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+namespace {
+
+int RealMain(int argc, char** argv) {
+  int32 port = 1685;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("port", &port, "port to listen on"),
+  };
+  string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_values_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parsed_values_ok) {
+    LOG(ERROR) << usage;
+    return 2;
+  }
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  std::unique_ptr<xla::GRPCService> service =
+      xla::GRPCService::NewService().ConsumeValueOrDie();
+
+  ::grpc::ServerBuilder builder;
+  string server_address(tensorflow::strings::Printf("localhost:%d", port));
+
+  builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+
+  LOG(INFO) << "Server listening on " << server_address;
+  server->Wait();
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) { return xla::RealMain(argc, argv); }
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
new file mode 100644
index 0000000000..e1f2b0abe3
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -0,0 +1,244 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+GRPCStub::~GRPCStub() = default;
+
+tensorflow::Status MakeRPC(
+    const std::function<::grpc::Status(::grpc::ClientContext*)>& rpc_method) {
+  ::grpc::ClientContext context;
+  ::grpc::Status s = rpc_method(&context);
+  return tensorflow::FromGrpcStatus(s);
+}
+
+tensorflow::Status GRPCStub::TransferToClient(
+    const TransferToClientRequest* request,
+    TransferToClientResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToClient(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToServer(
+    const TransferToServerRequest* request,
+    TransferToServerResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToServer(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToInfeed(
+    const TransferToInfeedRequest* request,
+    TransferToInfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToInfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferFromOutfeed(
+    const TransferFromOutfeedRequest* request,
+    TransferFromOutfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferFromOutfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
+                                         ResetDeviceResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ResetDevice(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::LoadComputationSnapshot(
+    const LoadComputationSnapshotRequest* request,
+    LoadComputationSnapshotResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->LoadComputationSnapshot(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Execute(const ExecuteRequest* request,
+                                     ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Execute(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
+                                          ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraph(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteParallel(
+    const ExecuteParallelRequest* request, ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* request,
+    ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraphParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
+                                          ExecuteAsyncResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteAsync(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::WaitForExecution(
+    const WaitForExecutionRequest* request,
+    WaitForExecutionResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->WaitForExecution(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::DeconstructTuple(
+    const DeconstructTupleRequest* request,
+    DeconstructTupleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->DeconstructTuple(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationStats(
+    const ComputationStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationGraphStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationShape(
+    const GetComputationShapeRequest* request,
+    GetComputationShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetShape(const GetShapeRequest* request,
+                                      GetShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetDeviceHandles(
+    const GetDeviceHandlesRequest* request,
+    GetDeviceHandlesResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetDeviceHandles(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::CreateChannelHandle(
+    const CreateChannelHandleRequest* request,
+    CreateChannelHandleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateChannelHandle(context, *request, response);
+  });
+}
+
+// Methods used by ComputationBuilder.
+tensorflow::Status GRPCStub::Computation(const ComputationRequest* request,
+                                         ComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Computation(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Op(const OpRequest* request,
+                                OpResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateOp(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
+                                           GetLocalShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetLocalShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::SetReturnValue(
+    const SetReturnValueRequest* request, SetReturnValueResponse* responses) {
+  return MakeRPC([this, request, responses](::grpc::ClientContext* context) {
+    return grpc_stub_->SetReturnValue(context, *request, responses);
+  });
+}
+
+tensorflow::Status GRPCStub::IsConstant(const IsConstantRequest* request,
+                                        IsConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->IsConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstant(
+    const ComputeConstantRequest* request, ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* request,
+    ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstantGraph(context, *request, response);
+  });
+}
+
+// Methods used by Computation.
+tensorflow::Status GRPCStub::SnapshotComputation(
+    const SnapshotComputationRequest* request,
+    SnapshotComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->SnapshotComputation(context, *request, response);
+  });
+}
+
+// Methods used by GlobalData.
+tensorflow::Status GRPCStub::Unregister(const UnregisterRequest* request,
+                                        UnregisterResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Unregister(context, *request, response);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
new file mode 100644
index 0000000000..fd9810d4f1
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service_interface.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+class GRPCStub : public ServiceInterface {
+ public:
+  explicit GRPCStub(grpc::XlaService::Stub* stub) : grpc_stub_(stub) {}
+  ~GRPCStub() override;
+
+  tensorflow::Status TransferToClient(
+      const TransferToClientRequest* arg,
+      TransferToClientResponse* result) override;
+
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override;
+
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override;
+
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override;
+
+  tensorflow::Status LoadComputationSnapshot(
+      const LoadComputationSnapshotRequest* request,
+      LoadComputationSnapshotResponse* result) override;
+
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override;
+
+  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* request,
+                                  ExecuteResponse* response) override;
+
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override;
+
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* request,
+      ExecuteParallelResponse* response) override;
+
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override;
+
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override;
+
+  tensorflow::Status DeconstructTuple(
+      const DeconstructTupleRequest* arg,
+      DeconstructTupleResponse* result) override;
+
+  tensorflow::Status GetComputationStats(
+      const ComputationStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* request,
+      ComputationStatsResponse* response) override;
+
+  tensorflow::Status GetComputationShape(
+      const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  tensorflow::Status GetShape(const GetShapeRequest* arg,
+                              GetShapeResponse* result) override;
+
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override;
+
+  tensorflow::Status CreateChannelHandle(
+      const CreateChannelHandleRequest* arg,
+      CreateChannelHandleResponse* result) override;
+
+  // Methods used by ComputationBuilder.
+  tensorflow::Status Computation(const ComputationRequest* arg,
+                                 ComputationResponse* result) override;
+
+  tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override;
+  tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
+                                   GetLocalShapeResponse* result) override;
+
+  tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg,
+                                    SetReturnValueResponse* results) override;
+
+  tensorflow::Status IsConstant(const IsConstantRequest* arg,
+                                IsConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
+                                     ComputeConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
+
+  // Methods used by Computation.
+  tensorflow::Status SnapshotComputation(
+      const SnapshotComputationRequest* ag,
+      SnapshotComputationResponse* result) override;
+
+  // Methods used by GlobalData.
+  tensorflow::Status Unregister(const UnregisterRequest* arg,
+                                UnregisterResponse* result) override;
+
+  grpc::XlaService::Stub* service() { return grpc_stub_; }
+
+ private:
+  grpc::XlaService::Stub* grpc_stub_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GRPCStub);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
new file mode 100644
index 0000000000..c47164ee1b
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -0,0 +1,225 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA service API.
+//
+// Users 1) build up computations and 2) create allocations via this API.
+// Computations are composed of data flowing between arbitrarily-sized
+// vector-oriented operations.
+//
+// Users build up computations using a ComputationHandle, and talk about
+// allocations using GlobalDataHandles.
+//
+// There are currently no checkpointing capabilities or distribution/replication
+// guarantees. The service runs on a single machine (e.g. one task) and that is
+// its failure domain.
+//
+// Canonical example of "alpha * X + Y":
+// * Make a computation.
+// * Add alpha and X and Y as parameters.
+// * Request the multiplication of alpha and X.
+// * Request the addition of that result and Y.
+//
+// Then, pass the computation and appropriately shaped inputs to the XLA
+// service's Execute method, which provides a result as a GlobalDataHandle.
+//
+// All data in XLA computations are conceptually immutable.
+//
+// Note: this API is subject to change / refinement over time -- use the
+// provided client libraries to insulate code from changes to this service API.
+
+syntax = "proto3";
+
+import "tensorflow/compiler/xla/xla.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+package xla;
+
+service XlaService {
+  /////////////////////////
+  // Global data requests
+
+  // Unregisters a global allocation.
+  //
+  // If the handle given is not currently allocated, a NOT_FOUND status is
+  // returned.
+  rpc Unregister(UnregisterRequest) returns (UnregisterResponse) {
+  }
+
+  // Deconstructs a tuple. Returns a newly created GlobalDataHandle for each
+  // element in the tuple.
+  rpc DeconstructTuple(DeconstructTupleRequest)
+      returns (DeconstructTupleResponse) {
+  }
+
+  // Unpack requests that a global data handle, with a tuple shape, has global
+  // data handles created for each of its constituent members. This is the
+  // equivalent of the "destructuring assignment" present in various programming
+  // languages.
+  rpc Unpack(UnpackRequest) returns (UnpackResponse) {
+  }
+
+  // Requests the shape of the referenced global data.
+  rpc GetShape(GetShapeRequest) returns (GetShapeResponse) {
+  }
+
+  // Requests the program shape of the referenced computation.
+  rpc GetComputationShape(GetComputationShapeRequest)
+      returns (GetComputationShapeResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  rpc GetComputationStats(ComputationStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc GetComputationGraphStats(ComputationGraphStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Loads a variable number of values with a given element type from ColumnIO.
+  rpc LoadData(LoadDataRequest) returns (LoadDataResponse) {
+  }
+
+  // Transfers the given global data to the client in the form of a Literal.
+  rpc TransferToClient(TransferToClientRequest)
+      returns (TransferToClientResponse) {
+  }
+
+  // Transfers the given literal to the server to be stored in a global
+  // allocation, which is returned.
+  rpc TransferToServer(TransferToServerRequest)
+      returns (TransferToServerResponse) {
+  }
+
+  // Transfers the given literal to the Infeed buffer of the device.
+  rpc TransferToInfeed(TransferToInfeedRequest)
+      returns (TransferToInfeedResponse) {
+  }
+
+  // Transferred literal from the Outfeed buffer of the device.
+  rpc TransferFromOutfeed(TransferFromOutfeedRequest)
+      returns (TransferFromOutfeedResponse) {
+  }
+
+  // Resets the device, clearing all existing state on the device.
+  rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) {
+  }
+
+  // Tests if an expression is a compile-time constant.
+  rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) {
+  }
+
+  // Computes the value of a constant expression.
+  rpc ComputeConstant(ComputeConstantRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Computes the value of a constant expression. The request contains the
+  // computation graph for the constant expression.
+  rpc ComputeConstantGraph(ComputeConstantGraphRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Retrieves the inferred shape for a value within a computation.
+  rpc GetLocalShape(GetLocalShapeRequest) returns (GetLocalShapeResponse) {
+  }
+
+  // Requests one or more device handles from the target. The returned device
+  // handles can be used to specify the device on which to execute computations
+  // or transfer data.
+  rpc GetDeviceHandles(GetDeviceHandlesRequest)
+      returns (GetDeviceHandlesResponse) {
+  }
+
+  // Creates a channel handle that can be used to transfer data between
+  // two computations via a pair of Send and Recv instructions.
+  rpc CreateChannelHandle(CreateChannelHandleRequest)
+      returns (CreateChannelHandleResponse) {
+  }
+
+  // Requests that the referenced computation be specialized for the provided
+  // arguments for subsequent execution. This permits things such as value
+  // specialization.
+  rpc Specialize(SpecializeRequest) returns (SpecializeResponse) {
+  }
+
+  // Modifies the provided computation so that subsequent executions
+  // will compute the provided ComputationDataHandle, rather than the
+  // last expression enqueued on that Computation.
+  rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) {
+  }
+
+  // Computation creates a new computation with the given name.
+  // A unique ComputationHandle is returned.
+  rpc Computation(ComputationRequest) returns (ComputationResponse) {
+  }
+
+  // Adds a new op to a computation.
+  rpc CreateOp(OpRequest) returns (OpResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns global data output and execution timing.
+  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. The request contains the whole computation graph.
+  // Returns global data output and execution timing.
+  rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  rpc ExecuteParallel(ExecuteParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc ExecuteGraphParallel(ExecuteGraphParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns a handle to the execution.
+  rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) {
+  }
+
+  // Waits until the given execution (aysnchronously launched) is complete, and
+  // returns the global data output.
+  rpc WaitForExecution(WaitForExecutionRequest)
+      returns (WaitForExecutionResponse) {
+  }
+
+  // Serializes a computation to proto form, so it can be loaded via
+  // LoadComputationSnapshot.
+  rpc SnapshotComputation(SnapshotComputationRequest)
+      returns (SnapshotComputationResponse) {
+  }
+
+  // Loads a computation from a captured snapshot.
+  rpc LoadComputationSnapshot(LoadComputationSnapshotRequest)
+      returns (LoadComputationSnapshotResponse) {
+  }
+}
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 6b136d333b..1439f1bcc5 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -6,7 +6,9 @@ load("//tensorflow/core:platform/default/build_config_root.bzl",
      "if_static")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
+def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
+  if kwargs.get('use_grpc_plugin'):
+    kwargs['use_grpc_namespace'] = True
   cc_proto_library(name=name,
                    srcs=srcs,
                    deps=deps,
@@ -16,6 +18,13 @@ def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
                    ),
                    protoc="@protobuf_archive//:protoc",
                    testonly=testonly,
-                   visibility=visibility,)
+                   visibility=visibility,
+                   **kwargs)
+
+def xla_py_grpc_library(**kwargs):
+  # Note: we don't currently define any special targets for Python GRPC in OSS.
+  _ignore = kwargs
+  pass
+
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e01e076bcf..4cfa25bf66 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -122,6 +122,7 @@ def cc_proto_library(
     protoc="@protobuf_archive//:protoc",
     internal_bootstrap_hack=False,
     use_grpc_plugin=False,
+    use_grpc_namespace=False,
     default_header=False,
     **kargs):
   """Bazel rule to create a C++ protobuf library from proto source files.
@@ -169,8 +170,11 @@ def cc_proto_library(
     return
 
   grpc_cpp_plugin = None
+  plugin_options = []
   if use_grpc_plugin:
     grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+    if use_grpc_namespace:
+      plugin_options = ["services_namespace=grpc"]
 
   gen_srcs = _proto_cc_srcs(srcs, use_grpc_plugin)
   gen_hdrs = _proto_cc_hdrs(srcs, use_grpc_plugin)
@@ -184,6 +188,7 @@ def cc_proto_library(
       protoc=protoc,
       plugin=grpc_cpp_plugin,
       plugin_language="grpc",
+      plugin_options=plugin_options,
       gen_cc=1,
       outs=outs,
       visibility=["//visibility:public"],
-- 
GitLab


From bb1dec54a63ad0a5f43208fa7617f090bc5be2e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 16:26:05 -0700
Subject: [PATCH 0550/1262] Supporting FakeQuant num_bits and getting the fake
 quant op matching tensorflow.

PiperOrigin-RevId: 192367307
---
 .../contrib/lite/kernels/internal/BUILD       |  2 +
 .../internal/optimized/optimized_ops.h        | 79 +++++--------------
 .../kernels/internal/quantization_util.cc     | 21 +++++
 .../lite/kernels/internal/quantization_util.h |  8 ++
 .../internal/reference/reference_ops.h        | 79 +++++--------------
 .../contrib/lite/toco/export_tensorflow.cc    |  3 +
 .../contrib/lite/toco/import_tensorflow.cc    |  3 +
 tensorflow/contrib/lite/toco/model.h          |  4 +-
 .../contrib/lite/toco/tflite/operator.cc      |  3 +
 .../contrib/lite/toco/tflite/operator_test.cc |  2 +
 10 files changed, 81 insertions(+), 123 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 167c0f1fde..32a0acf888 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -154,6 +154,7 @@ cc_library(
     ],
     copts = tflite_copts(),
     deps = [
+        ":quantization_util",
         ":types",
         ":round",
         "//third_party/eigen3",
@@ -238,6 +239,7 @@ cc_library(
         "reference/reference_ops.h",
     ],
     deps = [
+        ":quantization_util",
         ":round",
         ":types",
         "//third_party/eigen3",
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 22c0504ad2..5f60b2d6a0 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -4750,66 +4751,23 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("FakeQuant");
 
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
 
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
@@ -4820,11 +4778,12 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
       for (int x = 0; x < width; ++x) {
         for (int c = 0; c < depth; ++c) {
           const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
+          const float clamped =
+              std::min(nudged_max, std::max(nudged_min, src_val));
+          const float clamped_shifted = clamped - nudged_min;
+          const float dst_val =
+              TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+              nudged_min;
           output_data[Offset(output_dims, c, x, y, b)] = dst_val;
         }
       }
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index dd86313726..b0951aac8c 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -104,4 +104,25 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   return static_cast<int>(std::floor(max_input_rescaled));
 }
 
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* scale) {
+  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  *scale = (max - min) / (quant_max_float - quant_min_float);
+  const float zero_point_from_min = quant_min_float - min / *scale;
+  uint16 nudged_zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    nudged_zero_point = static_cast<uint16>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    nudged_zero_point = static_cast<uint16>(quant_max);
+  } else {
+    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
+  }
+  *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
+  *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index 1f6f5d3b15..4a217515f1 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -209,6 +209,14 @@ void PreprocessLogSoftmaxScaling(double beta, double input_scale,
 // Softmax.
 int CalculateInputRadius(int input_integer_bits, int input_left_shift);
 
+// Nudges a min/max quantization range to ensure zero is zero.
+// Gymnastics with nudged zero point is to ensure that real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+// Outputs nudged_min, nudged_max, nudged_scale.
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max, float* scale);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 93b4eb5504..0912f5928c 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -2697,74 +2698,30 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
 
   const int flat_size = MatchingFlatSize(output_dims, input_dims);
-
   for (int i = 0; i < flat_size; i++) {
     const float src_val = input_data[i];
-    const float unclamped_quantized_val =
-        TfLiteRound(zero_point + src_val / scale);
-    const float quantized_val =
-        std::min(qmax_float, std::max(qmin_float, unclamped_quantized_val));
-    const float dst_val = scale * (quantized_val - zero_point);
+    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
+    const float clamped_shifted = clamped - nudged_min;
+    const float dst_val =
+        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+        nudged_min;
     output_data[i] = dst_val;
   }
 }
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 4a85f3c5a4..99ccfaea64 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -883,6 +883,9 @@ void ConvertFakeQuantOperator(const FakeQuantOperator& src_op,
   CHECK(src_op.minmax);
   (*fakequant_op->mutable_attr())["min"].set_f(src_op.minmax->min);
   (*fakequant_op->mutable_attr())["max"].set_f(src_op.minmax->max);
+  if (src_op.num_bits) {
+    (*fakequant_op->mutable_attr())["num_bits"].set_i(src_op.num_bits);
+  }
 }
 
 void ConvertMaxPoolOperator(const MaxPoolOperator& src_op,
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 6b62eeb638..155d890c9f 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -694,6 +694,8 @@ void ConvertFakeQuantWithMinMaxArgs(
   minmax.min = GetFloatAttr(node, "min");
   minmax.max = GetFloatAttr(node, "max");
   op->outputs.push_back(node.name());
+  // tf.fake_quant_with_min_max_args num_bits defaults to 8.
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -711,6 +713,7 @@ void ConvertFakeQuantWithMinMaxVars(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 54c3a59506..616d53ae3e 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -724,8 +724,7 @@ struct L2PoolOperator : Operator {
 // The expected [min, max] range of values in a given array.
 // Used for quantization only.
 // This information typically comes from special nodes found in quantized
-// models,
-// see FakeQuantOperator, and is used during quantization to resolve
+// models, see FakeQuantOperator, and is used during quantization to resolve
 // actual quantization parameters (see QuantizationParams).
 struct MinMax {
   double min = 0.;
@@ -753,6 +752,7 @@ inline bool operator==(const MinMax& m1, const MinMax& m2) {
 struct FakeQuantOperator : Operator {
   FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
   std::unique_ptr<MinMax> minmax;
+  int num_bits = 8;
 };
 
 // Element-wise division operator.
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index e015108120..0e057fd252 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -260,12 +260,15 @@ class FakeQuant : public CustomOperator<FakeQuantOperator> {
                     flexbuffers::Builder* fbb) const override {
     fbb->Float("min", op.minmax->min);
     fbb->Float("max", op.minmax->max);
+    fbb->Int("num_bits", op.num_bits);
   }
   void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
     auto* minmax = new MinMax;
     minmax->min = m["min"].AsFloat();
     minmax->max = m["max"].AsFloat();
     op->minmax.reset(minmax);
+    const auto& num_bits = m["num_bits"];
+    op->num_bits = num_bits.IsInt() ? num_bits.AsInt32() : 8;
   }
 };
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 24ba71e459..a947630e28 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -165,10 +165,12 @@ TEST_F(OperatorTest, CustomFakeQuant) {
   minmax->min = -10;
   minmax->max = 200;
   op.minmax.reset(minmax);
+  op.num_bits = 16;
   auto output_toco_op = SerializeAndDeserialize(
       GetOperator("FAKE_QUANT", OperatorType::kFakeQuant), op);
   EXPECT_EQ(op.minmax->min, output_toco_op->minmax->min);
   EXPECT_EQ(op.minmax->max, output_toco_op->minmax->max);
+  EXPECT_EQ(op.num_bits, output_toco_op->num_bits);
 }
 
 TEST_F(OperatorTest, CustomFullyConnected) {
-- 
GitLab


From c2f265493879a86b3ce200f9af56747bfb9dd653 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 16:31:46 -0700
Subject: [PATCH 0551/1262] Update programmers guide

PiperOrigin-RevId: 192368335
---
 tensorflow/docs_src/programmers_guide/faq.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 392ac6f7f1..51c1a1e032 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -121,7 +121,7 @@ dimensions:
   devices, which makes it possible to speed up
   @{$deep_cnn$CIFAR-10 training using multiple GPUs}.
 * The Session API allows multiple concurrent steps (i.e. calls to
-  @{tf.Session.run} in parallel. This
+  @{tf.Session.run} in parallel). This
   enables the runtime to get higher throughput, if a single step does not use
   all of the resources in your computer.
 
-- 
GitLab


From 16eec071ea0a83dc5303758ac0e528f59337a1ce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 16:32:05 -0700
Subject: [PATCH 0552/1262] [XLA] Redesign: implement and test ReduceWindow.

PiperOrigin-RevId: 192368401
---
 .../xla/client/xla_client/xla_builder.cc      | 36 +++++++++++++-
 tensorflow/compiler/xla/tests/BUILD           |  3 +-
 .../compiler/xla/tests/reduce_window_test.cc  | 48 +++++++++----------
 3 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 9e4b9ccd25..c869eb2ec5 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1425,7 +1425,21 @@ XlaOp XlaBuilder::ReduceWindow(
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_RETURN_IF_ERROR(
+        ValidatePaddingValues(AsInt64Slice(operand_shape.dimensions()),
+                              window_dimensions, window_strides));
+
+    std::vector<std::pair<int64, int64>> padding_values =
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding);
+    return ReduceWindowWithGeneralPadding(operand, init_value, computation,
+                                          window_dimensions, window_strides,
+                                          padding_values);
+  });
 }
 
 XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
@@ -1434,7 +1448,25 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
+                                               instr.window(), to_apply_shape));
+
+    AddCalledComputation(computation, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kReduceWindow,
+                          {operand, init_value});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 67c53c6ac0..a615acdbb8 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1091,10 +1091,11 @@ xla_test_library(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 8dd24f1237..8ef980ebd9 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -21,10 +21,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -63,11 +64,9 @@ class ReduceWindowTestBase : public ClientLibraryTestBase {
 class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                          public ReduceWindowTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {
-    set_use_bfloat16(GetParam());
-  }
+  ReduceWindowTest() : builder_(TestName()) { set_use_bfloat16(GetParam()); }
 
-  void ReduceWindowAdd(const ComputationDataHandle& input,
+  void ReduceWindowAdd(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -78,16 +77,17 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMax(const ComputationDataHandle& input,
+  void ReduceWindowMax(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
     auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
-    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
-                          window_strides, padding);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMaxComputation(FloatType(), &builder_),
+                          window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMin(const ComputationDataHandle& input,
+  void ReduceWindowMin(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -97,7 +97,7 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
 };
 
 TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
@@ -310,7 +310,7 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   auto rhs = b->Parameter(1, scalar, "rhs");
   b->Min(b->Add(lhs, rhs),
          CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
-  Computation reduce_fn = b->BuildAndNoteError();
+  XlaComputation reduce_fn = b->BuildAndNoteError();
 
   builder_.ReduceWindow(
       input,
@@ -338,7 +338,7 @@ TEST_P(ReduceWindowTest, R4UnitWindow) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -406,7 +406,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -428,7 +428,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -450,7 +450,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -551,7 +551,7 @@ TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
 
 TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input = builder_.Broadcast(
+  XlaOp input = builder_.Broadcast(
       CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
@@ -610,7 +610,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
   R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
@@ -621,7 +621,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
     std::unique_ptr<Literal> input_literal =
         Literal::CreateR4FromArray4DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
 
@@ -962,7 +962,7 @@ class R3ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R3ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd);
 
@@ -973,7 +973,7 @@ TEST_P(R3ReduceWindowTest, Add) {
       Literal::CreateR3FromArray3DWithLayout(
           input, LayoutUtil::MakeLayout(param.layout));
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
   auto init_value =
@@ -1100,7 +1100,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
   R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
     CHECK(param.reducer == kAdd);
 
@@ -1110,7 +1110,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
         Literal::CreateR2FromArray2DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
     std::vector<std::pair<int64, int64>> padding(2);
@@ -1298,7 +1298,7 @@ class R1ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R1ReduceWindowTest, DoIt) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
@@ -1307,7 +1307,7 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
 
-- 
GitLab


From f5c2e5d968d371c0855c6d7b2cc4f050615d4bc4 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 10 Apr 2018 16:35:58 -0700
Subject: [PATCH 0553/1262] Fix issue with gradients of resource variables in
 cond.

PiperOrigin-RevId: 192369091
---
 tensorflow/python/ops/control_flow_grad.py |  6 ++++++
 tensorflow/python/ops/gradients_test.py    | 25 ++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 45955554ca..6a551deb5b 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
@@ -74,6 +75,11 @@ def _SwitchGrad(op, *grad):
     # At this point, we have created zero_grad guarded by the right switch.
     # Unfortunately, we may still get None here for not trainable data types.
     if zero_grad is None:
+      # For resource variables we get None always on the other branch, so bypass
+      # this.
+      if op.inputs[0].dtype == dtypes.resource:
+        return merge(
+            [grad[op_ctxt.branch]] * 2, name="cond_resource_grad")[0], None
       return None, None
     return merge(grad, name="cond_grad")[0], None
   else:
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index c94f1396b2..0603d3b670 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
@@ -810,5 +811,29 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
       gradients.gradients(y, x)
 
 
+class ResourceCondTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    gamma = resource_variable_ops.ResourceVariable(
+        np.random.random((3,)),
+        dtype="float32", name="gamma")
+
+    inputs = array_ops.ones(shape=(3,), dtype="float32")
+
+    def TestFn():
+      output = inputs + gamma
+      return output
+
+    training = array_ops.placeholder_with_default(True, shape=())
+    output = control_flow_ops.cond(
+        training, TestFn, lambda: inputs)
+
+    loss = output
+
+    grads = gradients.gradients(
+        loss, [gamma])
+    self.assertTrue(None not in grads)
+
+
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From 69342d7a6c61c4aa2ca42ac010ed0e66f0b89755 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 10 Apr 2018 16:10:13 -0700
Subject: [PATCH 0554/1262] Updating the sed command for docker parameterized
 build.

---
 tensorflow/tools/docker/parameterized_docker_build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b4fba5b8f5..05de25f2cb 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@@ -306,7 +306,7 @@ else
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
     else
-- 
GitLab


From f83843a4b8dde5e9306c2b91da8ccbd438a7265f Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Tue, 10 Apr 2018 16:45:19 -0700
Subject: [PATCH 0555/1262] Add a thread-safe producer-consumer queue.

PiperOrigin-RevId: 192370670
---
 tensorflow/compiler/jit/BUILD                 |  19 +++
 .../compiler/jit/producer_consumer_queue.h    | 132 +++++++++++++++++
 .../jit/producer_consumer_queue_test.cc       | 139 ++++++++++++++++++
 3 files changed, 290 insertions(+)
 create mode 100644 tensorflow/compiler/jit/producer_consumer_queue.h
 create mode 100644 tensorflow/compiler/jit/producer_consumer_queue_test.cc

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index a492fc6b9b..4cefc08645 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -318,6 +318,25 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
+cc_library(
+    name = "producer_consumer_queue",
+    hdrs = ["producer_consumer_queue.h"],
+    deps = ["//tensorflow/core:lib"],
+)
+
+tf_cc_test(
+    name = "producer_consumer_queue_test",
+    size = "small",
+    srcs = ["producer_consumer_queue_test.cc"],
+    deps = [
+        ":producer_consumer_queue",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "graph_to_functiondef_test",
     size = "small",
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
new file mode 100644
index 0000000000..7c8c04152d
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue.h
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+
+#include <deque>
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// A thread-safe, first-in-first-out queue.
+template <typename T>
+class ProducerConsumerQueue {
+ public:
+  ProducerConsumerQueue()
+      : capacity_(std::numeric_limits<std::size_t>::max()) {}
+  ~ProducerConsumerQueue() = default;
+
+  // Wait until the queue is non-full, then append a copy of v.
+  void Put(const T &v);
+
+  // Wait until the queue is non-empty, then remove and return the head value.
+  T Get();
+
+  // If the queue is non-empty, remove the head value, placing it in *pv, and
+  // return true; otherwise return false.
+  bool TryGet(T *pv);
+
+  // Set the capacity of the queue; the queue is full whenever count() >=
+  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
+  void set_capacity(std::size_t size);
+
+  // Return the capacity of the queue.
+  std::size_t capacity() const;
+
+  // Return the number of elements in the queue.
+  std::size_t count() const;
+
+  // Implementation details follow.  Clients should ignore.
+ private:
+  mutable tensorflow::mutex mu_;  // protects all fields below
+  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
+  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
+  std::size_t capacity_ GUARDED_BY(mu_);
+  std::deque<T> queue_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
+};
+
+// ------------------------------------------------------
+// Implementation details follow.  Clients should ignore.
+
+// Wait until the queue is non-full, then append a copy of v.
+template <typename T>
+void ProducerConsumerQueue<T>::Put(const T &v) {
+  mutex_lock lock(mu_);
+  while (queue_.size() >= capacity_) {
+    non_full_.wait(lock);
+  }
+  queue_.push_back(v);
+  non_empty_.notify_one();
+}
+
+// Wait until the queue is non-empty, then remove and return the head value.
+template <typename T>
+T ProducerConsumerQueue<T>::Get() {
+  mutex_lock lock(mu_);
+  while (queue_.empty()) {
+    non_empty_.wait(lock);
+  }
+  non_full_.notify_one();
+  T result_value = queue_.front();
+  queue_.pop_front();
+  return result_value;
+}
+
+// If the queue is non-empty, remove the head value, placing it in *pv, and
+// return true; otherwise return false.
+template <typename T>
+bool ProducerConsumerQueue<T>::TryGet(T *pv) {
+  mutex_lock lock(mu_);
+  bool got_element = !queue_.empty();
+  if (got_element) {
+    non_full_.notify_one();
+    *pv = queue_.front();
+    queue_.pop_front();
+  }
+  return got_element;
+}
+
+// Set the capacity of the queue; the queue is full whenever count() >=
+// capacity().  The initial value is the maximum size_t.  Requires size > 0.
+template <typename T>
+void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
+  mutex_lock lock(mu_);
+  CHECK_NE(size, 0);
+  capacity_ = size;
+  non_full_.notify_all();
+}
+
+// Return the capacity of the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::capacity() const {
+  mutex_lock lock(mu_);
+  std::size_t max_elements = capacity_;
+  return max_elements;
+}
+
+// Return the number of elements in the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::count() const {
+  mutex_lock lock(mu_);
+  std::size_t num_elements = queue_.size();
+  return num_elements;
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
new file mode 100644
index 0000000000..f61260c6e5
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/producer_consumer_queue.h"
+
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+typedef ProducerConsumerQueue<int> IntQueue;
+
+// Insert integers between low inclusive and high exclusive into q.
+void PushRange(IntQueue *q, int low, int high) {
+  while (low != high) {
+    q->Put(low);
+    VLOG(2) << "Pushing " << low;
+    ++low;
+  }
+}
+
+// Push the numbers between 0 and 999 inclusive from several threads in the
+// pool.
+void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
+  VLOG(1) << "Adding 20-36";
+  pool->Schedule([queue] { PushRange(queue, 20, 36); });
+  VLOG(1) << "Adding 7-20";
+  pool->Schedule([queue] { PushRange(queue, 7, 20); });
+  VLOG(1) << "Adding 36-501";
+  pool->Schedule([queue] { PushRange(queue, 36, 501); });
+  VLOG(1) << "Adding 501-1000";
+  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
+  VLOG(1) << "Adding 0-5";
+  pool->Schedule([queue] { PushRange(queue, 0, 5); });
+  VLOG(1) << "Adding 5-7";
+  pool->Schedule([queue] { PushRange(queue, 5, 7); });
+}
+
+// Pop elements from queue using Get().  Make sure that exactly <high> elements
+// were present and their values are all integers between 0 and high-1
+// inclusive.
+void GetRange(IntQueue *queue, int high) {
+  VLOG(1) << "Testing Wait";
+  std::vector<int> results;
+  for (int i = 0; i != high; ++i) {
+    int r = queue->Get();
+    VLOG(2) << "Waited and got " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK(results[i] == i);
+  }
+}
+
+// Pop elements from queue using TryGet().  Make sure that exactly <high>
+// elements were present and their values are all integers between 0 and high-1
+// inclusive.
+void TryGetRange(IntQueue *queue, int high) {
+  std::vector<int> results;
+  // Give up if we don't get all the elements back from the queue
+  // in 10 seconds.
+  int timeout = 10;
+  int r;
+  for (int i = 0; i != high; ++i) {
+    while (!queue->TryGet(&r)) {
+      if (!timeout--) {
+        LOG(FATAL) << "Can't find all elements in the queue";
+      }
+      VLOG(1) << "Sleeping for a second...";
+      sleep(1);
+    }
+    VLOG(2) << "Popped " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  CHECK(!queue->TryGet(&r));
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK_EQ(i, results[i]);
+  }
+}
+
+const int kNumThreads = 15;
+
+TEST(ProducerConsumerQueue, GetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  GetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, TryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  TryGetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, ParallelGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { GetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+TEST(ProducerConsumerQueue, ParallelTryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From 408f524761e50b98159ad8ff3b18a0f6af08d867 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 10 Apr 2018 16:47:08 -0700
Subject: [PATCH 0556/1262] Add types to error message in case of mismatch.
 NFC.

PiperOrigin-RevId: 192370979
---
 tensorflow/core/framework/tensor.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index e2111d6038..d5a45c73c3 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -610,11 +610,15 @@ bool Tensor::IsInitialized() const {
 }
 
 void Tensor::CheckType(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
 }
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
   CHECK(IsAligned()) << "CheckTypeAndIsAligned";
 }
 
-- 
GitLab


From ffc651af58ebacdf3ddbe9537efda694c71a64f3 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Tue, 10 Apr 2018 17:37:53 -0700
Subject: [PATCH 0557/1262] Update LogToSTDErr for TF Lite usage

PiperOrigin-RevId: 192379483
---
 tensorflow/contrib/lite/kernels/arg_max_test.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/arg_max_test.cc b/tensorflow/contrib/lite/kernels/arg_max_test.cc
index f4e1da3a6e..31b15fe19a 100644
--- a/tensorflow/contrib/lite/kernels/arg_max_test.cc
+++ b/tensorflow/contrib/lite/kernels/arg_max_test.cc
@@ -100,8 +100,7 @@ TEST(ArgMaxOpTest, GetMaxArgOutput64) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: FLAGS_logtostderr = true;
-  FLAGS_logtostderr = true;
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
-- 
GitLab


From 0c219524a9b2ad82dfac1659d0957c0475d0cc25 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 17:41:56 -0700
Subject: [PATCH 0558/1262] [XLA] Redesign: implement and test
 SelectAndScatter.

PiperOrigin-RevId: 192380121
---
 .../xla/client/xla_client/xla_builder.cc      | 34 +++++++++++++++++--
 tensorflow/compiler/xla/tests/BUILD           |  4 +--
 .../xla/tests/select_and_scatter_test.cc      | 29 ++++++++--------
 3 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index c869eb2ec5..b96421128e 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1499,7 +1499,14 @@ XlaOp XlaBuilder::SelectAndScatter(
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
     const XlaOp& source, const XlaOp& init_value,
     const XlaComputation& scatter) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    return SelectAndScatterWithGeneralPadding(
+        operand, select, window_dimensions, window_strides,
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding),
+        source, init_value, scatter);
+  });
 }
 
 XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
@@ -1509,7 +1516,30 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const XlaOp& source, const XlaOp& init_value,
     const XlaComputation& scatter) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& source_shape, GetShape(source));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& select_shape,
+                        select.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape,
+                        scatter.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferSelectAndScatterShape(
+                            operand_shape, select_shape, instr.window(),
+                            source_shape, init_shape, scatter_shape));
+
+    AddCalledComputation(select, &instr);
+    AddCalledComputation(scatter, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kSelectAndScatter,
+                          {operand, source, init_value});
+  });
 }
 
 XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index a615acdbb8..2a2ef229ed 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1132,11 +1132,11 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index d268fdcace..7015e5a6a3 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -50,7 +50,7 @@ class SelectAndScatterTest
     : public ClientLibraryTestBase,
       public ::testing::WithParamInterface<SelectAndScatterTestParam> {
  public:
-  SelectAndScatterTest() : builder_(client_, TestName()) {
+  SelectAndScatterTest() : builder_(TestName()) {
     // Create S32 GE and ADD computations for select and scatter respectively.
     ge_s32_ = CreateScalarGeComputation(S32, &builder_);
     add_s32_ = CreateScalarAddComputation(S32, &builder_);
@@ -60,13 +60,13 @@ class SelectAndScatterTest
     min_f32_ = CreateScalarMinComputation(F32, &builder_);
   }
 
-  ComputationBuilder builder_;
-  Computation ge_s32_;
-  Computation add_s32_;
-  Computation ge_f32_;
-  Computation add_f32_;
-  Computation max_f32_;
-  Computation min_f32_;
+  XlaBuilder builder_;
+  XlaComputation ge_s32_;
+  XlaComputation add_s32_;
+  XlaComputation ge_f32_;
+  XlaComputation add_f32_;
+  XlaComputation max_f32_;
+  XlaComputation min_f32_;
 };
 
 XLA_TEST_P(SelectAndScatterTest, ParamTest) {
@@ -80,12 +80,11 @@ XLA_TEST_P(SelectAndScatterTest, ParamTest) {
   s.FillRandom(12.0f);
   auto source = builder_.ConstantFromArray(s);
 
-  auto select_and_scatter = builder_.SelectAndScatter(
-      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
-      add_f32_);
+  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
+                            GetParam().window_strides, GetParam().padding_type,
+                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
 
-  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
-- 
GitLab


From 0f862770b3890a12d783c3fa31f4aaf8b6233a21 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 17:44:48 -0700
Subject: [PATCH 0559/1262] [XLA] Redesign: implement ReduceAll.

PiperOrigin-RevId: 192380688
---
 tensorflow/compiler/xla/client/xla_client/xla_builder.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index b96421128e..a08ad0e30e 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1417,7 +1417,12 @@ XlaOp XlaBuilder::Reduce(
 
 XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                             const XlaComputation& computation) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<int64> all_dimnos(ShapeUtil::Rank(operand_shape));
+    std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
+    return Reduce(operand, init_value, computation, all_dimnos);
+  });
 }
 
 XlaOp XlaBuilder::ReduceWindow(
-- 
GitLab


From 462e799f0c2c3652b0cc712f34cf5142b487bad2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 17:47:13 -0700
Subject: [PATCH 0560/1262] [XLA] Redesign: implement SliceInDim.

PiperOrigin-RevId: 192381080
---
 .../compiler/xla/client/xla_client/xla_builder.cc    | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index a08ad0e30e..c7c303fe9d 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -538,7 +538,17 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand,
 
 XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
                              int64 limit_index, int64 stride, int64 dimno) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
+    std::vector<int64> limits(shape.dimensions().begin(),
+                              shape.dimensions().end());
+    std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
+    starts[dimno] = start_index;
+    limits[dimno] = limit_index;
+    strides[dimno] = stride;
+    return Slice(operand, starts, limits, strides);
+  });
 }
 
 XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
-- 
GitLab


From 874cee614d2baca210abe06e21f16632f3e4b97d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 17:49:51 -0700
Subject: [PATCH 0561/1262] [XLA] Redesign: implement Conj.

PiperOrigin-RevId: 192381481
---
 tensorflow/compiler/xla/client/xla_client/xla_builder.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index c7c303fe9d..ba76001c78 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1083,7 +1083,9 @@ XlaOp XlaBuilder::Complex(
   return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
 }
 
-XlaOp XlaBuilder::Conj(const XlaOp& operand) { return UnimplementedOp(); }
+XlaOp XlaBuilder::Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
 
 XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
                       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-- 
GitLab


From fe4dd168744b39daca4761d5e6ccf5c93458f023 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 10 Apr 2018 17:57:19 -0700
Subject: [PATCH 0562/1262] Forward the status from LookupResource to
 GetInputTensorFromVariable rather than returning a generic error status

PiperOrigin-RevId: 192382499
---
 tensorflow/core/kernels/training_op_helpers.h | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index 857daae177..7e56e15450 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -78,24 +78,21 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
                                   bool lock_held, bool sparse, Tensor* out) {
   if (ctx->input_dtype(input) == DT_RESOURCE) {
     Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      core::ScopedUnref unref_var(var);
-      if (lock_held) {
+    TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
+    core::ScopedUnref unref_var(var);
+    if (lock_held) {
+      TF_RETURN_IF_ERROR(
+          PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+      *out = *var->tensor();
+    } else {
+      mutex_lock ml(*var->mu());
+      if (!sparse) {
         TF_RETURN_IF_ERROR(
             PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        *out = *var->tensor();
-      } else {
-        mutex_lock ml(*var->mu());
-        if (!sparse) {
-          TF_RETURN_IF_ERROR(
-              PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        }
-        *out = *var->tensor();
       }
-      return Status::OK();
-    } else {
-      return errors::Internal("Invalid variable reference.");
+      *out = *var->tensor();
     }
+    return Status::OK();
   }
   *out = ctx->mutable_input(input, lock_held);
   return Status::OK();
-- 
GitLab


From 69136b4d2204b8e6dfd619bdb9a2a788c3c8b431 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 10 Apr 2018 18:04:20 -0700
Subject: [PATCH 0563/1262] TFTS: De-flake the LSTM test

Disabling the value-based check for now. Hopefully the shapes are deterministic.

PiperOrigin-RevId: 192383553
---
 tensorflow/contrib/timeseries/examples/lstm_test.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index ca56e38ca0..c58e24e6d9 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -36,17 +36,14 @@ class LSTMExampleTest(test.TestCase):
   def test_periodicity_learned(self):
     (observed_times, observed_values,
      all_times, predicted_values) = lstm.train_and_predict(
-         training_steps=100, estimator_config=_SeedRunConfig(),
+         training_steps=2, estimator_config=_SeedRunConfig(),
          export_directory=self.get_temp_dir())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
     self.assertAllEqual([200, 5], predicted_values.shape)
-    self.assertGreater(
-        predicted_values[100, 4]
-        - predicted_values[115, 4],  # Amplitude of fifth component
-        0.2)
-
+    # TODO(allenl): Make the model deterministic so you can check something
+    # substantive.
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 695340d72acb786805837df0040332b81aafcaa9 Mon Sep 17 00:00:00 2001
From: Maciej <mbajkowski@hotmail.com>
Date: Tue, 10 Apr 2018 20:37:03 -0500
Subject: [PATCH 0564/1262] typo and readability fixes in CPU section (#18370)

Fixed a typo in the Tuning MKL section, and modified punctuation for intra_op_parallelism_threads section for easier readability.
---
 tensorflow/docs_src/performance/performance_guide.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 580a899ac4..b1796cf9b2 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -475,7 +475,7 @@ optimizations.
 ### TensorFlow with Intel® MKL DNN
 
 Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon
-Phi™ though the use of Intel® Math Kernel Library for Deep Neural Networks
+Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks
 (Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups
 for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel
 published paper
@@ -581,9 +581,9 @@ Each variable that impacts performance is discussed below.
     for optimal settings.
 
 *   **intra_op_parallelism_threads**: Setting this equal to the number of
-    physical cores is recommended. Setting the value to 0, which is the default
-    and will result in the value being set to the number of logical cores, is an
-    option to try for some architectures.  This value and `OMP_NUM_THREADS`
+    physical cores is recommended. Setting the value to 0, which is the default,
+    results in the value being set to the number of logical cores - this is an
+    alternate option to try for some architectures.  This value and `OMP_NUM_THREADS`
     should be equal.
 
 *   **inter_op_parallelism_threads**: Setting this equal to the number of
-- 
GitLab


From 0899c019e404c0df17af70e50be95e1de1698b64 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 10 Apr 2018 18:37:35 -0700
Subject: [PATCH 0565/1262] Fix code block rendering issue in adding_an_op.md
 (#18368)

* Fix code block rendering issue in adding_an_op.md

In adding_an_op.md, html code was used in markdown for code blocks.
However, this does not work very well as some of the code blocks
includes incorrect rendering.

This fix converts html into "```c++" (backticks) so that the rendering
could be fixed.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix additional html code

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix lang-cpp issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Further clean up

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/extend/adding_an_op.md | 159 +++++++++++----------
 1 file changed, 84 insertions(+), 75 deletions(-)

diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 15075e1df8..84da2165b5 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -530,56 +530,58 @@ form [described below](#attr_types).
 
 For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
 instead of only the 0th element, you can register the op like so:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("preserve\_index: int")</b>
-    .Input("to\_zero: int32")
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("preserve_index: int")
+    .Input("to_zero: int32")
     .Output("zeroed: int32");
-</code></pre>
+```
 
 (Note that the set of [attribute types](#attr_types) is different from the
 @{tf.DType$tensor types} used for inputs and outputs.)
 
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {<b>
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {
     // Get the index of the value to preserve
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;GetAttr("preserve\_index", &preserve\_index\_));
-    // Check that preserve\_index is positive
-    OP\_REQUIRES(context, preserve\_index_ &gt;= 0,
-                errors::InvalidArgument("Need preserve\_index &gt;= 0, got ",
-                                        preserve\_index_));
-  </b>}
-  void Compute(OpKernelContext\* context) override {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("preserve_index", &preserve_index_));
+    // Check that preserve_index is positive
+    OP_REQUIRES(context, preserve_index_ >= 0,
+                errors::InvalidArgument("Need preserve_index >= 0, got ",
+                                        preserve_index_));
+  }
+  void Compute(OpKernelContext* context) override {
     // ...
   }
- <b>private:
-  int preserve\_index\_;</b>
+ private:
+  int preserve_index_;
 };
-</code></pre>
+```
 
 which can then be used in the `Compute` method:
-<pre class="prettyprint"><code class="lang-cpp">
-  void Compute(OpKernelContext\* context) override {
+```c++
+  void Compute(OpKernelContext* context) override {
     // ...
-<br/>
-    <b>// We're using saved attr to validate potentially dynamic input
-    // So we check that preserve\_index is in range
-    OP\_REQUIRES(context, preserve\_index_ &lt; input.dimension(0),
-                errors::InvalidArgument("preserve\_index out of range"));<br/>
-    </b>// Set all the elements of the output tensor to 0
+
+    // We're using saved attr to validate potentially dynamic input
+    // So we check that preserve_index is in range
+    OP_REQUIRES(context, preserve_index_ < input.dimension(0),
+                errors::InvalidArgument("preserve_index out of range"));
+
+    // Set all the elements of the output tensor to 0
     const int N = input.size();
     for (int i = 0; i < N; i++) {
       output\_flat(i) = 0;
-    }<br/>
-    <b>// Preserve the requested input value
-    output\_flat(preserve\_index\_) = input(preserve\_index\_);</b>
+    }
+
+    // Preserve the requested input value
+    output_flat(preserve_index_) = input(preserve_index_);
   }
-</code></pre>
+```
 
 #### Attr types
 
@@ -725,12 +727,12 @@ you would then register an `OpKernel` for each supported type.
 
 For instance, if you'd like the `ZeroOut` op to work on `float`s
 in addition to `int32`s, your op registration might look like:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Your op registration now specifies that the input's type must be `float`, or
 `int32`, and that its output will be the same type, since both have type `T`.
@@ -790,66 +792,73 @@ Your op registration now specifies that the input's type must be `float`, or
 >   """
 > ```
 
-<pre class="prettyprint"><code class="lang-cpp">
-\#include "tensorflow/core/framework/op_kernel.h"<br/>
-class ZeroOut<b>Int32</b>Op : public OpKernel {
+```c++
+#include "tensorflow/core/framework/op_kernel.h"
+
+class ZeroOutInt32Op : public OpKernel {
   // as before
-};<br/>
-class ZeroOut<b>Float</b>Op : public OpKernel {
+};
+
+class ZeroOutFloatOp : public OpKernel {
  public:
-  explicit ZeroOut<b>Float</b>Op(OpKernelConstruction\* context)
-      : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutFloatOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat&lt;<b>float</b>&gt;();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<float>();
+
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat&lt;<b>float</b>&gt;();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<float>();
+
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/><b>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    <b>.TypeConstraint&lt;int32&gt;("T"),</b>
-    ZeroOutOp<b>Int32</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOpInt32);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
     ZeroOutFloatOp);
-</b></code></pre>
+```
 
 > To preserve [backwards compatibility](#backwards-compatibility), you should
 > specify a [default value](#default-values-constraints) when adding an attr to
 > an existing op:
 >
-> <pre class="prettyprint"><code class="lang-cpp">
-> REGISTER\_OP("ZeroOut")
->   <b>.Attr("T: {float, int32} = DT_INT32")</b>
->   .Input("to\_zero: T")
+> ```c++
+> REGISTER_OP("ZeroOut")
+>   .Attr("T: {float, int32} = DT_INT32")
+>   .Input("to_zero: T")
 >   .Output("zeroed: T")
-> </code></pre>
+> ```
 
 Let's say you wanted to add more types, say `double`:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, <b>double,</b> int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, double, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
-- 
GitLab


From 8a5a41f72a8f48d2bb337aca018bf1216b17a07b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 10 Apr 2018 18:38:06 -0700
Subject: [PATCH 0566/1262] Fix incorrect math equation renderings in
 random_fourier_features.py (#18367)

* Fix incorrect math equation renderings in random_fourier_features.py

This fix fixes incorrect math equation renderings for markdown
in random_fourier_features.py. The issue is that
"```" backtick should not be added when mathjax quote is used ("\\(" or "$$").

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Additional fix.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* MathJax fixes

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint errors

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/mappers/random_fourier_features.py | 42 +++++++++----------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 091f0a1098..9a721a9d44 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
   r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-  ```
   $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
-  ```
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `\\(Omega \in R^{d x D}\\)` and a bias vector
-  `\\(b \in R^D\\)` where `d` is the input dimension (number of dense input
-  features) and `D` is the output dimension (i.e., dimension of the feature
-  space the input is mapped to). Each entry of `Omega` is sampled i.i.d. from a
-  (scaled) Gaussian distribution and each entry of `b` is sampled independently
-  and uniformly from [0, \\(2 * pi\\)].
-
-  For a single input feature vector x in R^d, its RFFM is defined as:
-  ```
-      $$sqrt(2/D) * cos(x * Omega + b)$$
-  ```
-  where `cos` is the element-wise cosine function and `x, b` are represented as
-  row vectors. The aforementioned paper shows that the linear kernel of
-  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
+  The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector
+  \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input
+  features) and \\(D\\) is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d.
+  from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled
+  independently and uniformly from [0, \\(2 * \pi\\)].
+
+  For a single input feature vector \\(x \in R^d\\), its RFFM is defined as:
+  $$\sqrt(2/D) * cos(x * \Omega + b)$$
+
+  where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are
+  represented as row vectors. The aforementioned paper shows that the linear
+  kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial
+  vectors.
 
   """
 
   def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None):
-    """Constructs a RandomFourierFeatureMapper instance.
+    r"""Constructs a RandomFourierFeatureMapper instance.
 
     Args:
       input_dim: The dimension (number of features) of the tensors to be mapped.
@@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
       stddev: The standard deviation of the Gaussian kernel to be approximated.
         The error of the classifier trained using this approximation is very
         sensitive to this parameter.
-      seed: An integer used to initialize the parameters (`Omega` and `b`) of
-        the mapper. For repeatable sequences across different invocations of the
-        mapper object (for instance, to ensure consistent mapping both at
-        training and eval/inference if these happen in different invocations),
-        set this to the same integer.
+      seed: An integer used to initialize the parameters (\\(\Omega\\) and
+        \\(b\\)) of the mapper. For repeatable sequences across different
+        invocations of the mapper object (for instance, to ensure consistent
+        mapping both at training and eval/inference if these happen in
+        different invocations), set this to the same integer.
       name: name for the mapper object.
     """
     # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly
-- 
GitLab


From fad74785d12ea7463e5d0474522cd7d754699656 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 10 Apr 2018 18:41:37 -0700
Subject: [PATCH 0567/1262] Fix for users who were passing `Dimension` type as
 `units` arg in `Dense`.

PiperOrigin-RevId: 192387984
---
 tensorflow/python/keras/_impl/keras/layers/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 87b997232e..f64174a23f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -836,7 +836,7 @@ class Dense(Layer):
 
     super(Dense, self).__init__(
         activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
-    self.units = units
+    self.units = int(units)
     self.activation = activations.get(activation)
     self.use_bias = use_bias
     self.kernel_initializer = initializers.get(kernel_initializer)
-- 
GitLab


From 5ad9e4588874f30d0d079acc60e07f2eddc0480f Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Tue, 10 Apr 2018 18:44:13 -0700
Subject: [PATCH 0568/1262] Merge changes from github.

PiperOrigin-RevId: 192388250
---
 README.md                                     |    3 +-
 RELEASE.md                                    |   11 +
 configure.py                                  |    1 -
 tensorflow/compiler/xla/tests/build_defs.bzl  |    3 +-
 tensorflow/compiler/xla/tests/slice_test.cc   |   39 +-
 .../notebooks/dev_summit_2018_demo.ipynb      | 1919 +++++++++++++++++
 .../bayesflow/python/ops/monte_carlo_impl.py  |   39 +-
 .../python/training/tpu_cluster_resolver.py   |    2 +-
 .../training/tpu_cluster_resolver_test.py     |    8 +-
 tensorflow/contrib/cmake/python_modules.txt   |    1 +
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |   16 +-
 .../kernel_tests/sequence_dataset_op_test.py  |    6 +
 .../contrib/data/python/ops/resampling.py     |    1 +
 .../distribute/python/cross_tower_ops.py      |    4 +-
 .../distribute/python/cross_tower_utils.py    |    2 +-
 .../python/shared_variable_creator.py         |    2 +-
 .../bijectors/kumaraswamy_bijector_test.py    |    2 +-
 .../distributions/python/ops/estimator.py     |    2 +-
 .../distributions/python/ops/independent.py   |    2 +-
 .../python/ops/onehot_categorical.py          |    4 +-
 .../python/ops/relaxed_bernoulli.py           |    8 +-
 .../python/ops/relaxed_onehot_categorical.py  |    2 +-
 .../python/ops/vector_student_t.py            |    2 +-
 .../python/ops/clustering_ops.py              |   11 +-
 .../python/ops/factorization_ops.py           |   71 +-
 .../factorization/python/ops/gmm_ops.py       |    4 +-
 .../factorization/python/ops/kmeans.py        |    8 +-
 .../contrib/factorization/python/ops/wals.py  |    6 +-
 .../estimator/python/gan_estimator_impl.py    |    4 +-
 .../gan/python/losses/python/losses_impl.py   |   14 +-
 .../python/losses/python/losses_impl_test.py  |   22 +
 tensorflow/contrib/gan/python/train.py        |    4 +
 tensorflow/contrib/gan/python/train_test.py   |   25 +-
 .../contrib/layers/python/layers/layers.py    |   14 +-
 .../python/ops/linear_operator_block_diag.py  |    3 +-
 .../unpartition_embedding_lookup.cc           |    2 +-
 tensorflow/contrib/lite/toco/python/BUILD     |    3 -
 .../contrib/lite/toco/python/toco_wrapper.py  |   13 +-
 tensorflow/contrib/lookup/lookup_ops.py       |    2 +-
 .../kernel_tests/attention_wrapper_test.py    |   36 +
 .../seq2seq/python/ops/attention_wrapper.py   |    3 +-
 .../contrib/tensorrt/convert/convert_nodes.cc |    9 +-
 .../timeseries/python/timeseries/BUILD        |    1 +
 tensorflow/contrib/tpu/tpu_estimator.md       |    2 +-
 .../training/python/training/evaluation.py    |   10 +-
 .../python/training/evaluation_test.py        |   16 +-
 tensorflow/contrib/verbs/rdma.h               |    2 +-
 .../common_runtime/scoped_allocator_mgr.cc    |    2 +-
 .../core/kernels/mkl_input_conversion_op.cc   |   52 +-
 tensorflow/core/kernels/mkl_softmax_op.cc     |    2 +-
 .../core/kernels/reduction_gpu_kernels.cu.h   |   37 +-
 .../core/kernels/segment_reduction_ops.h      |    8 +
 tensorflow/core/ops/dataset_ops.cc            |    7 +-
 tensorflow/core/ops/nn_ops.cc                 |   26 +-
 tensorflow/core/public/version.h              |    2 +-
 .../api_guides/python/contrib.graph_editor.md |   18 +-
 .../docs_src/api_guides/python/io_ops.md      |    4 +-
 tensorflow/docs_src/api_guides/python/nn.md   |   18 +-
 tensorflow/docs_src/get_started/index.md      |   21 +-
 tensorflow/docs_src/get_started/leftnav_files |    5 +-
 .../get_started/premade_estimators.md         |    2 +-
 tensorflow/docs_src/install/install_c.md      |    2 +-
 tensorflow/docs_src/install/install_go.md     |    2 +-
 tensorflow/docs_src/install/install_java.md   |   22 +-
 tensorflow/docs_src/install/install_linux.md  |   51 +-
 tensorflow/docs_src/install/install_mac.md    |   10 +-
 .../docs_src/install/install_sources.md       |   14 +-
 .../docs_src/programmers_guide/using_tpu.md   |   10 +-
 tensorflow/docs_src/tutorials/layers.md       |   54 +-
 tensorflow/java/BUILD                         |    3 +
 tensorflow/java/src/gen/cc/java_defs.h        |   45 +-
 tensorflow/java/src/gen/cc/source_writer.cc   |  305 ++-
 tensorflow/java/src/gen/cc/source_writer.h    |  192 +-
 .../java/src/gen/cc/source_writer_test.cc     |  369 +++-
 .../java/src/gen/resources/test.java.snippet  |    2 +
 tensorflow/python/client/timeline_test.py     |    5 +-
 .../python/eager/execution_callbacks.py       |    2 +-
 .../python/kernel_tests/init_ops_test.py      |    2 +-
 tensorflow/python/ops/control_flow_ops.py     |    3 +
 tensorflow/python/ops/ctc_ops.py              |    4 +-
 tensorflow/python/ops/custom_gradient.py      |    2 +-
 tensorflow/python/ops/data_flow_ops.py        |   11 +-
 .../python/ops/linalg/linear_operator.py      |    3 +-
 .../ops/linalg/linear_operator_composition.py |    3 +-
 .../python/ops/linalg/linear_operator_diag.py |    3 +-
 .../ops/linalg/linear_operator_full_matrix.py |    3 +-
 .../ops/linalg/linear_operator_identity.py    |    6 +-
 .../linear_operator_lower_triangular.py       |    3 +-
 tensorflow/python/training/distribute.py      |    2 +-
 tensorflow/python/training/session_manager.py |   10 +-
 .../tools/ci_build/install/install_golang.sh  |    2 +-
 .../ci_build/windows/bazel/bazel_test_lib.sh  |    4 +-
 .../tools/pip_package/build_pip_package.sh    |    4 +-
 tensorflow/tools/pip_package/setup.py         |    2 +-
 94 files changed, 3314 insertions(+), 409 deletions(-)
 create mode 100644 tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
 create mode 100644 tensorflow/java/src/gen/resources/test.java.snippet

diff --git a/README.md b/README.md
index a69cf1ffea..29418dc2e9 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
-between them.  This flexible architecture lets you deploy computation to one
+between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
 code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
 
@@ -86,6 +86,7 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 * [TensorFlow Website](https://www.tensorflow.org)
 * [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
+* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
 * [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
diff --git a/RELEASE.md b/RELEASE.md
index c63d9f20c9..e845953174 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,8 @@
 * Distributed Mutex / CriticalSection added to `tf.contrib.framework.CriticalSection`.
 * Better text processing with `tf.regex_replace`.
 * Easy, efficient sequence input with `tf.contrib.data.bucket_by_sequence_length`
+* Initial support for `tf.contrib.tensorrt` that enables native TensorRT in
+  TensorFlow.
 
 ## Bug Fixes and Other Changes
 * Accelerated Linear Algebra (XLA):
@@ -50,6 +52,15 @@
   * Support `float16` `dtype` in `tf.linalg.*`.
   * Add `tf.estimator.export.TensorServingInputReceiver` that allows `tf.estimator.Estimator.export_savedmodel` to pass raw tensors to model functions.
 
+## Deprecations
+
+* TensorFlow 1.7 may be the last time we support Cuda versions below 8.0.
+  Starting with TensorFlow 1.8 release, 8.0 will be the minimum supported
+  version.
+* TensorFlow 1.7 may be the last time we support cuDNN versions below 6.0.
+  Starting with TensorFlow 1.8 release, 6.0 will be the minimum supported
+  version.
+
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
diff --git a/configure.py b/configure.py
index da3f97ab30..81d5ad77ee 100644
--- a/configure.py
+++ b/configure.py
@@ -505,7 +505,6 @@ def set_cc_opt_flags(environ_cp):
   write_to_bazelrc('build --copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
   write_to_bazelrc('build --host_copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
 
-
 def set_tf_cuda_clang(environ_cp):
   """set TF_CUDA_CLANG action_env.
 
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 610302ac12..eac2eb286c 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -137,7 +137,8 @@ def xla_test(name,
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
       this_backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
-      backend_deps = plugins[backend]["deps"]
+      backend_deps = []
+      backend_deps += plugins[backend]["deps"]
       this_backend_copts += plugins[backend]["copts"]
       this_backend_tags += plugins[backend]["tags"]
       this_backend_args += plugins[backend]["args"]
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 8d9a9c7b73..52195db2aa 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -214,6 +214,9 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+// A version of SliceR1Test used to label and disable 'large' tests
+class SliceR1LargeTest : public SliceR1Test {};
+
 string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
   const R1Spec& spec = data.param;
   return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
@@ -233,8 +236,21 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
+XLA_TEST_P(SliceR1LargeTest, DoIt_F32) { Run<float>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_F64) { Run<double>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U32) { Run<uint32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S32) { Run<int32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U64) { Run<uint64>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
+
 XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run<bool>(GetParam()); }
 
+
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
@@ -242,12 +258,6 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestInstantiation,
     SliceR1Test,
     ::testing::Values(
-// TODO(b/69425338): This uses too much memory on GPU.
-#ifndef XLA_TEST_BACKEND_GPU
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
-#endif
         R1Spec{10, 0, 0, 1},
         R1Spec{10, 7, 7, 1},
         R1Spec{10, 0, 5, 1},
@@ -283,6 +293,23 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestDataToString
 );
 
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestBigSlicesInstantiation,
+    SliceR1LargeTest,
+    ::testing::Values(
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
+    ),
+    SliceR1TestDataToString
+);
+#endif
+
 INSTANTIATE_TEST_CASE_P(
     SliceStridedR1TestInstantiation,
     SliceR1Test,
diff --git a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
new file mode 100644
index 0000000000..d62390494b
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
@@ -0,0 +1,1919 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Dev Summit 2018 - Autograph",
+      "version": "0.3.2",
+      "views": {},
+      "default_view": {},
+      "provenance": [
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python2",
+      "display_name": "Python 2"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "id": "g7nGs4mzVUHP",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Experimental: TF Autograph\n",
+        "**TensorFlow Dev Summit, 2018.**\n",
+        "\n",
+        "This interactive notebook demonstrates **autograph**, an experimental source-code transformation library to automatically convert TF.Eager and Python code to TensorFlow graphs.\n",
+        "\n",
+        "**Note: this is pre-alpha software!** The notebook works best with Python 2, for now.\n",
+        "\n",
+        "> ![alt text](https://lh3.googleusercontent.com/QOvy0clmg7siaVKzwmSPAjicWWNQ0OeyaB16plDjSJMf35WD3vLjF6mz4CGrhSHw60HnlZPJjkyDCBzw5XOI0oBGSewyYw=s688)\n",
+        "\n",
+        "### Table of Contents\n",
+        "1. _Write Eager code that is fast and scalable._\n",
+        "2. _Case study: complex control flow._\n",
+        "3. _Case study: training MNIST with Keras._\n",
+        "4. _Case study: building an RNN._"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "uFcgBENZqkB2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# Install TensorFlow; note that Colab notebooks run remotely, on virtual\n",
+        "# instances provided by Google.\n",
+        "!pip install -U -q tf-nightly"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Pa2qpEmoVOGe",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "ZVKfj5ttVkqz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 1. Write Eager code that is fast and scalable\n",
+        "\n",
+        "TF.Eager gives you more flexibility while coding, but at the cost of losing the benefits of TensorFlow graphs. For example, Eager does not currently support distributed training, exporting models, and a variety of memory and computation optimizations.\n",
+        "\n",
+        "Autograph gives you the best of both worlds: write your code in an Eager style, and we will automatically transform it into the equivalent TF graph code. The graph code can be executed eagerly (as a single op), included as part of a larger graph, or exported."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "snaZRFdWd9ym",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For example, autograph can convert a function like this:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9__n8cSIeDnD",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def g(x):\n",
+        "  if x > 0:\n",
+        "    x = x * x\n",
+        "  else:\n",
+        "    x = 0\n",
+        "  return x"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "gq0eQcuReHET",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "... into a TF graph-building function:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sELSn599ePUF",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 413
+        },
+        "outputId": "bb0c7216-1ca3-4da1-d1fb-589902cdcd1a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737505,
+          "user_tz": 240,
+          "elapsed": 243,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "print(autograph.to_code(g))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "from tensorflow.contrib.autograph.impl import api as autograph_api\n",
+            "from tensorflow.contrib.autograph import utils as autograph_utils\n",
+            "\n",
+            "def tf__g(x):\n",
+            "  with tf.name_scope('g'):\n",
+            "\n",
+            "    def if_true():\n",
+            "      with tf.name_scope('if_true'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = x_1 * x_1\n",
+            "        return x_1,\n",
+            "\n",
+            "    def if_false():\n",
+            "      with tf.name_scope('if_false'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = 0\n",
+            "        return x_1,\n",
+            "    x = autograph_utils.run_cond(tf.greater(x, 0), if_true, if_false)\n",
+            "    return x\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "j74n-8hEe6dk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can then use the converted function as you would any regular TF op -- you can pass `Tensor` arguments and it will return `Tensor`s:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AkVaY0-dfEbH",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "4ffe3757-c44d-424c-c2a8-7ddc973bfcce",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737841,
+          "user_tz": 240,
+          "elapsed": 257,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tf_g = autograph.to_graph(g)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "\n",
+        "  g_ops = tf_g(tf.constant(9))\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    tf_g_result = sess.run(g_ops)\n",
+        "\n",
+        "  print('g(9) = %s' % g(9))\n",
+        "  print('tf_g(9) = %s' % tf_g_result)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "g(9) = 81\n",
+            "tf_g(9) = 81\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "trrHQBM1VnD0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 2. Case study: complex control flow\n",
+        "\n",
+        "Autograph can convert a large chunk of the Python language into graph-equivalent code, and we're adding new supported language features all the time. In this section, we'll give you a taste of some of the functionality in autograph.\n",
+        "Autograph will automatically convert most Python control flow statements into their correct graph equivalent.\n",
+        "  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "u0YG3DPgZxoW",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We support common statements like `while`, `for`, `if`, `break`, `return` and more. You can even nest them as much as you like. Imagine trying to write the graph version of this code by hand:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xJYDzOcrZ8pI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "6c244ee4-b141-4ad6-eefa-cfffa71f33c6",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738402,
+          "user_tz": 240,
+          "elapsed": 483,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def sum_even(numbers):\n",
+        "  s = 0\n",
+        "  for n in numbers:\n",
+        "    if n % 2 > 0:\n",
+        "      continue\n",
+        "    s += n\n",
+        "  return s\n",
+        "\n",
+        "\n",
+        "tf_sum_even = autograph.to_graph(sum_even)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    result = sess.run(tf_sum_even(tf.constant([10, 12, 15, 20])))\n",
+        "\n",
+        "  print('Sum of even numbers: %s' % result)\n",
+        "  \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(sum_even))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Sum of even numbers: 42\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "_YXo4KOcbKrn",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Try replacing the `continue` in the above code with `break` -- Autograph supports that as well!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xHmC0rBIavW_",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The Python code above is much more readable than the matching graph code. Autograph takes care of tediously converting every piece of Python code into the matching TensorFlow graph version for you, so that you can quickly write maintainable code, but still benefit from the optimizations and deployment benefits of graphs."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UEHWGpBXbS7g",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's try some other useful Python constructs, like `print` and `assert`. We automatically convert Python `assert` statements into the equivalent `tf.Assert` code.  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qUU57xlEbauI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "add3db4a-2077-4dd5-f7a7-a5b5a4529c26",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738697,
+          "user_tz": 240,
+          "elapsed": 253,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(x):\n",
+        "  assert x != 0, 'Do not pass zero!'\n",
+        "  return x * x\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    try:\n",
+        "      print(sess.run(tf_f(tf.constant(0))))\n",
+        "    except tf.errors.InvalidArgumentError as e:\n",
+        "      print('Got error message: %s' % e.message)\n",
+        "      \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Got error message: assertion failed: [Do not pass zero!]\n",
+            "\t [[Node: f/Assert/Assert = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "w5hBZaVJbck4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can also use `print` functions in-graph:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "6NdzRKLEboRv",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "fb82dfc3-790f-4127-87f6-361805be9e9b",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345739013,
+          "user_tz": 240,
+          "elapsed": 247,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_sign(n):\n",
+        "  if n >= 0:\n",
+        "    print(n, 'is positive!')\n",
+        "  else:\n",
+        "    print(n, 'is negative!')\n",
+        "  return n\n",
+        "\n",
+        "\n",
+        "tf_print_sign = autograph.to_graph(print_sign)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf_print_sign(tf.constant(1)))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_sign))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1 is positive!\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9u_Z3i3AivLA",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We can convert lists to TensorArray, so appending to lists also works, with a few modifications:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "MjhCQJVuiTNR",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "dc320b87-595b-4392-d29c-994486fd8a0a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345744470,
+          "user_tz": 240,
+          "elapsed": 5391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(n):\n",
+        "  numbers = []\n",
+        "  # We ask you to tell us about the element dtype.\n",
+        "  autograph.utils.set_element_type(numbers, tf.int32)\n",
+        "  for i in range(n):\n",
+        "    numbers.append(i)\n",
+        "  return numbers.stack() # Stack the list so that it can be used as a Tensor\n",
+        "\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    print(sess.run(tf_f(tf.constant(5))))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[0 1 2 3 4]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UdG8ZFrkTAF2",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "And all of these functionalities, and more, can be composed into more complicated code:\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DVs6wt8NKaGQ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "cellView": "code",
+        "outputId": "0a4b8d08-8f65-4bbc-85ba-dc4c60563519",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345745186,
+          "user_tz": 240,
+          "elapsed": 658,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_primes(n):\n",
+        "  \"\"\"Returns all the prime numbers less than n.\"\"\"\n",
+        "  assert n > 0\n",
+        "  \n",
+        "  primes = []\n",
+        "  autograph.utils.set_element_type(primes, tf.int32)\n",
+        "  for i in range(2, n):\n",
+        "    is_prime = True\n",
+        "    for k in range(2, i):\n",
+        "      if i % k == 0:\n",
+        "        is_prime = False\n",
+        "        break\n",
+        "    if not is_prime:\n",
+        "      continue\n",
+        "    primes.append(i)\n",
+        "  all_primes = primes.stack()\n",
+        "\n",
+        "  print('The prime numbers less than', n, 'are:')\n",
+        "  print(all_primes)\n",
+        "  return tf.no_op()\n",
+        "\n",
+        "    \n",
+        "tf_print_primes = autograph.to_graph(print_primes)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    n = tf.constant(50)\n",
+        "    sess.run(tf_print_primes(n))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_primes))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "The prime numbers less than 50 are:\n",
+            "[ 2  3  5  7 11 13 17 19 23 29 31 37 41 43 47]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "JQ8kQT99VqDk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 3. Case study: training MNIST with Keras\n",
+        "\n",
+        "As we've seen, writing control flow in Autograph is easy. So running a training loop in graph should be easy as well!\n",
+        "\n",
+        "Here, we show an example of such a training loop for a simple Keras model that trains on MNIST."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0CrtGWgwuLJr",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import gzip\n",
+        "import shutil\n",
+        "\n",
+        "from six.moves import urllib\n",
+        "\n",
+        "\n",
+        "def download(directory, filename):\n",
+        "  filepath = os.path.join(directory, filename)\n",
+        "  if tf.gfile.Exists(filepath):\n",
+        "    return filepath\n",
+        "  if not tf.gfile.Exists(directory):\n",
+        "    tf.gfile.MakeDirs(directory)\n",
+        "  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'\n",
+        "  zipped_filepath = filepath + '.gz'\n",
+        "  print('Downloading %s to %s' % (url, zipped_filepath))\n",
+        "  urllib.request.urlretrieve(url, zipped_filepath)\n",
+        "  with gzip.open(zipped_filepath, 'rb') as f_in, open(filepath, 'wb') as f_out:\n",
+        "    shutil.copyfileobj(f_in, f_out)\n",
+        "  os.remove(zipped_filepath)\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def dataset(directory, images_file, labels_file):\n",
+        "  images_file = download(directory, images_file)\n",
+        "  labels_file = download(directory, labels_file)\n",
+        "\n",
+        "  def decode_image(image):\n",
+        "    # Normalize from [0, 255] to [0.0, 1.0]\n",
+        "    image = tf.decode_raw(image, tf.uint8)\n",
+        "    image = tf.cast(image, tf.float32)\n",
+        "    image = tf.reshape(image, [784])\n",
+        "    return image / 255.0\n",
+        "\n",
+        "  def decode_label(label):\n",
+        "    label = tf.decode_raw(label, tf.uint8)\n",
+        "    label = tf.reshape(label, [])\n",
+        "    return tf.to_int32(label)\n",
+        "\n",
+        "  images = tf.data.FixedLengthRecordDataset(\n",
+        "      images_file, 28 * 28, header_bytes=16).map(decode_image)\n",
+        "  labels = tf.data.FixedLengthRecordDataset(\n",
+        "      labels_file, 1, header_bytes=8).map(decode_label)\n",
+        "  return tf.data.Dataset.zip((images, labels))\n",
+        "\n",
+        "\n",
+        "def mnist_train(directory):\n",
+        "  return dataset(directory, 'train-images-idx3-ubyte',\n",
+        "                 'train-labels-idx1-ubyte')\n",
+        "\n",
+        "def mnist_test(directory):\n",
+        "  return dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zu1U9Nqir6L",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "First, we'll define a small three-layer neural network using the Keras API"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "x_MU13boiok2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def mlp_model(input_shape):\n",
+        "  model = tf.keras.Sequential([\n",
+        "      tf.keras.layers.Dense(100, activation='relu', input_shape=input_shape),\n",
+        "      tf.keras.layers.Dense(100, activation='relu'),\n",
+        "      tf.keras.layers.Dense(10, activation='softmax')])\n",
+        "  model.build()\n",
+        "  return model"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Wuqg3H8mi0Xj",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's connect the model definition (here abbreviated as `m`) to a loss function, so that we can train our model."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "W51sfbONiz_5",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def predict(m, x, y):\n",
+        "  y_p = m(x)\n",
+        "  losses = tf.keras.losses.categorical_crossentropy(y, y_p)\n",
+        "  l = tf.reduce_mean(losses)\n",
+        "  accuracies = tf.keras.metrics.categorical_accuracy(y, y_p)\n",
+        "  accuracy = tf.reduce_mean(accuracies)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "035tNWQki9tr",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Now the final piece of the problem specification (before loading data, and clicking everything together) is backpropagating the loss through the model, and optimizing the weights using the gradient."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "CsAD0ajbi9iZ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def fit(m, x, y, opt):\n",
+        "  l, accuracy = predict(m, x, y)\n",
+        "  opt.minimize(l)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "PcVRIacKjSwb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "These are some utility functions to download data and generate batches for training"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "RVw57HdTjPzi",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def setup_mnist_data(is_training, hp, batch_size):\n",
+        "  if is_training:\n",
+        "    ds = mnist_train('/tmp/autograph_mnist_data')\n",
+        "    ds = ds.shuffle(batch_size * 10)\n",
+        "  else:\n",
+        "    ds = mnist_test('/tmp/autograph_mnist_data')\n",
+        "  ds = ds.repeat()\n",
+        "  ds = ds.batch(batch_size)\n",
+        "  return ds\n",
+        "\n",
+        "def get_next_batch(ds):\n",
+        "  itr = ds.make_one_shot_iterator()\n",
+        "  image, label = itr.get_next()\n",
+        "  x = tf.to_float(tf.reshape(image, (-1, 28 * 28)))\n",
+        "  y = tf.one_hot(tf.squeeze(label), 10)\n",
+        "  return x, y"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zEJH5XNjgFz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "This function specifies the main training loop. We instantiate the model (using the code above), instantiate an optimizer (here we'll use SGD with momentum, nothing too fancy), and we'll instantiate some lists to keep track of training and test loss and accuracy over time.\n",
+        "\n",
+        "In the loop inside this function, we'll grab a batch of data, apply an update to the weights of our model to improve its performance, and then record its current training loss and accuracy. Every so often, we'll log some information about training as well."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UUI0566FjZPx",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(train_ds, test_ds, hp):\n",
+        "  m = mlp_model((28 * 28,))\n",
+        "  opt = tf.train.MomentumOptimizer(hp.learning_rate, 0.9)\n",
+        "  train_losses = []\n",
+        "  train_losses = autograph.utils.set_element_type(train_losses, tf.float32)\n",
+        "  test_losses = []\n",
+        "  test_losses = autograph.utils.set_element_type(test_losses, tf.float32)\n",
+        "  train_accuracies = []\n",
+        "  train_accuracies = autograph.utils.set_element_type(train_accuracies,\n",
+        "                                                      tf.float32)\n",
+        "  test_accuracies = []\n",
+        "  test_accuracies = autograph.utils.set_element_type(test_accuracies,\n",
+        "                                                     tf.float32)\n",
+        "  i = tf.constant(0)\n",
+        "  while i < hp.max_steps:\n",
+        "    train_x, train_y = get_next_batch(train_ds)\n",
+        "    test_x, test_y = get_next_batch(test_ds)\n",
+        "    step_train_loss, step_train_accuracy = fit(m, train_x, train_y, opt)\n",
+        "    step_test_loss, step_test_accuracy = predict(m, test_x, test_y)\n",
+        "    if i % (hp.max_steps // 10) == 0:\n",
+        "      print('Step', i, 'train loss:', step_train_loss, 'test loss:',\n",
+        "            step_test_loss, 'train accuracy:', step_train_accuracy,\n",
+        "            'test accuracy:', step_test_accuracy)\n",
+        "    train_losses.append(step_train_loss)\n",
+        "    test_losses.append(step_test_loss)\n",
+        "    train_accuracies.append(step_train_accuracy)\n",
+        "    test_accuracies.append(step_test_accuracy)\n",
+        "    i += 1\n",
+        "  return (train_losses.stack(), test_losses.stack(),  train_accuracies.stack(),\n",
+        "          test_accuracies.stack())"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "cYiUQ1ppkHzk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Everything is ready to go, let's train the model and plot its performance!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "K1m8TwOKjdNd",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 988
+        },
+        "outputId": "f9d3eef3-5bea-45c1-ddf9-4edee73e4436",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345800262,
+          "user_tz": 240,
+          "elapsed": 52391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "with tf.Graph().as_default():\n",
+        "  hp = tf.contrib.training.HParams(\n",
+        "      learning_rate=0.05,\n",
+        "      max_steps=500,\n",
+        "  )\n",
+        "  train_ds = setup_mnist_data(True, hp, 50)\n",
+        "  test_ds = setup_mnist_data(False, hp, 1000)\n",
+        "  tf_train = autograph.to_graph(train)\n",
+        "  (train_losses, test_losses, train_accuracies,\n",
+        "   test_accuracies) = tf_train(train_ds, test_ds, hp)\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    (train_losses, test_losses, train_accuracies,\n",
+        "     test_accuracies) = sess.run([train_losses, test_losses, train_accuracies,\n",
+        "                                  test_accuracies])\n",
+        "    plt.title('MNIST train/test losses')\n",
+        "    plt.plot(train_losses, label='train loss')\n",
+        "    plt.plot(test_losses, label='test loss')\n",
+        "    plt.legend()\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Loss')\n",
+        "    plt.show()\n",
+        "    plt.title('MNIST train/test accuracies')\n",
+        "    plt.plot(train_accuracies, label='train accuracy')\n",
+        "    plt.plot(test_accuracies, label='test accuracy')\n",
+        "    plt.legend(loc='lower right')\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Accuracy')\n",
+        "    plt.show()"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/train-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/train-labels-idx1-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/t10k-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/t10k-labels-idx1-ubyte.gz\n",
+            "Step 0 train loss: 2.244329 test loss: 2.2499208 train accuracy: 0.12 test accuracy: 0.161\n",
+            "Step 50 train loss: 0.64771986 test loss: 0.56013924 train accuracy: 0.82 test accuracy: 0.836\n",
+            "Step 100 train loss: 0.49011207 test loss: 0.42143965 train accuracy: 0.84 test accuracy: 0.879\n",
+            "Step 150 train loss: 0.3768609 test loss: 0.39319593 train accuracy: 0.88 test accuracy: 0.883\n",
+            "Step 200 train loss: 0.36007702 test loss: 0.37089333 train accuracy: 0.9 test accuracy: 0.881\n",
+            "Step 250 train loss: 0.182115 test loss: 0.28543878 train accuracy: 0.94 test accuracy: 0.915\n",
+            "Step 300 train loss: 0.2119576 test loss: 0.22305593 train accuracy: 0.92 test accuracy: 0.93\n",
+            "Step 350 train loss: 0.12932214 test loss: 0.29057172 train accuracy: 0.96 test accuracy: 0.906\n",
+            "Step 400 train loss: 0.22937602 test loss: 0.2200287 train accuracy: 0.92 test accuracy: 0.925\n",
+            "Step 450 train loss: 0.23444137 test loss: 0.19857481 train accuracy: 0.94 test accuracy: 0.94\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3XmAFNW9Pvynlt5mYdhmQMHggnGN\nS9zCD0ElKug1edUY9ZoQTYze3GuiRk1uYjRqRHNj4n5NrhKjiUYlbihGQFRUFDSoKIvgICAO6+xL\n711V5/2jlq7qZaZnpnumZ3g+/zjTXV1dXSP91PecU+dIQggBIiIiGjLkwT4AIiIi6h2GNxER0RDD\n8CYiIhpiGN5ERERDDMObiIhoiGF4ExERDTEMb6JeOOigg3DllVdmPf6rX/0KBx10kGe766+/3rPN\ne++9h9mzZwMAtm3bhkMPPdR57osvvsCPfvQjzJw5EzNnzsTZZ5+NV199FQBw0003YdasWZg1axYO\nO+wwnHLKKc7v4XDY8x7JZBLz58/v9edavXo1Lr300oK2XbBgAebMmdPn97J19/rZs2fjhRde6PO+\niYY7hjdRL3366aee0Ewmk1izZk3WditXrsQnn3xS0D6vu+46TJs2DYsXL8bixYtxyy234LrrrsPO\nnTtxyy23YNGiRVi0aBHGjRuH3//+987vVVVVnv188sknfQrUI444Ag8//HBB2y5fvhxTpkzp83vZ\n+vt6oj0Zw5uol0444QQsWbLE+f3tt9/GV77ylaztrrnmGtx+++0F7bO+vh5HHnmk8/uRRx6JxYsX\nY/z48QUfV3NzM3784x/jo48+wkUXXQTAbAF48MEHMXPmTOi6jlWrVuHcc8/FrFmzcOaZZ2L58uUA\nzFaB0047DQBw//334ze/+Q2uuOIKfP3rX8d5552HxsZG533ee+89HHzwwVnv9cEHH+Bb3/oWTjvt\nNJx//vloaGgAAOzevRsXX3wxzjzzTJx66qm4++67cx5rPu+99x7OOecczJo1C9/+9redC6Vc++3u\ncSEE/vd//xczZ87EKaecgjlz5kDXdQDAwoULcdZZZ+GMM87AN77xDbz33nsFn3eiwcDwJuqlM844\nAy+99JLz+z//+U/MmjUr53ZCCCxatKjHfU6fPh1XXnkl/va3v2HTpk0AgHHjxkGSpIKPa+zYsbjm\nmmtw1FFH4YknnnAeF0Jg8eLFUBQFv/71r3HppZdi0aJFuPzyy3HTTTfl3NeiRYtw/fXX49VXX8WY\nMWPw7LPPAgA2bdqE2tpaTJgwwfNe4XAY//mf/4lrrrkGS5Yswfe+9z1cddVVAIBHH30Uxx13HF5+\n+WUsWLAADQ0NMAwj57FmikQiuOqqq3DDDTdg0aJF+OEPf4jrrrsOhmHk3G9jY2Pex1944QUsWrQI\nzzzzDJYsWYKGhgY8+eSTAIBbbrkFDz74IBYuXIibbroJr7/+esHnnWgwMLyJeun444/Hxo0b0dLS\nglgshlWrVmHKlCk5t73++uvxhz/8AYlEott9/v73v8d3vvMdLFiwAGeddRZmzJjhBEt/nXzyyc7P\n8+fPxxlnnAEAOOaYY5zqONOxxx6LCRMmQJIkHHLIIdi5cycAYMWKFTk/6wcffIBx48Zh6tSpAICz\nzjoLX3zxBXbs2IExY8bg7bffxvvvvw+/34+77roLdXV1BR376tWrMX78eBxzzDEAgJkzZ6KtrQ3b\nt2/Pu998jy9duhTf+ta3UF1dDVVV8e1vfxuvvPIKAGDMmDF46qmnsH37dhx77LH45S9/WdjJJRok\n6mAfANFQoygKTj/9dCxcuBCjR4/GiSeeCFXN/U/psMMOw3HHHYdHHnkERx99dN59BgIBXHrppbj0\n0kvR2dmJRYsW4fbbb8fEiRMxbdq0fh3vyJEjnZ8XLFiAv/3tb4hEIjAMA/mWNqiurnZ+VhTFaV5+\n5513cMkll2Rt39nZiYaGBk8LhN/vR2trKy655BIYhoFbbrkFjY2N+M53voOf/OQnBR17a2srRowY\nkXVsLS0tefeb7/Guri48/PDDmDdvHgBA13WMHj0aAPCnP/0Jf/rTn3Duuedir732wvXXX4/jjz++\noGMkGgwMb6I+OPPMM3H33Xdj1KhRPfbZ/vSnP8W5556LiRMn5ny+tbUV69evd6rWESNG4Pzzz8ey\nZctQX1/f7/C27d69GzfccAOefvppHHLIIfj8888xc+bMgl+vaRrWrFmT8yKkrq4O+++/P5577rmc\nr7388stx+eWXY8uWLbjsssucSronY8aMQXt7u/O7EAIdHR0YM2YMVFXNud+pU6fmfLyurg4zZszA\nd7/73az3+dKXvoTf/va3MAwD8+fPx7XXXotly5YVeGaIBh6bzYn64Oijj0ZjYyM2btzYY4VWV1eH\n73znO7j//vtzPh+Px3HllVd6wmLr1q34+OOPceyxx/bquFRVRTgczllRt7a2oqKiAvvvvz80TXMq\n0EgkUtC+V69ejYMOOgh+vz/rvY488kg0NTXh448/BgA0NDTgZz/7GYQQ+PWvf4133nkHgBmSY8eO\nhSRJ3R6r7YgjjkBzczNWrVoFwBxfMH78eEycODHvfvM9/vWvfx0vvPACYrEYAOCpp57C888/j9bW\nVnz/+99HOByGLMs48sgjezXWgGgwsPIm6gNJknDaaachFotBlnu+Bv7BD36Ap59+Oudze++9N/70\npz/hvvvuw5w5cyCEQFVVFX75y196RqAX4phjjsEf/vAHTJs2DW+++abnuYMPPhjTp0/HzJkzMWbM\nGPziF7/Ahx9+iNmzZ+O///u/e9y3fYtYvve67777cOuttyISicDn8+Gqq66CJEm48MIL8etf/xq3\n3norhBCYMWMGpkyZgh07dnheryhK1ntWVFTgnnvuwa233opoNIrRo0fjrrvu6na/I0eOzPk4AGzc\nuBHnnHMOADPYb7vtNowePRrTpk3Dt771LSiKAp/Ph9tuu61X551ooElcz5uIiGhoYbM5ERHREMPw\nJiIiGmIY3kREREMMw5uIiGiIYXgTERENMUPmVrGmpq6i7m/UqAq0tUWLus89Ec9j//Ec9h/PYXHw\nPPZfsc9hbW11zsf32MpbVbPvKaXe43nsP57D/uM5LA6ex/4bqHO4x4Y3ERHRUMXwJiIiGmIY3kRE\nREMMw5uIiGiIYXgTERENMQxvIiKiIYbhTURENMQwvImIaNh6443XCt723nvvxI4d23vc7sMP38cN\nN/y8P4fVbwxvIiIalnbu3IFXX11c8PZXXXUt9t57QgmPqHiGzPSoREREvXHXXb/D+vXr8Mgjc2EY\nBnbs2I6dO3fgnnv+iN/+9jdoampELBbDD35wOaZOnYYf//hyXHPNz7F06WuIRML44out2L59G668\n8lpMmTI153u89toSzJv3dyiKgoMOOgS33XYL6us34M47fwefzwe/349bbvktdu7cnvVYdXXuqU8L\nsceGd0c4gfc3NOLYg+sG+1CIiIa9f7z+GVZuaCzqPo87uA7nz5ic9/l///fZeO65f+D7378MDz/8\nIDQthT/+8c9oa2vF8cd/DWeccRa2b9+GG2/8BaZOneZ5bWPjbvzhD/fh3XeX44UXns0Z3tFoFA89\n9AAeeeQJVFRU4Oc//yneffddvPzyyzjnnPMwa9a/4YMPVqK1tQUvv7wg6zGGdx9ceecbaO2M46ZL\njsOk8X0/gURENDQccshhAIDq6hFYv34dXnzxOUiSjM7OjqxtjzjiKABAXV0dwuFwzv01NHyBiRO/\nhIqKCgDA0Ucfg/Xr1+PEE0/CH/7wP2ho+AJf//ppmDRp35yP9cceGd5b23YiPOFNSMnD0dwRZ3gT\nEZXY+TMmd1slDwSfzwcAWLJkETo7O/HAA39GZ2cnfvjD2VnbKkp6gREhRM79SZL3OU1LQZJCOPbY\n4/HnP/8Ny5cvw5w5N+PHP74652Nf/eqxff4se2R4f7ztCyjVbTBG70RLZ3ywD4eIiEpAlmXoup71\neHt7O/baa2/Isow333wdqVSqT/vfZ59J2LbtC0SjEVRUVGLVqg9x1VU/xrPPzsOUKSfi9NPPgBAC\n9fUbsGXLpqzHGN69dPykA7G4CZArO9DSwfAmIhqOJk3aD59+ugH33XcnKiurnMdPPnkGfvGLa/DJ\nJ2vxb//2TdTV1eGRR+b2ev+hUAhXXHEVrr32J5AkGUcccRSOPfZY7NzZghtv/AWqqqrg8/lw/fU3\nob7+06zH+kMS+doDykxTU1dR93fjit+ipTOCQyLn4yfnHlHUfe9Jamuri/632dPwHPYfz2Fx8Dz2\nX7HPYW1t7m7dPfY+7y+P2Q+SL4mmcOtgHwoREVGv7LHhPbFmPACgLdk2yEdCRETUO3tseI8JjQIA\nxBFGPKkN8tEQEREVbs8N74rRAADJH+egNSIiGlL22PAeW2FW3pI/xtvFiIhoSNljw3uME96svImI\naGjZY8M75AvCLwcg+eNoZuVNRDQs9WZJUNtHH32ItjbvnUjlsAyo2x4b3gAwMlDDypuIaJjq7ZKg\ntn/+88Ws8C43e+QMa7a6ijFojDWiqSt7UnoiIhra3EuCXnDBRbj99lvQ1dUFXddx9dU/w+TJB+Lx\nxx/Fm28uhSzLmDp1Gg455FAsW/YGtmzZjDlz7sD48eOz9pu5DOjVV1/nLANaWRkCIJdkGVC3PTy8\nxwItQKfWPtiHQkQ0rD332UtY1bimqPs8uu4rOHfyWXmfdy8J+uijf8YJJ/w/fOMbZ2PLls24994/\n4J57/oinnnoc8+cvgqIomD//WRx33NcwefKXcc01P88Z3LmWAf3ww/fx1ltLcc4552H27AuxaNHr\nJVkG1G2PDu/a0FgAQAysvImIhrM1a1ajvb0Nixe/DABIJMzu0pNP/jquvvq/cNpps3D66bN63E+u\nZUDr6zc4S362tOzClCknlWQZULc9OrzrKszwTildMISALEmDfERERMPTuZPP6rZKLjWfT8VPf/oz\nHH64dy2L6677JbZu/Ryvv74EP/nJf+Chh/7a7X5yLQMaCAScJT/XrFlZsmVA3fboAWt25Y1gFNE4\nZ1kjIhpO3EuCHnro4XjrrTcAAFu2bMZTTz2OcDiMRx6Zi0mT9sX3v38ZqqtrEI1G8i4lCniXAQWA\nVas+xEEHHYpnn52Hzs4OfPOb38QFF1yE+voNzmOnn36G81ix7NGV96hgDSQhQw5EEYmnUBXyDfYh\nERFRkbiXBP3hD3+E2267Gf/1Xz+EYRi4+urrUFVVhfb2Nlx22fcQClXg8MOPwIgRNTjqqK/ihhv+\nG7/97Z3Yf/8DPPvMtQzokUcehVgsihtv/AVGjaoBIJdkGVC3PXZJUHvZtp++fgvicQM/O+pa7L/3\niKK+x56ASwj2H89h//EcFgfPY/9xSdABEpCCkNQUIvHUYB8KERFRQfb48A4qIUiqhs4oJ2ohIqKh\nYY8P7wo1BABoj4YH+UiIiIgKs8eHd6XPvFevIxEZ5CMhIiIqzB4f3iMClQCAjjjDm4iIhoY9PrxH\nVZgj+Xa1tw3ykRARERVmjw/v0RXm7WE7OjrQHk4M8tEQERH1bI8P70qfOWBNUpNYvallkI+GiIio\nZwxvn9nnDSWFpvbY4B4MERFRAUo6Peodd9yBDz74AJqm4T/+4z9w+umnO88tX74cd911FxRFwfTp\n03HFFVeU8lDysm8Vk9QUWjvZbE5EROWvZOH97rvvYuPGjZg3bx7a2tpwzjnneMJ7zpw5ePjhhzFu\n3Dh897vfxcyZMzF58uRSHU5eITVo/qBoaOviRC1ERFT+Shbexx13HI44wlx6bcSIEYjFYtB1HYqi\noKGhATU1Ndhrr70AACeddBJWrFgxKOHtV/wAAJ9foK2NlTcREZW/koW3oijOYuXPPPMMpk+fDkVR\nAABNTU0YPXq0s+3o0aPR0NDQ7f5GjaqAqipFPcba2mqM1M3K2+8XaI8kMXZsFSSu690r+SbOp8Lx\nHPYfz2Fx8Dz230Ccw5IvCfrqq6/imWeewV/+8pd+7aetLVqkIzLZK78IISBLMiTFQCKpY+u2NlQG\nuTRoobgKUf/xHPYfz2Fx8Dz237BYVWzZsmX4v//7P8ydOxfV1ekDqKurQ3Nzs/P77t27UVdXV8pD\nyUuSJPhlP2TVXHi9jYPWiIiozJUsvLu6unDHHXfgwQcfxMiRIz3PTZw4EeFwGNu2bYOmaVi6dCmm\nTp1aqkPpkV/xAbIZ3h2R5KAdBxERUSFK1mz+8ssvo62tDVdffbXz2AknnICDDjoIp512Gm6++WZc\ne+21AIAzzzwT++23X6kOpUd+xY9kyhxpHo5xXW8iIipvJQvvCy64ABdccEHe54877jjMmzevVG/f\nKwHFjw6YS4IyvImIqNzt8TOsAYBf9kMXZmhHGN5ERFTmGN4w+7wNGIBksPImIqKyx/BGeqIWyDrC\ncYY3ERGVN4Y3zD5vAGZ4s/ImIqIyx/CG2ecNAKrPYJ83ERGVPYY3rPu8AYRCEitvIiIqewxvpPu8\nQyEgHNMG+WiIiIi6x/BGus87GABiCQ26YQzyEREREeXH8Ea68g5YS3tH46y+iYiofDG8Afhls89b\nVc2KO57UB/NwiIiIusXwRrryllUBwGw6JyIiKlcMbwABJQAAzrKgrLyJiKicMbwBhFQzvCXVrLjj\nSVbeRERUvhjeAIKKNVJNNu/xjiVYeRMRUflieAMIWpW3IZkVd4yVNxERlTGGN4CQGgIAGJJZecdZ\neRMRURljeAMIWgPWdCQBsM+biIjKG8MbgCqrUCQFmhXe7PMmIqJyxvAGIEkSgmoAKWGFNytvIiIq\nYwxvS1AJImkkAABxTtJCRERljOFtCaoBJHQrvDlJCxERlTGGtyWkBpHQk1BkNpsTEVF5Y3hbgkoQ\nAgKBoOCtYkREVNYY3hZ7opZgCIiyz5uIiMoYw9sSVM0pUitCQCSWGuSjISIiyo/hbQlZ85sHQwJJ\nzUAixaZzIiIqTwxvi115B4IGAFbfRERUvhjeFrvP2+c3wzvM8CYiojLF8LbYzeYqw5uIiMocw9ti\nV96yz+zrZngTEVG5YnhbglblLavmbWIMbyIiKlcMb4tdeUNheBMRUXljeFtC1mhzQzJDm+FNRETl\niuFtCTK8iYhoiGB4W+w+b3tNb85vTkRE5YrhbfHJKmRJdtb0TunGIB8RERFRbgxviyRJCClBZ01v\nneFNRERliuHtElQDiGlxKLLEypuIiMoWw9slqAYR1xJQFRmaJgb7cIiIiHJieLsErWZzRQE0Vt5E\nRFSmGN4uITUAAQHVLxjeRERUthjeLj7Fb/5XNRjeRERUthjeLn7ZBwCQVYGUzj5vIiIqTwxvF5+s\nAgAU1YCm9b/ybutK4MEX16G5I9bvfREREdkY3i4+xay8FaU4fd5PvFqP9z7Zjb8u3NDvfREREdkY\n3i4+u9ncZ0ArQrN5PKl7/ktERFQMDG8Xu89bUQwYQsAw2O9NRETlh+HtYjebS4rZZM5Z1oiIqBwx\nvF2c0eayGdq8XYyIiMoRw9vF7vO2K+9i9HsTEREVG8PbxWk2tyvvItwuRkREVGwlDe/6+nqceuqp\nePzxx7OemzFjBi666CLMnj0bs2fPxu7du0t5KAWxK2/I5ujwfjebC1buRERUfGqpdhyNRnHrrbdi\nypQpebeZO3cuKisrS3UIvebPCG8OWCMionJUssrb7/dj7ty5qKurK9VbFF1Ws3mxwlsqzm6IiIiA\nElbeqqpCVbvf/U033YTt27fjmGOOwbXXXgtJGtyUs6dHFZLdbM5mbyIiKj8lC++eXHnllZg2bRpq\nampwxRVXYPHixZg1a1be7UeNqoCqKkU9htraas/vcf9IAIBqLi6Gqqpg1ja94fObp9enKv3aT7kb\nzp9toPAc9h/PYXHwPPbfQJzDQQvvs88+2/l5+vTpqK+v7za829qiRX3/2tpqNDV1eR4Lx1IAgJSW\nBAA0t4TRVBPo83ukkpq1Pz3rvYaLXOeReofnsP94DouD57H/in0O810IDMqtYl1dXbj00kuRTJoh\nuXLlShx44IGDcSge9mhzQ+KANSIiKl8lq7zXrl2L3/3ud9i+fTtUVcXixYsxY8YMTJw4Eaeddhqm\nT5+OCy64AIFAAIceemi3VfdA8St2n7dZMevs8yYiojJUsvA+/PDD8dhjj+V9/uKLL8bFF19cqrfv\nE6fyBitvIiIqX5xhzUWRFEiQYMCsvDnDGhERlSOGt4skSfApPqfy7uk+7x3hXXjsk38grsUH4vCI\niIgADOJo83Lll33QhTVKvIc+7/s+eghdyTDGVdTi9H1PGYjDIyIiYuWdKagEkDQSAAC9m8p7W2MY\nXckwACBpJAfk2IiIiACGd5bairGIGREEv/oqtic3593ulfcbnJ8lzn9KREQDiOGdYXyFORe7pGpY\nrb2af0N3i/ogT+tKRER7FoZ3hnGV6YVUVPjzbifAe8CJiGhwMLwzjK+oTf8iCquoZTabExHRAGJ4\nZxhfOc75OYEINEPLvaGn8GZ4ExHRwGF4Z6j2V+EHX/4h9I4xgCTQGm/r8TXs8iYiooHE8M5h/5pJ\nMLpGAQCaYq05t/GMV8tTebNXnIiISoHhnYOqSBApc7BaLJV7KVLhSmbeKkZERAOJ4Z2DqsiAYU4+\nl8g7AYsnvYmIiAYMwzsHVZEhDAUAkNBzh3chzeZERESlwPDOQVUkQDfDO5knvHuD4U5ERMXE8M5B\nkiQo1pot21o6cm8kvNsTERENFIZ3Hgp8AICV9TuwsyWS9TxHkhMR0WBheOdhhzdkHZ2R7pvO2SxO\nREQDieGdhyqlwzsX4bpXzBD5lw4lIiIqNoZ3HnZ4S0qe6VFd3EFORERUagzvPHyKz5yIRdaR1Lqv\nrA2w8iYiooHD8M5DlRXAUCApOpKp7KZzd7HNZnMiIhpIDO88fKp1r7esI5nqofJmszkREQ0ghnce\n5ixrKiRFQ0LLUXm7f2blTUREA4jhnYeqyAVX3nqe8GZBTkREpcDwzkOWJXN+c1lHIpljxLn7VjEO\nWCMiogHE8M7DMIQ5YE0WSGiprOe9zeY9lNicw4WIiIqI4Z2HYQhAN+c3j2mJ7rdlnzcREQ0ghnce\nuiGcZUFjqXj2Bp5bxdi5TUREA6eg8F67di2WLl0KALj77rtx8cUX4/333y/pgQ023RAQyQAAIKKH\nu92Wfd5ERDSQCgrvOXPmYL/99sP777+PNWvW4MYbb8R9991X6mMbVIYhIBIhAEBMdGU9z1vFiIho\nsBQU3oFAAPvuuy9ee+01nH/++Zg8eTJkeXi3uJuVdzfh7VmYhM3mREQ0cApK4FgshoULF+LVV1/F\niSeeiPb2dnR2dpb62AaVIQREMggASCLXet7pwK7f1pZzxDkXLCEiolIoKLyvueYaLFiwAD/96U9R\nVVWFxx57DJdcckmJD21w6a5m86Scq8873VS+uy2CpvZY9hZ2djPDiYioiNRCNvra176Gww8/HFVV\nVWhubsaUKVPw1a9+tdTHNqgMwwAMFUJToSvRrOfdlTckkQ5q9zZW5c0KnIiIiqmgyvvWW2/FwoUL\n0d7ejgsvvBCPP/44br755hIf2uD60rhqAIBIhKCrkawAzry3O3ezub1taY6RiIj2TAWF9yeffIJv\nf/vbWLhwIc455xzcc8892Lp1a6mPbVBdcsbB+N7Mg+DTqwFZR0cyo49fSieyxMqbiIgGUEHhbYfP\nG2+8gRkzZgAAkslk6Y6qDFQGfTj56AkIiBEAgMZok+d54b63WxI5VyGxA53ZTURExVRQeO+33344\n88wzEYlEcMghh2D+/Pmoqakp9bGVhZAwP+fOsDe8vROziJwBzcqbiIhKoaABa3PmzEF9fT0OOOAA\nAMDkyZNxxx13lPTAykW1MgotALZ37fY8nll557rXO+lrARSJfd5ERFRUBYV3PB7H66+/jnvvvReS\nJOGoo47C5MmTS31sZWGkbzSAXM3mmaPNvQm9qf1ztO31OvwVYyBaTy71YRIR0R6koGbzG2+8EeFw\nGBdeeCHOP/98NDc344Ybbij1sZWFmmAVhACi1uIkH3zaiBfe3gJkNJvruje869s2AQCUmhb2eRMR\nUVEVVHk3Nzfjrrvucn4/5ZRTMHv27JIdVDmpCKpAVIJm6ACAB55fCwA4cLLrukcS6EqGcdt7D+Oc\nyf+GQ8cchNZ4KwBApHzs8yYioqIqeHrUWCw9g1g0GkUi0f0a18NFZVAFhAxN1z2Pp9y/S8DqjlXY\nEdmFBz5+GADQEm8DAIhkiH3eRERUVAVV3hdccAHOOOMMHH744QCAdevW4aqrrirpgZWLiqAPEBI0\n4Q3vpK65fhMQGQndaoe3prLyJiKioioovM877zxMnToV69atgyRJuPHGG/HYY4+V+tjKgll5S9AN\n74xqKS0d3lLGgDUhhFN5QzYY3kREVFQFhTcA7LXXXthrr72c31evXl2SAyo3duWti+6azYUnoBN6\n0pk+VZJ1DlgjIqKi6vOi3HtKNVkZVCGEbC5U4pIy3GEunAFtABDX4+mnFH2POVdERDQw+hzekiQV\n8zjKVkVQBSBBhze8tYzKO2GkB/DFtHR4S7LOAWtERFRU3Tabn3TSSTlDWgiBtra2kh1UOamw+ryF\n8PZdp3QNAdd2yTzhzT5vIiIqtm7D+4knnhio4yhbiixDEjIMpKC5J2KR3TOsGXkrb7DPm4iIiqzb\n8J4wYcJAHUdZkyUJAgZSWrqpXJJdt4pJQMod3qmoazsDBpjeRERUPH3u8y5EfX09Tj31VDz++ONZ\nzy1fvhznnXceLrjgAjzwwAOlPIx+kyADEEhprn5v1R3eAkmRDu+2RIfn9ULSQEREVCwlC+9oNIpb\nb70VU6ZMyfn8nDlzcP/99+PJJ5/EO++8g88++6xUh9JviiRDSAaSrvD2VN4QSBnp9c2d8DbM0ysk\n721mRERE/VGy8Pb7/Zg7dy7q6uqynmtoaEBNTQ322msvyLKMk046CStWrCjVofSbLCkABOJJVwgr\n3klaUsIV3vF28wctCAAQYOVNRETFU7LwVlUVwWAw53NNTU0YPXq08/vo0aPR1NSUc9tyoMgyJFmg\nI5JuGpdUb+Wtwd1sboV3yhwf+2w3AAAgAElEQVSPzsqbiIiKqeAZ1gbbqFEVUFWlqPusra0uaDtV\nMU+TUFzXOlblLTQVkj/puQu8PWk1m1vhDVkv+L2GouH82QYKz2H/8RwWB89j/w3EORyU8K6rq0Nz\nc7Pz++7du3M2r7u1tUW7fb63amur0dTUVdC2spABCdi2M31vu2SHt+5zqvB9qvZGQ3gHuhJh87lU\nABIAQ9IKfq+hpjfnkXLjOew/nsPi4Hnsv2Kfw3wXAiUdbZ7PxIkTEQ6HsW3bNmiahqVLl2Lq1KmD\ncSgFUWTzNHVE0/3akDXz/m3NvP6pwlhMm+AdnCecZnP2eRMRUfGUrPJeu3Ytfve732H79u1QVRWL\nFy/GjBkzMHHiRJx22mm4+eabce211wIAzjzzTOy3336lOpR+UxUF0IHOqGvaU1UDdBWQzHu4fQhA\nldOn0y/7ENet5nb2eRMRURGVLLwPP/zwbpcNPe644zBv3rxSvX1RqbIV3rH0oDQoGoSuOn3fighA\nkdN98j7Fh6iumE0bDG8iIiqiQWk2H2pUK5S7oq7R5opZeUtOePuhSunwViU1fZ+3zGZzIiIqHoZ3\nAXyKGcrhuN3nLZzK2x6spghvs7kqqxCaFeYyK28iIioehncB/NatYl0xK7xlA5IkzD5v2A/5PM3m\nqqxA6NbvisaVxYiIqGgY3gXwWfeX68KqoJ3bxNzh7Tebyi2qpMIwrN9lnUuTEBFR0TC8C1Dh91k/\nmRHszGvuCm9J+Jy+cQBmFW5V3pJVeXdGk7j/2dXY1hgekOMmIqLhieFdAL/PCm/JmkdNsSpwwzXj\nm65mNJur6cpcMdf0/ufyrVi1sRn3Pbt6AI6aiIiGK4Z3ARTJOk2SgCSlK293szkMNavZ3A53STYr\nb3s98GSKA9iIiKjvGN4FUOxbwCSByqAPvoDVg+2qvCXd22yuuprNoegw2OlNRERFMmQWJhlMslV5\nS5JAZciHsGrAACB0BYmNR0EZ2QRZqvbcKqZIKgAZQpedypuIiKgYWHkXIN1sbuDEr4yHL2D1fRsq\njLbxSG35CoRhB7b9GsXZxu7zdkjSwBw4ERENSwzvAshWEP/7qZNx5tcmweczk9i5jxuArouMZnPV\n2UbKvM+bVTgREfUDw7sAduU9fmwIkiRB8dmVtyu8hchoNndV3jL7vImIqHgY3gWQrSVBDWGGtqxa\no8Vdo811XXjmNreb0IWuAIoGwzDSO2SzORER9QPDuwB2Fa1b4S1Z93m7m80NQzgD2wCkg1xXIUlA\nyuDiJEREVBwcbV4AO5S/6NyGz9o3A0rKfMJwVd6GAclVUctOs7n537iWXguceuetj3dgQm0lDti7\nZrAPhYioLDC8C2D3eS/e+joAQLZWGfMMWDPSk7CYr/Fuc+fqe3AELhqQ4x1OYgkNjy7cAAD4yy9m\nDPLREBGVBzabF0B29WUDgJCyJ2nRDYHHXql3frfDW1LNKj2hJyBghntnJIkHnlsDg6POe6TpRs8b\nERHtYRjeBVAk72kSMMy7vQxvn/fGhnbXa8zntN2T0tsgXZl/UN+Enc2REh0xERENZwzvAmSGNwAr\nuNN93PGk7unztkebG51jobWMN3+Gd05znfeP9YhniIgoG8O7ALKsZD+oe4cLRGIpz+8KXK+xKnQD\n3hHnDO+esWeBiCgbw7sAco7KWxgZ/eAwB1c5r5HdK45Z94lL3srbYHj3iOdoePvX+t247I6l2N0a\nHexDIRpSGN4FyN9s7hV2Vd+K69TaQS/YbN5rDO/h7c8vfQLdEFi2eudgHwrRkMLwLkDmaHMATjXt\n5g7j5vZk+glhbtssbYLkT1cYKY6k7hFH5A9v/PMS9Q3DuwC5Ku9RVaFuX7P4vW3pX6yg362uQ/Co\nt5yHUymGd08Y3kRE2RjeBcjV5z1+VBWqQj4AQCjQw1w3OZrYAVbehWCzORFRNoZ3AZQczeaqrDrL\nfFZX+Lp9vcjRxA4AyZSe83FKY3jvGbhWD1HvMLwLIOf4ZnEv/1kdyg7vQyeNxrUXHoWvHTouu/KW\nNQACb3e8jHe2vwcAWPDOFsxd8ElRj3s4YHYTEWVjeBcgksq+jcW9/Gd1hT/r+ZHVfhy272jzOeE9\nzZI/DskfxxfJT/HEp88CAJ5ftgUr1u0q8pEPnLWbW7BibfGPn5U3EVE2hncBJo3YBwDw5VGTncfM\nZnPz5xGV6fAWmlmFj6kYCQCQ5exmcykQg+RPrzJmrxMOwGmKz2f+ss34+LPmPnyK0rrrHx9j7kvF\nbznggDUiomwM7wJU+6vwwIw7cOa+pzqPqa5Z1/yq7Axei6+ZisTGo3DUhP0BABUBNavZXArEIAVi\nzu+NkXQYdxdWndEkXnznc9z7zOr+faAS6unio7fKObxffGeLs+IZEdFAYnj3guIKbFVWPfNu71NX\nBQCoCYzAdbPOwKTx1QCAiqAv655wSUlB8qfD+4GP/+KsEa7p3YR3JJn3uXKR1Io7gr6cm83nL9uC\ntz7eMdiHMaSV8bUZUVljePeCu59blVQ4y2ZIwMGTRgEAamtCzs8AUBlSs/q8IQlP5d2aaIW692YA\ngN7N7WPhaCrvc+Wi2CPoyzm8iYgGC8O7F7Iqbye7JZxxwpfwzan74rJvHOp5TWXQlzUPOmTDCe+v\n7zPd3F9tAyBrWZV3Uk9i4ZZX0Z7oQGe0/CvvRLHD23U6/ufvH6KxPZZ/40HCC4y+4y1iRH3D8O4F\n9/3e7j5vSQJURcbZ0/ZH7UjvzGuVOZrNIRmQ/HEoIoBzDzwLB4a+AknVIPnj0DIq73/Uv4CXtryC\nFzctQke4/MM7WeRZ49x93vUN7Zj32sai7r8YONlO37HZnKhvGN69oHbT551PZUjN7vOWDEi+JFQj\nCAAwdOt5SUBzVXHtiQ6s2LnS+b0jo897xY6VWLBpUS8/RWkVu/IWGVVtOS7mknnBRURUagzvXvBU\n3pKCQtK7MuiDENnN5lBSkI0AAEDT0o+7+7zvWzU3/RJJdgashQLm/h7f8DQWbX0dulE+M7UVu887\nM6zLsYk6VeRBekREPWF490L2aHMzSLrrtzNvFcucpCUBSQIk3QzvlDUOTZIM6Fafd2ckieZYi/Oa\nqBZzKu+qjBndolr59AMnSthsnuv3cqAxvPuNfd9EvcPw7gXPaHO5h8VILLIsZd/n7TMnaJE0c3IX\n3S5WJQOaYQbBtX98C7rQceDIAwAAsVQMHZEEAMCvKp77qbuS4d5/mCJyH0vxR5tn/l4e4a27Dox9\n3n0nCup8IqJMDO9eUFyBrcqq606xHsoG4X1e8pshbM/GJgzreUkgkdTxxqrt0GWzyq5UK+BX/Ihq\nMcQTZjDqhkBcT8/QFklF+vyZisHdtF30Pu/MyrtMwlvT0sfR3b35ROXglZUNWL+1bbAPg4qI4d0L\n7nW9PQPWemzyywhvnxnMwqq8Dd16Xjbw7Fub8bfFn0JSzI5wvxxEhRpCTIs5FZ4hBLqS6cAO55h7\nfSC5w9tdeacMDXd/+Ces2LEy18sKkt1s3uddFZW72uaANSpniZSOp17biN8/uWqwD4WKiOHdC1kD\n1iw9ZXfdqFDOx42kWXk74S0Z2N5kNoFLqtkRHrDCO6rFnYFRhiEQTrnDe5Arb1d4ufu8t3Y24LP2\nLXh8w9N933eZjjZ3BzYHrFE5K5fWKiouhncvSK5RNYprkpae3Pz94zC+60QkPj3G83gqYTbD233e\nkiTgXApY06X65QBCaghxLY6UZm5oCOFpKh/sZnMtT+WtGVquzXsl84unXAasuQepsc+7H6w/Z+bY\nBiLqHsO7j3yyAvf0qN0J+lVMCnwZRkcthKv/OxaVYRgCuqvytkmqGXw+KYAKXxACAklh9pUbRmaz\n+WBX3q4+by0d3rmWUu2tzLDOvO97sLgvWDjavP/K5aJsOOK5HZ4Y3n2UOT1qTxTFOtVGeluR8iMS\nT0HX0n3e6RdYlbcUQIVaYT1vPmYIIJxKjzAPJ7NDMvMfbCKl4911u5zqvZjczebJZPrnLtcxLnrv\niz7tO/N7Rx/gLyJNN7BkZQOice+88u7AZp93/7Fpt3TKpauJiovh3UfmwiSmQu5R9dnh7VqkROgq\nWjsTcPLUGm0OpPu8VRFASA1ab2pW45l93pnN5l/s7sIPf7cUb3603XnsuTc346EFn2D+si0FfT63\ndVta8doH2/I+7xlt7ro4CLtuYVv4Xu/fF8jRbD7AX0TPv7UZT762EX9fUu953N1Uzmbz/mN4l065\ntFZRcTG8+6jQ+7xtimIlvHuFMV3BLY+uRDhid3obTsVsh7cCPypUc8CbZFXjhiE8TdI7Irs8s6wt\nX7sLAPDU6585jzU0dgEANu3o7NVxA8Cd8z7C35fU521+y9fn7b7/3JD7tiJa1gxrA/w9tHFbBwCg\ntTPheZwD1orD/nOyabd0mN3DE8O7j1RZ6dWiCnblLQz7vxKc028FuiS5m83NKlsRfgTUgPVYesBa\nXDPv8z669itoT3Rgbct656V2FSP7EqhvMwPcp5qj4/vTbJ6vOvKMNk+6wtvVIiAUb/gV/J5Z93kP\nbFC2dZnHPbI64Hnc22wuEI2nsO7z1gE9tuGEAVM6bNUYnhjefaRIhU2PalNVO6itjQ1X5S7Sk7TY\n7CpbNgLpJnopfatYzArv0yedAgB4a9sK57VOv/A+a3DvqofwcdNa+K33T/ajSszXt5tvkhZ35d3X\n8M5s8hvoUcntYfO4R1T4PY+nXIP0NM3AnfM+xp1PfYT6hvYBPb6hzv6nM9AXZW5CCCz9cBt2tQ7u\nfAmlwlaN4Ynh3UeqrOLkoycAAA760qgCtvc2m8vCHd7Wn8E1YE3yJyAMCbLwwWc10UtyepKWmBaD\nLBQ89sIuHDhyf2xo24hdkd3m7uzAC5lBMn/Ty/D5zPdI9WPu8XwDX9yjzd39v+5BdYbct+VMM9/S\nEAKabmTNvFYq9mfOfD8tY5KWLTvN7oimMlxvvJw5zeaD2POweWcnHnulHr+a++7gHUQJsfIenhje\nfeSTFXzntC/jjh9NwWH7ju5xe6fytprNZeSqvN3hHYNIhqDrrv512Wo2N4CYFofQfdi8vRMnjDfv\nH/+0bZP5vN1vnjJHqTdGmyGpZgWZ7EezuZ5jGlAhhGeeb/e0oRHXKHhD6Wt4Zw9Yu/z3b2DO3z7o\n0/56w90FkDkoTcszYI0LbPRNb6vD9zc04sEX1xWlqozEzC6q4VqgsvIenhjefaTKKmRJwtiRuWdP\ny9o+Y7S5e7S63Q9uN5srqg7Jn4RIhKDpBnyKtYqY5K6844BuTtEaUioBAM+/vRHN7bF0FaOkB4nF\nVXOFMvfgqriW7hMvRGbl3RZvx/ee+ylWtb0PqGY4u0MtYbgCW+1bs3n2gDXzd7vSLaXWrvT88cmM\nFgv3eXT/LGekdyyhIZbo/2Q1A03TDWzd1TVg79fb6vCP89fivU92Y3cBTd2vf7itV5/FMATunPeR\n526NoYyV9/DE8O4j91SphVCt0eb2JC2q5FrW0x6wZjWLT5xo7lskQkhpRlazOWCGt6GZj8swt4+m\n4nhx+efpK20lHRqblXcANenp835iwzO4d9VD+KhpbUGfQc+oPjd1fI6ElsCyliUIHvkG4Is74W0I\nA5qhQYHVV9zHyjuzz3sgFwGx108Huq+8NU/l7Q3vK+5+C1fc/VaJjrB0HlrwCW55dOWA9eH3tTp0\n5k/Io7E9hsdfqcctj+afXz+ztaSxPYZ1W1rx10Wf9umYyg2ze3gqaXjffvvtuOCCC3DhhRdi9erV\nnudmzJiBiy66CLNnz8bs2bOxe/fuUh5K0Vz+le/hrP1metb2LoRdeUtWda1KKn71PWu61IxmcyVo\n9puKRAU03Ug3m9vN6rIBXegwUnZ4p5/fvKPTudIWcgp1FWMBAElEoY773FMl2iPUN1rN7T3pbrIH\nSTGgjGx0giypm8EXRJV1Avp2q1jSSHpaEIq95Gh3uqLp982cRU3zDFhzDTTsY7N5S6wVN7xzOza0\nbuzbDors/Q2NAFBQZVsMfa0OMy8oMxXy/0vmn2y4dX2w8h6eShbe//rXv7B161bMmzcPt912G267\n7basbebOnYvHHnsMjz32GMaNG1eqQymqI2sPxxn7fb3Xr3Oaza3wliDjgL1roMiS0w/ujDb3m1+Y\nduWdsjPErrytMBO6VZFb64VLio4dzRGzipEMQNYxOjAKFx30LfP5jACt9JnN7YVOr6plfAnYI95t\nysgmZxR2QrdmiDMqrffuW+W9UnseoWNegz20qbezRdW3fYZH1j2BVB/mWe+KuirvjLECqTxzm/e1\ngnz1izfRlmjH3DWPFbS9EAJPLKnHui2lvT2tKuTreaMi6Gu+9HSPfUE5LHX765DHPu/hqWThvWLF\nCpx66qkAgAMOOAAdHR0Ih8M9vGr4UuzR5s7tZVbftyJD2KPN7T5t1aq8k0GkdAML3ramFrUGrNnL\nhcIKbwjF83wkrjkBH1KDOGj0gZ7nbVU+c0BbOJk7vNvi7Xh03ZOQrIuJzCrHvtf8lJFnw4hXQK5u\ncyrUlNXfrYgghC47y6D2VhhmOMk1zX16/b2rHsL7uz/CxwV2Dbh1uirvzJDwDNJzN6FrfWz+tVpy\nNFHYRcb2pghe/WAb7pz3UZ/erzvukfWZF2yl4q4OOyNJrN3ckndbz/H11I3ShzJ6uGXdnlh5G0Lg\nd3//EP9c8flgH0rJ9G6asF5obm7GYYcd5vw+evRoNDU1oaqqynnspptuwvbt23HMMcfg2muvzeov\ndBs1qgKq2rum6p7U1lYXdX/dGdlsNT/azeaKitraavhUGYmUt887FJKBlFlZ+/wqWtpTwCjXJC5W\neAvdrIpGjrA+hxXOmiGchU1GVY/AXnXWrWzW/u3PXREIAl1AzIjmPBfLPnkbK3evQuAIGfH3T0f1\niJBnu+QXZrhVh6ogkgHIwSg0w0BtbTVi7eaAMkX2QST9gJrM+R7vbVuFUcEafHns/p7Ho/EUKoLp\nqk8Zux1GR61nm978/Xyh3v+9U64vPUOSPK/3B9LHJrv6XYMVfmc7dytBT+8dCJj/FDVDK+g4O+Lp\nC7Fi/3/c1pluUQm5Pk8pqT7FeZ9fPLQEja1R3Hftydhv75qsbSOx9EVVVXXQeV2u44y7rrnyfY6a\n1phnm5he+N9tKGgKpy+cC/k8w+Ezh6NJfNrQjk8b2nHJN78y4O8/IP9mSv4Olsz7ZK+88kpMmzYN\nNTU1uOKKK7B48WLMmjUr7+vb2orb91ZbW42mpoEbTdvVZX1BWAEqdKCpqQuyLOWYpMX6YhYyOrvi\nUCUFCddrneZva8BaW6s5ktsO/65Iup9Y0hR0tVnPWzO0NTZ2QpIkdMbMintXuAlf7GyCX/Z5+vI7\nwzFnv1IwjJaWCJpC5nsmkjpefOdTqOOARFQ4rQApI4mmpi7s6jAHOukpCdD8kIKRrPNtCAN3vvMQ\nAOCBGXc4j2/Y2oY7nlyF807e32yokAClphkpyfBML9ubv19LR5dn+22NYUgSMKG2yrPdZ9s7cO/T\nH+Pq849EY0u6RSIWT3le3+EKuIireb2tPepsF0+mq+jujvWtxmVY9Nkbzu/23ydTMqXj3U9247iD\n69DWnv73UKz/j1es24VdLVEctl/61sfWtuiA/DuJu85vo9XPvnFLC6p82Y2Dja576Ztawmiq9uf9\n99zSkm7ty/c5OjLOZVNzz68ZSlpb0/8f9/R5Bvp7sVQiroWEBvrzFPsc5rsQKFmzeV1dHZqb002d\njY2NqK1NV05nn302xowZA1VVMX36dNTX1+fazbDh3ELk6vMGkNHnbQ1YU+1FjmVougG/Yo3Ylg0E\nfIrTbG73ecPwNpvHk5qzTYUagk/2eZ5/7q3NeGfNTqfPOqEncd1bv8b1Cx/CZ9s7nGN2z58uBSOe\npuKuWNJpAZAMn3MsQtagG4bTbA5dgdB8kBQdCc3bdJ7Qc98+ttIaLLVw5RanA1JSNchVfR/5HE15\nJ0/59V/+hRsf/lfWds8s/QyRuIZnlm5yms1HVPg8zea6YeCL3el/nO4+b/e98O6R/d01Xc5bu8Dz\ne0TLfaG6YPnneHThBjz56saSNO3OXfAJFiz/HM0d6XM1UPO25+qXzddk7668e1qOtZAxEpnvM9xW\n4ervx+mIJPHBp03FOZgBMtz+hrmULLynTp2KxYsXAwDWrVuHuro6p8m8q6sLl156KZJJ88t85cqV\nOPDAA0t1KGUhff+vNe847D5vKV1NWuEtK1Z1bshIaQYCavo+74BPTt8CZjWb6xrM6t0K51hCd6rz\nkBqCIiuQhAzJev6fK7bi4X+uzxpwFg5twd3/SPehulcFU0Y1oiWRHhxlGMK5QJCFz6m8JUWDpgkk\nrNHmwlAgUubFR1vcezUaTaXf372wiv1FLqvmY0I3L07kEd5+UPMiQcsK5lw6k7mvhA1hYHc0/cVk\nT6aj6Qa6oklUhXwI+BVPiK1YuxtrXQPFtDyD19yz2el5phAzRPbjLbHcg9B2tZih/vmu0t7j3tKR\n/ruUMrzdrXG5Lm7yjST3hHcPo80LGayV+d7D7YvffQ76MjPh//z9Qzzw/JohNfVvrgmlhpuShfdX\nv/pVHHbYYbjwwgsxZ84c3HTTTXjuueewZMkSVFdXY/r06c5tZKNHj+62yXw4kK0Ba3a/tV15T6yt\nAmA1nctWVW6FNwwFKV0gqKbv8w74FSek7cldNN2AJBSn2Tye0JyAt5cTlaB41wuHQEJPoNJeKxyA\n0FTPVbq78lZrt+PvDQ8imdJR39BuTlpih7er8oaiIaUbSOr2iHgZ0Mzwrm/x3pIW1dKh61772/4y\ntS8OjC6zGVcOeQc8aprAw2sfw8+W3YRIKuoJccMQePHtLc5kOJ3J3IG3cMur+M27v3fudbfvCtB0\nga5oCtUVPvhUb3hv3tHh2UfKM2DNtba5PUJd0pHQcg9Ei2vZrQ/5Rv/b/w/phijpl1OLq0uglMud\nunMkZ3jnCdGwK7x7Or5CgjgrvHvY519eXo///r/lPe63XLg/X19Gntu3Cw6lqX/zXSwPJyXt877u\nuus8vx988MHOzxdffDEuvvjiUr59WXEqbzugrdHml5xxMPbfewdeiSsw7AFp9n+FDE0zYOj23Oe6\n2Wwup8MdsKoj4Qp1pG/NspcTlYXqHW2uaBAQ2K9mknO/t4hXpudgR+4Q+cvL6/Gv9Y046/9NgqRo\nELoC3YCn8tZ1A0nDWr5UV2AkzGOYt+kZHD/hSAStVdJirvDuSHRiZMAcnGR/v8jWoDsRr4DQFUhB\n7/GkdANrms1j//mymwEAPzjsIhwz7ij8a/1uzH97C0LHCEABOhPp4HdXa+/sMCfvWLV7DY6qPdwJ\n70RKRziWwoSxlYgndU94B/2utdxlAy0j/gUlPgJSMIIW3Q/AHHyXTBmAZCB45FuYv6kT3z3sW1nn\nM7P1AzAHreVi37FgGKLHirM/8lXeiZSOrmgSY2sKm1WwJ+4gyZWx+YI3Ek+fn54uYgoZaZ35Pj0F\n/turd1rbGVDk8p/nyhPehkAP89rkNZRaJIbSsfZV+f+fN0yMqQlaP3mbzasr/Pi3KftClc1wkkc2\nIpwKQ7Kq8ZRuIJGy/keUzD5vJ9ytyjulGea93q5wloLm1fLY0Bjzd6E4zeZAetWyoBLElV/5sfmg\nrHtmrAqnIhgd9C668q/1Zn/0xoYOc1CcrkI3BISRWXlbzea6DL3xSxBJM7Cjrv5cd3gv3rrUqdad\nudntixFdhYhXWp/JfZtQdoDtipjHZ37BC2cZ1Q5X5e2e6tReS317s9msbs+E12atJmZW3rInxGLW\nQLTbLjsBoTFtSFR/Dv8Bq+GbsAmrxItOU3hS0yH545D8CWzsyD0NbVw3gzKoBHDKPicCQM570qOp\nmPP31Q0BrciVhbs5tdm1drm7JeGOJ1bh539a4Zl5rj/0HirCfBcoUdd0sz1V3oWFd+ZtgIV98Wd2\nKQgh8PFnzWU3Ha773PYn1IbSLWdsNqeiGVUdwG2XneBUywq8k18okgJJ1RD48ofYEdllzaomIaUZ\nSKYEhCFDkg34fa6QFq5lPo2McA5EASFhTMhscpZyVN4AsGZjJ3738GdQjRAgG051J4RAOBVBlTWR\nS6akpluVt2p++en2RDEaNF044W1oMiBk6O3mYEU7oAHvILKPm9bipS3mGIms6V11FUas0hz17k+/\nRtMMyJL3f2FP8Lmmh+1IdDrv51621J4AJ2m9zl533V6UpLrSD58qw3AtwBK3ngv61Zwz7dmfMakZ\nkHxmOLfEWz2f3WZX3idNnIqJVXtnfwaYs9XNee9ObAq+CkBYK6sV98vJHUSeytsVjvZ88u5m6/5w\nh0GuUMn3GfU83RQ5t+1L5a27jyv//jOPb/naXbj3mdX466INPb7nQHJ/hP5c8w2lanYoXWj0FcN7\nAO01phIHpk6D3joOU+r+n+e5zBHGftkHVTFHmyeSulllywZURXaazYWr2VwYCiRfylkARA5GIeuh\n9LzoIqMyt5qk7XlzzIsD3ak8E3oSmqGhUq1AcrN5n6Q9hzoAJDTdDEddhaYJT5/3X15ej664GZS6\nZq+mZr7WDnXAW3kDwEeNa8xNnT5vb+UNAFIo3XSe0DSnYjyq1jzGlGEHp56ezAaAgMD7uz92nks/\nYU9ba430z2hTHFFhhjeQDji7sgr6FShq9rdh0khCNwxsbwxD8iec998VzZ4C2J7oJqQGnb9VKiPk\nP2hcjY5kJ8LyLsjVbVafd3Er77hrBTXPimnWZ3avsFasL3F3tZ85h735Prk/o2dq2j40my98dytW\nb2rJu02+VfIyZVbe9iDGTdtLv2hOb3gGBvbjNoVi/z9XSkPpQqOvGN4D7IpZU3H1cT/AtMO+1O12\nqqzCp5qVdyJlhndNtYq6kaH0wDO72Vw3zIFhAEJfXQqoCUj+BJRUFYQQ+OPzaxCNCUiygNPsbM8X\n7p5iVU734dn93RVqJTsCVIYAACAASURBVPTmCdA7R8OADsBuEk5Bks3Qjic1T5/3Z9s6sGqTGVS6\nZlXyVmVu94UDQDSjv7cl3obGaHO6/9OpvBWIlNns7p5mNZyMQkDgyNrD8c39Z5rnwqpaw7GU83q9\nrQ5CAG82LEdjW8QTRPY99kKyBt9l3F49wmo2B9Jf1vGEBglAwK84I+LdknoKTy/dhKde/wzwpZug\nd4R3ZW1rV95BNQjVuqVPM7zhvXLXh87PytjtVp934V9OWzq+wJ/XPo5wMpJ3tHE8zxzg9mduaDSv\n8tQJG/HytpcKfu/u9NRsnm+kuztce2w2z9hvNK7h6Tc24Z6nP05v002fd+b+3ecvc8rcZmtA11in\ni6w86D20cBS8nyE09Vyxu5XKEcN7gPl9Cg6eNKrb2eQAwCer8Cmy1WyuQ5FVCDkJQ04BkvWl4fR5\n604VDgDKSPP+eilZiVhCw/ufNmXdCy75zdCwQ1FYfeZ25b0zYgbNCL81QYDzeiu8EXFeH4lrnsob\nSFfYuqZ4Xp9wVd6fN5mVynXH/Bhn7GtOpdueaE9/Qcqu+9l17/EDQJc1rWuVr8IJPrvyjsRS6dHq\nsSroLXthV2wXfvXss3jh7S3pE23tLwnzizfzy7raVXl/vH0zVu/YjFhSRzCgQJYkyEqu8E5imTWo\nya68ge7DO6QE0pV3RrN5S7wN1b4qyEKFXNmRNWBtV2Q33tuZf33zf9TPx6rG1bjx5b/i9sdzbxfP\n009rn4+GRnNMgG/CJqxu/xDhWApPLKnvVxO6O0dyZUrmMqw276IwvWs2D8ey++u7azbPvIBwH1Pm\nc01Wd8OoEYFuj2mgGT3cklfwfoZQNbsn9HkP2Axr1DuqYt5fHEtqSGoGAlAQTnXhXflvgGwu4iKE\nq9lcl5wFFeRqs0lQSoUQtkfmWkHv+9IGpD4/DFLAXrnMHDls6GZzvH070spdq8ztIxMAtDmVM2Qd\nMFTEpU4oMEeoR42Uq/K2mrqtCwwtZVW2OZrN12zdBXUsMMJf5dzSFtPi6S8J2T52NT0JnSss7TnZ\nK32V8Cne4AvHNE+fubbjAKhjd0Ie0YpVG9OTB9n3w8eMMO54/37ElNEA9nKe19ROrK94BsrY/fBk\nwyLz/RJnOyPOJVflbUSrIFeEkdCTqAgoiCU0p88bAHZEssM77qq884V3OBnBmNAoSMlKdIR2Q0fK\nEzi3vncnAKAmMAL71UxCwJ7Uxz4uawBdomIbNm34ctYxAN5mczc7HKMJLf33ADD3pbVYs6kNmiHw\nvZkH5XxtT/L1ecuSBEOIvCuCefq8ezlgzT1ffa73BrxVW+b+3bPmuS/0hBDOQL5yC7nM0eZ9NZQC\nkc3mNGh8soqAT0VXxPyySfc3Cydw7C/7ZMqAcC2bKVeYVZIwZIStLys7PNW6bZBrmiFb4W3Ezfu8\nDatvWlENGMLA6uZPEMIIvPCKNWGIYc+/bo149pnNqCJe4am81boGyKN2Oc3Qeko2mxFzhLd7xLs7\nvJ2Kxj2TnD0Lnavytu9Dr/RVOLPI2f3F4VjK2b/QVAgtPdGNh7WNhhS2djag0f+x5+k2YxeSUgT+\n/dMLm8STOkKBdDcBACTWHwe9dbzzGYP281blXaFU5q68dbvPO2Teiw/vrWIpQ0Ncj6PaV4VqqRaS\nBBiBzpxNyvd/NBd3rLwPQgi8/uE2ayY2gdZ4m3ksqubchZApka/Z3AooTRee127aaf5/YfSjedId\nJO4+b7v1J6nluaDoplk7U+aXeFeOkfLdNptrmeGt53zOfftavhYDIQSefXOTZxbDgeAZbd6Ppu+B\nWqSmGPaE+7wZ3mXi9Emn4OBRBzr3OvtkFUG/4vzDE3L6S8eerGREyAy8aELzNM/KlWZ4G7qcbtbU\nXaOiJQEpEDOraWsCFfteclk2oBk6UkYKWjQEZ35Su9ncqnxl64vciFeiPZxIr3AGwDfhMwgrZFMp\nCaOqA1AlMzyT1rSpumFO8iKEOUNb0BXedsUl5HSfd2azPwBENKvyViucCxk7+CKxFKCmK3e4lk1N\nnwcjPSFOPkp2c3IslUDQnx5dDwBC8zvvsaO1AxV2ePviEJoPtYFx6Eh2eia+AbwD1pZ+YIb79tb0\ngCd7lrsqfyWqYN72JwKdeQcP7Yo2YmP7Jjz+Sj2WvN+AjmSXZzIcuTJ7MFVcS2Bly/KsVeeAdEBp\nugHZdZ99NGVdlAT7vmSonmcglWKHd54Q9FbehQ9YE0KgM9q7ZvPsyts1sM89IY/r4iffRcfnu7rw\nzxVbcftj+bs4SsGd17kGBvbEPb/AUFGs1oZyxvAuE//fAWfgJ0dfZt0iBqiy2WxuS0npL2A7qGsq\nrfCOa5B82TN1CUN2+vjcfeKSrEMKRK0mc8nZFgBk1XACUNcl1768fd72hCkiXoH2cDIdrjCb0u3K\nW+gKVEXGiKDZPG/fLhWOpszPofmh6cKpvONaHAnNACCQ8rUDwgxGu9nefTucHUqVvgrzVjtIzoC4\ncCzlDG4TKX+Oyl3Af9BKz2fPJQVr/vdPj4HeYYYnanYh4LcvqlyD6qxz8MTrG8zKXElBCkZhRKsw\n2mfeKpdZfcdc4b16o1khN7anJ5SxZ56r9lVBMsygFJLebRW0rvlT5+ftHebAwUlV5gBJKZQ9Teyz\nG1/EB13L4Nvn06zn7PBKaYZ5+6HFvmjpzz3NIk+zuT1oMpmnP7uvfd66Yc6alykz4LuvvHM3m7tb\nLqJGFz5sXJ31Pok8XROl1t8Ba3Z4r9ncUvKpeYuluwuw7vx10QZnEp5yx/AuM4p137JPVhH0eatl\nN2HIGGmFdyyhQW8dl7Uvs/K2vmxEOoilQBSSqkEkXTNluSpbu8/VHinuft4OTykQMydesSZnCfpV\nnDzCmkFMNiAk3ZqaVIJPlVFTYb5XJGmGVWs4BikQgxGvQDJleJrNkykdck0z9EA7KhP7mHO4Z1T+\nABC1Ku8qfyUkSYJPVhFJxLHgnS1WeNvN5n4AMoQhpcPfl4AywgxLvWW8s09ZT48U3m+vaucWPpEM\nQsTMufn9B6xG20irerIH1bmqe8jmjGxyVbvZzN01GmN85t9nc8fnnr+RHd6bGiLOHPbuPm+7X7/K\nX+lZtz39hZT9ZRxOpPvZd0fN/v0vjzBnN5QrO7NGnO+KmhPb5Ap2O7xSuuFtclfSa8c/99YmvPbB\ntqzX9iTfaPN05e0Nu2ff3ISXln+OsNTsdH/0ps9b13NX3lpGuOVbqx0AYnmazd2tBLvHvYSH1z6O\n19au97z2b1/8H/wHZy+GUyhDCM/FQ8Gv62cVav89GhrD+M2j72PTjg7c98xqRON9v3ArplxjI/py\nwZJM6Xjzox34y8vre964DDC8y4w9baoqKZ7KO4sho6bKbPKOJjSkPj8MX459A0YsPamK0CWn8naW\nEQUgWc3u9qxn9v4As9nZvlXJXXlnjVZXXCPMAVQEVewd2MfaRoMOzemHVhUZFX4zFCNWsOzobIYk\nCYh4BZKajpCSEd6VZr9gZXQ/81hzNJt3psyFEsYEzYrYp/iwszWM55dtMf/B+lyVN2BeaNjH75rn\nXa4I43jpAnNbpB8/cOJIp5lbaH7rIsDU5WswH5fTg+Ls1gF17834rKUBcrV5cWCER2Kczzw3G9q8\nM63t6mqF0GX88dkNCPnM/bv7vO1b9qp9Va7WAyNdfarZYbR5d5vzc7u1GEyNOhpGIgg51JX1ZaZI\n9oWZAXWfDVDqvjB/l1zN5prhad2RrM8djafw0vKt+PuSeuyM7Ma8T5/POV97Lkae+7ztqYTXbmnF\nR5+lBxf+c8VWvLBuGT6vfhnq3uY8+T32eXtmFzN6rLx1XXQ72txTeWu5K2+7p+mJ1z51LpQMYaAj\n1QZlRO5FZwpx3zOr8V93vdVta4emG57lMM337t993plTwP7+yVX46LNmvPnR9l7vq9iefXMTfnTn\nm9jZ4p06ubtBh24bt7Wjrcv8/zVfyLd0xEs6HXFfMbzLjF15GxCe8E6PJbcYCmoqrfCOpwChoEau\n9TRf64bkDFjzfPFat4l5mrqtn9uqV6fvv3Y1J2eFp6x7Xl8RUOFTFXMOckWH4QlvCRX/P3vfGW9H\nVa/9TN/19H5OzknvIR0SEjpEulIFiShYLyI2BEQR9PpD5aJX5d5XQbHAtYAIypULWABpIXRIg5De\nc0pO3XXKej+sMmv2npOQkJAE5vlAOHvKXrNm9jzr356/wfTMWax0xyDt5EUKSdiOhxjTYM+5eRSY\nJjgdgxEYq+w2H3D7YGkmKkxqERuqESB33eR9z/k51OD4GZztI2F5FehIjxDu/mRMxylzRvgxascA\nsf34rgGLndIG8VhnODZGNZaFPvlpkZvgZSphKnG0pVqwrm+DiGl7xENPoQskT5vT8LwDuXXqoBTz\nlhdQPO6rsAXKmMqRmBc/EwCwTYqZ9xeY7CuJg2QroJhF9GT7AxKnQo42MQijeQN0Rt5xU4fteMg5\nebylPwa10idSYXnnfCL58Su348mtS/D0tucwHAghIg8j2DDD30f+/Cf3Bd3PWgO18DU2Fsfx8Pra\nnmFL1uRzOR4JlXYtzXrfXZ33cAlroXFuhYhEtqCG/b4RAReWkRvHlOJbv34Bn//RUwGyGS488Xah\nlogfcC/Du9Uudnd4aMlGAMCK9cFFUVAlL/yaO3uz+O7/vIyb734RQDjJ9/Tn8dWfPov/vPe1sm0H\nGxF5H2Lgcp8e8QJu85OSl+CyKR8RWeeEqEjFDWiqItxXhq4G4rfE8RPWvLxvkYsabznWy+uwY9vx\n5zUPBT4D4Mufqi4AAqjB2vKERevS4eo0EU3xyds0NCTMIHl35ujLl1reXiDmXXRcEVsnpCRhTvXd\nxUNuH+pitejpz+P7v30Zjh20qDXTptYwczcTT2rqwv51drbD3dXC+qYbgOohldBx2xePRW1lDBk7\nA0MxAaIGLG9DYeSt+AI1gfkC/GQ3x4DjemhPt8EhDr551xMAgK5cD4jiwcumEDM1DGR4jbwjXLfC\n8jZTUtzft7x5XH989RjUaC1iO8cga4X6qwfXw8vSmv2/L1+BL972NJ5bSePv/YX+wHGinaylwXE9\nPLVlCfr1jZClCbjl3ZcpAIoL64h/iYXGa10rUIqubA8Gi0P4zSNv4KofP4Wt3ZlhNbdLX7YD2SJW\n71oLrWETVOba91gI47W1PfjRH1/Dd38d7o4W51IdvLTzFQxk/UUst4qD3+1hlf009DYa/y+zvAsS\nebthbnP/M0X1hFUnt9flHqF9xe6M561d9HmRPQHDLYzeLrRS5aJDEKW6GYFF2zBW86ad9J70MC3/\nsORH3tt+1cbesm0HGxF5H2KQyduSyLs53YA5jTNgKiwm66mIWzoqkqZI7DE0FfUVPkm7riLI29ky\nDsUNkwFIwiEy2Uj//0bvWwDoAsHfTv/fHLNMxHJlyzwRM6DrKrW8VQdEcaEpPB6uIcXc5isHX8fD\n6/+B3iJzKRcSKNgutnXmoCoqNnX3omh7Qq5UfAdhMWtOiEYBLhxk+k1c87MleHNzHwaGXMhtTxW9\n6LvMAboA4W5zVodOHJal7hIYGvMU6P6POGNnYaksN0Amb7Yw8VAU4QNSQt6KXqTKdVDgekR0U4Pq\nghCC7Sx5jeTSKBRdZLJ+jTx/6QvL20j6fd+lmDe3vFNGCiopDy2IVquOKch7xY4NAIA7HlwJQgj6\nCiWlSxqXf6WWt0tCrErNRUXSxMBQEUosCzVGX3KqomJd/4aApekRD//x4m345Yrf4cnXaDLQ+m0D\nw8a8S8l73dYB/PjV22GOXClkfVHiiXpdcq/L4C9xo/0N3Lf+TxhM+fFM/rIujXFvJstgtKwXf3O8\n2rUcAwU/L6DUba5YGcSP/Jv/5ap/H4ekKgO5MmRf8HZ6cpcuSDj25DbfuGMA/3X/soDrfTjyPpRy\nuEuH+HZi3p0lLU7DKjhMQyv77FBBRN6HGHi3MZd4Abd5OkGJQybvmKkFpBh1XUVrbYX4m3gaeofY\nKp9ocDtHiAYn/Bz+viGPQpjbHIA1eSn9n5KYNz1GBzQXRHVE85WYqSNp+eP86/q/+brmtonnV+7E\nt3/zIlxbw2CBveTYGF23xDvAPuelal2d0o+LaDSBTKUdxVy1ECBcriInn58vWlzPg6XSfXVDJu8M\nYiodu6gVB+ApLC9AsUXSXqAcDzSWbjHCdlxPiKcomgvXI9gyRInMy6XYi1AR96efuXeHbE7eKSGB\nC9XzrT5meadNSu6EoKScboiGDYgKkqXPhpyYtq2/F45EzqpnsnI6D6ahwnY9IfIiQzc8NFbHA+1n\nTxt5MmY3TAfgl8ABtAFNxsliTd86keCn6wrk05JhyAYA1m4rr4tWNV8NcHcQOvkshGEnt4ltnHxl\nK01uHSvvs7p3LX6+7C68YD8ItXon9Kb1cFwPHgsDFG0XetPGkkF6tIwS/n0EEBDuKRsv8fZIzm8n\nbC1n4e+N5X3lfzyOl1d34bkV5Tr85eMoP9ef1/zfbtX+9hV7Gnep5e0GLO/wY3mf8mSMl5mW73co\nl5lF5H2IgVvepJS848wFzcmbqIgZQfI2NEVYhAAATxUPKIXix39RalmHrDBD3OoyFATd5lTpTYei\nuVAU1gwFQNzUUBEP9oC2XZ6lreI1Fssjju5b1oxcbVt+80iWM3P9J9W0fz2uCkUliM/5BxQzD6J4\nAVc37bxGaDy9pDOb6xKYjFw1g373uv4NsD0HMS3BxufPnY0C8k4BLmxhvYfNkaHQc7oegcoFDVUX\nRdtFX44nDkpa2GyBMsAWXZt6ekA8BU7RD4koiivkTLmLO2kkqTEqhwYA5NwMVI/OPfdCFIlv+e0c\npLHCZHYU8svno4KwzHvdEfK8A5LL18vTcyUTCpK8xpuPQU+IOZRlcLn17xFPJPFpqor/emCZf97d\nSHgOZotlOR+q7ore67sDf4nzygoS8/MBuFXtegSKlYU17Um83hOMsfMXf3+BHpdVemGNewVG+5so\n2g4efHo9rvrxU1i5cRfUip7AsUrA8vYTqoazvPNOAf/+3K349crf7/aa9qTbrVZ24U3mPQPefsxb\nnvdk3F+YD7eYKP246Nr4+6YncNeqe3Y7vr2F63n42h1LcO9j4W11AZQ6YgJW9HCaCNt76LuxtoL+\n/sLc6283Uc0jBKs29r6jxi97i4i8DzHwhDW3JObNLW9D5a5XD5apo7bSJ0VdV8vIuxQyAQXIhoQ8\nCnsgd/m7EjFK3rL1yWO0MVNDMhaU7LSJLb6DJxEpngHD8jCiIeWXAknhQSK3PWX/5vNAR2MaJ8xs\nDYxXYdnqPMnMMjV/PjSnrDOb43rCbc47hf34lTsAUPUzOugY7E1UCtT2CljeQ12w3mBN4FwyTGbN\nP7NsOx54YpMYe9HxROa9fJ9UaIDioj9TxKadg+jNDQKOibVbB0RCG1RPlCxx8t64Nc+6z0neBcVD\nkRRgEDZ+fq3En1SejY5CEqZdA5PF8hXNgc403Tlx0fmk280YEd4WbnnHtLjwLshKev1539LnBNfd\nnwsmj9VtwD82/QuATzAXnzyOnstxxaLW3jyOnch5W+QtkvGYkp6iEtF5T7a89Za1UONZPLL1kcDx\nolQupMd63inikefpPX3hra1Q48GMZyhyzFsi72Es78c2P4nOXDde3PkqVvS8OSxp7l6m1IM14SX8\nz9q7sXGAVkS83WzzjTv9+/R2Er5K8wGyUmfEMG/NviKTc9DVl8fGnYPIOTlsGiwvS1R3Y3kPN/4u\nFs/mW12XAEYe8SMfwWObnsTTr2/H7//xVuixpXjxjU6ahf9WePjmQCAi70MMgZh3wG1OicVQfOvN\nKnWba6qwfADfsm6o9gl+WPIOURIjw7jNOUbUVfqHqwomtlcFysc42cRMXciJcnDxEz6GuKVjYmsD\nHGLjcxeNRnUFHWdBTiJ2Zbc3V3BTkU4YaKyOB65HJOUxy7syYYpriM96TKjQnbuQkoHjEphsMdLX\n9Dh6cr2iZGt2zZHivM6OUXD7a2ATGy/upPrvXBa11G0OACZbbK3fPuhnzGsOirYrVMrkudVVHVA9\nPL9qJ2761QtQDBq394h0P1TXLxdi5L1lexH5okv34d4JVmGQGeT3UQFxNbjwJ7UvT4nZKRiIW5oY\nLzQbpk7H1S/FefkCSTdcn7wNej5L8cm74PrWZU/Od3trjLxL1dOMjlV4YM1DcDwXhAAT26tw1GRa\nG19winCJC7evDs72MaKigTeMCUPfUAGPv7JVkJDcjc4cuRJqRXfA8pZDQNLFis5hgYQzBqphH1zA\naH3t0gLDldzmEnlb4eQtJ/r9v9fuxLLulaH7DVceV3RtaHV+WODxzc8AKPdqEELws9d/jf9dG1yo\n9PTnWRc8EiDm4cgvb7t4+LmNWL2ZlmxmbT+GvCvfF3rMvoCX5xUdF//96p34/gs/KRM7KvXWBGr3\nh1ns8NACfw4c1xNVDH9a81f86onnsKnTv++7s8K3sERBfr/fDUTkfYhBVcOzzbmVoTGZUUV1ETM0\n1ErkPba1UsiE0pPQ40c2paXPJHeYbJmXan5Lx7O9yzaPb6kVC4yBrI3KlIXjjmgX23lTkpipiZcc\nh6uxlxnLJm+pTeDoFkqSd6+6V4xHdpvLMWthgbsa0gkDDdWJwPWUlsNVJM3A9ei1NN5cnaIuccfz\nhCeBqDbuXf0AvcaqMWhPjQheuEv3W9e3EaYSA8nR+f33y+eXzRG3vAH4Cxtmeedt+sKvTib8/TUD\niurhjU19gOJC0VwQx4TrefA8iJg4J29uUaowaRmT7DYXjVmkBZur0wx5Bp4QZ+cNxC1dkLeiOeLe\n9hcly7tAnzdV84TbnBOXqcZD3ea9WXo8IUy6Vy+KEkYK/9nryXK3ugKTkXPe4wsxQ1wDNCcQ81ZT\nvbjnzT8LBb8f3PMq7n70TSxdxWK3hpSAVbMT1sQXBQm6HvGrGmRIuQWDdjl5F11byMNyD4ilxv3K\nDtUT3gWZvLmGQSl6MoMwSAIN8ToAwxPgcKpyj21+MqDBz/NKZPL9xV9XYe2urVjWvRKPbHxMfH7/\nk2tx9zPPIT7zceitawLqdjIxquke4bnY3p3BH59Yi+/9lraslWV4d2a7Qse4L8ixDP9C0cP6Aerp\nKG3y44sJ2VjbtwHLi0/AGE1DII7r4anXt+FP/1obOIaHRGQJYPkdEZu6BErCf/YzuxGl6WFW/HCS\nvgcCEXkfYjihbSEA4JSO42GZ5daAcFUzy7u1jr4oJnVUY1RzhbAeAQh3LHe5A74rm26XasI7R8Dp\nbIPT2eZvl9zQJJeCN1SJRNHvuGVqJs4/kVoZU0ZS17HIqAZQLPjkHbf0gIAMjAIjW7pPU20Ccxpn\nYFrdJKzr34AhbTs7h3TxcsyaK615GtIJk1qBsopcSUb9rPH1qEun/HMxxboYUzVzXSL01wFfxtXQ\nDJhG8GfCSSTjZGEp/uKptT6FUpiaLITj66sXbQ95FhNorvGTDGO6IVnOvshMNu/QlzBbwMiWNyGA\n6rG+6l65d0K27ImniVp2wE+kKuboPbKYWA50h1U7EAzZQ1DsONA5GvYW1pVMc0SiD0+aMxCDxa5X\nJm/umvcGqJiOVrsNg3JrTql0atsQJVtVVYVlXfAYKTAvCl3EObAMSU9/1HI8ufVZPLLhn3A9V5RM\n9bA2nUqImI3sNlfCyrcUV7yMB/dgeXMPSEz1PUCK6olqD368O1gFNZ4RLm0ZWTuHQk7DB0efAcDv\nA5ArOPjt31eL/Rw3PKntjV1BFy8PXcge7KGcjd89/1TZsX99diNyBiVEo3VtoFc5J38lNgRr0guw\nJtM6/lK1uqyUUf+/ax8WvyEZL+54Bc9L/elLUXSL5fr/kuXN8asVv8OPX75dhMe4Vfzwhn/ihy//\nP2zxVkKv2waoDhzPw6/+7w08tGRjwAshpH+55e2RMg+kKpP3btrfdrPnbLgGPwcCEXkfYphcOwE/\nOf67mNVwBCyj/PYYnGBUDzFTQ1XKwq1XHI0vf5hm+fK4LQBBvgH3ouwelC1vosHeMBVeVs5WD24v\nrJyPKUnfhWxqBj588nh89zPzMGMctRZiElnlmfEbs6jbvLBsIYobJ4rtinR+njRycvvx9OsUjyXE\nlMfdrSnPCsubeNTytgzNT3aDH1fk1xC3NMwd7y88eDcxUzOggCa1aFKHXMI8Dbyvugw5NGCqscC2\ns0afiqq+2eLvQHvOgHyqi6JXBCEKWup8z0jcNH0vCCccx0Qmb9MsbE+lMe+Cr3QH1wAhCLjNdU3y\nTngl91+aJy7/6hQMxE0NMS3OzmvT5iu6DZe4UPKViO+a5hOo6kiWN53LJ1/y+8bLMW+e8ObsGAli\nGzDa30Rfwbcqq6sly3Dlb2BNfRquPgRNVaEqCoqk3PImqitCSfLcPrrxMVz1xNegN9FSL9cjwoPh\nZYOLq0DCWgi5Q/WEBSqTN/GYfKtr+6ED9jwljARkHf3BnA3Xc7FhYBNMLyUWMLe8eBs2D/ou7oJt\nU8liR0c2xyxCRn6PLN0kyc8SbM9tx5cfvwlPrH8hMNwEy81wulqhQRM6/4E4t+KhR6WJX2rp619a\n5PXYdBHVP1TwNQXYgpiXBZaSmWx5bx7ahqU7yrPOf7Xy9/jNyj+Iv4u2G9B8v/Wl/8Y1T92EXfle\n/HrFH7Az0yme9VIZ1NV9a6HX00UQJ+A1fesD+6iJwYDbnH+XJ2nYc0+G63riPnLIev6lynUyIvKO\nAADQVPYjUspdedzyVlRPuNJrKmJCwjBgeTOrNpDMEaKqxlGZNANxW0teCDA0VfrkbmoGFEVBY7Xv\n9pXJm7+EYqbGXKBKILNazlavTtPjWpK+znhpkhx3hauJId8t7lLL2zTUACnpsWLgHKauBRc2jNhM\nzYSmqXC8oOXNyVtX9fLYqpQ3YKlWYNOpI09ERW6cv13zr5d7PbTa7XijawOKjg14KtobZcvbpN4F\nkDLL2/OISNrjpMFnvQAAIABJREFULwlFt0EcAx4jb3gaFIVlC3P3eWkSIRfaAc1GB2huQNzS/fun\nOYiZmp87UIhTS5yoILaBIjJSzLsI4mpYsqwbdz9MXZM524/r8nixl6mE09kORSEYcCh5X/Ghqaiu\nCU6vmhhCwaRuV8NQy8ibXoODdFJ+PoOWqNEuNVlhiwsu7AJQFz63vF3J8k5qKcQd2kRGUT3YbJ4H\n7SHEtBgaN58HZ/toAIBDbL8Gmn1Hykj4vyvFQ6Ho4q3e9cg5eaSdVnj9dWIMXUO+KtiWHuZKdw0M\nZei4RJMdRhp6y1rE5vwdD+/6LYrI4c8rngxc85Cdpde1fip01RALKNntrbe+BcegnhAufyyseOn3\ns9T5E17dvAFf+q9nxCLHigWJqbQ3Oq9l93rpb3ht34bAdtlb4HgObNfG13/+HP7th/8Sn29l5ZM3\nPPtdvLDzZfxr6xLkmOVdCHNJMw8cH2NNrDqwWUkMBkrF1vZuxuceuwbLu94Qn8ltb6EHr4mrJAI0\ncY4QgnvefACvdvnhCdvx0McSE4frQX8gEJH3IYzm2gROmNWKL15whPgskE0eAqOEcDVVCcgbkuEs\nbwC1lbEAoafiQWICgNYaP0lNjudyWJLb3M821/06TEk0hYu4AEBVih4XsFRLM+Cl97OaYGVWsuWt\ny+QddJsbuio0vGWYqgFNU6h2t7SY4Mk3pmqUkbec9CeTsxibNN+xEMtbjWXxt/7f04Q1TxPudtNQ\nYeq+Z0V0RXNMZPIOtSCY5e1fqA04BmzH893mAGJxlLnNP3bqBEHufFvOy9JnytMRs3RhvampPjqn\njLy9giU8EKQYR8YblFzGRX9O2Hf15XwrLONkaEzZMcR+3EqLWRo8zd83zix/T6X3z9BUOGD3UnwH\n/d5kXAqTlHTVcwerpG1sIWeb8DIVYpz8he95HqAX4RXi+FjHlTAddqzqsg531PJOm0nYDoSHpugV\nxQKAex8qrKT4XXHPx7Iu6vKOF5vhDVXD3joGAPDcm742+NZeupghjo6BQWZpMsubX6Wa3hUoAyxk\ng7+/jJ1hc6RAgyFCF778bT/05vVAMQEvkxZiQaVqfRxLNvtlc+NHVOH8U/zcDzW1C/IP8ub/eQld\ng9TFbO8YgYQex/qSJjy25xPj7ct+gy8/eQN2ubQpztbuTKjVammmkKQNI0au9Oc4dCxyxjtA3d6y\nbsCDq/8BAPjjW38Wn/FjHdcLvEMAQElI5J230Vvow5Nbl+Dny+4Sn+8azIuZiCzvCACoxfzRRRNw\nxBh/tT4hdQTcwSoU3pwdekwpucdMLaiQJJM3CZKZkDdlkBOpONpqq6T9yxcSlaZvRfKXWMyULT//\n/LpE3tzy1lTNJ9mSxUVx3RHwWMIUb0nKY96moQlXOOCX9nDi0jU1IBwiX4OuKtjUOYTfPbpOfM7L\no/RQ8vYXKLy0SoYWIG95MVOSw6C6UIkuLNiKhCnlNDjCCiCOQd3mhJM3LwVzoai0tj5XcFDgbnMA\nlim7zTUoACxDCyTNAUCB5JDQaC5CwtIRN+j86rU70KmsgWKypKd8TMwDKcThEgdEp5nJil70xXDY\ngi3LuscRQtDv7PLbz7Lvz7t0e6ezCTuTVNr0kxMvw1ktF9Ahs/71pqHCUYKVA2JRKB5PAhhFjEx3\n4OjmuXRqpC588iKosHIezGKNyDsAaLKiYhQB2/QXSACgUMvbIx6G7AzSZgq27Sc2Op4jkTf9jsp4\nWlLCo9v6cixbv0jnmeTpwAekBc6OPhZGcA08vIS6yGWyo99B5X7zrx1L50hx8M+XttA+5ZkiuocG\nxBzpii5i5tzw1iq7oShAYeN4EMeEogDZYkHEkkvj/tttP8FLU5WA0Iw1+XmYE18AJ/A1W/rx6rrt\nbJ4NjKrsQHd+FwaKfqWCHMte2fMmrftnv+MbfrEUP/uLb81yFN2i0DRwPSJCFv7AWNWJ68sJK0RF\n7oVF9JqsbMBtvnEbnfNdhV6YE16A3voWc6F71I3O3iFpvQLENpnbnC0M8g5eWeu3C/WIhy2dQ3h+\nVac/3ihhLcJwiGtxFFfNg9dfH7q9lLwtUyuxvOWENXr7501uxPc/O5+2/pMs7/GtJf5MMMuCn1sr\nt7wbEv5Cg1tIPGv5e5+Zh7PmjQ0dK7e8AYiM5dIMYJJP4SOTzg1+oWR5u92tZePxX8R+9q0MQzWk\nemH/+3gs2ND0snri0fX+NRoh5C3PtxJI6C8JA6gudMUQZXQVSRNtKRqX16q6AuSbZZa3r89OAuSe\nLTjCbQ4Ahkl8kvc0aBpLAJOS5gACm+TAeSNmakjr/uIrhz5R1uQWLDEPXoH1ZlcGac9yzRPhEL5Y\n4q1fu3O7YKMgLF5ueXsKJYo3B/3yqE1bbdz10Dq2ncdXM7Br3gKgCNLjC4BYjL2U9SIUBUjoSVw0\n4TwoTgyKbqOphu4vXP/FGEBUmAodP7f+O6uepIsgT4Preb4YDot5d2W74REP9fE62K4nFp02sSWl\nO2Z5mwkpt4Fu4yEEl1Vf8DnK2QUUbRe/fGgVVm1hjXpcXWyX8wb4dRLHFGI7ikoT2VZv7sMdf10B\nWymI+VWhS25ztsBgdegkmxZz2DkwhKLtQW9eC60mqKrW7/o1y6qqBKRhAdAOaZK1nnPZ78s10Jyg\nZX49OT80kA35/cmu+tfWlau6FdyicJtD8VhIyQfPc+GLqEwxA89mioKuCkVzgqV10m9Qq+wRTXgc\nhzDLm97Hj7Z/Bl6mIuClsl0Pv3/CL9+7c/n/4DtLbsMDT/niMZHlHWFYqHu4Y2aJNWwZWlD3N1Aq\nRh/kptoE6qvi0FQ1QO6TO2rF/08fU4svXzjdj8ejNL5OURvzCZ94Kl08MJd5Q3UCR030CVa4iAGk\nErIrmrfwLL/YMXV+TJx386pImNA1Bc7WsdQqIeUxfuIRVFoVpaeDqRmi5EgJqXU3VCMgvfjZD07B\n5R+YIf7Ww8hb2t+TpEcntNUFd9RtqIqOuGR5z2+eCxAFesNmP6Pe1ZHJ2zR2yaw6a+ozfptXx0Au\nHyTvbGIjI2h6nzVNoeTLXtqJlEvdpooHz6afxS0dST2Jwlv0+lzFFpa3V4gL0RauVpbxBkUSk8hl\nYN+/dscu/Owvy/HEm5ScSYaFW3jZGrdwTN+789yrg4J8XOY2R7IXUF2MVuaCFBOB73hsgJYUcosx\nriawoycLt6hDt1xRiREgb0BkxOftIjziIR9jFmM+IfIKAFoOt2pjL/64lNbzt6aaqQyqwlu32uVu\n81hSkD/XyOdVBbativtJPy/gsZe34ull27F5Fy2Rq02m/aQ/j7vNFYgcCFsqeWT3dzBrY8122mKX\ne0Dyeep2J4SAe43VWAbEU0AKcXGN37l7KdZvH4AxgvUzcHTkXliEpNMIBzZ4GZ+mKgErmkP+zdhM\nuY84hig5zEreroxdImID3+1N57A8abDgFkTCGkqSyVJGUhCrIyzvrK+q6OqA7gT7juvhSWe268F1\nCfNuqMgXJE8Zu8ai7QbG8GrXcmjpXvEbEfu8S4jI+z2GcLe5VPIVEvMOWJYSudek/Bfr/KlNmDra\nJ3MAAUEYDpnc4WkBlzk9p2+5VyV88RiZ8MQChBGVSGarS6IuLnkDPA26ptDEKkUBoIAUEtAhuarZ\nS8ojBIs6TkBV/wwa72OQyZk37pARqJsHVZKrkhYBCb085i27zT2pfj5VojKnKABcFQ3VCUwbXYu5\nExtQHauC5VZBiQ8FMuppqZhfh6omhkQmLHENDOVtFGzfbd6XXI7p09l99VToqsK6zrH5GPMMVNZb\nmj8TBduFoijwhmhoxEYOipmHQhTAlmLezPLut/tgxIPkzc/Vn83h+VWd+PsKSt6lljePLTpMMCb/\n+kLs7CmIuHavthHfff5HUHU6B5br51qIlynJQzFzIt5tKQlk8g6Ia8BVCjBYtUYpefNcjbybD1iD\nzrYxrByPC9FQ8n19K81gbk01U8ubu80JJe+EpQuXdtKK+d4PI0jeXPdAdPDzCsiKen3672lzx4rf\nnS2XWqksROKYoGI7qng+sgUnQJwAkMl6ICBwPMePeceyIIUEAFXyDri49wnfclR0ByAqFM8MzLWm\nKqHlcmD3UYkPQq3sogtqT4NC6Dhkb1fGLre8lVjWJz+jnFjzbkGUivE5cnc14gOpy2CqlvjMcT04\nnoO8mxeqisTVoaiOmGN6fSXfwd3ujkcXAJoNuAZts+zySgJ/n7LjpTkChkmqO0CIyPsww5566IZa\n3oGYd3mdNycbz/MCCWvyQqBUfrB0eygUglhJrbpsrVt6+PG8QQh/iR49tQkfP20irr5oBkzNpCtu\nNv50wixrSiCTN38RVqUs6KqO6sKkQHtUUzVEOdCYmjZ888jrMK5q9LDXaGhqoJZ9XHMdPnDkCNx0\n2Vz/slUFhVVzoearsKDZF27hgh4yKhMJ6JqKL104HfOnUq+CiQR9YRh+0h0tFSOB8h7enAWOgf6h\nYHY9APSxzm10kaMGLG8A0OtYwhT7bOG0ZurZYQRAyTEPnSQAKNA1BQumNWFEFQ3Z7Mr3wUqweHKJ\n5a0YBVimImWrJ6Brip/YptlQFF+qlQghGVVoxW8Z2ibmwHN8qV23zw8ZKUZRWN4WErSch32HbrJY\nrsVkMJnHIGXRf9/Y0oVfPkL7NDudbSDFOAtNcPJmI0pQi7Ml2URj3sxtTsnbRdzSka70UBmjrV1L\nyZ+7r4u8RxD7DRb1XRhwugHFhcZEgyqsBGrScRAiuc0VOW4vJe0x0ti4Y1C4r0lJ7kHRsyl560W6\nwGChB+Fh01ykkiE04AYXWSqzvHUvjuKGSbC3jaLbmSWqN9MFDsnR3vSKS8chk3e2pH4bAPSGLYjN\noNnmhuWTYFKpggIFBafot2FlY/EKcShOnC7CJLc5j6kHLG/NoUTMUGrdK6oHKJS4HZewcj0ahvIX\nOPQ7MnlbkLfT3QJnRzubA7o9bmmR5R1heBT3QN56iaVoGcGENeLJ2xXpv1wmskSqkyGsLWCY5Q0A\nY6voD5vYFsa0BF3VMtEamoFvf+JI3HrF0SXnpeTI5V1jpoZjp7eIuHiVVcmuRRMNW2QYEnlfcfZ0\nfP7caRjTSo/RNZW9YPh1aSJO1VSTQGOqBhWmbJkH57M0/p00E/jwiePQ3ugfo6kKvMFaJDedgOqY\n/7nc7IGjpabc2rcU+oJVOem4GnIFF7miCy3mZ1VziyWQxyBZ+l051vCFuc0NPRgWEYlVnobPnD0F\nNRUxen+IBuJqKHg5KLotXsS6ruITZ0zGNefT+9WT3wU9zsnbEucCAK1yF5qmv+W77l0dLbVJsVDQ\n67Yj1rLJF3MJkZYFAM9gCnBF+twkYjq83iZUDbIKDL0o9NKTajWyeceP+zJLTjGZNeZpuOXf5iPN\nyHv11h68vtFPsgJo8ppX4vZWrBwsNYZ7/rYJBP4C1CU0YU2zCsg4Q+iobGGeJhWEKFA1Rt6eDVM1\nYLOsZrHAqejBC7gPWsNmaJXsGow4aiuo0EvOLhey4fFuIkkFr9vZi9gUKpzCFy5y3NzziEgMEwtX\nISTjoChJ2Y6qYhnlJeENVSXoLw4grqThdnb4izUtaBUX3jgyMA5ZMjUjZYIHcmMAQPEEeRc3TMIM\n71xYmomiWxAxb1GD7eooFF0YqinKHh2X+Cp2IrFRh6J5yBSkeWTkrQ42wB1gZWWqC9vxqKdDs0Fc\ng3lwuOVNv38wa4v5cLvaUB1jybvsGY9behTzjjA89mR5lwovWKaGie30ITthVmvoS5KngHgEQQlR\nibiUEPIu7fTEccX0T6Bi2wkgmUqcefTIYcdqqDra6lOoqYiVfS6j1Hr33dYkKNTBj5dUz2pSScwc\n71tqhqYG6n0BX7iBZ30nDD9cUGZ5l2Sex8JKxbgbnhDELR1nzO/AvMmNWDituWxfSyuPmXPyVrhl\nzd2sRRfElPpCM3KXQx2Vdf6LWGQrexp0VaVubzlhx2KuVtfXntfZfSaOgSFniMqzshc5d5vH9TgS\nehy7cr2iJI+/zD+4YIw4f6eyRri947qJz35oqug0BwBoXYmCW4ACJZDMJ5frOBq1evM5Rt5snKpD\nCXj2tCTMup3wCnGkvUYqYcleuq/iL4DqUPJm46urjPueE83P6OcvfM8jovWqxvu6aw5yWQVLWJtM\ngy1aHTh0MZ2gNdod6XamSgfAU6FoLJud2DA1U2RNBxfQ/iKNz21N2gI8Ddtz23HHsrsgMvqlccLT\nxMKoM+tnO8ulcIBP3rwlqli4Sm5z3oa3Wm3CN46/KuCh4ffC1bLwiIdxDS04YVYrxrXUse22P5eA\nOG57Fx1vgLyZZXzVjE/jqhmfDswBvRd8gRLDUMaFpZk0YU3EvNn8cfJWTJFQ5rgeduWpp8kX86H/\nDhX8MSi6DS+TRmbVLH8Bwo7f5qyHogDeUGXAbS5yC3K2mA/iGJg7voWek2kimHpkeUfYDfbU67fU\nhZyMGWitT+G2Lx6DxaeMLy9XAkRrPyrmIFnGEonK33v6yJNRH68NxH5lWJqJq886ATd8bI7I+JXB\nm6+UeglKt/NyH8sILjgqmeWt6DbSyZBac0XOXA9+h6YpActbBpf7TOp+LH5PlndY9yQeo+eqcecd\nNwafPnsKmmuTqDQqA/uGldvFWekWfzEeN82vr1WkF7/O483SgqxdnVZ2PuL6lrdcIy7I3/W158e3\nV2H+lEY0pqtEaRBPaNOlhUtNrBo9+V7U1zOyZy/CU+YEdeCJ4oB4KmaOa0RTTQLfuuyowPa8W4Cl\nmaiuKF8EAUBRoyV7OUbeosENK9dzk53wFAfurkbs7M0hm7eF29RGHlrtdroAkcSBYixPQdHcMnf0\n8nW70DfAXMUa70jmBBZIwuOkuFTVLk5Jo6OizV9oen5M2iU2DCk8M5yXAQDqE3WoqYgJ1/1rXcuR\nJf2wJlBJUd/y1oU7l3sv7O0j4Q0wi5aR8zPbn4dLiBAb4QtX/syYY19FhtDx1+ktSFspGLoKj7e5\nZZamrdHjm5J1+OiiCWirpgaBYmVhjnsZWsUudk56n555lWaqb+rpFdfG3ea5IR1JI/heUHQbnsEs\nZ9tAf6YIS7NQCIl5wzGQLzq+qJLqoug6uH/NX+k1MhU7fo0i1q7QOm6xuJcaBdmOh60OFW5xu1tD\nLe+hrB2o8EjH6Hvig8eOwHc/PQ+WoUUx7wjDY97kJswaX4+vLZ4Vup1nVNdZ9Zg2uhZnL6Qu7GSM\nJWaFkTezvUvbBcqiJvKmM0Yvwk3zrw0mp5WgtjKGUc3h5M7JebiYOd8u9MdLkt7SJn0BKbqDdLyc\nvGXLu1RIxtBUv+SoBJwYZMtbLyFXTmAfnXQhxlaNwsiKkqYlAM6Y34HT5rXjU2dNLtv2hWlXBWr0\nwzL2E5rk1lc0jGzyCb+m+xhfI54n+kj3tEFvxw+O/ffgCT1VinlL99RgbnfPz3jXVBWfOmsK6lL+\nvXOKLAFLWrjUxqphezZ67R7qvuS92y3//GkjhYq0CsXTce6xNI9A04KLy135PliaiY+dOhFnzO8o\nmwvCwgCdPVRHnN8jTmI7MszqtC1s685Qy1tWA+WhBdvCSbOobn/CYG1Nx7wu+otzwn9pdZcIJ2ga\nK8nTnMACSSgPcnI26QJjRLoVlsmS+jwNHmhfe9uz0dMnJToNoxxYvXURLM2kSoeyVCk2+vMhW95C\n598RcyD2Y8f/c9OT6NPXQIlTD0ap5a1oHvQxNJuee5Fk8halWCo9vi5OibE6Sc9jtKyHVs3ugfQc\ncm8N11bo7suhM0sJ/Sf3vFn221fTPSC1G0CKFrxsBdZvH0BPn4OcUxAiLVzbgdgx5G0XGgwxxgIZ\nQme2G9PrpooWvdzyzhTZ74Qt1OIaj/v7mgeO6yGDXpCiBZJL0wx1fs/ZImkoZ0uuewOVTGggFgcq\nUxZMQ0XRdvdoYO0vhJs+EQ5ZWKaGK88tt644UkYS35p/HSrMVHhMOqS1J3/Zlbb+k6340pZ77wS+\n5R1O/pogb/riLiXvuJThHeY214gpvqd0gUHJV8XE+Cx0NFQFtnELf3duc+5Wntc8B/Oa54SOP27p\nuOD4saHbUrFYwPIPu0dJPQk4/vbqtH+9llcJe/0UWJOfB1GZFSBZhZahIaZbSOoJP8bo6dBUBTFL\ng9vTgqJuw+zw5SHh6mVd32TLyCkyYpeItyZO44V9hX5U6JXg7RsURUFh9UxY41+B7dlIWAZqrKQI\njWglnouck0M6UYdpo2sxbXQtHlqyEYWVR6FuyhoMEhazJ4Bjq+hoTQjPByfvXqaRbqoxbO/JImbq\nouc44MtbLpzSjounUNnadExanNWzpL1Aq1z6Ha45CKj1rCpAmmON11mzeD57oSeNhL/wJCo8uEhY\nOlzVLbG2gwsYTt5xRp7phBH4neaJn+XtDXBi8suYeLlVIJ9B+v+COgjVGoKXjwsPguyB4z9z/rsy\nNBW9/QRWo+/9Kap0DPUsVl2TKM/VCHw/I/8iyeP1tT348f8+g9gR6+EO1ACuIQiZQ6vugqIAxS3j\nAU9HvujCLChQzaLoC6ym+kEIdWsXii7i4HF5FzZT4RvoKxeEGsxnAZjQ0rS6okKvQhcQ0DywHQ8O\n8ZUCs3lb/K54eCJrboNVxWrfPRWVcbqIzjssVBUbgDbiDeSKp5XNzYFAZHm/B1EXrxk2mexblx9V\n9hmnZbIbgi61yt8JRFx+mFOqSnncXkZCcmvL7U55rJlnm4dZtfzlf3TNSTh7zKmh35PYjdv8ncLU\ntYALN8z7kNb9a7JUU7jhAZoMN29C0NqX3eomW4BUxZi1ThSAKNBUBcmYgfqqONydI6ES2UrSAhYz\nAD+jHxAvYpng5Xr+ilgSs8bXi0XloglzUUmakXcLyDm5gJiPripCHpQjVhL394aqMdM8xf/A1QEo\naK5Jipp8txict7pUBTp7cxjMFuF2t+KktuMB+PKWDekKUXWRNMt/G7LkbYJtH4qvg9FOFzky2QkJ\nYJ6Mp9iIaVbwuXV05L0cYpVDVMSmpLJDBifvGHvuUnED8o8jD2r1FlbPFORbEaf7ajXbpQ575RoO\nAPWsKUYRhCWr3fjxuThnQfnikqvrmYYq7jl3mxdUujyrZ5Z3Q0U5eZeqNxJPRc7JY9naHiEA43bS\nZ3XZup7gHHDPgOwV83QoCi+1I1CT/SC5FFRCyZ9b3takpbAVapWv3iDVkrt8AcF6rTdSQZZRFvOI\nSZZ10XHhoCg8BnLuBPds8GeBjRhxg3fQo+SdSa6F0bwBm3qD7UoPFCLyfp9hREMKN867BjcvuEF8\nxt08Lvt3XP+5uOGoqwPH7U/y5pa1S8KTO9QSy7s05t2UpOpN9bHaQO05fzlzyzuMGPnLP6xjG/+e\nZIjlffHJ4zB+RFVACW5fQL9fETHN/kJ5b2ceFgAASw+St6oquPC4oDu+QrIkeQ9s/pKloQefMK5f\nPBvzpjQibUrk7Oplcyxn3E8f2YxPnDEJx83wBXZ4xj9A5+vKc6dhFksMvPCEsRjZQL8/5+QDSXma\npsLZOg7FdVP9awxJ2otp/vg4cTbX+Za36ypI6v51N1VVwiME67cPQFVUnNi+AIBfTiff0zE1I/zs\neAauugbQOefQG1g3L4mYDFWjiyJmeXuqLeLoHPa2MSDwUGxaRj/gmvNmubdJMQsgRKEd5cDIW/N/\nG0VlqGwMHfXU82GOXAU11ReYJ/n7AKDIiJfY1LXb0ZTG+NagZgPgaxYQ4ru9eYJWERkYqiEWdfXp\n8pBYaSIeHAMFL49ETIfCeoDzkM+ytUHyVpmSn+w14Za8Yuap7oHmwstUwjI1arm7vuVcTG1mx0jN\nhQIxawIt3Y8RqTbUWLWB8Sqai6ydp78VtmjpzxQDx1tSCaCzg4Z3+D1/bPNTeHzz037mPQn3KO5v\nROT9PkRDog6VVvnKmbvGLSWFpmRDYFtIXtY+QxXkHX5SlcfaWcy71FoZXdmBzx7xcXxp9hXB47ik\nNCPv0pp3gFs1VIq0FMJtHmJ5nzJnBK67ZFawZn4foCgKrr5oBmbWURWz0sQdAEiYMVHrbGkmkjFd\nkLKmBUkLAJKmFONn19DMFjgcfOlVmbLw6bOmoDImddjytLJEx3qplOeoiW1YMK1ZzB2AwPOTCLsG\naQ5ly5vfS/klHUbeimv6nhNO3jVJcbzreUhLY2itoZ6ATN5BIqajwkoHqiHkMVZYaehvLkL+9YXi\nM/k+xIzdh5scj8BQLKiJQSjxAXhKOXl7fY1I6Wm4Zl/g+NLKCQFXF2JKybgRVC5TWaxXImdu9QF+\nXH+4RLiiTscwsq4ON1xKQz1hiZIJk96zwWxRsjqZ2xw5VJgp8ZzIz5x8DYCfsEk8DXllANvct0SN\nNo9Db9hRrtYG0JJD/qyLOTviaRH+IPkkYqaGfNHBUM6fI0/3NQ9Kx2N0rGJzRFATr0KMe5mE5e0K\nsR5O/rmCFPPWHDom1QOxTdibJtFxSc/tfW89KMJYils+twcCEXm/j/GlC6ejrT6JY46gJQ+cvGWC\nGsvqoxtq4uUn2EfwOHRYpjbgW+Ya++2kQmq5p9VNLluAcLe5RuiPKszyPml2G7568UyMaPDJ6+On\nTUR7QwpjWqk1EbS8939ayOSRNbjsiPNxycTzsajjxLLtluRaNzUqQiMatygKNFULvDiC5E3nroy8\nSxwnSSNoeZeiPu6Tt0zEHBVSA5rQBUiAvP2xcs+HHDoI08h3XCI8LPwl3taQFDK6iZiBasn676j3\n3fiJmA5VUZHQ/WssXfAYugqST6Hw5mwU101FXaU/3ngIecvEWSi6mKgfDUVzoTdtgIci4iELkLRR\nCaIEO7uVhif880slmpoaIG9P9ZOkOJRA1QDvXS/FsaUua45OiW/e+A7RwS6szDNlMNlbqVaeyt8S\n2MghLXljShd79BqY5rzJa8jpta8k//TVANl5t3aXS6USxwCIhoZqdi8kmeOWDno9Zx45HjGTajP0\nZf2ySRKKQnq/AAAbb0lEQVQbCJxfHo+i29CbN7BrTIjx8Xtijl6OdblV9CBXFzkAckKbodN7Egif\nlNxzy6I/suaaYEXJgUJE3u9jTBtdi29/4ihhhXLXuKym9qULp+Nri2dhTMv+eyC55T0cefPt6YSO\nb19+JCrfpqv6iDHUHdZWzVyKIdZFzNQxqaM68PI5dnoLbrr8SBh6iOUdco79AV3VcXTLkaFjNAzV\n713NSJeTN19Y8aoCUzUQtyTVOmF5S33R6ZkCf6UMyfIOJW/frZowysm7UnqRlxJj6THyNXLrkkus\nAggo1vE6esfxfCEPRkS1FTGcMa8DC6Y14XPnTEW15Sccjm32a/m5cE9ausbSaxC1+P31cLvbxLMD\nhJO3vMAp2C7aY+MB0HI7opAyyxsAKqQmL9yKa2sIL1NEqbWmlv825PvUW/Sbhvglf/52N6RxkRyO\naU+34YNjTgsQZMqS480avEwaWsUuaPVbQBQvcDwAjKroQEqpFp2+xjbV4rR57SK0YW8eL/bV0n1C\nOnU48FBGQzUTKUr6IaW8QRu3jKqvQ8LSkc07yG0ZAQyy6xS96/05mDLC9x7yTPWkkfS9H9K+y7LP\n0jE4BkawBQ4kt7mha9QL4egY1ZzGly+cXrbodPUcTM1EOvHOQmtvFxF5RxDglrfspo5bOsa1VQ13\nyD5B3UPMW1jm8IZ/2YXg46dNxFXnHYGFU6hs4R7lW4eBoRnCZbuv53gnMHVNvMy5vCTPOOfZ2jUx\npg6lKIhJ8WruNi9VsCq1vNvSkmBMyAtVJtQwy1te1ISRe3wYy1tkrEtWolySyMvRbMcT18jjoYqi\nIBEz8IkzJqO5NonqmL+gbK3zn9EjxtJrr5LIPVGywCgNxUwf689XmEuYuJqwVfNFF0nTAnE1EVMP\nI++URN7cyjv32NE4//gxZfsSV99ziZFENos6ThL/z/UQZOudZCqRe/HkQGy/QiJfRVGwqOMEJGzf\nQ5OWyRsKiutoAqLesLnseAC4es7nME+7UCwARjdX4YLjx4pyPrenFflXj5PGT3uNDwfujeGWt73N\nn6ch0Bh52kxhVEsFXI+gp9dD88CxwXOwRe8lp4zHF844DpW5CQAANUkt85SRRJznHYQtJFzd98rx\nksGqbrg1a2jioWvgklMmYOro2jLvQ09uV6gH5kAhIu8IAsfPpAlJs8aHtxvdXxhTORJAmHVIMb1u\nCh1P24K9Om/M1DFjXB3SZhKWZgaSrvYWPEZ6INzme4IpWd5claqmgvc7py8MTmxFtwhLiqNazHug\nqzqumvFptPfRspVSWhhd6ddUf2NxeQWCjHgIecsItbyHiXnLXp20Qq1dXv8L+Ja37bpoSNDnkBBg\nzsRgDgYAVPMFDILiOVzJri3V4o+x1PKWyPu4GS1oqfWvIWUl8K3510IfkhY4ro6PnEItyUVzR9De\n6LYpuqrFJaW9tnrqrm9IBpvoANSDcvq8Dnz9yC/jxBHHBM4vo3YoqONAXE2QCQBMrh+Lr5TkfJSF\nPzxdlNQBCP09aNLzLSc+AsBXPngs4GqC+NIhxxMoYlw8h+Wy0yYKWWRSjAnLXHZpA0DlllNwztgz\n/HOxPIiKBPME9jYF8hIA6k2Z2O7f95baCmiuf2/5d+SLDlRFRYfH+ruzkreUZHmHClY5BkYIqWMF\nbj99RovJbfQjVy+rfvHnItwDc6AQ1XlHEDh9XgcWTmt+227qfcWF4z+ICdVjMatxeuj2SbXj8b2F\n3wyWK+0FNFXDNXM+H3AN7y0Sehx9hf6DYnkbuubXm7JabRHzZqRTKxGX/DIxpSz6CTVjYbkZAD1l\n7D0i5WeOj24O96x8ceZnsHFwS5m7tBRhZYmyNR6WkAYA7eZErCg8g+aUb/0J8nY8zG2cgXV9GxDL\njMTZx00qO77GCo77psvmYiBbFHM1qmoEsCV8DHweZ42vx8dOnRgcu6WjLl6LOSNH47lupn3u6Zg/\npQknzaZCL6+s7mJSpdQzInsqvrZ4Nrr6cuhVN4nvT5oWvvrJo0TYoCXVhIUtR+GxzU/R87s6IHHC\nCGUatrxYg/icf9APSohGVYKJi6RE2lh87vj3Jox8PdZ61elpQoKFX266bC5Wb+7D5JE1UFdUwovv\nYseXPwfHTm/GP5dyOWBK3o01CXz90jlYuWEX/vPe10DsGHXtl1xDtVGHk9tnYkzlKNz94t+xsZMu\ndqaMqsH0TbUY0ZjCX5esByGK8C6kzBTGj/AXXifPbsPmt6rQyYVYuDgMlzw2LZCiJRZZKTOJmDK8\n5U1cHUdPbcIf/klbpBbfnIPYnL/BNQbE9pgxvOt/uGf9QCCyvCMIKIpywIkboC7Go5pn79aqTUuZ\nrfuCpmQjUua+kT9A3c5pI1VWc/5uwNJV1roRqGSJYaUxb9EUAcGyt9KSLz6DpIS9Dc3AhOqxGF9V\n7sLlGFc9Bie3Hzfsdo6w8IdcBx6WkAYAU5Nz8NkjPo6zR/v19kdNpkQ+sb0auqrjkknn47w5c0Q+\nggzZbQ4A7Y1pTB3lx65HVUqysiXPkio66ZW7qrkVP76+zf/Q1f0sZQCmqQUy5mXhoLilo70xHcgb\nOG56G1rqgs9jXPYGuDpOnOV/XzpBVdZ4YlmYlRhIOvR8adLW+iTOOYY1B5LqpsPCHzWDM+H2NqCt\nOE/McXtjGiczmVut6Lv+wyz3uso4KhL02r2S52DyyBpceMJYv1lKyTV091PCHVXZjqnG8aLne1NN\nAl+4YDqV2iWqyI8wVB2WZiIVN/CJMybhyxdOR3tjGpMa/C6AHz91Mlrrk2KR1TdUCCRHUstbCx0P\nAMAxkIobuP6js3Hy7DYACohtwVN5A52g5X3JxPOFpxAID58cKESWd4QIIfjY5ItQcIvvaAGxr9B1\nFW5nG2yjgCvPOx8AUJP21a+AYDKWXH5klpK3SJ0t/56rZn66/MO9wHnjzsIDax7CxJpxZdsaE37o\nZTjytkwd0+qCNevnHDMacyc2BKoBhgOPaZdm1nPwpD7e31lGW30SmzuHUFc1/MtWvi7iagGXv6Vr\nAZd02Eu7OdmI5mQjtmd2oqmy3LshW84nzhyB8SP8fToa0wAUmEocBZIVVutpR7ULyeOEEYcChS7M\nJCI6YWYrTpzVhhNnt+Hz/1WEmuqDSozQZ/mCo+bi0aWNWPyhCaFzoBerwIVd08N4ssZVjcZLna/5\n1QESqtIWyFZekkUt81HNVP50guT+lhdn3PuSZImH3kAN1FgWtudn4C+QmvzMbZyJf215BgBNPj12\nuh8uaapJYHl3HGqKJr8ljSRivN+BY+L85o/h3rcegJryLWuAVtmMba3EkZMace+W5diapS4U4hgB\nsaKjW47ElNqJeK2b9q1/N2PeEXlHiBACUzOHVak70DCYhKuzdRzqE9R6a29M4ZxjRokOaTweXB+v\nDVjbZoj4DDCsmN07wokjjgnGbSXIRCFaNZYgbKyqqgTaq+4Opmbg5gXfGHZxAAC5l06iL9sPBD//\n6AcmoK0hxayr0rHTfyulxjtfuXB2YB/TUAPkHQ/pLqcoCq6Z83m81rUC0+unlm3XVA03zbsWf1n3\nMOY1B88/aSQlNiPXgEJsA5QEJRdNU8X9VhUVcT2GrJMrkTv1O7DpJIbCiqNRlQ4nlTEtlbjinOHl\nluOZDgwZW2FU9pZpP3AsnnQBptVNxuyQMFgypkvtR6llfvyMFpxz7KhABYvrlmfX88WS21/ni+WE\nYGTFCDQmGtCSKs+h+dAxo5B5eRJeGKKqZykjAVPKjxhd3Q4vlxbkXZr1P7atEvW91YK8DcUs03pI\nmynoqg7HcyLLO0KE9zOSMQOXfmBCwPpUFAVnLRgl/q6NV+PauVehxqoWtdMAy1QPwbvUKyGARR0n\n4G8bH0d7upwggXIvwb6gcpjOdhy3f+kUhDlPYqaO044qb4RSio9Nvggv7HwFExpaA5/HTC0QTx7u\npW1qJuY2zRz2/PWJWnxy6uKyz6tSFlrrkujcUgN97Aa/R3XJjeTiIh111Th/8Ww8+vwmHD2Fkpii\nKHBcD4CCUU27n6fhoCkGiqtnY+KY6mFzH3Z3jRPaq1D7Vgp96BYKZJapBcIbANA7SGPSFSG9Cnij\nkXFVo8u2AfQ6bzjqK6GeBUPXcMmcU/DCE4/T79aswH6WqQXaKB83pTyMJHdPTJrhXRJrrCp05roD\nuQ8HGhF5R4hwCIJn/u8OYaSol3Tt8t9T7z57nz36VBzVNCvUnQr4mfEHEqX913eHay6eiQeeWheY\n+yObZuHIpvIOfg3VCZw4bQyeHqB61/EDYHGdd/wY/PTPWRQ3TII3SC3x0kXYpJrxWLVrNU4ffRLG\n1ldibFu4FT1hxL6Ve/L5U/YxPUpTVZw//Rj8YvkGuF10XsMWmJUpujiZPKqmbBscE1+c/CWMqKsu\n38awu/CWoer4ztHXI+fky/bTVQWktxVesg+jnWNw6YfKEyPlBWKlVU7eAK3+6Mx1v6sJaxF5R4jw\nHkLpy4n/fRAMbyiKMixxA8O7+A8WJnZU42sds/e8I8MHpx8JrO2GSzyMZuWP+xMzxtZhbGsVVm30\nPQSliYeXT/kICm4xkMAYhpHDtOfdEy47bRLufvRNXHRSeV7D28XMhmn45lFfxdeefx1AeF+Bs44e\nicqkhWOOaA58fvnpk/Dy6i6MaWh6R9LE1bEqhFG/rqtQMrUoLF+A6qnhYYFKSU2wKhHufeClm2Hh\nkwOFiLwjRHg/4GCw9x5Qmhl/uCFhxHHxxPMO6Hc01iSwamOv+LvU8k4YiVBteY5vXDoHb27uxbi2\nfVNIbKlL4tpLyj0Pe4vGZD14NnxYuMTQNZEhLmPhEc1YWELo+xO6poqSjPgwuvNT6yahVZuADVuK\n+MAJ4eWtnLwjt3mECBH2Cj/43AK4XnnSz26SzQ869kfM+72OxupgedecCeHW4XAY3VKB0S37ZnUf\nKBxKizZFAbhBP5znPWkkcN2xl9Me4lY4ZY6pot6RltSBW2iUIiLvCBHeA6geJpt4yqgavPRmF2aN\nrwvdfjBxqLnND0U0VvtW9e1XH79XMfxDFeYwCmXvJj555iS8sbGPlX3tObSkKsqwxA0A46vH4gfH\n/ntkeUeIEGH/4NjpLRjVVPG26qbfbZjvASI60JgyqhqTOqpx9NSm9wRxA76lezBx9NRmHD2VWsn7\nK6fz3SRuICLvCBHe01AVBR1N+67xfiDwqbMmY/32gVDVtAhBGLqGr148fKnZ4YTT53Xg2eXbUfUu\nqDjuDfzQ0qEYXBoeB3Qpd/PNN+PDH/4wLrroIrz++uuBbc8++yzOP/98fPjDH8Z///d/H8hhRIgQ\n4RDC/ClN+MjJ4/e8Y4T3FM4/fgx+eOXCQBOZQwEXn0wz6WVltsMBB8zyfv7557Fx40bcc889WLt2\nLa6//nrcc889Yvt3vvMd3HnnnWhsbMTixYvxgQ98AGPHjj1Qw4kQIUKECBHKILvQDyccsCXQkiVL\ncPLJJwMAxowZg/7+fgwNDQEANm/ejMrKSjQ3N0NVVRx33HFYsmTJgRpKhAgRIkSI8J7CAbO8u7u7\nMWWK322lpqYGXV1dSKVS6OrqQk1NTWDb5s2bd3u+6uoE9P0cI6uvP7RigYcronl854jm8J0jmsP9\ng2ge3znejTl81xLWSjV59xa9vdn9NBKK+vo0uroG9+s534+I5vGdI5rDd45oDvcPonl859jfczjc\nQuCAuc0bGhrQ3d0t/u7s7ER9fX3otp07d6KhYe/EByJEiBAhQoT3Kw4YeS9YsACPPvooAGDFihVo\naGhAKkVrTdva2jA0NIQtW7bAcRw8/vjjWLBgwYEaSoQIESJEiPCewgFzm8+aNQtTpkzBRRddBEVR\ncOONN+L+++9HOp3GKaecgptuuglf+cpXAACnn346Ro0atYczRogQIUKECBEAQCHvNBj9LmF/x2Gi\n2M7+QTSP7xzRHL5zRHO4fxDN4zvHYR/zjhAhQoQIESIcGETkHSFChAgRIhxmiMg7QoQIESJEOMwQ\nkXeECBEiRIhwmCEi7wgRIkSIEOEww2GTbR4hQoQIESJEoIgs7wgRIkSIEOEwQ0TeESJEiBAhwmGG\niLwjRIgQIUKEwwwReUeIECFChAiHGSLyjhAhQoQIEQ4zROQdIUKECBEiHGY4YF3FDmXcfPPNeO21\n16AoCq6//nocccQRB3tIhzRWr16NK664Ah//+MexePFibN++Hddccw1c10V9fT3+4z/+A6Zp4sEH\nH8RvfvMbqKqKCy+8EBdccMHBHvohg1tuuQUvvfQSHMfBZz7zGUybNi2aw71ALpfDddddh56eHhQK\nBVxxxRWYOHFiNIf7iHw+jzPPPBNXXHEF5s+fH83jXmDp0qX4whe+gHHjxgEAxo8fj09+8pPv/hyS\n9xmWLl1KPv3pTxNCCFmzZg258MILD/KIDm1kMhmyePFi8o1vfIPcfffdhBBCrrvuOvJ///d/hBBC\nfvCDH5Df/va3JJPJkEWLFpGBgQGSy+XIGWecQXp7ew/m0A8ZLFmyhHzyk58khBCya9cuctxxx0Vz\nuJd46KGHyB133EEIIWTLli1k0aJF0Ry+A/zwhz8k5557LvnTn/4UzeNe4rnnniOf//znA58djDl8\n37nNlyxZgpNPPhkAMGbMGPT392NoaOggj+rQhWma+PnPf46Ghgbx2dKlS3HSSScBAE444QQsWbIE\nr732GqZNm4Z0Oo1YLIZZs2bh5ZdfPljDPqQwd+5c/PjHPwYAVFRUIJfLRXO4lzj99NPxqU99CgCw\nfft2NDY2RnO4j1i7di3WrFmD448/HkD0e94fOBhz+L4j7+7ublRXV4u/a2pq0NXVdRBHdGhD13XE\nYrHAZ7lcDqZpAgBqa2vR1dWF7u5u1NTUiH2iefWhaRoSiQQA4L777sOxxx4bzeE+4qKLLsLVV1+N\n66+/PprDfcT3v/99XHfddeLvaB73HmvWrMFnP/tZXHzxxXjmmWcOyhy+L2PeMkikDvuOMNz8RfNa\njn/84x+477778Mtf/hKLFi0Sn0dz+Pbxhz/8AatWrcJXv/rVwPxEc/j28Oc//xkzZszAiBEjQrdH\n87hnjBw5EldeeSVOO+00bN68GZdeeilc1xXb3605fN+Rd0NDA7q7u8XfnZ2dqK+vP4gjOvyQSCSQ\nz+cRi8Wwc+dONDQ0hM7rjBkzDuIoDy089dRT+NnPfoZf/OIXSKfT0RzuJZYvX47a2lo0Nzdj0qRJ\ncF0XyWQymsO9xBNPPIHNmzfjiSeewI4dO2CaZvQs7iUaGxtx+umnAwDa29tRV1eHZcuWvetz+L5z\nmy9YsACPPvooAGDFihVoaGhAKpU6yKM6vHD00UeLOfzb3/6GY445BtOnT8eyZcswMDCATCaDl19+\nGXPmzDnIIz00MDg4iFtuuQW33347qqqqAERzuLd48cUX8ctf/hIADX1ls9loDvcBP/rRj/CnP/0J\n9957Ly644AJcccUV0TzuJR588EHceeedAICuri709PTg3HPPfdfn8H3ZVezWW2/Fiy++CEVRcOON\nN2LixIkHe0iHLJYvX47vf//72Lp1K3RdR2NjI2699VZcd911KBQKaGlpwXe/+10YhoFHHnkEd955\nJxRFweLFi3H22Wcf7OEfErjnnntw2223YdSoUeKz733ve/jGN74RzeHbRD6fx9e//nVs374d+Xwe\nV155JaZOnYprr702msN9xG233YbW1lYsXLgwmse9wNDQEK6++moMDAzAtm1ceeWVmDRp0rs+h+9L\n8o4QIUKECBEOZ7zv3OYRIkSIECHC4Y6IvCNEiBAhQoTDDBF5R4gQIUKECIcZIvKOECFChAgRDjNE\n5B0hQoQIESIcZnjfibREiHC44ZZbbsGyZctQKBSwcuVKzJw5EwBw3nnn4UMf+tDbOscdd9yB8ePH\nCz3rMHz0ox/Fr3/9a2iatj+GHcDOnTuxbt06zJ8/f7+fO0KE9yOiUrEIEQ4TbNmyBR/5yEfw5JNP\nHuyh7DUefPBBrF27Fl/60pcO9lAiRHhPILK8I0Q4jHHbbbdhy5Yt2LZtG6699lrk83nceuutME0T\n+XweN954I6ZMmYLrrrsOs2fPxvz58/Fv//ZvWLhwIV5//XVkMhncfvvtaGxsxIQJE7BixQr89Kc/\nRV9fH3bs2IGNGzfiqKOOwg033IBCoYBrr70WW7duRVNTEzRNw4IFCwI9ijOZDL7yla9gYGAAjuPg\nhBNOwJlnnokf/ehHIISgqqoKl1xyCb797W9j48aNyGQyOPPMM3H55Zfj/vvvx9///ncoioKdO3di\n9OjRuPnmm2EYxkGc4QgRDk1EMe8IEQ5zbNmyBXfddRemTp2Kvr4+3HTTTbjrrrtw6aWX4vbbby/b\nf+3atTj33HPx29/+FpMmTcLDDz9cts/KlSvxk5/8BPfddx/uv/9+9Pf348EHH4TjOPjjH/+Ib37z\nm3jmmWfKjnv22WfhOA5+97vf4Q9/+AMSiQRaW1txzjnn4Oyzz8Zll12Gu+66Cw0NDbj77rvxxz/+\nEQ899BDeeOMNAMCyZctw66234r777sO2bdsOSy9DhAjvBiLLO0KEwxzTp0+HoigAgLq6Otxyyy0o\nFAoYHBxEZWVl2f7V1dUYN24cAKClpQV9fX1l+8yePRuapkHTNFRXV6O/vx+rVq3CkUceCQCor6/H\n7Nmzy46bNWsWfvKTn+ALX/gCjjvuOFxwwQVQ1aCNsHTpUuzYsQMvvPACAKBYLGLTpk3ieN4+debM\nmVi7dq3okxwhQgQfEXlHiHCYQ3YrX3PNNfjWt76F+fPn4/HHHxfNPGSUJqSFpb2E7eN5XoCIS0kZ\noL2M//KXv+CVV17BP//5T5x33nl44IEHAvuYponPfe5zOPXUUwOf33///fA8b7fjihAhAkXkNo8Q\n4T2E7u5ujBs3Dq7r4pFHHkGxWNxv5x49ejReeeUVAEBPTw9eeun/t3eHOAoDYRTHHyGYJlwAMAjg\nAFROSC0STCWCIJCYBhwOwxEqegIkuqLBbRN0LQaBxkBZsdkaDJutmeb/05PJ517eZCbz9bYmSRLF\ncazhcKggCOQ4jm63m2q1mh6Ph6SfVv97VJ/nuXa7XdH+z+ez7ve7Xq+X0jTVYDAobX6gSmjeQIUs\nFgvNZjO1Wi3N53MFQaAoikrZezqdKo5j+b6vTqcj13XfGnq329V6vVYYhqrX6zLGqN1uy3VdrVYr\nNRoNLZdLZVkm3/f1fD7leV7xVWq/39dms9HlclGv15MxppTZgarhqRiAj1yvV6VpqvF4rDzPNZlM\ntN1ui3fn/3U4HHQ6nbTf70vZD6gymjeAjzSbTR2Px+J/4tFoVFpwA/gbmjcAAJbhwhoAAJYhvAEA\nsAzhDQCAZQhvAAAsQ3gDAGAZwhsAAMt8AxJ5C+54P8QOAAAAAElFTkSuQmCC\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72fab5e290>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzsvXe8XVWZ///e5dTba3pCQiAJCSWE\nIJGmoSSgjsg4gmCb4Tf+dCwURUdEQXGs41gYFQvDiIyIiKIIJIAgEBJCgJBKertpt59z76m7fv9Y\nu55zboiQBCL783rllXt2WXvttfden6et55Fs27aJECFChAgRIhw1kF/vDkSIECFChAgR/jZE5B0h\nQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8IESJEiBDhKENE3hEiRIgQIcJRhoi8I7yp\nMW3aND796U9Xbf/iF7/ItGnTQsfdcMMNoWOWL1/OBz/4QQB2797NCSec4O3btWsXH/vYx1iwYAEL\nFizgkksu4bHHHgPgpptuYuHChSxcuJCZM2fy9re/3fudy+VC19A0jfvvv/9vvq/Vq1dz1VVXHdSx\nDzzwAF/72tde9bVcvNbz3wi46667+P73v/96dyNChFeE+np3IEKE1xsbN24kl8tRX18PCBJas2ZN\n1XErVqxg/fr1IZIeCZ/97Gd597vfzW233QbAqlWr+PCHP8zDDz/MV77yFe+4+fPn8+1vf5vTTjut\nZjvr16/n/vvv55JLLvmb7umkk07i9ttvP6hjly5dyvnnn/+qr+XitZ7/RsAHPvCB17sLESIcFCLN\nO8KbHm95y1t49NFHvd9LlizhxBNPrDruuuuu4+tf//pBtblp0yZOPvlk7/fJJ5/M4sWLGT169EH3\nq6+vj09+8pO89NJLXHHFFYCwAPz0pz9lwYIFmKbJypUrufTSS1m4cCEXX3wxS5cuBYRV4IILLgDg\n1ltv5atf/Sqf+MQnOO+883jve99LT0+Pd53ly5czffr0qmu98MIL/OM//iMXXHAB73vf++jq6gKg\nu7ubD3/4w1x88cWcf/75fO9736vZ18p7ueqqq1i4cCHz58/njjvu8PatXbuWSy+9lAULFvCBD3zA\nu85I26dNm8b+/fu9893fy5cv5/LLL+fqq6/mM5/5DAD33nsvF110ERdeeCFXXnkle/bsAcC2bb7x\njW8wf/58FixYwC9+8QtvrL74xS8CsH///pD15MknnwTAMAy++MUvsmDBAi644AI++clPVllMIkQ4\n3IjIO8KbHhdddBF//vOfvd8PPvggCxcurHmcbdssWrToFds855xz+PSnP82dd97J1q1bARg1ahSS\nJB10v9rb27nuuus45ZRT+PWvf+1tt22bxYsXoygKX/7yl7nqqqtYtGgRH/3oR7nppptqtrVo0SJu\nuOEGHnvsMdra2rjvvvsA2Lp1Kx0dHYwbNy50rVwux8c//nGuu+46Hn30UT70oQ9x9dVXA/C///u/\nzJ07l4ceeogHHniArq4uLMuq2VcXP/nJTxg/fjyLFi3il7/8Jd/97nfZt28fIISiq6++msWLF3P+\n+edzyy23HHD7gbB+/Xouv/xyvvvd79Lf389Xv/pV7rjjDh555BEmTpzIj3/8YwD+9Kc/sXr1ahYv\nXsx9993HXXfdxerVq0Ntff7zn2f69OksXryYn/3sZ3zuc59jcHCQJUuWsHv3bhYtWsQjjzzC1KlT\nWbly5Sv2LUKEQ4mIvCO86XH66aezefNm+vv7KRaLrFy5knnz5tU89oYbbuA///M/KZfLB2zzO9/5\nDldeeSUPPPAA73znO5k/fz533333Ienv2972Nu/v+++/n4suugiAOXPmeNppJU477TTGjRuHJEnM\nmDHDI85ly5bVvNcXXniBUaNGceaZZwLwzne+k127drF3717a2tpYsmQJzz//PPF4nP/6r/+is7Pz\ngH2+8cYb+dKXvgTAhAkT6OjoYPfu3Wzfvp3BwUHOPfdcQJitb7311hG3vxKSyaR3P21tbbzwwgue\nteO0007zxuepp55iwYIFxGIx6uvreeihh0LWlkKhwPLly/nIRz4CwKRJk5gzZw5PPvkkra2tbN26\nlUcffZRiscg111zD2Wef/Yp9ixDhUCLyeUd400NRFC688EIefvhhWltbOeuss1DV2p/GzJkzmTt3\nLnfccQezZ88esc1EIsFVV13FVVddxdDQEIsWLeLrX/8648ePf80TfXNzs/f3Aw88wJ133kk+n8ey\nLEYqVdDQ0OD9rSgKpmkC8Mwzz3gEFcTQ0BBdXV0hC0Q8HmdgYICPfOQjWJbFV77yFXp6erjyyiv5\n1Kc+dcA+r1mzxtO2ZVmmt7cXy7IYHBwM9U1VVVRVHXH7K6Gpqcn72zRNfvjDH/L4449jmib5fJ7J\nkycDMDg4SGNjo3dsOp0OtTM8PIxt21x++eXetkKhwBlnnMFJJ53EjTfeyK9+9Ss+//nPM3/+fG66\n6aZQexEiHG5E5B0hAnDxxRfzve99j5aWlpo+2yCuvfZaLr30UsaPH19z/8DAAC+//LKntTY2NvK+\n972Pp59+mk2bNh0yLa27u5sbb7yRe++9lxkzZrBjxw4WLFhw0OcbhsGaNWtqCiGdnZ1MmTKF3//+\n9zXP/ehHP8pHP/pRtm/fzr/+678yZ86cA17r+uuv58Mf/jDvf//7kSTJG4OWlhYymQyWZSHLMrqu\n093dPeL28ePHI8uyJ3xks9kRr/nQQw/x+OOPc9ddd9Ha2spvf/tbHnjgAe+6g4OD3rF9fX0kk0nv\nd1tbG4qicN9991FXV1fVtrs6IJPJcMMNN3D77bdz7bXXHnAMIkQ4lIjM5hEiALNnz6anp4fNmzdz\n+umnH/DYzs5OrrzyyhHNuKVSiU9/+tM8/fTT3radO3eyatWqEaPKR4KqquRyuZoa9cDAAOl0milT\npmAYBvfccw8A+Xz+oNpevXo106ZNIx6PV13r5JNPpre3l1WrVgHQ1dXF9ddfj23bfPnLX+aZZ54B\nYOLEibS3tyNJ0gH72t/fz6xZs5AkiT/84Q8Ui0UKhQLHHHMMo0eP5pFHHgHgd7/7HV/+8pdH3A7Q\n0dHBhg0bALjvvvuQ5drTWH9/P+PGjaO1tZXBwUEefvhhb2zmz5/Pgw8+iKZpFAoFrrjiCjZt2hQa\n93PPPZff/OY3ABSLRb7whS+wb98+7rvvPn70ox8BwgoyZcqUgxrvCBEOJSLyjhABkCSJCy64gLe+\n9a0jkkEQ//Iv/4Ku6zX3jR07lp/85CdeVPiFF17Itddeyxe+8IVQBPrBYM6cOfT09HD22Wd72qaL\n6dOnc84557BgwQIuu+wy5s+fzymnnOKtPX8lLF26NOTvDl4rFovxwx/+kFtuuYWLLrqIT3ziEyxc\nuBBJkrj88sv53ve+50W4z549m3nz5h2wr1dffTWf+MQneNe73kWhUOCyyy7jS1/6El1dXfzgBz/g\ntttu48ILL+TPf/4zN998M5Ik1dwOwvJx88038+53v5tUKuUt8avEO9/5TjKZDBdccAGf+cxnuOaa\na9i/fz/f/OY3ufjiiznrrLO48MILec973sN73/teTj311ND5N998MytWrGDhwoW85z3vYcKECYwZ\nM4bzzjuPdevWceGFF3LRRRexZcsW/vmf//mgxjxChEMFKarnHSFChAgRIhxdiDTvCBEiRIgQ4ShD\nRN4RIkSIECHCUYaIvCNEiBAhQoSjDBF5R4gQIUKECEcZIvKOECFChAgRjjIcNUlaenuHD2l7LS1p\nBgcLh7TNNyOicXztiMbwtSMaw0ODaBxfOw71GHZ0NNTc/qbVvFVVeb278HeBaBxfO6IxfO2IxvDQ\nIBrH144jNYZvWvKOECFChAgRjlZE5B0hQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8I\nESJEiBDhKENE3hEiRIgQIcJRhoi8I0SIECFChKMMEXlHiBAhQoQIRxkOK3lv2rSJ888/n7vuuqtq\n39KlS3nve9/LZZddxo9+9KPD2Y0IESJEiBDh7wqHjbwLhQK33HIL8+bNq7n/a1/7Grfeeit33303\nzzzzDFu2bDlcXYkQIUKECBH+rnDYyDsej/Pzn/+czs7Oqn1dXV00NTUxZswYZFnm3HPPZdmyZYer\nKxEivGmhGxZL1+6jWDZe76542NuXZ822/te7G0cNXtjYy879wyxduw/Lsl/v7rxq9GWKrN8x8Hp3\nA4D9AwVWbekDoKyZPPdyN7Y98tjmSzovbOw54DFHGoetMImqqqhq7eZ7e3tpbW31fre2ttLV1XXA\n9lpa0oc8Z+xICd8j/G2IxvG143CN4d2PbOTXizdw3twc11x+6mG5xt+Kf/nm4wDc/+13oSiHTn/4\ne3wP9/Tm+NEf1ni/48k4F8075rBe83CNo/vcf3XzQpobEoflGn9rX+79+jv4+d0vsmzNPmRV4aK3\nTq55/I9/8SzPv9zNdVecytvnTHjF9o/Eu3jUVBU71JVuOjoaDnmlsjcjonF87TicY7hhu9BwN+wY\neMM9p737syTjh2YK+nt9D7dWaKobt/dz2tS2w3a9IzGOXXsz6K3pw3qNg0V3zzArN/YAsGnnAKcd\n117zuA3Oc3hh/X5mTWw+YJuHegzfUFXFOjs76evr8353d3fXNK9HiBDhtcE180lIr3NPqqEZ1uvd\nhTc8SroZ+m2aR/+YvZFcOJZtY5jiG1EPYAVqrheWgsHh8hHp18HgdSHv8ePHk8vl2L17N4Zh8MQT\nT3DmmWe+Hl2JEOHvGq6LTnrjcTdGRN6viHIFeRtHsc/bRb6kv95d8GBaticQqcrIH0mLY+bP5N44\n5H3YzOZr167lW9/6Fnv27EFVVRYvXsz8+fMZP348F1xwATfffDOf+cxnALj44ouZPLm2ryFChAiv\nHW9E8tYj8n5FaHp4jEzz6CfvQun11byDQWeWZeP+UuSRddn6VAyAzAE072x5iKZE4yHp48HgsJH3\nrFmz+NWvfjXi/rlz53LPPfccrstHiPCGwf6BAo3pOOmk+Nx6MkXSCdWbEGqhe6BAQzpGOukf0z1Y\noLk+QSJWHbiZzZUxLZvWxmRou+Wazd+A7H0kzOYDQyUUWaKp/rUHSFm2TVd3jgmj6pEliZ7BAk11\nCRLx8PMoayZ9QyXGtde9pusVSjq7e3OhbYPDJbJ5jaa6uLetN1MkGVdoSMcrm6BYNtiyJ8u49rqq\ndwOEANWXLTKmrbqvA0Ml4jGF/mzJu+dK2LZNV0+Ose11ntnZtm329OUZ116H5IxTXeBdz5cM9vbl\n6WxJeedYts2W3VniMZljRjfSkynSmI6FYiJ2dQ8zpq2OmFqbZGudUwslzbdmmJZV9bdl2WzenSGV\nUEknVOJxxfuOhgo6fZkidakYqYR/naV7V/B/G+7lIye8n4s7zjng9Q8VjpqAtQgRjkaUNZMbfvYs\njXVxvv+pswD499uWIQG3//v8mufohsXNd6xg9nHtfPQfZgLQny1x48+X8455k7jk7ClV51z7388A\n8D8VbbpKhvw6cLdpmWzL7mBq85SawsOR0Lw/++OlQPW4vBosfm4X9z6xlcvPO45Tj2vn33/6LBOm\nZ7A7N3LdnH+jOdEEwDf/70V2dg/z7Y/No7059aqvd/MdK+jLlkLbNuzKcO2tS0L3c8Mf7sYup/nF\nxy6vauOexzezZNdKmuoVvnvlZVX7n1i5h3v+spmvXHU64zvqve2mZXljB3DlBcdz3pzxVeev2TbA\n9+9dxZknjuaqd5wAwOMv7uH/Ht3E+88/jtOmdfLvP32W9iZfcFi5uZdfLd7I208dxwcvnAbAqi19\n3HqfiKq//v2z+c7dK5k+sZnPXSFWSGzcNci3fr2SudM7+fgls6r6kc2V+ffbljF1XBM3fHBOjdH0\nEdT8g0vvXFJ/YVMvP7l/rbddSuZomLUSuXkaVqaTz922jHHtddzy/73FO+avu5cAsKJ7JRefeGTI\nO0qPGiHCYYRmiAlhKK8BYDj+tQMZP4tlg7Juhvxre/pymJb9N/vcfBPhkWfvezf/ie+v/CkrulfW\n3K8bZs3tbyT8cevDfGHJLWimxsrNIsh21ZY+9vYXIFair/FZ+kuDdA3v8c7Z2S0ijQcOMrhJt2qb\nkSuJuxZ2De0hPmkDieNfrLm/qzdH4riXKI15ofY1MkVsoKsnrOFXmrbXjrAuf9veLADPrNnvbXtx\nUy8AK17uYSivIaWHyE+7H7mpx2lLRG4/8aI/Zv2Be3VzAGzYlfHb3LsRKV5gxYaemv3oHiwCsGVP\ntuZ+0zIxLfG+BX3uZoC8NSe+oC9bDJ0rN/Wjy3kxxoo4d09fPnRM2RDPOqkcuSVwEXlHiHAYURlf\ndDDapghSssnGtzGsiUm1NyMmt6DPc1XvWnoLB0524pL366F5P71HJF7al+/2tlkBf+PR4PN+ZOcT\nDGnD9BbD45zJlVEa/WVcRaOaaJUDBEC52DW8m2v+egNL9jxb+wDJIjZlNXJzd2iz+1yf6FpywPZ7\nC/6qHsuuHm83IK43EyasSvIeyVRdy6Li3rdhWpiWTWzsVtHGpA0j9jN4vf4KoaW/OMCSwu+Jz3hu\nxPMrz6nEf734E7723HeBcLR7Tisi1WWIH/8Cw8ZwVV8AJFXz/pbragsHZVMck1CqXReHCxF5R4hw\nGFG5tEc/iKU+Zd1Ead9LpvU5/mfdrwF/cnU1hf35Hn625k7+47n/8jQGoCoDl6d315hkNw5sYU3f\n+oO+l1eLxri/TjVI2G/0pWLuhAygW+EI6d5MESnuE0ZBD5MfgHIQEtOjO/8KwIPbH625X2ndj9q+\nl8TxYeuFu7xpW3YnALZR7QEtaQZFxSfvWn0cibzzAQKT6rIMpNbXJH+5xj2qTuCXYdqifcVpq0Yf\na12vp6Ivq3qFCVtOjEzQwf6XtDD5dg3vYcfQLnoKfWim7l9L0fn+y98mOfNZlOZetiUfreoLgBQr\nB/7WqIWyeeSj0CPyjvC64I2UZvBwwqwgU10/OPKWG4RWtze3D/AnJ3epUG9RTMq6pYcmm0rh4EBW\n8x++9DNuW/2/r9ifkfBiz2q+8dz3KRrVpBCc6IPEFyTv16p5W7bFrSt/7hFg9X4bpW0vcsv+mvtf\nCbuGdnt/l4zw5NybKSLFAuRtVCeRMi2bIW2Y32/+MzktX7U/eI2JDeOq+g4gN4nnbNvhB1jWTWzb\nJlN2TMuKUUWufZkScr2vKeZr9NGNZu/LhImx4JiWpbosyZnL2Bd/gd3De6vOryWfuEuuTMuirJtI\nqmjLNqsDNF3BsxAwZe9zTNKphAgEfMkhb9saWRjakFvtPefKe1m+z3cZ5PScZzavJGJNyZLT86G+\nACEhDTV8zu83/5lfrL0LzXnHa1lgDhci8n6TwPUvWrZdMYGaNY97pW2vBat61/LJJz7PtuyOmvst\n2+Kl3rUU9dIBr23VEAAOVV9/uf433LT0m6/6fLcfQfI2TCtEriMJMJpmIsUFIbYmW4Cg2VycP1jy\n/YHByaaSED2zeeC3bdvkdJ9MKv3ouiGIwbKtmtqWi9vX3sXu3F5W9673zgHxXILm/JAGG+hfppzh\nzvX30F3oxbQsBofL2LZd9QxFX6rHqrfYz4bBzdy/9aGq4yzbxjBMYsesIzaxtrm2ss18SQ+ZTHcN\n++RdOSn3ZkpIcX/cCjUEGNO0+c7z/81fup7imb3Lvevphskftz7MZ578Mn0lIaTJkh+xXiwb7O4f\nRG7sQ2l0yLscDnzTdJMhLYdhi/5KEhR1v49lzWT7/iGkhE/Y+RoCxLDdR+yYtfQMD4W254o6iZlL\nSc70a04MlAZDxzyzdzm7zZeRm3tInb6IPbl92LbtRZAXk7tZMfCMr3lXQB2zleuXfImCXgwJoK5F\nJp2IYds2e/Ou8CXh2pJ0w2JgqESuqDNYyrIz/gyJ414C/JgDwzK4e+PveaFntX9fWt5/xnJ1v7Kl\noWqzeby25m1ZNn/peoqVgfaPJHlH0eZvAnQPFvjCT5/l4jMm8fLOQbbvG+J//n0+Dy7bwX1PbuPm\nf57LxFEN/PWlPdy5aCPXv382MyYJ0vjNXzbzyIouvvWxeXS8hsjZIO7fIibbv3Y9w5SmY6r2L937\nHHdv/D2N+jF0r5zOj649J7QsA2BTV4Zv/t+LfPySWcydLrLzPfp8F3c/tpkbPjgHPdVNS6KZ0XWv\nLnPfc/tFAJBhGajy3/aZvLxzkO/cvZIPLZzGceP9VIr5khEycRumTUyt1iZKuomUEGSwfVeZNW39\nXhCNaxbvCfgyQ5p3gBwf2LaY4eQQ0IYkSZR1k49/90nOOGEU557lB9Zc96On+MAFM5h/6niG8hrX\n3LqEc04eiz3xRV7u38R/nHUjsQOMgaZb/P//+SRnnTiGf3nHDD73k6VkpN0kRCBxyKToE7PFnwZv\nB6A50cSG5Z1s2JUhEVcoayafuewUZk5u5cWe1WzdrPDw091V0dvd+XDw0n/d8xLb9g3xb+85ke/+\n5iWuuGgikmKCbGFaJorsE+SqLX384Herue59JzNrShvb9w1xyy+fB+CbH5tHZ3OKVV27vON/8dBq\nxqnT/WsnXkJp9f3QtUzSmfKgR3j3P72de34Dn79iNt/69UpSpz8ROvalbfvZ2TlMc32cz922DOnY\n5SSm+wKQVKHxLVu3n98/v5LkTH9btpynLp6mpBlc/+Ol5EsGiVk+mQxr1Zp3X/ol1NR+8oqBbvhR\n0kPlAnJdmNAHy2F/76833AdAfLLQqH+y5EFSPafQ0SKeUXncc7yUAzlZ+x5iEzZj2LBpcAu5cnXf\nDNPi9kWrKDrmckm2QBZC4pduX06PE6TWcEwXeJ+5ze0PvsykUQ3stzZXxRIM6znyJdFfqYZQ8Z3f\nPUeb7FtB5MZ+5PQw2BJIdugeilr1+bWEuMOFSPN+E2CjE7X50LM72b5PfJCWbXPfk9sAPzr0waXC\nf7Z07T7v3EdWiIIxm7p8Te+1wtXmRlp77EbuZhFmuv6hamn2ryvFMfc+4ZeSve+vIjBm+cbd/PdL\nv+CW5f/5mvsa1BoPFktWi34/tGxnyOddKOkhzbsye5aLkmYguf492eLhZ3d6y1hcTb7HMZvHlXhI\nU3DJ0bAMFu34CwPNKwDxvIediPdn13fTlfMjfVEML3p2l6O1PLVqN893v0TeKJAp1Q7ScbEvI/Yv\nWbMPy7YZGCqHTI1lI0jezrNP+pO1bulelHDZuc+la/ezYWAzt6+9i8cHBUm8vCus+e2vIO91OwYp\nlk1+/dgG1NHbWbRGmFslSZivg1i0XBDzn5buAMS6eq/d/gLL973Attxmb5tml0NLBOzOTeJ/UwgE\ntczmPaVe729TEtaRxc+NUIBJ0dm0O0NXTw7dsFCawgFykmqA5L87f1ixKqQVA2RLeedehCY7arRN\nLOW/vzm9uo+u8UFp2093xo84HypWa+lBzTtoNZJi4t4yWYOd3TnvGVYimbb454unM2/mKN4+2yfI\nn6/9FXvG/L7KvVHWTZZt3hHaJsU0hgs6PYNFL9+BlvYtJA2NYoy27xtClqvzIeS0PANDzvtYg7yF\nZu5bshLTxfcjmXHn+v67nCtWZ4orRWbzCIcSlZGiUrzAQCHrmbfcCdUNsKn000LtwJRXCzenkSzV\nfv08U63j56t15ZST8CS47MM1t9nqa5N+g6biVxOIogdyJQfHslAyQj5vbQTyHtZySJK7QNsMCS8e\neRd8Yqg1BtlymKzcyF8Xe3P+RCkpJprmulWcyzb4wlqmXE3ewTEKEpebgSpE3gEBSDMsUDXkQKT2\nYDHnBWC5SMQVT4iT04JUKpPT7A1EsQ+X/D70JlcTm7iR4lh/nfJAKSx8uglz+lIr+dnqX4aEqoHC\nEHe+fA92zH+PJMW3mtiYge2Oz7aW5q0NBI4TRDE40lI/xaA3U/RiG1yhIISA1hcb5QsBVkEEBA47\nhNubKSI39DM0cTGmFCDvcjUhm5Lfn64B35ozXK6+n6DmXWt5m6aJ96w0AnnbisbZJ43lX981k7NO\nGlO1X2kJC2NlKUd8imOSdueCWJk9TuKaGcc0ITf1hPz6l54nhILebBHd9L8LN2ZgWM95Yyyp1fcg\nqToZR8gNCktSsQXbkpDrhjyXVqYQHs/GeEOkeUc4tIhVJNxPnvIUNy3/ukfq7oTvEnStmsG1siu9\nWnjBOCO8fqZDDF6QTsW1C3qRVervUNp3UyxXTxSGUjs46GAR/AC1V6F5uzm7K8k7XzIOSvMeChCv\npBj0Z/0J1jQtbNv2JlLN1MgXq33Kg+UwWelGONZhMKhNK4bXF89H3uATT7YGebtL2AAKZoA43Ykx\noKGEzeYWyROXED/Gj3LPFKsrMCVisqfpuYFKlQUt9hd88t7W7U/87uQaxEDRHw/TMulveB65foBy\n0xZW9a1DM/yJfqhUYwJWDIYdTasQ8C3bloRqJ8g774zhPV+bQS2gPTvrg0dapy8pOn2ZkhfbYBuB\n4C6XuFS/j5Ll77cKIrmKaxbvzRRDhKZaooJXvkLz3jS4FSvhH7c34z/zXMDEbmbbkGyZTDDOooal\nwRVkhgvV38yY5Hh0S/e+p2DSFu/8eFhrjU3Y4AluFB33k6qxs1tsax6TJTFNuLdkW4yHkhTj25sp\nUTQDz0lLOPeV9yPTlWrNWVI133IQ0MylvTPBjCHFyyROehqAgYL/DZzVeiGtyRaKRumIBeNG5P1m\nQ0CajKsVmrdyZDXvkczmXiCRM2lVLrfaNbybItmQ9haEJudqbg9ix9CuUDRxELkAMb0as7k7gcdU\nudpsHiDQkTSUIT3Qf8UMBVaZlk1eL2AENJ+hgJbktl+pLZtWOFguUwoICLJBWQ8njwlOpBkt7PuE\nsHBQNH1hyU0sEgzyqQxYq4zyHa6hESZiCrudSHsMYbKsDCQKmnF39PmWCKxqrTWoea8f2EivuoHE\nCc+BLO47GImdr0HekmJ4VoVcYLy1DXNRiFN0iNHVzmPHrmJDYZV/vqPlDeVqvE+2BIpBT7ZAr5sg\nJBCZHdNF/Elw3EzE3+ZQC9aQKBE6rPmad5D8k5YgviB59xT6+MHKn3r3D9CdC0SmOwKKOdSKtuUU\nVCsVGsOagVmOcDGU10LzjLZsY2voAAAgAElEQVRtFi0J0Qc3ULJWamC5gryDEfbGUKM3Bjv2i/dR\nSfnHj5VEPIIuFVBkib5MMWzCdsZzqJwj4zyDWj5vggKSs9/oHYdeSHrjLzlj1p0V42V0T6TdmEZK\nTWLaZkjjP5yIyPvvEAOlQTQzaEoNkETg5Yx55C32u2ZzzSpXSemHMsmHa3IdSZu3bLe/Yn/l8ifX\nZOx+XJU+tqLtE9NI0dI/X/Mrfvly7dz6w4Go3FdjNvfIW5FCVox8yXCehY0yagd7ctVLbwAKhk/e\nUkVErGnZVcQ8FCC/2uQttO6gmX446ANWzCrNO6g51zKbBzX3vOFf39e8S9imgm0qlMxqn3cQtZaa\nxWOyPz6KDtgVS+L00Du6dzAgyFnV01qwv7XKoxYD1oOc5vfX1WpRDE+wcTXvuvxUrFwrshX3rDWu\nEKS2+W4J28bT8mwIERtA0m5Ckm36snl6B4uOUO2/N0nTIe9gwJfzHWtbT8Z2hJu8JvrQmy2FiClh\nC7N60KKU06sF3IGCP0Yll7wHRoMZQzHTDGnDXpayWm4C1zIwVNA9rdUcGIXZN576mMidviWzXRxb\n49sXAl8wsMA/xsoJ8pcSBU/zjiXEOOp7jmVS8ngAslqWtqYkvZli2IK2XUT2DRQDgmhgjLRts5x7\nEGOcSqj+flMV30dIKLTodSL0bSNGb6ZIWhWBevkaY3M4EEWbH4XY1T3ML/68nk9eeiKdLeGi9nm9\nwJeWfoPx9WP5wunX8H+PbOIvL/oaZnACiFVq3g5Db2m+l889bdO89VLv2B/9YS3nzxnPFRccf1B9\nfOCZ7by8c5Dr3z/b+1DveXwz2ZxGLqWBAiDxg3tXsdkpSPCpfzyJyWMasQhr3pWlI7tdf6/zcXUP\nFkKBa3nTn4SeXtPFI8v3ceOHTvMi1otGkUw5S4MzET350h4ef3EPLQ0Jpk9soXNyteb98PKdrHi5\nhxs/dNorWiFcYUOpMJs//uJu9vUXkFI54pM2cHfXBo4ZfQ0dHdNC5xfMvC9WKz7hqopMMbGXb6y4\nN3R8vkLzfu7lbh7ZvAncVNWySV+2xH/+5iVQyySmr6Bo+WSlxk1PAHIzuAXJ+4k1W2nMdHHh3An8\n/qmtDA6VGTfL13SDxPenZ3aI8+MlbC2JpOrsGxxC003iMSUsSOL4CZ3+j++oZ3dvDqV1L08Ul1G2\nXQ1JRBkPFzRu+eXzDAyVuPyiseLWJBnLtugeGgScSHTJH3NbjyPFNJ7esI1ZiX5mTWmrGVQUFCCW\nb9hLYgZItoK2+VSSJz8VIkPN0kgACScVpmzF0S2DsqlVuUJsSwJLDWt5FRpfPpNAaRVBcXv6ZEa1\npukPHJM0WxlmK/Gpqyi91Iytpfz2jJj4B+zoHeCWX66gN1MiMcrC/WpiOLWoy4M8u24/v35sM23j\nstDqdlJEUvfkMlzzvb/S2ZRkz2CWeDNgim9GNtPY2GTKQ7SlWnjmZT8S37+voNbqru0W53em2wG4\nc/09nNwxq3a8i2yKNtzgMMdaUVpzJraewDZU1NE76Fk3FmjwrmFmOmlKNEFJBLt2NI9l3fYBVm7d\nBzEorT4Lu1SHjEJ/UVhrmuvj5J0xHNf/DrYMlGHKWi/4rqkuTlmvuIf++fR0OMl0VIP+vAZpMUZ9\nmRId44UrIK8XSODniT9ciDTvoxD//fs17O7Nc/+S7VX7smUhDe52tJYgcYP/UUE1eXuk5Ex++/rD\n2vdjL4i2Hti6iJuWfjNkuq3EH57ezoZdmdBktvi5Lp5d3+1V7zEsg1Vb+ymUDTI5jXXbhfZkOaTq\nkXeFGd9dJuVOYNv2DbFuh29CzZm+VnnnY2vZ11/w2ga8NchlS5DDLxdtpKsnx+qt/fz2iS0hs7mr\nzdz7xFZ27B9mYFhM/I/tepIvLf1GTbO6YYj+xlQ51Hd3PIPEuLXGWveiJTRZ24gJE51kkUooJOMK\nQx3LveMkXZBVIUBGmmFy2x/XMRQ0dct+pLrascf3IzqIxy3vOXmacUwTpk5bwlSKvLBR+JT/vHQn\nz6zd7yWPUWWVklXh/5QNpJiOrSWxTRXd0ti0W5hcS3p4vNpTbRhogM2oVnE/8amrKdhhbV+KldnX\nX2D7viGyeY2N+4VmO8FJbtJfEM9/zrQOkulAycfhZmxLwlIK/OB3Ivgp6Au1ikIjdCO1g+M1Sj8Z\nu5wS5tsa5OvmsVY1oRWu69/gCUFuxrPyunnYhhr2V1eQt2vilhQD07LpaEqScuSQs8fOo8mY6B3b\n0C76Kak6tiWDrXiad1e+i+37hsgVdU8rbUk0M8Y6EXO4ha58F89u3k6uqLN70DeB1xtCECrbBbbu\nzrJsXbfXx6ljBOm675rrLtm63/fne/0P3CMO8br7zh53BhMbxmNjM1QerhKgZEMoIak5j3uBeZKq\nYVsSdrEejDj67uORZBu5LksqoVK2xHc0a2In582ayrFNk9kwuJlp08XzH8yL91wkh5EYk5jAgN6L\nlMgzujXtPceGRJpPvWc2tiV5yk1bU9IXnB3yTlvtTFBO8PqWLfrfaaGkM7vzJE5sn0FnXTtHAhF5\nH4UYcgJC6pPVfiP7gCUvCJnN3UxIru9VqTRlSbVNzot2Pk5faaAqorkWatbudYSDsiHuo9NZu+ul\nAK2INh9Z8xb34i6BclG2AxOxc7/BW3Ozk2mmVm1Wl02yAZPycCk8ybhc/IctDzJQGqyZdcowLZAN\nCsmuqr7Hj3+exPTnvd+1AuLKtiBDu5T2+pSIKSiyhGwEAn1KQrovB8hINyykZA65MbBGOKC9h9Jo\n6qItJRYgb9MCbKRYmeZEI7aeQIqXqtJn7sntJ6HEmdQwAc0uhd6Vjsni2tZwC5gKyCb5ongPKrN8\n1cXS4n1QjFCZSxcJSbwbUkwjmw8kRTHFxDyrbTqqpNDPDuIxiX+7ZBbHT/K1HqtUJywA8ZJnBXH9\ntbGhiRyfOA2A/ny1sCNZCiCBWaE5O/uTagJFlogPC3J9dt/zvrAqmyT0Nuxio/C31iB/M9sqtErX\nv+1sb29OolllpjQdw+XT30NCrqO8+RQAzjhFVC5D1T2N2y6lMTPtKI0DyM3i25Ad8rzm1I+RUJKY\nvULI2Ws6Firn2zH6RzPJnOe0GXgXnb5ceubxpBMqaOJdcZMDlZx3Ttt6EqUX345VrAuR97hRTh4B\nh/iS8RjHNYtqeHkjXxWVPaV9lD+8DQNCaFUMMGOeddF2+iCpGsm44rXxrxefQjKhcsGkc0UDDb1M\n7KzHkp3+OO/8xLiwcClt+xjVmvb6m5QTzD6uQ5i9nW1j2tJV1gNVkZkxfpTTB92L1bDNGLppM731\nOD520j8TV0Yu9XsoEZH3UQg3pWFDuka6wVcIsAp+YO5yD9eXqCgyECAb+cDZygyrOjBDMzVRLEEO\ntx3KmuWQd8kh77FO3WOXICqXihkBn7duGV6gkrf8JlS9yaZsB5f4iD4G/es9gexfwSUvUqJA6rRH\nWbTjL962XLmCvKtyh9fI8mZaxI9byZ66p9haeDm0T2nuC/2u5VPXKGDbYDlZtSTFEOStSEiaX3fZ\ndLRGzfKfeX+5j8TMZUiq4S83CvrNAwFKdllMikrM94drugmqjiTbNMQbsEsppHiJTD6Q7U6y6Cn2\nMrZuNC1JQSZBa4LWuB3bkjB6JmBbigjGGhSknXeimG1TYczwOYK8Ee9lMmVWvXMtsrOkKFYmGxDS\nipYg77H1Yzix/QSM2BAtHWUkSfKIxb1HW0tCrOwJGG4msobisXTUif5nQwF8jvbs+DhtUw2Rr/ve\nJeQE8ZiCVaynM93OjuwuQd6ShSTbWIZ/vhCgrND5Vq4Fu9jgkYvSIN7rlqYYNjZJ1dHsFckjLtcq\nIyl6IChNwth/DBAonOFcI6UmUWQJa0jYyPP0e+MNYPaOp0FtAFsKPUM1bnrnx2Iylkve5QyWbVGS\nRTu2HgdkQdJObAKAGjP8sUMQn/usl+xZ7uVkd3FSu59tJnHcS6RmPYuk6khmjNaGROBaQEwnHlMo\nODEPKUX0rTMlNN5sOYuqytiyLtwWtqC5MbFjkWwFdew28vWbkRQD25KIqWIcG2PNSIkCclOvqG8e\n8Hm7z8H1a8tNfdhjnRUThloVVHskEJH3UYxaUeFBM26tJQth8hbHFsoOwcmSZ+6CEaIxAyjVIJ4l\ne5fzu81/Iu6UKHQTHlQm+we8oLr6VIzm+rgXqeznwq4OWOst9PmEqRiA7ZVelOoyJE56GjsogDj3\nIAX81K7mDWHylOurE9G4ZnMXlZp0Lf+pYfpJNoaMAye3qWV216WiiLB2J+eA5m1LgQxteYe8bf8e\n9pZ3ICkmetfxGN2TgLDmHXz+linGRFZNz/pSMjSSJz0FQL1aj1VOIUli+ZUXSZ7MY9kWY+pGe+lb\ng9HphprHLtWBkRBaqwQ9Q4Jsi6azpGr/MSQK47wJXU6UeEL/XxLTnwtZB1plx7edyntCK0DRsa40\nJxqZ2ijiMFIteeceAuRddDRvyRcw3ICidCzFqAZB3iUr8Jwd8rZ0550xYk6kcfC9EwlyknGFsm7S\nnmwjbxTIlYre+YYhuwMi/ne/rQpSsDVBCLGJGyFWoqlRXNclJUWWQBcEVrByoh+q7hEj+OlTvWVy\nAdO+KsvYWgoZGSvmm91BmHx1A5JyGjk9jNK2l8TMZ5AdQSKlJokpMmbJ1byz/HHrwxjNwuftWg2E\ni8dGaXfW5scCPnkH7rNetm8Fd738W4KYN2Yu7516iX8/ySFQdFRJCEiiLdcXrpGIyRSMIkkl4WXO\na3LqqWfKQyKHhaO5e5kiTJVYYTSSbLFOe1ospzNVYoo4/8KxFyFJoI7ewdi2tDf/ueMcU2XqnMC7\n2Litfl/N2EEVHDrUiMj7KEN/IUPylCeQW/bXXCccJCOj1gtVg7xdYrVtO+QTr6V5u/5qqC7WAHjL\nJJTGAZANr22fvG1PA3LN5om4Qkdziv6hEoZp+ZHyjoYeLIPZEyBeSRZtuZp3/NhVyE7mLjcgxtMw\nAm0EyTtoqQhOhi4KevgeNcMK+fprJWUIErxsB9s8sLAFIg7AUHJYpTS25ZyrGCTiCrIsYztadHnT\nqZ42plt+H12t08o3CpO1c76LEHk7yT0kxcS0bAzTIqsNeoFCti152rmUKLK/wmffnGyixZkw3XSu\nSBaWpHt+WDdCtzebc8bL0byNGLph0eFoS7HxIpuZXJ8N9bfDnoptg9wUWAoGDNvid0uiBVsT10qk\nxHlFowSWjLZtltBuXXOrI2C4qTjr4knGNAvhQx29E6VjlzceAGXNyXtQSiPJlne+uz+pCGIp6xYt\nSeH3HigP+uTtnO8SnEsGleZYs38Mck6YY+Vkgfp6cZ6vecvYegJsWJ9ZizJqJ5IEiuW7GWwthW0H\nnoOsE1fiKLLiLAGVSMuNSIkCx09o8oPLzBjDeY3ZTWcgqQbxY1cj1w1jJ7NOH5IidqMk+rJjaCeP\n7XrSfxCGK4CIMY5PWRsao+A35Uac10JKTXJK58zQNkm2ScpJLzmPa2lQO/YwNGoJBb1ISvXT5SbV\nBEklSaacJaZIQrM2VRJxcb5hWpT2jQ9dwzZjnvvw2JaJIj4hVqalIeG9h+51Fdm3HoTa0OO159rD\njIi8jzI8vnOZSBRw3Es10xAGyeBHf1hbtT84eWu2+LusmTy8fKfIchUMOlGq28+X/PaD5kkXwQpS\nUsqv4OOlHJRsQbrgVeJJxAR52zbc9sd1dGcdE6ZD8oWSwc/+tI7t+4ZYvlVIvHaAmAbcDGSBicI1\nobkfYFk3uOuRDdz57OPszPjpX3/1WKAkZg0ff7DYA8AdD7/MMxt2+PsDWt7zG3q4/+ltXoY1gBUv\nB7JG1bBkDBULWJbN/zz0Mt/9zUrW7u0CyRZBOs49SorQvFVZwpZ1UnIdVqbTm1SCfmQ3iMc2Yz75\nB4Uwl0D2noDpaeZim6abFAMa6JT6qb5Glyiw28ls5b5DxbzM48sdM6yreTt+U9fE6fZxy0AXS9fu\n8zVcI0ZXT47lS2JYxXqkej/gUJIgZTfz2TmfADOOlWsRVhGnbbmhnyFpPzNaj6cp0YBWEtey4jl+\nt/lPDJYzKHoDZt94QAqR992PbWbzfiG8NSTqGN/a4l03Ptl5FxzXgusxsYvChy4lnVgKR4BKqkkS\nMZmybnoWiKyWDWhsYdLxfMoBUhjVmgYkFIe8pXiJtMNHSVX0W5Yc06/zPONOXeykEqg1YMvCwuCQ\ntyXrYc0dyA6oSDGd3WN+i5zyg62GCjpnj52HbVbTQVJJEFcV8jlQibNreE9ovyuY6Hum+hslv5JY\ncL16LeKb0Xo8lxx7MZIk0ZRoqNqfjqX9zHqm6iXsKSf3M1jOkI6F6y00JxrZm9/PQMOLYi4zVS+W\n4p7Ht1AeaGFy9p3e8dZwi5dpsi6pYhtxJFVHVWTf8uhp3lLoHqxcE+WNp4KexDBtNnVl+PkD6w95\nIaeREJH3UQYvwtGSR9C8fXJdvbWvan9wOYcZINpHV3RhmFY4KrZm1R0/GKyW5h3URCXVr9Lk19AN\nrNX1yFtm1mThk3txU6+fwcohnadW7eXZ9d3c8svneXHnDsDRLBHLSTzLgeFrIp3pDq8PAC9s7OXJ\nrmdZXlgUShm5bmcgM1egb66/2fXLu9jTm+dXj/vJN4JLjH58/1r+9MyOUCYwSxJ/nzdnfDga10Hf\ncJ6+TJElq/exbscgK7YJ4cQq1vtai2z4ZnPZICY596m7+Zb9PnqWF1MNkH+15l3YO9Zr31svr1ue\nyVnfNY2J6SkhzXt3r/PsHRLq2qvRtdsJ7nK1UvcenWdh9ApNRx23lb+8sMeL5G9K1lPWTdZvz2L2\nj64al3pzFJObJqEbFtZwC5IEckoIdW5ynrdPOFvcS05MY7uNjSLeApDxScMn7yKPPt/FcKmIbcP0\n8e00pfzJ2DYVLjhtgvfeubGK9YrQqpV0nqb6uDdeKVVohZpmehaIgVLGF5Zcn7kTeCincsSOXYXa\nIVZttNbVMXl0g9NH8b6NHS15edBd8vVQIfx1NjaGfttaCjlRIjZlFSYaKZf8HfK2HdO7jS0sHDbU\nxVNccf5xjG2vJ2ZWL29SZMVZlSJR6vOjqMubT0HvOs57zu3pZibEnWWkqk5RFXPP5I4OLxVqXQ3N\n+x+mLOSCSW8T/ayxfGx0U6OnOYPvv3aRVivJ2zGdpzYiyRa2odJQkRBmzqQp3t9mpsPLQJlMqCTk\nJHJcFwKPa4Gq4bcHMHomYGU7aUzHMEyLp1ftZdm6/fRnj0x+84i8jzJ4ZGHEvIQQQYQCoCo0ye99\n8kzGjvJfZMM2mDymkUmjGiiUDQzDqliPWi0cZEv+MqNaPu9g6khJMTzy9uoDy7XIW+GMmaOZM80h\nXOe6biajYPUeOT0szLmOyTc4oU0d3eH9PaqCvLsHi6GsX36DZu2/HXPgLv1lnt+/MnxOILCndi7j\ngHnc6d+0Cc186rJpVUdqZjkkhA3oTgnIYr0n8UuqCNBRFAkUHQXXzxj0AYoJzrUE2IbqTToN9YHP\nXNHF0idLEe3bEpYsyLikGb7mbsQo66YnxMjJgle8xBUWTC0WIka3L+75AP/90X9gUuMElPpBhu0e\n9sbEWH78nbO5/v2zAbDyTVXjIluOS8AwPdLxVg441291TNXZrF1V79pSfWuEbz1wBQwDhRjzZo5B\nkiTU3aeKZUKKiT12LRPGiOuVyqKm9LX/cBYA8+c1M3/2uEAwWIJETMEGGmMueQ/6AW+u5u1o7kpr\nN2rbPuQ6IYTc8E9v83IPWI5PefrxKe+7cjVvL6eMHo7GnzVugvf3tz8+jwmdghzV9n0YUtkjb1fz\ndp+Vi3Qsxa1Xn8Ox45qIqTIzx4r2bEOl05jBW8ecDvhLSs0+EX9QrzZgDY5G6fdzPnz742+lNS2+\nydiY7QzYuzm+ZSpffN+5/MvFM4BqzbshVs/ExrAZ28qFBZJxLS2hnPZSxZyUrqHNB2FrqVBFwpnH\ntPD2U8czPjlZXG+oDdW5P1mSOG5MBzYW31/zQz/4L0DeDTFfwDH7x5KMKzTVJzBMS9R4l6Ct6dBU\nX3wlROR9lMElC3dyrUTIh1rxosdUORSJbdg6MVUmnVTRdKeggHJgzXsoQN7lGpp3KFtWgLx9zdvv\nk0fejmTtfaTudR3ydgPzpGQOuW4IK9vmE1egv2ogAdKYOmfpiUMm/dlSyKzuJVEIEHZQcLED2ZTu\nWH936B6DwVmVZnWlfTdKu798zG1TUYTJOwjbktCscMrUYVMEuNmlOo+0pFiZZFxBloVA4+ZxxlKw\nLRkppnnpJstWwIXg3INhB56pE8ErGEEiIdWhSeKZarqFZrmFMWLifdATyCjIyYDP2yFRvaSCGcM2\nFc9c6xKrazaPqTJtyRaQID9KVGhS+o9lcvNEjxRcK0pojB0TsW5YHmkpDYNI8aInILgTaX8mXPEL\nwFQC5K255F1EqsuKwCzbJ8J0cRJG9zEAPLN/GT3SJm98VUVmVLodWZJZ2buGPnmrFxOQiiW9dzet\nOMVB9Kz/jjvjbzlL+jwyQBBZc6LJS0lslcWzzpQy3mqKSh+xvP2tnNJxovd7Zsdx3t8xVWFWy6zQ\n8UmPvMU4G/smc0LsLG99eqXw3ZRwnoNkc4w1jytnvFcc5xatGWrj7JaFvHvM+wHoqCCphrgjPIze\nSVxOcOnUd4b2B8n7golv4wunX0MlyhtOp7R2nve7M91BIjYyTbkCigvTDs95drE+RN5u8NtFoy6l\n+OLbwYyhBoJZ61RxDz3FXuRkwUmyI85RFZn6eB0fPuFyJmbeAbZMQzqGqsjohk1vtkRrQ7KqENTh\nQkTebzDolsEL3atGXPJVMv3JtbbPO1A4voJ8Y6rirSEGQLZEBKVTYSlb0MKm3Rqad1+gwENNzTto\nNlcM8k4ke9Eh76CJ17CdJTfOB5WIK0h1Wc8n7loO3OVZSpvwVZt943yTslJtogY/o5N7P2U9LJgk\nqHP6GPQHB9ZDl/2JqXISDd5DZWrP+JS1xKesCbTpkLcsoVNhTrNUdFsLZR3TLFdzjnvFFKR4mURM\n8QOeLH+JkK3HQfXJW7c1Z3mM4pnNTcIJQkLEJTWiSQWQRIpUL3LdUB3BSyItNSKlh4jPWYSUzHkC\nUbkozKlWoQEplUNp24OccM+Pe/fdFHdIIZHHzHQwpnwasiT7BXMMv7b4jBZhnVBNJ5LesDxBQB29\nk+QpTwrLhy15/s7eTMl7Z+aOOhWAMaXT/HE2Y9iGitLc65fRDFilEjEFK+dr/xa+5qwqMnElzsJj\nzmNYy/F88REUZy11OuYHU6WoR5UUitKQJxDalkIqoYAR9zK9uVAlX5sDMHSFhBJnsJxlq5NCdErT\nJIKQyg28a8qF3u+JjX5lrrgqc+boed56cPDJ0hUQsFSmpWbzDqeNyY1+8hcARXIEViksCfnr6yVa\n9anETfE8O5rDxOmSN8C5o89hQsPY0H41UBP+H45d6AsLQVgqdqGJy6dcwTWzP8bcUbOrqskFMXfU\n7NDv9x1/ifftg1jn71aQA0g6wlZSjXvvnRog2/p4WJMXFj4xfm5g2+mjT0XVxfuSTsaIKRKGKQJn\nK8fkcCIi7zcYFu94nP9Z93/cv/Xhmvu9IDFbqql5B3OaV5KvLNuUrTC5xlWZdDIG2BhNO/xoVaiK\nNpcb+1jc/cfqvgQQIrMKn7c6ertXHxfAQiz18j5OtRSuUSyHydsNGDKHW3yTskNoqiJj4pN3a7KV\nhJIIpYN1NSZzqIVOyzFhBwQc19xp5ZrQd87wtlea+4Jthgs01Fiap7h542VPsDGHm/nQ8R/ANhVM\nWw/lHNesssiFbckhzTsekz1BwF0/DIARR1I16p01/yaaFyTkBqwFyRtVJyb5ZFkvi0lIbhhk7eAa\nj7xtM0beqaLVoDoR5bKN2tnlkVCx4JiF841IEsSPXYM6QQRTeZYRSQpN0ma2jQ4nKU88oFG17lnI\nl97yWT4y/QOUN84hVRJJRXTDCsUyiPHQUOwEsiRjWlaoZOqxzZP477d/i3GcGDonKIyBsxzPQTyu\nYA2OQt80h45Um3+Q5QsYFx9zPh+Y8T5/V66JZEz1a0obNu2pdqz4MI0Nzn2ZKumEeBZuJjcXHzxB\ntOUSgmHatCRb6C8OsGlwK82JJi8ILvhadaTamdQwgYXHnIcs++MXU2WSCVUkxnFwaufJ4hoBzTKm\nysyfcDafnfMJPjDjn0J9mtYqgs7M3rApOzPsv++9mZIXhNpWURmsMemblMc3d3IgjFQO2MVJ7TM4\nrmUKkiQRj/vvu7Z9JlY5yWzlHXz5LZ9leutxofPG1o/m+jmf8n7bxbqQ5u0+r2CKYzVQddHVvF2Y\nw63e30GN2nUDphNqiPzbm4+MyRwi8j6ieOz5Lrp6wqkphwoaDyz1g5y2O8kLdgyJZSuPrOjinsc3\ne8uhvCQNstCUlq3bz12PbOS3T2xhqKCFfd4V5Js3CuGkIgHNW27qJT55HWpnIA96KEDGJhYo4wjV\nAWuWbYfK5EmKEYo2V8eJ7E5mthUz60ySkuV9nHvl1aH2RE1rvw61p7kYcZ/YHD92XVL1lr5JG9/G\njq4yacXPmAR45KdvO9ExHVdq3k7U9daTwIxTeukcZDPBkBZ+ZkENasPgZm596o/c+9ctxBM1stsF\nNG+3kIax5zjmjj0RLAXN0vnLCr82s47uCCaSuE9bglhZWCVc4UMPrO/V40iK5UUoIxu+VcJdR+ya\n6yUTSbZIyP6k26AKv3HsmHX8pe8BhmNOX4yY9+wanWPASTiiashWjGLJoqUhUdNnbet+bEWQvO1S\n2iPvYKnaJI2MruskEVOxsh1YTlSxHtC8XcjJAoolnv/9T2/HtGxithCwOlLtSJJUpa25goxVrMPM\ntnGccoa3TxwrYWY7mBYkA0vxtFZJkpg35jRa4+K9NbonElMV7zovbupF1uqRFJN0Y9k737VqeX57\nQNtyMjNahb/YNWnbNqT0jnwAACAASURBVExrOZaSWaZgFJnaPLlm8Q5FVvjc3E/xrikLKrZLwrxs\nJLC1BAoqJ7YLAVRRwiQPMLlpkhfU6eLE9hOIbTsXfdf00PZgVbvebJGCM1e11CdCxzUHyLsj3Uot\n/MeZX+Rrb72h5r4g4oHnlwz8bfZOoLzqbYxPTmZUXW0BIRiBbpdTNc3mSlCgCZJ3haBuF/0I+CDJ\nu0pJXSoW2t4RkfffH/b05vj1Y5u56X+eC23/5cMb+MNT2/ijk6fc9dmokkJfpshv/rKZxc+JZTa6\nqfumV4e871y0kcdf3MOi5bt4fkNPyOddGdzh1om2yk6QkWx6Pu9ahelD/uBE0VtD7aLSbL5++wAl\no+RPtorOcN5J0lLWQRITsbb5VH8Nsmx6H+cwvdiWRGnVOZhZ5+OXA+StaiKBhy37EbxOn+rTMXRL\nx9bjFLJJfvC71aTUdCi5hr+EJ4ahy1X36Js7nWIMRh2y1iisCZIFWMSnP4fS0uMtWQHYYDzDw8/u\n8pa+ubBtKeTzdjNC2UYMWZbEGnDZ4PHnffK2EMk3Jo6qByTQ48LnHTCbG5r/2bpaaSJtihSwiu6T\ntvMcOjqcNe+uoBPQLprjzc44OlYBxe+jm9K0PuaTr6Tqjt88TqFk0NaYrE3eAW3ZM5sjfPluLedY\nYFJWHRLz/LPOulndsJDNMEkAyGaSkmbw4DIh7F7UcQVXTv8nprUI7TFoKhX37rgjSmm0jXOZnvTN\n6u77Z9swJu2n6cSWQxM7wPunXIm2YwZm/1hiqkzKuc79T29n5y4np3Z6nTjdUkgnVc49ZayXZAXC\na59PmSpMvP9w5jGcMdrv07njzwx0vur2PbQ42cckSfJIpLT2TK4c93FPuw1mF4yrI5ugAS47ay7Y\nMmec4I/DhXP9wLjeTNEjrvGdgqynTxTvUFOAvNuStcm7OdHkrYmvhfEd4t0MCl+1zOYH8oMDTJFP\nQ983GZBFeteKtoKat+dWIOxDNzPtmAP+OAQ17wWnC5fD204ZGyJvNxvckUBUVewIoViuvfZv/4CY\nLF3Tn0veiqzSE8gnPVzQ6Q/UL0a2KGtmyHReLBuUpaDmHSbkYUeDtMtpSJRANompCnXJcO7lifHj\n2aVtCpO/G6S07xhGG7Pon/DnqoC1fFlDUkyxbjemISkG/UMlLMumZJSRZJvpbZP5+GfO56tPbKef\nHpAt74PSEZnF7HLaXx8qW77ZPKb564fLKSRkSAhLREdTim5LCwWaqXZSRKzLplgj6yWmUCmXbVER\nyCHsKWMbSU2qZ1sOT7CoS6pCY0oBqoYkWSL5DAifbkX0uhtjEJfjDL94BvETlgc0b9kb/1s+IqKX\nZTuGpYhc4u4MLSkGthHnuHHNfPby2Vz/2FKkZE5MHE77Wjkwm7sJJGI6LQ0xioqF5Wb0slQScoJU\nyuCmj8zllj8+AAjTopsfqjXRChUp6t1o9JyjeadUn4ileAlUDb2QwrJt0kmVn3ziHfz48TgbSi+i\nNGRoUBspBsgqpHlrqZqatxfxK0tIkh+kqBsWsRqEI5kJT7iYPKaBK+efSl+fbyFprwim0ndNI3Hc\nS+h7BbknAqbYoJY3OqTNSSGTKMC4pk7MHuGLjqkyHQHTsV1hGscU39aHFkyjdetuFu0Sgkaw1vak\n0Q386NpzhGUFeNv4M2lPtYX93QcoV/Ctj83zAh49Td2Ih4g0SE6V91OJd501hVMmt4a01ffNn8q7\nz5rMt3+9kr39ec+d0tqQ4NZrziYVF8cGY0Nqrek+GHz5I3PRdCtErkGzubftAH5wgOPUuazrEgpR\nkLxdn/dImncwT4W2KRA3QVggPPeUsZw+YxTppMpTq/wA1WT8yFFqpHkfIVg1UpUCVaYxw3KLhMih\nYhCFkkFf0c/JLSt+SktX8ivr1oE1b6fghuf/U0zH5+1XParPT+WczvnO/mCqVD/pQn3MCc6p8Hkv\nyQo/va0lhd9WFVWSBoZLXtnIpkQ9qiLTXu8kvohp3sdZtovexGa7NZklV/O2QdWwveAmmTq5Ednx\ng7c3J0WQn+l/1JYernYkMi4JE2nZ4V3h47dJxhUM1zfsCACphIrlraUuh0zwthFnsiYKIXjJLZzx\nPnPs6SiWWKftEroiS2TKQ0hIjKpvce6gVhIVE0yVeFymPhUThUEUC1s2sJVgoJjTDyeoTZcLtLW4\n5nKfHBpiDWS1Idqakshp8fyntvqaVGstDckQZnt3kp7deiqzW+eIcUgNI8m2l9WsLqkSjym0SZPR\nd8yEgQl8aMpVBNXFUGCSLdf0eYeIXJFFwiBElbRa0buSmfDM+lPGNFV9R5WBQ9bgaN7ffg22YyUI\nam5BIvdWKQT6EkRdYAKPq3LIxxn0N4MTsJZUkSSJ9nTAOmGE1x2nEiqyJCFJEv90/Lt5+4Szqu53\nJKiKHCLaWvcUJKr4K5C3JElV7cnOto7mJLphsddZMphOxqhLxjyiDa7jrmXyPxioilxlNamteR+Y\nvINCyiuZzYPHnth+AnVqmium/2NVm3WBQlCSJHn9DL67ifiRo9SIvI8QauUZr7Xd07wlhd6MT475\nUrXm7aIuJV4iTTdDRSrCmrfN5sw28Zdjcg6bzZ3lN6UpXuIKKWg293Ihq6QSKkk1GdK8dctgW2GD\nc7AFZswj/L5Myat85Urnk5pEQJJclyURU9AtQ0RKuyZ3JxmDu9YbVRdm4YD/s0FpEfV3FZ2OppQg\n74DmrZXcQLhAZivHZFly5CK1bT9Kx26x3MPSPHJXFYlEXMEoOwJATAv5usHGGhjDhORkp9604Res\nUBNiQgsUtFBkiaw2RH28zsvFbLqme1dIkiyRWMJUfFOuQ85lCuiKIF+9GCAMxyeXNftobnL8pwGz\nbGO8kbxeYG+xy8vHPGOUr9U1JeqIyeIeZdstpOFkbnPMo+lEgg/NfC9WOemZ122nIlk66QpbNnax\nAXXvKbSkwlHESSXBuPix6LunoioyTfV+JLoLNaAdKrLkFXoQmncN8jYSnvm2crKH2r7HukCyjrBZ\n1m+/MR7O8hXsl/gd9h8Hr2NraYrPBXzRAZ93Q9zXhG0zTN6viFfBg0HNVKkIbHu1cO91pxO3U1dJ\nskqck9pnVvnjXytqEXWlUHWg/alEtQl+pIC1hng93z7nZs4c+5aqNmu9ZxAm/1cSKg4lIvI+QhiB\nu6vglsNUZYW+rK9590gb+O2m+/0DA8TqlgYtaWaIUIOat9zcy7J9ItLbKjnLpGJlYorsmM2dNddy\niua0Y/JSqoO9MEVO6sZ4AwOlQTQnA1le9zOvGb0TsE0VJSau35sp8v/Yu/P4qMqzf/yfs81MJpls\nkAAJ+yabICgo4i5Qt69WWxUXcKlaRVu1daFUpbUPuFT9Wbva1trqQ12hllddeLpp1YLWlcUVtAjI\nkkD2zHaW3x9nmXMmM5mQZCYZ5vP+h8xkZnLmJMx1rvu+7uuOWevL7eA9uXqMeVyl+/CrDx7Gqzv+\nbZ6npJ7Y/knrzKBmNUZxFy/ZhVRCoB0DyvxQDc0zbG533rKDrjkkbZ6rcLsrWJTvRWNwM3a173Iy\nd0kU4VckaBFX8PZUrsdR3xSBz96yUo45vxO/5IMkCOb6Z8mcKxdFc7cjuwMUAGiqfYFiPq+4OLGk\nx+nnbL3fiNaGmGiNnESCieYeVrOaFr0egZB1fK7gXR4wA+nKj58xHx8NYGBxYs4x4JfNoXMAJdoQ\nCLGg01TEzmz9PslsEqO5A5V5UWF/gNt75IiC0GGeWBAEnFJ9DtQvx6KqPODMwbqzMzkp8/YOm4uY\nV3YhYlumIbLpaMS3j4PSNNK5uEgOIgBQXtJx7tGdOaWbUxUEAQtGXYzohzM7HFcyRRZR2mFnP8HZ\nIcuI+Z2LG89FQYoe+r3NfUHiHjbvjeAdjWnmErqkQCUIAr459RKcMvLkbv+MVFLOb2e4oHFfdLmH\nsv0phs2TL9DSCaYY4QAS9RoAg/dByT1s/t6n9dANsxfuftd2llu/bEJMtTJcw0BdY9jJAPeXJ/aA\n1ttLrAIq8zXNDy8De/WtqI/sT/xQV/C1h5dlQYbeWG0uMSpuhqJ4h819QgClwSLo4SDEUKPzGu7M\ne2d9G0q1oYjpcTyx/jWs/2C3s7etumeY9fqJIri6pjBiVqFdsbWOcnRlrbmOdsBufNGyHau2/MU8\nUCt428PmghL3NOZwF0KVyFaLVCWC0lJ7eU7iP09Ts13oZm1VKCUqsdvaBGCH+SErVdShrug980nW\n/2NZMiuW7QsdqWq7N/OW4tjXHIEMa3jWmuMHzEzTzLytD3YljrgRQVxXUe4aQraXfCnDzKYgpSF7\nIwvZmUqwq5TbjVZExCYYutnD2mn5GPfDiPuwH19ik/oP8xQ0JqqI7S077SmX2JbEOmDAXPs/sMgM\n3pE2H9o3HO08xh42d9bhC4lhUfu47OBk/32bc9YdPwztAJuuGtcdJCVR8BSs+WQRgwODoe0fAqO9\nFOquMdBVn7MbXjDFvvbuzMo5Blfm7Q48/qQ51YmV46C3mFXlyRciboospXyvV0y5GPH35gGaL2Xm\nndziMxv8nmJAd/DufnAZ6JqKSHXBlC2ZsuxMz3FPz9gXAuky784Up/g7AwBZTrwWg/dByJ15P7Rq\nA155dyfuXvmOp9HK8sfeRn2zOTcc0+PY1xxFZWkAJQEFQsRd9GP9J7IztiIFUtUObCsyd/s5acDp\nAOBtB2oF4UvGLwIMEXpbGUR/GIZoNfiQzbaZPsmHoF+GVl8LQdQhVe72PB+agoaWKN57y/xD/vN7\nr+PXaz7Axm3m4+zgamgydMEMmvubo4gLVvC2Mm9REJ0Mz3OeUvTrdg9ZuzPvErnEeZ8lQSvwuTJv\nLe7q2Caaeyy7s57wl0M9VePun2suvZGgt1ZgQukkSKFGSFWJZXSxz6bCMIC4nZl7Mm+/uYey9f7E\nYDPaNHsLy0TWO7bYWspTuQcQVZSUJC5A7A+BgUHz8a/sfx5hcb815SGgusIOgmaTFA0xRIw2xHeM\nxfTBiTXqdvAGzPXtRpv5evaccNAvozpoBqq2Fsks7LOCS0u7+Tu3i3wG+BJroO3gbVfXjq01f85h\nYwc6owLuoFhZGoAAYGhVx9854B16lCUhkXlrZuadnDFqmp5YrpNuODPpQzlV4RLQ8QPXPUfaWYGX\nnbFVliay/AGl5haVMsy/02CK4J343XXN6CHm//3Dxg3M8MhERul+T6LYu5k3kH4IORvsn+WTRQyz\nKtyryjpvhuK+6PLMSSuJkbVU3+/KcSSTPXPeuQverDbPESczKWmAPPQTfLjTu7ymTdgLsbzOmeON\najFEYqq5jlY30KaJEACE6meiWbaWFllV1MGA7GzacP74ryKyx9zooXaIjMtPnAXdMPBKXQPW7/3M\nyXy11lKIZXUIi/sQ9I81s1PV3NtWFAV8e958/PKjTzF1qoh3/55ocFLiC6IZifWPdr/o3U1WW0+7\nGMfOOiUVMVWHjihEeCtSJ9YOxsdNiZaR5vPNDz17pACAOd/tt+daXQ1GrOB95IwifN76mfVzXWug\nnUYuGgTr/BiajOKAbA25CmZBmL9jsxnJNSw4o2w2Pmr+wNmJKbLhWHO/agDtLQJQYgV915y3JEWc\nJVRicTNaVfPnuzPvb596PJa/vBX75E8hKDFUlgWxwzpGe8574UmH4uebXI1rrO5XQwYU47yTxiIU\n9OGdLwdgXcM/UOoL4ZRDvo5h1SFcEDYb5OwT/us89fCRI3HWCWbryTsunYn9zebWh9VNZlCwh8JP\nnjEUf39nBzTdQHFAdoYd506agj98bG7KcsnJhyEkDMCU0WbWfszUIRhUUYTRNWaf7GWXzkSFK6hV\nlRfh9kuPwODK1FXInjlvSUQsrsEwDKfaPDljVDU9MSef5kP1vmuPxusbduGZl825fk+Rmiu4JQc0\nd5CXU2Tw9187B63huJN1L7t0JprazIs+ewcrO4ja2Zq7u9hti7xVzJnMnjIYA8sCGF2ToiNZkvsW\nH43G1ljSnHfXC9Y6M6DU3BfdMHIbvAM+GcsunYnykB+KJODLfe2oTXMRaHNfdCkpRlnSFay53X/t\nHLz18V488Tdzu9p0GXqqi4NcYPDOEbswTSzdB6m0Ac2tewAkrh63la6Fuyg3psUQi5vLqEQB2Cuo\nKJaLIDQMhVi1y1xcJOowAAQUGYJsZn2TB0zA3z7ZZ3bv8oedtZhavVWQ5jOvnu250jZhPwRBgCDH\nYaiJhgMTBtdC/FhE3JpntTPvimAJmvfHzbXWmuQUpe1vbzH/muxqcSdwxs0GNIpVze5aQlJRFAK8\nsdvJrGPbJiIweb35GnIMorVlpN6ayFxDivke3t3/Nt7d/7Z5pyvzdobQ5Rj8483vG9EihII+TB0z\nEOs273aCfak+BOMGV2P9f8zzJEuCk50FjUrobaUQi5s9xwgATc0wg7ccd4oI/ZIPoiA4PbvF4ia0\nqOZzy1xz3n6fhOpQGfaFzfqD8jLRXLalJ4bNq0PeavD4DrOJSHFAdrLYE8dNw4mY5nmcX5FQWQpU\nxkc79w0vH4RqK3sqtiqFAWDW4MPx7KsfQ9s/BJNHVngyQ3e2NW5AotBt+qihngsxURBwyPBEtfWI\nwR23dxw5OH3w6ThsbjhD58mZt98nQdONRJerNMOZpUGf50PeHdB8nmHljnP0smQeQ6oP9oqQ31lf\nDQChoA+hoLeRjP1+3BcCK+bcDkkUUaIcWMFa8rntTFmJH2VJ8/2pmrR0hyyJqAz5sa85mnYIOVvc\nf0/2KE9n5DRLwVIWrKW4QAPM3/PIFH/Hydw1BRw2Pwg5w+ZWT+WInmKHKxd7yZdfkcwPJ1GDIvoQ\njWuQkpYYybIASUlsU1jXGIERC6BdS6x7tVtzhvzmB669WUMM7TAMwwrePkSsHbxkUUaFvxx14X0Q\ny/dCHmAPi7u2WlQVZ/mUvduYMyft7GGsojUchVhsZuYhV+FOyrWg9rB7WzmiH1vLk5QoxFAj9EgR\nEHd1B/OluPp27fhlX0BIFXsgKHFIrYOh7jSDn7OUyPp9+EQ/Lp9yEcT95m5DdsEaYFZdq9aOSoYu\neLL7BmsBgFi6z5mXtzd+QDwAI+aHEGzGjlZzyL0maSlSib1LkRxDaYld7Z0YNncXOk0UToTeYI6q\ndDXzce+6lG7tbUD2I/blKECXMLC8KG27R3exXbHcvXW86aQqWItZ65cVyRu8Az4JqmZkHDYHktY4\np8mQUs2P2wVv7u1dD4Q9kuD+PZX5Qx365OdCb2XeQOJiLpeZd3d4Mu8U1eBdybzNx2U+X+6Lg1R/\nS9nC4J0jTsGatYGCs/uTLWlLQ7tHud9ndmkSJA2KYG4Dam9q4ARvSXSGtQNyAPWNYQhqAO1qO1Td\nvD+shiEKIoKKGbTsIdKI0WbuYiQYgKZ4CuiqigagOdYC//h3nPvawq41yZriFGm1WNXmRofMW0Wj\n8hnE4hZUxMd4AkiqYOLeYcoZQg81QJDjHdbRFrn28lWsYUnPPLp1DGKRtca8bZIzn+tklFa2LMIq\nHrP+I8qS4BS6tEfi0PbVmIFb9cFd6mq0l0JvLYNUXg+p2mxp65f8zsWa3h6C6I9g475NKJKLMCxU\n63kPIcVV+e/XneO2P2R8kmvNtpgYdTiQzOeiCeeiSC7C5AET0j4mZm0vW1bs82Qi7vXSgiDg4gnn\n4uyxp3d7HW86SoqlYnbzEZ8ieoJOQJGg6ZmHzYH0WZV7Pa6U4jF2Zt3Y0vlFdjr2h36uM9RUpG4U\nZ6VjX8wV+/v+fXXGezHoyox9nS8VS9aF2J2x8U22MHhnQTiqoqXduyuY0yXMyvTiRlJ3LtVbgOEE\nb8Xa9UtUAV1GLK5Bttbl2vPjih28NRkCBNQ1heEXzMBoN2Zpj4dRJAcgSaK5VCfuh2EIaFWb8ceP\nVgEAtP2DPB9WA4OuTRosEVenOMOqKPcrIiL2Bh3OnLcVOBUVMdnMumvh3bIwOXifNOxYs2DKZjVZ\nEcvMPa6T23C650EXTVqAG2dcA3XXqMTx6e75bxHlYiLrtT+c7WAfMMzXtocYJUl0/qO3RVRA9SH+\n3ymIb0/sya3IImCIiG01h6ztna38kh/2SgB7HXZYi2B8+egOGzKU+q3aASXmbCBiaHLK5TElUuLi\npegAMp+ja2bivuN+2GlbSltxkeL5MEquDp9dMxNzhx/f5Z/dZUnLxlQtfebt90nQNHPY3C4sTCdd\n5uS+v7Pg3dDazeCdIvPuK+5h855edOVL5q2kec+pMu/OCtbc2/Wm09MLou5i8M6C6x96Ddc/9Jrn\nPrt6Fk7w9gZ3Q9CgR4LmGlZRcTbZ8CsSSopkCJKOPfti5iYMgt061GroIgnOhhRtERXhqIZia3/h\npqg51xpWwwhamar5QWj2zd7eth0fNXyKQfIIaPW1GDwgEVA9OyxZhg8yX3dAqd8pShs62Oc0QnEy\nb6dtpwpVtIbsFe/8kXsI8VuHXdlh/99xQ8xWlfb/PfcmAYD3P+DQkhqMLR8FGCnmvGF1RLOqdodV\nlzgZROyzqYhvH4ehhrk8ys4AZVFwisbs4VmtvhbaPnP4fNSQUqdHtxEtcvrFA4Bf9jkdLY32xEhA\nqsy3LJAI3s7fhC55AtL/G30Kjhx8OAJS9pbq2PPcQyqDnkrsQTnaaMHdrEiWBOiG4azEUBTJO2yu\nmHPebRHVHJXqJCBJXVjDW2o1jXFn92NqzIu5IQO6N8ztVyQU+eU++2B3S3Vx0l2DrL+T0mJfhkf2\nrXS/d/v3ka63eTK7F7zYyd9YV9eJ97b+ffmUp+xCG8MwnA+WRPA2/9XQMXhDC0DdNQbV46LYGfkC\nsLbLnDV5IF54C04wcipXreCtSCIgxWFEfE5L1XJ/KRoANESbMArmnLddLFUe8mPP/nZz/tgXRZEc\nwHdnX463yxow3bUcZXz5mMTxqTK+MuJknHTUZLz7aT3iqo6nt5hrzysrJWyPxc1kU1NwzNQhEMsF\n/CeyCZKswfBFYBhCh0zbfXt8xRgIgoAbzp0Gnyxi9/52zJo4CDe/9ifnfert3jluWRKwaOL52NL4\neYcLjbISH5paDRiGGfyrS8px+lEjURr0YfaUwSgOKLj27EPx8z9thLprDIQae7g8kXnbRU32nuTj\nhpbhzGNGYV9TBDPGV+FnqzbA3GFcgN40EKK1I5tn2Nx1wTFj0FQkq7CaqMiDvsCGfebPOe+YyZ6i\no1NGmu1qX3rjC+e+AaW9u2/wLRdMx0dfNGDK6AGIxTV8/YQxB1Qo1VPupZR2sLH/litCfs8oix3I\nW9oT+5ink/yhe+uF0xGNe7OpMTVluPTUCc4GGwBw8hFD4fdJmDHeu/NWV10wd5wzrN/XejN4H35I\nFRbOH4+jJg/utdfMhuRs+vuLDkdTa+Iz1/130dn5GTE4hEtPnYBDhqcfteqrCzQG7yxStcSmCppm\nz3mbHxya4A3eEPREYxIkdtzy+yRnq0l7DbOdeTt7RUsCdCEOQwtiZ521UUdxJT4PA/sjDeZuZLrq\nZN5V5QEzeFsXEjXFQ1CsFOG4ad4sa3jpUAwOVmN3+15EP5yFY4+ag1DQh+Om1WD9B4lK7WDQgICw\ntbm9gLOPHY09cRn/ec8cNheUKBD3IVDi/aAt9lQrm+996hgzCE8YYfX/1v3QxXarUKxjRe+RVYfj\nyCGHdzj35SV+8z+rIQCCgepQGfyKhLlHJPp6H35IFYJ+Ge1R1cmU7Yst93CsnXkfOnoAJo9MVH+7\nq5zj28cDhoihlRVQRNmpcTDCJdAjRThhzHTPHL1znEWJC5KdrealwNETRnV4HODNEFJ1EOuJytIA\njp4yBIBZiX3aUSMyPKN3uTNve5jX3rSnqjzgyYrt77dFVFRXdF44l5xVpbsYOW5ajee2KAgd7jsQ\n44ZmnqLIld4M3pIo4sQZQzM/sI8lz0PbIympZJpKyPR30JWitmzo+zGdg5j7Cl/Tra+tYXNnj2Xz\nljlfavfztpc7WTtuOXt0W8HSZ+/HbDdOkTSn4GxHnVn1XVtqZtANkUa0W/PRRdY+t1X2jkuy+bo1\nJemvom8+4jpEP5wJI1zq3bQ+oCSGyJUwRH8EmpUZK7LobK0nyCoEXxRGzO/Zlxfo2s5DJa3mHLPe\n2DED6uxDKRS06wLMoFDiSz386QzJG4bntuyZ844797l55v00H+LbJmGYPsN6Qet+Q0R0w3E4b/xZ\nKX9+kc/nFA8CZuFdukpud/FVLqtac8Gdedvnede+xI5x7mFz9+890/RBbwaufCX1g6H7XMvlUHa6\nTaeyrfB+qzlkL7sCEsPmguBu2WmxN9+wMm87wxZE1QreiblQAAiIAc9rGFYWb2gydlrBe0SlOV+8\nP9Jo7kcNIGgFVLvNYeyzQzEiNAynjpyb9j0E5IDTKtL9HyIYkJ3g/Z+ItZtYuGPwhq/dXI8eD3To\nhdyV4F0ePgSRjXMQ+++UDt/r7EMped/idEt07Kvu5P9/dntUIJF5JweCVEU79lW49+VStwwFzGVP\n0Q+OcgJ4mb/jDlm2rhTP5Bv7nRquM2af50TmXeQ59+7fe6bCqT76XO1XCvECJpdD2U5ilmMcNu9F\numF45lI8mbfmLVhzb7cJwargtoKz0SHztrqLWXPefsneDMMM3ppgXQioMnZY2/UNG1AJn6hgQ/1m\nJxjYa4ZDRebws948ELfMPK/L7y8580bS7kh24xdFFlFkWM1gfFZns5i/Q1WwLMo4YegcDAqmn1eU\nZbFDoZqts0KT5Cvv9MHb/Dd52FwUEsHbnitLfs1Uy4DsD8p0u8h1PE4RRqzIXG5WuRdiJzsu2Mv4\netJoo78RBAGGYSRl3lbw3tcOvyIhFFSSNjFxZ96dz3l39fdwMGPwzi7nsz3HDp5PgT62e387rrjn\nn/j7267+1/HEsqrkgjVB6ph5G9awuW7vNuWLwO9zZ97mtZZTdWy9hu7KvJvaYigt9iHgk+GTzCD9\nft0mAMCospEA9OeBHQAAIABJREFUgPJQ9ypFk5tcOHtuW/RwCLIkmPv/Wpm37rOat8T9Kfv+njv+\nLBw39Oi0P7OzZRzJnbHcyoq9c8IBOXWBlz13XGQdWyITTPS/brcadSRn+ikzbyl1Jp+JfcEW19MX\nOdmvOXxQ560h84m9JMtd4S675rQHlgc6jES4f++ZMu+DbXqhOwrxHOTygqWvLqaZefeStz7aCwBY\n+ddPnPvcm444QyuiO/M2AAiJPautYFgSrwX8m6AM+wQ++WRnztvOvINyEIh3zLxrKspR7h/gVMi2\nurbpBIDRZWYR0qSRlTh99ghMH9e1Stprzz4UX9a3ej4EKkJ+nHroNLypbcWYkrF4+8P9MNpKofgT\nFfHmkjd7NzJft1oHdjZ3lep7t1wwHe9vrcfXjh8DUQT+ZT9WTP2zrz17Cl5Y/wVOn20VaLlesqqi\nCKOGhPD5LnP0IPkDIdV8qzNsbkVaWRJxwdxxad8DAFx51hSsa9iBrZFdnuHjZGcdMwqqpuOsY1IX\ntOWj75w/Df/3n+04+fBEEdSx02rQ0h6Hbhg4ekqiHuPCueMQ8MnY+mWip26m4D24MohTjhyOKaMq\nO33cwcyvSDhzzshO29MebIr8svmeh6R/zxfPH9/lTUk6M2N8FU6cXotjpw3p8WsdCAbvA7S18b+o\nC9fjqCHezQXswCYEWiGW1UPbMwLRlJm3GagF0XA2FnGG0q3g7YsOQoV/KBqKdwBSvMOcd5EcgBAX\nnNakLaq5DehXpo/F7JpEj+sLJ3wNL3z+NzRGm5znAeaQ8NeOTywDy+TwQ6pw+CEdA/3Xjp6Mq6uO\nwsdb67B+rbkft/sqNCgH0BSzh/SVbgZvMem22XMaSD1sPmFEhVOpfv5J4/Avc5dMKGLq4dXqiiAu\nPTWx/lpAYthbFARcd85UfPfnr6c8lmCKLlPJAf74aTU4cXpth8e5nXncGEzYfiZ+uWEfFhxydtrH\nBQMyFn7lkLTfz0dDBhTjklO869/H1pbh21/vuKzOXimwbXeLc1+mYXNBEHDeiWN74Ujz21ePHZ35\nQQeZTO/5pF6qmpclsU/+XzJ4H6AH3vkFAGDW4Bmebln2XHdgqtmcJdJa4Q3emrdgDYCZfetyIhu3\nhs1jcQ2KHgREQBOirszbqjZXJBTFi9BqBe9P2z+CKIiYPND7ITin5kjMqTkSb+95DwNTNFzpLe4P\nUPeSnqASRFMssZtXd7bLSw6YPlmCqplDy501TrAdWzsbr+5ch9HWlEEmiepz89+yksQUQ3Kmn7pg\nzTts3tWGVhWBciyddWPXHlzg3Bdt7o0/iAoJ//K7SdVVZ04ZAJKnWARRSxo2TypYgznsbcQDEKwm\nJPYcciSmQdB9ZvAWI4iq1lIxe523LCIoF6FNaoXgb8OeyJeYWDnes4mF2+GDDuvRe83Ep4hmP2rd\n8GTe7mpyo7uZd9J8kt8nOXPQXWn1eN74s/DVMachIHdvXbS3mYP3WFIOm9tz3vbwd+FNN2ad5ClY\n40cYFSYWrHWTmlRY1CGQGELSsLm9zjuRedubeiSGzc3gFo1rEKyGJHEjirC1TttemuWTRQSVIkCO\nQRpgNvaYOWh6z99UNwmC4HyIuueQPOuVXZttHIjkbPdAd0USBfGAAnfyum+3mKp5bqfaijIx523/\nfEbv3uYtWOvfG2QQZQuDdzfFda3zBwg6/vi3T7Hps30AXJm36FoTaFecJw2bb9vdgi92mvPcUSOM\ntri53tVu0qLIEkqUIATRgFS1A7IgY2rV5J6/qR6wP0QVxTtsbjNUxbOTU1clD5tne79cZ847xfda\n2uOe26kL1rpXbU5dx8ybiMG725Izb7ufucMKyA88/T6AdMPmKgZVFGFghbWHtWvplb0dZtQIO3tx\n25m3IotOYBT9EQwtHppoitJHjpo0CANKAzh8fLVzX1BJtAOVoXSrjaA7eI8cHMKF88b37EAzSZEo\nf+/iGZgwvByzJ3v34lZkEbMmVnsKopKHzZl4976JIyowqDKISSMrUFHau21iifIFL1u7SdW9WVgs\nufuVNY9tf3inK1i7Y9FMPP3uK3izHYAuosgvIRzVnK0129VE8LaboiiyiGI90XQk5Ov7db9nHjMK\nZyYtYXIPm/vl7q0td+/zfPslR2R9swdnnbfr1zRuaDluuXBGx8cKAq4+y+z89vQ/twBwVZsbicdQ\n7xo3tBx3XXVUXx8GUZ9i5t1NquEdNjdbV7rms63MuzJkZsTJvc0Bs1GLTxFRVGT9GgzRqdy2s+zW\nWBva4+3wiT5nWN0niyj3JdYvhtIUqvU1d8FadyrNgUTBmiSaLUaz3Xwh0XGte+Pe9pJBnfVqRJRF\nWc28V6xYgffffx+CIGDp0qWYOjWxdnPlypVYs2YNRFHElClT8P3vfz+bh9LrkofN46ruZNsAnK8H\nWMN6qea8BVmFJIoIBqzgrZutIOubIs6weVu8De1qGAEpALs1hSKLKJMSwbs0zaYbfc09593duWq7\nOMkO2tnecEBI7pd6gOSkJi3MvIkoG7KWeb/55pvYtm0bnnrqKSxfvhzLly93vtfa2opHHnkEK1eu\nxBNPPIGtW7fivffey9ahZEVyG8u4qnn7lVvBu9Rqv6m72qMaqnnNJPniePHzv6MdjQDMOe9ia39i\nSfdBgIDWeBva42FnO0/ALFgr8yeCd1mgf3ZOch9z8qYkXWVn3nZGm+3t9xLD5t2M3slLBhm7iSgL\nsvZJuG7dOsyda+5WNWbMGDQ1NaG11exzrSgKFEVBe3s7VFVFOBxGWVn6/Vb7Ql1jGI+t/djZDjJZ\nqszbvVOY0/LUCgLujUnsrFoYsAN/+XwtXtn5uvVY0Wk6IUkiipUgGqNNiGgRTxaryKI3ePeDOe9U\nZDExsNPtzFtK7K8N5K5Pc7eLxa0n9tU2gURUGLI2bF5fX4/JkxPLlyorK1FXV4eSkhL4/X5ce+21\nmDt3Lvx+P04//XSMGtV5v+aKiiBkuXeXCVVVpZ8rXrHyHWzZ3oiyUABXnNVxO8rikOJ5viCJiXXb\ngJN5y4qEqqoQJFkCYEAQAD3uAwLtHY+nrBhl1hy5IokYXl6DD+o+BQBUliSC9eDqEAJFiYA9fNAg\nVA3su3nvdOfRCNYC7wB6OIjSEn+n5zudygpzSkCWRef5AZ+E8cMruvV6mVx46kT86JE3cN68Q7r1\n+iWhAKqqQrj6nKn45aoNOHXO6C69TjbeS6HhOewdPI89l4tzmLNqc/cwZGtrKx5++GG89NJLKCkp\nwSWXXIKPPvoIEyZMSPv8hoaOwa4nqqpCqKtrSfv9fY1h69/2lI/b19CCOiVxf2tbzDNsPn54CB/s\nBMLhOOrqWtAeiSWK1TQZhi4msnPLVacfin+tM3+uKAoY5B+ED2AGb9lINKNoaQ5DiyZ+dVq72Ol7\nyabOzqMAH04oPh8vvl0HjDO6dYzhtqj1WnCe/7MbjoMgICvveVRVMX57y4kQRaFbr9/cHEZdXQtm\njhuIw7v4Opn+FikznsPewfPYc719DtNdCGRt2Ly6uhr19fXO7b1796KqytzcYuvWrRg2bBgqKyvh\n8/lwxBFHYNOmTdk6lG6xLzbSjdJ2GDbX9KRtPs3MW7NeJ2q0QvBHrBcXALXjdZMsys7wuiyJqA3V\nON9zL7tSJNFTCFWi9M9hcwCo8g0GNB/8Svf+1Ox13u4qc9GqPM+W3hqaL8StGIkoN7IWvOfMmYO1\na9cCADZv3ozq6mqUlJhBpra2Flu3bkUkYgazTZs2YeTIkdk6lG4xUqzTdY8edChYi2uA7B42N7Nq\nOxjvqFqDwNRXrRcSYcQ7NpdQRBmqtaRMEgUMK0kEb/dabrt/+IiQuctSd/t254IdfANK9wZ5ZDm3\nc91ERPkga8PmM2bMwOTJk7FgwQIIgoBly5Zh9erVCIVCmDdvHr7xjW9g0aJFkCQJ06dPxxFHHJH5\nRXPIvdRH1VX88aNVmO3aBlQ1OmbeYlFiqMQQzO87Veae1xZgtJVBLDYff+GEr+HThs9RVTQQmlYH\nwCxYqykZjONqZ0MWZcypmYUnsN45JgD47uGLu70eOVfsgjNfN1qjAole6WKWq8x7C+vUiCgXsjrn\nfdNNN3luu+e0FyxYgAULFmTzx/eIu8nGxvoP8cbut/HG7red76tJvc1jqg6xsinxfGgQBQGaYeCL\nvc3eFzdE6K3lQPUOAImtO4FEm1VZFCAKIs7vZH9nScxun+/eYGfe3a02l6zny3mSeff3iykiOjjk\nRzrTBxKZd+pGG8lz3jEtBiHYAr3NrArXoEIUBei6gR/8YZ33yboIvS310rhZE83+2ccfVpPy+/mm\nImQO6Q8o7V7v9UTm3b+D9xGHmPUcIwf3zzX3RHRwYW/zDARBgF/s2Jc7ntzbXG6EIBjQWiogBJuh\nG6qzx7Wn8xoAASLu/8ZXsOaLCMaVj/Z878hJgzC2tgyVKTZc+NkNxyY6teWJMbVluPvq2RhY1r3g\nbQ+79/fg/c2zJuO8ligGlhVlfjARUQ8xeKfhDJsLqbt6JQ+bq4K5lE2PBiHpkpN5R2Kad/03zD2m\ny0sCWDTp/JQ/e0CaQJevexdXl3c/oLl7m/dnkigycBNRznDYPI3EUjEBmqF3+H6HLUFFa9vOmB/Q\nRWiGBkkUsLehvUPmDZ2nvavsXuH9PfMmIsolRpE03FXDWlKWDXirzQ3DgC5Za7jjPhi6BNWIQxIF\nGAY6ZN727mCUmZ1550vBGhFRLjCKpJEp845riYC8e387oJidwIy4HzBEqIaayBalpODP4N1lSp7M\neRMR5RKjSBruOW/N6Dzz/vmfNkFQYgCAUn8I0BKZNwAIHQrWGIi6yqdIkCUBRT6WZxAR2Ri803A3\nadFTDZu75rwjMRWCEoUiKvjhJbNRO6AUcT0OwT67ycPmev9fn91fyJKI755/GM4/aWxfHwoRUb/B\ndCYDM/NOVbCWCOiabkDyx1DmC6G02I/yYDF2RXRIkpW+JxesaflZNd5XDhle0deHQETUrzDzTkN3\nNWlJOeftWuet6ToMKWoOmQPwS+YabdGa6xaS57w1XjMREVH3MXin47RHFVLPebuGzXUhBggGQtbu\nXgEreAv2RiVi0rC51rHpCxERUVcxeKdhrxRLV7AW01QnOzdEs9K8WDG37fRbu3wJaTNvDpsTEVH3\nMXhnIKYpWPt8dyN+9Zy5B7kmmpXmQTt4S1ZmbQftDpk3h82JiKj7GLy7INWcN0QNb31sbt9pWMG7\nWDaDtzNszsybiIiygME7A90wUg6bu7umGZJZvNZh2FxUARgQ/OGkF2XmTURE3cfgnYFupMm8reBt\nGAYMKXnY3NoRTFQhDdoGsbjZ05iFTVqIiKgnGLwzMAwj5Zy3ORRuQDcMCLKdeZu7StnD5pBUSKX7\nAQCXTlqQk+MlIqKDH4N3BuaweYrMGwAkFf/e+SaU2q0AgGDSnLchqBCKWmDEFVQFB+bkeImI6ODH\n4J2BYaRYKmavAZdUPPnpaufu5DlvTYpADIRhREKQRc5zExFR72DwzkDXOxasybDntL33FyctFYvI\n9eY3wqEO+38TERF1F4N3BobRcT9vUU/MaeuRIud+RTSXgNnD5mFxHwBAiIYwpHgQJMOH+M4xEFiv\nRkREPcDgnYFhGNCT5rwF3cysBUmFEU0Eb8GKyvawuV1ULmh++CQfZukLoe4cl/2DJiKigxqDdwYp\nC9bsJiuSCgjmBPjo8Hzn285SMYtgPV4wmHITEVHPMXin8NH+TyH4zMYqqQrWjLgVjK3gbRgCSvUa\n5/uKKENxFagJujeYExER9QSDd5KWWCt++t5v4J/2CgAz8/7vnibPY1S79kxSIQgGYAiQRG9WHfKF\nnK9FnbuIERFR72HwTtIWbwcAp6hM1XTsaWjzPMauX7MzbxgCxKQzGfKVOF+LOnuZExFR72HwThLT\nY57bcVV35rVtumadNsnsXW4Gb++pLHUFbwHmELoB7+sQERF1B4N3koga8dyOxXVAMAvWoh8dgaJw\nLbR6c37bnXlLSeu/Qkpi2Dz5e0RERD3B4J2kPSl4x1XNybz15gEI7T0aajQAABCUqBO8haQzWepP\nBG8hKXgn3yYiIjoQ7NmZJBz3bt8Zs4bNDQMABLRH44Dqg6EqEAJt1lruFAVrimvOW2SwJiKi3sPM\nO0lY6zhsLgg6YJinqj1ilprr4WIIgTAEUYNhCB0CtGepGDNtIiLqRQzeSbyZt4G4pjtD4wAQjpql\n5kakGIJgQPBFAUPskHlLouR8bX+L5WpERNQbGLyTeDJvwUAsrgGCDlmUMLqmFLo5fg4jXJx4nCFA\nTMquJw+YABgCYtsmdPgeERFRTzB4JwnHXcFbVJ05b1EQ4VcS2bQeDSYel2LYPOQrwYTGi6DtGclh\ncyIi6lUM3knCqmvYXNSdanMR3uAN3fV1ig5rAKwit8SwORERUW9g8E4Sdi0VEyTVWectQoLf5w7Y\n7lPXMfMG4AyxC4zeRETUixi8k3gzbw2abkBwhs1dp8u9Q1iKOW8gEbyd2M2KNSIi6gUM3kncTVoE\n0W5ibgZvn2vY3NATpy7VUjEAmDKyEgBw2NiBnvs5BU5ERD3BJi1JYpqrt7lkB28doiBBkdJn3qnm\nvOfOHIZDhldgWHVJh+8RERF1F4N3kqh7Y5KkzFv2BG9vIE+VeYuCgBGDQx3uJyIi6gkGbxfDMBDX\n4s7txLC5DkmQkoK3O1innvMmIiLKBs55u8R11bttp5TIvCVBhCy5h8q9WXiqYfNkrFcjIqLewODt\nYs93S4JZmCaIGiCqEATAJ/o9mbe7YC3dsHk6zNGJiKgnGLxdolbwDohW9zRRg+Azq89LlNABF6wl\nG1xpvu7omrLeOWAiIipInPN2iVvFakVSEG1aCyCp5sYjAEqVEGQxdcGakWadd7KTZtSiOCBj+riB\nGR9LRESUDoO3i5N5C2aGLEgaBMXMvEt9pZCTsm33110ZNpclEXMOHdJ7B0xERAWJw+YuMavS3O8M\nm6vOsHmZrzT9UrE07VGJiIiyIWPw3rp1ay6Oo1+IWcPmPqMIgJV5W8Pm5f4yyHLP5ryJiIh6Q8bg\n/e1vfxsXXHABVq1ahXA4nOnhec0eNldgBm9zztvMvCuKyrwFa8jc25yIiCgbMs55P//88/jkk0/w\n4osvYuHChZg4cSLOPfdcTJ06NRfHl1N2gxZBV2BoIgRJBXwRGLqIkFKMqBRJ/URD5LA5ERHlTJfm\nvMePH4/rr78eS5YswdatW7F48WJcdNFF+O9//5vlw8stO/OGIQG6DLG4GWKgHdq+wVCUpA5rbhw2\nJyKiHMqYee/cuRN/+tOf8Je//AVjx47F1VdfjWOPPRYbN27EzTffjGeeeSYXx5kT9py3oUkwNAmC\nYt6v7hwHRfL2Nvd0WwOYeRMRUc5kDN4LFy7E17/+dfzhD3/AoEGDnPunTp2aceh8xYoVeP/99yEI\nApYuXep5/K5du/Cd73wH8XgckyZNwp133tmDt9E77A5rhi4CmnlqDEOAEQtAlrztURXZvc5b5Jw3\nERHlTMZh8zVr1mDkyJFO4H7iiSfQ1tYGALj99tvTPu/NN9/Etm3b8NRTT2H58uVYvny55/t33303\nLr/8cjz77LOQJAlffvllT95Hr7CXihmqBEO39u6OKwAESJLgqTZP7rbGYXMiIsqVjMH7e9/7Hurr\n653bkUgEt9xyS8YXXrduHebOnQsAGDNmDJqamtDa2goA0HUdb7/9Nk466SQAwLJly1BTU9OtN9Cb\n7DlvXXNl3roMSTSryd0BO3nZGIfNiYgoVzIG78bGRixatMi5fdlll6G5uTnjC9fX16OiosK5XVlZ\nibq6OgDA/v37UVxcjLvuugsXXHAB7r///u4ce6+z57x1VYSzFEyTnEDtnvNOzrwZvImIKFcyznnH\n43Fs3boVY8aMAQBs2rQJ8Xg8w7M6MgzD8/WePXuwaNEi1NbW4qqrrsLLL7+ME044Ie3zKyqCkGXp\ngH9uZ6qqQp7bwqfmMUqiD7D28jZ0CQFFQlVVCEUlifddFFDgXMIYAgYOKO7weoWiUN93b+I57Dme\nw97B89hzuTiHGYP39773PSxevBgtLS3QNA2VlZW49957M75wdXW1Z7h97969qKqqAgBUVFSgpqYG\nw4cPBwDMnj0bn376aafBu6GhPePPPBBVVSHU1bV47mtpN39GuN2A4Lf28tYlSKKAuroWxOJa4sGu\nixEYApoa2+EvwOQ71XmkA8Nz2HM8h72D57HnevscprsQyDhsPm3aNKxduxbPP/881q5dixdffLFL\nmfecOXOwdu1aAMDmzZtRXV2NkpISAIAsyxg2bJizTnzz5s0YNWpUV99L1tjV5mocTuYNXXIqy93z\n3JJnqRg7rBERUe5kzLxbW1vx5z//GQ0NDQDMYfRVq1bhtdde6/R5M2bMwOTJk7FgwQIIgoBly5Zh\n9erVCIVCmDdvHpYuXYolS5bAMAyMHz/eKV7rS1E9BlmUoaqAPedtqDJ8VtB2B2jJ9bVhCDBARESU\nGxmD9w033ICamhq89tpr+MpXvoLXX38dP/jBD7r04jfddJPn9oQJE5yvR4wYgSeeeOLAjjbL4loc\nPlFBXNUhfjEdyvCPEd5+CJSqjgMU7gI1QTAYvImIKGcyDptHo1HceeedqK2txa233orHHnsML774\nYi6OLeeiWgw+yYeYqkNRy1C291hA9UNJUSgnJbdKNRi+iYgoNzIG73g8jvb2dui6joaGBpSXl2P7\n9u25OLaci2kx+CQz81ZkCbpuBmR3NzWbuymLJAuoCAVydpxERFTYMg6bn3XWWXj66adx7rnn4rTT\nTkNlZSVGjBiRi2PLuZgeQ7lYigZVQzCgQNV0AHDmvN3cwfvsY0alDPBERETZkDF42wVngLmka9++\nfZg4cWLWDyzXDMNATIvDJ/kQ13Qosoj2iAogc+bNGW8iIsqljOmiu7vaoEGDMGnSJCeYH0xUXYUB\nwwzeqg6fLELVzcxbSbEVqOgJ3kRERLmTMfOeOHEifvKTn2D69OlQFMW5f/bs2Vk9sFyLWq1RFVGB\nqhlQZBGaZs15KykK1kT3rmIM30RElDsZg/eHH34IAHjrrbec+wRBOOiCt92gRbY28VZkyZnzTpV5\ne4fN9RwcIRERkSlj8H788cdzcRx9zt4ONBG8RSd4y3IiUFeVB1DXGPEOmzPzJiKiHMoYvC+88MKU\nc9wrV67MygH1leTM2yeLUK1hc9k1RL78yqMQi2t4+p9bnftYsEZERLnUpQ5rtng8jvXr1yMYDGb1\noPqCvZe3CHN+293HXHb1MZcl0dkaVI8UQQyEUSQX5fBIiYio0GUM3rNmzfLcnjNnDq688sqsHVBf\nienmsLmExLC5rUM3Nfs5Hx+B4NCdOPb4g2v+n4iI+reMwTu5m9quXbvw+eefZ+2A+krMybzNU+Ju\nzCKLqZbGGTCixZD3TIFPUlJ8n4iIKDsyBu9LLrnE+VoQBJSUlOC6667L6kH1BSd4GzIA3ZN5y510\nTzv4VrwTEVF/lzF4/+Mf/4Cu6xCtoq14PO5Z732wiOnu4B3zbEYipxk2JyIi6gsZo9LatWuxePFi\n5/ZFF12El156KasH1RfsgjUY5ilxr+2WUgybc3UYERH1lYzB+9FHH8WPf/xj5/bvfvc7PProo1k9\nqL4Qt9Z5Q7fmvBV3wVr6wfGDsVUsERH1bxmDt2EYCIVCzu2SkpKDMmBFtCgAQLCCtzvzdq/ztjHx\nJiKivpJxznvKlCm44YYbMGvWLBiGgVdffRVTpkzJxbHllB287cxbUdzrvDnnTURE/UfG4H3bbbdh\nzZo12LBhAwRBwJlnnolTTjklF8eWU1HVCt6anXm7C9YOvpEGIiLKXxmDdzgchqIouP322wEATzzx\nBMLhMIqLi7N+cLlkZ96GZgZt91KxUNDX4fFVZQEAQG3VwXUeiIio/8s4Hnzrrbeivr7euR2JRHDL\nLbdk9aD6gp1566oZvH2yiOVXHolLTjkEIwaHOjz+lCOH44KTx+HKMybl9DiJiIgyBu/GxkYsWrTI\nuX3ZZZehubk5qwfVFyJaBIqoQNPM24osYsiAYhx/WG3KxyuyhHkzh6XMyomIiLIpY/COx+PYujWx\ng9bGjRsRj8ezelB9IaJFEZD8iKvWHt6ddFUjIiLqSxnnvL/3ve9h8eLFaGlpga7rqKiowL333puL\nY8upqBpFQPYjxuBNRET9XMYINW3aNKxduxarVq3CkiVLUF1djWuuuSYXx5ZTyZm3z9UelYiIqD/J\nmHm/9957WL16NV544QXouo4f/ehHmD9/fi6OLWd0Q0dUi8Ev+xHXmHkTEVH/ljZC/eY3v8Fpp52G\nG2+8EZWVlVi1ahWGDx+O008//aDbmMTuax6Q/IjHzYo1Bm8iIuqv0mbeDz74IMaOHYs77rgDRx11\nFICDt4931FrjHZADaGPmTURE/Vza4P3yyy/jT3/6E5YtWwZd13H22WcflFXmABCx1nj7JbNgTRBS\n7yRGRETUH6RNL6uqqnDVVVdh7dq1WLFiBb744gvs3LkTV199NV555ZVcHmPWOZm3VbDmk6WDdpSB\niIjyX5fGhmfOnIm7774br776Kk444QT8/Oc/z/Zx5VRYjQCAWbCm6hwyJyKifu2AolRJSQkWLFiA\np59+OlvH0ye8mbfG4E1ERP0aoxSA9ngYAFAkB9DcHkdx4OCqpiciooMLgzeAdtUM3qLuRzSmoao8\n0MdHRERElB6DN4D2eDsAIBoxT0dVeVFfHg4REVGnGLwBtFmZd6SNwZuIiPo/Bm8kMu+WVvM2gzcR\nEfVnDN5IzHk3NZnd1TjnTURE/RmDN4C2eDsUUUFLmxm8K0L+Pj4iIiKi9Bi8YQ6bFytBRGPmpiQ+\nhduBEhFR/8XgDXPYPCgXIRLX4FNEiGyNSkRE/VjBB2/d0BFWIwgqRYjFNfiZdRMRUT9X8ME7rEZg\nwECxHEQ6NoGoAAAYmElEQVSUwZuIiPIAg7ddad6so6E5Cr+PwZuIiPq3gg/eMc3co3zL9jYYADNv\nIiLq9wo+eMd1M3gbunkqGLyJiKi/Y/DWVfMLBm8iIsoTDN5W5g3dDNo+peBPCRER9XMFH6ni1pw3\nDPNUBFiwRkRE/RyDtzPnbWfeDN5ERNS/MXhzzpuIiPIMg7cz583gTURE+YHBW/MOmzN4ExFRf5fV\n4L1ixQqcf/75WLBgATZs2JDyMffffz8WLlyYzcPolDNsbhWsiSI3JSEiov4ta8H7zTffxLZt2/DU\nU09h+fLlWL58eYfHbNmyBf/5z3+ydQhdkrxUTNP0PjwaIiKizLIWvNetW4e5c+cCAMaMGYOmpia0\ntrZ6HnP33XfjxhtvzNYhdEksqcOapht9eThEREQZZS1419fXo6KiwrldWVmJuro65/bq1asxa9Ys\n1NbWZusQukR1qs3NzLu4SOnDoyEiIspMztUPMoxERtvY2IjVq1fj0UcfxZ49e7r0/IqKIGS5d4vJ\nqqpCkD63bugi5h85Al89aTwkznsfkKqqUF8fQt7jOew5nsPewfPYc7k4h1kL3tXV1aivr3du7927\nF1VVVQCA9evXY//+/bjooosQi8XwxRdfYMWKFVi6dGna12toaO/V46uqCqGurgXN7ebrGrqEuTNq\nsH9fa4Znkpt9Hqn7eA57juewd/A89lxvn8N0FwJZGzafM2cO1q5dCwDYvHkzqqurUVJSAgA45ZRT\n8MILL+Dpp5/Gz372M0yePLnTwJ1NqqvaXBILfuUcERHlgaxl3jNmzMDkyZOxYMECCIKAZcuWYfXq\n1QiFQpg3b162fuwBi7matMgSh8uJiKj/y+qc90033eS5PWHChA6PGTp0KB5//PFsHkannI1JdAmy\nxMybiIj6v4KPVqquWg1aBBaqERFRXij44B3T4xAMs4qdmTcREeWDgo9WcSt4CwJboxIRUX5g8NZU\nCKw0JyKiPFLwESuuxwFDYqU5ERHlDQZvPW4tEyv4U0FERHmioCOWYRiIaXFAl1lpTkREeaOgg7dq\naDBgwGCDFiIiyiMFHbxjWsz8QpcgcdiciIjyREFHLDt4G5rEYXMiIsobhR28rb7mhsaCNSIiyh8F\nHbHszFtn5k1ERHmkwIM3M28iIso/BR2xYnoi82a1ORER5YvCDt4sWCMiojxU4MHb3stb5FIxIiLK\nGwUdsRLrvGXOeRMRUd4o6IjlLBXTRQ6bExFR3ijs4O3qsMaCNSIiyhcM3gCgSdzPm4iI8kZBR6zE\nsLkERSnoU0FERHmkoCOWe9hcYcEaERHliYKOWFFnqZgEHzNvIiLKEwUdseJWhzWDmTcREeWRgo5Y\nMVfmrchS3x4MERFRFxV08I46c94iFLmgTwUREeWRgo5Yqq5CggRAgI/Bm4iI8kRBRyzVUCEKMgAw\n8yYiorxR0BErrschwpzrZvAmIqJ8UdARK66pruDNgjUiIsoPBR28VUOFYJingJk3ERHli4KOWKqu\nQrAybxasERFRvijoiBXXVQgG57yJiCi/FGzEMgzDzLw5bE5ERHmmYCOWqqvmF8y8iYgozxRsxIpr\nVvDW7cyb1eZERJQfCjd423t5W8PmLFgjIqJ8UbARy868DZ1z3kRElF8KNmLFrMwbmghBACRR6NsD\nIiIi6qKCDd6qlXnrugBFFiEIDN5ERJQfCjZ423t5G5oIRSrY00BERHmoYKOWXbCmaQJ8CivNiYgo\nfxRu8LaHzTWBmTcREeWVgo1acd0O3iIUpWBPAxER5aGCjVpxa85bUwXIYsGeBiIiykMFG7Xcw+ay\nzEpzIiLKH4UbvK2CNV0TITHzJiKiPFKwUcvpbW6IkCVm3kRElD8KN3jbvc11ETKrzYmIKI8UbNSy\nm7TAENkalYiI8krBBm9nP29dhMTMm4iI8kjBRq2Ys6uYBJmZNxER5ZGCDd5x97A5C9aIiCiPyNl8\n8RUrVuD999+HIAhYunQppk6d6nxv/fr1eOCBByCKIkaNGoXly5dDzOGSrYgaNb/QJBasERFRXsla\n1HrzzTexbds2PPXUU1i+fDmWL1/u+f4dd9yBhx56CE8++STa2trw6quvZutQUgqrEQCAocssWCMi\norySteC9bt06zJ07FwAwZswYNDU1obW11fn+6tWrMXjwYABAZWUlGhoasnUoKUXiZvCGJjPzJiKi\nvJK1qFVfX4+KigrndmVlJerq6pzbJSUlAIC9e/fi9ddfx/HHH5+tQ0kprEYhQLCqzZl5ExFR/sjq\nnLebYRgd7tu3bx+uvvpqLFu2zBPoU6moCEKWe2/f7XA8Ap/kRzsElJYEUFUV6rXXLjQ8dz3Hc9hz\nPIe9g+ex53JxDrMWvKurq1FfX+/c3rt3L6qqqpzbra2tuPLKK3HDDTfgmGOOyfh6DQ3tvXp8YTUC\nBQoAIBqNo66upVdfv1BUVYV47nqI57DneA57B89jz/X2OUx3IZC1YfM5c+Zg7dq1AIDNmzejurra\nGSoHgLvvvhuXXHIJjjvuuGwdQqci8QgU0QcALFgjIqK8krXMe8aMGZg8eTIWLFgAQRCwbNkyrF69\nGqFQCMcccwyee+45bNu2Dc8++ywA4IwzzsD555+frcPpIKxGUSGXAgAL1oiIKK9kdc77pptu8tye\nMGGC8/WmTZuy+aM7FddVqLoKQTffPoM3EVHfevnlv+OEE07u0mN/8pP7ce65C1BTU5vlo+q/CjJq\nRa0GLbv2xgBw2JyIqC/t2vUl/va3tV1+/PXXf7egAzeQw2rz/iSimcHb0MzqdS4VIyLqOw88cA8+\n/HAzHn30N9B1HV9+uRO7dn2JBx/8Be66607U1e1FOBzG5ZdfhTlzjsV1112F73znFvzzn39HW1sr\nvvhiG3bu3IFvf/u7mD17jvO6qqpi+fIfdHj+J598hPvvvweiKGDKlGm49trrU95n/5zRo8di1aqn\n0NjYiOnTD8eTT/4v2tvbcd11N+Ldd9/Gyy//HbquY/bsObj11u+ipaUFd955G9ra2lBSUoI77vgf\nXH75Rfj9759AMBjEhg3v4cknV2LFih93+5wVZPCOWsEb9rB5DtuyEhH1Z0//Ywv+89HeXn3NmROq\ncd5JY9N+/4ILFmL16qdx2WVX4pFHHoaqxvGLX/wWDQ37MWvWUTj11DOwc+cO3H77EsyZc6znuXv3\n7sF99z2E9ev/jT//eZUneLe0NKd8/oMP3oebb16KsWPH4Uc/ugO7d+9KeV86W7duwRNPrIbP58O7\n776NX/zitxBFEeeddxauvfabeOKJxzFr1myce+4CPPXUSrzzzls47rgT8dpr/8L8+afgtddewbx5\nX+nROS3I4G33NTc08+0z8yYi6j8mTpwMAAiFSvHhh5uxZs1qCIKI5uamDo+dOvUwAObyZHcXz86e\n/8UX2zB27DgAwO2335n2vnTGjh0Hn89crRQIBHDddVdBkiQ0NjaisbERn3zyEa644hoAwPnnXwQA\nqKmpxW9/+0vMn38K3n33bXzjG1cf+IlxKczgrSU2JQFYsEZEZDvvpLGdZsm5oChmD46//vUlNDc3\n4+c//y2am5txxRULOzxWkhLNu5KbgaV7fqpNsFLdJwiJxE5V1Q7Ht3v3Ljz11Er87ncrEQwGsXDh\nedZrSTAM3fNaY8eOw759+/Dhh5sxatQY+P3+zk9CBgUZtSL2piR25s2CNSKiPiOKIjRN63B/Y2Mj\nhgypgSiKeOWVfyAejx/Q66Z7/siRo7B5s7ni6a677sR///t5yvuKi4uxb5/ZbGzjxvdTvn5FRQWC\nwSA+/vgj7N69G/F4HBMnTsLbb/8HAPDcc6vw4ot/AQCcdNI8PPDAPZg375QDeh+pFGTwthlx88qH\nmTcRUd8ZMWIUPv74Izz00P2e+0844ST8+9+v4vrrr0FRURGqq6vx6KO/6fLrpnv+9dffhJ/97P/D\nNdd8A6FQKUaOHJXyvjPPPAf3338vbr75egwcWNXh9ceNG4+ioiCuueZy/P3v/4ezzjoHP/zhD3Hu\nuRdg06YNuO66q/Dvf7+G448/EQBw8snzsHfvXhx++MyenTAAgpGq6Xg/1Jvt5uJaHNf89hnojdWA\nIeKWC6ZjwojOe6tTamyn2HM8hz3Hc9g7eB57rrNz+Pzza7B79y584xvfPKDXS6Ug57wVSYHeMNi5\nzcybiIiy6Z57/gdffrkTd911X6+8XkEG72SsNiciomy69dbbevX1CjLl1HXvTAEL1oiIKJ8UZPCO\nxr1VjRw2JyKifFKQUSvWIXgz8yYiovxRkME7OfOW2B6ViIjySEFGrWjc2/mGmTcRUd96+eW/H/Bz\n3nvvHTQ07M/C0fR/hRm8Y0mZN+e8iYj6zIFuCWp7/vk1BRu8C3KpWMdhc2beRER9xb0l6PnnX4gV\nK36IlpYWaJqGG264GWPHjsP//u/v8cor/4Qoipgz51hMnDgJr776Mj7//DP8z//ci8GDzd4dfbEN\n6OWXX+VsAxqLReD3F2VlG1A3Bm+w2pyIyLZ6y1/w7t6Nvfqa06sPxTljz0j7ffeWoL///W9x5JFH\n4//9v6/i888/w09+ch8efPAXePLJ/8Vzz70ESZLw3HOrMHPmURg7djy+851bnMAN9M02oOeff6Gz\nDejixVfiZz/7VVa2AXVj8AabtBAR9RcbN25AY2MD1q59AQAQjZobSZ1wwsm44YbFmDfvFMyfn35j\nj77YBrS5uTkn24C6FWTwrgz54ZNF6IYBVTMgCgzeREQAcM7YMzrNkrNNUWTceOPNmDJlquf+m276\nHrZt+y/+8Y+/4lvf+iZ+/es/pHz+wbwNqOfYe+2V8sghwyvw1IrT8fBNJ+DXN5/Q14dDRFTQ3FuC\nTpo0Bf/618sAgM8//wxPPvm/aG1txaOP/gYjRozEZZddiVCoDO3tbSm3Ej2YtwH1nLNefbU8Iksi\nBEHgfDcRUR9zbwn69a+fj507t2Px4itwzz3/g8MOm4GSkhI0NjbgyisX4dvfvhqTJ09BaWkZDjts\nBm677VZ89tlW57X6YhvQ+++/x9kGdOHChVnbBtStILcEBbj1XW/heew5nsOe4znsHTyPPZd8Druz\nDWjy66VSkHPeRERE2dbb24C6MXgTERFlQW9vA+rGCV8iIqI8w+BNRESUZxi8iYiI8gyDNxERUZ5h\n8CYiIsozDN5ERER5hsGbiIgozzB4ExER5Zm8aY9KREREJmbeREREeYbBm4iIKM8weBMREeUZBm8i\nIqI8w+BNRESUZxi8iYiI8kxB7ue9YsUKvP/++xAEAUuXLsXUqVP7+pD6tU8++QSLFy/GpZdeiosv\nvhi7du3CLbfcAk3TUFVVhR//+Mfw+XxYs2YN/vCHP0AURZx33nk499xz+/rQ+417770Xb7/9NlRV\nxTe/+U0ceuihPIcHIBwOY8mSJdi3bx+i0SgWL16MCRMm8Bx2UyQSwRlnnIHFixdj9uzZPI8H4I03\n3sD111+PcePGAQDGjx+PK664Ivfn0Cgwb7zxhnHVVVcZhmEYW7ZsMc4777w+PqL+ra2tzbj44ouN\n2267zXj88ccNwzCMJUuWGC+88IJhGIZx//33GytXrjTa2tqM+fPnG83NzUY4HDZOP/10o6GhoS8P\nvd9Yt26dccUVVxiGYRj79+83jj/+eJ7DA/T8888bv/71rw3DMIwdO3YY8+fP5znsgQceeMA455xz\njFWrVvE8HqD169cb3/rWtzz39cU5LLhh83Xr1mHu3LkAgDFjxqCpqQmtra19fFT9l8/nw29+8xtU\nV1c7973xxhs4+eSTAQAnnngi1q1bh/fffx+HHnooQqEQAoEAZsyYgXfeeaevDrtfmTlzJn7yk58A\nAEpLSxEOh3kOD9Bpp52GK6+8EgCwa9cuDBo0iOewm7Zu3YotW7bghBNOAMD/z72hL85hwQXv+vp6\nVFRUOLcrKytRV1fXh0fUv8myjEAg4LkvHA7D5/MBAAYMGIC6ujrU19ejsrLSeQzPa4IkSQgGgwCA\nZ599FscddxzPYTctWLAAN910E5YuXcpz2E333HMPlixZ4tzmeTxwW7ZswdVXX40LLrgAr7/+ep+c\nw4Kc83Yz2B22R9KdP57Xjv72t7/h2Wefxe9+9zvMnz/fuZ/nsOuefPJJfPjhh7j55ps954fnsGue\ne+45HHbYYRg2bFjK7/M8ZjZy5Ehcd911OPXUU7F9+3YsWrQImqY538/VOSy44F1dXY36+nrn9t69\ne1FVVdWHR5R/gsEgIpEIAoEA9uzZg+rq6pTn9bDDDuvDo+xfXn31VfzqV7/Cb3/7W4RCIZ7DA7Rp\n0yYMGDAAQ4YMwcSJE6FpGoqLi3kOD9DLL7+M7du34+WXX8bu3bvh8/n4t3iABg0ahNNOOw0AMHz4\ncAwcOBAbN27M+TksuGHzOXPmYO3atQCAzZs3o7q6GiUlJX18VPnl6KOPds7h//3f/+HYY4/FtGnT\nsHHjRjQ3N6OtrQ3vvPMOjjjiiD4+0v6hpaUF9957Lx5++GGUl5cD4Dk8UG+99RZ+97vfATCnvtrb\n23kOu+HBBx/EqlWr8PTTT+Pcc8/F4sWLeR4P0Jo1a/DII48AAOrq6rBv3z6cc845OT+HBbmr2H33\n3Ye33noLgiBg2bJlmDBhQl8fUr+1adMm3HPPPdi5cydkWcagQYNw3333YcmSJYhGo6ipqcFdd90F\nRVHw0ksv4ZFHHoEgCLj44otx5pln9vXh9wtPPfUUfvrTn2LUqFHOfXfffTduu+02nsMuikQi+P73\nv49du3YhEonguuuuw5QpU3DrrbfyHHbTT3/6U9TW1uKYY47heTwAra2tuOmmm9Dc3Ix4PI7rrrsO\nEydOzPk5LMjgTURElM8KbticiIgo3zF4ExER5RkGbyIiojzD4E1ERJRnGLyJiIjyTME1aSHKN/fe\ney82btyIaDSKDz74ANOnTwcAfO1rX8NXv/rVLr3Gr3/9a4wfP97pZ53KwoUL8fvf/x6SJPXGYXvs\n2bMHn332GWbPnt3rr01UiLhUjChP7NixAxdeeCH+9a9/9fWhHLA1a9Zg69atuPHGG/v6UIgOCsy8\nifLYT3/6U+zYsQNffvklbr31VkQiEdx3333w+XyIRCJYtmwZJk+ejCVLluDwww/H7Nmzcc011+CY\nY47Bhg0b0NbWhocffhiDBg3CIYccgs2bN+OXv/wlGhsbsXv3bmzbtg1HHnkkbr/9dkSjUdx6663Y\nuXMnBg8eDEmSMGfOHM8exW1tbfjud7+L5uZmqKqKE088EWeccQYefPBBGIaB8vJyXHTRRbjzzjux\nbds2tLW14YwzzsDll1+O1atX469//SsEQcCePXswevRorFixAoqi9OEZJuqfOOdNlOd27NiBxx57\nDFOmTEFjYyN+8IMf4LHHHsOiRYvw8MMPd3j81q1bcc4552DlypWYOHEiXnzxxQ6P+eCDD/DQQw/h\n2WefxerVq9HU1IQ1a9ZAVVU888wzuOOOO/D66693eN6///1vqKqKP/7xj3jyyScRDAZRW1uLs88+\nG2eeeSYuu+wyPPbYY6iursbjjz+OZ555Bs8//zw++ugjAMDGjRv///bu2CW1MIzj+NcONQQRQi3W\nYnBsjDoSBFKNOVaEo0M4REO4HGyrKQin5ob+gDBaoiVyECEipakhWkKkQKFoiERPd5DOzYxLlysX\njvw+4+F5X97tx/PyHh7S6TSHh4eUy2VP3jKI/A/qvEU8bmJiAp/PB8DQ0BC7u7u8vb3x8vLC4OBg\nW73f78c0TQACgQBPT09tNZZlYRgGhmHg9/t5fn7m5uaG6elpAIaHh7Esq23d1NQUe3t7bGxsMDc3\nx8rKCj09rT3CxcUFDw8PXF5eAlCr1bi/v3fXf4xPnZyc5O7uzp2TLCK/KbxFPO7ztbJt22xvbzMz\nM8P5+bk7zOOzrw/Svnv28l2N4zgtQfw1lKE5y/j4+JhiscjZ2RnLy8scHR211PT19bG+vs7CwkLL\n90wmg+M4fzyXiDTp2lyki1QqFUzTpNFocHp6Sq1W69jeY2NjFItFAKrVKldXV201uVyObDaLZVnY\ntk1/fz/VahWfz0e9XgeaXf3HVb3jOOzs7Ljd//X1Na+vr7y/v1MoFBgfH+/Y+UW6iTpvkS6SSCSI\nx+MEAgFWV1exbZuDg4OO7L20tEQ2myUWizE6Oko4HG7r0IPBIKlUiv39fQzDIBKJMDIyQjgcJplM\n0tvby9raGre3t8RiMRqNBvPz8+6o1FAoxObmJqVSCdM0iUQiHTm7SLfRr2Ii8iOPj48UCgWi0SiO\n47C4uMjW1pb73/m/ymQy5PN50ul0R/YT6WbqvEXkRwYGBjg5OXHnE8/OznYsuEXk76jzFhER8Rg9\nWBMREfEYhbeIiIjHKLxFREQ8RuEtIiLiMQpvERERj1F4i4iIeMwvRph4T/csGFUAAAAASUVORK5C\nYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f867ef90>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "HNqUFL4deCsL",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 4. Case study: building an RNN\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "YkC1k4HEQ7rw",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "In this exercise we build and train a model similar to the RNNColorbot model that was used in the main Eager notebook. The model is adapted for converting and training in graph mode."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "7nkPDl5CTCNb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To get started, we load the colorbot dataset. The code is identical to that used in the other exercise and its details are unimportant."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "A0uREmVXCQEw",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\n",
+        "  \n",
+        "  Args:\n",
+        "    line: A comma-separated string containing four items:\n",
+        "        color_name, red, green, and blue, representing the name and\n",
+        "        respectively the RGB value of the color, as an integer\n",
+        "        between 0 and 255.\n",
+        "\n",
+        "  Returns:\n",
+        "    A tuple of three tensors (rgb, chars, length), of shapes: (batch_size, 3),\n",
+        "    (batch_size, max_sequence_length, 256) and respectively (batch_size).\n",
+        "  \"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "\n",
+        "def maybe_download(filename, work_directory, source_url):\n",
+        "  \"\"\"Downloads the data from source url.\"\"\"\n",
+        "  if not tf.gfile.Exists(work_directory):\n",
+        "    tf.gfile.MakeDirs(work_directory)\n",
+        "  filepath = os.path.join(work_directory, filename)\n",
+        "  if not tf.gfile.Exists(filepath):\n",
+        "    temp_file_name, _ = six.moves.urllib.request.urlretrieve(source_url)\n",
+        "    tf.gfile.Copy(temp_file_name, filepath)\n",
+        "    with tf.gfile.GFile(filepath) as f:\n",
+        "      size = f.size()\n",
+        "    print('Successfully downloaded', filename, size, 'bytes.')\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = maybe_download(os.path.basename(url), data_dir, url)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None, None], []))\n",
+        "  return dataset\n",
+        "\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\""
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "waZ89t3DTUla",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we set up the RNNColobot model, which is very similar to the one we used in the main exercise.\n",
+        "\n",
+        "Autograph doesn't fully support classes yet (but it will soon!), so we'll write the model using simple functions."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9v8AJouiC44V",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def model_components():\n",
+        "  lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "  lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "  upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "  upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "  relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "  relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "  return lower_cell, upper_cell, relu_layer\n",
+        "\n",
+        "\n",
+        "def rnn_layer(chars, cell, batch_size, training):\n",
+        "  \"\"\"A simple RNN layer.\n",
+        "  \n",
+        "  Args:\n",
+        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "  Returns:\n",
+        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "  \"\"\"\n",
+        "  hidden_outputs = []\n",
+        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "  n = tf.shape(chars)[0]\n",
+        "  i = 0\n",
+        "  while i < n:\n",
+        "    ch = chars[i]\n",
+        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "    hidden_outputs.append(cell_output)\n",
+        "    i += 1\n",
+        "  hidden_outputs = hidden_outputs.stack()\n",
+        "  if training:\n",
+        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "  return hidden_outputs\n",
+        "\n",
+        "\n",
+        "def model(inputs, lower_cell, upper_cell, relu_layer, batch_size, training):\n",
+        "  \"\"\"RNNColorbot model.\n",
+        "  \n",
+        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "  followed by a fully connected layer with ReLU activation.\n",
+        "  \n",
+        "  Args:\n",
+        "    inputs: A tuple (chars, length)\n",
+        "    lower_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    upper_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    relu_layer: An object of type tf.layers.Dense\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "    \n",
+        "  Returns:\n",
+        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "  \"\"\"\n",
+        "  (chars, length) = inputs\n",
+        "  chars_time_major = tf.transpose(chars, [1, 0, 2])\n",
+        "  chars_time_major.set_shape((None, batch_size, 256))\n",
+        "\n",
+        "  hidden_outputs = rnn_layer(chars_time_major, lower_cell, batch_size, training)\n",
+        "  final_outputs = rnn_layer(hidden_outputs, upper_cell, batch_size, training)\n",
+        "\n",
+        "  # Grab just the end-of-sequence from each output.\n",
+        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "  sequence_ends = tf.gather_nd(final_outputs, indices)\n",
+        "  return relu_layer(sequence_ends)\n",
+        "\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "JjK4gXFvFsf4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The train and test functions are also similar to the ones used in the Eager notebook. Since the network requires a fixed batch size, we'll train in a single shot, rather than by epoch."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "ZWQMExk0S6X6",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  iterator = train_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "    optimizer.minimize(loss)\n",
+        "    if step % (num_steps // 10) == 0:\n",
+        "      print('Step', step, 'train loss', loss)\n",
+        "    step += 1\n",
+        "  return step\n",
+        "\n",
+        "\n",
+        "def test(eval_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  total_loss = 0.0\n",
+        "  iterator = eval_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=False)\n",
+        "    total_loss += loss_fn(labels, predictions)\n",
+        "    step += 1\n",
+        "  print('Test loss', total_loss)\n",
+        "  return total_loss\n",
+        "\n",
+        "\n",
+        "def train_model(train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps):\n",
+        "  optimizer = tf.train.AdamOptimizer(learning_rate=0.01)\n",
+        "\n",
+        "  train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps=tf.constant(train_steps))\n",
+        "  test(eval_data, lower_cell, upper_cell, relu_layer, 50, num_steps=tf.constant(2))\n",
+        "\n",
+        "  print('Colorbot is ready to generate colors!\\n\\n')\n",
+        "  \n",
+        "  # In graph mode, every op needs to be a dependent of another op.\n",
+        "  # Here, we create a no_op that will drive the execution of all other code in\n",
+        "  # this function. Autograph will add the necessary control dependencies.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "iopcs5hXG2od",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we add code to run inference on a single input, which we'll read from the input.\n",
+        "\n",
+        "Note the `do_not_convert` annotation that lets us disable conversion for certain functions and run them as a `py_func` instead, so you can still call them from compiled code."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DyU0wnnAFEYj",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "@autograph.do_not_convert(run_as=autograph.RunMode.PY_FUNC)\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def inference(color_name, lower_cell, upper_cell, relu_layer):\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "  chars = tf.expand_dims(chars, 0)\n",
+        "  sequence_length = tf.expand_dims(sequence_length, 0)\n",
+        "  pred = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, 1, training=False)\n",
+        "  pred = tf.minimum(pred, 1.0)\n",
+        "  pred = tf.expand_dims(pred, 0)\n",
+        "  draw_prediction(color_name, pred)\n",
+        "  # Create an op that will drive the entire function.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Nt0Kv5OCHip0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we put everything together.\n",
+        "\n",
+        "Note that the entire training and testing code is all compiled into a single op (`tf_train_model`) that you only execute once! We also still use a `sess.run` loop for the inference part, because that requires keyboard input."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "-GmWa0GtYWdh",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 668
+        },
+        "outputId": "61f4af1d-c81e-44db-9079-1a7b8ed8ce58",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345877153,
+          "user_tz": 240,
+          "elapsed": 75500,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def run_input_loop(sess, inference_ops, color_name_placeholder):\n",
+        "  \"\"\"Helper function that reads from input and calls the inference ops in a loop.\"\"\"\n",
+        "\n",
+        "  tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "  while True:\n",
+        "    with tb.output_to(0):\n",
+        "      try:\n",
+        "        color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "      except (EOFError, KeyboardInterrupt):\n",
+        "        break\n",
+        "    if not color_name:\n",
+        "      break\n",
+        "    with tb.output_to(0):\n",
+        "      tb.clear_tab()\n",
+        "      sess.run(inference_ops, {color_name_placeholder: color_name})\n",
+        "      plt.show()\n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  # Read the data.\n",
+        "  batch_size = 64\n",
+        "  train_data = load_dataset(data_dir, train_url, batch_size)\n",
+        "  eval_data = load_dataset(data_dir, test_url, 50, training=False)\n",
+        "  \n",
+        "  # Create the model components.\n",
+        "  lower_cell, upper_cell, relu_layer = model_components()\n",
+        "  # Create the helper placeholder for inference.\n",
+        "  color_name_placeholder = tf.placeholder(tf.string, shape=())\n",
+        "  \n",
+        "  # Compile the train / test code.\n",
+        "  tf_train_model = autograph.to_graph(train_model)\n",
+        "  train_model_ops = tf_train_model(\n",
+        "      train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps=100)\n",
+        "  \n",
+        "  # Compile the inference code.\n",
+        "  tf_inference = autograph.to_graph(inference)\n",
+        "  inference_ops = tf_inference(color_name_placeholder, lower_cell, upper_cell, relu_layer)\n",
+        "  \n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    \n",
+        "    # Run training and testing.\n",
+        "    sess.run(train_model_ops)\n",
+        "     \n",
+        "    # Run the inference loop.\n",
+        "    run_input_loop(sess, inference_ops, color_name_placeholder)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "('Successfully downloaded', 'train.csv', 28010L, 'bytes.')\n",
+            "('Successfully downloaded', 'test.csv', 2414L, 'bytes.')\n",
+            "Step 0 train loss 0.37890616\n",
+            "Step 10 train loss 0.18515904\n",
+            "Step 20 train loss 0.0892782\n",
+            "Step 30 train loss 0.07883155\n",
+            "Step 40 train loss 0.08585831\n",
+            "Step 50 train loss 0.09302989\n",
+            "Step 60 train loss 0.089012615\n",
+            "Step 70 train loss 0.07275697\n",
+            "Step 80 train loss 0.06644974\n",
+            "Step 90 train loss 0.0854013\n",
+            "Test loss 0.13216865Colorbot is ready to generate colors!\n",
+            "\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<link rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'></link>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<script src='/nbextensions/google.colab/tabbar_main.min.js'></script>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<div id=\"id1\"></div>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b102d936-3379-11e8-ac70-0242ac110002\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"borderColor\": [\"#a7a7a7\"], \"tabNames\": [\"RNN Colorbot\"], \"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"elementId\": \"id1\"});\n",
+              "//# sourceURL=js_e223a56194"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b103532a-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_b8c6a821fb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b105b28c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_44805e254b"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b106197a-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_a63d3c6c47"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1069f44-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b106197a-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7e203b8bce"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1070f38-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_d53293d4a7"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6d90d5c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b105b28c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_3000dc2c05"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6da872c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_4136f669a3"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dac868-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_2f70dd9aee"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6db07d8-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6dac868-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7226726048"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dcc6fe-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_72e7709865"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAFZCAYAAADHDNdrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAB9JJREFUeJzt3E1Lle0ax+HTF4jeEAyMBhE0DawI\nwsCH0AIlaGBWNJBo0CDoA0TQhmDXuKAGDioiCA2KlEAlnl05FD9Co8BeaGCQoBDa2jPZsXt4Bvu/\n0+o4Rmvd1zW4rsmP84bFamo0Go0C4H/WvNYHAPhVCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKDy\nUxgeHq5Dhw7V4OBgPXz4sHp7e+vWrVt15cqVOnnyZN2/f78ajUbdvn27+vr6qqenp65du1YrKytV\nVfXhw4e6cOFC9fX1VV9fX01PT1dV1dzcXHV3d9eDBw/q+PHj9ccff9TExMRaXpWfWOtaHwD+zuvX\nr+vOnTs1MTFRbW1tdf78+dW16enpGh8fr/b29hobG6upqal6/Phxbdy4sS5evFgjIyM1NDRUly5d\nqv3799fw8HC9efOmTp8+XVNTU1VV9enTp2pubq5nz57V5ORk3bhxo44dO7ZW1+UnZkJl3Zudna2D\nBw9WR0dHbdiwoQYHB1fX9u7dW+3t7VVV9fLlyxocHKytW7dWa2trnTp1qp4/f16Li4s1MzNT586d\nq6qqXbt21YEDB1an1OXl5Tpx4kRVVe3Zs6fevXv3Yy/IL8OEyrr3+fPnamtrW/2+ffv21c//+Xxh\nYaHu3r1bjx49qqqqlZWVam9vr4WFhWo0GnXmzJnVvYuLi9XV1VVVVS0tLbVp06aqqmpubq6vX7/+\nX+/Dr0tQWfe2bNlSi4uLq98/fvz43X0dHR3V29tbQ0ND3zxfXl6ulpaWevLkSW3evPmbtbm5ufyB\n+W155Wfd6+zsrJmZmZqfn68vX77U2NjYd/cdOXKkxsfHa2lpqaqqRkdH6+nTp9Xa2lqHDx+u0dHR\nqqpaWlqqy5cv1/v373/YHfg9CCrrXmdnZw0MDNTAwECdPXu2enp6vrvv6NGj1dPTUwMDA9Xf318v\nXryo7u7uqqq6evVqzc7OVn9/fw0MDNTOnTtrx44dP/Ia/Aaa/B8qP4NGo1FNTU1VVfXq1au6efPm\nX06qsFZMqKx78/Pz1dXVVW/fvq1Go1GTk5O1b9++tT4W/BcTKj+FkZGRunfvXjU1NdXu3bvr+vXr\ntW3btrU+FnxDUAFCvPIDhAgqQMi6+WH/kX8eXesjAPytf/3jz79cM6EChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkBI\nU6PRaKz1IQB+BSZUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBB\nBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAkH8D1Aj8lNhhe7QAAAAASUVORK5CYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f402e850>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70592aa-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6da872c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_25c3aaf79a"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70842c0-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_984c56b816"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c708dec4-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_e0451a1217"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7092726-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c708dec4-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7aa23d7385"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7099044-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_5722756ddb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "stream",
+          "text": [
+            "Give me a color name (or press 'enter' to exit): \n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7baac12-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c70842c0-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_cdd622e58f"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AHJ2c47U-A5W",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Where do we go next?\n",
+        "\n",
+        "Autograph is available in tensorflow.contrib, but it's still in its early stages. We're excited about the possibilities it brings — write your machine learning code in the flexible Eager style, but still enjoy all the benefits that come with running in graph mode. A beta version will be available soon -- stay tuned!"
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 985177e897..d193a8459d 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,14 +44,14 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.
+  r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`.
 
-  With `p(z) := exp{log_p(z)}`, this `Op` returns
+  With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns
 
   ```
-  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
-  \approx E_q[ f(Z) p(Z) / q(Z) ]
-  =       E_p[f(Z)]
+  \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
+  \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
+  \\(=       E_p[f(Z)]\\)
   ```
 
   This integral is done in log-space with max-subtraction to better handle the
@@ -95,9 +95,9 @@ def expectation_importance_sampler(f,
       log_values = log_f_z + log_p_z - q_log_prob_z
       return _logspace_mean(log_values)
 
-    # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
-    # E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
-    #           = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
+    # With \\(f_{plus}(z) = max(0, f(z)), f_{minus}(z) = max(0, -f(z))\\),
+    # \\(E_p[f(Z)] = E_p[f_{plus}(Z)] - E_p[f_{minus}(Z)]\\)
+    # \\(          = E_p[f_{plus}(Z) + 1] - E_p[f_{minus}(Z) + 1]\\)
     # Without incurring bias, 1 is added to each to prevent zeros in logspace.
     # The logarithm is approximately linear around 1 + epsilon, so this is good
     # for small values of 'z' as well.
@@ -121,13 +121,13 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `p(z) := exp{log_p(z)}`, and `f(z) = exp{log_f(z)}`, this `Op`
-  returns
+  With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`,
+  this `Op` returns
 
   ```
-  Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,
-  \approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]
-  =       Log[E_p[f(Z)]]
+  \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
+  \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
+  \\(=       Log[E_p[f(Z)]]\\)
   ```
 
   This integral is done in log-space with max-subtraction to better handle the
@@ -196,12 +196,12 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `E_p[f(X)]`.
+  """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`.
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
   ```none
-  E_p[f(X)] approx= m**-1 sum_i^m f(x_j),  x_j ~iid p(X)
+  \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
   ```
 
   where:
@@ -216,8 +216,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ s_i : i=1...n } ] = Avg{ grad[s_i] : i=1...n }` where
-  `S_n = Avg{s_i}` and `s_i = f(x_i), x_i ~ p`.
+  `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where
+  `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`.
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +296,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `E_p[f(X)]`.  A batch of samples should be indexed by `axis` dimensions.
+      `\\(E_p[f(X)]\\)`.  A batch of samples should be indexed by `axis`
+      dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
       required/used if `use_reparametrization=False`.
@@ -316,7 +317,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `E_p[f(X)]`.
+      of `\\(E_p[f(X)]\\)`.
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index a520a06bd7..5a2771229d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -75,7 +75,7 @@ class TPUClusterResolver(ClusterResolver):
                zone=None,
                project=None,
                job_name='worker',
-               coordinator_name='coordinator',
+               coordinator_name=None,
                coordinator_address=None,
                credentials='default',
                service=None):
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index cfddca1063..dff7a03b68 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -117,7 +117,8 @@ class TPUClusterResolverTest(test.TestCase):
         zone=None,
         tpu=['test-tpu-1'],
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
@@ -170,6 +171,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -196,6 +198,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -239,7 +242,8 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 340be61971..de84af866b 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -337,6 +337,7 @@ tensorflow/contrib/nccl/kernels
 tensorflow/contrib/nccl/ops
 tensorflow/contrib/nccl/python
 tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
 tensorflow/contrib/nearest_neighbor/python
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 1dd490b386..c28c3a18e4 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -88,19 +88,23 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
 
   Cudnn compatible GRU (from Cudnn library user guide):
   ```python
-  r_t = sigma(x_t * W_r + h_t-1 * R_h + b_Wr + b_Rr)  # reset gate
-  u_t = sigma(x_t * W_u + h_t-1 * R_u + b_Wu + b_Ru)  # update gate
-  h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_Rh) + b_Wh)  # new memory gate
-  h_t = (1 - u_t) .* h'_t + u_t .* h_t-1
+  # reset gate
+  $$r_t = \sigma(x_t * W_r + h_t-1 * R_h + b_{Wr} + b_{Rr})$$
+  # update gate
+  $$u_t = \sigma(x_t * W_u + h_t-1 * R_u + b_{Wu} + b_{Ru})$$
+  # new memory gate
+  $$h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_{Rh}) + b_{Wh})$$
+  $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$
   ```
 
   Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
   ```python
-  h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_Wh)  # new memory gate
+  # new memory gate
+  \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\)
   ```
   which is not equivalent to Cudnn GRU: in addition to the extra bias term b_Rh,
   ```python
-  r .* (h * R) != (r .* h) * R
+  \\(r .* (h * R) != (r .* h) * R\\)
   ```
   """
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 36ddf30042..b044ff1775 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -100,6 +100,12 @@ class SequenceDatasetSerializationTest(
     # Test repeat empty dataset
     self.run_core_tests(lambda: self._build_repeat_dataset(-1, 0), None, 0)
 
+  def testInvalidRepeat(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_repeat_dataset([1, 2], 0),
+                          None, 0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index a182dddd38..b465397437 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -110,6 +110,7 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
         .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
     return filtered_ds.map(lambda class_value, _, data: (class_value, data))
 
+
   return _apply_fn
 
 
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index 68f202ea62..bbe5e877d5 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Classes for different algortihms of reduction and broadcasting."""
+"""Classes for different algorithms of reduction and broadcasting."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -155,7 +155,7 @@ class CrossTowerOps(object):
     Args:
       method_string: either 'sum' or 'mean' specifying the reduction method.
       value_destination_pairs: a list or a tuple of tuples of PerDevice objects
-        and destinations. If a destionation is None, then the destinations
+        and destinations. If a destination is None, then the destinations
         are set to match the devices of the input PerDevice object.
 
     Returns:
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py
index 0dc6b8db6b..fc04e2195f 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_utils.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py
@@ -316,7 +316,7 @@ def unpack_small_tensors(tower_grads, packing):
       it made to tower_grads.
 
   Returns:
-    new_tower_grads: identical to tower_grads except that concatentations
+    new_tower_grads: identical to tower_grads except that concatenations
       of small tensors have been split apart and returned to their original
       positions, paired with their original variables.
   """
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/contrib/distribute/python/shared_variable_creator.py
index aca9c7af05..a7083e279f 100644
--- a/tensorflow/contrib/distribute/python/shared_variable_creator.py
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator.py
@@ -46,7 +46,7 @@ def make_fn(shared_variable_store, device_id):
   error.
   Additionally, we de-uniquify variable names before checking for matches. This
   helps re-use variables which are intended to be the same but have different
-  names due to variable uniquificaton happening upstream. Since this might
+  names due to variable uniquification happening upstream. Since this might
   mean we may have multiple variables with the same canonical name, we store
   them in a list per canonical name and return them in the same order as well.
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
index ad11d9f248..074b5f275d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -69,7 +69,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       bijector = Kumaraswamy(
           concentration1=concentration1,
           concentration0=concentration0, validate_args=True)
-      # Omitting the endpoints 0 and 1, since idlj will be inifinity at these
+      # Omitting the endpoints 0 and 1, since idlj will be infinity at these
       # endpoints.
       y = np.linspace(.01, 0.99, num=10).astype(np.float32)
       x = 1 - (1 - y ** concentration1) ** concentration0
diff --git a/tensorflow/contrib/distributions/python/ops/estimator.py b/tensorflow/contrib/distributions/python/ops/estimator.py
index 6b53338c45..98edd337fe 100644
--- a/tensorflow/contrib/distributions/python/ops/estimator.py
+++ b/tensorflow/contrib/distributions/python/ops/estimator.py
@@ -75,7 +75,7 @@ def estimator_head_distribution_regression(make_distribution_fn,
 
 
 class _DistributionRegressionHead(_RegressionHead):
-  """Creates a _RegressionHead instance from an arbitray `Distribution`."""
+  """Creates a _RegressionHead instance from an arbitrary `Distribution`."""
 
   def __init__(self,
                make_distribution_fn,
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index 7dcb3e3ac4..b1bacb91b0 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -36,7 +36,7 @@ class Independent(distribution_lib.Distribution):
 
   This distribution is useful for regarding a collection of independent,
   non-identical distributions as a single random variable. For example, the
-  `Indpendent` distribution composed of a collection of `Bernoulli`
+  `Independent` distribution composed of a collection of `Bernoulli`
   distributions might define a distribution over an image (where each
   `Bernoulli` is a distribution over each pixel).
 
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index 46c2cc8b7a..e3e40b2e9c 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -52,7 +52,7 @@ class OneHotCategorical(distribution.Distribution):
 
   #### Examples
 
-  Creates a 3-class distiribution, with the 2nd class, the most likely to be
+  Creates a 3-class distribution, with the 2nd class, the most likely to be
   drawn from.
 
   ```python
@@ -60,7 +60,7 @@ class OneHotCategorical(distribution.Distribution):
   dist = OneHotCategorical(probs=p)
   ```
 
-  Creates a 3-class distiribution, with the 2nd class the most likely to be
+  Creates a 3-class distribution, with the 2nd class the most likely to be
   drawn from, using logits.
 
   ```python
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index b525809015..e454a53c62 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -35,10 +35,10 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
 
   The RelaxedBernoulli is a distribution over the unit interval (0,1), which
   continuously approximates a Bernoulli. The degree of approximation is
-  controlled by a temperature: as the temperaturegoes to 0 the RelaxedBernoulli
-  becomes discrete with a distribution described by the `logits` or `probs`
-  parameters, as the temperature goes to infinity the RelaxedBernoulli
-  becomes the constant distribution that is identically 0.5.
+  controlled by a temperature: as the temperature goes to 0 the
+  RelaxedBernoulli becomes discrete with a distribution described by the
+  `logits` or `probs` parameters, as the temperature goes to infinity the
+  RelaxedBernoulli becomes the constant distribution that is identically 0.5.
 
   The RelaxedBernoulli distribution is a reparameterized continuous
   distribution that is the binary special case of the RelaxedOneHotCategorical
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index ff33f327c7..f56ba07816 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -303,7 +303,7 @@ class RelaxedOneHotCategorical(
   The RelaxedOneHotCategorical is a distribution over random probability
   vectors, vectors of positive real values that sum to one, which continuously
   approximates a OneHotCategorical. The degree of approximation is controlled by
-  a temperature: as the temperaturegoes to 0 the RelaxedOneHotCategorical
+  a temperature: as the temperature goes to 0 the RelaxedOneHotCategorical
   becomes discrete with a distribution described by the `logits` or `probs`
   parameters, as the temperature goes to infinity the RelaxedOneHotCategorical
   becomes the constant distribution that is identically the constant vector of
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 8c67647a61..887981d64e 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -66,7 +66,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   This distribution is an Affine transformation of iid
   [Student's t-distributions](
   https://en.wikipedia.org/wiki/Student%27s_t-distribution)
-  and should not be confused with the [Multivate Student's t-distribution](
+  and should not be confused with the [Multivariate Student's t-distribution](
   https://en.wikipedia.org/wiki/Multivariate_t-distribution). The
   traditional Multivariate Student's t-distribution is type of
   [elliptical distribution](
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 23137e0a97..84e80791f4 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -41,11 +41,12 @@ from tensorflow.python.platform import resource_loader
 _clustering_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile('_clustering_ops.so'))
 
-# Euclidean distance between vectors U and V is defined as ||U - V||_F which is
-# the square root of the sum of the absolute squares of the elements difference.
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
 SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
 # Cosine distance between vectors U and V is defined as
-# 1 - (U \dot V) / (||U||_F ||V||_F)
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
 COSINE_DISTANCE = 'cosine'
 
 RANDOM_INIT = 'random'
@@ -472,8 +473,8 @@ class KMeans(object):
         # Locally compute the sum of inputs mapped to each id.
         # For a cluster with old cluster value x, old count n, and with data
         # d_1,...d_k newly assigned to it, we recompute the new value as
-        # x += (sum_i(d_i) - k * x) / (n + k).
-        # Compute sum_i(d_i), see comment above.
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
         cluster_center_updates = math_ops.unsorted_segment_sum(
             inp, unique_idx, num_unique_cluster_idx)
         # Shape to enable broadcasting count_updates and learning_rate to inp.
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 8e0ed1d80e..811fa89bc3 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -51,9 +51,9 @@ class WALSModel(object):
   r"""A model for Weighted Alternating Least Squares matrix factorization.
 
   It minimizes the following loss function over U, V:
-   \\(
-   \|\sqrt W \odot (A - U V^T) \|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
-   )\\
+  $$
+   \|\sqrt W \odot (A - U V^T)\|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
+  $$
     where,
     A: input matrix,
     W: weight matrix. Note that the (element-wise) square root of the weights
@@ -61,12 +61,12 @@ class WALSModel(object):
     U, V: row_factors and column_factors matrices,
     \\(\lambda)\\: regularization.
   Also we assume that W is of the following special form:
-  \\( W_{ij} = W_0 + R_i * C_j )\\  if \\(A_{ij} \ne 0)\\,
-  \\(W_{ij} = W_0)\\ otherwise.
+  \\( W_{ij} = W_0 + R_i * C_j \\)  if \\(A_{ij} \ne 0\\),
+  \\(W_{ij} = W_0\\) otherwise.
   where,
-  \\(W_0)\\: unobserved_weight,
-  \\(R_i)\\: row_weights,
-  \\(C_j)\\: col_weights.
+  \\(W_0\\): unobserved_weight,
+  \\(R_i\\): row_weights,
+  \\(C_j\\): col_weights.
 
   Note that the current implementation supports two operation modes: The default
   mode is for the condition where row_factors and col_factors can individually
@@ -82,14 +82,15 @@ class WALSModel(object):
   normalized as follows:
     _, _, unregularized_loss, regularization, sum_weights =
         update_row_factors(sp_input)
-  if sp_input contains the rows {A_i, i \in I}, and the input matrix A has n
-  total rows, then the minibatch loss = unregularized_loss + regularization is
-   \\(
+  if sp_input contains the rows \\({A_i, i \in I}\\), and the input matrix A
+  has n total rows, then the minibatch loss = unregularized_loss +
+  regularization is
+   $$
    (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) * n / |I| +
    \lambda \|V\|_F^2
-   )\\
+   $$
   The sum_weights tensor contains the normalized sum of weights
-  sum(W_I) * n / |I|.
+  \\(sum(W_I) * n / |I|\\).
 
   A typical usage example (pseudocode):
 
@@ -223,7 +224,7 @@ class WALSModel(object):
         factor shard. In this case,  w_ij = unobserved_weight +
                                             row_weights[i] * col_weights[j].
         - If this is a single non-negative real number, this value is used for
-        all row weights and w_ij = unobserved_weight + row_weights *
+        all row weights and \\(w_ij\\) = unobserved_weight + row_weights *
                                    col_weights[j].
         Note that it is allowed to have row_weights as a list while col_weights
         a single number or vice versa.
@@ -665,18 +666,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the rows {A_{i, :}, i \in I}, and the input
-        matrix A has n total rows, then the unregularized loss is:
-        (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|
+        term. If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the
+        input matrix A has n total rows, then the unregularized loss is:
+        \\(\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the rows {A_{i, :}, i \in I}, and the input matrix
-        A has n total rows, then the regularization term is:
-        \lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2.
+        If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the input
+        matrix A has n total rows, then the regularization term is:
+        \\(\lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2\\).
       sum_weights: The sum of the weights W_I corresponding to sp_input,
-        normalized by a factor of n / |I|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(n / |I|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         True, sp_input=sp_input, transpose_input=transpose_input)
@@ -698,18 +699,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the columns {A_{:, j}, j \in J}, and the
-        input matrix A has m total columns, then the unregularized loss is:
-        (\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|
+        term. If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and
+        the input matrix A has m total columns, then the unregularized loss is:
+        \\(\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the columns {A_{:, j}, j \in J}, and the input
-        matrix A has m total columns, then the regularization term is:
-        \lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2.
+        If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and the
+        input matrix A has m total columns, then the regularization term is:
+        \\(\lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2\\).
       sum_weights: The sum of the weights W_J corresponding to sp_input,
-        normalized by a factor of m / |J|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(m / |J|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         False, sp_input=sp_input, transpose_input=transpose_input)
@@ -720,8 +721,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the row factors.
 
-    This computes the row embedding u_i for an observed row a_i by solving
-    one iteration of the update equations.
+    This computes the row embedding \\(u_i\\) for an observed row \\(a_i\\) by
+    solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of rows. Please note that the
@@ -753,8 +754,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the column factors.
 
-    This computes the column embedding v_j for an observed column a_j by solving
-    one iteration of the update equations.
+    This computes the column embedding \\(v_j\\) for an observed column
+    \\(a_j\\) by solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of columns. Please note that
@@ -938,7 +939,7 @@ class WALSModel(object):
     loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                      if transpose_input else new_sp_input)
     # sp_approx is the low rank estimate of the input matrix, formed by
-    # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
+    # computing the product <\\(u_i, v_j\\)> for (i, j) in loss_sp_input.indices.
     sp_approx_vals = gen_factorization_ops.masked_matmul(
         new_left_values,
         right,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 14d4c733e3..5d77bc77e1 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -357,8 +357,8 @@ class GmmAlgorithm(object):
     # Shape broadcasting.
     probs = array_ops.expand_dims(self._probs[shard_id], 0)
     # Membership weights are computed as:
-    # w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}
-    #               {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}
+    # $$w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}$$
+    # $$            {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}$$
     # where "i" is the i-th example, "k" is the k-th mixture, theta are
     # the model parameters and y_i the observations.
     # These are defined for each shard.
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 38faca119d..bfe338c9f9 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -374,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `||u - v||_2` which is
-             the square root of the sum of the absolute squares of the elements'
-             difference.
+             between vectors `u` and `v` is defined as `\\(||u - v||_2\\)`
+             which is the square root of the sum of the absolute squares of
+             the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `1 - (u . v) / (||u||_2 ||v||_2)`.
+             `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`.
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 62db3bb4c4..ca46c39baa 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -216,7 +216,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
         name=WALSMatrixFactorization.LOSS,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     # The root weighted squared error =
-    #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
+    #   \\(\sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )\\)
     rwse_var = variable_scope.variable(
         0.,
         trainable=False,
@@ -490,11 +490,11 @@ class WALSMatrixFactorization(estimator.Estimator):
           and the problem simplifies to ALS. Note that, in this case,
           col_weights must also be set to "None".
         - List of lists of non-negative scalars, of the form
-          [[w_0, w_1, ...], [w_k, ... ], [...]],
+          \\([[w_0, w_1, ...], [w_k, ... ], [...]]\\),
           where the number of inner lists equal to the number of row factor
           shards and the elements in each inner list are the weights for the
           rows of that shard. In this case,
-          w_ij = unonbserved_weight + row_weights[i] * col_weights[j].
+          \\(w_ij = unonbserved_weight + row_weights[i] * col_weights[j]\\).
         - A non-negative scalar: This value is used for all row weights.
           Note that it is allowed to have row_weights as a list and col_weights
           as a scalar, or vice-versa.
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 082c42eba1..e3fc6bf0f0 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -88,8 +88,8 @@ class GANEstimator(estimator.Estimator):
           discriminator_fn=discriminator_fn,
           generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
           discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
-          generator_optimizer=tf.train.AdamOptimizier(0.1, 0.5),
-          discriminator_optimizer=tf.train.AdamOptimizier(0.1, 0.5))
+          generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5))
 
       # Train estimator.
       gan_estimator.train(train_input_fn, steps)
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 39588b7219..1ba3a64167 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -306,6 +306,7 @@ def wasserstein_gradient_penalty(
     discriminator_scope,
     epsilon=1e-10,
     target=1.0,
+    one_sided=False,
     weights=1.0,
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -327,6 +328,8 @@ def wasserstein_gradient_penalty(
       computing the gradient norm.
     target: Optional Python number or `Tensor` indicating the target value of
       gradient norm. Defaults to 1.0.
+    one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894
+      is used. Defaults to `False`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `real_data` and `generated_data`, and must be broadcastable to
       them (i.e., all dimensions must be either `1`, or the same as the
@@ -377,10 +380,13 @@ def wasserstein_gradient_penalty(
     # For numerical stability, add epsilon to the sum before taking the square
     # root. Note tf.norm does not add epsilon.
     slopes = math_ops.sqrt(gradient_squares + epsilon)
-    penalties = math_ops.square(slopes / target - 1.0)
+    penalties = slopes / target - 1.0
+    if one_sided:
+      penalties = math_ops.maximum(0., penalties)
+    penalties_squared = math_ops.square(penalties)
     penalty = losses.compute_weighted_loss(
-        penalties, weights, scope=scope, loss_collection=loss_collection,
-        reduction=reduction)
+        penalties_squared, weights, scope=scope,
+        loss_collection=loss_collection, reduction=reduction)
 
     if add_summaries:
       summary.scalar('gradient_penalty_loss', penalty)
@@ -665,7 +671,7 @@ def least_squares_discriminator_loss(
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
     add_summaries=False):
-  """Least squares generator loss.
+  """Least squares discriminator loss.
 
   This loss comes from `Least Squares Generative Adversarial Networks`
   (https://arxiv.org/abs/1611.04076).
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index dbaa624ae9..2889e93743 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -481,6 +481,28 @@ class GradientPenaltyTest(test.TestCase, _PenaltyTest):
                       })
       self.assertAlmostEqual(self._expected_loss, loss, 5)
 
+  def test_loss_using_one_sided_mode(self):
+    generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+    real_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+
+    loss = tfgan_losses.wasserstein_gradient_penalty(
+        generated_data,
+        real_data,
+        self._kwargs['generator_inputs'],
+        self._kwargs['discriminator_fn'],
+        self._kwargs['discriminator_scope'],
+        one_sided=True)
+    self.assertEqual(generated_data.dtype, loss.dtype)
+
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      loss = sess.run(loss,
+                      feed_dict={
+                          generated_data: self._generated_data_np,
+                          real_data: self._real_data_np,
+                      })
+      self.assertAlmostEqual(self._expected_loss, loss, 5)
+
   def test_loss_with_gradient_norm_target(self):
     """Test loss value with non default gradient norm target."""
     generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 776eb11ecb..73acd05b60 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -461,6 +461,7 @@ def gan_loss(
     gradient_penalty_weight=None,
     gradient_penalty_epsilon=1e-10,
     gradient_penalty_target=1.0,
+    gradient_penalty_one_sided=False,
     mutual_information_penalty_weight=None,
     aux_cond_generator_weight=None,
     aux_cond_discriminator_weight=None,
@@ -485,6 +486,8 @@ def gan_loss(
     gradient_penalty_target: If `gradient_penalty_weight` is not None, a Python
       number or `Tensor` indicating the target value of gradient norm. See the
       CIFAR10 section of https://arxiv.org/abs/1710.10196. Defaults to 1.0.
+    gradient_penalty_one_sided: If `True`, penalty proposed in
+      https://arxiv.org/abs/1709.08894 is used. Defaults to `False`.
     mutual_information_penalty_weight: If not `None`, must be a non-negative
       Python number or Tensor indicating how much to weight the mutual
       information penalty. See https://arxiv.org/abs/1606.03657 for more
@@ -546,6 +549,7 @@ def gan_loss(
         model,
         epsilon=gradient_penalty_epsilon,
         target=gradient_penalty_target,
+        one_sided=gradient_penalty_one_sided,
         add_summaries=add_summaries)
     dis_loss += gradient_penalty_weight * gp_loss
   if _use_aux_loss(mutual_information_penalty_weight):
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index f9bdaa74c9..3ebbe55d05 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -359,10 +359,12 @@ class GANLossTest(test.TestCase):
     self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
 
   # Test gradient penalty option.
-  def _test_grad_penalty_helper(self, create_gan_model_fn):
+  def _test_grad_penalty_helper(self, create_gan_model_fn, one_sided=False):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
-    loss_gp = train.gan_loss(model, gradient_penalty_weight=1.0)
+    loss_gp = train.gan_loss(model,
+                             gradient_penalty_weight=1.0,
+                             gradient_penalty_one_sided=one_sided)
     self.assertTrue(isinstance(loss_gp, namedtuples.GANLoss))
 
     # Check values.
@@ -394,6 +396,25 @@ class GANLossTest(test.TestCase):
   def test_grad_penalty_callable_acgan(self):
     self._test_grad_penalty_helper(create_callable_acgan_model)
 
+  def test_grad_penalty_one_sided_gan(self):
+    self._test_grad_penalty_helper(create_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_gan(self):
+    self._test_grad_penalty_helper(create_callable_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_infogan(self):
+    self._test_grad_penalty_helper(create_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_infogan(self):
+    self._test_grad_penalty_helper(
+        create_callable_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_acgan(self):
+    self._test_grad_penalty_helper(create_acgan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_acgan(self):
+    self._test_grad_penalty_helper(create_callable_acgan_model, one_sided=True)
+
   # Test mutual information penalty option.
   def _test_mutual_info_penalty_helper(self, create_gan_model_fn):
     train.gan_loss(create_gan_model_fn(),
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 350bcb3bca..10d7f6d076 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -3045,16 +3045,16 @@ def legacy_fully_connected(x,
   `activation_fn` is `None`, the result of `y = w * x + b` is
   returned.
 
-  If `x` has shape [\\\(\\text{dim}_0, \\text{dim}_1, ..., \\text{dim}_n\\\)]
-  with more than 2 dimensions (\\\(n > 1\\\)), then we repeat the matrix
+  If `x` has shape [\\(\text{dim}_0, \text{dim}_1, ..., \text{dim}_n\\)]
+  with more than 2 dimensions (\\(n > 1\\)), then we repeat the matrix
   multiply along the first dimensions. The result r is a tensor of shape
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`],
-  where \\\( r_{i_0, ..., i_{n-1}, k} =
-  \\sum_{0 \\leq j < \\text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\\).
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`],
+  where \\( r_{i_0, ..., i_{n-1}, k} =
+  \sum_{0 \leq j < \text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\).
   This is accomplished by reshaping `x` to 2-D
-  [\\\(\\text{dim}_0 \\cdot ... \\cdot \\text{dim}_{n-1}, \\text{dim}_n\\\)]
+  [\\(\text{dim}_0 \cdot ... \cdot \text{dim}_{n-1}, \text{dim}_n\\)]
   before the matrix multiply and afterwards reshaping it to
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`].
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`].
 
   This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting
   `bias_init` to `None`.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
index 80649bd52d..9d3af66c92 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
@@ -138,8 +138,7 @@ class LinearOperatorBlockDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
         This is true by default, and will raise a `ValueError` otherwise.
       name: A name for this `LinearOperator`.  Default is the individual
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
index 48c326651f..cbea39bcc0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
@@ -165,7 +165,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
   CHECK(mod_op && mod_op->type == OperatorType::kFloorMod)
       << "Unsupported partition strategy";
   CHECK_EQ(mod_op, GetOpWithOutput(*model, indices_partition_op->inputs[1]))
-      << "Indices and data parition ops require the same partition strategy "
+      << "Indices and data partition ops require the same partition strategy "
          "and inputs";
 
   // Glob together all of the gather data. This is not yet in the correct order.
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
index 5a40451b3a..6c4f8e12cd 100644
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -45,9 +45,6 @@ py_binary(
     name = "toco_wrapper",
     srcs = ["toco_wrapper.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/lite/toco/python/toco_wrapper.py b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
index e39b5f22c7..6d6b500d7e 100644
--- a/tensorflow/contrib/lite/toco/python/toco_wrapper.py
+++ b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
@@ -22,14 +22,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import sys
-import tensorflow as tf
 
 
 def main():
   # Pip installs the binary in aux-bin off of main site-package install.
   # Just find it and exec, passing all arguments in the process.
   # TODO(aselle): it is unfortunate to use all of tensorflow to lookup binary.
-  binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
-  os.execvp(binary, sys.argv)
+  print("""TOCO from pip install is currently not working on command line.
+Please use the python TOCO API or use
+bazel run tensorflow/contrib/lite:toco -- <args> from a TensorFlow source dir.
+""")
+  sys.exit(1)
+  # TODO(aselle): Replace this when we find a way to run toco without
+  # blowing up executable size.
+  # binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
+  # os.execvp(binary, sys.argv)
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index a03e731be3..4942d94176 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -298,7 +298,7 @@ class MutableHashTable(LookupInterface):
   table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.string,
                                              value_dtype=tf.int64,
                                              default_value=-1)
-  table.insert(keys, values)
+  sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
   ```
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 07b3ad71d4..d508cf3f9d 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -353,6 +353,42 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depth=9,
         name='testLuongNotNormalized')
 
+  def testLuongScaledDType(self):
+    # Test case for GitHub issue 18099
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.LuongAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          scale=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testLuongScaled(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongAttention, scale=True)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index be53779826..9e0d69593f 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -339,7 +339,8 @@ def _luong_score(query, keys, scale):
   if scale:
     # Scalar used in weight scaling
     g = variable_scope.get_variable(
-        "attention_g", dtype=dtype, initializer=1.)
+        "attention_g", dtype=dtype,
+        initializer=init_ops.ones_initializer, shape=())
     score = g * score
   return score
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 370911e4d9..e920a797fe 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -346,11 +346,10 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
       break;
     }
     case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
+      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+               istrides, static_cast<Eigen::half*>(
+                             const_cast<void*>(oweights->GetValues())),
+               ostrides);
       break;
     }
     default:
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index af572d8124..d2746032a0 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -246,6 +246,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_pip",  # b/64527635
         "no_pip_gpu",  # b/63391119
     ],
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index 4ef8f9eebd..639e708169 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -172,7 +172,7 @@ It is always recommended to port a small, simple model first to make sure that
 you are familiar with the basic concepts of `TPUEstimator` and test end-to-end
 behavior. Once your simple model runs, gradually add more functionality.
 In addition, there are several sample models, available at
-[github.com/tensorflow/tpu-demos](https://github.com/tensorflow/tpu-demos).
+[github.com/tensorflow/tpu](https://github.com/tensorflow/tpu).
 
 To convert your code from the vanilla `Estimator` class to use TPUs, change the
 following (note some of the details may change over time):
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 1a5fb45be0..4bb53e8678 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -36,9 +36,8 @@ out the metrics values to stdout:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-        predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
@@ -81,9 +80,8 @@ more summaries and call the evaluate_repeatedly method:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-          predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index b07039916c..c36d00e842 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -27,7 +27,6 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.losses.python.losses import loss_ops
-from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.training.python.training import evaluation
 from tensorflow.contrib.training.python.training import training
 from tensorflow.core.protobuf import config_pb2
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
@@ -196,7 +196,8 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     checkpoint_path = evaluation.wait_for_new_checkpoint(checkpoint_dir)
 
@@ -311,7 +312,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     final_values = evaluation.evaluate_repeatedly(
         checkpoint_dir=checkpoint_dir,
@@ -365,7 +367,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     timeout_fn_calls = [0]
     def timeout_fn():
@@ -417,9 +420,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     self.assertEqual(final_values['my_var'], expected_value)
 
   def _create_names_to_metrics(self, predictions, labels):
-    accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
-    accuracy1, update_op1 = metric_ops.streaming_accuracy(
-        predictions + 1, labels)
+    accuracy0, update_op0 = metrics.accuracy(labels, predictions)
+    accuracy1, update_op1 = metrics.accuracy(labels, predictions + 1)
 
     names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1}
     names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1}
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 94203ee2b3..c9df6beb6b 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -262,7 +262,7 @@ class RdmaTensorRequest {
   // Receive tensor content (RDMA write was completed).
   //
   // Decode proto if required and/or move to GPU if the content was not
-  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // written to it directly (GPU direct is not available). Afterwards,
   // invoke Done().
   void RecvTensorContent();
 
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index e1f70404e3..be79cc4507 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -103,7 +103,7 @@ ScopedAllocatorContainer::~ScopedAllocatorContainer() {
   // In normal execution the table should be empty and all of its
   // contents deleted via Drop.  When when a step ends early
   // (e.g. through abnormal termination) we need to clean up
-  // explicitly.  So long as graph exection of the associated step has
+  // explicitly.  So long as graph execution of the associated step has
   // completey terminated this should be safe.
   for (auto& it : allocators_) {
     if (it.second.field_index == ScopedAllocator::kBackingIndex) {
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index d91f7107c5..68d3e1c9ab 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -263,21 +263,18 @@ class MklInputConversionOp : public OpKernel {
 
  private:
   void Compute(OpKernelContext* context) override {
-    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    const int kInputIndex_0 = 0, kInputIndex_1 = 1;
+    const Tensor& input_tensor_0 = MklGetInput(context, kInputIndex_0);
     MklDnnShape input_shape_0;
-    GetMklShape(context, 0, &input_shape_0);
+    GetMklShape(context, kInputIndex_0, &input_shape_0);
 
-    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    const Tensor& input_tensor_1 = MklGetInput(context, kInputIndex_1);
     MklDnnShape input_shape_1;
-    GetMklShape(context, 1, &input_shape_1);
-
-    bool tf_shapes_are_same =
-        context->input(0).shape() == context->input(1).shape();
+    GetMklShape(context, kInputIndex_1, &input_shape_1);
 
-    VLOG(1) << "MklInputConversionOp: Input shapes are "
-            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
-            << context->input(0).shape().DebugString() << " and "
-            << context->input(1).shape().DebugString();
+    VLOG(1) << "MklInputConversionOp: Input shapes are: "
+            << context->input(kInputIndex_0).shape().DebugString() << " and "
+            << context->input(kInputIndex_1).shape().DebugString();
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // if both inputs are in TF format, just copy input tensors to output.
@@ -285,15 +282,19 @@ class MklInputConversionOp : public OpKernel {
       VLOG(1) << "MklInputConversionOp: No conversion needed, "
               << "copying TF inputs to output";
 
-      ForwardTfTensorInToOut(context, 0, 0);
-      ForwardTfTensorInToOut(context, 1, 1);
+      ForwardTfTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+      ForwardTfTensorInToOut(context, kInputIndex_1, kInputIndex_1);
       return;
     }
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      if (tf_shapes_are_same) {
+      // It is safer to compare the original TensorFlow shapes than to compare
+      // Mkl shapes since element wise ops are forwarded to Eigen implementation.
+      TensorShape tf_shape0 = input_shape_0.GetTfShape();
+      TensorShape tf_shape1 = input_shape_1.GetTfShape();
+      if (tf_shape0 == tf_shape1) {
         auto input0_md = input_shape_0.GetMklLayout();
         auto input1_md = input_shape_1.GetMklLayout();
 
@@ -302,8 +303,8 @@ class MklInputConversionOp : public OpKernel {
           VLOG(1) << "MklInputConversionOp: No conversion needed, "
                   << "copying MKL inputs with identical shapes to output";
 
-          ForwardMklTensorInToOut(context, 0, 0);
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         } else {
           VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
@@ -324,7 +325,7 @@ class MklInputConversionOp : public OpKernel {
           mkl_output_mkl_shape.SetMklLayout(&input1_md);
 
           // Create output Mkl tensor for index 0
-          AllocateOutputSetMklShape(context, 0, &tensor_out,
+          AllocateOutputSetMklShape(context, kInputIndex_0, &tensor_out,
                                     input_tensor_0.shape(),
                                     mkl_output_mkl_shape);
 
@@ -342,7 +343,7 @@ class MklInputConversionOp : public OpKernel {
           stream(stream::kind::eager).submit(net).wait();
 
           // Input1 will be passed through
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         }
       }
@@ -361,11 +362,11 @@ class MklInputConversionOp : public OpKernel {
               << "converted MKL inputs to TF format";
 
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 0);
+                                           op_data_type, has_avx512f_, kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 1);
-      SetDummyMklShapeOutput(context, 0);
-      SetDummyMklShapeOutput(context, 1);
+                                           op_data_type, has_avx512f_, kInputIndex_1);
+      SetDummyMklShapeOutput(context, kInputIndex_0);
+      SetDummyMklShapeOutput(context, kInputIndex_1);
       return;
     }
 
@@ -377,7 +378,6 @@ class MklInputConversionOp : public OpKernel {
     const Tensor* mkl_tensor;
     const MklDnnShape* mkl_shape;
     const Tensor* tf_tensor;
-    MklDnnShape* tf_mkl_shape;
     uint mkl_tensor_index;
     uint tf_tensor_index;
     if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
@@ -385,14 +385,12 @@ class MklInputConversionOp : public OpKernel {
       mkl_shape = &input_shape_0;
       mkl_tensor_index = 0;
       tf_tensor = &input_tensor_1;
-      tf_mkl_shape = &input_shape_1;
       tf_tensor_index = 1;
     } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
       mkl_tensor = &input_tensor_1;
       mkl_shape = &input_shape_1;
       mkl_tensor_index = 1;
       tf_tensor = &input_tensor_0;
-      tf_mkl_shape = &input_shape_0;
       tf_tensor_index = 0;
     } else {
       CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
@@ -466,8 +464,8 @@ class MklInputConversionOp : public OpKernel {
     }
 
     VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(0)->shape().DebugString() << " and "
-            << context->mutable_output(1)->shape().DebugString();
+            << context->mutable_output(kInputIndex_0)->shape().DebugString() << " and "
+            << context->mutable_output(kInputIndex_1)->shape().DebugString();
 
     VLOG(1) << "MklInputConversion completed successfully.";
   }
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 170523b5b4..f79e18cff2 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -102,7 +102,7 @@ class MklSoftmaxOp : public OpKernel {
       // Softmax MklDnn output layout is same as input layout.
       auto dst_pd = src.GetUsrMemPrimDesc();
 
-      // if input is MKL shape, ouput is also MKL shape.
+      // if input is MKL shape, output is also MKL shape.
       // if input is TF shape, output is also TF shape
       if (src_mkl_shape.IsMklTensor()) {
         output_mkl_shape.SetMklTensor(true);
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 9237fa51d8..0de2ebb590 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -244,6 +244,33 @@ __global__ void RowReduceKernel(
   if (row < num_rows && lane == 0) out[row] = sum;
 }
 
+template <typename T1>
+struct storage_type {
+  T1 val;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator T1() { return val; }
+  __host__ __device__ storage_type<T1>& operator=(const T1& in) {
+    val = in;
+    return *this;
+  }
+};
+
+template <typename T2>
+struct storage_type<std::complex<T2>> {
+  T2 real;
+  T2 imag;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator std::complex<T2>() {
+    return std::complex<T2>(real, imag);
+  }
+  __host__ __device__ storage_type<std::complex<T2>>& operator=(
+      const std::complex<T2>& in) {
+    real = in.real();
+    imag = in.imag();
+    return *this;
+  }
+};
+
 // Works only if there are <= 16 columns
 // each warps sums over multiple rows at once
 template <typename T, typename outT, typename Op>
@@ -268,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += rows_per_warp * gridDim.y * blockDim.y;
   for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) {
@@ -294,7 +321,8 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
     if (blockDim.y > 1) {
       for (int row = 1; row < blockDim.y; ++row) {
-        s = op(s, partial_sums[threadIdx.x * 33 + row]);
+        value_type t = partial_sums[threadIdx.x * 33 + row];
+        s = op(s, t);
       }
     }
 
@@ -316,7 +344,7 @@ __global__ void ColumnReduceKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += gridDim.y * blockDim.y;
 
@@ -347,7 +375,8 @@ __global__ void ColumnReduceKernel(
         min(blockDim.y, num_rows - blockIdx.y * blockDim.y);
 
     for (int row = 1; row < numRowsThisBlock; ++row) {
-      s = op(s, partial_sums[threadIdx.x * 33 + row]);
+      value_type t = partial_sums[threadIdx.x * 33 + row];
+      s = op(s, t);
     }
 
     out[col * gridDim.y + blockIdx.y] = s;
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 7badc00572..a5186bdacb 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,6 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
 
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index e2453b9712..2852c49e19 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -105,8 +105,11 @@ REGISTER_OP("RepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate the
-                                                // shape of `count`.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TakeDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 6c2fc60bab..12d6dc5eaf 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -472,7 +472,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -490,7 +490,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -589,7 +589,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("AvgPool3DGrad")
@@ -600,7 +600,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -618,7 +618,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {half, bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("MaxPool3DGrad")
@@ -630,8 +630,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float} = DT_FLOAT")
-    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -1170,9 +1170,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument(
-        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
-        c->Value(last_dim));
+    return errors::InvalidArgument("input must have last dimension >= k = ",
+                                   c->Value(k_dim), " but is ",
+                                   c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1226,9 +1226,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument(
-            "Input must have last dimension > n = ", c->Value(n_dim),
-            " but is ", c->Value(last_dim));
+        return errors::InvalidArgument("Input must have last dimension > n = ",
+                                       c->Value(n_dim), " but is ",
+                                       c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 40eebd1db0..706968d347 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index de4f126507..20fe88a799 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -61,21 +61,21 @@ A subgraph can be created in several ways:
 
 * using a list of ops:
 
-```python
-my_sgv = ge.sgv(ops)
-```
+  ```python
+  my_sgv = ge.sgv(ops)
+  ```
 
 * from a name scope:
 
-```python
-my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
+  ```
 
 * using regular expression:
 
-```python
-my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
+  ```
 
 Note that the Graph Editor is meant to manipulate several graphs at the same
 time, typically during transform or copy operation. For that reason,
diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md
index 94cf0de32a..86b4b39409 100644
--- a/tensorflow/docs_src/api_guides/python/io_ops.md
+++ b/tensorflow/docs_src/api_guides/python/io_ops.md
@@ -8,7 +8,7 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 ## Placeholders
 
 TensorFlow provides a placeholder operation that must be fed with data
-on execution.  For more info, see the section on @{$reading_data#feeding$Feeding data}.
+on execution.  For more info, see the section on @{$reading_data#Feeding$Feeding data}.
 
 *   @{tf.placeholder}
 *   @{tf.placeholder_with_default}
@@ -42,7 +42,7 @@ formats into tensors.
 
 ### Example protocol buffer
 
-TensorFlow's @{$reading_data#standard-tensorflow-format$recommended format for training examples}
+TensorFlow's @{$reading_data#standard_tensorflow_format$recommended format for training examples}
 is serialized `Example` protocol buffers, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 They contain `Features`, [described
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 8e6fd1cff9..8d8daaae19 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -89,7 +89,7 @@ bottom. Note that this is different from existing libraries such as cuDNN and
 Caffe, which explicitly specify the number of padded pixels and always pad the
 same number of pixels on both sides.
 
-For the `'VALID`' scheme, the output height and width are computed as:
+For the `'VALID'` scheme, the output height and width are computed as:
 
     out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
     out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))
@@ -98,10 +98,10 @@ and no padding is used.
 
 Given the output size and the padding, the output can be computed as
 
-    output[b, i, j, :] =
-        sum_{di, dj} input[b, strides[1] * i + di - pad_top,
-                           strides[2] * j + dj - pad_left, ...] *
-                     filter[di, dj, ...]
+$$    output[b, i, j, :] =
+        sum_{d_i, d_j} input[b, strides[1] * i + d_i - pad_{top},\
+                           strides[2] * j + d_j - pad_{left}, ...] *
+                     filter[d_i, d_j,\ ...]$$
 
 where any value outside the original input image region are considered zero (
 i.e. we pad zero values around the border of the image).
@@ -161,12 +161,12 @@ Morphological operators are non-linear filters used in image processing.
 ](https://en.wikipedia.org/wiki/Dilation_(morphology))
 is the max-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         max_{dy, dx} input[b,
                            strides[1] * y + rates[1] * dy,
                            strides[2] * x + rates[2] * dx,
                            c] +
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 The `filter` is usually called structuring function. Max-pooling is a special
 case of greyscale morphological dilation when the filter assumes all-zero
@@ -176,12 +176,12 @@ values (a.k.a. flat structuring function).
 ](https://en.wikipedia.org/wiki/Erosion_(morphology))
 is the min-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         min_{dy, dx} input[b,
                            strides[1] * y - rates[1] * dy,
                            strides[2] * x - rates[2] * dx,
                            c] -
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 Dilation and erosion are dual to each other. The dilation of the input signal
 `f` by the structuring signal `g` is equal to the negation of the erosion of
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index 9c58b3b900..b28cb9df75 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -10,15 +10,18 @@ course prior to diving into TensorFlow documentation:
 TensorFlow is a tool for machine learning. While it contains a wide range of
 functionality, TensorFlow is mainly designed for deep neural network models.
 
-TensorFlow provides many APIs. This section focuses on the high-level APIs.
-If you are new to TensorFlow, begin by reading one of the following documents:
-
-  * @{$get_started/eager} is for machine learning beginners and uses
-    @{$programmers_guide/eager}.
-  * @{$get_started/get_started_for_beginners} is also for machine learning
-    beginners and uses @{$programmers_guide/graphs}.
-  * @{$get_started/premade_estimators} assumes some machine learning background
-    and uses an @{tf.estimator.Estimator$Estimator}.
+The easiest way to get started with tensorflow is using Eager Execution.
+
+  * @{$get_started/eager}, is for anyone new to  machine learning or TensorFlow.
+
+TensorFlow provides many APIs. The remainder of this section focuses on the
+Estimator API which provide scalable, high-performance models.
+To get started with Estimators begin by reading one of the following documents:
+
+  * @{$get_started/get_started_for_beginners}, which is aimed at readers
+    new to machine learning.
+  * @{$get_started/premade_estimators}, which is aimed at readers who have
+    experience in machine learning.
 
 Then, read the following documents, which demonstrate the key features
 in the high-level APIs:
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 17bc209e46..4c12f0d84b 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -5,7 +5,10 @@ eager.md
 get_started_for_beginners.md
 premade_estimators.md
 
-### Details
+### Estimators
+get_started_for_beginners.md: For Beginners
+premade_estimators.md: Premade Estimators
+>>>
 checkpoints.md
 feature_columns.md
 datasets_quickstart.md
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index aa4f85f6ce..4be7e508f9 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -1,4 +1,4 @@
-# Get Started with Estimators
+# Premade Estimators
 
 This document introduces the TensorFlow programming environment and shows you
 how to solve the Iris classification problem in TensorFlow.
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 9059b3f3b6..a3eca4bf37 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 2e47a6d212..1a0956634d 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index eff066d200..cdde45a6f4 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.7.0-rc1</version>
+                 <version>1.7.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.7.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.7.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.7.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 27b696696d..04e4242b0f 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -46,6 +46,35 @@ must be installed on your system:
     a list of supported GPU cards.
   * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
     Toolkit.
+  * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
+    This library provides advanced profiling support. To install this library,
+    issue the following command for CUDA Toolkit >= 8.0:
+
+    <pre>
+    $ <b>sudo apt-get install cuda-command-line-tools</b>
+    </pre>
+
+    and add its path to your `LD_LIBRARY_PATH` environment variable:
+
+    <pre>
+    $ <b>export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</b>
+    </pre>
+
+    For CUDA Toolkit <= 7.5 do:
+
+    <pre>
+    $ <b>sudo apt-get install libcupti-dev</b>
+    </pre>
+  * **[OPTIONAL]**  For optimized inferencing performance, you can also install
+    NVIDIA TensorRT 3.0. For details, see
+    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
+    Only steps 1-4 in the TensorRT Tar File installation instructions are
+    required for compatibility with TensorFlow; the Python package installation
+    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+
+    **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
+    package, please use the Ubuntu **14.04** tar file package of TensorRT
+    even when installing onto an Ubuntu 16.04 system.   
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
@@ -165,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -270,7 +299,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -456,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -630,14 +659,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -649,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -668,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -687,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 7060ef43da..b3e9616a05 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 148f80efe2..7d7c2aa75a 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.7.0rc1 on Linux:
+for TensorFlow 1.7.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -450,8 +450,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.7.0rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
@@ -471,7 +471,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -486,8 +486,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.7.0rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index a9c2cb3e33..cb0d86fc4c 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -11,7 +11,7 @@ This doc is aimed at users who:
   using an existing model.
 * Have, perhaps, skimmed the code of an example TPU model
   [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
-  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+  [[2]](https://github.com/tensorflow/tpu/tree/master/models).
 * Are interested in porting an existing `Estimator` model to
   run on Cloud TPUs
 
@@ -288,7 +288,7 @@ If shape inference has failed, but the shape is known it is possible to
 impose the correct shape using `tf.set_shape()`. 
 
 In the example below the shape
-inference algorithm fails, but it is corrected using `set_shape`:
+inference algorithm fails, but it is correctly using `set_shape`:
 
 ```
 >>> x = tf.zeros(tf.constant([1,2,3])+1)
@@ -371,10 +371,10 @@ in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
 that data is available when needed.
 
 The TPU-demos repo includes
-[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+[a script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 for downloading the imagenet dataset and converting it to an appropriate format.
 This together with the imagenet
-[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+[models](https://github.com/tensorflow/tpu/tree/master/models)
 included in the repo demonstrate all of these best-practices.
 
 
@@ -387,7 +387,7 @@ For details on how to actually set up and run a Cloud TPU see:
 This document is by no means exhaustive. The best source of more detail on how
 to make a Cloud TPU compatible model are the example models published in:
 
- * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu)
 
 For more information about tuning TensorFlow code for performance see:
 
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index aeb746f29c..cadaec391d 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -198,17 +198,23 @@ Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).
 ### Input Layer
 
 The methods in the `layers` module for creating convolutional and pooling layers
-for two-dimensional image data expect input tensors to have a `channels_last` shape of
-<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>, <em>channels</em>]</code>
-or a `channels_first` shape of <code>[<em>batch_size</em>, <em>channels</em>, <em>image_height</em>, <em>image_width</em>]</code>, defined as follows:
+for two-dimensional image data expect input tensors to have a shape of
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
+<em>channels</em>]</code> by default. This behavior can be changed using the <code><em>data_format</em></code> parameter; defined as follows:
+
 
 *   _`batch_size`_. Size of the subset of examples to use when performing
     gradient descent during training.
-*   _`image_width`_. Width of the example images.
 *   _`image_height`_. Height of the example images.
+*   _`image_width`_. Width of the example images.
 *   _`channels`_. Number of color channels in the example images. For color
     images, the number of channels is 3 (red, green, blue). For monochrome
     images, there is just 1 channel (black).
+*   _`image_height`_. Height of the example images.
+*   _`data_format`_. A string, one of `channels_last` (default) or `channels_first`.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
 
 Here, our MNIST dataset is composed of monochrome 28x28 pixel images, so the
 desired shape for our input layer is <code>[<em>batch_size</em>, 28, 28,
@@ -247,28 +253,27 @@ conv1 = tf.layers.conv2d(
 ```
 
 The `inputs` argument specifies our input tensor, which must have the shape
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
 <em>channels</em>]</code>. Here, we're connecting our first convolutional layer
 to `input_layer`, which has the shape <code>[<em>batch_size</em>, 28, 28,
 1]</code>.
 
 > Note: <code>conv2d()</code> will instead accept a shape of
-> <code>[<em>channels</em>, <em>batch_size</em>, <em>image_width</em>,
-> <em>image_height</em>]</code> when passed the argument
+> <code>[<em>batch_size</em>, <em>channels</em>, <em>image_height</em>, <em>image_width</em>]</code> when passed the argument
 > <code>data_format=channels_first</code>.
 
 The `filters` argument specifies the number of filters to apply (here, 32), and
-`kernel_size` specifies the dimensions of the filters as <code>[<em>width</em>,
-<em>height</em>]</code> (here, <code>[5, 5]</code>).
+`kernel_size` specifies the dimensions of the filters as <code>[<em>height</em>,
+<em>width</em>]</code> (here, <code>[5, 5]</code>).
 
-<p class="tip"><b>TIP:</b> If filter width and height have the same value, you can instead specify a
+<p class="tip"><b>TIP:</b> If filter height and width have the same value, you can instead specify a
 single integer for <code>kernel_size</code>—e.g., <code>kernel_size=5</code>.</p>
 
 The `padding` argument specifies one of two enumerated values
 (case-insensitive): `valid` (default value) or `same`. To specify that the
-output tensor should have the same width and height values as the input tensor,
+output tensor should have the same height and width values as the input tensor,
 we set `padding=same` here, which instructs TensorFlow to add 0 values to the
-edges of the input tensor to preserve width and height of 28. (Without padding,
+edges of the input tensor to preserve height and width of 28. (Without padding,
 a 5x5 convolution over a 28x28 tensor will produce a 24x24 tensor, as there are
 24x24 locations to extract a 5x5 tile from a 28x28 grid.)
 
@@ -277,7 +282,7 @@ output of the convolution. Here, we specify ReLU activation with
 @{tf.nn.relu}.
 
 Our output tensor produced by `conv2d()` has a shape of
-<code>[<em>batch_size</em>, 28, 28, 32]</code>: the same width and height
+<code>[<em>batch_size</em>, 28, 28, 32]</code>: the same height and width
 dimensions as the input, but now with 32 channels holding the output from each
 of the filters.
 
@@ -292,31 +297,30 @@ pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
 ```
 
 Again, `inputs` specifies the input tensor, with a shape of
-<code>[<em>batch_size</em>, <em>image_width</em>, <em>image_height</em>,
+<code>[<em>batch_size</em>, <em>image_height</em>, <em>image_width</em>,
 <em>channels</em>]</code>. Here, our input tensor is `conv1`, the output from
 the first convolutional layer, which has a shape of <code>[<em>batch_size</em>,
 28, 28, 32]</code>.
 
 > Note: As with <code>conv2d()</code>, <code>max_pooling2d()</code> will instead
-> accept a shape of <code>[<em>channels</em>, <em>batch_size</em>,
-> <em>image_width</em>, <em>image_height</em>]</code> when passed the argument
+> accept a shape of <code>[<em>batch_size</em>, <em>channels</em>, 
+> <em>image_height</em>, <em>image_width</em>]</code> when passed the argument
 > <code>data_format=channels_first</code>.
 
 The `pool_size` argument specifies the size of the max pooling filter as
-<code>[<em>width</em>, <em>height</em>]</code> (here, `[2, 2]`). If both
+<code>[<em>height</em>, <em>width</em>]</code> (here, `[2, 2]`). If both
 dimensions have the same value, you can instead specify a single integer (e.g.,
 `pool_size=2`).
 
 The `strides` argument specifies the size of the stride. Here, we set a stride
 of 2, which indicates that the subregions extracted by the filter should be
-separated by 2 pixels in both the width and height dimensions (for a 2x2 filter,
+separated by 2 pixels in both the height and width dimensions (for a 2x2 filter,
 this means that none of the regions extracted will overlap). If you want to set
-different stride values for width and height, you can instead specify a tuple or
+different stride values for height and width, you can instead specify a tuple or
 list (e.g., `stride=[3, 6]`).
 
 Our output tensor produced by `max_pooling2d()` (`pool1`) has a shape of
-<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces width and
-height by 50% each.
+<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces height and width by 50% each.
 
 ### Convolutional Layer #2 and Pooling Layer #2
 
@@ -338,13 +342,11 @@ pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
 
 Note that convolutional layer #2 takes the output tensor of our first pooling
 layer (`pool1`) as input, and produces the tensor `conv2` as output. `conv2`
-has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same width
-and height as `pool1` (due to `padding="same"`), and 64 channels for the 64
+has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same height and width as `pool1` (due to `padding="same"`), and 64 channels for the 64
 filters applied.
 
 Pooling layer #2 takes `conv2` as input, producing `pool2` as output. `pool2`
-has shape <code>[<em>batch_size</em>, 7, 7, 64]</code> (50% reduction of width
-and height from `conv2`).
+has shape <code>[<em>batch_size</em>, 7, 7, 64]</code> (50% reduction of height and width from `conv2`).
 
 ### Dense Layer
 
@@ -360,7 +362,7 @@ pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
 
 In the `reshape()` operation above, the `-1` signifies that the *`batch_size`*
 dimension will be dynamically calculated based on the number of examples in our
-input data. Each example has 7 (`pool2` width) * 7 (`pool2` height) * 64
+input data. Each example has 7 (`pool2` height) * 7 (`pool2` width) * 64
 (`pool2` channels) features, so we want the `features` dimension to have a value
 of 7 * 7 * 64 (3136 in total). The output tensor, `pool2_flat`, has shape
 <code>[<em>batch_size</em>, 3136]</code>.
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index acaf1a44eb..565c1cb8e0 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -314,6 +314,9 @@ tf_cc_test(
     srcs = [
         "src/gen/cc/source_writer_test.cc",
     ],
+    data = [
+        "src/gen/resources/test.java.snippet",
+    ],
     deps = [
         ":java_op_gen_lib",
         "//tensorflow/core:lib",
diff --git a/tensorflow/java/src/gen/cc/java_defs.h b/tensorflow/java/src/gen/cc/java_defs.h
index 615cdc165b..59f8beaee7 100644
--- a/tensorflow/java/src/gen/cc/java_defs.h
+++ b/tensorflow/java/src/gen/cc/java_defs.h
@@ -17,10 +17,7 @@ limitations under the License.
 #define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
 
 #include <string>
-#include <vector>
-#include <deque>
-
-#include "tensorflow/core/platform/env.h"
+#include <list>
 
 namespace tensorflow {
 namespace java {
@@ -104,17 +101,17 @@ class Type {
     description_ = description;
     return *this;
   }
-  const std::vector<Type>& parameters() const { return parameters_; }
+  const std::list<Type>& parameters() const { return parameters_; }
   Type& add_parameter(const Type& parameter) {
     parameters_.push_back(parameter);
     return *this;
   }
-  const std::vector<Annotation>& annotations() const { return annotations_; }
+  const std::list<Annotation>& annotations() const { return annotations_; }
   Type& add_annotation(const Annotation& annotation) {
     annotations_.push_back(annotation);
     return *this;
   }
-  const std::deque<Type>& supertypes() const { return supertypes_; }
+  const std::list<Type>& supertypes() const { return supertypes_; }
   Type& add_supertype(const Type& type) {
     if (type.kind_ == CLASS) {
       supertypes_.push_front(type);  // keep superclass at the front of the list
@@ -141,9 +138,9 @@ class Type {
   string name_;
   string package_;
   string description_;
-  std::vector<Type> parameters_;
-  std::vector<Annotation> annotations_;
-  std::deque<Type> supertypes_;
+  std::list<Type> parameters_;
+  std::list<Annotation> annotations_;
+  std::list<Type> supertypes_;
 };
 
 // Definition of a Java annotation
@@ -223,16 +220,12 @@ class Method {
     return_description_ = description;
     return *this;
   }
-  const std::vector<Variable>& arguments() const { return arguments_; }
-  Method& add_arguments(const std::vector<Variable>& args) {
-    arguments_.insert(arguments_.cend(), args.cbegin(), args.cend());
-    return *this;
-  }
+  const std::list<Variable>& arguments() const { return arguments_; }
   Method& add_argument(const Variable& var) {
     arguments_.push_back(var);
     return *this;
   }
-  const std::vector<Annotation>& annotations() const { return annotations_; }
+  const std::list<Annotation>& annotations() const { return annotations_; }
   Method& add_annotation(const Annotation& annotation) {
     annotations_.push_back(annotation);
     return *this;
@@ -244,29 +237,13 @@ class Method {
   bool constructor_;
   string description_;
   string return_description_;
-  std::vector<Variable> arguments_;
-  std::vector<Annotation> annotations_;
+  std::list<Variable> arguments_;
+  std::list<Annotation> annotations_;
 
   Method(const string& name, const Type& return_type, bool constructor)
     : name_(name), return_type_(return_type), constructor_(constructor) {}
 };
 
-// A piece of code to read from a file.
-class Snippet {
- public:
-  static Snippet Create(const string& fname, Env* env = Env::Default()) {
-    return Snippet(fname, env);
-  }
-  const string& data() const { return data_; }
-
- private:
-  string data_;
-
-  Snippet(const string& fname, Env* env) {
-    TF_CHECK_OK(ReadFileToString(env, fname, &data_));
-  }
-};
-
 }  // namespace java
 }  // namespace tensorflow
 
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index 2da81f2911..a02f75ad6e 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -14,49 +14,328 @@ limitations under the License.
 ==============================================================================*/
 
 #include <string>
+#include <algorithm>
+#include <deque>
 
 #include "tensorflow/java/src/gen/cc/source_writer.h"
 
 namespace tensorflow {
+namespace java {
 
-SourceWriter& SourceWriter::Append(const StringPiece& str) {
-  if (!str.empty()) {
-    if (newline_) {
-      DoAppend(left_margin_ + line_prefix_);
-      newline_ = false;
-    }
-    DoAppend(str);
+SourceWriter::SourceWriter() {
+  // Push an empty generic namespace at start, for simplification.
+  generic_namespaces_.push(new GenericNamespace());
+}
+
+SourceWriter::~SourceWriter() {
+  // Remove empty generic namespace added at start as well as any other
+  // namespace objects that haven't been removed.
+  while (!generic_namespaces_.empty()) {
+    GenericNamespace* generic_namespace = generic_namespaces_.top();
+    generic_namespaces_.pop();
+    delete generic_namespace;
   }
+}
+
+SourceWriter& SourceWriter::Indent(int tab) {
+  left_margin_.resize(
+      std::max(static_cast<int>(left_margin_.size() + tab), 0), ' ');
+  return *this;
+}
+
+SourceWriter& SourceWriter::Prefix(const char* line_prefix) {
+  line_prefix_ = line_prefix;
   return *this;
 }
 
-SourceWriter& SourceWriter::Write(const string& str) {
+SourceWriter& SourceWriter::Write(const StringPiece& str) {
   size_t line_pos = 0;
   do {
     size_t start_pos = line_pos;
     line_pos = str.find('\n', start_pos);
     if (line_pos != string::npos) {
       ++line_pos;
-      Append(StringPiece(str.data() + start_pos, line_pos - start_pos));
+      Append(str.substr(start_pos, line_pos - start_pos));
       newline_ = true;
     } else {
-      Append(StringPiece(str.data() + start_pos, str.size() - start_pos));
+      Append(str.substr(start_pos, str.size() - start_pos));
     }
   } while (line_pos != string::npos && line_pos < str.size());
 
   return *this;
 }
 
+SourceWriter& SourceWriter::WriteFromFile(const string& fname, Env* env) {
+  string data_;
+  TF_CHECK_OK(ReadFileToString(env, fname, &data_));
+  return Write(data_);
+}
+
+SourceWriter& SourceWriter::Append(const StringPiece& str) {
+  if (!str.empty()) {
+    if (newline_) {
+      DoAppend(left_margin_ + line_prefix_);
+      newline_ = false;
+    }
+    DoAppend(str);
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::AppendType(const Type& type) {
+  if (type.kind() == Type::Kind::GENERIC && type.name().empty()) {
+    Append("?");
+  } else {
+    Append(type.name());
+  }
+  if (!type.parameters().empty()) {
+    Append("<");
+    for (const Type& t : type.parameters()) {
+      if (&t != &type.parameters().front()) {
+        Append(", ");
+      }
+      AppendType(t);
+    }
+    Append(">");
+  }
+  return *this;
+}
+
 SourceWriter& SourceWriter::EndLine() {
   Append("\n");
   newline_ = true;
   return *this;
 }
 
-SourceWriter& SourceWriter::Indent(int tab) {
-  left_margin_.resize(std::max(static_cast<int>(left_margin_.size() + tab), 0),
-                      ' ');
+SourceWriter& SourceWriter::BeginMethod(const Method& method, int modifiers) {
+  GenericNamespace* generic_namespace = PushGenericNamespace(modifiers);
+  if (!method.constructor()) {
+    generic_namespace->Visit(method.return_type());
+  }
+  for (const Variable& v : method.arguments()) {
+    generic_namespace->Visit(v.type());
+  }
+  EndLine();
+  WriteDoc(method.description(), method.return_description(),
+      &method.arguments());
+  if (!method.annotations().empty()) {
+    WriteAnnotations(method.annotations());
+  }
+  WriteModifiers(modifiers);
+  if (!generic_namespace->declared_types().empty()) {
+    WriteGenerics(generic_namespace->declared_types());
+    Append(" ");
+  }
+  if (!method.constructor()) {
+    AppendType(method.return_type()).Append(" ");
+  }
+  Append(method.name()).Append("(");
+  for (const Variable& v : method.arguments()) {
+    if (&v != &method.arguments().front()) {
+      Append(", ");
+    }
+    AppendType(v.type()).Append(v.variadic() ? "... " : " ").Append(v.name());
+  }
+  return Append(")").BeginBlock();
+}
+
+SourceWriter& SourceWriter::EndMethod() {
+  EndBlock();
+  PopGenericNamespace();
+  return *this;
+}
+
+SourceWriter& SourceWriter::BeginType(const Type& type,
+    const std::list<Type>* dependencies, int modifiers) {
+  if (!type.package().empty()) {
+    Append("package ").Append(type.package()).Append(";").EndLine();
+  }
+  if (dependencies != nullptr && !dependencies->empty()) {
+    TypeImporter type_importer(type.package());
+    for (const Type& t : *dependencies) {
+      type_importer.Visit(t);
+    }
+    EndLine();
+    for (const string& s : type_importer.imports()) {
+      Append("import ").Append(s).Append(";").EndLine();
+    }
+  }
+  return BeginInnerType(type, modifiers);
+}
+
+SourceWriter& SourceWriter::BeginInnerType(const Type& type, int modifiers) {
+  GenericNamespace* generic_namespace = PushGenericNamespace(modifiers);
+  generic_namespace->Visit(type);
+  EndLine();
+  WriteDoc(type.description());
+  if (!type.annotations().empty()) {
+    WriteAnnotations(type.annotations());
+  }
+  WriteModifiers(modifiers);
+  CHECK_EQ(Type::Kind::CLASS, type.kind()) << ": Not supported yet";
+  Append("class ").Append(type.name());
+  if (!generic_namespace->declared_types().empty()) {
+    WriteGenerics(generic_namespace->declared_types());
+  }
+  if (!type.supertypes().empty()) {
+    bool first_interface = true;
+    for (const Type& t : type.supertypes()) {
+      if (t.kind() == Type::CLASS) {  // superclass is always first in list
+        Append(" extends ");
+      } else if (first_interface) {
+        Append(" implements ");
+        first_interface = false;
+      } else {
+        Append(", ");
+      }
+      AppendType(t);
+    }
+  }
+  return BeginBlock();
+}
+
+SourceWriter& SourceWriter::EndType() {
+  EndBlock();
+  PopGenericNamespace();
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteFields(const std::list<Variable>& fields,
+    int modifiers) {
+  EndLine();
+  for (const Variable& v : fields) {
+    WriteModifiers(modifiers);
+    AppendType(v.type()).Append(" ").Append(v.name()).Append(";");
+    EndLine();
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteModifiers(int modifiers) {
+  if (modifiers & PUBLIC) {
+    Append("public ");
+  } else if (modifiers & PROTECTED) {
+    Append("protected ");
+  } else if (modifiers & PRIVATE) {
+    Append("private ");
+  }
+  if (modifiers & STATIC) {
+    Append("static ");
+  }
+  if (modifiers & FINAL) {
+    Append("final ");
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::WriteDoc(const string& description,
+    const string& return_description, const std::list<Variable>* parameters) {
+  if (description.empty() && return_description.empty()
+      && (parameters == nullptr || parameters->empty())) {
+    return *this;  // no doc to write
+  }
+  bool do_line_break = false;
+  Append("/**").EndLine().Prefix(" * ");
+  if (!description.empty()) {
+    Write(description).EndLine();
+    do_line_break = true;
+  }
+  if (parameters != nullptr && !parameters->empty()) {
+    if (do_line_break) {
+      EndLine();
+      do_line_break = false;
+    }
+    for (const Variable& v : *parameters) {
+      Append("@param ").Append(v.name());
+      if (!v.description().empty()) {
+        Append(" ").Write(v.description());
+      }
+      EndLine();
+    }
+  }
+  if (!return_description.empty()) {
+    if (do_line_break) {
+      EndLine();
+      do_line_break = false;
+    }
+    Append("@return ").Write(return_description).EndLine();
+  }
+  return Prefix("").Append(" **/").EndLine();
+}
+
+SourceWriter& SourceWriter::WriteAnnotations(
+    const std::list<Annotation>& annotations) {
+  for (const Annotation& a : annotations) {
+    Append("@" + a.name());
+    if (!a.attributes().empty()) {
+      Append("(").Append(a.attributes()).Append(")");
+    }
+    EndLine();
+  }
   return *this;
 }
 
+SourceWriter& SourceWriter::WriteGenerics(
+    const std::list<const Type*>& generics) {
+  Append("<");
+  for (const Type* pt : generics) {
+    if (pt != generics.front()) {
+      Append(", ");
+    }
+    Append(pt->name());
+    if (!pt->supertypes().empty()) {
+      Append(" extends ").AppendType(pt->supertypes().front());
+    }
+  }
+  return Append(">");
+}
+
+SourceWriter::GenericNamespace* SourceWriter::PushGenericNamespace(
+    int modifiers) {
+  GenericNamespace* generic_namespace;
+  if (modifiers & STATIC) {
+    generic_namespace = new GenericNamespace();
+  } else {
+    generic_namespace = new GenericNamespace(generic_namespaces_.top());
+  }
+  generic_namespaces_.push(generic_namespace);
+  return generic_namespace;
+}
+
+void SourceWriter::PopGenericNamespace() {
+  GenericNamespace* generic_namespace = generic_namespaces_.top();
+  generic_namespaces_.pop();
+  delete generic_namespace;
+}
+
+void SourceWriter::TypeVisitor::Visit(const Type& type) {
+  DoVisit(type);
+  for (const Type& t : type.parameters()) {
+    DoVisit(t);
+  }
+  for (const Annotation& t : type.annotations()) {
+    DoVisit(t);
+  }
+  for (const Type& t : type.supertypes()) {
+    DoVisit(t);
+  }
+}
+
+void SourceWriter::GenericNamespace::DoVisit(const Type& type) {
+  // ignore non-generic parameters, wildcards and generics already declared
+  if (type.kind() == Type::GENERIC
+      && !type.IsWildcard()
+      && generic_names_.find(type.name()) == generic_names_.end()) {
+    declared_types_.push_back(&type);
+    generic_names_.insert(type.name());
+  }
+}
+
+void SourceWriter::TypeImporter::DoVisit(const Type& type) {
+  if (!type.package().empty() && type.package() != current_package_) {
+    imports_.insert(type.package() + '.' + type.name());
+  }
+}
+
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index bff26eb185..637072c0df 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -17,44 +17,23 @@ limitations under the License.
 #define TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
 
 #include <string>
+#include <stack>
+#include <list>
+#include <set>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/java/src/gen/cc/java_defs.h"
 
 namespace tensorflow {
+namespace java {
 
-// A utility class for writing source code, normally generated at
-// compile-time.
-//
-// Source writers are language-agnostic and therefore only expose generic
-// methods common to most languages. Extend or wrap this class to implement
-// language-specific features.
-//
-// Note: if you are looking to reuse this class for generating code in another
-// language than Java, please do by moving it at the '//tensorflow/core/lib/io'
-// level.
+// A class for writing Java source code.
 class SourceWriter {
  public:
-  virtual ~SourceWriter() = default;
-
-  // Returns true if the writer is at the beginnig of a new line
-  bool newline() const { return newline_; }
-
-  // Appends a piece of code or text.
-  //
-  // It is expected that no newline character is present in the data provided,
-  // otherwise Write() must be used.
-  SourceWriter& Append(const StringPiece& str);
+  SourceWriter();
 
-  // Writes a block of code or text.
-  //
-  // The data might potentially contain newline characters, therefore it will
-  // be scanned to ensure that each line is indented and prefixed properly,
-  // making it a bit slower than Append().
-  SourceWriter& Write(const string& text);
-
-  // Appends a newline character and start writing on a new line.
-  SourceWriter& EndLine();
+  virtual ~SourceWriter();
 
   // Indents following lines with white spaces.
   //
@@ -75,18 +54,166 @@ class SourceWriter {
   // Indent(2)->Prefix("//") will result in prefixing lines with "  //".
   //
   // An empty value ("") will remove any line prefix that was previously set.
-  SourceWriter& Prefix(const char* line_prefix) {
-    line_prefix_ = line_prefix;
-    return *this;
+  SourceWriter& Prefix(const char* line_prefix);
+
+  // Writes a source code snippet.
+  //
+  // The data might potentially contain newline characters, therefore it will
+  // be scanned to ensure that each line is indented and prefixed properly,
+  // making it a bit slower than Append().
+  SourceWriter& Write(const StringPiece& text);
+
+  // Writes a source code snippet read from a file.
+  //
+  // All lines of the file at the provided path will be read and written back
+  // to the output of this writer in regard of its current attributes (e.g.
+  // the indentation, prefix, etc.)
+  SourceWriter& WriteFromFile(const string& fname, Env* env = Env::Default());
+
+  // Appends a piece of source code.
+  //
+  // It is expected that no newline character is present in the data provided,
+  // otherwise Write() must be used.
+  SourceWriter& Append(const StringPiece& str);
+
+  // Appends a type to the current line.
+  //
+  // The type is written in its simple form (i.e. not prefixed by its package)
+  // and followed by any parameter types it has enclosed in brackets (<>).
+  SourceWriter& AppendType(const Type& type);
+
+  // Appends a newline character.
+  //
+  // Data written after calling this method will start on a new line, in respect
+  // of the current indentation.
+  SourceWriter& EndLine();
+
+  // Begins a block of source code.
+  //
+  // This method appends a new opening brace to the current data and indent the
+  // next lines according to Google Java Style Guide. The block can optionally
+  // be preceded by an expression (e.g. Append("if(true)").BeginBlock();)
+  SourceWriter& BeginBlock() {
+    return Append(newline_ ? "{" : " {").EndLine().Indent(2);
+  }
+
+  // Ends the current block of source code.
+  //
+  // This method appends a new closing brace to the current data and outdent the
+  // next lines back to the margin used before BeginBlock() was invoked.
+  SourceWriter& EndBlock() {
+    return Indent(-2).Append("}").EndLine();
   }
 
+  // Begins to write a method.
+  //
+  // This method outputs the signature of the Java method from the data passed
+  // in the 'method' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // method.
+  SourceWriter& BeginMethod(const Method& method, int modifiers = 0);
+
+  // Ends the current method.
+  //
+  // This method ends the block of code that has begun when invoking
+  // BeginMethod() prior to this.
+  SourceWriter& EndMethod();
+
+  // Begins to write the main type of a source file.
+  //
+  // This method outputs the declaration of the Java type from the data passed
+  // in the 'type' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // type.
+  //
+  // If not null, all types found in the 'dependencies' list will be imported
+  // before declaring the new type.
+  SourceWriter& BeginType(const Type& clazz,
+      const std::list<Type>* dependencies, int modifiers = 0);
+
+  // Begins to write a new inner type.
+  //
+  // This method outputs the declaration of the Java type from the data passed
+  // in the 'type' parameter and starts a new block. Additionnal modifiers can
+  // also be passed in parameter to define the accesses and the scope of this
+  // type.
+  SourceWriter& BeginInnerType(const Type& type, int modifiers = 0);
+
+  // Ends the current type.
+  //
+  // This method ends the block of code that has begun when invoking
+  // BeginType() or BeginInnerType() prior to this.
+  SourceWriter& EndType();
+
+  // Writes a list of variables as fields of a type.
+  //
+  // This method must be called within the definition of a type (see BeginType()
+  // or BeginInnerType()). Additional modifiers can also be passed in parameter
+  // to define the accesses and the scope of those fields.
+  SourceWriter& WriteFields(const std::list<Variable>& fields,
+      int modifiers = 0);
+
  protected:
   virtual void DoAppend(const StringPiece& str) = 0;
 
  private:
+  // A utility base class for visiting elements of a type.
+  class TypeVisitor {
+   public:
+    virtual ~TypeVisitor() = default;
+    void Visit(const Type& type);
+
+   protected:
+    virtual void DoVisit(const Type& type) = 0;
+  };
+
+  // A utility class for keeping track of declared generics in a given scope.
+  class GenericNamespace : public TypeVisitor {
+   public:
+    GenericNamespace() = default;
+    explicit GenericNamespace(const GenericNamespace* parent)
+      : generic_names_(parent->generic_names_) {}
+    std::list<const Type*> declared_types() {
+      return declared_types_;
+    }
+   protected:
+    virtual void DoVisit(const Type& type);
+
+   private:
+    std::list<const Type*> declared_types_;
+    std::set<string> generic_names_;
+  };
+
+  // A utility class for collecting a list of import statements to declare.
+  class TypeImporter : public TypeVisitor {
+   public:
+    explicit TypeImporter(const string& current_package)
+      : current_package_(current_package) {}
+    virtual ~TypeImporter() = default;
+    const std::set<string> imports() {
+      return imports_;
+    }
+   protected:
+    virtual void DoVisit(const Type& type);
+
+   private:
+    string current_package_;
+    std::set<string> imports_;
+  };
+
   string left_margin_;
   string line_prefix_;
   bool newline_ = true;
+  std::stack<GenericNamespace*> generic_namespaces_;
+
+  SourceWriter& WriteModifiers(int modifiers);
+  SourceWriter& WriteDoc(const string& description,
+    const string& return_description = "",
+    const std::list<Variable>* parameters = nullptr);
+  SourceWriter& WriteAnnotations(const std::list<Annotation>& annotations);
+  SourceWriter& WriteGenerics(const std::list<const Type*>& generics);
+  GenericNamespace* PushGenericNamespace(int modifiers);
+  void PopGenericNamespace();
 };
 
 // A writer that outputs source code into a file.
@@ -128,6 +255,7 @@ class SourceBufferWriter : public SourceWriter {
   string* buffer_;
 };
 
+}  // namespace java
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index e973895754..4bce2fea70 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/java/src/gen/cc/source_writer.h"
+#include <list>
+
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/java/src/gen/cc/java_defs.h"
+#include "tensorflow/java/src/gen/cc/source_writer.h"
 
 namespace tensorflow {
+namespace java {
 namespace {
 
 TEST(AppendTest, SingleLineText) {
@@ -211,5 +215,368 @@ TEST(MarginTest, EmptyPrefix) {
   ASSERT_STREQ(expected, writer.str().data());
 }
 
+TEST(StreamTest, BlocksAndLines) {
+  SourceBufferWriter writer;
+
+  writer.Append("int i = 0;").EndLine()
+        .Append("int j = 10;").EndLine()
+        .Append("if (true)")
+        .BeginBlock()
+          .Append("int aLongWayToTen = 0;").EndLine()
+          .Append("while (++i <= j)")
+          .BeginBlock()
+            .Append("++aLongWayToTen;").EndLine()
+          .EndBlock()
+        .EndBlock();
+
+  const char* expected =
+      "int i = 0;\n"
+      "int j = 10;\n"
+      "if (true) {\n"
+      "  int aLongWayToTen = 0;\n"
+      "  while (++i <= j) {\n"
+      "    ++aLongWayToTen;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(StreamTest, Types) {
+  SourceBufferWriter writer;
+  Type generic = Type::Generic("T").add_supertype(Type::Class("Number"));
+
+  writer.AppendType(Type::Int()).Append(", ")
+        .AppendType(Type::Class("String")).Append(", ")
+        .AppendType(generic).Append(", ")
+        .AppendType(Type::ListOf(generic)).Append(", ")
+        .AppendType(Type::ListOf(Type::IterableOf(generic))).Append(", ")
+        .AppendType(Type::ListOf(Type::Generic()));
+
+  const char* expected =
+      "int, String, T, List<T>, List<Iterable<T>>, List<?>";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(StreamTest, FileSnippet) {
+  SourceBufferWriter writer;
+  const string fname = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(),
+      "java/src/gen/resources/test.java.snippet");
+
+  writer.WriteFromFile(fname)
+        .BeginBlock()
+        .WriteFromFile(fname)
+        .EndBlock();
+
+  const char* expected =
+      "// Here is a little snippet\n"
+      "System.out.println(\"Hello!\");\n"
+      "{\n"
+      "  // Here is a little snippet\n"
+      "  System.out.println(\"Hello!\");\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleClassWithDependencies) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  std::list<Type> deps;
+  deps.push_back(Type::Class("TypeA", "org.test.sub"));
+  deps.push_back(Type::Class("TypeA", "org.test.sub"));  // a second time
+  deps.push_back(Type::Class("TypeB", "org.other"));
+  deps.push_back(Type::Class("SamePackageType", "org.tensorflow"));
+  deps.push_back(Type::Class("NoPackageType"));
+
+  writer.BeginType(clazz, &deps, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "import org.other.TypeB;\n"
+      "import org.test.sub.TypeA;\n\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, AnnotatedAndDocumentedClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  clazz.description("This class has a\n<p>\nmultiline description.");
+  clazz.add_annotation(Annotation::Create("Bean"));
+  clazz.add_annotation(Annotation::Create("SuppressWarnings")
+      .attributes("\"rawtypes\""));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "/**\n"
+      " * This class has a\n"
+      " * <p>\n"
+      " * multiline description.\n"
+      " **/\n"
+      "@Bean\n"
+      "@SuppressWarnings(\"rawtypes\")\n"
+      "public class Test {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  clazz.add_parameter(Type::Generic("T"));
+  clazz.add_parameter(Type::Generic("U").add_supertype(Type::Class("Number")));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T, U extends Number> {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClassAndSupertypes) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T");
+  clazz.add_parameter(type_t);
+  Type type_u = Type::Generic("U").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_u);
+  clazz.add_supertype(Type::Interface("Parametrizable").add_parameter(type_u));
+  clazz.add_supertype(Type::Interface("Runnable"));
+  clazz.add_supertype(Type::Class("SuperTest").add_parameter(type_t));
+
+  writer.BeginType(clazz, nullptr, PUBLIC).EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T, U extends Number>"
+      " extends SuperTest<T> implements Parametrizable<U>, Runnable {\n}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, ParameterizedClassFields) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  std::list<Variable> static_fields;
+  static_fields.push_back(Variable::Create("field1", Type::Class("String")));
+  std::list<Variable> member_fields;
+  member_fields.push_back(Variable::Create("field2", Type::Class("String")));
+  member_fields.push_back(Variable::Create("field3", type_t));
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .WriteFields(static_fields, STATIC | PUBLIC | FINAL)
+          .WriteFields(member_fields, PRIVATE)
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static final String field1;\n"
+      "  \n"
+      "  private String field2;\n"
+      "  private T field3;\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, SimpleInnerClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type inner_class = Type::Class("InnerTest");
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginInnerType(inner_class, PUBLIC)
+          .EndType()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  public class InnerTest {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteType, StaticParameterizedInnerClass) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Type inner_class = Type::Class("InnerTest");
+  inner_class.add_parameter(type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginInnerType(inner_class, PUBLIC | STATIC)
+          .EndType()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static class InnerTest<T extends Number> {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, SimpleMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("doNothing", Type::Void());
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC).EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  public void doNothing() {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, AnnotatedAndDocumentedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("doNothing", Type::Void());
+  method.description("This method has a\n<p>\nmultiline description.");
+  method.add_annotation(Annotation::Create("Override"));
+  method.add_annotation(Annotation::Create("SuppressWarnings")
+      .attributes("\"rawtypes\""));
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC).EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  /**\n"
+      "   * This method has a\n"
+      "   * <p>\n"
+      "   * multiline description.\n"
+      "   **/\n"
+      "  @Override\n"
+      "  @SuppressWarnings(\"rawtypes\")\n"
+      "  public void doNothing() {\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, DocumentedMethodWithArguments) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Method method = Method::Create("boolToInt", Type::Int());
+  method.description("Converts a boolean to an int");
+  method.return_description("int value for this boolean");
+  method.add_argument(Variable::Create("b", Type::Boolean()));
+  Variable reverse = Variable::Create("reverse", Type::Boolean());
+  reverse.description("if true, value is reversed");
+  method.add_argument(reverse);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC)
+            .Append("if (b && !reverse)")
+            .BeginBlock()
+              .Append("return 1;").EndLine()
+            .EndBlock()
+          .Append("return 0;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test {\n"
+      "  \n"
+      "  /**\n"
+      "   * Converts a boolean to an int\n"
+      "   * \n"
+      "   * @param b\n"
+      "   * @param reverse if true, value is reversed\n"
+      "   * @return int value for this boolean\n"
+      "   **/\n"
+      "  public int boolToInt(boolean b, boolean reverse) {\n"
+      "    if (b && !reverse) {\n"
+      "      return 1;\n"
+      "    }\n"
+      "    return 0;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, ParameterizedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Method method = Method::Create("doNothing", type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC)
+            .Append("return null;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public T doNothing() {\n"
+      "    return null;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteMethod, StaticParameterizedMethod) {
+  SourceBufferWriter writer;
+  Type clazz = Type::Class("Test", "org.tensorflow");
+  Type type_t = Type::Generic("T").add_supertype(Type::Class("Number"));
+  clazz.add_parameter(type_t);
+  Method method = Method::Create("doNothing", type_t);
+
+  writer.BeginType(clazz, nullptr, PUBLIC)
+          .BeginMethod(method, PUBLIC | STATIC)
+            .Append("return null;").EndLine()
+          .EndMethod()
+        .EndType();
+
+  const char* expected =
+      "package org.tensorflow;\n\n"
+      "public class Test<T extends Number> {\n"
+      "  \n"
+      "  public static <T extends Number> T doNothing() {\n"
+      "    return null;\n"
+      "  }\n"
+      "}\n";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
 }  // namespace
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/resources/test.java.snippet b/tensorflow/java/src/gen/resources/test.java.snippet
new file mode 100644
index 0000000000..5e412a9aef
--- /dev/null
+++ b/tensorflow/java/src/gen/resources/test.java.snippet
@@ -0,0 +1,2 @@
+// Here is a little snippet
+System.out.println("Hello!");
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index 5e6b5acdb0..c046e9cfd4 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.client import timeline
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -155,9 +156,7 @@ class TimelineTest(test.TestCase):
     ctf = step_analysis.chrome_trace.format_to_string()
     self._validateTrace(ctf)
     maximums = step_analysis.allocator_maximums
-    cpuname = 'cpu'
-    if 'mklcpu' in maximums:
-      cpuname = 'mkl' + cpuname
+    cpuname = 'mklcpu' if test_util.IsMklEnabled() else 'cpu'
     self.assertTrue(cpuname in maximums)
     cpu_max = maximums[
         'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums[cpuname]
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 535361498a..9a08259653 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -253,7 +253,7 @@ def add_execution_callback(callback):
       `f(op_type, op_name, attrs, inputs, outputs)`.
       `op_type` is the type of the operation that was just executed (e.g.,
         `MatMul`).
-      `op_name` is the name of the operation that has was just executed. This
+      `op_name` is the name of the operation that was just executed. This
         name is set by the client who created the operation and can be `None` if
         it is unset.
       `attrs` contains the attributes of the operation as a `tuple` of
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 36a86a25cc..1e5c118cbc 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -618,7 +618,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
     for dtype in [dtypes.float32]:
       for kernel_size in [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]:
         tol = 1e-2
-        # Check orthogonality by computing the 2-norms of the inputs and ouputs.
+        # Check orthogonality by computing the 2-norms of the inputs and outputs.
         if len(kernel_size) == 1:
           shape = [4, 32, 64]
           convolution = convolutional.conv1d
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 7be8628073..fb53d9ffea 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -833,6 +833,9 @@ class GradLoopState(object):
     if outer_grad_state:
       outer_forward_ctxt = outer_grad_state.forward_context
     else:
+      if not hasattr(forward_ctxt, 'outer_context'):
+        raise ValueError("Failed to call gradients on a while loop without"
+                         "properly serializing graph via MetaGraphDef")
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 4b57e2de79..908e793902 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -218,7 +218,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
         The rows store: `[batch, time]`.
       `decoded.values`: Values vector, size `(total_decoded_outputs)`.
         The vector stores the decoded classes.
-      `decoded.shape`: Shape vector, size `(2)`.
+      `decoded.dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
     neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
         sequence found, the negative of the sum of the greatest logit at each
@@ -265,7 +265,7 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
         The rows store: [batch, time].
       `decoded[j].values`: Values vector, size `(total_decoded_outputs[j])`.
         The vector stores the decoded classes for beam j.
-      `decoded[j].shape`: Shape vector, size `(2)`.
+      `decoded[j].dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length[j]]`.
     log_probability: A `float` matrix `(batch_size x top_paths)` containing
         sequence log-probabilities.
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 9eacac1b37..dfa07abfc6 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -95,7 +95,7 @@ def custom_gradient(f):
     if not context.executing_eagerly():
       if kwargs:
         raise ValueError(
-            "The custom_gradient decorator currently suports keywords "
+            "The custom_gradient decorator currently supports keywords "
             "arguments only when eager execution is enabled.")
       name = "CustomGradient-%s" % ops.uid()
       args = [ops.convert_to_tensor(x) for x in args]
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index d2cc87555f..cb725199a8 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1769,7 +1769,9 @@ class StagingArea(BaseStagingArea):
     its capacity.
 
     Args:
-      values: Tensor (or a tuple of Tensors) to place into the staging area.
+      values: A single tensor, a list or tuple of tensors, or a dictionary with
+        tensor values. The number of elements must match the length of the
+        list provided to the dtypes argument when creating the StagingArea.
       name: A name for the operation (optional).
 
     Returns:
@@ -1780,11 +1782,12 @@ class StagingArea(BaseStagingArea):
     """
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
+      
+      if not isinstance(values, (list, tuple, dict)):
+        values = [values]
 
       # Hard-code indices for this staging area
-      indices = (
-          list(six.moves.range(len(values)))
-          if isinstance(values, (list, tuple)) else None)
+      indices = list(six.moves.range(len(values)))
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index c7513d5b40..193c787baa 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -166,8 +166,7 @@ class LinearOperator(object):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index ecd30e4d7e..0292bc51dc 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -134,8 +134,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.  Default is the individual
         operators names joined with `_o_`.
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index e180e83026..5beaea65a5 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -132,8 +132,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index f979fb37d6..5ba3b090ae 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -125,8 +125,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 50f3d407e8..45929eb4e2 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -236,8 +236,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
@@ -576,8 +575,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index a5130188b6..c4d386ccb4 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -133,8 +133,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 78bc024c0d..c6b2dcdf98 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -538,7 +538,7 @@ class DistributionStrategy(object):
   in the distributed vs. single tower cases.
   """
 
-  # TODO(josh11b): Raise an exception if variable paritioning requested before
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
   #   we add support.
   # TODO(josh11b): Also `parameter_device_index` property?
   # TODO(josh11b): `map()`
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 360e02fb44..a00ceb9021 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -229,10 +229,14 @@ class SessionManager(object):
     up to `max_wait_secs`, for recovery to succeed.
 
     If the model cannot be recovered successfully then it is initialized by
-    either running the provided `init_op`, or calling the provided `init_fn`.
-    The local_init_op is also run after init_op and init_fn, regardless of
+    running the `init_op` and calling `init_fn` if they are provided.
+    The `local_init_op` is also run after init_op and init_fn, regardless of
     whether the model was recovered successfully, but only if
-    ready_for_local_init_op passes.
+    `ready_for_local_init_op` passes.
+
+    If the model is recovered from a checkpoint it is assumed that all
+    global variables have been initialized, in particular neither `init_op`
+    nor `init_fn` will be executed.
 
     It is an error if the model cannot be recovered and no `init_op`
     or `init_fn` or `local_init_op` are passed.
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index e1edd62cc5..124ad82e91 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
 
 set -ex
 
-GOLANG_URL="https://storage.googleapis.com/golang/go1.9.2.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.10.linux-amd64.tar.gz"
 
 sudo mkdir -p /usr/local
 wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 7b2d7e1a56..d654b433e7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -120,7 +120,9 @@ function run_configure_for_gpu_build {
   export TF_CUDA_VERSION=9.0
   export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
   export TF_CUDNN_VERSION=7.0
-  export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  if [ -z "$CUDNN_INSTALL_PATH" ]; then
+    export CUDNN_INSTALL_PATH="C:/tools/cuda"
+  fi
   export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index e2d212a0db..8f0cf8c3d1 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -139,7 +139,9 @@ function main() {
     fi
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
+    # TODO(aselle): Re-enable this when we find a way to do it without doubling
+    # the whl size (over the limit).
+    # cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index cfad0f70c9..6511a50b3b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.7.0-rc1'
+_VERSION = '1.7.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From f9c5e71104cb30583127fdc918591cc7604f17ca Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Wed, 11 Apr 2018 09:51:10 +0800
Subject: [PATCH 0569/1262] Add missing TF_ATTRIBUTE_WEAK for MSVC (#18303)

---
 tensorflow/core/platform/macros.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 1b1faed703..3723968175 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -31,13 +31,14 @@ limitations under the License.
   __attribute__((__format__(__printf__, string_index, first_to_check)))
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) \
   __attribute__((__format__(__scanf__, string_index, first_to_check)))
-#elif defined(COMPILER_MSVC)
+#elif defined(_MSC_VER)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
-#define TF_ATTRIBUTE_ALWAYS_INLINE
+#define TF_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
+#define TF_ATTRIBUTE_WEAK
 #define TF_MUST_USE_RESULT
 #define TF_PACKED
 #define TF_PRINTF_ATTRIBUTE(string_index, first_to_check)
@@ -57,7 +58,7 @@ limitations under the License.
 #endif
 
 // Control visiblity outside .so
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_EXPORT __declspec(dllexport)
 #else
@@ -65,7 +66,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 
 #ifdef __has_builtin
 #define TF_HAS_BUILTIN(x) __has_builtin(x)
-- 
GitLab


From 963ad0ff75d880861df20266652b263a9e32f0c7 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 10 Apr 2018 18:50:58 -0700
Subject: [PATCH 0570/1262] Remove BN workaround for resource variable
 gradients bug that was recently fixed.

PiperOrigin-RevId: 192388867
---
 .../keras/_impl/keras/layers/normalization.py | 33 +------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index b60d864ae5..b73025a5a8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import distribute as distribute_lib
@@ -171,7 +170,6 @@ class BatchNormalization(Layer):
 
     self.fused = fused
     self._bessels_correction_test_only = True
-    self._use_resource_variables = None
 
     if renorm:
       renorm_clipping = renorm_clipping or {}
@@ -277,27 +275,6 @@ class BatchNormalization(Layer):
         for idx, x in enumerate(self.axis):
           self.axis[idx] = x + 1      # Account for added dimension
 
-    # BUG: when using fused BN with Resource Variables with a dynamic
-    # `training` argument in call, the cond
-    # `smart_cond(
-    #     training,
-    #     _fused_batch_norm_training,
-    #     _fused_batch_norm_inference)` triggers None gradients for the
-    # variables gamma and beta.
-    # In this case we choose to force normal variables when possible.
-    # The bug will not occur of `training` is static, or when
-    # not using fused BN, or when in eager execution.
-    # TODO(fchollet): remove code below when bug is fixed.
-    use_resource = False
-    if context.executing_eagerly():
-      use_resource = True  # Eager execution requires resource variables.
-    elif not self.fused:
-      use_resource = True  # Issue only exists with fused BN.
-    elif self._use_resource_variables is True:
-      use_resource = True  # Case of a subclassed model, always use RVs.
-    if hasattr(self, '_scope'):
-      use_resource = None  # Legacy layers, leave it to `add_weight`.
-
     if self.scale:
       self.gamma = self.add_variable(
           name='gamma',
@@ -306,7 +283,6 @@ class BatchNormalization(Layer):
           initializer=self.gamma_initializer,
           regularizer=self.gamma_regularizer,
           constraint=self.gamma_constraint,
-          use_resource=use_resource,
           trainable=True)
     else:
       self.gamma = None
@@ -322,7 +298,6 @@ class BatchNormalization(Layer):
           initializer=self.beta_initializer,
           regularizer=self.beta_regularizer,
           constraint=self.beta_constraint,
-          use_resource=use_resource,
           trainable=True)
     else:
       self.beta = None
@@ -531,13 +506,7 @@ class BatchNormalization(Layer):
         outputs = array_ops.reshape(outputs, original_shape)
         return outputs
 
-    # Gradient bug when using fused BN with dynamic `training` and resource
-    # variables. TODO(fchollet): remove workaround when bug fixed.
-    use_fused_bn = (
-        self.fused and
-        (tf_utils.constant_value(training) is not None or
-         not isinstance(self.gamma, resource_variable_ops.ResourceVariable)))
-    if use_fused_bn:
+    if self.fused:
       outputs = self._fused_batch_norm(inputs, training=training)
       if self.virtual_batch_size is not None:
         # Currently never reaches here since fused_batch_norm does not support
-- 
GitLab


From b675450000753ff77e7a39a9ea84a59210781ea7 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 10 Apr 2018 19:01:33 -0700
Subject: [PATCH 0571/1262] Checkpointable: remove colocation constraints from
 restore ops

Mystery solved thanks to log_device_placement.

PiperOrigin-RevId: 192389574
---
 .../eager/python/checkpointable_utils_test.py      | 10 ++++------
 tensorflow/python/training/optimizer.py            | 14 +++++++-------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 36670aa210..b344d50e7f 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -764,9 +764,8 @@ class CheckpointingTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
 
     root = checkpointable.Checkpointable()
-    with ops.device("/cpu:0"):
-      root.var = checkpointable_utils.add_variable(
-          root, name="var", initializer=0.)
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
     if context.executing_eagerly():
       optimizer.minimize(root.var.read_value)
@@ -796,9 +795,8 @@ class CheckpointingTests(test.TestCase):
         new_root).restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    with ops.device("/cpu:0"):
-      new_root.var = checkpointable_utils.add_variable(
-          new_root, name="var", shape=[])
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(new_root.var))
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 46a58a9adf..f126d3847b 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -818,13 +818,13 @@ class Optimizer(
           if restored_initial_value is not None:
             initial_value = restored_initial_value
         v = variable_scope.variable(initial_value, name=name, trainable=False)
-        # Restore this variable by name if necessary, but don't add a
-        # Checkpointable dependency. Optimizers return the current graph's
-        # non-slot variables from _checkpoint_dependencies explicitly rather
-        # than unconditionally adding dependencies (since there may be multiple
-        # non-slot variables with the same name in different graphs, trying to
-        # save all of them would result in errors).
-        self._handle_deferred_dependencies(name=name, checkpointable=v)
+      # Restore this variable by name if necessary, but don't add a
+      # Checkpointable dependency. Optimizers return the current graph's
+      # non-slot variables from _checkpoint_dependencies explicitly rather
+      # than unconditionally adding dependencies (since there may be multiple
+      # non-slot variables with the same name in different graphs, trying to
+      # save all of them would result in errors).
+      self._handle_deferred_dependencies(name=name, checkpointable=v)
       self._non_slot_dict[key] = v
 
     return v
-- 
GitLab


From 531e71b799bb8803d7357a501f38bed5c7141921 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 10 Apr 2018 19:20:58 -0700
Subject: [PATCH 0572/1262] experimental C API: Fix compilation failure in
 Windows.

The functions added in
https://github.com/tensorflow/tensorflow/commit/be917027e37c5e8f21f6ba07f24bdbf072cf6dfd
are temporary, and their existence breaks compilation in MSVC because of
https://docs.microsoft.com/en-us/cpp/c-language/maximum-string-length
and
https://docs.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2026

So just disabling it in Windows for now.

PiperOrigin-RevId: 192391164
---
 tensorflow/c/BUILD                 |  1 +
 tensorflow/c/c_api_experimental.cc | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 2367014cd0..8a9301d584 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -122,6 +122,7 @@ tf_cuda_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
     ],
 )
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index e82a546092..9678ee926f 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 using tensorflow::FunctionDef;
@@ -189,6 +190,12 @@ library {
 //  be deleted by calling TF_DeleteFunction.
 static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
     const char* file_path, std::string* dataset_name, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return std::vector<UniqueFuncPtr>();
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -7067,6 +7074,7 @@ library {
         DCHECK(found);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
 
 //  On success, returns a set of TF_Function instances encoding a dataset
@@ -7076,6 +7084,12 @@ library {
 static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
     const char* file_path, int batch_size, std::string* dataset_name,
     TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -8205,6 +8219,7 @@ library {
         DCHECK(found_batch_size);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
 
 // Adds the input functions to `graph`.  On success, returns the created
-- 
GitLab


From 44adf97426c6e1f218010a4a16190b5ec0a9f4df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 19:31:05 -0700
Subject: [PATCH 0573/1262] [XLA] Redesign: implement and test BatchNormXXX.

PiperOrigin-RevId: 192391748
---
 .../xla/client/xla_client/xla_builder.cc      | 58 +++++++++++-
 tensorflow/compiler/xla/tests/BUILD           |  4 +-
 .../xla/tests/batch_normalization_test.cc     | 93 +++++++++----------
 3 files changed, 102 insertions(+), 53 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index ba76001c78..40bafdb5c1 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1489,21 +1489,73 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
 XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
                                     const XlaOp& offset, float epsilon,
                                     int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferBatchNormTrainingShape(
+            operand_shape, scale_shape, offset_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormTraining,
+                          {operand, scale, offset});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
                                      const XlaOp& offset, const XlaOp& mean,
                                      const XlaOp& variance, float epsilon,
                                      int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean));
+    TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormInferenceShape(
+                            operand_shape, scale_shape, offset_shape,
+                            mean_shape, variance_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormInference,
+                          {operand, scale, offset, mean, variance});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                                 const XlaOp& batch_mean, const XlaOp& batch_var,
                                 const XlaOp& grad_output, float epsilon,
                                 int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var));
+    TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormGradShape(
+                            operand_shape, scale_shape, batch_mean_shape,
+                            batch_var_shape, grad_output_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormGrad,
+                          {operand, scale, batch_mean, batch_var, grad_output});
+  });
 }
 
 XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 2a2ef229ed..74ea1a0f39 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -860,11 +860,11 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index af8af99c79..f3dac75a44 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -69,14 +69,12 @@ class BatchNormalizationTest
     CHECK_EQ(kY, input_array_.width());
   }
 
-  ComputationDataHandle CheckShape(ComputationBuilder* b,
-                                   const ComputationDataHandle& operand,
-                                   const Shape& expected_shape) const {
-    std::unique_ptr<Shape> actual_shape =
-        b->GetShape(operand).ConsumeValueOrDie();
-    CHECK(ShapeUtil::Equal(expected_shape, *actual_shape))
+  XlaOp CheckShape(XlaBuilder* b, const XlaOp& operand,
+                   const Shape& expected_shape) const {
+    Shape actual_shape = b->GetShape(operand).ConsumeValueOrDie();
+    CHECK(ShapeUtil::Equal(expected_shape, actual_shape))
         << "want " << ShapeUtil::HumanString(expected_shape) << " got "
-        << ShapeUtil::HumanString(*actual_shape);
+        << ShapeUtil::HumanString(actual_shape);
     return operand;
   }
 
@@ -102,7 +100,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest,
 #endif
 
 XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
-  ComputationBuilder builder(client_, "subtract_in_z_one_sample");
+  XlaBuilder builder("subtract_in_z_one_sample");
   auto x = builder.ConstantLiteral(input_literal_);
   auto y = builder.ConstantR1<float>({3.14, 4.25});
   builder.Sub(x, y, /*broadcast_dimensions=*/{1});
@@ -118,7 +116,7 @@ XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
-  ComputationBuilder builder(client_, "square_tesseract_elementwise");
+  XlaBuilder builder("square_tesseract_elementwise");
   auto x = builder.ConstantLiteral(input_literal_);
   builder.SquareF32(x);
 
@@ -135,9 +133,9 @@ XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SumToZ) {
-  ComputationBuilder builder(client_, "sum_to_z");
+  XlaBuilder builder("sum_to_z");
   auto input_activations = builder.ConstantLiteral(input_literal_);
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all but the Z dimension.
   builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                  {0, 2, 3});
@@ -147,24 +145,23 @@ XLA_TEST_P(BatchNormalizationTest, SumToZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) {
-  ComputationBuilder builder(client_, "square_and_reduce");
+  XlaBuilder builder("square_and_reduce");
   auto input_activations = builder.ConstantLiteral(input_literal_);
   auto set_means = builder.ConstantR1<float>({2.f, 4.2f});
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.Reduce(
-      dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
+  builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
 
   std::vector<float> expected = {18, 0.06};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
-  ComputationBuilder builder(client_, "variance_to_stddev");
+  XlaBuilder builder("variance_to_stddev");
   auto variance = builder.ConstantR1<float>({6.f, .02f});
-  auto sqrt = builder.SqrtF32(variance);
+  builder.SqrtF32(variance);
 
   std::vector<float> expected = {2.44948974f, 0.14142136f};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
@@ -173,13 +170,13 @@ XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
 // Compare against a forward batch normalization example in the NN spec
 // reference.
 XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
-  ComputationBuilder builder(client_, "batch_normalize_per_spec");
+  XlaBuilder builder("batch_normalize_per_spec");
   auto input_activations =
       CheckShape(&builder, builder.ConstantLiteral(input_literal_),
                  ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
   auto gamma = builder.ConstantR1<float>({1.0, 1.0});
   auto beta = builder.ConstantR1<float>({0.0, 0.0});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all dimensions except dimension 1.
   Shape TwoElementVectorF32 = ShapeUtil::MakeShape(F32, {2});
   auto sum = CheckShape(
@@ -189,8 +186,8 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
       TwoElementVectorF32);
   auto input_shape = builder.GetShape(input_activations).ConsumeValueOrDie();
   auto sum_shape = builder.GetShape(sum).ConsumeValueOrDie();
-  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(*input_shape) /
-                                         ShapeUtil::ElementsIn(*sum_shape));
+  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(input_shape) /
+                                         ShapeUtil::ElementsIn(sum_shape));
   auto set_means = builder.Div(sum, count);
 
   const float kEpsilon = 1e-9f;
@@ -233,7 +230,7 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
   const int kFeatureIndex = 3;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
@@ -242,8 +239,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
@@ -257,7 +254,7 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
@@ -266,8 +263,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
@@ -282,23 +279,23 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
   // Use 0 dimension as feature, tests layout analyzer.
   const int kFeatureIndex = 0;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/1, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/1, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
@@ -314,24 +311,24 @@ XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
 XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
   // Test the correctness of choosing a large epsilon value.
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
   // var = 125, mean = 15, epsilon = -100
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/-100, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/-100, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
@@ -346,7 +343,7 @@ XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
 
 XLA_TEST_P(BatchNormalizationTest, BatchNormGradBasic) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand =
       builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
@@ -453,7 +450,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormTest_Instantiation, BatchNormTestManySizes,
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -553,7 +550,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -661,7 +658,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -828,9 +825,9 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   std::unique_ptr<GlobalData> grad_output_data =
       client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie();
 
-  auto t = builder.BatchNormGrad(input_parameter, scale_parameter,
-                                 mean_parameter, var_parameter,
-                                 grad_output_parameter, epsilon, feature_index);
+  builder.BatchNormGrad(input_parameter, scale_parameter, mean_parameter,
+                        var_parameter, grad_output_parameter, epsilon,
+                        feature_index);
 
   auto expected =
       Literal::MakeTuple({expected_grad_activation.get(),
-- 
GitLab


From e7b1ab049d22119c7b649046be853ea88120f27a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 10 Apr 2018 19:34:54 -0700
Subject: [PATCH 0574/1262] [StreamExecutor] Merge StreamExecutor's and XLA's
 StatusOr classes.

StatusOr is a...complicated class to write.  It's really not good to
have two copies of it.  They've diverged (the XLA one is more
sophisticated), and this may be causing upstream build problems with
gcc6.

PiperOrigin-RevId: 192392111
---
 tensorflow/stream_executor/BUILD          |   2 +
 tensorflow/stream_executor/lib/statusor.h | 225 +---------------------
 2 files changed, 5 insertions(+), 222 deletions(-)

diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1913fc20ee..80fc9ff292 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -33,6 +33,7 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
         "@local_config_cuda//cuda:cuda_headers",
     ],
@@ -45,6 +46,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/compiler/xla:statusor",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
 )
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 138738ecab..3b97929b37 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -14,238 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 // IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
-//
-// StatusOr<T> is the union of a Status object and a T
-// object. StatusOr models the concept of an object that is either a
-// usable value, or an error Status explaining why such a value is
-// not present. To this end, StatusOr<T> does not allow its Status
-// value to be Status::OK. Further, StatusOr<T*> does not allow the
-// contained pointer to be NULL.
-//
-// The primary use-case for StatusOr<T> is as the return value of a
-// function which may fail.
-//
-// Example client usage for a StatusOr<T>, where T is not a pointer:
-//
-//  StatusOr<float> result = DoBigCalculationThatCouldFail();
-//  if (result.ok()) {
-//    float answer = result.ValueOrDie();
-//    printf("Big calculation yielded: %f", answer);
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<T*>:
-//
-//  StatusOr<Foo*> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<std::unique_ptr<T>>:
-//
-//  StatusOr<std::unique_ptr<Foo>> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo = std::move(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example factory implementation returning StatusOr<T*>:
-//
-//  StatusOr<Foo*> FooFactory::MakeNewFoo(int arg) {
-//    if (arg <= 0) {
-//      return Status(port::error::INVALID_ARGUMENT,
-//                            "Arg must be positive");
-//    } else {
-//      return new Foo(arg);
-//    }
-//  }
-//
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 
-#include <new>
-#include "tensorflow/stream_executor/platform/port.h"
-#include <type_traits>
-#include <utility>
-
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/compiler/xla/statusor.h"
 
 namespace perftools {
 namespace gputools {
 namespace port {
 
-template<typename T>
-class StatusOr {
-  template<typename U> friend class StatusOr;
-
- public:
-  // Construct a new StatusOr with Status::UNKNOWN status
-  StatusOr() : status_(error::UNKNOWN, "") {}
-
-  // Construct a new StatusOr with the given non-ok status. After calling
-  // this constructor, calls to ValueOrDie() is invalid.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return
-  // value, so it is convenient and sensible to be able to do 'return
-  // Status()' when the return type is StatusOr<T>.
-  //
-  // REQUIRES: status != Status::OK.
-  // In optimized builds, passing Status::OK here will have the effect
-  // of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const Status& status);  // NOLINT
-
-  // Construct a new StatusOr with the given value. If T is a plain pointer,
-  // value must not be NULL. After calling this constructor, calls to
-  // ValueOrDie() will succeed, and calls to status() will return OK.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return type
-  // so it is convenient and sensible to be able to do 'return T()'
-  // when the return type is StatusOr<T>.
-  //
-  // REQUIRES: if T is a plain pointer, value != NULL.
-  // In optimized builds, passing a NULL pointer here will have
-  // the effect of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const T& value);  // NOLINT
-
-  // Conversion copy constructor, T must be copy constructible from U
-  template <typename U>
-  StatusOr(const StatusOr<U>& other)  // NOLINT
-      : status_(other.status_),
-        value_(other.value_) {}
-
-  // Conversion assignment operator, T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(const StatusOr<U>& other) {
-    status_ = other.status_;
-    value_ = other.value_;
-    return *this;
-  }
-
-  // Rvalue-reference overloads of the other constructors and assignment
-  // operators, to support move-only types and avoid unnecessary copying.
-  StatusOr(T&& value);  // NOLINT
-
-  // Move conversion operator to avoid unnecessary copy.
-  // T must be assignable from U.
-  // Not marked with explicit so the implicit conversion can happen.
-  template <typename U>
-  StatusOr(StatusOr<U>&& other)  // NOLINT
-      : status_(std::move(other.status_)),
-        value_(std::move(other.value_)) {}
-
-  // Move assignment operator to avoid unnecessary copy.
-  // T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(StatusOr<U>&& other) {
-    status_ = std::move(other.status_);
-    value_ = std::move(other.value_);
-    return *this;
-  }
-
-  // Returns a reference to our status. If this contains a T, then
-  // returns Status::OK.
-  const Status& status() const { return status_; }
-
-  // Returns this->status().ok()
-  bool ok() const { return status_.ok(); }
-
-  // Returns a reference to our current value, requires that this->ok().
-  // If you need to initialize a T object from the stored value,
-  // ConsumeValueOrDie() may be more efficient.
-  const T& ValueOrDie() const;
-  T& ValueOrDie();
-
-  // Returns our current value, requires this->ok(). Use this if
-  // you would otherwise want to say std::move(s.ValueOrDie()), for example
-  // if you need to initialize a T object from the stored value and you don't
-  // need subsequent access to the stored value. It uses T's move constructor,
-  // if it has one, so it will work with move-only types, and will often be
-  // more efficient than ValueOrDie, but may leave the stored value
-  // in an arbitrary valid state.
-  T ConsumeValueOrDie();
-
- private:
-  Status status_;
-  T value_;
-
-  void CheckValueNotNull(const T& value);
-
-  template <typename U>
-  struct IsNull {
-    // For non-pointer U, a reference can never be NULL.
-    static inline bool IsValueNull(const U& t) { return false; }
-  };
-
-  template <typename U>
-  struct IsNull<U*> {
-    static inline bool IsValueNull(const U* t) { return t == NULL; }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Implementation details for StatusOr<T>
-
-template <typename T>
-StatusOr<T>::StatusOr(const T& value)
-    : status_(), value_(value) {
-  CheckValueNotNull(value);
-}
-
-template <typename T>
-const T& StatusOr<T>::ValueOrDie() const {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T& StatusOr<T>::ValueOrDie() {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T StatusOr<T>::ConsumeValueOrDie() {
-  TF_CHECK_OK(status_);
-  return std::move(value_);
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(const Status& status)
-    : status_(status) {
-  assert(!status.ok());
-  if (status.ok()) {
-    status_ =
-        Status(error::INTERNAL,
-               "Status::OK is not a valid constructor argument to StatusOr<T>");
-  }
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(T&& value)
-    : status_() {
-  CheckValueNotNull(value);
-  value_ = std::move(value);
-}
-
+// Use XLA's StatusOr so we don't duplicate code.
 template <typename T>
-void StatusOr<T>::CheckValueNotNull(const T& value) {
-  assert(!IsNull<T>::IsValueNull(value));
-  if (IsNull<T>::IsValueNull(value)) {
-    status_ =
-        Status(error::INTERNAL,
-               "NULL is not a valid constructor argument to StatusOr<T*>");
-  }
-}
+using StatusOr = ::xla::StatusOr<T>;
 
 }  // namespace port
 }  // namespace gputools
-- 
GitLab


From f3180f3827ef1340f51408385f139143da55f07f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 19:44:00 -0700
Subject: [PATCH 0575/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192392702
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 399 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |   7 +
 2 files changed, 406 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index fe4b7a7be0..12df60a2ae 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -7610,6 +7610,111 @@ op {
     }
   }
 }
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "AvgPool3DGrad"
   input_arg {
@@ -7646,6 +7751,19 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -7711,6 +7829,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7771,6 +7890,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -17318,6 +17438,76 @@ op {
     }
   }
 }
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "DepthwiseConv2dNativeBackpropInput"
   input_arg {
@@ -17486,6 +17676,76 @@ op {
     }
   }
 }
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "Dequantize"
   input_arg {
@@ -28687,6 +28947,63 @@ op {
     }
   }
 }
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "MaxPool3DGrad"
   input_arg {
@@ -28958,6 +29275,88 @@ op {
     }
   }
 }
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "MaxPool3DGradGrad"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 9950388357..6af77be148 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -2449,6 +2449,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -2510,6 +2511,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7892,6 +7894,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7961,6 +7964,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -14232,6 +14236,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -14299,6 +14304,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -14312,6 +14318,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
-- 
GitLab


From 0c0428e41289392be095bb07f5daa1a0c4557c8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 20:48:57 -0700
Subject: [PATCH 0576/1262] [XLA] Redesign: implment and test CrossReplicaSum.

PiperOrigin-RevId: 192397189
---
 .../compiler/xla/client/xla_client/xla_builder.cc    | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 40bafdb5c1..3b96bc72be 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1559,7 +1559,17 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
 }
 
 XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
+
+    return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum,
+                          {operand});
+  });
 }
 
 XlaOp XlaBuilder::SelectAndScatter(
-- 
GitLab


From f22655d09820f83881b8a2170eb51407956864d6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 21:42:14 -0700
Subject: [PATCH 0577/1262] [XLA] Redesgin: implement and test Gather,
 Conditional.

Also support convert from/to proto for Gather.

PiperOrigin-RevId: 192400659
---
 .../xla/client/xla_client/xla_builder.cc      |  47 ++++-
 .../compiler/xla/service/hlo_instruction.cc   |  15 ++
 tensorflow/compiler/xla/tests/BUILD           |   3 +-
 .../compiler/xla/tests/conditional_test.cc    | 192 +++++++++---------
 .../xla/tests/gather_operation_test.cc        |   8 +-
 5 files changed, 160 insertions(+), 105 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 3b96bc72be..c3c824a231 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1390,14 +1390,57 @@ XlaOp XlaBuilder::While(const XlaComputation& condition,
 XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
                          const GatherDimensionNumbers& dimension_numbers,
                          tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
+    TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape,
+                        GetShape(gather_indices));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferGatherShape(input_shape, gather_indices_shape,
+                                         dimension_numbers, window_bounds));
+
+    *instr.mutable_gather_dimension_numbers() = dimension_numbers;
+    for (int64 bound : window_bounds) {
+      instr.add_gather_window_bounds(bound);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kGather,
+                          {input, gather_indices});
+  });
 }
 
 XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                               const XlaComputation& true_computation,
                               const XlaOp& false_operand,
                               const XlaComputation& false_computation) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate));
+    TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape,
+                        GetShape(true_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape,
+                        true_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape,
+                        GetShape(false_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
+                        false_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConditionalShape(
+            predicate_shape, true_operand_shape, false_operand_shape,
+            true_computation_shape, false_computation_shape));
+
+    // The index of true_computation must be 0 and that of false computation
+    // must be 1.
+    AddCalledComputation(true_computation, &instr);
+    AddCalledComputation(false_computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConditional,
+                          {predicate, true_operand, false_operand});
+  });
 }
 
 XlaOp XlaBuilder::Reduce(
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 8149e47cb5..3629106a25 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -159,6 +159,14 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->fft_length_.push_back(fft_len);
   }
 
+  if (proto.has_gather_dimension_numbers()) {
+    instruction->gather_dimension_numbers_ =
+        MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
+  }
+  for (int64 bound : proto.gather_window_bounds()) {
+    instruction->gather_window_bounds_.push_back(bound);
+  }
+
   return std::move(instruction);
 }
 
@@ -2416,6 +2424,13 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
+  if (gather_dimension_numbers_ != nullptr) {
+    *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_;
+  }
+  for (int64 bound : gather_window_bounds_) {
+    proto.add_gather_window_bounds(bound);
+  }
+
   return proto;
 }
 
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 74ea1a0f39..1f90a44d8b 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -492,9 +492,10 @@ xla_test(
     tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index b917dee77b..7ff6706935 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -23,8 +24,8 @@ namespace {
 
 class ConditionalOpTest : public ClientLibraryTestBase {
  protected:
-  Computation CreateR0ConstantComputation(float value) {
-    ComputationBuilder builder(client_, "Constant");
+  XlaComputation CreateR0ConstantComputation(float value) {
+    XlaBuilder builder("Constant");
     builder.Parameter(0, empty_tuple_, "tuple");
     builder.ConstantR0<float>(value);
     auto build_status = builder.Build();
@@ -32,16 +33,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0IdentityComputation() {
-    ComputationBuilder builder(client_, "Identity");
+  XlaComputation CreateR0IdentityComputation() {
+    XlaBuilder builder("Identity");
     builder.Parameter(0, r0f32_, "x");
     auto build_status = builder.Build();
     EXPECT_IS_OK(build_status.status());
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateCeilComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Ceil");
+  XlaComputation CreateCeilComputation(const Shape& shape) {
+    XlaBuilder builder("Ceil");
     auto param = builder.Parameter(0, shape, "param");
     builder.Ceil(param);
     auto build_status = builder.Build();
@@ -49,16 +50,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0CeilComputation() {
+  XlaComputation CreateR0CeilComputation() {
     return CreateCeilComputation(r0f32_);
   }
 
-  Computation CreateR1CeilComputation() {
+  XlaComputation CreateR1CeilComputation() {
     return CreateCeilComputation(r1s2f32_);
   }
 
-  Computation CreateFloorComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Floor");
+  XlaComputation CreateFloorComputation(const Shape& shape) {
+    XlaBuilder builder("Floor");
     auto param = builder.Parameter(0, shape, "param");
     builder.Floor(param);
     auto build_status = builder.Build();
@@ -66,17 +67,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0FloorComputation() {
+  XlaComputation CreateR0FloorComputation() {
     return CreateFloorComputation(r0f32_);
   }
 
-  Computation CreateR1FloorComputation() {
+  XlaComputation CreateR1FloorComputation() {
     return CreateFloorComputation(r1s2f32_);
   }
 
-  Computation CreateTupleCeilComputation(const string& computation_name,
-                                         const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleCeilComputation(const string& computation_name,
+                                            const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -88,17 +89,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleCeilComputation() {
+  XlaComputation CreateR0TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleCeilComputation() {
+  XlaComputation CreateR1TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleFloorComputation(const string& computation_name,
-                                          const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleFloorComputation(const string& computation_name,
+                                             const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -110,17 +111,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleFloorComputation() {
+  XlaComputation CreateR0TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleFloorComputation() {
+  XlaComputation CreateR1TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleAddComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleAddComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -130,17 +131,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleAddComputation() {
+  XlaComputation CreateR0TupleAddComputation() {
     return CreateTupleAddComputation("AddR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleAddComputation() {
+  XlaComputation CreateR1TupleAddComputation() {
     return CreateTupleAddComputation("AddR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleSubComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleSubComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -150,11 +151,11 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleSubComputation() {
+  XlaComputation CreateR0TupleSubComputation() {
     return CreateTupleSubComputation("SubR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleSubComputation() {
+  XlaComputation CreateR1TupleSubComputation() {
     return CreateTupleSubComputation("SubR1", tuple_2_r1s2f32_);
   }
 
@@ -170,26 +171,25 @@ class ConditionalOpTest : public ClientLibraryTestBase {
 
 // Test true and false computations that do not take any parameters.
 XLA_TEST_F(ConditionalOpTest, Parameters0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   auto true_computation = CreateR0ConstantComputation(56.0f);
   auto false_computation = CreateR0ConstantComputation(12.0f);
-  auto result = builder.Conditional(pred, operands, true_computation, operands,
-                                    false_computation);
+  builder.Conditional(pred, operands, true_computation, operands,
+                      false_computation);
 
   ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
 }
 
 // Test true and false computations that take in 1 parameter.
 XLA_TEST_F(ConditionalOpTest, Parameters1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto identity = CreateR0IdentityComputation();
-  auto result =
-      builder.Conditional(pred, operand1, identity, operand2, identity);
+  builder.Conditional(pred, operand1, identity, operand2, identity);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -197,12 +197,12 @@ XLA_TEST_F(ConditionalOpTest, Parameters1) {
 // Test conditional with two different computations in the true and false cases
 // that take in different arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -210,11 +210,11 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
 // Test conditional with two different computations in the true and false cases
 // that take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand, CreateR0CeilComputation(),
-                                    operand, CreateR0FloorComputation());
+  builder.Conditional(pred, operand, CreateR0CeilComputation(), operand,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -222,12 +222,12 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
 // Test conditional with the same computation in the true and false cases but
 // take in different arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand1, floor, operand2, floor);
+  builder.Conditional(pred, operand1, floor, operand2, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -235,11 +235,11 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
 // Test conditional with the same computation in the true and false cases that
 // take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand, floor, operand, floor);
+  builder.Conditional(pred, operand, floor, operand, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -247,12 +247,12 @@ XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
 // Test conditional with different instances of the same computation in the true
 // and false cases.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0FloorComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0FloorComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -260,7 +260,7 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
 // Test the case when a call invokes a computation that contains a conditional.
 XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   auto pred_cond = inner_builder.Parameter(0, r0bool, "param0");
   auto true_operand = inner_builder.Parameter(1, r0f32_, "param1");
   auto false_operand = inner_builder.Parameter(2, r0f32_, "param2");
@@ -268,7 +268,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
                             false_operand, CreateR0FloorComputation());
   auto inner_builder_result = inner_builder.Build();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
@@ -281,14 +281,13 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
 // Test true and false computations that take in 2 parameters and predicate is
 // true.
 XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
 }
@@ -296,14 +295,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
 // Test true and false computations that take in 2 parameters and predicate is
 // false.
 XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
 }
@@ -311,14 +309,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is true.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
 }
@@ -326,21 +323,20 @@ XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is false.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
 }
 
 // Test true and false computations that return a tuple of scalars.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple(
       {builder.ConstantR0<float>(12.2f), builder.ConstantR0<float>(25.6f)});
@@ -356,7 +352,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
 
 // Test true and false computations that return a tuple of arrays.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
                                  builder.ConstantR1<float>({25.6f, 29.2f})});
@@ -373,7 +369,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
 // Test true and false computations that return a tuple of a predicate, a
 // scalar, and an array.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_pred = true_builder.ConstantR0<bool>(true);
@@ -384,7 +380,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_pred = false_builder.ConstantR0<bool>(false);
@@ -395,7 +391,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -411,7 +407,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
 
 // Test true and false computations that return a nested tuple.
 XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_constant1 = true_builder.ConstantR0<float>(12.2f);
@@ -424,7 +420,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_constant1 = false_builder.ConstantR0<float>(46.6f);
@@ -438,7 +434,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -460,16 +456,16 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
 // params.
 XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(true, 0, "pred", &builder, &pred);
   auto operand1_param =
       CreateR0Parameter<float>(56.3f, 1, "operand1", &builder, &operand1);
   auto operand2_param =
       CreateR0Parameter<float>(12.7f, 2, "operand2", &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(
       &builder, 57.0f,
@@ -480,16 +476,16 @@ XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
 // Test conditional that takes in array operands in the form of external params.
 XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
   auto operand1_param = CreateR1Parameter<float>({24.3f, 56.7f}, 1, "operand1",
                                                  &builder, &operand1);
   auto operand2_param = CreateR1Parameter<float>({10.2f, 11.6f}, 2, "operand2",
                                                  &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR1CeilComputation(),
-                                    operand2, CreateR1FloorComputation());
+  builder.Conditional(pred, operand1, CreateR1CeilComputation(), operand2,
+                      CreateR1FloorComputation());
 
   ComputeAndCompareR1<float>(
       &builder, {10.0f, 11.0f},
@@ -499,7 +495,7 @@ XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
 
 // Test the case where one conditional is nested within another.
 XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -514,7 +510,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred1 = builder.ConstantR0<bool>(true);
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
@@ -529,7 +525,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
 }
 
 XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -544,7 +540,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
   auto operand2 = builder.ConstantR0<float>(12.2f);
@@ -556,7 +552,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
 
 // Test a mismatch in the shape of the true operand and true computation.
 XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
@@ -573,27 +569,27 @@ XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
 
 XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
   Shape tuple_shape = ShapeUtil::MakeTupleShape({r0f32_, r0f32_});
-  Computation swapper;
+  XlaComputation swapper;
   {
-    ComputationBuilder builder(client_, TestName() + ".swapper");
+    XlaBuilder builder(TestName() + ".swapper");
     auto param0 = builder.Parameter(0, tuple_shape, "sp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
     builder.Tuple({y, x});
     swapper = builder.Build().ConsumeValueOrDie();
   }
-  Computation forwarder;
+  XlaComputation forwarder;
   {
-    ComputationBuilder builder(client_, TestName() + ".forwarder");
+    XlaBuilder builder(TestName() + ".forwarder");
     auto param0 = builder.Parameter(0, tuple_shape, "fp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
     builder.Tuple({x, y});
     forwarder = builder.Build().ConsumeValueOrDie();
   }
-  Computation main;
+  XlaComputation main;
   {
-    ComputationBuilder builder(client_, TestName() + ".main");
+    XlaBuilder builder(TestName() + ".main");
     auto param0 = builder.Parameter(0, tuple_shape, "mp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
@@ -605,7 +601,7 @@ XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
   }
 
   auto test_swap = [&](float a, float b) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto x = builder.ConstantR0<float>(a);
     auto y = builder.ConstantR0<float>(b);
     auto tuple_operand = builder.Tuple({x, y});
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 9db68ff7a6..90496d55e6 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -405,7 +405,7 @@ class GatherClientLibraryTest : public ClientLibraryTestBase {};
 // GPU and CPU_PARALLEL.
 XLA_TEST_F(GatherClientLibraryTest,
            DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) {
-  // We create this HLO, but using the ComputationBuilder API.
+  // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
   //   operand = s32[3,3] parameter(0)
@@ -418,7 +418,7 @@ XLA_TEST_F(GatherClientLibraryTest,
   //       window_bounds={1, 3}
   // }
 
-  ComputationBuilder builder(client_, "gather_basic");
+  XlaBuilder builder("gather_basic");
 
   Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3});
   Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
@@ -443,8 +443,8 @@ XLA_TEST_F(GatherClientLibraryTest,
                           client_->GetDeviceHandles(1));
   xla::ExecutionOptions execution_options = CreateDefaultExecutionOptions();
   *execution_options.add_device_handles() = devices[0];
-  TF_ASSERT_OK_AND_ASSIGN(Computation computation, builder.Build());
-  std::vector<xla::Client::ComputationInstance> computation_instances = {
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());
+  std::vector<xla::Client::XlaComputationInstance> computation_instances = {
       {computation,
        {operand_arg.get(), indices_arg.get()},
        execution_options,
-- 
GitLab


From 785c484288913ed7989881483aefa3bee0cec015 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 22:29:13 -0700
Subject: [PATCH 0578/1262] [XLA] Redesign: implement ComputeHost.

Also support convert from/to proto for ComputeHost.

PiperOrigin-RevId: 192403660
---
 tensorflow/compiler/xla/client/xla_client/xla_builder.cc | 8 +++++++-
 tensorflow/compiler/xla/service/hlo.proto                | 4 ++++
 tensorflow/compiler/xla/service/hlo_instruction.cc       | 5 +++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index c3c824a231..7ccdc2ded2 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1074,7 +1074,13 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name,
 XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
                               const string& channel_name,
                               int64 cost_estimate_ns, const Shape& shape) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = shape;
+    instr.set_channel_name(channel_name);
+    instr.set_cost_estimate_ns(cost_estimate_ns);
+    return AddInstruction(std::move(instr), HloOpcode::kHostCompute, operands);
+  });
 }
 
 XlaOp XlaBuilder::Complex(
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 0b446c6547..8fd7f8945c 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -135,6 +135,10 @@ message HloInstructionProto {
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
   repeated int64 gather_window_bounds = 34;
 
+  // Compute Host.
+  string channel_name = 41;
+  int64 cost_estimate_ns = 42;
+
   // The id of this instruction.
   int64 id = 35;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 3629106a25..a986bbd511 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -167,6 +167,9 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->gather_window_bounds_.push_back(bound);
   }
 
+  instruction->channel_name_ = proto.channel_name();
+  instruction->cost_estimate_ns_ = proto.cost_estimate_ns();
+
   return std::move(instruction);
 }
 
@@ -2430,6 +2433,8 @@ HloInstructionProto HloInstruction::ToProto() const {
   for (int64 bound : gather_window_bounds_) {
     proto.add_gather_window_bounds(bound);
   }
+  proto.set_channel_name(channel_name_);
+  proto.set_cost_estimate_ns(cost_estimate_ns_);
 
   return proto;
 }
-- 
GitLab


From 231146433a45ca8135e132ee0b48469798ca0b1f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 10 Apr 2018 22:44:36 -0700
Subject: [PATCH 0579/1262] [XLA] Fix the size of data buffer for sparse
 literals.

PiperOrigin-RevId: 192404543
---
 tensorflow/compiler/xla/literal_util.cc | 13 ++++++++++---
 tensorflow/compiler/xla/literal_util.h  |  5 +++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index c2950c1faa..c315b4ff30 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -97,11 +97,18 @@ Literal::Literal(const Shape& shape, bool allocate_arrays)
     const Shape& subshape = piece.subshape();
     if (ShapeUtil::IsArray(subshape)) {
       if (allocate_arrays) {
-        piece.set_buffer(new char[piece.size_bytes()]);
         if (LayoutUtil::IsSparseArray(subshape)) {
+          // For sparse arrays, the buffer must be of the size of the maximum
+          // number of sparse elements possible.
+          const int64 max_sparse_elements =
+              LayoutUtil::MaxSparseElements(subshape.layout());
+          piece.set_buffer(
+              new char[max_sparse_elements * ShapeUtil::ByteSizeOfPrimitiveType(
+                                                 subshape.element_type())]);
           piece.set_sparse_indices(new SparseIndexArray(
-              LayoutUtil::MaxSparseElements(subshape.layout()),
-              ShapeUtil::Rank(subshape)));
+              max_sparse_elements, ShapeUtil::Rank(subshape)));
+        } else {
+          piece.set_buffer(new char[piece.size_bytes()]);
         }
       } else {
         piece.set_buffer(nullptr);
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index a6a3dffeb7..8aa19222dc 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -1287,12 +1287,13 @@ void Literal::PopulateSparse(SparseIndexArray indices,
   CHECK_LE(num_elements, max_elements);
   CHECK_EQ(num_elements, indices.index_count());
   auto root_data = root_piece().data<NativeT>();
-  root_data.remove_suffix(max_elements - values.size());
+  // Piece::data() returns an ArraySlice of size equal to the number of indices
+  // in the SparseIndexArray. So there is no need to adjust the size of the data
+  // here. It is enough to just copy the incoming values into the data buffer.
   std::copy(values.begin(), values.end(), root_data.begin());
   *this->root_piece().sparse_indices() = std::move(indices);
   if (sort) {
     auto root_data = this->root_piece().data<NativeT>();
-    root_data.remove_suffix(root_data.size() - num_elements);
     this->root_piece().sparse_indices()->SortWithValues(root_data);
   }
   DCHECK(this->root_piece().sparse_indices()->Validate(shape()));
-- 
GitLab


From 6accb84d8437cb915e23d83673c233f5084aad68 Mon Sep 17 00:00:00 2001
From: Nick Felt <nickfelt@google.com>
Date: Tue, 10 Apr 2018 23:44:12 -0700
Subject: [PATCH 0580/1262] Create FileWriter <-> tf.contrib.summary
 compatibility layer

This provides an implementation of FileWriter, activated by passing in a `session` parameter to the constructor, that is backed by session.run'ing graph ops that manipulate a tf.contrib.summary.create_file_writer() instance. Because tf.contrib.summary.SummaryWriters are backed by shared resources in the graph, this makes it possible to have a FileWriter and a tf.contrib.summary.SummaryWriter that both write to the same events file.

This change includes some related smaller changes:
- Factors out training_utils.py into a separate target to avoid a cyclic dep
- Moves contrib/summary/summary_ops.py to python/ops/summary_ops_v2.py
- Adds SummaryWriter.init(), .flush(), and .close() op-returning methods
- Changes create_file_writer() `name` arg to default to logdir prefixed by `logdir:` so shared resources are scoped by logdir by default
- Fixes a bug with tf.contrib.summary.flush() `writer` arg
- Makes create_file_writer()'s max_queue arg behave as documented
- Adds more testing for existing tf.contrib.summary API

PiperOrigin-RevId: 192408079
---
 tensorflow/contrib/eager/python/BUILD         |   6 +-
 tensorflow/contrib/eager/python/evaluator.py  |   2 +-
 .../contrib/eager/python/metrics_impl.py      |   2 +-
 .../contrib/eager/python/metrics_test.py      |   2 +-
 tensorflow/contrib/summary/BUILD              |  33 +--
 tensorflow/contrib/summary/summary.py         |  40 +--
 .../contrib/summary/summary_ops_graph_test.py | 197 ++++++++++++++-
 .../contrib/summary/summary_ops_test.py       | 113 ++++++++-
 .../contrib/summary/summary_test_internal.py  |  60 -----
 .../contrib/summary/summary_test_util.py      |   2 +-
 .../tensorboard/db/summary_file_writer.cc     |   2 +-
 tensorflow/contrib/tpu/BUILD                  |   2 +-
 .../contrib/tpu/python/tpu/tpu_estimator.py   |   2 +-
 tensorflow/python/BUILD                       |  54 +++-
 .../ops/summary_ops_v2.py}                    |  68 +++--
 .../summary/writer/event_file_writer_v2.py    | 140 +++++++++++
 tensorflow/python/summary/writer/writer.py    |  40 ++-
 .../python/summary/writer/writer_test.py      | 233 ++++++++++++++----
 .../tensorflow.summary.-file-writer.pbtxt     |   2 +-
 19 files changed, 797 insertions(+), 203 deletions(-)
 delete mode 100644 tensorflow/contrib/summary/summary_test_internal.py
 rename tensorflow/{contrib/summary/summary_ops.py => python/ops/summary_ops_v2.py} (90%)
 create mode 100644 tensorflow/python/summary/writer/event_file_writer_v2.py

diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 4e088503bf..d97048405d 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -120,13 +120,13 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/eager/python:checkpointable_utils",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
@@ -140,11 +140,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/contrib/summary:summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -161,10 +161,10 @@ py_library(
     deps = [
         ":datasets",
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 37c8f0d47a..7949a3f6da 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,12 +22,12 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 
 
 class Evaluator(object):
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 2f2347736a..907f9204c2 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable
 
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 15ac889191..28f5f286eb 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
@@ -31,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.training import training_util
 
 
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index fda1367b15..f88b03ec4c 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -15,7 +15,6 @@ py_test(
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -23,6 +22,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -35,7 +35,6 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -44,31 +43,9 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "summary_ops",
-    srcs = ["summary_ops.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:summary_ops_gen",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python:variables",
         "@six_archive//:six",
     ],
 )
@@ -79,7 +56,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":summary_ops",
+        "//tensorflow/python:summary_ops_v2",
     ],
 )
 
@@ -92,8 +69,10 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:summary_ops_v2",
         "@org_sqlite//:python",
     ],
 )
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 2d6d7ea6a3..99ced53e11 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -61,23 +61,23 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
-from tensorflow.contrib.summary.summary_ops import all_summary_ops
-from tensorflow.contrib.summary.summary_ops import always_record_summaries
-from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_db_writer
-from tensorflow.contrib.summary.summary_ops import create_file_writer
-from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
-from tensorflow.contrib.summary.summary_ops import eval_dir
-from tensorflow.contrib.summary.summary_ops import flush
-from tensorflow.contrib.summary.summary_ops import generic
-from tensorflow.contrib.summary.summary_ops import graph
-from tensorflow.contrib.summary.summary_ops import histogram
-from tensorflow.contrib.summary.summary_ops import image
-from tensorflow.contrib.summary.summary_ops import import_event
-from tensorflow.contrib.summary.summary_ops import initialize
-from tensorflow.contrib.summary.summary_ops import never_record_summaries
-from tensorflow.contrib.summary.summary_ops import record_summaries_every_n_global_steps
-from tensorflow.contrib.summary.summary_ops import scalar
-from tensorflow.contrib.summary.summary_ops import should_record_summaries
-from tensorflow.contrib.summary.summary_ops import summary_writer_initializer_op
-from tensorflow.contrib.summary.summary_ops import SummaryWriter
+from tensorflow.python.ops.summary_ops_v2 import all_summary_ops
+from tensorflow.python.ops.summary_ops_v2 import always_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import audio
+from tensorflow.python.ops.summary_ops_v2 import create_db_writer
+from tensorflow.python.ops.summary_ops_v2 import create_file_writer
+from tensorflow.python.ops.summary_ops_v2 import create_summary_file_writer
+from tensorflow.python.ops.summary_ops_v2 import eval_dir
+from tensorflow.python.ops.summary_ops_v2 import flush
+from tensorflow.python.ops.summary_ops_v2 import generic
+from tensorflow.python.ops.summary_ops_v2 import graph
+from tensorflow.python.ops.summary_ops_v2 import histogram
+from tensorflow.python.ops.summary_ops_v2 import image
+from tensorflow.python.ops.summary_ops_v2 import import_event
+from tensorflow.python.ops.summary_ops_v2 import initialize
+from tensorflow.python.ops.summary_ops_v2 import never_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import record_summaries_every_n_global_steps
+from tensorflow.python.ops.summary_ops_v2 import scalar
+from tensorflow.python.ops.summary_ops_v2 import should_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import summary_writer_initializer_op
+from tensorflow.python.ops.summary_ops_v2 import SummaryWriter
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 3aba04540e..ae8336daaf 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -16,27 +16,220 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
 
 get_all = summary_test_util.get_all
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class GraphFileTest(test_util.TensorFlowTestCase):
+
+  def testSummaryOps(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.generic('tensor', 1, step=1)
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.histogram('histogram', [1.0], step=1)
+      summary_ops.image('image', [[[[1.0]]]], step=1)
+      summary_ops.audio('audio', [[1.0]], 1.0, 1, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    # The working condition of the ops is tested in the C++ test so we just
+    # test here that we're calling them correctly.
+    self.assertTrue(gfile.Exists(logdir))
+
+  def testSummaryName(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scalar', events[1].summary.value[0].tag)
+
+  def testSummaryNameScope(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      with ops.name_scope('scope'):
+        summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scope/scalar', events[1].summary.value[0].tag)
+
+  def testSummaryGlobalStep(self):
+    training_util.get_or_create_global_step()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(summary_ops.summary_writer_initializer_op())
+      step, _ = sess.run(
+          [training_util.get_global_step(), summary_ops.all_summary_ops()])
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(step, events[1].step)
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=1, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=999999, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+      flush_op = summary_ops.flush()
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(flush_op)
+      self.assertEqual(2, get_total())
+      # Test "writer" parameter
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer))
+      self.assertEqual(3, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer._resource))  # pylint:disable=protected-access
+      self.assertEqual(4, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+      # Create with different shared name (should be separate resource/file)
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+
+    with self.test_session() as sess:
+      # Run init ops across writers sequentially to avoid race condition.
+      # TODO(nickfelt): fix race condition in resource manager lookup or create
+      sess.run(writer1.init())
+      sess.run(writer2.init())
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer3.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run([writer1.flush(), writer2.flush(), writer3.flush()])
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['one', 'two'], tags)
+
+    # Second file has tag "three"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['three'], tags)
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      # Running init() again while writer is open has no effect
+      sess.run(writer.init())
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Running close() should do an implicit flush
+      sess.run(writer.close())
+      self.assertEqual(2, get_total())
+      # Running init() on a closed writer should start a new file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(writer.close())
+      files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+      self.assertEqual(2, len(files))
+      self.assertEqual(2, len(summary_test_util.events_from_file(files[1])))
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(writer.flush())
+      self.assertEqual(2, get_total())
+
+
+class GraphDbTest(summary_test_util.SummaryDbTest):
 
   def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
     with self.assertRaises(TypeError):
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index c756f8b270..f1ef218e74 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -16,12 +16,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import numpy as np
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -33,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
@@ -57,7 +59,7 @@ _NUMPY_NUMERIC_TYPES = {
 }
 
 
-class TargetTest(test_util.TensorFlowTestCase):
+class EagerFileTest(test_util.TensorFlowTestCase):
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -138,21 +140,22 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
-        logs, max_queue=2, flush_millis=999999,
+        logs, max_queue=1, flush_millis=999999,
         name='lol').as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
       summary_ops.scalar('scalar', 2.0, step=1)
       self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
       summary_ops.scalar('scalar', 2.0, step=2)
       self.assertEqual(3, get_total())
 
-  def testFlush(self):
+  def testFlushFunction(self):
     logs = tempfile.mkdtemp()
-    with summary_ops.create_file_writer(
-        logs, max_queue=999999, flush_millis=999999,
-        name='lol').as_default(), summary_ops.always_record_summaries():
+    writer = summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999, name='lol')
+    with writer.as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
@@ -161,9 +164,103 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(1, get_total())
       summary_ops.flush()
       self.assertEqual(3, get_total())
+      # Test "writer" parameter
+      summary_ops.scalar('scalar', 2.0, step=3)
+      summary_ops.flush(writer=writer)
+      self.assertEqual(4, get_total())
+      summary_ops.scalar('scalar', 2.0, step=4)
+      summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+      self.assertEqual(5, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        summary_ops.flush()
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+        summary_ops.flush()
+      # Create with different shared name (should be separate resource/file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+        summary_ops.flush()
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('one', next(events).summary.value[0].tag)
+    self.assertEqual('two', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file has tag "three"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('three', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, get_total())
+      try:
+        # Not using .as_default() to avoid implicit flush when exiting
+        writer.set_as_default()
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, get_total())
+        # Calling init() on a closed writer should start a new file
+        time.sleep(1.1)  # Ensure filename has a different timestamp
+        writer.init()
+        files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+        self.assertEqual(2, len(files))
+        get_total = lambda: len(summary_test_util.events_from_file(files[1]))
+        self.assertEqual(1, get_total())  # file_version Event
+        summary_ops.scalar('two', 2.0, step=2)
+        writer.close()
+        self.assertEqual(2, get_total())
+      finally:
+        # Clean up by resetting default writer
+        summary_ops.create_file_writer(None).set_as_default()
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.scalar('two', 2.0, step=2)
+      # Exiting the "as_default()" should do an implicit flush of the "two" tag
+      self.assertEqual(3, get_total())
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
deleted file mode 100644
index d0d3384735..0000000000
--- a/tensorflow/contrib/summary/summary_test_internal.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Internal helpers for tests in this directory."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-
-import sqlite3
-
-from tensorflow.contrib.summary import summary_ops
-from tensorflow.python.framework import test_util
-
-
-class SummaryDbTest(test_util.TensorFlowTestCase):
-  """Helper for summary database testing."""
-
-  def setUp(self):
-    super(SummaryDbTest, self).setUp()
-    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
-    if os.path.exists(self.db_path):
-      os.unlink(self.db_path)
-    self.db = sqlite3.connect(self.db_path)
-    self.create_db_writer = functools.partial(
-        summary_ops.create_db_writer,
-        db_uri=self.db_path,
-        experiment_name='experiment',
-        run_name='run',
-        user_name='user')
-
-  def tearDown(self):
-    self.db.close()
-    super(SummaryDbTest, self).tearDown()
-
-
-def get_one(db, q, *p):
-  return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
-  return unroll(db.execute(q, p).fetchall())
-
-
-def unroll(list_of_tuples):
-  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 8506c4be9c..b4ae43302c 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -24,10 +24,10 @@ import os
 
 import sqlite3
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.core.util import event_pb2
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 85b3e7231b..3f24f58f03 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -132,7 +132,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
   Status WriteEvent(std::unique_ptr<Event> event) override {
     mutex_lock ml(mu_);
     queue_.emplace_back(std::move(event));
-    if (queue_.size() >= max_queue_ ||
+    if (queue_.size() > max_queue_ ||
         env_->NowMicros() - last_flush_ > 1000 * flush_millis_) {
       return InternalFlush();
     }
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 2f4a76720d..3e489d38b6 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -46,7 +46,6 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -57,6 +56,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 1332108d04..7fab19afee 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -30,7 +30,6 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -57,6 +56,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 7b548d2c70..9707b370c0 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2549,6 +2549,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "summary_ops_v2",
+    srcs = ["ops/summary_ops_v2.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":resource_variable_ops",
+        ":smart_cond",
+        ":summary_op_util",
+        ":summary_ops_gen",
+        ":training_util",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "template",
     srcs = ["ops/template.py"],
@@ -2911,7 +2935,10 @@ py_library(
     name = "training",
     srcs = glob(
         ["training/**/*.py"],
-        exclude = ["**/*test*"],
+        exclude = [
+            "**/*test*",
+            "training/training_util.py",  # See :training_util
+        ],
     ),
     srcs_version = "PY2AND3",
     deps = [
@@ -2945,6 +2972,7 @@ py_library(
         ":string_ops",
         ":summary",
         ":training_ops_gen",
+        ":training_util",
         ":util",
         ":variable_scope",
         ":variables",
@@ -4194,6 +4222,25 @@ py_test(
     ],
 )
 
+py_library(
+    name = "training_util",
+    srcs = ["training/training_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework",
+        ":framework_ops",
+        ":init_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        ":variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_test(
     name = "training_util_test",
     size = "small",
@@ -4204,6 +4251,7 @@ py_test(
         ":framework",
         ":platform",
         ":training",
+        ":training_util",
         ":variables",
     ],
 )
@@ -4248,6 +4296,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":client",
         ":constant_op",
         ":errors",
         ":framework",
@@ -4260,6 +4309,7 @@ py_library(
         ":summary_op_util",
         ":summary_ops",
         ":summary_ops_gen",
+        ":summary_ops_v2",
         ":util",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -4286,7 +4336,7 @@ py_tests(
         ":platform",
         ":platform_test",
         ":summary",
-        ":training",
+        ":summary_ops_v2",
         "//tensorflow/core:protos_all_py",
     ],
 )
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/python/ops/summary_ops_v2.py
similarity index 90%
rename from tensorflow/contrib/summary/summary_ops.py
rename to tensorflow/python/ops/summary_ops_v2.py
index bc763fe655..12f361c513 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -31,7 +31,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.layers import utils
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_summary_ops
@@ -108,8 +108,10 @@ class SummaryWriter(object):
   - @{tf.contrib.summary.create_db_writer}
   """
 
-  def  __init__(self, resource):
+  def  __init__(self, resource, init_op_fn):
     self._resource = resource
+    # TODO(nickfelt): cache constructed ops in graph mode
+    self._init_op_fn = init_op_fn
     if context.executing_eagerly() and self._resource is not None:
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="cpu:0")
@@ -129,10 +131,32 @@ class SummaryWriter(object):
       yield self
       # Flushes the summary writer in eager mode or in graph functions, but not
       # in legacy graph mode (you're on your own there).
-      with ops.device("cpu:0"):
-        gen_summary_ops.flush_summary_writer(self._resource)
+      self.flush()
       context.context().summary_writer_resource = old
 
+  def init(self):
+    """Operation to initialize the summary writer resource."""
+    if self._resource is not None:
+      return self._init_op_fn()
+
+  def _flush(self):
+    return _flush_fn(writer=self)
+
+  def flush(self):
+    """Operation to force the summary writer to flush any buffered data."""
+    if self._resource is not None:
+      return self._flush()
+
+  def _close(self):
+    with ops.control_dependencies([self.flush()]):
+      with ops.device("cpu:0"):
+        return gen_summary_ops.close_summary_writer(self._resource)
+
+  def close(self):
+    """Operation to flush and close the summary writer resource."""
+    if self._resource is not None:
+      return self._close()
+
 
 def initialize(
     graph=None,  # pylint: disable=redefined-outer-name
@@ -178,7 +202,7 @@ def create_file_writer(logdir,
                        flush_millis=None,
                        filename_suffix=None,
                        name=None):
-  """Creates a summary file writer in the current context.
+  """Creates a summary file writer in the current context under the given name.
 
   Args:
     logdir: a string, or None. If a string, creates a summary file writer
@@ -186,18 +210,20 @@ def create_file_writer(logdir,
      a mock object which acts like a summary writer but does nothing,
      useful to use as a context manager.
     max_queue: the largest number of summaries to keep in a queue; will
-     flush once the queue gets bigger than this.
-    flush_millis: the largest interval between flushes.
-    filename_suffix: optional suffix for the event file name.
+     flush once the queue gets bigger than this. Defaults to 10.
+    flush_millis: the largest interval between flushes. Defaults to 120,000.
+    filename_suffix: optional suffix for the event file name. Defaults to `.v2`.
     name: Shared name for this SummaryWriter resource stored to default
-      Graph.
+      Graph. Defaults to the provided logdir prefixed with `logdir:`. Note: if a
+      summary writer resource with this shared name already exists, the returned
+      SummaryWriter wraps that resource and the other arguments have no effect.
 
   Returns:
     Either a summary writer or an empty object which can be used as a
     summary writer.
   """
   if logdir is None:
-    return SummaryWriter(None)
+    return SummaryWriter(None, None)
   with ops.device("cpu:0"):
     if max_queue is None:
       max_queue = constant_op.constant(10)
@@ -205,6 +231,8 @@ def create_file_writer(logdir,
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
       filename_suffix = constant_op.constant(".v2")
+    if name is None:
+      name = "logdir:" + logdir
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
@@ -267,13 +295,12 @@ def create_db_writer(db_uri,
 
 def _make_summary_writer(name, factory, **kwargs):
   resource = gen_summary_ops.summary_writer(shared_name=name)
+  init_op_fn = lambda: factory(resource, **kwargs)
   # TODO(apassos): Consider doing this instead.
-  # node = factory(resource, **kwargs)
   # if not context.executing_eagerly():
-  #   ops.get_default_session().run(node)
-  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME,
-                        factory(resource, **kwargs))
-  return SummaryWriter(resource)
+  #   ops.get_default_session().run(init_op)
+  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME, init_op_fn())
+  return SummaryWriter(resource, init_op_fn)
 
 
 def _cleanse_string(name, pattern, value):
@@ -341,7 +368,7 @@ def summary_writer_function(name, tensor, function, family=None):
   if context.context().summary_writer_resource is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
-    op = utils.smart_cond(
+    op = smart_cond.smart_cond(
         should_record_summaries(), record, _nothing, name="")
     ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
   return op
@@ -538,7 +565,14 @@ def flush(writer=None, name=None):
     writer = context.context().summary_writer_resource
     if writer is None:
       return control_flow_ops.no_op()
-  return gen_summary_ops.flush_summary_writer(writer, name=name)
+  else:
+    if isinstance(writer, SummaryWriter):
+      writer = writer._resource  # pylint: disable=protected-access
+  with ops.device("cpu:0"):
+    return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
+_flush_fn = flush  # for within SummaryWriter.flush()
 
 
 def eval_dir(model_dir, name=None):
diff --git a/tensorflow/python/summary/writer/event_file_writer_v2.py b/tensorflow/python/summary/writer/event_file_writer_v2.py
new file mode 100644
index 0000000000..5c66c0f7a8
--- /dev/null
+++ b/tensorflow/python/summary/writer/event_file_writer_v2.py
@@ -0,0 +1,140 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Writes events to disk in a logdir."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.platform import gfile
+
+
+class EventFileWriterV2(object):
+  """Writes `Event` protocol buffers to an event file via the graph.
+
+  The `EventFileWriterV2` class is backed by the summary file writer in the v2
+  summary API (currently in tf.contrib.summary), so it uses a shared summary
+  writer resource and graph ops to write events.
+
+  As with the original EventFileWriter, this class will asynchronously write
+  Event protocol buffers to the backing file. The Event file is encoded using
+  the tfrecord format, which is similar to RecordIO.
+  """
+
+  def __init__(self, session, logdir, max_queue=10, flush_secs=120,
+               filename_suffix=''):
+    """Creates an `EventFileWriterV2` and an event file to write to.
+
+    On construction, this calls `tf.contrib.summary.create_file_writer` within
+    the graph from `session.graph` to look up a shared summary writer resource
+    for `logdir` if one exists, and create one if not. Creating the summary
+    writer resource in turn creates a new event file in `logdir` to be filled
+    with `Event` protocol buffers passed to `add_event`. Graph ops to control
+    this writer resource are added to `session.graph` during this init call;
+    stateful methods on this class will call `session.run()` on these ops.
+
+    Note that because the underlying resource is shared, it is possible that
+    other parts of the code using the same session may interact independently
+    with the resource, e.g. by flushing or even closing it. It is the caller's
+    responsibility to avoid any undesirable sharing in this regard.
+
+    The remaining arguments to the constructor (`flush_secs`, `max_queue`, and
+    `filename_suffix`) control the construction of the shared writer resource
+    if one is created. If an existing resource is reused, these arguments have
+    no effect.  See `tf.contrib.summary.create_file_writer` for details.
+
+    Args:
+      session: A `tf.Session`. Session that will hold shared writer resource.
+        The writer ops will be added to session.graph during this init call.
+      logdir: A string. Directory where event file will be written.
+      max_queue: Integer. Size of the queue for pending events and summaries.
+      flush_secs: Number. How often, in seconds, to flush the
+        pending events and summaries to disk.
+      filename_suffix: A string. Every event file's name is suffixed with
+        `filename_suffix`.
+    """
+    self._session = session
+    self._logdir = logdir
+    self._closed = False
+    if not gfile.IsDirectory(self._logdir):
+      gfile.MakeDirs(self._logdir)
+
+    with self._session.graph.as_default():
+      with ops.name_scope('filewriter'):
+        file_writer = summary_ops_v2.create_file_writer(
+            logdir=self._logdir,
+            max_queue=max_queue,
+            flush_millis=flush_secs * 1000,
+            filename_suffix=filename_suffix)
+        with summary_ops_v2.always_record_summaries(), file_writer.as_default():
+          self._event_placeholder = array_ops.placeholder_with_default(
+              constant_op.constant('unused', dtypes.string),
+              shape=[])
+          self._add_event_op = summary_ops_v2.import_event(
+              self._event_placeholder)
+        self._init_op = file_writer.init()
+        self._flush_op = file_writer.flush()
+        self._close_op = file_writer.close()
+      self._session.run(self._init_op)
+
+  def get_logdir(self):
+    """Returns the directory where event file will be written."""
+    return self._logdir
+
+  def reopen(self):
+    """Reopens the EventFileWriter.
+
+    Can be called after `close()` to add more events in the same directory.
+    The events will go into a new events file.
+
+    Does nothing if the EventFileWriter was not closed.
+    """
+    if self._closed:
+      self._closed = False
+      self._session.run(self._init_op)
+
+  def add_event(self, event):
+    """Adds an event to the event file.
+
+    Args:
+      event: An `Event` protocol buffer.
+    """
+    if not self._closed:
+      event_pb = event.SerializeToString()
+      self._session.run(
+          self._add_event_op, feed_dict={self._event_placeholder: event_pb})
+
+  def flush(self):
+    """Flushes the event file to disk.
+
+    Call this method to make sure that all pending events have been written to
+    disk.
+    """
+    self._session.run(self._flush_op)
+
+  def close(self):
+    """Flushes the event file to disk and close the file.
+
+    Call this method when you do not need the summary writer anymore.
+    """
+    if not self._closed:
+      self.flush()
+      self._session.run(self._close_op)
+      self._closed = True
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 57f78c156b..aca084fc91 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.summary.writer.event_file_writer_v2 import EventFileWriterV2
 from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
@@ -286,6 +287,11 @@ class FileWriter(SummaryToEventTransformer):
   file contents asynchronously. This allows a training program to call methods
   to add data to the file directly from the training loop, without slowing down
   training.
+
+  When constructed with a `tf.Session` parameter, a `FileWriter` instead forms
+  a compatibility layer over new graph-based summaries (`tf.contrib.summary`)
+  to facilitate the use of new summary writing with pre-existing code that
+  expects a `FileWriter` instance.
   """
 
   def __init__(self,
@@ -294,10 +300,11 @@ class FileWriter(SummaryToEventTransformer):
                max_queue=10,
                flush_secs=120,
                graph_def=None,
-               filename_suffix=None):
-    """Creates a `FileWriter` and an event file.
+               filename_suffix=None,
+               session=None):
+    """Creates a `FileWriter`, optionally shared within the given session.
 
-    On construction the summary writer creates a new event file in `logdir`.
+    Typically, constructing a file writer creates a new event file in `logdir`.
     This event file will contain `Event` protocol buffers constructed when you
     call one of the following functions: `add_summary()`, `add_session_log()`,
     `add_event()`, or `add_graph()`.
@@ -317,13 +324,16 @@ class FileWriter(SummaryToEventTransformer):
     writer = tf.summary.FileWriter(<some-directory>, sess.graph)
     ```
 
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
+    The `session` argument to the constructor makes the returned `FileWriter` a
+    a compatibility layer over new graph-based summaries (`tf.contrib.summary`).
+    Crucially, this means the underlying writer resource and events file will
+    be shared with any other `FileWriter` using the same `session` and `logdir`,
+    and with any `tf.contrib.summary.SummaryWriter` in this session using the
+    the same shared resource name (which by default scoped to the logdir). If
+    no such resource exists, one will be created using the remaining arguments
+    to this constructor, but if one already exists those arguments are ignored.
+    In either case, ops will be added to `session.graph` to control the
+    underlying file writer resource. See `tf.contrib.summary` for more details.
 
     Args:
       logdir: A string. Directory where event file will be written.
@@ -334,6 +344,7 @@ class FileWriter(SummaryToEventTransformer):
       graph_def: DEPRECATED: Use the `graph` argument instead.
       filename_suffix: A string. Every event file's name is suffixed with
         `suffix`.
+      session: A `tf.Session` object. See details above.
 
     Raises:
       RuntimeError: If called with eager execution enabled.
@@ -347,9 +358,12 @@ class FileWriter(SummaryToEventTransformer):
       raise RuntimeError(
           "tf.summary.FileWriter is not compatible with eager execution. "
           "Use tf.contrib.summary instead.")
-
-    event_writer = EventFileWriter(logdir, max_queue, flush_secs,
-                                   filename_suffix)
+    if session is not None:
+      event_writer = EventFileWriterV2(
+          session, logdir, max_queue, flush_secs, filename_suffix)
+    else:
+      event_writer = EventFileWriter(logdir, max_queue, flush_secs,
+                                     filename_suffix)
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
   def __enter__(self):
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 88ade0aac3..dc990c2602 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -29,10 +29,12 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import plugin_asset
@@ -42,7 +44,10 @@ from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.util import compat
 
 
-class SummaryWriterTestCase(test.TestCase):
+class FileWriterTestCase(test.TestCase):
+
+  def _FileWriter(self, *args, **kwargs):
+    return writer.FileWriter(*args, **kwargs)
 
   def _TestDir(self, test_name):
     test_dir = os.path.join(self.get_temp_dir(), test_name)
@@ -96,7 +101,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testAddingSummaryGraphAndRunMetadata(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.add_summary(
@@ -171,7 +176,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_named_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, graph=g)
+    sw = self._FileWriter(test_dir, graph=g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -179,7 +184,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_positional_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, g)
+    sw = self._FileWriter(test_dir, g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -188,7 +193,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, graph_def=gd)
+    sw = self._FileWriter(test_dir, graph_def=gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -197,7 +202,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, gd)
+    sw = self._FileWriter(test_dir, gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -207,18 +212,18 @@ class SummaryWriterTestCase(test.TestCase):
       with ops.Graph().as_default() as g:
         constant_op.constant([12], name="douze")
       gd = g.as_graph_def()
-      sw = writer.FileWriter(test_dir, graph=g, graph_def=gd)
+      sw = self._FileWriter(test_dir, graph=g, graph_def=gd)
       sw.close()
 
   def testNeitherGraphNorGraphDef(self):
     with self.assertRaises(TypeError):
       test_dir = self._CleanTestDir("basics_string_instead_of_graph")
-      sw = writer.FileWriter(test_dir, "string instead of graph object")
+      sw = self._FileWriter(test_dir, "string instead of graph object")
       sw.close()
 
   def testCloseAndReopen(self):
     test_dir = self._CleanTestDir("close_and_reopen")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.close()
     # Sleep at least one second to make sure we get a new event file name.
@@ -261,7 +266,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testNonBlockingClose(self):
     test_dir = self._CleanTestDir("non_blocking_close")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     # Sleep 1.2 seconds to make sure event queue is empty.
     time.sleep(1.2)
     time_before_close = time.time()
@@ -270,7 +275,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testWithStatement(self):
     test_dir = self._CleanTestDir("with_statement")
-    with writer.FileWriter(test_dir) as sw:
+    with self._FileWriter(test_dir) as sw:
       sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     event_paths = sorted(glob.glob(os.path.join(test_dir, "event*")))
     self.assertEquals(1, len(event_paths))
@@ -280,7 +285,7 @@ class SummaryWriterTestCase(test.TestCase):
   # protocol buffers correctly.
   def testAddingSummariesFromSessionRunCalls(self):
     test_dir = self._CleanTestDir("global_step")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     with self.test_session():
       i = constant_op.constant(1, dtype=dtypes.int32, shape=[])
       l = constant_op.constant(2, dtype=dtypes.int64, shape=[])
@@ -327,7 +332,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testPluginMetadataStrippedFromSubsequentEvents(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
 
@@ -386,7 +391,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
-    sw = writer.FileWriter(test_dir, filename_suffix="_test_suffix")
+    sw = self._FileWriter(test_dir, filename_suffix="_test_suffix")
     for _ in range(10):
       sw.add_summary(
           summary_pb2.Summary(value=[
@@ -400,9 +405,178 @@ class SummaryWriterTestCase(test.TestCase):
     for filename in event_filenames:
       self.assertTrue(filename.endswith("_test_suffix"))
 
+  def testPluginAssetSerialized(self):
+    class ExamplePluginAsset(plugin_asset.PluginAsset):
+      plugin_name = "example"
+
+      def assets(self):
+        return {"foo.txt": "foo!", "bar.txt": "bar!"}
+
+    with ops.Graph().as_default() as g:
+      plugin_asset.get_plugin_asset(ExamplePluginAsset)
+
+      logdir = self.get_temp_dir()
+      fw = self._FileWriter(logdir)
+      fw.add_graph(g)
+    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
+
+    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "foo!")
+
+    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "bar!")
 
-class SummaryWriterCacheTest(test.TestCase):
-  """SummaryWriterCache tests."""
+
+class SessionBasedFileWriterTestCase(FileWriterTestCase):
+  """Tests for FileWriter behavior when passed a Session argument."""
+
+  def _FileWriter(self, *args, **kwargs):
+    if "session" not in kwargs:
+      # Pass in test_session() as the session. It will be cached during this
+      # test method invocation so that any other use of test_session() with no
+      # graph should result in re-using the same underlying Session.
+      with self.test_session() as sess:
+        kwargs["session"] = sess
+        return writer.FileWriter(*args, **kwargs)
+    return writer.FileWriter(*args, **kwargs)
+
+  def _createTaggedSummary(self, tag):
+    summary = summary_pb2.Summary()
+    summary.value.add(tag=tag)
+    return summary
+
+  def testSharing_withOtherSessionBasedFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # File writer, should share file with writer1
+      writer2 = writer.FileWriter(session=sess, logdir=logdir)
+      writer2.add_summary(self._createTaggedSummary("two"), 2)
+      writer2.flush()
+
+      # File writer with different logdir (shouldn't be in this logdir at all)
+      writer3 = writer.FileWriter(session=sess, logdir=logdir + "-other")
+      writer3.add_summary(self._createTaggedSummary("three"), 3)
+      writer3.flush()
+
+      # File writer in a different session (should be in separate file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer4.add_summary(self._createTaggedSummary("four"), 4)
+        writer4.flush()
+
+      # One more file writer, should share file with writer1
+      writer5 = writer.FileWriter(session=sess, logdir=logdir)
+      writer5.add_summary(self._createTaggedSummary("five"), 5)
+      writer5.flush()
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "four"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+    # Just check that the other logdir file exists to be sure we wrote it
+    self.assertTrue(glob.glob(os.path.join(logdir + "-other", "event*")))
+
+  def testSharing_withExplicitSummaryFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer via FileWriter(session=?)
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # Next one via create_file_writer(), should use same file
+      writer2 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer2.as_default():
+        summary2 = summary_ops_v2.scalar("two", 2.0, step=2)
+      sess.run(writer2.init())
+      sess.run(summary2)
+      sess.run(writer2.flush())
+
+      # Next has different shared name, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops_v2.create_file_writer(logdir=logdir, name="other")
+      with summary_ops_v2.always_record_summaries(), writer3.as_default():
+        summary3 = summary_ops_v2.scalar("three", 3.0, step=3)
+      sess.run(writer3.init())
+      sess.run(summary3)
+      sess.run(writer3.flush())
+
+      # Next uses a second session, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = summary_ops_v2.create_file_writer(logdir=logdir)
+        with summary_ops_v2.always_record_summaries(), writer4.as_default():
+          summary4 = summary_ops_v2.scalar("four", 4.0, step=4)
+        other_sess.run(writer4.init())
+        other_sess.run(summary4)
+        other_sess.run(writer4.flush())
+
+        # Next via FileWriter(session=?) uses same second session, should be in
+        # same separate file. (This checks sharing in the other direction)
+        writer5 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer5.add_summary(self._createTaggedSummary("five"), 5)
+        writer5.flush()
+
+      # One more via create_file_writer(), should use same file
+      writer6 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer6.as_default():
+        summary6 = summary_ops_v2.scalar("six", 6.0, step=6)
+      sess.run(writer6.init())
+      sess.run(summary6)
+      sess.run(writer6.flush())
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "six"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("six", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "three"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("three", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Third file should have "four" and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+
+class FileWriterCacheTest(test.TestCase):
+  """FileWriterCache tests."""
 
   def _test_dir(self, test_name):
     """Create an empty dir to use for tests.
@@ -448,32 +622,5 @@ class SummaryWriterCacheTest(test.TestCase):
       self.assertFalse(sw1 == sw2)
 
 
-class ExamplePluginAsset(plugin_asset.PluginAsset):
-  plugin_name = "example"
-
-  def assets(self):
-    return {"foo.txt": "foo!", "bar.txt": "bar!"}
-
-
-class PluginAssetsTest(test.TestCase):
-
-  def testPluginAssetSerialized(self):
-    with ops.Graph().as_default() as g:
-      plugin_asset.get_plugin_asset(ExamplePluginAsset)
-
-      logdir = self.get_temp_dir()
-      fw = writer.FileWriter(logdir)
-      fw.add_graph(g)
-    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
-
-    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "foo!")
-
-    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "bar!")
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
index dcf747971b..6b65b0ace3 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_event"
-- 
GitLab


From 2fc718c21cb82b2905cfc0ade2c801ce56af62d1 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Wed, 11 Apr 2018 02:16:25 -0700
Subject: [PATCH 0581/1262] [TF:XLA] Mark oom_test as optonly, it's really slow
 when compiled without optimization.

PiperOrigin-RevId: 192420481
---
 tensorflow/compiler/tests/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index a7a8d2d1ff..47c6ab58c0 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -203,6 +203,7 @@ tf_xla_py_test(
     tags = [
         # Allocates very large amounts of memory and does not work under TSAN.
         "notsan",
+        "optonly",  # Times out frequently in fastbuild.
     ],
     deps = [
         ":xla_test",
-- 
GitLab


From ef6637771b2582245bb15507a6796b3c3f1db6b5 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 11 Apr 2018 20:48:32 +0900
Subject: [PATCH 0582/1262] fix typo

---
 tensorflow/core/framework/collective.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 5810c7fa54..a82fb50d88 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -178,7 +178,7 @@ class StepSequenceInterface {
   virtual void RefreshStepIdSequenceAsync(int64 graph_key,
                                           const StatusCallback& done) = 0;
 
-  // Returns the the step_id that should be used for initiating a new execution
+  // Returns the step_id that should be used for initiating a new execution
   // on the specified graph. May return the same step_id multiple times if
   // RetireStepId or RefreshStepIdReservation is not called.
   virtual int64 NextStepId(int64 graph_key) = 0;
-- 
GitLab


From acd9725e72af749c60153cd4d7efdd679c935426 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 11 Apr 2018 20:49:46 +0900
Subject: [PATCH 0583/1262] fix typo

---
 tensorflow/contrib/lite/toco/model.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 56ef9fe2a8..8a936842d9 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -151,7 +151,7 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that that does not by itself tell whether the values in the array are
+// Note that does not by itself tell whether the values in the array are
 // real (are literally interpreted as real numbers) or quantized (only acquire
 // a meaning as real numbers in conjunction with QuantizationParams).
 //
-- 
GitLab


From bbfff939e45013a7b5f8f6412981e7b50a4273d4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 07:47:26 -0700
Subject: [PATCH 0584/1262] Fixing propagation of minmax info on constant
 gather ops.

PiperOrigin-RevId: 192448922
---
 .../resolve_constant_concatenation.cc            | 16 ++++++++++++----
 .../resolve_constant_gather.cc                   | 10 ++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 064810b53e..d916ae0ddf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -105,7 +106,8 @@ void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
 // already set (e.g. because of previous pass in TOCO), it doesn't change it and
 // returns. Otherwise it uses the input arrays min and max values to compute the
 // concatenated array min and max.
-void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
+void SetMinMaxForConcatenedArray(GraphTransformation* transformation,
+                                 const std::vector<Array*>& input_arrays,
                                  Array* concatenated_array) {
   CHECK(concatenated_array->data_type == ArrayDataType::kFloat);
   // If the minmax is already set, use it
@@ -125,6 +127,9 @@ void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
   MinMax& minmax = concatenated_array->GetOrCreateMinMax();
   minmax.min = concat_min;
   minmax.max = concat_max;
+
+  transformation->AddMessageF("Setting concatenated array min/max to %g,%g",
+                              concat_min, concat_max);
 }
 
 }  // namespace
@@ -161,11 +166,14 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     input_arrays.push_back(&model->GetArray(input_name));
   }
 
+  AddMessageF("Performing constant concat of %s into %s",
+              absl::StrJoin(concat_op->inputs, ", "), concatenated_array_name);
+
   switch (concatenated_array.data_type) {
     case ArrayDataType::kFloat:
       ConcatenateTensorBuffers<ArrayDataType::kFloat>(
           input_arrays, concatenation_axis, &concatenated_array);
-      SetMinMaxForConcatenedArray(input_arrays, &concatenated_array);
+      SetMinMaxForConcatenedArray(this, input_arrays, &concatenated_array);
       break;
     case ArrayDataType::kUint8:
       ConcatenateTensorBuffers<ArrayDataType::kUint8>(
@@ -189,13 +197,13 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    // Check to prevent removal of shared tensors
+    // Check to prevent removal of shared tensors.
     if (CountOpsWithInput(*model, input_name) == 1) {
       model->EraseArray(input_name);
     }
   }
 
-  // Remove concatenate operator
+  // Remove concatenate operator.
   model->operators.erase(concat_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
index d999c2df94..debe298a5a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -98,6 +98,16 @@ bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
   CHECK(coords_array.data_type == ArrayDataType::kInt32)
       << "Only int32 indices are supported";
 
+  // Copy min/max info if present. The ranges of the selected values may be
+  // a subset of the original range but we want to ensure the quantization
+  // params stay the same.
+  if (input_array.minmax) {
+    const auto& input_minmax = input_array.GetMinMax();
+    auto& output_minmax = output_array.GetOrCreateMinMax();
+    output_minmax.min = input_minmax.min;
+    output_minmax.max = input_minmax.max;
+  }
+
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
-- 
GitLab


From 77548a7877028614e4c5e0b4c2a8d25660785c6f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 08:11:50 -0700
Subject: [PATCH 0585/1262] Remove unused former source of
 tensorflow.org/tutorials/image_retraining.

The source of https://tensorflow.org/tutorials/image_retraining has moved from
https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src/tutorials
to https://github.com/tensorflow/hub/tree/master/docs/tutorials
because of its use of TensorFlow Hub.

This change replaces the now-defunct version with a pointer to the new
location, in order to avoid dead code.

PiperOrigin-RevId: 192451570
---
 .../docs_src/tutorials/image_retraining.md    | 404 +-----------------
 1 file changed, 2 insertions(+), 402 deletions(-)

diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index 93d7c86e42..27784eef9c 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -1,404 +1,4 @@
 # How to Retrain Inception's Final Layer for New Categories
 
-Modern object recognition models have millions of parameters and can take weeks
-to fully train. Transfer learning is a technique that shortcuts a lot of this
-work by taking a fully-trained model for a set of categories like ImageNet, and
-retrains from the existing weights for new classes. In this example we'll be
-retraining the final layer from scratch, while leaving all the others untouched.
-For more information on the approach you can see
-[this paper on Decaf](https://arxiv.org/pdf/1310.1531v1.pdf).
-
-Though it's not as good as a full training run, this is surprisingly effective
-for many applications, and can be run in as little as thirty minutes on a
-laptop, without requiring a GPU. This tutorial will show you how to run the
-example script on your own images, and will explain some of the options you have
-to help control the training process.
-
-Note: A version of this tutorial is also available
-[as a codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0).
-
-Before you start, you must @{$install$install tensorflow}.
-
-[TOC]
-
-## Training on Flowers
-
-![Daisies by Kelly Sikkema](https://www.tensorflow.org/images/daisies.jpg)
-
-[Image by Kelly Sikkema](https://www.flickr.com/photos/95072945@N05/9922116524/)
-
-Before you start any training, you'll need a set of images to teach the network
-about the new classes you want to recognize. There's a later section that
-explains how to prepare your own images, but to make it easy we've created an
-archive of creative-commons licensed flower photos to use initially. To get the
-set of flower photos, run these commands:
-
-```sh
-cd ~
-curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
-tar xzf flower_photos.tgz
-```
-
-Once you have the images, you can clone the tensorflow repository using the
-following command (these examples are not included in the installation):
-
-```sh
-git clone https://github.com/tensorflow/tensorflow
-```
-
-Then checkout the version of the tensorflow repository matching your
-installation and this tutorial as follows:
-
-``` sh
-cd tensorflow
-git checkout {version}
-```
-
-In the simplest cases the retrainer can then be run like this:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py --image_dir ~/flower_photos
-```
-
-The script has many other options. You can get a full listing with:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py -h
-```
-
-This script loads the pre-trained Inception v3 model, removes the old top layer,
-and trains a new one on the flower photos you've downloaded. None of the flower
-species were in the original ImageNet classes the full network was trained on.
-The magic of transfer learning is that lower layers that have been trained to
-distinguish between some objects can be reused for many recognition tasks
-without any alteration.
-
-## Bottlenecks
-
-The script can take thirty minutes or more to complete, depending on the speed
-of your machine. The first phase analyzes all the images on disk and calculates
-the bottleneck values for each of them. 'Bottleneck' is an informal term we
-often use for the layer just before the final output layer that actually does
-the classification. This penultimate layer has been trained to output a set of
-values that's good enough for the classifier to use to distinguish between all
-the classes it's been asked to recognize. That means it has to be a meaningful
-and compact summary of the images, since it has to contain enough information
-for the classifier to make a good choice in a very small set of values. The
-reason our final layer retraining can work on new classes is that it turns out
-the kind of information needed to distinguish between all the 1,000 classes in
-ImageNet is often also useful to distinguish between new kinds of objects.
-
-Because every image is reused multiple times during training and calculating
-each bottleneck takes a significant amount of time, it speeds things up to
-cache these bottleneck values on disk so they don't have to be repeatedly
-recalculated. By default they're stored in the `/tmp/bottleneck` directory, and
-if you rerun the script they'll be reused so you don't have to wait for this
-part again.
-
-## Training
-
-Once the bottlenecks are complete, the actual training of the top layer of the
-network begins. You'll see a series of step outputs, each one showing training
-accuracy, validation accuracy, and the cross entropy. The training accuracy
-shows what percent of the images used in the current training batch were
-labeled with the correct class. The validation accuracy is the precision on a
-randomly-selected group of images from a different set. The key difference is
-that the training accuracy is based on images that the network has been able
-to learn from so the network can overfit to the noise in the training data. A
-true measure of the performance of the network is to measure its performance on
-a data set not contained in the training data -- this is measured by the
-validation accuracy. If the train accuracy is high but the validation accuracy
-remains low, that means the network is overfitting and memorizing particular
-features in the training images that aren't helpful more generally. Cross
-entropy is a loss function which gives a glimpse into how well the learning
-process is progressing. The training's objective is to make the loss as small as
-possible, so you can tell if the learning is working by keeping an eye on
-whether the loss keeps trending downwards, ignoring the short-term noise.
-
-By default this script will run 4,000 training steps. Each step chooses 100
-images at random from the training set, finds their bottlenecks from the cache,
-and feeds them into the final layer to get predictions. Those predictions are
-then compared against the actual labels to update the final layer's weights
-through the back-propagation process. As the process continues you should see
-the reported accuracy improve, and after all the steps are done, a final test
-accuracy evaluation is run on a set of images kept separate from the training
-and validation pictures. This test evaluation is the best estimate of how the
-trained model will perform on the classification task. You should see an
-accuracy value of between 90% and 95%, though the exact value will vary from run
-to run since there's randomness in the training process. This number is based on
-the percent of the images in the test set that are given the correct label
-after the model is fully trained.
-
-## Visualizing the Retraining with TensorBoard
-
-The script includes TensorBoard summaries that make it easier to understand, debug, and optimize the retraining. For example, you can visualize the graph and statistics, such as how the weights or accuracy varied during training.
-
-To launch TensorBoard, run this command during or after retraining:
-
-```sh
-tensorboard --logdir /tmp/retrain_logs
-```
-
-Once TensorBoard is running, navigate your web browser to `localhost:6006` to view the TensorBoard.
-
-The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
-
-The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
-
-## Using the Retrained Model
-
-The script will write out a version of the Inception v3 network with a final
-layer retrained to your categories to /tmp/output_graph.pb, and a text file
-containing the labels to /tmp/output_labels.txt. These are both in a format that
-the @{$image_recognition$C++ and Python image classification examples}
-can read in, so you can start using your new model immediately. Since you've
-replaced the top layer, you will need to specify the new name in the script, for
-example with the flag `--output_layer=final_result` if you're using label_image.
-
-Here's an example of how to run the label_image example with your
-retrained graphs:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=Mul \
---output_layer=final_result \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-You should see a list of flower labels, in most cases with daisy on top
-(though each retrained model may be slightly different). You can replace the
-`--image` parameter with your own images to try those out.
-
-If you'd like to use the retrained model in your own Python program, then the
-above
-[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/label_image/label_image.py)
-is a reasonable starting point. The `label_image`
-directory also contains C++ code which you can use as a template to integrate
-tensorflow with your own applications.
-
-If you find the default Inception v3 model is too large or slow for your
-application, take a look at the [Other Model Architectures section](/tutorials/image_retraining#other_model_architectures)
-below for options to speed up and slim down your network.
-
-## Training on Your Own Categories
-
-If you've managed to get the script working on the flower example images, you
-can start looking at teaching it to recognize categories you care about instead.
-In theory all you'll need to do is point it at a set of sub-folders, each named
-after one of your categories and containing only images from that category. If
-you do that and pass the root folder of the subdirectories as the argument to
-`--image_dir`, the script should train just like it did for the flowers.
-
-Here's what the folder structure of the flowers archive looks like, to give you
-and example of the kind of layout the script is looking for:
-
-![Folder Structure](https://www.tensorflow.org/images/folder_structure.png)
-
-In practice it may take some work to get the accuracy you want. I'll try to
-guide you through some of the common problems you might encounter below.
-
-## Creating a Set of Training Images
-
-The first place to start is by looking at the images you've gathered, since the
-most common issues we see with training come from the data that's being fed in.
-
-For training to work well, you should gather at least a hundred photos of each
-kind of object you want to recognize. The more you can gather, the better the
-accuracy of your trained model is likely to be. You also need to make sure that
-the photos are a good representation of what your application will actually
-encounter. For example, if you take all your photos indoors against a blank wall
-and your users are trying to recognize objects outdoors, you probably won't see
-good results when you deploy.
-
-Another pitfall to avoid is that the learning process will pick up on anything
-that the labeled images have in common with each other, and if you're not
-careful that might be something that's not useful. For example if you photograph
-one kind of object in a blue room, and another in a green one, then the model
-will end up basing its prediction on the background color, not the features of
-the object you actually care about. To avoid this, try to take pictures in as
-wide a variety of situations as you can, at different times, and with different
-devices. If you want to know more about this problem, you can read about the
-classic (and possibly apocryphal)
-[tank recognition problem](https://www.jefftk.com/p/detecting-tanks).
-
-You may also want to think about the categories you use. It might be worth
-splitting big categories that cover a lot of different physical forms into
-smaller ones that are more visually distinct. For example instead of 'vehicle'
-you might use 'car', 'motorbike', and 'truck'. It's also worth thinking about
-whether you have a 'closed world' or an 'open world' problem. In a closed world,
-the only things you'll ever be asked to categorize are the classes of object you
-know about. This might apply to a plant recognition app where you know the user
-is likely to be taking a picture of a flower, so all you have to do is decide
-which species. By contrast a roaming robot might see all sorts of different
-things through its camera as it wanders around the world. In that case you'd
-want the classifier to report if it wasn't sure what it was seeing. This can be
-hard to do well, but often if you collect a large number of typical 'background'
-photos with no relevant objects in them, you can add them to an extra 'unknown'
-class in your image folders.
-
-It's also worth checking to make sure that all of your images are labeled
-correctly. Often user-generated tags are unreliable for our purposes, for
-example using #daisy for pictures of a person named Daisy. If you go through
-your images and weed out any mistakes it can do wonders for your overall
-accuracy.
-
-## Training Steps
-
-If you're happy with your images, you can take a look at improving your results
-by altering the details of the learning process. The simplest one to try is
-`--how_many_training_steps`. This defaults to 4,000, but if you increase it to
-8,000 it will train for twice as long. The rate of improvement in the accuracy
-slows the longer you train for, and at some point will stop altogether, but you
-can experiment to see when you hit that limit for your model.
-
-## Distortions
-
-A common way of improving the results of image training is by deforming,
-cropping, or brightening the training inputs in random ways. This has the
-advantage of expanding the effective size of the training data thanks to all the
-possible variations of the same images, and tends to help the network learn to
-cope with all the distortions that will occur in real-life uses of the
-classifier. The biggest disadvantage of enabling these distortions in our script
-is that the bottleneck caching is no longer useful, since input images are never
-reused exactly. This means the training process takes a lot longer, so I
-recommend trying this as a way of fine-tuning your model once you've got one
-that you're reasonably happy with.
-
-You enable these distortions by passing `--random_crop`, `--random_scale` and
-`--random_brightness` to the script. These are all percentage values that
-control how much of each of the distortions is applied to each image. It's
-reasonable to start with values of 5 or 10 for each of them and then experiment
-to see which of them help with your application. `--flip_left_right` will
-randomly mirror half of the images horizontally, which makes sense as long as
-those inversions are likely to happen in your application. For example it
-wouldn't be a good idea if you were trying to recognize letters, since flipping
-them destroys their meaning.
-
-## Hyper-parameters
-
-There are several other parameters you can try adjusting to see if they help
-your results. The `--learning_rate` controls the magnitude of the updates to the
-final layer during training. Intuitively if this is smaller then the learning
-will take longer, but it can end up helping the overall precision. That's not
-always the case though, so you need to experiment carefully to see what works
-for your case. The `--train_batch_size` controls how many images are examined
-during one training step, and because the learning rate is applied per batch
-you'll need to reduce it if you have larger batches to get the same overall
-effect.
-
-## Training, Validation, and Testing Sets
-
-One of the things the script does under the hood when you point it at a folder
-of images is divide them up into three different sets. The largest is usually
-the training set, which are all the images fed into the network during training,
-with the results used to update the model's weights. You might wonder why we
-don't use all the images for training? A big potential problem when we're doing
-machine learning is that our model may just be memorizing irrelevant details of
-the training images to come up with the right answers. For example, you could
-imagine a network remembering a pattern in the background of each photo it was
-shown, and using that to match labels with objects. It could produce good
-results on all the images it's seen before during training, but then fail on new
-images because it's not learned general characteristics of the objects, just
-memorized unimportant details of the training images.
-
-This problem is known as overfitting, and to avoid it we keep some of our data
-out of the training process, so that the model can't memorize them. We then use
-those images as a check to make sure that overfitting isn't occurring, since if
-we see good accuracy on them it's a good sign the network isn't overfitting. The
-usual split is to put 80% of the images into the main training set, keep 10%
-aside to run as validation frequently during training, and then have a final 10%
-that are used less often as a testing set to predict the real-world performance
-of the classifier. These ratios can be controlled using the
-`--testing_percentage` and `--validation_percentage` flags. In general
-you should be able to leave these values at their defaults, since you won't
-usually find any advantage to training to adjusting them.
-
-Note that the script uses the image filenames (rather than a completely random
-function) to divide the images among the training, validation, and test sets.
-This is done to ensure that images don't get moved between training and testing
-sets on different runs, since that could be a problem if images that had been
-used for training a model were subsequently used in a validation set.
-
-You might notice that the validation accuracy fluctuates among iterations. Much
-of this fluctuation arises from the fact that a random subset of the validation
-set is chosen for each validation accuracy measurement. The fluctuations can be
-greatly reduced, at the cost of some increase in training time, by choosing
-`--validation_batch_size=-1`, which uses the entire validation set for each
-accuracy computation.
-
-Once training is complete, you may find it insightful to examine misclassified
-images in the test set. This can be done by adding the flag
-`--print_misclassified_test_images`. This may help you get a feeling for which
-types of images were most confusing for the model, and which categories were
-most difficult to distinguish. For instance, you might discover that some
-subtype of a particular category, or some unusual photo angle, is particularly
-difficult to identify, which may encourage you to add more training images of
-that subtype. Oftentimes, examining misclassified images can also point to
-errors in the input data set, such as mislabeled, low-quality, or ambiguous
-images. However, one should generally avoid point-fixing individual errors in
-the test set, since they are likely to merely reflect more general problems in
-the (much larger) training set.
-
-## Other Model Architectures
-
-By default the script uses a pretrained version of the Inception v3 model
-architecture. This is a good place to start because it provides high accuracy
-results, but if you intend to deploy your model on mobile devices or other
-resource-constrained environments you may want to trade off a little accuracy
-for much smaller file sizes or faster speeds. To help with that, the
-[retrain.py script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py)
-supports different variations on the [Mobilenet architecture](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html).
-
-These are a little less precise than Inception v3, but can result in far
-smaller file sizes (a few megabytes) and can be many times faster
-to run. To train with one of these models, pass in the `--architecture` flag,
-for example:
-
-```
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_0.25_128
-```
-
-This will create a 1.9MB model file in `/tmp/output_graph.pb`, with only 25% of
-the number of neurons of the full Mobilenet, and trained to take 128x128 sized
-input images.
-
-You can choose '1.0', '0.75', '0.50', or '0.25' to control the number of
-neurons (activations of hidden layers); the number of weights (and hence to
-some extent the file size and speed) shrinks like the square of that fraction.
-You can choose '224', '192', '160', or '128' for the input image size,
-with smaller sizes giving faster speeds.
-
-The speed and size advantages come at a loss to accuracy of course, but for many
-purposes this isn't critical. They can also be somewhat offset with improved
-training data. For example, training with distortions allows me to get above 80%
-accuracy on the flower data set even with the 0.25/128 graph above.
-
-If you're going to be using the Mobilenet models in label_image or your own
-programs, you'll need to feed in an image of the specified size converted to a
-float range into the 'input' tensor. Typically 24-bit images are in the range
-[0,255], and you must convert them to the [-1,1] float range expected by the
-model with the formula  `(image - 128.)/128.`.
-
-The default arguments for the `label_image` script are set for Inception V3.
-To use it with a MobileNet, specify the above normalization parameters as
-`input_mean` and `input_std` on the command line. You also must specify the
-image size that your model expects, as follows:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=input \
---output_layer=final_result \
---input_height=224 --input_width=224 \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-For more information on deploying the retrained model to a mobile device, see
-the [codelab version](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0)
-of this tutorial, especially [part 2](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/#0), which describes
-[TensorFlow Lite](/mobile/tflite/) and the additional optimizations it offers
-(including quantization of model weights).
+**NOTE: This tutorial has moved to**
+https://github.com/tensorflow/hub/tree/master/docs/tutorials/image_retraining.md
-- 
GitLab


From 8e1b323be4b5d56d531b2d5ee7a1fc573a2a0b5f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 08:30:18 -0700
Subject: [PATCH 0586/1262] Temporarily remove prelu from
 generated_examples_zip_test

PiperOrigin-RevId: 192453411
---
 tensorflow/contrib/lite/testing/BUILD         |  1 -
 .../contrib/lite/testing/generate_examples.py | 48 -------------------
 2 files changed, 49 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 198984e7e7..1ce89a25fd 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -42,7 +42,6 @@ gen_zipped_test_files(
         "minimum.zip",
         "mul.zip",
         "pad.zip",
-        "prelu.zip",
         "relu.zip",
         "relu1.zip",
         "relu6.zip",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 672158aa2f..0e6aceeb86 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -630,54 +630,6 @@ def make_relu6_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_prelu_tests(zip_path):
-  """Make a set of tests to do PReLU."""
-
-  test_parameters = [{
-      # The canonical case for image processing is having a 4D `input` (NHWC)
-      # and `shared_axes`=[1, 2], so the alpha parameter is per channel.
-      "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]],
-      "shared_axes": [[1, 2], [1]],
-  }]
-
-  def build_graph(parameters):
-    """Build the graph for the test case."""
-
-    input_tensor = tf.placeholder(
-        dtype=tf.float32, name="input", shape=parameters["input_shape"])
-    prelu = tf.keras.layers.PReLU(shared_axes=parameters["shared_axes"])
-    out = prelu(input_tensor)
-    return [input_tensor], [out]
-
-  def build_inputs(parameters, sess, inputs, outputs):
-    """Build the inputs for the test case."""
-
-    input_shape = parameters["input_shape"]
-    input_values = create_tensor_data(
-        np.float32, input_shape, min_value=-10, max_value=10)
-    shared_axes = parameters["shared_axes"]
-
-    alpha_shape = []
-    for dim in range(1, len(input_shape)):
-      alpha_shape.append(1 if dim in shared_axes else input_shape[dim])
-
-    alpha_values = create_tensor_data(np.float32, alpha_shape)
-
-    with tf.variable_scope("", reuse=True):
-      alpha = tf.get_variable("p_re_lu/alpha")
-      sess.run(alpha.assign(alpha_values))
-
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
-
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      use_frozen_graph=True)
-
-
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
-- 
GitLab


From 0073d1375add58b0493449c356af76aa33455f7d Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 11 Apr 2018 09:34:44 -0700
Subject: [PATCH 0587/1262] Fix Windows GPU TensorFlow Bazel builds.

The configure.py script will error out on Windows GPU builds due
to NCCL attempted to be configured (and is currently Linux only).

PiperOrigin-RevId: 192461362
---
 configure.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 81d5ad77ee..8fb8979111 100644
--- a/configure.py
+++ b/configure.py
@@ -1516,7 +1516,8 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
-    set_tf_nccl_install_path(environ_cp)
+      set_tf_nccl_install_path(environ_cp)
+
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
-- 
GitLab


From adfbc272ded60a221444423b1fee58551c6445c7 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Wed, 11 Apr 2018 09:34:51 -0700
Subject: [PATCH 0588/1262] Fixing dependencies.

PiperOrigin-RevId: 192461382
---
 tensorflow/contrib/lite/python/BUILD          |  5 ++++
 .../lite/python/convert_saved_model_test.py   | 12 +++++-----
 tensorflow/contrib/saved_model/BUILD          | 23 ++++++++++++++-----
 tensorflow/python/tools/BUILD                 | 14 ++++-------
 4 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index e735062a7f..6fafaf0727 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -106,8 +106,13 @@ py_test(
     deps = [
         ":convert_saved_model",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model",
     ],
 )
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index d87fbeb91c..734e42d619 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -25,21 +25,21 @@ from __future__ import print_function
 
 import os
 from tensorflow.contrib.lite.python import convert_saved_model
-from tensorflow.python import estimator
 from tensorflow.python import keras
-from tensorflow.python import layers
-from tensorflow.python import losses
-from tensorflow.python import nn
-from tensorflow.python import saved_model
-from tensorflow.python import train
 from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib as estimator
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import saved_model
+from tensorflow.python.training import training as train
 
 
 class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index e431c464ef..26fd4e2023 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -48,16 +48,14 @@ py_library(
     ],
 )
 
-py_test(
-    name = "reader_test",
-    size = "small",
-    srcs = ["python/saved_model/reader_test.py"],
+py_library(
+    name = "reader",
+    srcs = ["python/saved_model/reader.py"],
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
-    visibility = ["//visibility:private"],
+    visibility = ["//visibility:public"],
     deps = [
         ":saved_model_py",
-        "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:variables",
@@ -66,6 +64,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "reader_test",
+    size = "small",
+    srcs = ["python/saved_model/reader_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":reader",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "signature_def_utils_test",
     size = "small",
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 6e39ce8c80..cc2884a4f6 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -28,7 +28,7 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = ["//tensorflow/contrib/saved_model:reader"],
 )
 
 py_library(
@@ -38,11 +38,12 @@ py_library(
     deps = [
         ":saved_model_utils",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
+        "//tensorflow/python/saved_model:loader",
         "@six_archive//:six",
     ],
 )
@@ -52,14 +53,7 @@ py_binary(
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":saved_model_utils",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
-        "@six_archive//:six",
+        ":freeze_graph_lib",
     ],
 )
 
-- 
GitLab


From a9a3b98a76f1d4a8fb7a02e451fb71147a842f31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 09:43:32 -0700
Subject: [PATCH 0589/1262] Import FunctionDef as GrapplerFunctionItem

Explicitly track function input arg expansion into Placeholders,
and keep metadata to map between FunctionDef and GraphDef connectivity
formats.

PiperOrigin-RevId: 192462592
---
 tensorflow/core/grappler/grappler_item.h      |   3 +-
 .../grappler/optimizers/function_optimizer.cc |  29 +-
 .../optimizers/function_optimizer_test.cc     |  16 +-
 tensorflow/core/grappler/utils/BUILD          |   2 +
 tensorflow/core/grappler/utils/functions.cc   | 385 +++++++++++++-----
 tensorflow/core/grappler/utils/functions.h    | 116 +++++-
 .../core/grappler/utils/functions_test.cc     | 277 +++++++++----
 7 files changed, 627 insertions(+), 201 deletions(-)

diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 06bba544c3..45eed47b50 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -35,8 +35,9 @@ namespace grappler {
 // nodes, and potentially a set of nodes to feed.
 // TODO(volunteer_needed): turn this struct into a class.
 struct GrapplerItem {
-  GrapplerItem() {}
+  GrapplerItem() = default;
   GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef);
+  virtual ~GrapplerItem() = default;
 
   string id;  // A unique id for this item
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 343c89a9da..6d67ead355 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -38,11 +38,14 @@ class FunctionInliningContext {
  public:
   explicit FunctionInliningContext(const GrapplerItem& item,
                                    RewriterConfig::Toggle opt_level)
-      : library_(&item.graph.library()),
-        opt_level_(opt_level),
-        functions_(InliningCandidates(item)) {}
+      : opt_level_(opt_level),
+        functions_(InliningCandidates(item)),
+        function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
+                                                    item.graph.library())) {}
 
-  const FunctionDefLibrary& Library() const { return *library_; }
+  const FunctionLibraryDefinition& FunctionLibrary() const {
+    return function_library_;
+  }
 
   bool HasInlinedFunctions() const { return !functions_.empty(); }
 
@@ -78,9 +81,9 @@ class FunctionInliningContext {
     return functions;
   }
 
-  const FunctionDefLibrary* library_;
   RewriterConfig::Toggle opt_level_;
   std::unordered_map<string, const FunctionDef*> functions_;
+  FunctionLibraryDefinition function_library_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
 };
@@ -150,11 +153,14 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, ctx.Library());
-  if (!item) {
+  GrapplerFunctionItem item;
+  Status item_status =
+      MakeGrapplerFunctionItem(func, func_attr, ctx.FunctionLibrary(), &item);
+
+  if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
-                                   " instantiated by ", func_node.name());
+                                   " instantiated by ", func_node.name(),
+                                   ". Error: ", item_status.error_message());
   }
 
   std::unordered_map<string, int> input_nodes;
@@ -168,7 +174,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(
       HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
 
-  for (NodeDef& func_body_node : *item->graph.mutable_node()) {
+  for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
     if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
       CHECK_EQ(0, func_body_node.input_size());
       // Turn input placeholders into identity nodes
@@ -217,8 +223,9 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
 
   // Hook inlined function outputs to IdentityN node
   NodeDef* func_outputs = optimized_graph->add_node();
+  std::vector<string> fetch = OutputTensors(item);
   TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
-                                                item->fetch, func_outputs));
+                                                fetch, func_outputs));
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fe26a56fc2..099fe7caf2 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -92,13 +92,13 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
-      EXPECT_EQ("y/scale:0", node.input(1));
+      EXPECT_EQ("y/scale", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/y:0", node.input(0));
+      EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
@@ -180,13 +180,13 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
-      EXPECT_EQ("y/two:0", node.input(1));
+      EXPECT_EQ("y/two", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/y:0", node.input(0));
+      EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
@@ -264,13 +264,13 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/Linear_func:0", node.input(0));
+      EXPECT_EQ("y/Linear_func", node.input(0));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/Exp:0", node.input(0));
+      EXPECT_EQ("y/Exp", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
@@ -453,12 +453,12 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output/output:0", node.input(0));
+      EXPECT_EQ("square/output/output", node.input(0));
     } else if (node.name() == "square" && count++) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output:0", node.input(0));
+      EXPECT_EQ("square/output", node.input(0));
     } else if (node.name() == "outputs" && count++) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(kDevice, node.device());
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 7419c26dff..05d9cbaa2b 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -161,6 +161,8 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 4f286ce1c8..dd0d918e72 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -24,50 +24,285 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
 
-std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionDefLibrary& library) {
-  if (func.signature().name().empty()) {
-    LOG(ERROR) << "function name must be specified.";
-    return nullptr;
+void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
+    const InputArgExpansion& input_arg_expansion) {
+  input_arg_expansions_.insert(
+      {input_arg_expansion.input_name, input_arg_expansion});
+}
+
+void GrapplerFunctionConnectivity::RegisterFunctionBodyOutputs(
+    const string& node_name, const tensorflow::NameRangeMap& outputs) {
+  function_body_outputs_.insert({node_name, outputs});
+}
+
+Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
+    const string& func_def_input, std::vector<string>* graph_def_inputs) const {
+  using ::tensorflow::strings::Scanner;
+
+  // Parse input format: "node_name[:node_output][:position]"
+  string node_name;
+  string node_output;
+  int position = -1;
+
+  StringPiece capture;
+  StringPiece remaining;
+
+  // Parse "node_name"
+  if (Scanner(func_def_input)
+          .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
+          .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE)
+          .GetResult(&remaining, &capture)) {
+    node_name = string(capture.data(), capture.size());
   }
-  std::unique_ptr<GrapplerItem> new_item(new GrapplerItem());
-  new_item->id = func.signature().name();
-
-  std::unordered_map<string, string> port_map;
-
-  // Add the function inputs as placeholder
-  for (const auto& inp : func.signature().input_arg()) {
-    NodeDef* ph = new_item->graph.add_node();
-    ph->set_name(inp.name());
-    ph->set_op("Placeholder");
-    if (inp.type() != DT_INVALID) {
-      (*ph->mutable_attr())["T"].set_type(inp.type());
-    } else {
-      auto it = func_attr.find(inp.type_attr());
-      if (it == func_attr.end()) {
-        LOG(ERROR) << "Unknown type attribute " << inp.type_attr()
-                   << " for function input " << inp.name();
-        return nullptr;
+
+  // Parse "node_output" if it exists
+  if (Scanner(remaining)
+          .OneLiteral(":")
+          .RestartCapture()
+          .One(strings::Scanner::LOWERLETTER)
+          .Any(strings::Scanner::LETTER_DIGIT_UNDERSCORE)
+          .GetResult(&remaining, &capture)) {
+    node_output = string(capture.data(), capture.size());
+  }
+
+  // Parse "position" if it exists
+  if (Scanner(remaining)
+          .OneLiteral(":")
+          .RestartCapture()
+          .Many(strings::Scanner::DIGIT)
+          .GetResult(nullptr, &capture)) {
+    CHECK(strings::safe_strto32(capture, &position));
+  }
+
+  // If "node_output" is not empty, it must be an output of a function body node
+  bool is_function_body_output = !node_output.empty();
+
+  // Function input argument: "node_name[:position]"
+  if (!is_function_body_output) {
+    auto input_arg = input_arg_expansions_.find(node_name);
+    if (input_arg != input_arg_expansions_.end()) {
+      const InputArgExpansion& input_arg_expansion = input_arg->second;
+      const auto& placeholders = input_arg_expansion.placeholders;
+
+      if (position == -1) {
+        // If position is not defined use all placeholders
+        graph_def_inputs->reserve(placeholders.size());
+        for (const string& placeholder : placeholders) {
+          graph_def_inputs->push_back(placeholder);
+        }
       } else {
-        (*ph->mutable_attr())["T"] = it->second;
+        if (position > input_arg_expansion.placeholders.size() - 1) {
+          return errors::InvalidArgument("Invalid input ", node_name,
+                                         "position: ", position,
+                                         " (out of range)");
+        }
+        graph_def_inputs->push_back(input_arg_expansion.placeholders[position]);
+      }
+
+      return Status::OK();
+    }
+  }
+
+  // Function body output: "node_name:node_output[:position]"
+  if (is_function_body_output) {
+    auto function_body_outputs = function_body_outputs_.find(node_name);
+    if (function_body_outputs != function_body_outputs_.end()) {
+      const tensorflow::NameRangeMap& outputs = function_body_outputs->second;
+      auto output = outputs.find(node_output);
+      if (output != outputs.end()) {
+        const auto& output_range = output->second;
+
+        if (position == -1) {
+          // If position is not defined expand node output range
+          for (int i = output_range.first; i < output_range.second; ++i) {
+            i == 0 ? graph_def_inputs->push_back(node_name)
+                   : graph_def_inputs->push_back(
+                         strings::StrCat(node_name, ":", i));
+          }
+        } else {
+          if (position > (output_range.second - output_range.first)) {
+            return errors::InvalidArgument(
+                "Invalid node ", node_name, " output ", node_output,
+                " position: ", position, " (out of range)");
+          }
+          int pos = output_range.first + position;
+          pos == 0 ? graph_def_inputs->push_back(node_name)
+                   : graph_def_inputs->push_back(
+                         strings::StrCat(node_name, ":", pos));
+        }
+
+        return Status::OK();
       }
     }
-    port_map[inp.name()] = inp.name();
   }
 
-  // Add the function body to the graph.
-  FunctionLibraryDefinition func_def(OpRegistry::Global(), library);
+  return errors::InvalidArgument("Failed to expand a function def input: ",
+                                 func_def_input);
+}
+
+Status GrapplerFunctionConnectivity::ExpandNodeInputs(
+    NodeDef* function_body_node) const {
+  std::vector<string> expanded_inputs;
+
+  for (const string& function_def_input : function_body_node->input()) {
+    if (!IsControlInput(function_def_input))
+      TF_RETURN_IF_ERROR(
+          ExpandFunctionDefInput(function_def_input, &expanded_inputs));
+    else
+      expanded_inputs.push_back(function_def_input);
+  }
+
+  function_body_node->clear_input();
+  for (const string& expanded_input : expanded_inputs)
+    function_body_node->add_input(expanded_input);
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemBuilder::GetTypeAttr(const string& type_attr_name,
+                                                DataType* data_type) const {
+  auto it = func_attr_->find(type_attr_name);
+  if (it == func_attr_->end()) {
+    return errors::InvalidArgument("Type attribute ", type_attr_name,
+                                   " is not defined");
+  } else if (it->second.type() == DT_INVALID) {
+    return errors::InvalidArgument("Type attribute ", type_attr_name,
+                                   " is not defined with a valid type");
+  } else {
+    *data_type = it->second.type();
+  }
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemBuilder::GetArgType(const OpDef::ArgDef& arg,
+                                               DataType* data_type) const {
+  if (arg.type() != DT_INVALID) {
+    *data_type = arg.type();
+  } else {
+    TF_RETURN_IF_ERROR(GetTypeAttr(arg.type_attr(), data_type));
+  }
+  return Status::OK();
+}
+
+GrapplerFunctionItem::GrapplerFunctionItem(
+    const string& function_name,
+    const std::vector<InputArgExpansion>& input_arg_expansions,
+    const std::vector<OutputArgExpansion>& output_arg_expansions,
+    GraphDef&& function_body)
+    : function_name_(function_name),
+      input_arg_expansions_(input_arg_expansions),
+      output_arg_expansions_(output_arg_expansions) {
+  graph.Swap(&function_body);
+}
+
+const string& GrapplerFunctionItem::function_name() const {
+  return function_name_;
+}
+
+const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
+  return input_arg_expansions_;
+}
+
+const InputArgExpansion& GrapplerFunctionItem::input(int i) const {
+  return input_arg_expansions_[i];
+}
+
+const std::size_t GrapplerFunctionItem::input_size() const {
+  return input_arg_expansions_.size();
+}
+
+const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
+  return output_arg_expansions_;
+}
+
+const OutputArgExpansion& GrapplerFunctionItem::output(int i) const {
+  return output_arg_expansions_[i];
+}
+
+const std::size_t GrapplerFunctionItem::output_size() const {
+  return output_arg_expansions_.size();
+}
+
+const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
+
+GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; }
+
+std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
+  std::vector<string> output_tensors;
+  for (const OutputArgExpansion& output : item.outputs()) {
+    for (const string& tensor : output.output_tensors) {
+      output_tensors.push_back(tensor);
+    }
+  }
+  return output_tensors;
+}
+
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const FunctionLibraryDefinition& func_library, GrapplerFunctionItem* item) {
+  const OpDef& signature = func.signature();
+
+  if (signature.name().empty()) {
+    return errors::InvalidArgument("Function name must be specified");
+  }
+
+  // Helper methods to lookup function attributes
+  GrapplerFunctionItemBuilder builder(&func_attr);
+
+  // Mapping from FunctionDef input format (name[:output][:position]) to
+  // GraphDef input format (name[:position])
+  GrapplerFunctionConnectivity connectivity;
+
+  std::vector<InputArgExpansion> inputs;
+  std::vector<OutputArgExpansion> outputs;
+  GraphDef function_body;
+
+  // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
+
+  // Make sure that there is no tensor sequences in outputs
+  for (const OpDef::ArgDef& output : signature.output_arg()) {
+    if (!output.type_list_attr().empty() || !output.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Outputs with sequence of tensors are not supported. Unsupported "
+          "output: ",
+          output.name());
+    }
+  }
+
+  // For each input argument create a placeholder in function body.
+  for (const OpDef::ArgDef& input : signature.input_arg()) {
+    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Inputs with sequence of tensors are not supported. Unsupported "
+          "input: ",
+          input.name());
+    }
+
+    DataType input_data_type;
+    TF_RETURN_IF_ERROR(builder.GetArgType(input, &input_data_type));
+
+    NodeDef* placeholder = function_body.add_node();
+    placeholder->set_name(input.name());
+    placeholder->set_op("Placeholder");
+    (*placeholder->mutable_attr())["T"].set_type(input_data_type);
+
+    InputArgExpansion input_expansion{/*input_name=*/input.name(),
+                                      /*placeholders=*/{input.name()}};
+    connectivity.RegisterInputArgExpansion(input_expansion);
+    inputs.push_back(input_expansion);
+  }
+
+  // Add all function nodes to the function body
+  for (const NodeDef& func_def_node : func.node_def()) {
+    NodeDef* new_node = function_body.add_node();
+    *new_node = func_def_node;
 
-  for (const NodeDef& node : func.node_def()) {
-    NodeDef* new_node = new_item->graph.add_node();
-    *new_node = node;
-    // Replace the placeholder attribute values with the specified value.
+    // Replace the placeholder attribute values with the specified value
     for (auto& attr : *new_node->mutable_attr()) {
       const string& ph_name = attr.second.placeholder();
       auto it = func_attr.find(ph_name);
@@ -78,75 +313,39 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
 
     // Functions use a custom format to encode connectivity. Map these custom
     // strings to regular ones.
+    tensorflow::NameRangeMap outputs_range_map;
     const OpRegistrationData* registration;
-    Status status = func_def.LookUp(node.op(), &registration);
-    if (!status.ok()) {
-      LOG(ERROR) << "Op " << node.op() << " not registered: " << status;
-      return nullptr;
-    }
-
-    tensorflow::NameRangeMap inputs;
-    tensorflow::NameRangeMap outputs;
-    status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs,
-                                           &outputs);
-    if (!status.ok()) {
-      LOG(ERROR) << "Op " << node.op() << " invalid: " << status;
-      return nullptr;
-    }
-    for (const auto& name_range : outputs) {
-      string port_prefix =
-          strings::StrCat(node.name(), ":", name_range.first, ":");
-      int index_start = name_range.second.first;
-      int index_end = name_range.second.second;
-      for (int i = index_start; i < index_end; ++i) {
-        string port_id = strings::StrCat(port_prefix, i - index_start);
-        string port_name = strings::StrCat(node.name(), ":", i);
-        port_map[port_id] = port_name;
-      }
-    }
+    TF_RETURN_IF_ERROR(func_library.LookUp(func_def_node.op(), &registration));
+    TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(
+        func_def_node, registration->op_def, nullptr, &outputs_range_map));
+    connectivity.RegisterFunctionBodyOutputs(func_def_node.name(),
+                                             outputs_range_map);
   }
 
-  for (auto& node : *new_item->graph.mutable_node()) {
-    // Rewrite the inputs to use the normal naming convention.
-    for (int i = 0; i < node.input_size(); ++i) {
-      const string& input = node.input(i);
-      if (IsControlInput(input)) {
-        // No need to remap control dependencies.
-        continue;
-      } else {
-        auto it = port_map.find(input);
-        if (it == port_map.end()) {
-          LOG(ERROR) << "Unknown input: " << input;
-          return nullptr;
-        }
-        node.set_input(i, it->second);
-      }
-    }
+  // Rewrite inputs to use GraphDef format
+  for (NodeDef& node : *function_body.mutable_node()) {
+    TF_RETURN_IF_ERROR(connectivity.ExpandNodeInputs(&node));
   }
 
-  // Add the function outputs to the list of fetch nodes, taking into account
-  // the output mapping if any.
-  for (const auto& out : func.signature().output_arg()) {
-    auto it = func.ret().find(out.name());
-    if (it != func.ret().end()) {
-      auto it2 = port_map.find(it->second);
-      if (it2 == port_map.end()) {
-        LOG(ERROR) << "Unknown output mapping: " << it->first << " to "
-                   << it->second;
-        return nullptr;
-      } else {
-        new_item->fetch.emplace_back(it2->second);
-      }
+  // Add function outputs
+  for (const OpDef::ArgDef& out : signature.output_arg()) {
+    std::vector<string> output_tensors;
+    auto ret = func.ret().find(out.name());
+    if (ret != func.ret().end()) {
+      // Expand outputs using provided output mapping
+      TF_RETURN_IF_ERROR(
+          connectivity.ExpandFunctionDefInput(ret->second, &output_tensors));
     } else {
-      new_item->fetch.emplace_back(out.name());
+      // Otherwise output must be one of the function inputs
+      TF_RETURN_IF_ERROR(
+          connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
     }
-  }
-  // Add the function inputs to the list of feeds.
-  for (const auto& inp : func.signature().input_arg()) {
-    new_item->feed.emplace_back(inp.name(), Tensor());
+    outputs.push_back({out.name(), output_tensors});
   }
 
-  return new_item;
+  *item = GrapplerFunctionItem(signature.name(), inputs, outputs,
+                               std::move(function_body));
+  return Status::OK();
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 8f9b7d848a..60ea8857c0 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -19,19 +19,125 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 
 namespace tensorflow {
-
 namespace grappler {
 
-// Factory method for creating a GrapplerItem from a FunctionDef.
-// Returns nullptr if the given function def cannot be converted.
-std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
+// Depending on the function instantiation attributes, input argument to the
+// function might be a single tensor, list of tensors of the same type, or a
+// list of tensors of different types.
+//
+// InputArgExpansion keeps track of the placeholders that were added to the
+// function body in place of function inputs.
+struct InputArgExpansion {
+  string input_name;                 // name of the function input argument
+  std::vector<string> placeholders;  // names of placeholder nodes in the
+                                     // function body
+};
+
+// Depending on the function instantiation attributes, output argument is mapped
+// to one or more outputs of one of the function body nodes.
+//
+// OutputArgExpansion keeps mapping from a function output arg to the output
+// tensors of a function body nodes, that compute function outputs.
+struct OutputArgExpansion {
+  string output_name;                  // name of the function output argument
+  std::vector<string> output_tensors;  // names of output tensors from the
+                                       // function body graph nodes
+};
+
+// FunctionDef uses different connectivity encoding for the function body nodes,
+// then a GraphDef (see function.proto for details). Input name in FunctionDef
+// can potentially represent a sequence of tensors (instead just one tensor in
+// GraphDef), we need to expand it when converting from FunctionDef to GraphDef,
+// and fold it back when doing backward conversion.
+class GrapplerFunctionConnectivity {
+ public:
+  void RegisterInputArgExpansion(const InputArgExpansion& input_arg_expansion);
+  void RegisterFunctionBodyOutputs(const string& node_name,
+                                   const tensorflow::NameRangeMap& outputs);
+
+  // Expand input encoded in FunctionDef format (name[:output][:position]) into
+  // multiple inputs in GraphDef format (name[:position]).
+  Status ExpandFunctionDefInput(const string& func_def_input,
+                                std::vector<string>* graph_def_inputs) const;
+
+  // Update Node inputs from FunctionDef to GraphDef format
+  Status ExpandNodeInputs(NodeDef* function_body_node) const;
+
+  // TODO(ezhulenev): fold GraphDef inputs back to FunctionDef format
+  // Status FoldGraphDefInputs(const std::vector<sting> graph_def_inputs,
+  //                          std::vector<string>* function_def_inputs) const;
+
+ private:
+  std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+};
+
+// Helper methods to build GrapplerFunctionItem from a function def and function
+// attributes.
+class GrapplerFunctionItemBuilder {
+ public:
+  using FunctionAttr = std::unordered_map<string, AttrValue>;
+
+  explicit GrapplerFunctionItemBuilder(const FunctionAttr* func_attr)
+      : func_attr_(func_attr) {}
+
+  // Get DataType from attributes by name. Return error if attribute is missing,
+  // or it doesn't define a valid data type.
+  Status GetTypeAttr(const string& type_attr_name, DataType* data_type) const;
+
+  // Get argument data type. If data type is not explicitly defined, uses
+  // provided attribute name to look it up in function attributes.
+  Status GetArgType(const OpDef::ArgDef& arg, DataType* data_type) const;
+
+ private:
+  const FunctionAttr* func_attr_;  // do not own
+};
+
+// A special case of GrapplerItem, constructed from a TensorFlow Function.
+class GrapplerFunctionItem : public GrapplerItem {
+ public:
+  GrapplerFunctionItem() {}
+  GrapplerFunctionItem(
+      const string& function_name,
+      const std::vector<InputArgExpansion>& input_arg_expansions,
+      const std::vector<OutputArgExpansion>& output_arg_expansions,
+      GraphDef&& function_body);
+
+  const string& function_name() const;
+
+  const std::vector<InputArgExpansion>& inputs() const;
+  const InputArgExpansion& input(int i) const;
+  const std::size_t input_size() const;
+
+  const std::vector<OutputArgExpansion>& outputs() const;
+  const OutputArgExpansion& output(int i) const;
+  const std::size_t output_size() const;
+
+  const GraphDef& function_body() const;
+  GraphDef& mutable_function_body();
+
+ private:
+  string function_name_;
+  std::vector<InputArgExpansion> input_arg_expansions_;
+  std::vector<OutputArgExpansion> output_arg_expansions_;
+};
+
+// Return all output tensors referenced by item output args.
+std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
+
+// Make a GrapplerFunctionItem from the function definition and attributes.
+// Return error if the given function def cannot be converted.
+Status MakeGrapplerFunctionItem(
     const FunctionDef& func,
     const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionDefLibrary& library);
+    const FunctionLibraryDefinition& func_library, GrapplerFunctionItem* item);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 6a7d766b1c..1eb3298e89 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
@@ -28,6 +29,88 @@ namespace {
 
 class FunctionsTest : public ::testing::Test {};
 
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion({"inputA", {"inputA"}});
+  connectivity.RegisterInputArgExpansion({"inputB", {"inputB_0", "inputB_1"}});
+
+  connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
+  connectivity.RegisterFunctionBodyOutputs("Func",
+                                           {{"o1", {0, 2}}, {"o2", {2, 4}}});
+
+  std::vector<string> inputs;
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputA", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("inputA", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputB", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("inputB_0", inputs[0]);
+  EXPECT_EQ("inputB_1", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputB:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("inputB_1", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Add:z", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Add", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("Func", inputs[0]);
+  EXPECT_EQ("Func:1", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("Func:2", inputs[0]);
+  EXPECT_EQ("Func:3", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1:0", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:1", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2:0", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:2", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:3", inputs[0]);
+}
+
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandNodeInputs) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion({"inputA", {"inputA"}});
+  connectivity.RegisterInputArgExpansion({"inputB", {"inputB_0", "inputB_1"}});
+
+  NodeDef node;
+  node.add_input("inputA:0");
+  node.add_input("inputB");
+
+  TF_EXPECT_OK(connectivity.ExpandNodeInputs(&node));
+
+  EXPECT_EQ(3, node.input_size());
+  EXPECT_EQ("inputA", node.input(0));
+  EXPECT_EQ("inputB_0", node.input(1));
+  EXPECT_EQ("inputB_1", node.input(2));
+}
+
 TEST_F(FunctionsTest, FromSimpleFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -48,37 +131,45 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
-  CHECK(item);
-  EXPECT_EQ("XTimesTwo", item->id);
-  EXPECT_EQ(4, item->graph.node_size());
-  EXPECT_EQ(std::vector<string>({"y:0"}), item->fetch);
-  EXPECT_EQ(1, item->feed.size());
-  EXPECT_EQ("x", item->feed[0].first);
-
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "x") {
+  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+
+  EXPECT_EQ("XTimesTwo", item.function_name());
+  EXPECT_EQ(4, item.function_body().node_size());
+
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ("x", item.input(0).input_name);
+  EXPECT_EQ(std::vector<string>{"x"}, item.input(0).placeholders);
+
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("y", item.output(0).output_name);
+  EXPECT_EQ("y", item.output(0).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "x" && count++) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "two") {
+    } else if (node.name() == "two" && count++) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "scale") {
+    } else if (node.name() == "scale" && count++) {
       EXPECT_EQ("Cast", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("two:0", node.input(0));
-    } else if (node.name() == "y") {
+      EXPECT_EQ("two", node.input(0));
+    } else if (node.name() == "y" && count++) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("scale:0", node.input(1));
+      EXPECT_EQ("scale", node.input(1));
     }
   }
+  EXPECT_EQ(4, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
@@ -115,45 +206,53 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
-  CHECK(item);
-  EXPECT_EQ("SubGrad", item->id);
-  EXPECT_EQ(12, item->graph.node_size());
-  EXPECT_EQ(std::vector<string>({"dx:0", "dy:0"}), item->fetch);
-  EXPECT_EQ(3, item->feed.size());
-  EXPECT_EQ("x", item->feed[0].first);
-  EXPECT_EQ("y", item->feed[1].first);
-  EXPECT_EQ("dz", item->feed[2].first);
-
-  for (const NodeDef &node : item->graph.node()) {
+  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+
+  EXPECT_EQ("SubGrad", item.function_name());
+  EXPECT_EQ(12, item.function_body().node_size());
+
+  ASSERT_EQ(3, item.input_size());
+  EXPECT_EQ("x", item.input(0).input_name);
+  EXPECT_EQ("y", item.input(1).input_name);
+  EXPECT_EQ("dz", item.input(2).input_name);
+
+  ASSERT_EQ(2, item.output_size());
+  EXPECT_EQ("dx", item.output(0).output_tensors[0]);
+  EXPECT_EQ("dy", item.output(1).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
     if (node.name() == "x" || node.name() == "y" || node.name() == "dz") {
+      count++;
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "rx") {
+    } else if (node.name() == "rx" && count++) {
       EXPECT_EQ("BroadcastGradientArgs", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("sx:0", node.input(0));
-      EXPECT_EQ("sy:0", node.input(1));
-    } else if (node.name() == "sum_gx") {
+      EXPECT_EQ("sx", node.input(0));
+      EXPECT_EQ("sy", node.input(1));
+    } else if (node.name() == "sum_gx" && count++) {
       EXPECT_EQ("Sum", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("gx:0", node.input(0));
-      EXPECT_EQ("rx:0", node.input(1));
-    } else if (node.name() == "sum_gy") {
+      EXPECT_EQ("gx", node.input(0));
+      EXPECT_EQ("rx", node.input(1));
+    } else if (node.name() == "sum_gy" && count++) {
       EXPECT_EQ("Sum", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("gy:0", node.input(0));
+      EXPECT_EQ("gy", node.input(0));
       EXPECT_EQ("rx:1", node.input(1));
     }
   }
+  EXPECT_EQ(6, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
-  FunctionDefLibrary library;
-  *library.add_function() = FunctionDefHelper::Define(
+  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+  TF_ASSERT_OK(library.AddFunctionDef(FunctionDefHelper::Define(
       // Name
       "Swap",
       // Args
@@ -164,7 +263,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
       {"T: {float, double}"},
       // Nodes
       {{{"o0"}, "Identity", {"i1"}, {{"T", "$T"}}},
-       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
+       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}})));
 
   FunctionDef func = FunctionDefHelper::Create(
       // Name
@@ -189,43 +288,47 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
 
-  for (const NodeDef &node : item->graph.node()) {
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
     if (node.name() == "x" || node.name() == "y") {
+      count++;
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "a0") {
+    } else if (node.name() == "a0" && count++) {
       EXPECT_EQ("Swap", node.op());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("y", node.input(1));
       EXPECT_EQ("^x2", node.input(2));
-    } else if (node.name() == "a1") {
+    } else if (node.name() == "a1" && count++) {
       EXPECT_EQ("Swap", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("a0:0", node.input(0));
+      EXPECT_EQ("a0", node.input(0));
       EXPECT_EQ("a0:1", node.input(1));
-    } else if (node.name() == "x2") {
+    } else if (node.name() == "x2" && count++) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("x", node.input(1));
-    } else if (node.name() == "y2") {
+    } else if (node.name() == "y2" && count++) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("y", node.input(0));
       EXPECT_EQ("y", node.input(1));
       EXPECT_EQ("^a1", node.input(2));
-    } else if (node.name() == "o") {
+    } else if (node.name() == "o" && count++) {
       EXPECT_EQ("Add", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x2:0", node.input(0));
-      EXPECT_EQ("y2:0", node.input(1));
+      EXPECT_EQ("x2", node.input(0));
+      EXPECT_EQ("y2", node.input(1));
     }
   }
+  EXPECT_EQ(7, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
@@ -245,28 +348,31 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       {{"out", "Exp:y:0"}});
 
   std::unordered_map<string, AttrValue> func_attr;
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
 
-  EXPECT_EQ(1, item->fetch.size());
-  EXPECT_EQ("Exp:0", item->fetch[0]);
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
 
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "in") {
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "in" && count++) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "Linear_func") {
+    } else if (node.name() == "Linear_func" && count++) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("in", node.input(0));
-    } else if (node.name() == "Exp") {
+    } else if (node.name() == "Exp" && count++) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("Linear_func:0", node.input(0));
+      EXPECT_EQ("Linear_func", node.input(0));
     }
   }
+  EXPECT_EQ(3, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
@@ -285,20 +391,25 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
       {{"out0", "in0"}});
 
   std::unordered_map<string, AttrValue> func_attr;
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
 
-  EXPECT_EQ(3, item->fetch.size());
-  EXPECT_EQ("in0", item->fetch[0]);
-  EXPECT_EQ("arg2", item->fetch[1]);
-  EXPECT_EQ("arg3", item->fetch[2]);
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
 
-  EXPECT_EQ(5, item->graph.node_size());
-  for (const NodeDef &node : item->graph.node()) {
+  EXPECT_EQ("ForwardInputs", item.function_name());
+  EXPECT_EQ(5, item.function_body().node_size());
+
+  EXPECT_EQ(3, item.output_size());
+  EXPECT_EQ("in0", item.output(0).output_tensors[0]);
+  EXPECT_EQ("arg2", item.output(1).output_tensors[0]);
+  EXPECT_EQ("arg3", item.output(2).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
     EXPECT_TRUE(node.name() == "in0" || node.name() == "in1" ||
                 node.name() == "arg2" || node.name() == "arg3" ||
                 node.name() == "arg4");
+    count++;
     EXPECT_EQ("Placeholder", node.op());
     if (node.name() == "arg3") {
       EXPECT_EQ(DT_INT32, node.attr().at("T").type());
@@ -306,6 +417,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
     }
   }
+  EXPECT_EQ(5, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
@@ -325,24 +437,23 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
 
-  EXPECT_EQ(0, item->feed.size());
-  EXPECT_EQ(1, item->fetch.size());
-  EXPECT_EQ("o:0", item->fetch[0]);
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
 
-  EXPECT_EQ(2, item->graph.node_size());
-  const NodeDef &two = item->graph.node(0);
+  EXPECT_EQ(0, item.input_size());
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("o", item.output(0).output_tensors[0]);
+
+  EXPECT_EQ(2, item.function_body().node_size());
+  const NodeDef &two = item.function_body().node(0);
   EXPECT_EQ("two", two.name());
   EXPECT_EQ(0, two.input_size());
-  const NodeDef &cast = item->graph.node(1);
+  const NodeDef &cast = item.function_body().node(1);
   EXPECT_EQ("o", cast.name());
   EXPECT_EQ(1, cast.input_size());
-  EXPECT_EQ("two:0", cast.input(0));
-
-  std::cout << item->graph.DebugString() << std::endl;
+  EXPECT_EQ("two", cast.input(0));
 }
 
 }  // namespace
-- 
GitLab


From 08a12ca6016c34d9476d2e93bd0f2dc9ae60abc5 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 11 Apr 2018 09:50:40 -0700
Subject: [PATCH 0590/1262] Add a clear error message for when a doc does not
 have a title.

PiperOrigin-RevId: 192463583
---
 tensorflow/tools/docs/generate_lib.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 34dd419f15..9cc261d7dd 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -308,6 +308,10 @@ def build_doc_index(src_dir):
         continue
       title_parser = _GetMarkdownTitle()
       title_parser.process(os.path.join(dirpath, base_name))
+      if title_parser.title is None:
+        msg = ('`{}` has no markdown title (# title)'.format(
+            os.path.join(dirpath, base_name)))
+        raise ValueError(msg)
       key_parts = os.path.join(suffix, base_name[:-3]).split('/')
       if key_parts[-1] == 'index':
         key_parts = key_parts[:-1]
-- 
GitLab


From 8f753859dd50a4c8d25b99a7b57c61e0e5c20578 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 11 Apr 2018 09:53:21 -0700
Subject: [PATCH 0591/1262] Add gradient in cond test to match CallGradInLoop.

PiperOrigin-RevId: 192463997
---
 .../kernel_tests/control_flow_ops_py_test.py    | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 75f8644f69..e27eb00818 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -664,6 +664,23 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(42.0, grad.eval(feed_dict={c: 1}))
       self.assertAllEqual(3.0, grad.eval(feed_dict={c: 3}))
 
+  def testCondGrad_3(self):
+    with self.test_session():
+      c = array_ops.placeholder(dtypes.int32, shape=[])
+      ox = constant_op.constant(10.0)
+      pred = math_ops.less(c, 2)
+
+      def fn1(x):
+        m = x * x
+        return gradients_impl.gradients(m, [ox])[0]
+
+      fn2 = lambda: math_ops.multiply(ox, 3.0)
+      y = math_ops.multiply(7.0, ox)
+      r = control_flow_ops.cond(pred, lambda: fn1(y), fn2)
+
+      self.assertAllEqual(980.0, r.eval(feed_dict={c: 1}))
+      self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
+
   def testNestedCond_Simple(self):
     with self.test_session():
       x = constant_op.constant(0., name="X")
-- 
GitLab


From ae9542a8582d2e95229265d324f1b83a6e1d4a37 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <b@lamberta.org>
Date: Wed, 11 Apr 2018 10:57:30 -0700
Subject: [PATCH 0592/1262] Docs: Clarify using_tpu.md

---
 tensorflow/docs_src/programmers_guide/using_tpu.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index cb0d86fc4c..5e3e49d434 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -280,8 +280,8 @@ Where `params['batch-size']` will contain the batch size.
 ### Static shapes and batch size
 
 The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
-free strict static shape requirements imposed by the XLA/TPU environment. The
-one requirement is that the batches of data fed from your input pipeline to
+free from the strict static shape requirements imposed by the XLA/TPU environment.
+The one requirement is that the batches of data fed from your input pipeline to
 the TPU have a static shape, as determined by the standard TensorFlow shape
 inference algorithm. Intermediate tensors are free to have a dynamic shapes.
 If shape inference has failed, but the shape is known it is possible to
-- 
GitLab


From 5757d091a5c915b5ca99da7bc44feebdb374c569 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 11:02:48 -0700
Subject: [PATCH 0593/1262] Use tf.train.get_or_create_global_step() instead of
 deprecated variables.get_or_create_global_step().

PiperOrigin-RevId: 192476077
---
 tensorflow/contrib/training/python/training/evaluation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 4bb53e8678..f7fd66d33f 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -138,7 +138,6 @@ from __future__ import print_function
 
 import time
 
-from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
@@ -298,7 +297,7 @@ class SummaryAtEndHook(session_run_hook.SessionRunHook):
   def begin(self):
     if self._replace_summary_op:
       self._summary_op = summary.merge_all()
-    self._global_step = variables.get_or_create_global_step()
+    self._global_step = training_util.get_or_create_global_step()
 
   def after_create_session(self, session, coord):
     if self._summary_writer is None and self._log_dir:
-- 
GitLab


From 48b2bdc72541139bff7bf9a044eafee8234fe41f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 11:21:48 -0700
Subject: [PATCH 0594/1262] Fix uninitialized value.

PiperOrigin-RevId: 192479630
---
 tensorflow/compiler/xla/service/hlo_instruction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a6cb19f331..9a9de07883 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1446,7 +1446,7 @@ class HloInstruction {
   string channel_name_;
 
   // Estimate of the duration of a host computation in nanoseconds.
-  int64 cost_estimate_ns_;
+  int64 cost_estimate_ns_ = 0;
 
   // Computations called by this instruction.
   std::vector<HloComputation*> called_computations_;
-- 
GitLab


From 2ea5c1e867f029c3cda9ac099542858cd737d8e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 11:26:25 -0700
Subject: [PATCH 0595/1262] Disable prelu tests for real now.

PiperOrigin-RevId: 192480452
---
 tensorflow/contrib/lite/testing/generated_examples_zip_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 7426ab56af..84ae1d58fe 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -267,7 +267,7 @@ INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
 INSTANTIATE_TESTS(relu1)
-INSTANTIATE_TESTS(prelu)
+// INSTANTIATE_TESTS(prelu)
 INSTANTIATE_TESTS(relu6)
 INSTANTIATE_TESTS(reshape)
 INSTANTIATE_TESTS(resize_bilinear)
-- 
GitLab


From 8b17a17ed5d92fb52922c1c4726180db0c220f8e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 11:33:38 -0700
Subject: [PATCH 0596/1262] Script to create custom_ops inside a TensorFlow
 graphdef.

PiperOrigin-RevId: 192481690
---
 tensorflow/contrib/lite/python/BUILD          |  13 ++
 .../contrib/lite/python/create_custom_op.py   | 111 ++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 tensorflow/contrib/lite/python/create_custom_op.py

diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 6fafaf0727..926896d609 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -97,6 +97,19 @@ py_binary(
     ],
 )
 
+py_binary(
+    name = "create_custom_op",
+    srcs = ["create_custom_op.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "@absl_py//absl/flags",
+    ],
+)
+
 py_test(
     name = "convert_saved_model_test",
     srcs = ["convert_saved_model_test.py"],
diff --git a/tensorflow/contrib/lite/python/create_custom_op.py b/tensorflow/contrib/lite/python/create_custom_op.py
new file mode 100644
index 0000000000..830f95358c
--- /dev/null
+++ b/tensorflow/contrib/lite/python/create_custom_op.py
@@ -0,0 +1,111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Replaces a subgraph of a TensorFlow GraphDef with a single node.
+
+In conjunction with TOCO's --allow_custom_op this script allows selected
+portions of a TensorFlow GraphDef to be executed by custom code.
+
+Example:
+
+bazel run tensorflow/contrib/lite/python:create_custom_op  -- \
+  --input_graph=/tmp/input.pb \
+  --output_graph=/tmp/output.pb \
+  --inputs=concat,concat_1 \
+  --outputs=detection_classes \
+  --op_definition='op:"PostProcessing" attr{key:"num" value:{i:10}}'
+
+The above will identify a subgraph starting at nodes 'concat' and 'concat_1',
+and ending at 'detection_classes'. All nodes in between will be removed and
+replaced by a new op called 'PostProcessing'.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import uuid as _uuid
+from absl import app
+from absl import flags
+from google.protobuf import text_format
+from tensorflow.contrib.framework.python.framework.graph_util import fuse_op
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.platform import gfile
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_graph", "", "Binary graphdef to load.")
+flags.DEFINE_string("output_graph", "", "Resulting binary graphdef.")
+
+flags.DEFINE_string("inputs", "",
+                    "Comma-separated list of inputs to the subgraph.")
+flags.DEFINE_string("outputs", "",
+                    "Comma-separated list of outputs of the subgraph.")
+flags.DEFINE_string("op_definition", "",
+                    "A text NodeDef defining the contents of the custom op.")
+
+
+def _read_graph_def(filename):
+  if not gfile.Exists(filename):
+    raise ValueError("Input graph file '" + filename + "' does not exist!")
+
+  graph_def = graph_pb2.GraphDef()
+  with gfile.FastGFile(filename, "rb") as f:
+    graph_def.ParseFromString(f.read())
+  return graph_def
+
+
+def _write_graph_def(graph_def, filename):
+  if not filename:
+    raise ValueError("Output graph file not specified")
+
+  with gfile.Open(filename, "wb") as f:
+    f.write(graph_def.SerializeToString())
+
+
+def _collapse_subgraph(graph_def, inputs, outputs, op_definition):
+  """Substitute a custom op for the subgraph delimited by inputs and outputs."""
+  name = _uuid.uuid1().hex
+  # We need a default type, but it can be changed using 'op_definition'.
+  default_type = types_pb2.DT_FLOAT
+  new_graph = fuse_op(
+      graph_def=graph_def,
+      input_nodes=inputs,
+      output_nodes=outputs,
+      output_dtypes=[default_type for _ in outputs],
+      output_quantized=False,
+      op_name=name,
+      op_type="CustomTfLiteOp")
+  node_def = node_def_pb2.NodeDef()
+  text_format.Parse(op_definition, node_def)
+  for node in new_graph.node:
+    if node.name == name:
+      node.MergeFrom(node_def)
+  return new_graph
+
+
+def main(argv):
+  del argv  # unused
+  graph = _read_graph_def(filename=flags.FLAGS.input_graph)
+  graph = _collapse_subgraph(
+      graph_def=graph,
+      inputs=flags.FLAGS.inputs.split(","),
+      outputs=flags.FLAGS.outputs.split(","),
+      op_definition=flags.FLAGS.op_definition)
+  _write_graph_def(graph_def=graph, filename=flags.FLAGS.output_graph)
+
+
+if __name__ == "__main__":
+  app.run(main)
-- 
GitLab


From abc26c182ce2e1f010c53ca4f384759587740578 Mon Sep 17 00:00:00 2001
From: Adria Puigdomenech <adriap@google.com>
Date: Wed, 11 Apr 2018 11:36:56 -0700
Subject: [PATCH 0597/1262] Update docs for softmax_cross_entropy_with_logits.

PiperOrigin-RevId: 192482242
---
 tensorflow/python/ops/nn_ops.py | 38 ++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 07ca32953f..bb454b3c3a 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1803,8 +1803,11 @@ def softmax_cross_entropy_with_logits_v2(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape, e.g.
-  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `dim` argument specifying the class dimension.
+
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
@@ -1816,14 +1819,17 @@ def softmax_cross_entropy_with_logits_v2(
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: Each row `labels[i]` must be a valid probability distribution.
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1926,9 +1932,9 @@ def softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape, e.g.
-  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
-  or `float64`).
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `dim` argument specifying the class dimension.
 
   Backpropagation will happen only into `logits`.  To calculate a cross entropy
   loss that allows backpropagation into both `logits` and `labels`, see
@@ -1939,14 +1945,17 @@ def softmax_cross_entropy_with_logits(
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: Each row `labels[i]` must be a valid probability distribution.
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1983,8 +1992,11 @@ def sparse_softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  A common use case is to have logits of shape `[batch_size, num_classes]` and
-  labels of shape `[batch_size]`. But higher dimensions are supported.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
+  or `float64`).
 
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
-- 
GitLab


From 5eccb5afe6f8ecda6a0aa9ecdd2d4a6636996509 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 11 Apr 2018 11:52:52 -0700
Subject: [PATCH 0598/1262] Increase size of
 tensorflow/contrib/data/python/kernel_tests:batch_dataset_op_test to
 "medium".

PiperOrigin-RevId: 192484895
---
 tensorflow/contrib/data/python/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c8699e0d5a..5d6dbdcbdf 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -8,7 +8,7 @@ load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-- 
GitLab


From 5a2129e863d7983a34a86865c6fb3f1d382ef4a5 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <b@lamberta.org>
Date: Wed, 11 Apr 2018 12:05:39 -0700
Subject: [PATCH 0599/1262] Tidy up doc for rebuild project

---
 tensorflow/docs_src/mobile/android_build.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 0cd0a98be4..c35530061d 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,8 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
 
-4. Running "Build -> Rebuild Project" from Android Studio menu and click the 
-    Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
+    top menu. You may need to rebuild the project using *Build > Rebuild Project*.
 
     If it asks you to use Instant Run, click **Proceed Without Instant Run**.
 
-- 
GitLab


From 44fc1feaa989ea4e1fbfe49dc9ca4db3ce661659 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 12:27:55 -0700
Subject: [PATCH 0600/1262] Relaxing float comparison and removing unneeded
 include

---
 tensorflow/contrib/layers/python/layers/rev_block_lib_test.py | 4 ++--
 tensorflow/stream_executor/cuda/cudnn_version_test.cc         | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 392a490be1..8c118402a4 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -60,8 +60,8 @@ class RevBlockTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
 
-      self.assertAllClose(x1, x1_inv)
-      self.assertAllClose(x2, x2_inv)
+      self.assertAllClose(x1, x1_inv, atol=1e-5)
+      self.assertAllClose(x2, x2_inv, atol=1e-5)
 
   def testBackwardForward(self):
 
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 230adafeb1..42b3dc8cc6 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
-#include "testing/base/public/gunit.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace perftools {
-- 
GitLab


From cc1525125c497772f25ee4851c7b832048cd5bd8 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Wed, 11 Apr 2018 12:32:08 -0700
Subject: [PATCH 0601/1262] Internal TF Lite test changes

PiperOrigin-RevId: 192491201
---
 tensorflow/contrib/lite/kernels/BUILD | 205 ++++++--------------------
 1 file changed, 41 insertions(+), 164 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index f07eca0ba9..914893cd90 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -12,10 +12,7 @@ tf_cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -108,10 +105,7 @@ tf_cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":kernel_util",
         "//tensorflow/contrib/lite/testing:util",
@@ -243,10 +237,7 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -259,10 +250,7 @@ tf_cc_test(
     name = "add_test",
     size = "small",
     srcs = ["add_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -291,10 +279,7 @@ tf_cc_test(
     name = "div_test",
     size = "small",
     srcs = ["div_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -307,10 +292,7 @@ tf_cc_test(
     name = "sub_test",
     size = "small",
     srcs = ["sub_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -323,10 +305,7 @@ tf_cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -341,10 +320,7 @@ tf_cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -357,10 +333,7 @@ tf_cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -385,10 +358,7 @@ tf_cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -401,10 +371,7 @@ tf_cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -418,10 +385,7 @@ tf_cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -447,10 +411,7 @@ tf_cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -463,10 +424,7 @@ tf_cc_test(
     name = "bidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -479,10 +437,7 @@ tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -510,10 +465,7 @@ tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -526,10 +478,7 @@ tf_cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -542,10 +491,7 @@ tf_cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -570,10 +516,7 @@ tf_cc_test(
     name = "mean_test",
     size = "small",
     srcs = ["mean_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -586,10 +529,7 @@ tf_cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -602,10 +542,7 @@ tf_cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -618,10 +555,7 @@ tf_cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -634,10 +568,7 @@ tf_cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -651,10 +582,7 @@ tf_cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -668,10 +596,7 @@ tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -684,10 +609,7 @@ tf_cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -700,10 +622,7 @@ tf_cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -716,10 +635,7 @@ tf_cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -732,10 +648,7 @@ tf_cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -749,10 +662,7 @@ tf_cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -765,10 +675,7 @@ tf_cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -781,10 +688,7 @@ tf_cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -798,10 +702,7 @@ tf_cc_test(
     name = "log_softmax_test",
     size = "small",
     srcs = ["log_softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -815,10 +716,7 @@ tf_cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -831,10 +729,7 @@ tf_cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -848,10 +743,7 @@ tf_cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -864,10 +756,7 @@ tf_cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -881,10 +770,7 @@ tf_cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -897,10 +783,7 @@ tf_cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -913,10 +796,7 @@ tf_cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -929,10 +809,7 @@ tf_cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
-- 
GitLab


From d983832d8fe01ab85b761fa1effd2d3b7a8ee794 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Wed, 11 Apr 2018 12:33:04 -0700
Subject: [PATCH 0602/1262] Adding hp5y back.

PiperOrigin-RevId: 192491335
---
 .../python/learn/learn_io/data_feeder_test.py | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
index 82848be7df..1f439965da 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os.path
 import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -26,6 +27,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.learn.python.learn.learn_io import *
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.platform import test
 
 # pylint: enable=wildcard-import
@@ -35,6 +37,13 @@ class DataFeederTest(test.TestCase):
   # pylint: disable=undefined-variable
   """Tests for `DataFeeder`."""
 
+  def setUp(self):
+    self._base_dir = os.path.join(self.get_temp_dir(), 'base_dir')
+    file_io.create_dir(self._base_dir)
+
+  def tearDown(self):
+    file_io.delete_recursively(self._base_dir)
+
   def _wrap_dict(self, data, prepend=''):
     return {prepend + '1': data, prepend + '2': data}
 
@@ -45,14 +54,14 @@ class DataFeederTest(test.TestCase):
   def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data):
     feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
     if isinstance(input_data, dict):
-      for k, v in list(feeder.input_dtype.items()):
+      for v in list(feeder.input_dtype.values()):
         self.assertEqual(expected_np_dtype, v)
     else:
       self.assertEqual(expected_np_dtype, feeder.input_dtype)
     with ops.Graph().as_default() as g, self.test_session(g):
       inp, _ = feeder.input_builder()
       if isinstance(inp, dict):
-        for k, v in list(inp.items()):
+        for v in list(inp.values()):
           self.assertEqual(expected_tf_dtype, v.dtype)
       else:
         self.assertEqual(expected_tf_dtype, inp.dtype)
@@ -301,7 +310,10 @@ class DataFeederTest(test.TestCase):
                                                 [0.60000002, 0.2]])
       self.assertAllClose(feed_dict[out.name], [[0., 0., 1.], [0., 1., 0.]])
 
-  def test_hdf5_data_feeder(self):
+  # TODO(rohanj): Fix this test by fixing data_feeder. Currently, h5py doesn't
+  # support permutation based indexing lookups (More documentation at
+  # http://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing)
+  def DISABLED_test_hdf5_data_feeder(self):
 
     def func(df):
       inp, out = df.input_builder()
@@ -314,11 +326,12 @@ class DataFeederTest(test.TestCase):
       import h5py  # pylint: disable=g-import-not-at-top
       x = np.matrix([[1, 2], [3, 4]])
       y = np.array([1, 2])
-      h5f = h5py.File('test_hdf5.h5', 'w')
+      file_path = os.path.join(self._base_dir, 'test_hdf5.h5')
+      h5f = h5py.File(file_path, 'w')
       h5f.create_dataset('x', data=x)
       h5f.create_dataset('y', data=y)
       h5f.close()
-      h5f = h5py.File('test_hdf5.h5', 'r')
+      h5f = h5py.File(file_path, 'r')
       x = h5f['x']
       y = h5f['y']
       func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
-- 
GitLab


From c9df9896422a5509b55f92f66c1310bb48249afb Mon Sep 17 00:00:00 2001
From: Rajendra arora <rajendraarora16@yahoo.com>
Date: Thu, 12 Apr 2018 01:19:31 +0530
Subject: [PATCH 0603/1262] Updating tensorboard link in Readme.md (#18161)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 29418dc2e9..e1a50c87e2 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
 between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
+code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
-- 
GitLab


From 242788aa28a838fe0e611780023d74be04606e1d Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 10 Apr 2018 19:20:58 -0700
Subject: [PATCH 0604/1262] experimental C API: Fix compilation failure in
 Windows.

The functions added in
https://github.com/tensorflow/tensorflow/commit/be917027e37c5e8f21f6ba07f24bdbf072cf6dfd
are temporary, and their existence breaks compilation in MSVC because of
https://docs.microsoft.com/en-us/cpp/c-language/maximum-string-length
and
https://docs.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2026

So just disabling it in Windows for now.

PiperOrigin-RevId: 192391164
---
 tensorflow/c/BUILD                 |  1 +
 tensorflow/c/c_api_experimental.cc | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 2367014cd0..8a9301d584 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -122,6 +122,7 @@ tf_cuda_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
     ],
 )
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index e82a546092..9678ee926f 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 using tensorflow::FunctionDef;
@@ -189,6 +190,12 @@ library {
 //  be deleted by calling TF_DeleteFunction.
 static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
     const char* file_path, std::string* dataset_name, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return std::vector<UniqueFuncPtr>();
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -7067,6 +7074,7 @@ library {
         DCHECK(found);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
 
 //  On success, returns a set of TF_Function instances encoding a dataset
@@ -7076,6 +7084,12 @@ library {
 static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
     const char* file_path, int batch_size, std::string* dataset_name,
     TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -8205,6 +8219,7 @@ library {
         DCHECK(found_batch_size);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
 
 // Adds the input functions to `graph`.  On success, returns the created
-- 
GitLab


From c5d59c6a3cd8c15ee2f93608e412a1e9335d3465 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 13:22:53 -0700
Subject: [PATCH 0605/1262] Internal change.

PiperOrigin-RevId: 192498471
---
 tensorflow/stream_executor/cuda/cudnn_version_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 230adafeb1..42b3dc8cc6 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
-#include "testing/base/public/gunit.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace perftools {
-- 
GitLab


From 371d5132a5558ef06a0951f3197bde63565a1805 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 13:29:12 -0700
Subject: [PATCH 0606/1262] DepthwiseConv Optimizations

PiperOrigin-RevId: 192499401
---
 .../internal/optimized/depthwiseconv_uint8.h  |   18 +-
 .../depthwiseconv_uint8_3x3_filter.h          | 5015 +++++++++++++++--
 2 files changed, 4434 insertions(+), 599 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 0f78e0f728..dd6932ffe7 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1696,15 +1696,15 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 #ifdef __aarch64__
   // Call kernel optimized for depthwise convolutions using 3x3 filters if
   // parameters are supported.
-  if (Fast3by3FilterKernelSupported(input_dims, filter_dims, stride_width,
-                                    stride_height, pad_width, pad_height,
-                                    depth_multiplier, output_dims)) {
-    DepthwiseConv3by3FilterDepth16(
-        input_data, input_dims, input_offset, filter_data, filter_dims,
-        filter_offset, bias_data, bias_dims, stride_width, stride_height,
-        pad_width, pad_height, depth_multiplier, output_offset,
-        output_multiplier, output_shift, output_activation_min,
-        output_activation_max, output_data, output_dims);
+  if (Fast3x3FilterKernelSupported(input_dims, filter_dims, stride_width,
+                                   stride_height, pad_width, pad_height,
+                                   depth_multiplier, output_dims)) {
+    DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data,
+                           filter_dims, filter_offset, bias_data, bias_dims,
+                           stride_width, stride_height, pad_width, pad_height,
+                           depth_multiplier, output_offset, output_multiplier,
+                           output_shift, output_activation_min,
+                           output_activation_max, output_data, output_dims);
     return;
   }
 #endif
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index a349892076..cdcb166b2f 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -40,412 +40,4380 @@ inline void preload_l1_keep(const uint8* ptr) {
 // NEON intrinsics vector data types.
 // See: https://bugs.llvm.org/show_bug.cgi?id=34945
 
-struct Int32x16 {
-  int32x4_t v0, v1, v2, v3;
+struct Int32x8 {
+  int32x4_t low, high;
 };
 
-struct Int16x16 {
-  int16x8_t low, high;
+struct Filter3x3x8 {
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7, f8;
 };
 
-struct Int16x16x3 {
-  Int16x16 v0, v1, v2;
+// Loads 3x3 filter of depth 8 and adds filter offsets.
+inline Filter3x3x8 Load3x3Filter(const uint8* filter_ptr, int32 filter_offset,
+                                 int output_depth) {
+  Filter3x3x8 filter;
+
+  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
+      temp_u8_6, temp_u8_7, temp_u8_8;
+  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+
+  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
+  temp_u8_1 = vld1_u8(filter_ptr + 1 * output_depth);
+  temp_u8_2 = vld1_u8(filter_ptr + 2 * output_depth);
+  temp_u8_3 = vld1_u8(filter_ptr + 3 * output_depth);
+  temp_u8_4 = vld1_u8(filter_ptr + 4 * output_depth);
+  temp_u8_5 = vld1_u8(filter_ptr + 5 * output_depth);
+  temp_u8_6 = vld1_u8(filter_ptr + 6 * output_depth);
+  temp_u8_7 = vld1_u8(filter_ptr + 7 * output_depth);
+  temp_u8_8 = vld1_u8(filter_ptr + 8 * output_depth);
+
+  filter.f0 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
+  filter.f1 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
+  filter.f2 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
+  filter.f3 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
+  filter.f4 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
+  filter.f5 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
+  filter.f6 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
+  filter.f7 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
+  filter.f8 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
+
+  filter.f0 = vaddq_s16(filter.f0, filter_offset_vec);
+  filter.f1 = vaddq_s16(filter.f1, filter_offset_vec);
+  filter.f2 = vaddq_s16(filter.f2, filter_offset_vec);
+  filter.f3 = vaddq_s16(filter.f3, filter_offset_vec);
+  filter.f4 = vaddq_s16(filter.f4, filter_offset_vec);
+  filter.f5 = vaddq_s16(filter.f5, filter_offset_vec);
+  filter.f6 = vaddq_s16(filter.f6, filter_offset_vec);
+  filter.f7 = vaddq_s16(filter.f7, filter_offset_vec);
+  filter.f8 = vaddq_s16(filter.f8, filter_offset_vec);
+
+  return filter;
+}
+
+// Applies activation, offset and downquantize on a set of accumulator
+// registers that correspond to a 2x2 output of depth 8.
+// Stores results to output.
+inline void DownquantizeAndStore2x2Output(
+    Int32x8 acc_0, Int32x8 acc_1, Int32x8 acc_2, Int32x8 acc_3,
+    int32 output_offset, int32 output_multiplier, int output_shift,
+    int32 output_activation_min, int32 output_activation_max, uint8* output_ptr,
+    int output_depth, int output_width) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  // Fixed-point multiplication.
+  acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+  acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+  acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+  acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+  acc_2.low = vqrdmulhq_n_s32(acc_2.low, output_multiplier);
+  acc_2.high = vqrdmulhq_n_s32(acc_2.high, output_multiplier);
+  acc_3.low = vqrdmulhq_n_s32(acc_3.low, output_multiplier);
+  acc_3.high = vqrdmulhq_n_s32(acc_3.high, output_multiplier);
+
+  acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+  acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+  acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+  acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+  acc_2.low = RoundingDivideByPOT(acc_2.low, output_shift);
+  acc_2.high = RoundingDivideByPOT(acc_2.high, output_shift);
+  acc_3.low = RoundingDivideByPOT(acc_3.low, output_shift);
+  acc_3.high = RoundingDivideByPOT(acc_3.high, output_shift);
+
+  // Add the output offset.
+  acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+  acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+  acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+  acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+  acc_2.low = vaddq_s32(acc_2.low, output_offset_vec);
+  acc_2.high = vaddq_s32(acc_2.high, output_offset_vec);
+  acc_3.low = vaddq_s32(acc_3.low, output_offset_vec);
+  acc_3.high = vaddq_s32(acc_3.high, output_offset_vec);
+
+  // Apply the activation function.
+  acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+  acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+  acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+  acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+  acc_2.low = vmaxq_s32(acc_2.low, output_activation_min_vec);
+  acc_2.high = vmaxq_s32(acc_2.high, output_activation_min_vec);
+  acc_3.low = vmaxq_s32(acc_3.low, output_activation_min_vec);
+  acc_3.high = vmaxq_s32(acc_3.high, output_activation_min_vec);
+
+  acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+  acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+  acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+  acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  acc_2.low = vminq_s32(acc_2.low, output_activation_max_vec);
+  acc_2.high = vminq_s32(acc_2.high, output_activation_max_vec);
+  acc_3.low = vminq_s32(acc_3.low, output_activation_max_vec);
+  acc_3.high = vminq_s32(acc_3.high, output_activation_max_vec);
+
+  // Saturating cast to uint8 and store to destination.
+  int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+  int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+  int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+  int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+  int16x4_t acc_2_low_s16 = vqmovn_s32(acc_2.low);
+  int16x4_t acc_2_high_s16 = vqmovn_s32(acc_2.high);
+  int16x4_t acc_3_low_s16 = vqmovn_s32(acc_3.low);
+  int16x4_t acc_3_high_s16 = vqmovn_s32(acc_3.high);
+
+  int16x8_t res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  int16x8_t res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  int16x8_t res_2_s16 = vcombine_s16(acc_2_low_s16, acc_2_high_s16);
+  int16x8_t res_3_s16 = vcombine_s16(acc_3_low_s16, acc_3_high_s16);
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  uint8x8_t res_2_u8 = vqmovun_s16(res_2_s16);
+  uint8x8_t res_3_u8 = vqmovun_s16(res_3_s16);
+
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_depth, res_1_u8);
+  vst1_u8(output_ptr + output_depth * output_width, res_2_u8);
+  vst1_u8(output_ptr + output_depth * output_width + output_depth, res_3_u8);
+}
+
+inline void DownquantizeAndStore(Int32x8 acc, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 uint8* output_ptr) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  acc.low = vqrdmulhq_n_s32(acc.low, output_multiplier);
+  acc.high = vqrdmulhq_n_s32(acc.high, output_multiplier);
+
+  acc.low = RoundingDivideByPOT(acc.low, output_shift);
+  acc.high = RoundingDivideByPOT(acc.high, output_shift);
+
+  acc.low = vaddq_s32(acc.low, output_offset_vec);
+  acc.high = vaddq_s32(acc.high, output_offset_vec);
+
+  acc.low = vmaxq_s32(acc.low, output_activation_min_vec);
+  acc.high = vmaxq_s32(acc.high, output_activation_min_vec);
+
+  acc.low = vminq_s32(acc.low, output_activation_max_vec);
+  acc.high = vminq_s32(acc.high, output_activation_max_vec);
+
+  int16x4_t acc_low_s16 = vqmovn_s32(acc.low);
+  int16x4_t acc_high_s16 = vqmovn_s32(acc.high);
+
+  int16x8_t res_s16 = vcombine_s16(acc_low_s16, acc_high_s16);
+  uint8x8_t res_u8 = vqmovun_s16(res_s16);
+  vst1_u8(output_ptr, res_u8);
+}
+
+inline void DownquantizeAndStore2Output(
+    Int32x8 acc_0, Int32x8 acc_1, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  {
+    using gemmlowp::RoundingDivideByPOT;
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    const int32x4_t output_activation_min_vec =
+        vdupq_n_s32(output_activation_min);
+    const int32x4_t output_activation_max_vec =
+        vdupq_n_s32(output_activation_max);
+
+    // Fixed-point multiplication.
+    acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+    acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+    acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+    acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+
+    acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+    acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+    acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+    acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+
+    // Add the output offset.
+    acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+    acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+    acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+    acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+
+    // Apply the activation function.
+    acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+    acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+    acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+    acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+
+    acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+    acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+    acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+    acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  }
+
+  // Saturating cast to uint8 and store to destination.
+  int16x8_t res_0_s16;
+  {
+    int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+    int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+    res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  }
+
+  int16x8_t res_1_s16;
+  {
+    int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+    int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+    res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  }
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_ptr_offset, res_1_u8);
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulateRow(Int32x8 accum, int16x8_t f0, int16x8_t f1,
+                                     int16x8_t f2, int16x8_t i0, int16x8_t i1,
+                                     int16x8_t i2) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f2), vget_high_s16(i2));
+  return accum;
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulate3x3Filter(const Filter3x3x8& f, int16x8_t i0,
+                                           int16x8_t i1, int16x8_t i2,
+                                           int16x8_t i3, int16x8_t i4,
+                                           int16x8_t i5, int16x8_t i6,
+                                           int16x8_t i7, int16x8_t i8,
+                                           Int32x8 accum) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f2), vget_high_s16(i2));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f3), vget_low_s16(i3));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f3), vget_high_s16(i3));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f4), vget_low_s16(i4));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f4), vget_high_s16(i4));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f5), vget_low_s16(i5));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f5), vget_high_s16(i5));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f6), vget_low_s16(i6));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f6), vget_high_s16(i6));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f7), vget_low_s16(i7));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f7), vget_high_s16(i7));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f8), vget_low_s16(i8));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f8), vget_high_s16(i8));
+  return accum;
+}
+
+inline void DotProductAndStore(const Filter3x3x8& filter, int16x8_t i0,
+                               int16x8_t i1, int16x8_t i2, int16x8_t i3,
+                               int16x8_t i4, int16x8_t i5, int16x8_t i6,
+                               int16x8_t i7, int16x8_t i8,
+                               const int32* bias_ptr, int32 output_offset,
+                               int32 output_multiplier, int output_shift,
+                               int32 output_activation_min,
+                               int32 output_activation_max, uint8* output_ptr) {
+  Int32x8 acc;
+  acc.low = vld1q_s32(bias_ptr);
+  acc.high = vld1q_s32(bias_ptr + 4);
+
+  acc = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, i8,
+                                    acc);
+
+  DownquantizeAndStore(acc, output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_ptr);
+}
+
+// Performs multiply-accumulate on a 3x4 input for 2 horizontal outputs.
+inline void DotProductAndStore2xStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i4, i5, i6, i8, i9,
+                                      i10, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i1, i2, i3, i5, i6, i7, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// Performs multiply-accumulate on a 4x3 input for 2 vertical outputs.
+inline void DotProductAndStore2yStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7,
+                                      i8, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i3, i4, i5, i6, i7, i8, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// A kernel that is optimized on the number of output cells in the x and y
+// direction, and the stride. Assumes 3x3 filters of 16 depth.
+template <int kFixedOutputY, int kFixedOutputX, int kFixedStride = 1>
+struct ConvKernel3x3FilterDepth8 {};
+
+template <>
+struct ConvKernel3x3FilterDepth8<8, 8, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 8x8 outputs using a 3x3 filter, we require 10x10 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    //
+    //        INPUT                          OUTPUT
+    //   |\----------------\               |\------------\
+    //   | \                \              | \            \
+    //   |  \----------------\             |  \------------\
+    //   |  | 0    ...     9 |             |  | 0  ...   7 |
+    //   |  | 10   ...    19 |     --->    |  | 8  ...  15 |
+    //   |  | 20   ...    29 |              \ | .. ...  .. |
+    //    \ | ..   ...    .. |               \| 56 ...  63 |
+    //     \| 90   ...   109 |                |------------|
+    //      |----------------|
+    //
+    // The first set of loads corresponds to:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-----------------                |\-----------
+    //   | \                                | \
+    //   |  \-----------------              |  \----------
+    //   |  | 0  1   2  3 ...               |  | 0  1 ...
+    //   |  | 10 11 12 13 ...     --->      |  | ..   ...
+    //   |  | 20 21 22 23 ...                  | ..   ...
+    //   |  | ..   ...    ...
+    //
+    // The next set of loads correspond to a sliding window to the right.
+    // It loads inputs 4, 5, 14, 15, 23, 24 and keeps 2, 3, 12, 13, and 22:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-------------------                |\-------------
+    //   | \                                  | \
+    //   |  \-------------------              |  \------------
+    //   |  | .. 2  3   4  5 ...              |  | .. 2  3 ...
+    //   |  | .. 12 13 14 15 ...     --->     |  | ..      ...
+    //   |  | .. 21 22 23 24 ...                 | ..      ...
+    //   |  | ..    ...      ...
+    //
+    // And so on...
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (0) and (1).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Slide to the right for outputs x = [2, 3], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (2) and (3).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Slide to the right again for outputs x = [4, 5], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (4) and (5).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth, output_depth);
+
+    // Slide to the right one last time for outputs x = [6, 7], y = 0.
+    // Referring to the indexes in the diagram above, this corresponds to
+    // outputs (6) and (7).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth, output_depth);
+
+    // Slide to down for outputs x = [6, 7], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (14) and (15).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (12) and (13).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left again for outputs x = [2, 3], y = 1. Referring to the indexes
+    // in the diagram above, this corresponds to outputs (10) and (11).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 1. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (8) and (9).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (16) and (17).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (18) and (19).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (20) and (21).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (22) and (23).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (30) and (31).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (28) and (29).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (26) and (27).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 3. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (24) and (25).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (32) and (33).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (34) and (35).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (36) and (37).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 4. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (38) and (39).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (46) and (47).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (44) and (45).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (42) and (43).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 5. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (40) and (41).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 5 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (48) and (49).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 8 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (50) and (51).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (52) and (53).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 6. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (54) and (55).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (62) and (63).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 9 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (60) and (61).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (58) and (59).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 7. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (56) and (57).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 7 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 4x4 outputs using a 3x3 filter, we require 6x6 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next inputs one row down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load last row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 2x1 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_row_size);
+
+    // Load inputs for bottom 2 rows.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_6, input_7, input_8, input_9, input_10, input_11, input_0,
+        input_1, input_2, input_3, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size,
+        output_row_size);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter, we require 4x4 inputs.
+      // Load inputs for the top two filters first.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9, input_10, input_11;
+
+      const uint8* ptr = input_ptr;
+
+      // Load top 3 rows.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+        input_10 = vaddq_s16(input_10, input_offset_vec);
+        input_11 = vaddq_s16(input_11, input_offset_vec);
+      }
+
+      // Multiply-accum for top-left output.
+      acc_0 = MultiplyAccumulate3x3Filter(filter, input_0, input_1, input_2,
+                                          input_4, input_5, input_6, input_8,
+                                          input_9, input_10, acc_0);
+
+      // Multiply-accum for top-right output.
+      acc_1 = MultiplyAccumulate3x3Filter(filter, input_1, input_2, input_3,
+                                          input_5, input_6, input_7, input_9,
+                                          input_10, input_11, acc_1);
+
+      // Now load the bottom row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+      }
+
+      // Multiply-accum for bottom-left output.
+      acc_2 = MultiplyAccumulate3x3Filter(filter, input_4, input_5, input_6,
+                                          input_8, input_9, input_10, input_0,
+                                          input_1, input_2, acc_2);
+
+      // Multiply-accum for bottom-right output.
+      acc_3 = MultiplyAccumulate3x3Filter(filter, input_5, input_6, input_7,
+                                          input_9, input_10, input_11, input_1,
+                                          input_2, input_3, acc_3);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_depth * 4;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    // To process 2x1 outputs using a 3x3 filter, we require 4x3 inputs.
+    // Load all inputs at the beginning.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth * output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9;
+
+    const uint8* ptr = input_ptr;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+    // Load first 2 rows.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load last row.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 4x2 kernel twice.
+    ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_3, input_4, input_5, input_6, input_7, input_8, input_0,
+        input_1, input_2, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter at stride 2, we require
+      // 5x5 inputs. We load the first 5x2 inputs at a time.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9;
+
+      const uint8* ptr = input_ptr;
+
+      // Load inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load next inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+
+      // Moving onto the two bottom outputs.
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load last input row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+      }
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
 };
 
-struct Filter3x3x16 {
-  Int16x16x3 r0, r1, r2;
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 2x2 kernel twice.
+    ConvKernel3x3FilterDepth8<2, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<2, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
 };
 
-// Loads 3x3 filter of depth 16 and adds filter offsets.
-inline Filter3x3x16 LoadFilterDepth16(const uint8* filter_ptr,
-                                      int32 filter_offset, int output_depth) {
-  Filter3x3x16 filter;
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
 
-  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
-      temp_u8_6, temp_u8_7, temp_u8_8, temp_u8_9, temp_u8_10, temp_u8_11,
-      temp_u8_12, temp_u8_13, temp_u8_14, temp_u8_15, temp_u8_16, temp_u8_17;
-  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
 
-  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
-  temp_u8_1 = vld1_u8(filter_ptr + 0 * output_depth + 8);
-  temp_u8_2 = vld1_u8(filter_ptr + 1 * output_depth);
-  temp_u8_3 = vld1_u8(filter_ptr + 1 * output_depth + 8);
-  temp_u8_4 = vld1_u8(filter_ptr + 2 * output_depth);
-  temp_u8_5 = vld1_u8(filter_ptr + 2 * output_depth + 8);
-
-  temp_u8_6 = vld1_u8(filter_ptr + 3 * output_depth);
-  temp_u8_7 = vld1_u8(filter_ptr + 3 * output_depth + 8);
-  temp_u8_8 = vld1_u8(filter_ptr + 4 * output_depth);
-  temp_u8_9 = vld1_u8(filter_ptr + 4 * output_depth + 8);
-  temp_u8_10 = vld1_u8(filter_ptr + 5 * output_depth);
-  temp_u8_11 = vld1_u8(filter_ptr + 5 * output_depth + 8);
-
-  temp_u8_12 = vld1_u8(filter_ptr + 6 * output_depth);
-  temp_u8_13 = vld1_u8(filter_ptr + 6 * output_depth + 8);
-  temp_u8_14 = vld1_u8(filter_ptr + 7 * output_depth);
-  temp_u8_15 = vld1_u8(filter_ptr + 7 * output_depth + 8);
-  temp_u8_16 = vld1_u8(filter_ptr + 8 * output_depth);
-  temp_u8_17 = vld1_u8(filter_ptr + 8 * output_depth + 8);
-
-  filter.r0.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
-  filter.r0.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
-  filter.r0.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
-  filter.r0.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
-  filter.r0.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
-  filter.r0.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
-
-  filter.r1.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
-  filter.r1.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
-  filter.r1.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
-  filter.r1.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_9));
-  filter.r1.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_10));
-  filter.r1.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_11));
-
-  filter.r2.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_12));
-  filter.r2.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_13));
-  filter.r2.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_14));
-  filter.r2.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_15));
-  filter.r2.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_16));
-  filter.r2.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_17));
-
-  filter.r0.v0.low = vaddq_s16(filter.r0.v0.low, filter_offset_vec);
-  filter.r0.v0.high = vaddq_s16(filter.r0.v0.high, filter_offset_vec);
-  filter.r0.v1.low = vaddq_s16(filter.r0.v1.low, filter_offset_vec);
-  filter.r0.v1.high = vaddq_s16(filter.r0.v1.high, filter_offset_vec);
-  filter.r0.v2.low = vaddq_s16(filter.r0.v2.low, filter_offset_vec);
-  filter.r0.v2.high = vaddq_s16(filter.r0.v2.high, filter_offset_vec);
-
-  filter.r1.v0.low = vaddq_s16(filter.r1.v0.low, filter_offset_vec);
-  filter.r1.v0.high = vaddq_s16(filter.r1.v0.high, filter_offset_vec);
-  filter.r1.v1.low = vaddq_s16(filter.r1.v1.low, filter_offset_vec);
-  filter.r1.v1.high = vaddq_s16(filter.r1.v1.high, filter_offset_vec);
-  filter.r1.v2.low = vaddq_s16(filter.r1.v2.low, filter_offset_vec);
-  filter.r1.v2.high = vaddq_s16(filter.r1.v2.high, filter_offset_vec);
-
-  filter.r2.v0.low = vaddq_s16(filter.r2.v0.low, filter_offset_vec);
-  filter.r2.v0.high = vaddq_s16(filter.r2.v0.high, filter_offset_vec);
-  filter.r2.v1.low = vaddq_s16(filter.r2.v1.low, filter_offset_vec);
-  filter.r2.v1.high = vaddq_s16(filter.r2.v1.high, filter_offset_vec);
-  filter.r2.v2.low = vaddq_s16(filter.r2.v2.low, filter_offset_vec);
-  filter.r2.v2.high = vaddq_s16(filter.r2.v2.high, filter_offset_vec);
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
 
-  return filter;
-}
+    const uint8* ptr = input_ptr;
 
-// Loads 3 input cells of depth 16 and adds input offsets.
-inline Int16x16x3 LoadInputRowDepth16(const uint8* ptr, int input_depth,
-                                      int32 input_offset,
-                                      Int16x16x3 input_row) {
-  uint8x8_t temp_0, temp_1;
-  int16x8_t offset_vec = vdupq_n_s16(input_offset);
-
-  temp_0 = vld1_u8(ptr + 0 * input_depth);
-  temp_1 = vld1_u8(ptr + 0 * input_depth + 8);
-  input_row.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v0.low = vaddq_s16(input_row.v0.low, offset_vec);
-  input_row.v0.high = vaddq_s16(input_row.v0.high, offset_vec);
-
-  temp_0 = vld1_u8(ptr + 1 * input_depth);
-  temp_1 = vld1_u8(ptr + 1 * input_depth + 8);
-  input_row.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v1.low = vaddq_s16(input_row.v1.low, offset_vec);
-  input_row.v1.high = vaddq_s16(input_row.v1.high, offset_vec);
-
-  temp_0 = vld1_u8(ptr + 2 * input_depth);
-  temp_1 = vld1_u8(ptr + 2 * input_depth + 8);
-  input_row.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v2.low = vaddq_s16(input_row.v2.low, offset_vec);
-  input_row.v2.high = vaddq_s16(input_row.v2.high, offset_vec);
-
-  return input_row;
-}
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
 
-// Performs multiply accumulate on 3 inputs of depth 16.
-inline Int32x16 MultiplyAccumulateRowDepth16(Int32x16 output,
-                                             const Int16x16x3& filter_row,
-                                             const Int16x16x3& input_row) {
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v0.low),
-                        vget_low_s16(input_row.v0.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v0.low),
-                        vget_high_s16(input_row.v0.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v0.high),
-                        vget_low_s16(input_row.v0.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v0.high),
-                        vget_high_s16(input_row.v0.high));
-
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v1.low),
-                        vget_low_s16(input_row.v1.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v1.low),
-                        vget_high_s16(input_row.v1.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v1.high),
-                        vget_low_s16(input_row.v1.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v1.high),
-                        vget_high_s16(input_row.v1.high));
-
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v2.low),
-                        vget_low_s16(input_row.v2.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v2.low),
-                        vget_high_s16(input_row.v2.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v2.high),
-                        vget_low_s16(input_row.v2.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v2.high),
-                        vget_high_s16(input_row.v2.high));
-
-  return output;
-}
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
 
-// Applies activation, offset and downquantize on a set of accumulator
-// registers of depth 16. Stores results to output.
-inline void DownquantizeAndStoreDepth16(Int32x16 acc, int32 output_multiplier,
-                                        int output_shift,
-                                        int32x4_t output_offset_vec,
-                                        int32x4_t output_activation_min_vec,
-                                        int32x4_t output_activation_max_vec,
-                                        uint8* output_ptr) {
-  // Fixed-point multiplication.
-  acc.v0 = vqrdmulhq_n_s32(acc.v0, output_multiplier);
-  acc.v1 = vqrdmulhq_n_s32(acc.v1, output_multiplier);
-  acc.v2 = vqrdmulhq_n_s32(acc.v2, output_multiplier);
-  acc.v3 = vqrdmulhq_n_s32(acc.v3, output_multiplier);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
 
-  using gemmlowp::RoundingDivideByPOT;
-  acc.v0 = RoundingDivideByPOT(acc.v0, output_shift);
-  acc.v1 = RoundingDivideByPOT(acc.v1, output_shift);
-  acc.v2 = RoundingDivideByPOT(acc.v2, output_shift);
-  acc.v3 = RoundingDivideByPOT(acc.v3, output_shift);
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
 
-  // Add the output offset.
-  acc.v0 = vaddq_s32(acc.v0, output_offset_vec);
-  acc.v1 = vaddq_s32(acc.v1, output_offset_vec);
-  acc.v2 = vaddq_s32(acc.v2, output_offset_vec);
-  acc.v3 = vaddq_s32(acc.v3, output_offset_vec);
+    // Second output.
+    output_ptr += output_row_size;
 
-  // Apply the activation function.
-  acc.v0 = vmaxq_s32(acc.v0, output_activation_min_vec);
-  acc.v1 = vmaxq_s32(acc.v1, output_activation_min_vec);
-  acc.v2 = vmaxq_s32(acc.v2, output_activation_min_vec);
-  acc.v3 = vmaxq_s32(acc.v3, output_activation_min_vec);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
 
-  acc.v0 = vminq_s32(acc.v0, output_activation_max_vec);
-  acc.v1 = vminq_s32(acc.v1, output_activation_max_vec);
-  acc.v2 = vminq_s32(acc.v2, output_activation_max_vec);
-  acc.v3 = vminq_s32(acc.v3, output_activation_max_vec);
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
 
-  // Saturating cast to uint8 and store to destination.
-  int16x4_t acc_tlla_s16 = vqmovn_s32(acc.v0);
-  int16x4_t acc_tllb_s16 = vqmovn_s32(acc.v1);
-  int16x4_t acc_tlha_s16 = vqmovn_s32(acc.v2);
-  int16x4_t acc_tlhb_s16 = vqmovn_s32(acc.v3);
-
-  int16x8_t res_s16_0 = vcombine_s16(acc_tlla_s16, acc_tllb_s16);
-  int16x8_t res_s16_1 = vcombine_s16(acc_tlha_s16, acc_tlhb_s16);
-  uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
-  uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
-  vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 5 * input_depth;
+    temp_2 = vld1_u8(ptr);
+    temp_0 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_5 = vld1_u8(ptr);
+    temp_3 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_8 = vld1_u8(ptr);
+    temp_6 = vld1_u8(ptr + input_depth);
+
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_1, input_2, input_0, input_4, input_5, input_3, input_7,
+        input_8, input_6, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 7 * input_depth;
+    temp_1 = vld1_u8(ptr);
+    temp_2 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_4 = vld1_u8(ptr);
+    temp_5 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_7 = vld1_u8(ptr);
+    temp_8 = vld1_u8(ptr + input_depth);
+
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+
+    uint8x8_t temp_0 = vld1_u8(input_ptr);
+    uint8x8_t temp_1 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_2 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_3 = vld1_u8(input_ptr);
+    uint8x8_t temp_4 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_5 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_6 = vld1_u8(input_ptr);
+    uint8x8_t temp_7 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_8 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+inline void ShuffleInput(const uint8* input_ptr, int input_depth,
+                         int input_width, int input_height, int output_depth,
+                         int output_width, int output_height,
+                         uint8* output_ptr) {
+  const int input_row_size = input_depth * input_width;
+
+  for (int y = 0; y < output_height; y++) {
+    const uint8* ptr = input_ptr;
+    for (int x = 0; x < output_width; x++) {
+      memcpy(output_ptr, ptr, output_depth);
+      output_ptr += output_depth;
+      ptr += input_depth;
+    }
+    input_ptr += input_row_size;
+  }
 }
 
-// A kernel that is optimized on the number of output cells in the x and y
-// direction, and the stride. Assumes 3x3 filters of 16 depth.
-template <int kFixedOutputX, int kFixedOutputY, int kFixedStride = 1>
-struct ConvKernel3x3FilterDepth16 {};
+template <int kFixedHeight, int kFixedStrideWidth,
+          int kFixedStrideHeight = kFixedStrideWidth>
+struct ConvRow3x3FilterDepth8 {};
+
+template <int kFixedStrideWidth>
+struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 1x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 1x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <int kFixedStrideWidth>
+struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 2x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 2x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * kFixedStrideWidth * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 2x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<4, 1> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += input_depth;
+      output_data += output_depth;
+    }
+  }
+};
 
 template <>
-struct ConvKernel3x3FilterDepth16<1, 2, 1> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    // 16 depth accumulators for the 2 outputs.
-    Int32x16 acc0, acc1;
-
-    // Accumulators for top filter.
-    acc0.v0 = vld1q_s32(bias_ptr);
-    acc0.v1 = vld1q_s32(bias_ptr + 4);
-    acc0.v2 = vld1q_s32(bias_ptr + 8);
-    acc0.v3 = vld1q_s32(bias_ptr + 12);
-    // Accumulators for bottom filter.
-    acc1.v0 = vld1q_s32(bias_ptr);
-    acc1.v1 = vld1q_s32(bias_ptr + 4);
-    acc1.v2 = vld1q_s32(bias_ptr + 8);
-    acc1.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row of top filter.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r0, input);
-
-      // Do second row of top filter.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r1, input);
-
-      // The inputs to second row of the top filter are also the inputs to the
-      // first row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r0, input);
-
-      // Do third row of top filter.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r2, input);
-
-      // The inputs to third row of the top filter are also the inputs to the
-      // second row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r1, input);
-
-      // Do third row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 3 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc0, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
-
-    DownquantizeAndStoreDepth16(acc1, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec,
-                                output_ptr + output_depth * output_width);
+struct ConvRow3x3FilterDepth8<4, 2> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Branch and cache misses increase substantially with stride 2 kernels.
+    // Adding prefetching reduces latency by as much as 2x.
+    const int i0 = 0;
+    const int i1 = input_depth;
+    const int i2 = 2 * input_depth;
+    const int i3 = 3 * input_depth;
+    const int i4 = 4 * input_depth;
+    const int i5 = 5 * input_depth;
+    const int i6 = 6 * input_depth;
+    const int i7 = 7 * input_depth;
+    const int i8 = 8 * input_depth;
+
+#define DEPTHWISECONV_PRELOAD_ROW(input_ptr, i)         \
+  preload_l1_keep(input_ptr + i * input_row_size + i0); \
+  preload_l1_keep(input_ptr + i * input_row_size + i1); \
+  preload_l1_keep(input_ptr + i * input_row_size + i2); \
+  preload_l1_keep(input_ptr + i * input_row_size + i3); \
+  preload_l1_keep(input_ptr + i * input_row_size + i4); \
+  preload_l1_keep(input_ptr + i * input_row_size + i5); \
+  preload_l1_keep(input_ptr + i * input_row_size + i6); \
+  preload_l1_keep(input_ptr + i * input_row_size + i7); \
+  preload_l1_keep(input_ptr + i * input_row_size + i8);
+
+    int out_x = start_x;
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // Preload 9x9 input.
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+        // For a large input window (64x9x9) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If this size is ever changed,
+        // update the ShuffleWorkspaceSize() function to return the new size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 9,
+                     9, shuffle_workspace);
+        const uint8* shuffled_ptr = &shuffle_workspace[0];
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<4, 4, 2>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset,
+              bias_ptr, output_offset, output_multiplier, output_shift,
+              output_activation_min, output_activation_max, output_ptr,
+              output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      // Preload 9x9 input one more time for the rest of the depth.
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * 2 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+#undef DEPTHWISECONV_PRELOAD_ROW
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += output_depth;
+    }
   }
 };
 
 template <>
-struct ConvKernel3x3FilterDepth16<1, 2, 2> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    // 16 depth accumulators for the 2 outputs.
-    Int32x16 acc0, acc1;
-
-    // Accumulators for top filter.
-    acc0.v0 = vld1q_s32(bias_ptr);
-    acc0.v1 = vld1q_s32(bias_ptr + 4);
-    acc0.v2 = vld1q_s32(bias_ptr + 8);
-    acc0.v3 = vld1q_s32(bias_ptr + 12);
-    // Accumulators for bottom filter.
-    acc1.v0 = vld1q_s32(bias_ptr);
-    acc1.v1 = vld1q_s32(bias_ptr + 4);
-    acc1.v2 = vld1q_s32(bias_ptr + 8);
-    acc1.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row of top filter.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r0, input);
-
-      // Do second row of top filter.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r1, input);
-
-      // Do third row of top filter.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r2, input);
-
-      // The inputs to third row of the top filter are also the inputs
-      // to first row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r0, input);
-
-      // Do second row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 3 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r1, input);
-
-      // Do third row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 4 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc0, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
-
-    DownquantizeAndStoreDepth16(acc1, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec,
-                                output_ptr + output_depth * output_width);
+struct ConvRow3x3FilterDepth8<8, 2> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Reuse 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 2>::Run(
+        input_data, start_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 2>::Run(
+        input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
   }
 };
 
 template <>
-struct ConvKernel3x3FilterDepth16<1, 1> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    Int32x16 acc;
-    acc.v0 = vld1q_s32(bias_ptr);
-    acc.v1 = vld1q_s32(bias_ptr + 4);
-    acc.v2 = vld1q_s32(bias_ptr + 8);
-    acc.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r0, input);
-
-      // Do second row.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r1, input);
-
-      // Do third row.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
+struct ConvRow3x3FilterDepth8<8, 1> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+    // 8x8 at a time.
+    for (; out_x <= output_width - 8; out_x += 8) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // For a large input window (64x10x10) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If the size of the input window
+        // changes, update the function ShuffleWorkspaceSize() with the new
+        // size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 10,
+                     10, shuffle_workspace);
+        const uint8* shuffled_ptr = shuffle_workspace;
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<8, 8, 1>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr,
+              filter_offset, bias_ptr, output_offset, output_multiplier,
+              output_shift, output_activation_min, output_activation_max,
+              output_ptr, output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<8, 8, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 8 * input_depth;
+      output_data += 8 * output_depth;
+    }
+
+    // Handle the rest of the right side by re-using 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 1>::Run(
+        input_data, out_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 1>::Run(
+        input_data + 4 * input_row_size, out_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
   }
 };
 
-inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
-                                          const Dims<4>& filter_dims,
-                                          int stride_width, int stride_height,
-                                          int pad_width, int pad_height,
-                                          int depth_multiplier,
-                                          const Dims<4>& output_dims) {
+inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
+                                         const Dims<4>& filter_dims,
+                                         int stride_width, int stride_height,
+                                         int pad_width, int pad_height,
+                                         int depth_multiplier,
+                                         const Dims<4>& output_dims) {
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -458,14 +4426,14 @@ inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
                    depth_multiplier == 1 &&
                    (stride_width == 1 || stride_width == 2) &&
                    (stride_height == 1 || stride_height == 2) &&
-                   pad_width == 0 && pad_height == 0 && (input_depth % 16) == 0;
+                   pad_width == 0 && pad_height == 0 && (input_depth % 8) == 0;
 
   if (!supported) {
     return false;
   }
 
-  // Handle case where padding is zero but type is not kValid. This would
-  // require special boundary case handling that is not supported yet.
+  // Handle case where padding is zero but padding type is not kValid.
+  // This would require special boundary case handling that is not supported.
 
   const int out_x = output_width - 1;
   const int out_y = output_height - 1;
@@ -481,7 +4449,7 @@ inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
   return in_x_end <= input_width && in_y_end <= input_height;
 }
 
-inline void DepthwiseConv3by3FilterDepth16(
+inline void DepthwiseConv3x3Filter(
     const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
     const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
     const int32* bias_data, const Dims<4>& bias_dims, int stride_width,
@@ -500,241 +4468,108 @@ inline void DepthwiseConv3by3FilterDepth16(
   const int output_width = ArraySize(output_dims, 1);
 
   // Algorithm assumes below constraints. It is optimized for depth multiplier
-  // of 1, 3x3 filter, no padding, strides 1 and 2.
+  // of 1, 3x3 filter, no padding and strides 1 and 2.
   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
   TFLITE_DCHECK(depth_multiplier == 1);
   TFLITE_DCHECK(filter_height == 3);
   TFLITE_DCHECK(filter_width == 3);
   TFLITE_DCHECK(pad_height == 0);
   TFLITE_DCHECK(pad_width == 0);
-  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
   TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
+  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
 
-  // The number of outputs to process in the main loop.
-  const int num_x_outputs = 1;
-  const int num_y_outputs = 2;
-
-  const int input_row_width = output_depth * (input_width + 2 * pad_width);
-  const int input_batch_size =
-      input_row_width * (input_height + 2 * pad_height);
+  const int input_row_size = input_depth * (input_width + 2 * pad_width);
+  const int output_row_size = output_depth * output_width;
+  const int input_batch_size = input_row_size * (input_height + 2 * pad_height);
   const int output_batch_size = output_depth * output_width * output_height;
-  const int input_ptr_x_increment = input_depth * stride_width;
 
-  // Calculate extents of non-boundary loop.
-  int out_x_start = 0;
-  for (; out_x_start < input_width; out_x_start++) {
-    int in_x = (out_x_start * stride_width) - pad_width;
-    if (in_x >= 0) {
-      break;
-    }
-  }
-  int out_x_end = output_width - 1;
-  for (; out_x_end >= 0; out_x_end--) {
-    int in_x = (out_x_end * stride_width) - pad_width;
-    int in_x_end = in_x + filter_width + (num_x_outputs - 1) * stride_width;
-    if (in_x_end <= input_width) {
-      out_x_end++;
-      break;
-    }
-  }
-  int out_y_start = 0;
-  for (; out_y_start < input_height; out_y_start++) {
-    int in_y = (out_y_start * stride_height) - pad_height;
-    if (in_y >= 0) {
-      break;
-    }
-  }
-  int out_y_end = output_height - 1;
-  for (; out_y_end >= 0; out_y_end--) {
-    int in_y = (out_y_end * stride_height) - pad_height;
-    int in_y_end = in_y + filter_height + (num_y_outputs - 1) * stride_height;
-    if (in_y_end <= input_height) {
-      out_y_end++;
-      break;
-    }
+  using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1>::Run);
+  conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1>::Run;
+  conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1>::Run;
+  conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1>::Run;
+  conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1>::Run;
+
+  if (stride_width == 2) {
+    conv_1_output_row = ConvRow3x3FilterDepth8<1, 2>::Run;
+    conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2>::Run;
+    conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2>::Run;
+    conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2>::Run;
   }
 
-  using dot_product_func_t =
-      decltype(&ConvKernel3x3FilterDepth16<1, 2, 1>::Run);
-  dot_product_func_t dot_product_func = nullptr;
+  // Allocate maximum memory needed for shuffled input.
+  // TODO(mariewhite): The size of this workspace is small enough to be
+  // allocated on the stack. Eventually we will want to move it to the heap
+  // and have it allocated outside of this function, like the im2col_array used
+  // in gemmlowp.
+#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
+  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
 
-  if (stride_width == 1 && stride_height == 1) {
-    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 1>::Run;
-  } else {
-    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 2>::Run;
-  }
+  // Make sure the kernels using this buffer will not run out of bounds.
+  static_assert(ConvRow3x3FilterDepth8<8, 1>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
+  static_assert(ConvRow3x3FilterDepth8<4, 2>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
 
-  // Offsets for preloading inputs.
-  const int i0 = 0;
-  const int i1 = input_depth;
-  const int i2 = 2 * input_depth;
-  const int i3 = input_row_width;
-  const int i4 = input_row_width + input_depth;
-  const int i5 = input_row_width + 2 * input_depth;
-  const int i6 = 2 * input_row_width;
-  const int i7 = 2 * input_row_width + input_depth;
-  const int i8 = 2 * input_row_width + 2 * input_depth;
-  const int i9 = 3 * input_row_width;
-  const int i10 = 3 * input_row_width + input_depth;
-  const int i11 = 3 * input_row_width + 2 * input_depth;
-  const int i12 = 4 * input_row_width;
-  const int i13 = 4 * input_row_width + input_depth;
-  const int i14 = 4 * input_row_width + 2 * input_depth;
+#undef DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE
 
   for (int b = 0; b < batches; ++b) {
-    const int32* bias_ptr = bias_data;
-    const uint8* filter_ptr = filter_data;
-
-    const int in_batch_offset = b * input_batch_size;
-    const int out_batch_offset = b * output_batch_size;
-
-    int depth = 0;
-    for (; depth <= output_depth - 16; depth += 16) {
-      Filter3x3x16 filter =
-          LoadFilterDepth16(filter_ptr, filter_offset, output_depth);
-
-      // Handle 1x2 outputs.
-      int out_y = out_y_start;
-      for (; out_y < out_y_end; out_y += num_y_outputs) {
-        int out_x = out_x_start;
-
-        int in_y_offset =
-            stride_height * input_row_width * (out_y + pad_height);
-        int in_x_offset = stride_width * input_depth * (out_x + pad_width);
-
-        const uint8* input_ptr =
-            input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
-
-        // Preload inputs. If input depth is large, preload every value of the
-        // input for this depth range. Otherwise, preload only the first values
-        // of each row.
-        if (input_depth >= 32) {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i1);
-          preload_l1_keep(input_ptr + i2);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i4);
-          preload_l1_keep(input_ptr + i5);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i7);
-          preload_l1_keep(input_ptr + i8);
-          preload_l1_keep(input_ptr + i9);
-          preload_l1_keep(input_ptr + i10);
-          preload_l1_keep(input_ptr + i11);
-
-          if (stride_height == 2) {
-            preload_l1_keep(input_ptr + i12);
-            preload_l1_keep(input_ptr + i13);
-            preload_l1_keep(input_ptr + i14);
-          }
-        } else {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i9);
-
-          if (stride_height == 2) {
-            preload_l1_keep(input_ptr + i12);
-          }
-        }
+    const uint8* input_ptr = input_data + b * input_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size;
 
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
-
-        for (; out_x < out_x_end; out_x += num_x_outputs) {
-          dot_product_func(filter, input_ptr, input_depth, input_offset,
-                           input_row_width, bias_ptr, output_offset,
-                           output_multiplier, output_shift,
-                           output_activation_min, output_activation_max,
-                           output_ptr, output_depth, output_width);
-
-          input_ptr += input_ptr_x_increment * num_x_outputs;
-          output_ptr += output_depth * num_x_outputs;
-
-          // Preload the next inputs depending on stride.
-          if (stride_width == 1) {
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i8);
-            preload_l1_keep(input_ptr + i11);
-          } else if (stride_width == 2) {
-            preload_l1_keep(input_ptr + i1);
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i4);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i7);
-            preload_l1_keep(input_ptr + i8);
-            preload_l1_keep(input_ptr + i10);
-            preload_l1_keep(input_ptr + i11);
-            preload_l1_keep(input_ptr + i13);
-            preload_l1_keep(input_ptr + i14);
-          }
-        }
+    int out_y = 0;
 
-        // Handle the rest of the right side.
-        for (; out_x < output_width; out_x++) {
-          // This code path can only be reached if we're handling >1 x outputs
-          // at a time or support kSame padding.
-        }
-      }
+    // Handle 8 rows at a time.
+    for (; out_y <= output_height - 8; out_y += 8) {
+      conv_8_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
 
-      // Handle the rest of the bottom side.
-      for (; out_y < output_height; out_y++) {
-        int out_x = out_x_start;
-
-        int in_y_offset =
-            stride_height * input_row_width * (out_y + pad_height);
-        int in_x_offset = stride_width * input_depth * (out_x + pad_width);
-
-        const uint8* input_ptr =
-            input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
-
-        if (input_depth >= 32) {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i1);
-          preload_l1_keep(input_ptr + i2);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i4);
-          preload_l1_keep(input_ptr + i5);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i7);
-        } else {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i6);
-        }
+      input_ptr += 8 * stride_height * input_row_size;
+      output_ptr += 8 * output_row_size;
+    }
 
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
+    // Handle 4 rows at a time.
+    for (; out_y <= output_height - 4; out_y += 4) {
+      conv_4_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
 
-        for (; out_x < output_width; out_x++) {
-          ConvKernel3x3FilterDepth16<1, 1>::Run(
-              filter, input_ptr, input_depth, input_offset, input_row_width,
-              bias_ptr, output_offset, output_multiplier, output_shift,
-              output_activation_min, output_activation_max, output_ptr,
-              output_depth, output_width);
+      input_ptr += 4 * stride_height * input_row_size;
+      output_ptr += 4 * output_row_size;
+    }
 
-          input_ptr += input_ptr_x_increment;
-          output_ptr += output_depth;
-
-          if (stride_width == 1) {
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i8);
-          } else if (stride_width == 2) {
-            preload_l1_keep(input_ptr + i1);
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i4);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i7);
-            preload_l1_keep(input_ptr + i8);
-          }
-        }
-      }
-      filter_ptr += 16;
-      bias_ptr += 16;
+    // Handle 2 rows at a time.
+    for (; out_y <= output_height - 2; out_y += 2) {
+      conv_2_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
+
+      input_ptr += 2 * stride_height * input_row_size;
+      output_ptr += 2 * output_row_size;
+    }
+
+    // Handle one row at a time.
+    for (; out_y < output_height; out_y++) {
+      conv_1_output_row(input_ptr, 0, out_y, input_depth, input_width,
+                        input_height, input_row_size, input_offset, filter_data,
+                        filter_offset, bias_data, output_offset,
+                        output_multiplier, output_shift, output_activation_min,
+                        output_activation_max, output_ptr, output_depth,
+                        output_width, shuffle_workspace);
+
+      input_ptr += stride_height * input_row_size;
+      output_ptr += output_row_size;
     }
   }
 }
-- 
GitLab


From 317cc081d620c27df464e19aea624a1e89e30fd8 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 13:35:51 -0700
Subject: [PATCH 0607/1262] Update tf.contrib.metrics with deprecations
 (#18335)

* Update tf.contrib.metrics with deprecations

This fix updates tf.contrib.metrics.streaming_mean_absolution(relative/squared)_error
with deprecation notices as they have been replaces with tf.metrics.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update streaming_mean_relative_error

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update deprecation notice for streaming_root_mean_squared_error

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix the deprecation message.

* Fix pylint `Line too long (81/80)` issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/metrics/python/ops/metric_ops.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 088319a557..2bf281b791 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2711,7 +2711,9 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       name=name)
 
 
-@deprecated(None, 'Please switch to tf.metrics.mean.')
+@deprecated(None,
+            'Please switch to tf.metrics.mean_absolute_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2830,7 +2832,9 @@ def streaming_mean_relative_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None,
+            'Please switch to tf.metrics.mean_squared_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_squared_error(predictions,
                                  labels,
                                  weights=None,
@@ -2888,7 +2892,10 @@ def streaming_mean_squared_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(
+    None,
+    'Please switch to tf.metrics.root_mean_squared_error. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_root_mean_squared_error(predictions,
                                       labels,
                                       weights=None,
-- 
GitLab


From 744a5cc092401f3725f06498058e6ba262fd697d Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 11 Apr 2018 13:46:03 -0700
Subject: [PATCH 0608/1262] When not necessary, avoid the creation of a
 `placeholder_with_default` in BN (not yet supported by TPU compilation).

PiperOrigin-RevId: 192502020
---
 tensorflow/python/keras/_impl/keras/layers/normalization.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index b73025a5a8..5462a95d7d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -489,6 +489,7 @@ class BatchNormalization(Layer):
     return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=None):
+    original_training_value = training
     if training is None:
       training = K.learning_phase()
 
@@ -512,7 +513,7 @@ class BatchNormalization(Layer):
         # Currently never reaches here since fused_batch_norm does not support
         # virtual batching
         outputs = undo_virtual_batching(outputs)
-      if not context.executing_eagerly() and training is K.learning_phase():
+      if not context.executing_eagerly() and original_training_value is None:
         outputs._uses_learning_phase = True  # pylint: disable=protected-access
       return outputs
 
@@ -628,7 +629,7 @@ class BatchNormalization(Layer):
 
     if self.virtual_batch_size is not None:
       outputs = undo_virtual_batching(outputs)
-    if not context.executing_eagerly() and training is K.learning_phase():
+    if not context.executing_eagerly() and original_training_value is None:
       outputs._uses_learning_phase = True  # pylint: disable=protected-access
     return outputs
 
-- 
GitLab


From 3fa224a453bb9d7f7f8340231adb53ba74b79b42 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 13:47:46 -0700
Subject: [PATCH 0609/1262] Factor out the syntactic function scope tracking
 into the transformer. Choosing not to do this at static analysis because it
 exposes the scope to any node, making it easier to use by any specialization
 of a transformer.

PiperOrigin-RevId: 192502309
---
 tensorflow/contrib/autograph/pyct/BUILD       | 11 +++
 .../contrib/autograph/pyct/transformer.py     | 15 +++
 .../autograph/pyct/transformer_test.py        | 97 +++++++++++++++++++
 3 files changed, 123 insertions(+)
 create mode 100644 tensorflow/contrib/autograph/pyct/transformer_test.py

diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index c483ff68c4..796ab445c7 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -125,3 +125,14 @@ py_test(
         "@gast_archive//:gast",
     ],
 )
+
+py_test(
+    name = "transformer_test",
+    srcs = ["transformer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index 35f114b6e1..b38d52c5b2 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -51,6 +51,11 @@ class Base(gast.NodeTransformer):
     self._lineno = 0
     self._col_offset = 0
     self.context = context
+    self._enclosing_entities = []
+
+  @property
+  def enclosing_entities(self):
+    return tuple(self._enclosing_entities)
 
   def debug_print(self, node):
     """Helper method useful for debugging."""
@@ -61,13 +66,20 @@ class Base(gast.NodeTransformer):
   def visit(self, node):
     source_code = self.context.source_code
     source_file = self.context.source_file
+    did_enter_function = False
+
     try:
+      if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
+        self._enclosing_entities.append(node)
+        did_enter_function = True
+
       if source_code and hasattr(node, 'lineno'):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
       if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
         return node
       return super(Base, self).visit(node)
+
     except (ValueError, AttributeError, KeyError, NotImplementedError,
             AssertionError) as e:
       msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % (
@@ -82,3 +94,6 @@ class Base(gast.NodeTransformer):
                       msg,
                       (source_file, self._lineno, self._col_offset + 1, line)),
                   sys.exc_info()[2])
+    finally:
+      if did_enter_function:
+        self._enclosing_entities.pop()
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
new file mode 100644
index 0000000000..57f1c31ef6
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -0,0 +1,97 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for templates module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.python.platform import test
+
+
+class TransformerTest(test.TestCase):
+
+  def test_entity_scope_tracking(self):
+
+    class TestTransformer(transformer.Base):
+
+      # The choice of note to assign to is arbitrary. Using Assign because it's
+      # easy to find in the tree.
+      def visit_Assign(self, node):
+        anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
+        return self.generic_visit(node)
+
+      # This will show up in the lambda function.
+      def visit_BinOp(self, node):
+        anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
+        return self.generic_visit(node)
+
+    tr = TestTransformer(
+        context.EntityContext(
+            namer=None,
+            source_code=None,
+            source_file=None,
+            namespace=None,
+            arg_values=None,
+            arg_types=None,
+            owner_type=None,
+            recursive=False))
+
+    def test_function():
+      a = 0
+
+      class TestClass(object):
+
+        def test_method(self):
+          b = 0
+          def inner_function(x):
+            c = 0
+            d = lambda y: (x + y)
+            return c, d
+          return b, inner_function
+      return a, TestClass
+
+    node, _ = parser.parse_entity(test_function)
+    node = tr.visit(node)
+
+    test_function_node = node.body[0]
+    test_class = test_function_node.body[1]
+    test_method = test_class.body[0]
+    inner_function = test_method.body[1]
+    lambda_node = inner_function.body[1].value
+
+    a = test_function_node.body[0]
+    b = test_method.body[0]
+    c = inner_function.body[0]
+    lambda_expr = lambda_node.body
+
+    self.assertEqual(
+        (test_function_node,), anno.getanno(a, 'enclosing_entities'))
+    self.assertEqual((test_function_node, test_class, test_method),
+                     anno.getanno(b, 'enclosing_entities'))
+    self.assertEqual(
+        (test_function_node, test_class, test_method, inner_function),
+        anno.getanno(c, 'enclosing_entities'))
+    self.assertEqual((test_function_node, test_class, test_method,
+                      inner_function, lambda_node),
+                     anno.getanno(lambda_expr, 'enclosing_entities'))
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 1a36eb1550639b22fa884ccf7511bf8cd65cca95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 13:48:43 -0700
Subject: [PATCH 0610/1262] Replace examples/image_retraining by a pointer to
 TensorFlow Hub.

https://github.com/tensorflow/hub/tree/master/examples/image_retraining
has the same tool, upgraded to use TensorFlow Hub instead of raw graph defs.

PiperOrigin-RevId: 192502469
---
 tensorflow/examples/image_retraining/BUILD    |   51 -
 .../examples/image_retraining/README.md       |   21 +-
 .../examples/image_retraining/__init__.py     |    0
 .../examples/image_retraining/data/labels.txt |    3 -
 .../examples/image_retraining/retrain.py      | 1487 -----------------
 .../examples/image_retraining/retrain_test.py |  148 --
 6 files changed, 12 insertions(+), 1698 deletions(-)
 delete mode 100644 tensorflow/examples/image_retraining/BUILD
 delete mode 100644 tensorflow/examples/image_retraining/__init__.py
 delete mode 100644 tensorflow/examples/image_retraining/data/labels.txt
 delete mode 100644 tensorflow/examples/image_retraining/retrain.py
 delete mode 100644 tensorflow/examples/image_retraining/retrain_test.py

diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
deleted file mode 100644
index ecd79a3b00..0000000000
--- a/tensorflow/examples/image_retraining/BUILD
+++ /dev/null
@@ -1,51 +0,0 @@
-# Description:
-# Transfer learning example for TensorFlow.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-py_binary(
-    name = "retrain",
-    srcs = [
-        "retrain.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "retrain_test",
-    size = "small",
-    srcs = [
-        "retrain.py",
-        "retrain_test.py",
-    ],
-    data = [
-        ":data/labels.txt",
-        "//tensorflow/examples/label_image:data/grace_hopper.jpg",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":retrain",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/examples/image_retraining/README.md b/tensorflow/examples/image_retraining/README.md
index 8a49525c6e..3f0b3d1268 100644
--- a/tensorflow/examples/image_retraining/README.md
+++ b/tensorflow/examples/image_retraining/README.md
@@ -1,12 +1,15 @@
-retrain.py is an example script that shows how one can adapt a pretrained
-network for other classification problems. A detailed overview of this script
-can be found at:
-https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0
+**NOTE: This code has moved to**
+https://github.com/tensorflow/hub/tree/master/examples/image_retraining
 
-The script also shows how one can train layers
-with quantized weights and activations instead of taking a pre-trained floating
-point model and then quantizing weights and activations.
-The output graphdef produced by this script is compatible with the TensorFlow
-Lite Optimizing Converter and can be converted to TFLite format.
+retrain.py is an example script that shows how one can adapt a pretrained
+network for other classification problems (including use with TFLite and
+quantization).
 
+As of TensorFlow 1.7, it is recommended to use a pretrained network from
+TensorFlow Hub, using the new version of this example found in the location
+above, as explained in TensorFlow's revised [image retraining
+tutorial](https://www.tensorflow.org/tutorials/image_retraining).
 
+Older versions of this example (using frozen GraphDefs instead of
+TensorFlow Hub modules) are available in the release branches of
+TensorFlow versions up to and including 1.7.
diff --git a/tensorflow/examples/image_retraining/__init__.py b/tensorflow/examples/image_retraining/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tensorflow/examples/image_retraining/data/labels.txt b/tensorflow/examples/image_retraining/data/labels.txt
deleted file mode 100644
index bc1131ac45..0000000000
--- a/tensorflow/examples/image_retraining/data/labels.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Runner-up
-Winner
-Loser
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
deleted file mode 100644
index fcc191250f..0000000000
--- a/tensorflow/examples/image_retraining/retrain.py
+++ /dev/null
@@ -1,1487 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Simple transfer learning with Inception v3 or Mobilenet models.
-
-With support for TensorBoard.
-
-This example shows how to take a Inception v3 or Mobilenet model trained on
-ImageNet images, and train a new top layer that can recognize other classes of
-images.
-
-The top layer receives as input a 2048-dimensional vector (1001-dimensional for
-Mobilenet) for each image. We train a softmax layer on top of this
-representation. Assuming the softmax layer contains N labels, this corresponds
-to learning N + 2048*N (or 1001*N)  model parameters corresponding to the
-learned biases and weights.
-
-Here's an example, which assumes you have a folder containing class-named
-subfolders, each full of images for each label. The example folder flower_photos
-should have a structure like this:
-
-~/flower_photos/daisy/photo1.jpg
-~/flower_photos/daisy/photo2.jpg
-...
-~/flower_photos/rose/anotherphoto77.jpg
-...
-~/flower_photos/sunflower/somepicture.jpg
-
-The subfolder names are important, since they define what label is applied to
-each image, but the filenames themselves don't matter. Once your images are
-prepared, you can run the training with a command like this:
-
-```bash
-bazel build tensorflow/examples/image_retraining:retrain && \
-bazel-bin/tensorflow/examples/image_retraining/retrain \
-    --image_dir ~/flower_photos
-```
-
-Or, if you have a pip installation of tensorflow, `retrain.py` can be run
-without bazel:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos
-```
-
-You can replace the image_dir argument with any folder containing subfolders of
-images. The label for each image is taken from the name of the subfolder it's
-in.
-
-This produces a new model file that can be loaded and run by any TensorFlow
-program, for example the label_image sample code.
-
-By default this script will use the high accuracy, but comparatively large and
-slow Inception v3 model architecture. It's recommended that you start with this
-to validate that you have gathered good training data, but if you want to deploy
-on resource-limited platforms, you can try the `--architecture` flag with a
-Mobilenet model. For example:
-
-Run floating-point version of mobilenet:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_1.0_224
-```
-
-Run mobilenet, instrumented for quantization:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos/   --architecture mobilenet_1.0_224_quant
-```
-
-These instrumented models can be converted to fully quantized mobile models via
-TensorFlow Lite.
-
-There are 32 different Mobilenet models to choose from, with a variety of file
-size and latency options. The first number can be '1.0', '0.75', '0.50', or
-'0.25' to control the size, and the second controls the input image size, either
-'224', '192', '160', or '128', with smaller sizes running faster. See
-https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
-for more information on Mobilenet.
-
-To use with TensorBoard:
-
-By default, this script will log summaries to /tmp/retrain_logs directory
-
-Visualize the summaries with this command:
-
-tensorboard --logdir /tmp/retrain_logs
-
-To use with Tensorflow Serving:
-
-```bash
-tensorflow_model_server --port=9000 --model_name=inception \
-    --model_base_path=/tmp/saved_models/
-```
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-from datetime import datetime
-import hashlib
-import os.path
-import random
-import re
-import sys
-import tarfile
-
-import numpy as np
-from six.moves import urllib
-import tensorflow as tf
-
-from tensorflow.python.framework import graph_util
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.platform import gfile
-from tensorflow.python.util import compat
-
-FLAGS = None
-
-# These are all parameters that are tied to the particular model architecture
-# we're using for Inception v3. These include things like tensor names and their
-# sizes. If you want to adapt this script to work with another model, you will
-# need to update these to reflect the values in the network you're using.
-MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1  # ~134M
-
-# The location where variable checkpoints will be stored.
-CHECKPOINT_NAME = '/tmp/_retrain_checkpoint'
-
-
-def create_image_lists(image_dir, testing_percentage, validation_percentage):
-  """Builds a list of training images from the file system.
-
-  Analyzes the sub folders in the image directory, splits them into stable
-  training, testing, and validation sets, and returns a data structure
-  describing the lists of images for each label and their paths.
-
-  Args:
-    image_dir: String path to a folder containing subfolders of images.
-    testing_percentage: Integer percentage of the images to reserve for tests.
-    validation_percentage: Integer percentage of images reserved for validation.
-
-  Returns:
-    A dictionary containing an entry for each label subfolder, with images split
-    into training, testing, and validation sets within each label.
-  """
-  if not gfile.Exists(image_dir):
-    tf.logging.error("Image directory '" + image_dir + "' not found.")
-    return None
-  result = {}
-  sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
-  # The root directory comes first, so skip it.
-  is_root_dir = True
-  for sub_dir in sub_dirs:
-    if is_root_dir:
-      is_root_dir = False
-      continue
-    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
-    file_list = []
-    dir_name = os.path.basename(sub_dir)
-    if dir_name == image_dir:
-      continue
-    tf.logging.info("Looking for images in '" + dir_name + "'")
-    for extension in extensions:
-      file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
-      file_list.extend(gfile.Glob(file_glob))
-    if not file_list:
-      tf.logging.warning('No files found')
-      continue
-    if len(file_list) < 20:
-      tf.logging.warning(
-          'WARNING: Folder has less than 20 images, which may cause issues.')
-    elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
-      tf.logging.warning(
-          'WARNING: Folder {} has more than {} images. Some images will '
-          'never be selected.'.format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
-    label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
-    training_images = []
-    testing_images = []
-    validation_images = []
-    for file_name in file_list:
-      base_name = os.path.basename(file_name)
-      # We want to ignore anything after '_nohash_' in the file name when
-      # deciding which set to put an image in, the data set creator has a way of
-      # grouping photos that are close variations of each other. For example
-      # this is used in the plant disease data set to group multiple pictures of
-      # the same leaf.
-      hash_name = re.sub(r'_nohash_.*$', '', file_name)
-      # This looks a bit magical, but we need to decide whether this file should
-      # go into the training, testing, or validation sets, and we want to keep
-      # existing files in the same set even if more files are subsequently
-      # added.
-      # To do that, we need a stable way of deciding based on just the file name
-      # itself, so we do a hash of that and then use that to generate a
-      # probability value that we use to assign it.
-      hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
-      percentage_hash = ((int(hash_name_hashed, 16) %
-                          (MAX_NUM_IMAGES_PER_CLASS + 1)) *
-                         (100.0 / MAX_NUM_IMAGES_PER_CLASS))
-      if percentage_hash < validation_percentage:
-        validation_images.append(base_name)
-      elif percentage_hash < (testing_percentage + validation_percentage):
-        testing_images.append(base_name)
-      else:
-        training_images.append(base_name)
-    result[label_name] = {
-        'dir': dir_name,
-        'training': training_images,
-        'testing': testing_images,
-        'validation': validation_images,
-    }
-  return result
-
-
-def get_image_path(image_lists, label_name, index, image_dir, category):
-  """"Returns a path to an image for a label at the given index.
-
-  Args:
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Int offset of the image we want. This will be moduloed by the
-    available number of images for the label, so it can be arbitrarily large.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    category: Name string of set to pull images from - training, testing, or
-    validation.
-
-  Returns:
-    File system path string to an image that meets the requested parameters.
-
-  """
-  if label_name not in image_lists:
-    tf.logging.fatal('Label does not exist %s.', label_name)
-  label_lists = image_lists[label_name]
-  if category not in label_lists:
-    tf.logging.fatal('Category does not exist %s.', category)
-  category_list = label_lists[category]
-  if not category_list:
-    tf.logging.fatal('Label %s has no images in the category %s.',
-                     label_name, category)
-  mod_index = index % len(category_list)
-  base_name = category_list[mod_index]
-  sub_dir = label_lists['dir']
-  full_path = os.path.join(image_dir, sub_dir, base_name)
-  return full_path
-
-
-def get_bottleneck_path(image_lists, label_name, index, bottleneck_dir,
-                        category, architecture):
-  """"Returns a path to a bottleneck file for a label at the given index.
-
-  Args:
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Integer offset of the image we want. This will be moduloed by the
-    available number of images for the label, so it can be arbitrarily large.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    category: Name string of set to pull images from - training, testing, or
-    validation.
-    architecture: The name of the model architecture.
-
-  Returns:
-    File system path string to an image that meets the requested parameters.
-  """
-  return get_image_path(image_lists, label_name, index, bottleneck_dir,
-                        category) + '_' + architecture + '.txt'
-
-
-def create_model_graph(model_info):
-  """"Creates a graph from saved GraphDef file and returns a Graph object.
-
-  Args:
-    model_info: Dictionary containing information about the model architecture.
-
-  Returns:
-    Graph holding the trained Inception network, and various tensors we'll be
-    manipulating.
-  """
-  with tf.Graph().as_default() as graph:
-    model_path = os.path.join(FLAGS.model_dir, model_info['model_file_name'])
-    print('Model path: ', model_path)
-    with gfile.FastGFile(model_path, 'rb') as f:
-      graph_def = tf.GraphDef()
-      graph_def.ParseFromString(f.read())
-      bottleneck_tensor, resized_input_tensor = (tf.import_graph_def(
-          graph_def,
-          name='',
-          return_elements=[
-              model_info['bottleneck_tensor_name'],
-              model_info['resized_input_tensor_name'],
-          ]))
-  return graph, bottleneck_tensor, resized_input_tensor
-
-
-def run_bottleneck_on_image(sess, image_data, image_data_tensor,
-                            decoded_image_tensor, resized_input_tensor,
-                            bottleneck_tensor):
-  """Runs inference on an image to extract the 'bottleneck' summary layer.
-
-  Args:
-    sess: Current active TensorFlow Session.
-    image_data: String of raw JPEG data.
-    image_data_tensor: Input data layer in the graph.
-    decoded_image_tensor: Output of initial image resizing and preprocessing.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: Layer before the final softmax.
-
-  Returns:
-    Numpy array of bottleneck values.
-  """
-  # First decode the JPEG image, resize it, and rescale the pixel values.
-  resized_input_values = sess.run(decoded_image_tensor,
-                                  {image_data_tensor: image_data})
-  # Then run it through the recognition network.
-  bottleneck_values = sess.run(bottleneck_tensor,
-                               {resized_input_tensor: resized_input_values})
-  bottleneck_values = np.squeeze(bottleneck_values)
-  return bottleneck_values
-
-
-def maybe_download_and_extract(data_url):
-  """Download and extract model tar file.
-
-  If the pretrained model we're using doesn't already exist, this function
-  downloads it from the TensorFlow.org website and unpacks it into a directory.
-
-  Args:
-    data_url: Web location of the tar file containing the pretrained model.
-  """
-  dest_directory = FLAGS.model_dir
-  if not os.path.exists(dest_directory):
-    os.makedirs(dest_directory)
-  filename = data_url.split('/')[-1]
-  filepath = os.path.join(dest_directory, filename)
-  if not os.path.exists(filepath):
-
-    def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' %
-                       (filename,
-                        float(count * block_size) / float(total_size) * 100.0))
-      sys.stdout.flush()
-
-    filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
-    print()
-    statinfo = os.stat(filepath)
-    tf.logging.info('Successfully downloaded %s %d bytes.', filename,
-                    statinfo.st_size)
-    print('Extracting file from ', filepath)
-    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
-  else:
-    print('Not extracting or downloading files, model already present in disk')
-
-
-def ensure_dir_exists(dir_name):
-  """Makes sure the folder exists on disk.
-
-  Args:
-    dir_name: Path string to the folder we want to create.
-  """
-  if not os.path.exists(dir_name):
-    os.makedirs(dir_name)
-
-
-bottleneck_path_2_bottleneck_values = {}
-
-
-def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor):
-  """Create a single bottleneck file."""
-  tf.logging.info('Creating bottleneck at ' + bottleneck_path)
-  image_path = get_image_path(image_lists, label_name, index,
-                              image_dir, category)
-  if not gfile.Exists(image_path):
-    tf.logging.fatal('File does not exist %s', image_path)
-  image_data = gfile.FastGFile(image_path, 'rb').read()
-  try:
-    bottleneck_values = run_bottleneck_on_image(
-        sess, image_data, jpeg_data_tensor, decoded_image_tensor,
-        resized_input_tensor, bottleneck_tensor)
-  except Exception as e:
-    raise RuntimeError('Error during processing file %s (%s)' % (image_path,
-                                                                 str(e)))
-  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
-  with open(bottleneck_path, 'w') as bottleneck_file:
-    bottleneck_file.write(bottleneck_string)
-
-
-def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
-                             category, bottleneck_dir, jpeg_data_tensor,
-                             decoded_image_tensor, resized_input_tensor,
-                             bottleneck_tensor, architecture):
-  """Retrieves or calculates bottleneck values for an image.
-
-  If a cached version of the bottleneck data exists on-disk, return that,
-  otherwise calculate the data and save it to disk for future use.
-
-  Args:
-    sess: The current active TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Integer offset of the image we want. This will be modulo-ed by the
-    available number of images for the label, so it can be arbitrarily large.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    category: Name string of which set to pull images from - training, testing,
-    or validation.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    jpeg_data_tensor: The tensor to feed loaded jpeg data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The output tensor for the bottleneck values.
-    architecture: The name of the model architecture.
-
-  Returns:
-    Numpy array of values produced by the bottleneck layer for the image.
-  """
-  label_lists = image_lists[label_name]
-  sub_dir = label_lists['dir']
-  sub_dir_path = os.path.join(bottleneck_dir, sub_dir)
-  ensure_dir_exists(sub_dir_path)
-  bottleneck_path = get_bottleneck_path(image_lists, label_name, index,
-                                        bottleneck_dir, category, architecture)
-  if not os.path.exists(bottleneck_path):
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor)
-  with open(bottleneck_path, 'r') as bottleneck_file:
-    bottleneck_string = bottleneck_file.read()
-  did_hit_error = False
-  try:
-    bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  except ValueError:
-    tf.logging.warning('Invalid float found, recreating bottleneck')
-    did_hit_error = True
-  if did_hit_error:
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor)
-    with open(bottleneck_path, 'r') as bottleneck_file:
-      bottleneck_string = bottleneck_file.read()
-    # Allow exceptions to propagate here, since they shouldn't happen after a
-    # fresh creation
-    bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  return bottleneck_values
-
-
-def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
-                      jpeg_data_tensor, decoded_image_tensor,
-                      resized_input_tensor, bottleneck_tensor, architecture):
-  """Ensures all the training, testing, and validation bottlenecks are cached.
-
-  Because we're likely to read the same image multiple times (if there are no
-  distortions applied during training) it can speed things up a lot if we
-  calculate the bottleneck layer values once for each image during
-  preprocessing, and then just read those cached values repeatedly during
-  training. Here we go through all the images we've found, calculate those
-  values, and save them off.
-
-  Args:
-    sess: The current active TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    jpeg_data_tensor: Input tensor for jpeg data from file.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The penultimate output layer of the graph.
-    architecture: The name of the model architecture.
-
-  Returns:
-    Nothing.
-  """
-  how_many_bottlenecks = 0
-  ensure_dir_exists(bottleneck_dir)
-  for label_name, label_lists in image_lists.items():
-    for category in ['training', 'testing', 'validation']:
-      category_list = label_lists[category]
-      for index, unused_base_name in enumerate(category_list):
-        get_or_create_bottleneck(
-            sess, image_lists, label_name, index, image_dir, category,
-            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-            resized_input_tensor, bottleneck_tensor, architecture)
-
-        how_many_bottlenecks += 1
-        if how_many_bottlenecks % 100 == 0:
-          tf.logging.info(
-              str(how_many_bottlenecks) + ' bottleneck files created.')
-
-
-def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
-                                  bottleneck_dir, image_dir, jpeg_data_tensor,
-                                  decoded_image_tensor, resized_input_tensor,
-                                  bottleneck_tensor, architecture):
-  """Retrieves bottleneck values for cached images.
-
-  If no distortions are being applied, this function can retrieve the cached
-  bottleneck values directly from disk for images. It picks a random set of
-  images from the specified category.
-
-  Args:
-    sess: Current TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    how_many: If positive, a random sample of this size will be chosen.
-    If negative, all bottlenecks will be retrieved.
-    category: Name string of which set to pull from - training, testing, or
-    validation.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    jpeg_data_tensor: The layer to feed jpeg image data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-    architecture: The name of the model architecture.
-
-  Returns:
-    List of bottleneck arrays, their corresponding ground truths, and the
-    relevant filenames.
-  """
-  class_count = len(image_lists.keys())
-  bottlenecks = []
-  ground_truths = []
-  filenames = []
-  if how_many >= 0:
-    # Retrieve a random sample of bottlenecks.
-    for unused_i in range(how_many):
-      label_index = random.randrange(class_count)
-      label_name = list(image_lists.keys())[label_index]
-      image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
-      image_name = get_image_path(image_lists, label_name, image_index,
-                                  image_dir, category)
-      bottleneck = get_or_create_bottleneck(
-          sess, image_lists, label_name, image_index, image_dir, category,
-          bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-          resized_input_tensor, bottleneck_tensor, architecture)
-      bottlenecks.append(bottleneck)
-      ground_truths.append(label_index)
-      filenames.append(image_name)
-  else:
-    # Retrieve all bottlenecks.
-    for label_index, label_name in enumerate(image_lists.keys()):
-      for image_index, image_name in enumerate(
-          image_lists[label_name][category]):
-        image_name = get_image_path(image_lists, label_name, image_index,
-                                    image_dir, category)
-        bottleneck = get_or_create_bottleneck(
-            sess, image_lists, label_name, image_index, image_dir, category,
-            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-            resized_input_tensor, bottleneck_tensor, architecture)
-        bottlenecks.append(bottleneck)
-        ground_truths.append(label_index)
-        filenames.append(image_name)
-  return bottlenecks, ground_truths, filenames
-
-
-def get_random_distorted_bottlenecks(
-    sess, image_lists, how_many, category, image_dir, input_jpeg_tensor,
-    distorted_image, resized_input_tensor, bottleneck_tensor):
-  """Retrieves bottleneck values for training images, after distortions.
-
-  If we're training with distortions like crops, scales, or flips, we have to
-  recalculate the full model for every image, and so we can't use cached
-  bottleneck values. Instead we find random images for the requested category,
-  run them through the distortion graph, and then the full graph to get the
-  bottleneck results for each.
-
-  Args:
-    sess: Current TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    how_many: The integer number of bottleneck values to return.
-    category: Name string of which set of images to fetch - training, testing,
-    or validation.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    input_jpeg_tensor: The input layer we feed the image data to.
-    distorted_image: The output node of the distortion graph.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-
-  Returns:
-    List of bottleneck arrays and their corresponding ground truths.
-  """
-  class_count = len(image_lists.keys())
-  bottlenecks = []
-  ground_truths = []
-  for unused_i in range(how_many):
-    label_index = random.randrange(class_count)
-    label_name = list(image_lists.keys())[label_index]
-    image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
-    image_path = get_image_path(image_lists, label_name, image_index, image_dir,
-                                category)
-    if not gfile.Exists(image_path):
-      tf.logging.fatal('File does not exist %s', image_path)
-    jpeg_data = gfile.FastGFile(image_path, 'rb').read()
-    # Note that we materialize the distorted_image_data as a numpy array before
-    # sending running inference on the image. This involves 2 memory copies and
-    # might be optimized in other implementations.
-    distorted_image_data = sess.run(distorted_image,
-                                    {input_jpeg_tensor: jpeg_data})
-    bottleneck_values = sess.run(bottleneck_tensor,
-                                 {resized_input_tensor: distorted_image_data})
-    bottleneck_values = np.squeeze(bottleneck_values)
-    bottlenecks.append(bottleneck_values)
-    ground_truths.append(label_index)
-  return bottlenecks, ground_truths
-
-
-def should_distort_images(flip_left_right, random_crop, random_scale,
-                          random_brightness):
-  """Whether any distortions are enabled, from the input flags.
-
-  Args:
-    flip_left_right: Boolean whether to randomly mirror images horizontally.
-    random_crop: Integer percentage setting the total margin used around the
-    crop box.
-    random_scale: Integer percentage of how much to vary the scale by.
-    random_brightness: Integer range to randomly multiply the pixel values by.
-
-  Returns:
-    Boolean value indicating whether any distortions should be applied.
-  """
-  return (flip_left_right or (random_crop != 0) or (random_scale != 0) or
-          (random_brightness != 0))
-
-
-def add_input_distortions(flip_left_right, random_crop, random_scale,
-                          random_brightness, input_width, input_height,
-                          input_depth, input_mean, input_std):
-  """Creates the operations to apply the specified distortions.
-
-  During training it can help to improve the results if we run the images
-  through simple distortions like crops, scales, and flips. These reflect the
-  kind of variations we expect in the real world, and so can help train the
-  model to cope with natural data more effectively. Here we take the supplied
-  parameters and construct a network of operations to apply them to an image.
-
-  Cropping
-  ~~~~~~~~
-
-  Cropping is done by placing a bounding box at a random position in the full
-  image. The cropping parameter controls the size of that box relative to the
-  input image. If it's zero, then the box is the same size as the input and no
-  cropping is performed. If the value is 50%, then the crop box will be half the
-  width and height of the input. In a diagram it looks like this:
-
-  <       width         >
-  +---------------------+
-  |                     |
-  |   width - crop%     |
-  |    <      >         |
-  |    +------+         |
-  |    |      |         |
-  |    |      |         |
-  |    |      |         |
-  |    +------+         |
-  |                     |
-  |                     |
-  +---------------------+
-
-  Scaling
-  ~~~~~~~
-
-  Scaling is a lot like cropping, except that the bounding box is always
-  centered and its size varies randomly within the given range. For example if
-  the scale percentage is zero, then the bounding box is the same size as the
-  input and no scaling is applied. If it's 50%, then the bounding box will be in
-  a random range between half the width and height and full size.
-
-  Args:
-    flip_left_right: Boolean whether to randomly mirror images horizontally.
-    random_crop: Integer percentage setting the total margin used around the
-    crop box.
-    random_scale: Integer percentage of how much to vary the scale by.
-    random_brightness: Integer range to randomly multiply the pixel values by.
-    graph.
-    input_width: Horizontal size of expected input image to model.
-    input_height: Vertical size of expected input image to model.
-    input_depth: How many channels the expected input image should have.
-    input_mean: Pixel value that should be zero in the image for the graph.
-    input_std: How much to divide the pixel values by before recognition.
-
-  Returns:
-    The jpeg input layer and the distorted result tensor.
-  """
-
-  jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
-  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
-  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
-  margin_scale = 1.0 + (random_crop / 100.0)
-  resize_scale = 1.0 + (random_scale / 100.0)
-  margin_scale_value = tf.constant(margin_scale)
-  resize_scale_value = tf.random_uniform(tensor_shape.scalar(),
-                                         minval=1.0,
-                                         maxval=resize_scale)
-  scale_value = tf.multiply(margin_scale_value, resize_scale_value)
-  precrop_width = tf.multiply(scale_value, input_width)
-  precrop_height = tf.multiply(scale_value, input_height)
-  precrop_shape = tf.stack([precrop_height, precrop_width])
-  precrop_shape_as_int = tf.cast(precrop_shape, dtype=tf.int32)
-  precropped_image = tf.image.resize_bilinear(decoded_image_4d,
-                                              precrop_shape_as_int)
-  precropped_image_3d = tf.squeeze(precropped_image, squeeze_dims=[0])
-  cropped_image = tf.random_crop(precropped_image_3d,
-                                 [input_height, input_width, input_depth])
-  if flip_left_right:
-    flipped_image = tf.image.random_flip_left_right(cropped_image)
-  else:
-    flipped_image = cropped_image
-  brightness_min = 1.0 - (random_brightness / 100.0)
-  brightness_max = 1.0 + (random_brightness / 100.0)
-  brightness_value = tf.random_uniform(tensor_shape.scalar(),
-                                       minval=brightness_min,
-                                       maxval=brightness_max)
-  brightened_image = tf.multiply(flipped_image, brightness_value)
-  offset_image = tf.subtract(brightened_image, input_mean)
-  mul_image = tf.multiply(offset_image, 1.0 / input_std)
-  distort_result = tf.expand_dims(mul_image, 0, name='DistortResult')
-  return jpeg_data, distort_result
-
-
-def variable_summaries(var):
-  """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
-  with tf.name_scope('summaries'):
-    mean = tf.reduce_mean(var)
-    tf.summary.scalar('mean', mean)
-    with tf.name_scope('stddev'):
-      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
-    tf.summary.scalar('stddev', stddev)
-    tf.summary.scalar('max', tf.reduce_max(var))
-    tf.summary.scalar('min', tf.reduce_min(var))
-    tf.summary.histogram('histogram', var)
-
-
-def add_final_retrain_ops(class_count, final_tensor_name, bottleneck_tensor,
-                          bottleneck_tensor_size, quantize_layer, is_training):
-  """Adds a new softmax and fully-connected layer for training and eval.
-
-  We need to retrain the top layer to identify our new classes, so this function
-  adds the right operations to the graph, along with some variables to hold the
-  weights, and then sets up all the gradients for the backward pass.
-
-  The set up for the softmax and fully-connected layers is based on:
-  https://www.tensorflow.org/versions/master/tutorials/mnist/beginners/index.html
-
-  Args:
-    class_count: Integer of how many categories of things we're trying to
-        recognize.
-    final_tensor_name: Name string for the new final node that produces results.
-    bottleneck_tensor: The output of the main CNN graph.
-    bottleneck_tensor_size: How many entries in the bottleneck vector.
-    quantize_layer: Boolean, specifying whether the newly added layer should be
-        instrumented for quantized.
-    is_training: Boolean, specifying whether the newly add layer is for training
-        or eval.
-
-  Returns:
-    The tensors for the training and cross entropy results, and tensors for the
-    bottleneck input and ground truth input.
-  """
-  with tf.name_scope('input'):
-    bottleneck_input = tf.placeholder_with_default(
-        bottleneck_tensor,
-        shape=[None, bottleneck_tensor_size],
-        name='BottleneckInputPlaceholder')
-
-    ground_truth_input = tf.placeholder(
-        tf.int64, [None], name='GroundTruthInput')
-
-  # Organizing the following ops so they are easier to see in TensorBoard.
-  layer_name = 'final_retrain_ops'
-  with tf.name_scope(layer_name):
-    with tf.name_scope('weights'):
-      initial_value = tf.truncated_normal(
-          [bottleneck_tensor_size, class_count], stddev=0.001)
-      layer_weights = tf.Variable(initial_value, name='final_weights')
-      variable_summaries(layer_weights)
-
-    with tf.name_scope('biases'):
-      layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
-      variable_summaries(layer_biases)
-
-    with tf.name_scope('Wx_plus_b'):
-      logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
-      tf.summary.histogram('pre_activations', logits)
-
-  final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
-
-  # The tf.contrib.quantize functions rewrite the graph in place for
-  # quantization. The imported model graph has already been rewritten, so upon
-  # calling these rewrites, only the newly added final layer will be
-  # transformed.
-  if quantize_layer:
-    if is_training:
-      tf.contrib.quantize.create_training_graph()
-    else:
-      tf.contrib.quantize.create_eval_graph()
-
-  tf.summary.histogram('activations', final_tensor)
-
-  # If this is an eval graph, we don't need to add loss ops or an optimizer.
-  if not is_training:
-    return None, None, bottleneck_input, ground_truth_input, final_tensor
-
-  with tf.name_scope('cross_entropy'):
-    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
-        labels=ground_truth_input, logits=logits)
-
-  tf.summary.scalar('cross_entropy', cross_entropy_mean)
-
-  with tf.name_scope('train'):
-    optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
-    train_step = optimizer.minimize(cross_entropy_mean)
-
-  return (train_step, cross_entropy_mean, bottleneck_input, ground_truth_input,
-          final_tensor)
-
-
-def add_evaluation_step(result_tensor, ground_truth_tensor):
-  """Inserts the operations we need to evaluate the accuracy of our results.
-
-  Args:
-    result_tensor: The new final node that produces results.
-    ground_truth_tensor: The node we feed ground truth data
-    into.
-
-  Returns:
-    Tuple of (evaluation step, prediction).
-  """
-  with tf.name_scope('accuracy'):
-    with tf.name_scope('correct_prediction'):
-      prediction = tf.argmax(result_tensor, 1)
-      correct_prediction = tf.equal(prediction, ground_truth_tensor)
-    with tf.name_scope('accuracy'):
-      evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  tf.summary.scalar('accuracy', evaluation_step)
-  return evaluation_step, prediction
-
-
-def run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
-                   decoded_image_tensor, resized_image_tensor,
-                   bottleneck_tensor):
-  """Runs a final evaluation on an eval graph using the test data set.
-
-  Args:
-    sess: Session for the train graph.
-    model_info: Model info dictionary from create_model_info()
-    class_count: Number of classes
-    image_lists: Dictionary of training images for each label.
-    jpeg_data_tensor: The layer to feed jpeg image data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_image_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-  """
-  test_bottlenecks, test_ground_truth, test_filenames = (
-      get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
-                                    'testing', FLAGS.bottleneck_dir,
-                                    FLAGS.image_dir, jpeg_data_tensor,
-                                    decoded_image_tensor, resized_image_tensor,
-                                    bottleneck_tensor, FLAGS.architecture))
-
-  (sess, bottleneck_input, ground_truth_input, evaluation_step,
-   prediction) = build_eval_session(model_info, class_count)
-
-  test_accuracy, predictions = sess.run(
-      [evaluation_step, prediction],
-      feed_dict={
-          bottleneck_input: test_bottlenecks,
-          ground_truth_input: test_ground_truth
-      })
-  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
-                  (test_accuracy * 100, len(test_bottlenecks)))
-
-  if FLAGS.print_misclassified_test_images:
-    tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
-    for i, test_filename in enumerate(test_filenames):
-      if predictions[i] != test_ground_truth[i]:
-        tf.logging.info('%70s  %s' % (test_filename,
-                                      list(image_lists.keys())[predictions[i]]))
-
-
-def build_eval_session(model_info, class_count):
-  """Builds an restored eval session without train operations for exporting.
-
-  Args:
-    model_info: Model info dictionary from create_model_info()
-    class_count: Number of classes
-
-  Returns:
-    Eval session containing the restored eval graph.
-    The bottleneck input, ground truth, eval step, and prediction tensors.
-  """
-  # If quantized, we need to create the correct eval graph for exporting.
-  eval_graph, bottleneck_tensor, _ = create_model_graph(model_info)
-
-  eval_sess = tf.Session(graph=eval_graph)
-  with eval_graph.as_default():
-    # Add the new layer for exporting.
-    (_, _, bottleneck_input,
-     ground_truth_input, final_tensor) = add_final_retrain_ops(
-         class_count, FLAGS.final_tensor_name, bottleneck_tensor,
-         model_info['bottleneck_tensor_size'], model_info['quantize_layer'],
-         False)
-
-    # Now we need to restore the values from the training graph to the eval
-    # graph.
-    tf.train.Saver().restore(eval_sess, CHECKPOINT_NAME)
-
-    evaluation_step, prediction = add_evaluation_step(final_tensor,
-                                                      ground_truth_input)
-
-  return (eval_sess, bottleneck_input, ground_truth_input, evaluation_step,
-          prediction)
-
-
-def save_graph_to_file(graph, graph_file_name, model_info, class_count):
-  """Saves an graph to file, creating a valid quantized one if necessary."""
-  sess, _, _, _, _ = build_eval_session(model_info, class_count)
-  graph = sess.graph
-
-  output_graph_def = graph_util.convert_variables_to_constants(
-      sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-
-  with gfile.FastGFile(graph_file_name, 'wb') as f:
-    f.write(output_graph_def.SerializeToString())
-
-
-def prepare_file_system():
-  # Setup the directory we'll write summaries to for TensorBoard
-  if tf.gfile.Exists(FLAGS.summaries_dir):
-    tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
-  tf.gfile.MakeDirs(FLAGS.summaries_dir)
-  if FLAGS.intermediate_store_frequency > 0:
-    ensure_dir_exists(FLAGS.intermediate_output_graphs_dir)
-  return
-
-
-def create_model_info(architecture):
-  """Given the name of a model architecture, returns information about it.
-
-  There are different base image recognition pretrained models that can be
-  retrained using transfer learning, and this function translates from the name
-  of a model to the attributes that are needed to download and train with it.
-
-  Args:
-    architecture: Name of a model architecture.
-
-  Returns:
-    Dictionary of information about the model, or None if the name isn't
-    recognized
-
-  Raises:
-    ValueError: If architecture name is unknown.
-  """
-  architecture = architecture.lower()
-  is_quantized = False
-  if architecture == 'inception_v3':
-    # pylint: disable=line-too-long
-    data_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
-    # pylint: enable=line-too-long
-    bottleneck_tensor_name = 'pool_3/_reshape:0'
-    bottleneck_tensor_size = 2048
-    input_width = 299
-    input_height = 299
-    input_depth = 3
-    resized_input_tensor_name = 'Mul:0'
-    model_file_name = 'classify_image_graph_def.pb'
-    input_mean = 128
-    input_std = 128
-  elif architecture.startswith('mobilenet_'):
-    parts = architecture.split('_')
-    if len(parts) != 3 and len(parts) != 4:
-      tf.logging.error("Couldn't understand architecture name '%s'",
-                       architecture)
-      return None
-    version_string = parts[1]
-    if (version_string != '1.0' and version_string != '0.75' and
-        version_string != '0.5' and version_string != '0.25'):
-      tf.logging.error(
-          """"The Mobilenet version should be '1.0', '0.75', '0.5', or '0.25',
-  but found '%s' for architecture '%s'""", version_string, architecture)
-      return None
-    size_string = parts[2]
-    if (size_string != '224' and size_string != '192' and
-        size_string != '160' and size_string != '128'):
-      tf.logging.error(
-          """The Mobilenet input size should be '224', '192', '160', or '128',
- but found '%s' for architecture '%s'""",
-          size_string, architecture)
-      return None
-    if len(parts) == 3:
-      is_quantized = False
-    else:
-      if parts[3] != 'quant':
-        tf.logging.error(
-            "Couldn't understand architecture suffix '%s' for '%s'", parts[3],
-            architecture)
-        return None
-      is_quantized = True
-
-    data_url = 'http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/'
-    model_name = 'mobilenet_v1_' + version_string + '_' + size_string
-    if is_quantized:
-      model_name += '_quant'
-    data_url += model_name + '.tgz'
-    bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
-    resized_input_tensor_name = 'input:0'
-    model_file_name = model_name + '_frozen.pb'
-
-    bottleneck_tensor_size = 1001
-    input_width = int(size_string)
-    input_height = int(size_string)
-    input_depth = 3
-    input_mean = 127.5
-    input_std = 127.5
-  else:
-    tf.logging.error("Couldn't understand architecture name '%s'", architecture)
-    raise ValueError('Unknown architecture', architecture)
-
-  return {
-      'data_url': data_url,
-      'bottleneck_tensor_name': bottleneck_tensor_name,
-      'bottleneck_tensor_size': bottleneck_tensor_size,
-      'input_width': input_width,
-      'input_height': input_height,
-      'input_depth': input_depth,
-      'resized_input_tensor_name': resized_input_tensor_name,
-      'model_file_name': model_file_name,
-      'input_mean': input_mean,
-      'input_std': input_std,
-      'quantize_layer': is_quantized,
-  }
-
-
-def add_jpeg_decoding(input_width, input_height, input_depth, input_mean,
-                      input_std):
-  """Adds operations that perform JPEG decoding and resizing to the graph..
-
-  Args:
-    input_width: Desired width of the image fed into the recognizer graph.
-    input_height: Desired width of the image fed into the recognizer graph.
-    input_depth: Desired channels of the image fed into the recognizer graph.
-    input_mean: Pixel value that should be zero in the image for the graph.
-    input_std: How much to divide the pixel values by before recognition.
-
-  Returns:
-    Tensors for the node to feed JPEG data into, and the output of the
-      preprocessing steps.
-  """
-  jpeg_data = tf.placeholder(tf.string, name='DecodeJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
-  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
-  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
-  resize_shape = tf.stack([input_height, input_width])
-  resize_shape_as_int = tf.cast(resize_shape, dtype=tf.int32)
-  resized_image = tf.image.resize_bilinear(decoded_image_4d,
-                                           resize_shape_as_int)
-  offset_image = tf.subtract(resized_image, input_mean)
-  mul_image = tf.multiply(offset_image, 1.0 / input_std)
-  return jpeg_data, mul_image
-
-
-def export_model(model_info, class_count, saved_model_dir):
-  """Exports model for serving.
-
-  Args:
-    model_info: The modelinfo for the current model.
-    class_count: The number of classes.
-    saved_model_dir: Directory in which to save exported model and variables.
-  """
-  # The SavedModel should hold the eval graph.
-  sess, _, _, _, _ = build_eval_session(model_info, class_count)
-  graph = sess.graph
-  with graph.as_default():
-    input_tensor = model_info['resized_input_tensor_name']
-    in_image = sess.graph.get_tensor_by_name(input_tensor)
-    inputs = {'image': tf.saved_model.utils.build_tensor_info(in_image)}
-
-    out_classes = sess.graph.get_tensor_by_name('final_result:0')
-    outputs = {
-        'prediction': tf.saved_model.utils.build_tensor_info(out_classes)
-    }
-
-    signature = tf.saved_model.signature_def_utils.build_signature_def(
-        inputs=inputs,
-        outputs=outputs,
-        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
-
-    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
-
-    # Save out the SavedModel.
-    builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir)
-    builder.add_meta_graph_and_variables(
-        sess, [tf.saved_model.tag_constants.SERVING],
-        signature_def_map={
-            tf.saved_model.signature_constants.
-            DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-                signature
-        },
-        legacy_init_op=legacy_init_op)
-    builder.save()
-
-
-def main(_):
-  # Needed to make sure the logging output is visible.
-  # See https://github.com/tensorflow/tensorflow/issues/3047
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  # Prepare necessary directories that can be used during training
-  prepare_file_system()
-
-  # Gather information about the model architecture we'll be using.
-  model_info = create_model_info(FLAGS.architecture)
-  if not model_info:
-    tf.logging.error('Did not recognize architecture flag')
-    return -1
-
-  # Look at the folder structure, and create lists of all the images.
-  image_lists = create_image_lists(FLAGS.image_dir, FLAGS.testing_percentage,
-                                   FLAGS.validation_percentage)
-  class_count = len(image_lists.keys())
-  if class_count == 0:
-    tf.logging.error('No valid folders of images found at ' + FLAGS.image_dir)
-    return -1
-  if class_count == 1:
-    tf.logging.error('Only one valid folder of images found at ' +
-                     FLAGS.image_dir +
-                     ' - multiple classes are needed for classification.')
-    return -1
-
-  # See if the command-line flags mean we're applying any distortions.
-  do_distort_images = should_distort_images(
-      FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-      FLAGS.random_brightness)
-
-  # Set up the pre-trained graph.
-  maybe_download_and_extract(model_info['data_url'])
-  graph, bottleneck_tensor, resized_image_tensor = (
-      create_model_graph(model_info))
-
-  # Add the new layer that we'll be training.
-  with graph.as_default():
-    (train_step, cross_entropy, bottleneck_input,
-     ground_truth_input, final_tensor) = add_final_retrain_ops(
-         class_count, FLAGS.final_tensor_name, bottleneck_tensor,
-         model_info['bottleneck_tensor_size'], model_info['quantize_layer'],
-         True)
-
-  with tf.Session(graph=graph) as sess:
-    # Set up the image decoding sub-graph.
-    jpeg_data_tensor, decoded_image_tensor = add_jpeg_decoding(
-        model_info['input_width'], model_info['input_height'],
-        model_info['input_depth'], model_info['input_mean'],
-        model_info['input_std'])
-
-    if do_distort_images:
-      # We will be applying distortions, so setup the operations we'll need.
-      (distorted_jpeg_data_tensor,
-       distorted_image_tensor) = add_input_distortions(
-           FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-           FLAGS.random_brightness, model_info['input_width'],
-           model_info['input_height'], model_info['input_depth'],
-           model_info['input_mean'], model_info['input_std'])
-    else:
-      # We'll make sure we've calculated the 'bottleneck' image summaries and
-      # cached them on disk.
-      cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
-                        FLAGS.bottleneck_dir, jpeg_data_tensor,
-                        decoded_image_tensor, resized_image_tensor,
-                        bottleneck_tensor, FLAGS.architecture)
-
-    # Create the operations we need to evaluate the accuracy of our new layer.
-    evaluation_step, _ = add_evaluation_step(final_tensor, ground_truth_input)
-
-    # Merge all the summaries and write them out to the summaries_dir
-    merged = tf.summary.merge_all()
-    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
-                                         sess.graph)
-
-    validation_writer = tf.summary.FileWriter(
-        FLAGS.summaries_dir + '/validation')
-
-    # Create a train saver that is used to restore values into an eval graph
-    # when exporting models.
-    train_saver = tf.train.Saver()
-
-    # Set up all our weights to their initial default values.
-    init = tf.global_variables_initializer()
-    sess.run(init)
-
-    # Run the training for as many cycles as requested on the command line.
-    for i in range(FLAGS.how_many_training_steps):
-      # Get a batch of input bottleneck values, either calculated fresh every
-      # time with distortions applied, or from the cache stored on disk.
-      if do_distort_images:
-        (train_bottlenecks,
-         train_ground_truth) = get_random_distorted_bottlenecks(
-             sess, image_lists, FLAGS.train_batch_size, 'training',
-             FLAGS.image_dir, distorted_jpeg_data_tensor,
-             distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
-      else:
-        (train_bottlenecks,
-         train_ground_truth, _) = get_random_cached_bottlenecks(
-             sess, image_lists, FLAGS.train_batch_size, 'training',
-             FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-             decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-             FLAGS.architecture)
-      # Feed the bottlenecks and ground truth into the graph, and run a training
-      # step. Capture training summaries for TensorBoard with the `merged` op.
-      train_summary, _ = sess.run(
-          [merged, train_step],
-          feed_dict={bottleneck_input: train_bottlenecks,
-                     ground_truth_input: train_ground_truth})
-      train_writer.add_summary(train_summary, i)
-
-      # Every so often, print out how well the graph is training.
-      is_last_step = (i + 1 == FLAGS.how_many_training_steps)
-      if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
-        train_accuracy, cross_entropy_value = sess.run(
-            [evaluation_step, cross_entropy],
-            feed_dict={bottleneck_input: train_bottlenecks,
-                       ground_truth_input: train_ground_truth})
-        tf.logging.info('%s: Step %d: Train accuracy = %.1f%%' %
-                        (datetime.now(), i, train_accuracy * 100))
-        tf.logging.info('%s: Step %d: Cross entropy = %f' %
-                        (datetime.now(), i, cross_entropy_value))
-        # TODO(suharshs): Make this use an eval graph, to avoid quantization
-        # moving averages being updated by the validation set, though in
-        # practice this makes a negligable difference.
-        validation_bottlenecks, validation_ground_truth, _ = (
-            get_random_cached_bottlenecks(
-                sess, image_lists, FLAGS.validation_batch_size, 'validation',
-                FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-                decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-                FLAGS.architecture))
-        # Run a validation step and capture training summaries for TensorBoard
-        # with the `merged` op.
-        validation_summary, validation_accuracy = sess.run(
-            [merged, evaluation_step],
-            feed_dict={bottleneck_input: validation_bottlenecks,
-                       ground_truth_input: validation_ground_truth})
-        validation_writer.add_summary(validation_summary, i)
-        tf.logging.info('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
-                        (datetime.now(), i, validation_accuracy * 100,
-                         len(validation_bottlenecks)))
-
-      # Store intermediate results
-      intermediate_frequency = FLAGS.intermediate_store_frequency
-
-      if (intermediate_frequency > 0 and (i % intermediate_frequency == 0)
-          and i > 0):
-        # If we want to do an intermediate save, save a checkpoint of the train
-        # graph, to restore into the eval graph.
-        train_saver.save(sess, CHECKPOINT_NAME)
-        intermediate_file_name = (FLAGS.intermediate_output_graphs_dir +
-                                  'intermediate_' + str(i) + '.pb')
-        tf.logging.info('Save intermediate result to : ' +
-                        intermediate_file_name)
-        save_graph_to_file(graph, intermediate_file_name, model_info,
-                           class_count)
-
-    # After training is complete, force one last save of the train checkpoint.
-    train_saver.save(sess, CHECKPOINT_NAME)
-
-    # We've completed all our training, so run a final test evaluation on
-    # some new images we haven't used before.
-    run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
-                   decoded_image_tensor, resized_image_tensor,
-                   bottleneck_tensor)
-
-    # Write out the trained graph and labels with the weights stored as
-    # constants.
-    save_graph_to_file(graph, FLAGS.output_graph, model_info, class_count)
-    with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
-      f.write('\n'.join(image_lists.keys()) + '\n')
-
-    export_model(model_info, class_count, FLAGS.saved_model_dir)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--image_dir',
-      type=str,
-      default='',
-      help='Path to folders of labeled images.'
-  )
-  parser.add_argument(
-      '--output_graph',
-      type=str,
-      default='/tmp/output_graph.pb',
-      help='Where to save the trained graph.'
-  )
-  parser.add_argument(
-      '--intermediate_output_graphs_dir',
-      type=str,
-      default='/tmp/intermediate_graph/',
-      help='Where to save the intermediate graphs.'
-  )
-  parser.add_argument(
-      '--intermediate_store_frequency',
-      type=int,
-      default=0,
-      help="""\
-         How many steps to store intermediate graph. If "0" then will not
-         store.\
-      """
-  )
-  parser.add_argument(
-      '--output_labels',
-      type=str,
-      default='/tmp/output_labels.txt',
-      help='Where to save the trained graph\'s labels.'
-  )
-  parser.add_argument(
-      '--summaries_dir',
-      type=str,
-      default='/tmp/retrain_logs',
-      help='Where to save summary logs for TensorBoard.'
-  )
-  parser.add_argument(
-      '--how_many_training_steps',
-      type=int,
-      default=4000,
-      help='How many training steps to run before ending.'
-  )
-  parser.add_argument(
-      '--learning_rate',
-      type=float,
-      default=0.01,
-      help='How large a learning rate to use when training.'
-  )
-  parser.add_argument(
-      '--testing_percentage',
-      type=int,
-      default=10,
-      help='What percentage of images to use as a test set.'
-  )
-  parser.add_argument(
-      '--validation_percentage',
-      type=int,
-      default=10,
-      help='What percentage of images to use as a validation set.'
-  )
-  parser.add_argument(
-      '--eval_step_interval',
-      type=int,
-      default=10,
-      help='How often to evaluate the training results.'
-  )
-  parser.add_argument(
-      '--train_batch_size',
-      type=int,
-      default=100,
-      help='How many images to train on at a time.'
-  )
-  parser.add_argument(
-      '--test_batch_size',
-      type=int,
-      default=-1,
-      help="""\
-      How many images to test on. This test set is only used once, to evaluate
-      the final accuracy of the model after training completes.
-      A value of -1 causes the entire test set to be used, which leads to more
-      stable results across runs.\
-      """
-  )
-  parser.add_argument(
-      '--validation_batch_size',
-      type=int,
-      default=100,
-      help="""\
-      How many images to use in an evaluation batch. This validation set is
-      used much more often than the test set, and is an early indicator of how
-      accurate the model is during training.
-      A value of -1 causes the entire validation set to be used, which leads to
-      more stable results across training iterations, but may be slower on large
-      training sets.\
-      """
-  )
-  parser.add_argument(
-      '--print_misclassified_test_images',
-      default=False,
-      help="""\
-      Whether to print out a list of all misclassified test images.\
-      """,
-      action='store_true'
-  )
-  parser.add_argument(
-      '--model_dir',
-      type=str,
-      default='/tmp/imagenet',
-      help="""\
-      Path to classify_image_graph_def.pb,
-      imagenet_synset_to_human_label_map.txt, and
-      imagenet_2012_challenge_label_map_proto.pbtxt.\
-      """
-  )
-  parser.add_argument(
-      '--bottleneck_dir',
-      type=str,
-      default='/tmp/bottleneck',
-      help='Path to cache bottleneck layer values as files.'
-  )
-  parser.add_argument(
-      '--final_tensor_name',
-      type=str,
-      default='final_result',
-      help="""\
-      The name of the output classification layer in the retrained graph.\
-      """
-  )
-  parser.add_argument(
-      '--flip_left_right',
-      default=False,
-      help="""\
-      Whether to randomly flip half of the training images horizontally.\
-      """,
-      action='store_true'
-  )
-  parser.add_argument(
-      '--random_crop',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much of a margin to randomly crop off the
-      training images.\
-      """
-  )
-  parser.add_argument(
-      '--random_scale',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much to randomly scale up the size of the
-      training images by.\
-      """
-  )
-  parser.add_argument(
-      '--random_brightness',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much to randomly multiply the training image
-      input pixels up or down by.\
-      """
-  )
-  parser.add_argument(
-      '--architecture',
-      type=str,
-      default='inception_v3',
-      help="""\
-      Which model architecture to use. 'inception_v3' is the most accurate, but
-      also the slowest. For faster or smaller models, chose a MobileNet with the
-      form 'mobilenet_<parameter size>_<input_size>[_quantized]'. For example,
-      'mobilenet_1.0_224' will pick a model that is 17 MB in size and takes 224
-      pixel input images, while 'mobilenet_0.25_128_quantized' will choose a much
-      smaller and less accurate model, taking 128x128 images, and instrumented
-      for eventual quantization via TensorFlow Lite.
-      See https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
-      for more information on Mobilenet.\
-      """)
-  parser.add_argument(
-      '--saved_model_dir',
-      type=str,
-      default='/tmp/saved_models/1/',
-      help='Where to save the exported graph.')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
deleted file mode 100644
index fb7324c58a..0000000000
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=g-bad-import-order,unused-import
-"""Tests the graph freezing tool."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import os
-
-from tensorflow.examples.image_retraining import retrain
-from tensorflow.python.framework import test_util
-
-
-class ImageRetrainingTest(test_util.TensorFlowTestCase):
-
-  def dummyImageLists(self):
-    return {'label_one': {'dir': 'somedir', 'training': ['image_one.jpg',
-                                                         'image_two.jpg'],
-                          'testing': ['image_three.jpg', 'image_four.jpg'],
-                          'validation': ['image_five.jpg', 'image_six.jpg']},
-            'label_two': {'dir': 'otherdir', 'training': ['image_one.jpg',
-                                                          'image_two.jpg'],
-                          'testing': ['image_three.jpg', 'image_four.jpg'],
-                          'validation': ['image_five.jpg', 'image_six.jpg']}}
-
-  def testGetImagePath(self):
-    image_lists = self.dummyImageLists()
-    self.assertEqual('image_dir/somedir/image_one.jpg', retrain.get_image_path(
-        image_lists, 'label_one', 0, 'image_dir', 'training'))
-    self.assertEqual('image_dir/otherdir/image_four.jpg',
-                     retrain.get_image_path(image_lists, 'label_two', 1,
-                                            'image_dir', 'testing'))
-
-  def testGetBottleneckPath(self):
-    image_lists = self.dummyImageLists()
-    self.assertEqual('bottleneck_dir/somedir/image_five.jpg_imagenet_v3.txt',
-                     retrain.get_bottleneck_path(
-                         image_lists, 'label_one', 0, 'bottleneck_dir',
-                         'validation', 'imagenet_v3'))
-
-  def testShouldDistortImage(self):
-    self.assertEqual(False, retrain.should_distort_images(False, 0, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(True, 0, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 10, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 0, 1, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 0, 0, 50))
-
-  def testAddInputDistortions(self):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        retrain.add_input_distortions(True, 10, 10, 10, 299, 299, 3, 128, 128)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortJPGInput:0'))
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortResult:0'))
-
-  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
-  def testAddFinalRetrainOps(self, flags_mock):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization.
-        retrain.add_final_retrain_ops(5, 'final', bottleneck, 1024, False,
-                                      False)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-
-  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
-  def testAddFinalRetrainOpsQuantized(self, flags_mock):
-    # Ensure that the training and eval graph for quantized models are correctly
-    # created.
-    with tf.Graph().as_default() as g:
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization, set is_training to
-        # true.
-        retrain.add_final_retrain_ops(5, 'final', bottleneck, 1024, True, True)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-        found_fake_quant = 0
-        for op in g.get_operations():
-          if op.type == 'FakeQuantWithMinMaxVars':
-            found_fake_quant += 1
-            # Ensure that the inputs of each FakeQuant operations has 2 Assign
-            # operations in the training graph (Assign[Min,Max]Last,
-            # Assign[Min,Max]Ema)
-            self.assertEqual(2,
-                             len([i for i in op.inputs if 'Assign' in i.name]))
-        self.assertEqual(found_fake_quant, 2)
-    with tf.Graph().as_default() as g:
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization, set is_training to
-        # false.
-        retrain.add_final_retrain_ops(5, 'final', bottleneck, 1024, True, False)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-        found_fake_quant = 0
-        for op in g.get_operations():
-          if op.type == 'FakeQuantWithMinMaxVars':
-            found_fake_quant += 1
-            for i in op.inputs:
-              # Ensure that no operations are Assign operation since this is the
-              # evaluation graph.
-              self.assertTrue('Assign' not in i.name)
-        self.assertEqual(found_fake_quant, 2)
-
-  def testAddEvaluationStep(self):
-    with tf.Graph().as_default():
-      final = tf.placeholder(tf.float32, [1], name='final')
-      gt = tf.placeholder(tf.int64, [1], name='gt')
-      self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
-
-  def testAddJpegDecoding(self):
-    with tf.Graph().as_default():
-      jpeg_data, mul_image = retrain.add_jpeg_decoding(10, 10, 3, 0, 255)
-      self.assertIsNotNone(jpeg_data)
-      self.assertIsNotNone(mul_image)
-
-  def testCreateModelInfo(self):
-    did_raise_value_error = False
-    try:
-      retrain.create_model_info('no_such_model_name')
-    except ValueError:
-      did_raise_value_error = True
-    self.assertTrue(did_raise_value_error)
-    model_info = retrain.create_model_info('inception_v3')
-    self.assertIsNotNone(model_info)
-    self.assertEqual(299, model_info['input_width'])
-
-  def testCreateModelInfoQuantized(self):
-    # Test for mobilenet_quantized
-    model_info = retrain.create_model_info('mobilenet_1.0_224')
-    self.assertIsNotNone(model_info)
-    self.assertEqual(224, model_info['input_width'])
-
-
-if __name__ == '__main__':
-  tf.test.main()
-- 
GitLab


From 73aef57c451a13e07e48933d0bae3ad3ed2c64bd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 13:53:01 -0700
Subject: [PATCH 0611/1262] Support for removing unfused quantized activation
 functions and min/max.

PiperOrigin-RevId: 192503204
---
 tensorflow/contrib/lite/toco/BUILD            |   3 +
 .../graph_transformations.h                   |   1 +
 .../quantization_util.cc                      | 173 ++++++++++++++++++
 .../graph_transformations/quantization_util.h |  50 +++++
 .../toco/graph_transformations/quantize.cc    |  75 +-------
 .../remove_trivial_passthrough.cc             |  29 +--
 ...emove_trivial_quantized_activation_func.cc | 116 +++++++-----
 .../remove_trivial_quantized_min_max.cc       |  90 +++++++++
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  11 +-
 9 files changed, 413 insertions(+), 135 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 8a35fb9034..a05d71985f 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -238,6 +238,8 @@ cc_library(
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
+        "graph_transformations/quantization_util.cc",
+        "graph_transformations/quantization_util.h",
         "graph_transformations/quantize.cc",
         "graph_transformations/read_fake_quant_min_max.cc",
         "graph_transformations/remove_final_dequantize_op.cc",
@@ -249,6 +251,7 @@ cc_library(
         "graph_transformations/remove_trivial_passthrough.cc",
         "graph_transformations/remove_trivial_passthrough.h",
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
+        "graph_transformations/remove_trivial_quantized_min_max.cc",
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 27c5044bb3..80463ce8f8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -146,6 +146,7 @@ DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialSlice)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedMinMax)
 DECLARE_GRAPH_TRANSFORMATION(RemoveUnusedOp)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchNormalization)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantBinaryOperator)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
new file mode 100644
index 0000000000..e080df4bed
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
@@ -0,0 +1,173 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value) {
+  switch (data_type) {
+    case ArrayDataType::kUint8:
+      *out_min_value = 0;
+      *out_max_value = 255;
+      return true;
+    case ArrayDataType::kInt16:
+      *out_min_value = -32768;
+      *out_max_value = 32767;
+      return true;
+    default:
+      return false;
+  }
+}
+
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type) {
+  switch (array.final_data_type) {
+    case ArrayDataType::kInt8:
+    case ArrayDataType::kUint8:
+    case ArrayDataType::kInt16:
+    case ArrayDataType::kUint16:
+    case ArrayDataType::kInt32:
+    case ArrayDataType::kUint32:
+    case ArrayDataType::kInt64:
+    case ArrayDataType::kUint64:
+      return array.final_data_type;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+      return default_type;
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(array.final_data_type);
+  }
+}
+
+void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
+                           QuantizationParams* quantization_params) {
+  switch (data_type) {
+    case ArrayDataType::kInt8:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt8>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint8:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt16:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint16:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint16>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt32:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt32>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint32:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint32>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt64:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt64>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint64:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint64>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(data_type);
+  }
+}
+
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max) {
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(array, array.data_type);
+  if (quantized_data_type == ArrayDataType::kNone ||
+      quantized_data_type == ArrayDataType::kFloat) {
+    // The array is not (or never will be) quantized.
+    return false;
+  }
+
+  QuantizationParams quantization_params;
+  if (!array.quantization_params) {
+    if (!array.minmax) {
+      transformation->AddMessageF("No quantization params and no minmax");
+      return false;
+    } else {
+      // Work around cases where we are asking for this prior to the Quantize
+      // transformation having added the quantization_params.
+      GetQuantizationParams(quantized_data_type, *array.minmax,
+                            &quantization_params);
+      transformation->AddMessageF(
+          "No quantization params - infering from data type %s with minmax "
+          "%g,%g as zero_point=%g, scale=%g",
+          ArrayDataTypeName(quantized_data_type), array.minmax->min,
+          array.minmax->max, quantization_params.zero_point,
+          quantization_params.scale);
+    }
+  } else {
+    quantization_params = array.GetQuantizationParams();
+  }
+
+  double quantized_min, quantized_max;
+  CHECK(GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
+                                           &quantized_max))
+      << "Type is not quantized";
+
+  bool has_nontrivial_min_bound = false;
+  bool has_nontrivial_max_bound = false;
+
+  double lowest_representable_output =
+      (quantized_min - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (lowest_representable_output < clamp_min) {
+    has_nontrivial_min_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the lowest representable output value %g"
+        " less than the clamp min bound %g.",
+        lowest_representable_output, clamp_min);
+  }
+
+  double highest_representable_output =
+      (quantized_max - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (highest_representable_output > clamp_max) {
+    has_nontrivial_max_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the highest representable output value %g"
+        " is greater than the clamp max bound %g.",
+        highest_representable_output, clamp_max);
+  }
+
+  return !has_nontrivial_min_bound && !has_nontrivial_max_bound;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
new file mode 100644
index 0000000000..35fb310777
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+// Gets the min/max numerical range for the given quantized data type.
+// For example, kUint8 will return [0,255].
+// Returns true if the ranges were set and false if the type is not quantized.
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value);
+
+// Returns the quantized data type of an array, falling back to the provided
+// default data type.
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type);
+
+// Gets the quantization params for the array with the given data type and
+// minmax.
+void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
+                           QuantizationParams* quantization_params);
+
+// Returns true if the given array, when quantized, contains only values between
+// the provided clamp min/max.
+// Either clamp_min or clamp_max may be +/-infinity to indicate that the value
+// is unbounded on that side.
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max);
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index f50830ae60..d6cae3cdbf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -205,70 +206,6 @@ QuantizationPoints GetQuantizationPoints(ArrayDataType data_type) {
   }
 }
 
-ArrayDataType GetQuantizedDataType(const Array& array,
-                                   ArrayDataType default_type) {
-  switch (array.final_data_type) {
-    case ArrayDataType::kInt8:
-    case ArrayDataType::kUint8:
-    case ArrayDataType::kInt16:
-    case ArrayDataType::kUint16:
-    case ArrayDataType::kInt32:
-    case ArrayDataType::kUint32:
-    case ArrayDataType::kInt64:
-    case ArrayDataType::kUint64:
-      return array.final_data_type;
-    case ArrayDataType::kFloat:
-    case ArrayDataType::kNone:
-      return default_type;
-    default:
-      LOG(FATAL) << "Unhandled final quantization type "
-                 << static_cast<int>(array.final_data_type);
-  }
-}
-
-void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
-                           QuantizationParams* quantization_params) {
-  switch (data_type) {
-    case ArrayDataType::kInt8:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt8>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint8:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kInt16:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint16:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint16>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kInt32:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt32>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint32:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint32>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kInt64:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt64>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint64:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint64>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kFloat:
-    case ArrayDataType::kNone:
-    default:
-      LOG(FATAL) << "Unhandled final quantization type "
-                 << static_cast<int>(data_type);
-  }
-}
-
 bool ChooseQuantizationForOperatorInput(
     GraphTransformation* transformation, Model* model, const Operator& op,
     std::size_t input_index, ArrayDataType* quantized_data_type,
@@ -336,12 +273,11 @@ bool ChooseQuantizationForOperatorInput(
   *quantized_data_type = GetQuantizedDataType(array, ArrayDataType::kUint8);
   GetQuantizationParams(*quantized_data_type, minmax, quantization_params);
   transformation->AddMessageF(
-      "For input array %s with min=%g"
-      ", max=%g"
-      ", chose to quantize as %s with zero_point=%d"
-      ", scale=%g",
+      "For input array %s with min=%g, max=%g, chose to quantize as %s (f=%s) "
+      "with zero_point=%d, scale=%g",
       input, minmax.min, minmax.max, ArrayDataTypeName(*quantized_data_type),
-      quantization_params->zero_point, quantization_params->scale);
+      ArrayDataTypeName(array.final_data_type), quantization_params->zero_point,
+      quantization_params->scale);
   return true;
 }
 
@@ -525,6 +461,7 @@ void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
     minmax->max = max;
   }
 }
+
 }  // namespace
 
 bool Quantize::Run(Model* model, std::size_t op_index) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index aa93ace03a..3e021b819f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -82,22 +82,13 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
 
   if (IsDiscardableArray(*model, output_name)) {
     transformation->AddMessageF(
-        "Removing %s, keeping its non-constant input array",
-        LogName(*passthru_op));
-    for (const string& input : passthru_op->inputs) {
-      if (IsDiscardableArray(*model, input) && input != main_input_name &&
-          CountOpsWithInput(*model, input) == 1) {
-      }
-    }
+        "Removing %s, keeping its non-constant input array %s and removing %s",
+        LogName(*passthru_op), main_input_name, output_name);
     RerouteEdges(output_name, main_input_name, model);
   } else if (IsDiscardableArray(*model, main_input_name)) {
-    transformation->AddMessageF("Removing %s, keeping its output array",
-                                LogName(*passthru_op));
-    for (const string& input : passthru_op->inputs) {
-      if (IsDiscardableArray(*model, input) &&
-          (input == main_input_name || CountOpsWithInput(*model, input) == 1)) {
-      }
-    }
+    transformation->AddMessageF(
+        "Removing %s, keeping its output array %s and removing input %s",
+        LogName(*passthru_op), output_name, main_input_name);
     RerouteEdges(main_input_name, output_name, model);
   } else {
     transformation->AddMessageF(
@@ -113,6 +104,16 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
   // Remove any array that is no longer used.
   for (const string& removal_candidate : removal_candidates) {
     bool is_referenced = false;
+    for (const auto& array : model->flags.input_arrays()) {
+      if (array.name() == removal_candidate) {
+        is_referenced = true;
+      }
+    }
+    for (const auto& array_name : model->flags.output_arrays()) {
+      if (array_name == removal_candidate) {
+        is_referenced = true;
+      }
+    }
     for (const auto& op : model->operators) {
       for (const string& input : op->inputs) {
         if (input == removal_candidate) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
index 9b65feaa64..752560e075 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
@@ -26,27 +28,44 @@ limitations under the License.
 
 namespace toco {
 
-bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
-                                               std::size_t op_index) {
-  const auto it = model->operators.begin() + op_index;
-  auto* op = it->get();
-  if (op->fused_activation_function != FusedActivationFunctionType::kRelu &&
-      op->fused_activation_function != FusedActivationFunctionType::kRelu1 &&
-      op->fused_activation_function != FusedActivationFunctionType::kRelu6) {
-    return false;
-  }
-  const auto& output_array = model->GetArray(op->outputs[0]);
-  if (!output_array.quantization_params) {
-    return false;
-  }
-  if (output_array.data_type != ArrayDataType::kUint8) {
-    return false;
+namespace {
+
+bool IsTrivialUnfusedActivationFunc(GraphTransformation* transformation,
+                                    const Model& model, OperatorType op_type,
+                                    const string& input_array_name) {
+  double clamp_min;
+  double clamp_max;
+  switch (op_type) {
+    case OperatorType::kRelu:
+      clamp_min = 0.0;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    case OperatorType::kRelu1:
+      clamp_min = -1.0;
+      clamp_max = 1.0;
+      break;
+    case OperatorType::kRelu6:
+      clamp_min = 0.0;
+      clamp_max = 6.0;
+      break;
+    default:
+      return false;
   }
-  const auto& quantization_params = output_array.GetQuantizationParams();
 
+  const auto& input_array = model.GetArray(input_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, input_array, clamp_min,
+                                     clamp_max);
+}
+
+bool IsTrivialFusedActivationFunc(
+    GraphTransformation* transformation, const Model& model,
+    FusedActivationFunctionType activation_function,
+    const string& output_array_name) {
   double clamp_min;
   double clamp_max;
-  switch (op->fused_activation_function) {
+  switch (activation_function) {
+    case FusedActivationFunctionType::kNone:
+      return false;
     case FusedActivationFunctionType::kRelu:
       clamp_min = 0.0;
       clamp_max = std::numeric_limits<double>::infinity();
@@ -61,45 +80,46 @@ bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
       break;
     default:
       LOG(FATAL) << "Unsupported fused activation type: "
-                 << static_cast<int>(op->fused_activation_function);
+                 << static_cast<int>(activation_function);
       return false;
   }
 
-  bool has_nontrivial_min_bound = false;
-  bool has_nontrivial_max_bound = false;
+  const auto& output_array = model.GetArray(output_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, output_array, clamp_min,
+                                     clamp_max);
+}
 
-  double lowest_representable_output =
-      (0. - quantization_params.zero_point) * quantization_params.scale;
-  if (lowest_representable_output < clamp_min) {
-    has_nontrivial_min_bound = true;
-    AddMessageF(
-        "Quantized activation function is not trivial: "
-        "the lowest representable output value %g"
-        " less than the clamp min bound %g.",
-        lowest_representable_output, clamp_min);
-  }
-  double highest_representable_output =
-      (255. - quantization_params.zero_point) * quantization_params.scale;
-  if (highest_representable_output > clamp_max) {
-    has_nontrivial_max_bound = true;
-    AddMessageF(
-        "Quantized activation function is not trivial: "
-        "the highest representable output value %g"
-        " is greater than the clamp max bound %g.",
-        highest_representable_output, clamp_max);
-  }
+}  // namespace
 
-  if (has_nontrivial_min_bound || has_nontrivial_max_bound) {
+// Attempts to remove both fused and unfused activation functions if the
+// quantization params indicate that the representable values fall inside the
+// activation range.
+bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
+                                               std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if (op->inputs.empty()) {
     return false;
   }
 
-  op->fused_activation_function = FusedActivationFunctionType::kNone;
-  AddMessageF(
-      "Removing trivial quantized activation function on %s"
-      " because the output quantization parameters imply at least as tight"
-      " a clamp anyway.",
-      LogName(*op));
-  return true;
+  if (IsTrivialUnfusedActivationFunc(this, *model, op->type, op->inputs[0])) {
+    AddMessageF(
+        "Removing trivial unfused activation function %s because the input "
+        "minmax imply at least as tight a clamp anyway.",
+        LogName(*op));
+    return RemoveTrivialPassthroughOp(this, model, op_index);
+  }
+  if (IsTrivialFusedActivationFunc(this, *model, op->fused_activation_function,
+                                   op->outputs[0])) {
+    op->fused_activation_function = FusedActivationFunctionType::kNone;
+    AddMessageF(
+        "Removing trivial quantized activation function on %s "
+        "because the output quantization parameters imply at least as tight "
+        "a clamp anyway.",
+        LogName(*op));
+    return true;
+  }
+  return false;
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
new file mode 100644
index 0000000000..eaee1c662b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsTrivialMinMax(GraphTransformation* transformation, const Model& model,
+                     OperatorType op_type, const string& input_array_name,
+                     const string& clamp_value_array_name) {
+  const auto& clamp_value_array = model.GetArray(clamp_value_array_name);
+  if (!IsConstantParameterArray(model, clamp_value_array_name)) {
+    transformation->AddMessageF("Clip value array %s is non-constant",
+                                clamp_value_array_name);
+    return false;
+  }
+  const auto& clamp_value_buffer =
+      clamp_value_array.GetBuffer<ArrayDataType::kFloat>();
+  CHECK_EQ(clamp_value_buffer.Length(), 1);
+  float clamp_value = clamp_value_buffer.data[0];
+
+  double clamp_min;
+  double clamp_max;
+  switch (op_type) {
+    case OperatorType::kTensorFlowMinimum:
+      clamp_min = -std::numeric_limits<double>::infinity();
+      clamp_max = clamp_value;
+      break;
+    case OperatorType::kTensorFlowMaximum:
+      clamp_min = clamp_value;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    default:
+      CHECK(false);
+      return false;
+  }
+
+  const auto& input_array = model.GetArray(input_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, input_array, clamp_min,
+                                     clamp_max);
+}
+
+}  // namespace
+
+// Attempts to remove min/max functions if the quantization params indicate that
+// the representable values fall inside the clip range.
+bool RemoveTrivialQuantizedMinMax::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if ((op->type != OperatorType::kTensorFlowMinimum &&
+       op->type != OperatorType::kTensorFlowMaximum) ||
+      op->inputs.size() != 2) {
+    return false;
+  }
+  if (IsTrivialMinMax(this, *model, op->type, op->inputs[0], op->inputs[1])) {
+    AddMessageF(
+        "Removing trivial min/max %s because the quantization parameters imply "
+        "at least as tight a clamp anyway.",
+        LogName(*op));
+    return RemoveTrivialPassthroughOp(this, model, op_index);
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 96c5ebd64f..1ab0a6f058 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -279,10 +279,13 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
           {new HardcodeMinMax});
     }
     CheckIsReadyForQuantization(*model);
-    RunGraphTransformations(
-        model, "quantization graph transformations",
-        {new Quantize, new RemoveTrivialQuantizedActivationFunc,
-         new RemoveFinalDequantizeOp});
+    RunGraphTransformations(model, "quantization graph transformations",
+                            {
+                                new RemoveTrivialQuantizedActivationFunc,
+                                new RemoveTrivialQuantizedMinMax,
+                                new Quantize,
+                                new RemoveFinalDequantizeOp,
+                            });
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
     // Dequantize creates FakeQuant nodes. We may want to discard
-- 
GitLab


From 1e283d64816b92de6c398bee6df2122409c87d73 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 11 Apr 2018 13:59:58 -0700
Subject: [PATCH 0612/1262] Porting tests for the `decode_proto` and
 `encode_proto` to OS.

PiperOrigin-RevId: 192504411
---
 tensorflow/contrib/proto/BUILD                |  16 +
 .../contrib/proto/python/kernel_tests/BUILD   |  81 +++++
 .../proto/python/kernel_tests/build_defs.bzl  |  78 +++++
 .../kernel_tests/decode_proto_fail_test.py    |  68 ++++
 .../kernel_tests/decode_proto_op_test.py      | 300 ++++++++++++++++++
 .../kernel_tests/encode_proto_op_test.py      | 179 +++++++++++
 .../python/kernel_tests/minmax.TestCase.pbtxt | 161 ++++++++++
 .../python/kernel_tests/nested.TestCase.pbtxt |  16 +
 .../kernel_tests/optional.TestCase.pbtxt      |  20 ++
 .../promote_unsigned.TestCase.pbtxt           |  21 ++
 .../python/kernel_tests/ragged.TestCase.pbtxt |  32 ++
 .../kernel_tests/shaped_batch.TestCase.pbtxt  |  62 ++++
 .../python/kernel_tests/simple.TestCase.pbtxt |  21 ++
 .../proto/python/kernel_tests/test_case.py    |  35 ++
 .../python/kernel_tests/test_example.proto    | 149 +++++++++
 tensorflow/tools/pip_package/BUILD            |   1 +
 16 files changed, 1240 insertions(+)
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_case.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index 046652cbc5..3e9b1a0b8d 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "proto",
     srcs = [
@@ -14,3 +16,17 @@ py_library(
         "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
     ],
 )
+
+py_library(
+    name = "proto_pip",
+    data = [
+        "//tensorflow/contrib/proto/python/kernel_tests:test_messages",
+    ] + if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":proto",
+        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..4125ea8a2a
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -0,0 +1,81 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Much of the work in this BUILD file actually happens in the corresponding
+# build_defs.bzl, which creates an individual testcase for each example .pbtxt
+# file in this directory.
+#
+load(":build_defs.bzl", "decode_proto_test_suite")
+load(":build_defs.bzl", "encode_proto_test_suite")
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :decode_proto_op_tests.
+decode_proto_test_suite(
+    name = "decode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :encode_proto_op_tests.
+encode_proto_test_suite(
+    name = "encode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# Below here are tests that are not tied to an example text proto.
+filegroup(
+    name = "test_messages",
+    srcs = glob(["*.pbtxt"]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_py_test(
+    name = "decode_proto_fail_test",
+    size = "small",
+    srcs = ["decode_proto_fail_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/proto:proto",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+)
+
+py_library(
+    name = "test_case",
+    srcs = ["test_case.py"],
+    deps = ["//tensorflow/python:client_testlib"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [
+        ":test_case",
+        ":test_example_proto_py",
+    ],
+)
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
new file mode 100644
index 0000000000..6fe48ae807
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
@@ -0,0 +1,78 @@
+"""BUILD rules for generating file-driven proto test cases.
+
+The decode_proto_test_suite() and encode_proto_test_suite() rules take a list
+of text protos and generates a tf_py_test() for each one.
+"""
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "register_extension_info")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+def _test_name(test, path):
+  return "%s_%s_test" % (test, path.split("/")[-1].split(".")[0])
+
+def decode_proto_test_suite(name, examples):
+  """Build the decode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("decode_proto", test_filename),
+        srcs = ["decode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "decode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("decode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+def encode_proto_test_suite(name, examples):
+  """Build the encode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("encode_proto", test_filename),
+        srcs = ["encode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "encode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("encode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+register_extension_info(
+    extension_name = "decode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:decode_example_.*",
+    })
+
+register_extension_info(
+    extension_name = "encode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:encode_example_.*",
+    })
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
new file mode 100644
index 0000000000..f019833905
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -0,0 +1,68 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import proto
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class DecodeProtoFailTest(test_case.ProtoOpTestCase):
+  """Test failure cases for DecodeToProto."""
+
+  def _TestCorruptProtobuf(self, sanitize):
+    """Test failure cases for DecodeToProto."""
+
+    # The goal here is to check the error reporting.
+    # Testing against a variety of corrupt protobufs is
+    # done by fuzzing.
+    corrupt_proto = 'This is not a binary protobuf'
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(corrupt_proto, dtype=object)
+    msg_type = 'tensorflow.contrib.proto.TestCase'
+    field_names = ['sizes']
+    field_types = [dtypes.int32]
+
+    with self.test_session() as sess:
+      ctensor, vtensor = proto.decode_proto(
+          batch,
+          message_type=msg_type,
+          field_names=field_names,
+          output_types=field_types,
+          sanitize=sanitize)
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   'Unable to parse binary protobuf'
+                                   '|Failed to consume entire buffer'):
+        _ = sess.run([ctensor] + vtensor)
+
+  def testCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=False)
+
+  def testSanitizerCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
new file mode 100644
index 0000000000..30ceac5f5f
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -0,0 +1,300 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for decode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+"""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib import proto
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class DecodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def _compareValues(self, fd, vs, evs):
+    """Compare lists/arrays of field values."""
+
+    if len(vs) != len(evs):
+      self.fail('Field %s decoded %d outputs, expected %d' %
+                (fd.name, len(vs), len(evs)))
+    for i, ev in enumerate(evs):
+      # Special case fuzzy match for float32. TensorFlow seems to mess with
+      # MAX_FLT slightly and the test doesn't work otherwise.
+      # TODO(nix): ask on TF list about why MAX_FLT doesn't pass through.
+      if fd.cpp_type == fd.CPPTYPE_FLOAT:
+        # Numpy isclose() is better than assertIsClose() which uses an absolute
+        # value comparison.
+        self.assertTrue(
+            np.isclose(vs[i], ev), 'expected %r, actual %r' % (ev, vs[i]))
+      elif fd.cpp_type == fd.CPPTYPE_STRING:
+        # In Python3 string tensor values will be represented as bytes, so we
+        # reencode the proto values to match that.
+        self.assertEqual(vs[i], ev.encode('ascii'))
+      else:
+        # Doubles and other types pass through unscathed.
+        self.assertEqual(vs[i], ev)
+
+  def _compareRepeatedPrimitiveValue(self, batch_shape, sizes, fields,
+                                     field_dict):
+    """Compare protos of type RepeatedPrimitiveValue.
+
+    Args:
+      batch_shape: the shape of the input tensor of serialized messages.
+      sizes: int matrix of repeat counts returned by decode_proto
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      field_dict: map from field names to decoded numpy tensors of values
+    """
+
+    # Check that expected values match.
+    for field in fields:
+      values = field_dict[field.name]
+      self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
+
+      fd = field.expected.DESCRIPTOR.fields_by_name[field.name]
+
+      # Values has the same shape as the input plus an extra
+      # dimension for repeats.
+      self.assertEqual(list(values.shape)[:-1], batch_shape)
+
+      # Nested messages are represented as TF strings, requiring
+      # some special handling.
+      if field.name == 'message_value':
+        vs = []
+        for buf in values.flat:
+          msg = test_example_pb2.PrimitiveValue()
+          msg.ParseFromString(buf)
+          vs.append(msg)
+        evs = getattr(field.expected, field.name)
+        if len(vs) != len(evs):
+          self.fail('Field %s decoded %d outputs, expected %d' %
+                    (fd.name, len(vs), len(evs)))
+        for v, ev in zip(vs, evs):
+          self.assertEqual(v, ev)
+        continue
+
+      # This can be a little confusing. For testing we are using
+      # RepeatedPrimitiveValue in two ways: it's the proto that we
+      # decode for testing, and it's used in the expected value as a
+      # union type. The two cases are slightly different: this is the
+      # second case.
+      # We may be fetching the uint64_value from the test proto, but
+      # in the expected proto we store it in the int64_value field
+      # because TensorFlow doesn't support unsigned int64.
+      tf_type_to_primitive_value_field = {
+          dtypes.float32:
+              'float_value',
+          dtypes.float64:
+              'double_value',
+          dtypes.int32:
+              'int32_value',
+          dtypes.uint8:
+              'uint8_value',
+          dtypes.int8:
+              'int8_value',
+          dtypes.string:
+              'string_value',
+          dtypes.int64:
+              'int64_value',
+          dtypes.bool:
+              'bool_value',
+          # Unhandled TensorFlow types:
+          # DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+          # DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+      }
+      tf_field_name = tf_type_to_primitive_value_field.get(field.dtype)
+      if tf_field_name is None:
+        self.fail('Unhandled tensorflow type %d' % field.dtype)
+
+      self._compareValues(fd, values.flat,
+                          getattr(field.expected, tf_field_name))
+
+  def _runDecodeProtoTests(self, fields, case_sizes, batch_shape, batch,
+                           message_type, message_format, sanitize,
+                           force_disordered=False):
+    """Run decode tests on a batch of messages.
+
+    Args:
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      case_sizes: expected sizes array
+      batch_shape: the shape of the input tensor of serialized messages
+      batch: list of serialized messages
+      message_type: descriptor name for messages
+      message_format: format of messages, 'text' or 'binary'
+      sanitize: whether to sanitize binary protobuf inputs
+      force_disordered: whether to force fields encoded out of order.
+    """
+
+    if force_disordered:
+      # Exercise code path that handles out-of-order fields by prepending extra
+      # fields with tag numbers higher than any real field. Note that this won't
+      # work with sanitization because that forces reserialization using a
+      # trusted decoder and encoder.
+      assert not sanitize
+      extra_fields = test_example_pb2.ExtraFields()
+      extra_fields.string_value = 'IGNORE ME'
+      extra_fields.bool_value = False
+      extra_msg = extra_fields.SerializeToString()
+      batch = [extra_msg + msg for msg in batch]
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(batch, dtype=object)
+    batch = np.reshape(batch, batch_shape)
+
+    field_names = [f.name for f in fields]
+    output_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, vtensor = proto.decode_proto(
+          batch,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=output_types,
+          message_format=message_format,
+          sanitize=sanitize)
+
+      vlist = sess.run([sizes] + vtensor)
+      sizes = vlist[0]
+      # Values is a list of tensors, one for each field.
+      value_tensors = vlist[1:]
+
+      # Check that the repeat sizes are correct.
+      self.assertTrue(
+          np.all(np.array(sizes.shape) == batch_shape + [len(field_names)]))
+
+      # Check that the decoded sizes match the expected sizes.
+      self.assertEqual(len(sizes.flat), len(case_sizes))
+      self.assertTrue(
+          np.all(sizes.flat == np.array(
+              case_sizes, dtype=np.int32)))
+
+      field_dict = dict(zip(field_names, value_tensors))
+
+      self._compareRepeatedPrimitiveValue(batch_shape, sizes, fields,
+                                          field_dict)
+
+  def testBinary(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testBinaryDisordered(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False,
+        force_disordered=True)
+
+  def testPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    packed_batch = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        packed_batch,
+        'tensorflow.contrib.proto.PackedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testText(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Note: float_format='.17g' is necessary to ensure preservation of
+    # doubles and floats in text format.
+    text_batch = [
+        text_format.MessageToString(
+            primitive, float_format='.17g') for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        text_batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'text',
+        sanitize=False)
+
+  def testSanitizerGood(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
new file mode 100644
index 0000000000..2a24c3b8ce
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -0,0 +1,179 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for encode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+
+It tests that encode_proto is a lossless inverse of decode_proto
+(for the specified fields).
+"""
+# Python3 readiness boilerplate
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib import proto
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class EncodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def testBadInputs(self):
+    # Invalid field name
+    with self.test_session():
+      with self.assertRaisesOpError('Unknown field: non_existent_field'):
+        proto.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['non_existent_field']).eval()
+
+    # Incorrect types.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Incompatible type for field double_value.'):
+        proto.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval()
+
+    # Incorrect shapes of sizes.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          r'sizes should be batch_size \+ \[len\(field_names\)\]'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values = array_ops.placeholder(dtypes.float64)
+        proto.encode_proto(
+            sizes=sizes,
+            values=[values],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval(feed_dict={
+                sizes: [[[0, 0]]],
+                values: [[0.0]]
+            })
+
+    # Inconsistent shapes of values.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Values must match up to the last dimension'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values1 = array_ops.placeholder(dtypes.float64)
+        values2 = array_ops.placeholder(dtypes.int32)
+        (proto.encode_proto(
+            sizes=[[1, 1]],
+            values=[values1, values2],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value', 'int32_value']).eval(feed_dict={
+                values1: [[0.0]],
+                values2: [[0], [0]]
+            }))
+
+  def _testRoundtrip(self, in_bufs, message_type, fields):
+
+    field_names = [f.name for f in fields]
+    out_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, field_tensors = proto.decode_proto(
+          in_bufs,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=out_types)
+
+      out_tensors = proto.encode_proto(
+          sizes,
+          field_tensors,
+          message_type=message_type,
+          field_names=field_names)
+
+      out_bufs, = sess.run([out_tensors])
+
+      # Check that the re-encoded tensor has the same shape.
+      self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+      # Compare the input and output.
+      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+        in_obj = test_example_pb2.RepeatedPrimitiveValue()
+        in_obj.ParseFromString(in_buf)
+
+        out_obj = test_example_pb2.RepeatedPrimitiveValue()
+        out_obj.ParseFromString(out_buf)
+
+        # Check that the deserialized objects are identical.
+        self.assertEqual(in_obj, out_obj)
+
+        # Check that the input and output serialized messages are identical.
+        # If we fail here, there is a difference in the serialized
+        # representation but the new serialization still parses. This could
+        # be harmless (a change in map ordering?) or it could be bad (e.g.
+        # loss of packing in the encoding).
+        self.assertEqual(in_buf, out_buf)
+
+  def testRoundtrip(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    in_bufs = [primitive.SerializeToString() for primitive in case.primitive]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.RepeatedPrimitiveValue', case.field)
+
+  def testRoundtripPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    in_bufs = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.PackedPrimitiveValue', case.field)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
new file mode 100644
index 0000000000..b170f89c0f
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
@@ -0,0 +1,161 @@
+primitive {
+  double_value: -1.7976931348623158e+308
+  double_value: 2.2250738585072014e-308
+  double_value: 1.7976931348623158e+308
+  float_value: -3.402823466e+38
+  float_value: 1.175494351e-38
+  float_value: 3.402823466e+38
+  int64_value: -9223372036854775808
+  int64_value: 9223372036854775807
+  uint64_value: 0
+  uint64_value: 18446744073709551615
+  int32_value: -2147483648
+  int32_value: 2147483647
+  fixed64_value: 0
+  fixed64_value: 18446744073709551615
+  fixed32_value: 0
+  fixed32_value: 4294967295
+  bool_value: false
+  bool_value: true
+  string_value: ""
+  string_value: "I refer to the infinite."
+  uint32_value: 0
+  uint32_value: 4294967295
+  sfixed32_value: -2147483648
+  sfixed32_value: 2147483647
+  sfixed64_value: -9223372036854775808
+  sfixed64_value: 9223372036854775807
+  sint32_value: -2147483648
+  sint32_value: 2147483647
+  sint64_value: -9223372036854775808
+  sint64_value: 9223372036854775807
+}
+shape: 1
+sizes: 3
+sizes: 3
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: -1.7976931348623158e+308
+    double_value: 2.2250738585072014e-308
+    double_value: 1.7976931348623158e+308
+  }
+}
+field {
+  name: "float_value"
+  dtype: DT_FLOAT
+  expected {
+    float_value: -3.402823466e+38
+    float_value: 1.175494351e-38
+    float_value: 3.402823466e+38
+  }
+}
+field {
+  name: "int64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "uint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1
+  }
+}
+field {
+  name: "int32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "fixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1  # unsigned is 18446744073709551615
+  }
+}
+field {
+  name: "fixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: false
+    bool_value: true
+  }
+}
+field {
+  name: "string_value"
+  dtype: DT_STRING
+  expected {
+    string_value: ""
+    string_value: "I refer to the infinite."
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "sfixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sfixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "sint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
new file mode 100644
index 0000000000..c664e52851
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
@@ -0,0 +1,16 @@
+primitive {
+  message_value {
+    double_value: 23.5
+  }
+}
+shape: 1
+sizes: 1
+field {
+  name: "message_value"
+  dtype: DT_STRING
+  expected {
+    message_value {
+      double_value: 23.5
+    }
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
new file mode 100644
index 0000000000..125651d7ea
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
@@ -0,0 +1,20 @@
+primitive {
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 0
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 0.0
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
new file mode 100644
index 0000000000..db7555bf2d
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  fixed32_value: 4294967295
+  uint32_value: 4294967295
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "fixed32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
new file mode 100644
index 0000000000..61c7ac53f7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
@@ -0,0 +1,32 @@
+primitive {
+  double_value: 23.5
+  double_value: 123.0
+  bool_value: true
+}
+primitive {
+  double_value: 3.1
+  bool_value: false
+}
+shape: 2
+sizes: 2
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 123.0
+    double_value: 3.1
+    double_value: 0.0
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
new file mode 100644
index 0000000000..f4828076d5
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
@@ -0,0 +1,62 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+primitive {
+  double_value: 44.0
+  bool_value: false
+}
+primitive {
+  double_value: 3.14159
+  bool_value: true
+}
+primitive {
+  double_value: 1.414
+  bool_value: true
+}
+primitive {
+  double_value: -32.2
+  bool_value: false
+}
+primitive {
+  double_value: 0.0001
+  bool_value: true
+}
+shape: 3
+shape: 2
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 44.0
+    double_value: 3.14159
+    double_value: 1.414
+    double_value: -32.2
+    double_value: 0.0001
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+    bool_value: true
+    bool_value: true
+    bool_value: false
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
new file mode 100644
index 0000000000..dc20ac147b
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_case.py b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
new file mode 100644
index 0000000000..b95202c5df
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
@@ -0,0 +1,35 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Test case base for testing proto operations."""
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+from tensorflow.python.platform import test
+
+
+class ProtoOpTestCase(test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(ProtoOpTestCase, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..dc495034ff
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -0,0 +1,149 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.proto;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 376644718f..a0bae23a7c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -74,6 +74,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
-- 
GitLab


From eed6828acf19260279b38a7fbaf79141c813f795 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 14:02:49 -0700
Subject: [PATCH 0613/1262] BREAKING_CHANGE: Remove event_ndims in Bijector,
 and require `log_det_jacobian` methods to take event_ndims.

The class level event_ndims parameter is being deprecated in favor of passing it in
to the `log_det_jacobian` methods.

Specific changes:

  - `log_det_jacobian` signatures are now `log_det_jacobian(input, event_ndims)`

  - Constructors no long have event_ndims passed in (e.g. Affine() vs. Affine(event_ndims=0)).

  - All bijectors must specify a subset of [forward_min_event_ndims, inverse_min_event_ndims]. This is the minimal dimensionality the bijector operates on, with it being "broadcasted" to any passed in event_ndims (e.g. Exp has forward_min_event_ndims = 0. That means it operates on scalars. However, we can use the bijector on any event_ndims > 0 (i.e. we've broadcasted the transformation to work on any amount of event_ndims > 0), and jacobian reduction will work in those cases.

As a result of this change, all bijectors should "broadcast" (e.g. Sigmoid now works on any number of event_ndims).

Other changes (internal and documentation):
  - Added clarifications on Jacobian Determinant vs. Jacobian Matrix.
  - Added clarifications on min_event_ndims, and what the jacobian reduction is over.
  - Changed caching of ildj to be keyed on event_ndims.
  - Several bug fixes to bugs unearthed while writing this code (e.g. transformed distribution shape computation being incorrect)

PiperOrigin-RevId: 192504919
---
 .../bijectors/absolute_value_test.py          |  35 +-
 .../bijectors/affine_linear_operator_test.py  |  30 +-
 .../bijectors/affine_scalar_test.py           |  65 +--
 .../kernel_tests/bijectors/affine_test.py     | 231 ++++++----
 .../bijectors/batch_normalization_test.py     |   5 +-
 .../kernel_tests/bijectors/chain_test.py      | 132 +++++-
 .../bijectors/cholesky_outer_product_test.py  |   9 +-
 .../bijectors/conditional_bijector_test.py    |  12 +-
 .../python/kernel_tests/bijectors/exp_test.py |  18 +-
 .../kernel_tests/bijectors/gumbel_test.py     |  16 +-
 .../kernel_tests/bijectors/inline_test.py     |  18 +-
 .../kernel_tests/bijectors/invert_test.py     |  12 +-
 .../bijectors/kumaraswamy_bijector_test.py    |  15 +-
 .../bijectors/masked_autoregressive_test.py   |   5 +-
 .../kernel_tests/bijectors/permute_test.py    |  11 +-
 .../bijectors/power_transform_test.py         |  17 +-
 .../kernel_tests/bijectors/real_nvp_test.py   |  12 +-
 .../kernel_tests/bijectors/reshape_test.py    |   7 +-
 .../kernel_tests/bijectors/sigmoid_test.py    |  16 +-
 .../bijectors/sinh_arcsinh_bijector_test.py   |  22 +-
 .../bijectors/softmax_centered_test.py        |  14 +-
 .../kernel_tests/bijectors/softplus_test.py   |  40 +-
 .../kernel_tests/bijectors/square_test.py     |   7 +-
 .../kernel_tests/bijectors/weibull_test.py    |  16 +-
 ...nditional_transformed_distribution_test.py |   3 +-
 .../python/kernel_tests/mvn_diag_test.py      |   2 +-
 .../transformed_distribution_test.py          | 121 ++++-
 .../kernel_tests/vector_laplace_diag_test.py  |   2 +-
 .../python/ops/bijectors/absolute_value.py    |  29 +-
 .../python/ops/bijectors/affine.py            |  10 +-
 .../ops/bijectors/affine_linear_operator.py   |  36 +-
 .../python/ops/bijectors/affine_scalar.py     |  13 +-
 .../ops/bijectors/batch_normalization.py      |   6 +-
 .../python/ops/bijectors/chain.py             | 157 ++++++-
 .../ops/bijectors/cholesky_outer_product.py   |   2 +-
 .../ops/bijectors/conditional_bijector.py     |  12 +-
 .../distributions/python/ops/bijectors/exp.py |  10 +-
 .../python/ops/bijectors/gumbel.py            |  15 +-
 .../python/ops/bijectors/inline.py            |  15 +-
 .../python/ops/bijectors/invert.py            |   3 +-
 .../python/ops/bijectors/kumaraswamy.py       |  27 +-
 .../ops/bijectors/masked_autoregressive.py    |   3 +-
 .../python/ops/bijectors/permute.py           |   8 +-
 .../python/ops/bijectors/power_transform.py   |  16 +-
 .../python/ops/bijectors/real_nvp.py          |   4 +-
 .../python/ops/bijectors/reshape.py           |   8 +-
 .../python/ops/bijectors/sigmoid.py           |   4 +-
 .../python/ops/bijectors/sinh_arcsinh.py      |  29 +-
 .../python/ops/bijectors/softmax_centered.py  |  12 +-
 .../python/ops/bijectors/softplus.py          |  11 +-
 .../python/ops/bijectors/square.py            |   2 +-
 .../python/ops/bijectors/weibull.py           |  17 +-
 .../conditional_transformed_distribution.py   |  21 +-
 .../python/ops/poisson_lognormal.py           |   2 +-
 .../python/ops/relaxed_onehot_categorical.py  |   2 +-
 .../distributions/python/ops/sinh_arcsinh.py  |   4 +-
 .../python/ops/vector_diffeomixture.py        |  10 +-
 .../python/ops/vector_sinh_arcsinh_diag.py    |   4 +-
 .../distributions/bijector_test.py            | 181 ++++++--
 .../distributions/identity_bijector_test.py   |  21 +-
 .../python/ops/distributions/bijector_impl.py | 429 +++++++++++++-----
 .../ops/distributions/bijector_test_util.py   |  23 +-
 .../python/ops/distributions/bijectors.py     |  31 --
 .../python/ops/distributions/distributions.py |   2 -
 .../ops/distributions/identity_bijector.py    |   8 +-
 .../distributions/transformed_distribution.py |  58 ++-
 ...ow.distributions.bijectors.-bijector.pbtxt |  65 ---
 ...ow.distributions.bijectors.-identity.pbtxt |  66 ---
 .../tensorflow.distributions.bijectors.pbtxt  |  11 -
 .../api/golden/tensorflow.distributions.pbtxt |   4 -
 70 files changed, 1412 insertions(+), 872 deletions(-)
 delete mode 100644 tensorflow/python/ops/distributions/bijectors.py
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
 delete mode 100644 tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
index e0d65c79b2..042c8ebd51 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
@@ -18,11 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 # pylint: disable=g-importing-member
 from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value import AbsoluteValue
-from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -35,50 +32,38 @@ class AbsoluteValueTest(test.TestCase):
 
   def testBijectorVersusNumpyRewriteOfBasicFunctionsEventNdims0(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       self.assertEqual("absolute_value", bijector.name)
       x = array_ops.constant([[0., 1., -1], [0., -5., 3.]])  # Shape [2, 3]
       y = math_ops.abs(x)
 
       y_ = y.eval()
-      zeros = np.zeros((2, 3))
 
       self.assertAllClose(y_, bijector.forward(x).eval())
       self.assertAllClose((-y_, y_), sess.run(bijector.inverse(y)))
-      self.assertAllClose((zeros, zeros),
-                          sess.run(bijector.inverse_log_det_jacobian(y)))
+      self.assertAllClose((0., 0.),
+                          sess.run(bijector.inverse_log_det_jacobian(
+                              y, event_ndims=0)))
 
       # Run things twice to make sure there are no issues in caching the tuples
       # returned by .inverse*
       self.assertAllClose(y_, bijector.forward(x).eval())
       self.assertAllClose((-y_, y_), sess.run(bijector.inverse(y)))
-      self.assertAllClose((zeros, zeros),
-                          sess.run(bijector.inverse_log_det_jacobian(y)))
-
-  def testEventNdimsMustBeZeroOrRaiseStatic(self):
-    with self.test_session():
-      with self.assertRaisesRegexp(ValueError, "event_ndims.*was not 0"):
-        AbsoluteValue(event_ndims=1)
-
-  def testEventNdimsMustBeZeroOrRaiseDynamic(self):
-    with self.test_session() as sess:
-      event_ndims = array_ops.placeholder(dtypes.int32)
-      abs_bijector = AbsoluteValue(event_ndims=event_ndims, validate_args=True)
-      with self.assertRaisesOpError("event_ndims was not 0"):
-        sess.run(abs_bijector.inverse_log_det_jacobian([1.]),
-                 feed_dict={event_ndims: 1})
+      self.assertAllClose((0., 0.),
+                          sess.run(bijector.inverse_log_det_jacobian(
+                              y, event_ndims=0)))
 
   def testNegativeYRaisesForInverseIfValidateArgs(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
         sess.run(bijector.inverse(-1.))
 
   def testNegativeYRaisesForILDJIfValidateArgs(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
-        sess.run(bijector.inverse_log_det_jacobian(-1.))
+        sess.run(bijector.inverse_log_det_jacobian(-1., event_ndims=0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
index 405ddd292c..1e4ad724d0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
@@ -38,9 +38,11 @@ class AffineLinearOperatorTest(test.TestCase):
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(
+          y, event_ndims=2).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
   def testDiag(self):
     with self.test_session():
@@ -58,14 +60,16 @@ class AffineLinearOperatorTest(test.TestCase):
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          ildj, affine.inverse_log_det_jacobian(y, event_ndims=1).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testTriL(self):
     with self.test_session():
       shift = np.array([-1, 0, 1], dtype=np.float32)
-      tril = np.array([[[1, 0, 0],
+      tril = np.array([[[3, 0, 0],
                         [2, -1, 0],
                         [3, 2, 1]],
                        [[2, 0, 0],
@@ -85,15 +89,17 @@ class AffineLinearOperatorTest(test.TestCase):
       # y = np.matmul(x, tril) + shift.
       y = np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
       ildj = -np.sum(np.log(np.abs(np.diagonal(
-          tril, axis1=-2, axis2=-1))),
-                     axis=-1)
+          tril, axis1=-2, axis2=-1))))
 
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          ildj, affine.inverse_log_det_jacobian(
+              y, event_ndims=2).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
index 16173a166f..d2533620be 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
@@ -40,13 +40,13 @@ class AffineScalarBijectorTest(test.TestCase):
   def testNoBatchScalar(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
         x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -55,19 +55,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = [1., 2, 3]  # Three scalar samples (no batches).
         self.assertAllClose([1., 3, 5], run(bijector.forward, x))
         self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose([-np.log(2.)] * 3,
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testOneBatchScalarViaIdentityIn64BitUserProvidesShiftOnly(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value).astype(np.float64)
         x = array_ops.placeholder(dtypes.float64, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = np.float64([1.])
@@ -77,18 +78,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = np.float64([1.])  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
         self.assertAllClose([0.], run(bijector.inverse, x))
-        self.assertAllClose([0.], run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0.,
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testOneBatchScalarViaIdentityIn64BitUserProvidesScaleOnly(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value).astype(np.float64)
         x = array_ops.placeholder(dtypes.float64, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         multiplier = np.float64([2.])
@@ -98,19 +101,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = np.float64([1.])  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
         self.assertAllClose([0.5], run(bijector.inverse, x))
-        self.assertAllClose([np.log(0.5)],
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            [np.log(0.5)],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testTwoBatchScalarIdentityViaIdentity(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float32)
         x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -120,18 +124,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = [1., 1]  # One sample from each of two batches.
         self.assertAllClose([2., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose([0., 0.], run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0.,
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testTwoBatchScalarIdentityViaScale(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float32)
         x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -142,7 +148,8 @@ class AffineScalarBijectorTest(test.TestCase):
         self.assertAllClose([3., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
         self.assertAllClose(
-            [-np.log(2), 0.], run(bijector.inverse_log_det_jacobian, x))
+            [-np.log(2), 0.],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index 077e6176b4..9e14b9a53e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -40,14 +40,15 @@ class AffineBijectorTest(test.TestCase):
 
   def testNoBatchMultivariateIdentity(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -66,18 +67,20 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 1], [-1., -1]]
         self.assertAllClose([[2., 0], [0., -2]], run(bijector.forward, x))
         self.assertAllClose([[0., 2], [-2., 0]], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0., run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateDiag(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -89,9 +92,12 @@ class AffineBijectorTest(test.TestCase):
         # = [-1, -1] + [1, -1]
         self.assertAllClose([3., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
+        # Reset bijector.
+        bijector = Affine(shift=mu, scale_diag=[2., 1])
         # x is a 2-batch of 2-vectors.
         # The first vector is [1, 1], the second is [-1, -1].
         # Each undergoes matmul(sigma, x) + shift.
@@ -103,8 +109,9 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([[0., 2],
                              [-1., 0]],
                             run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateFullDynamic(self):
     with self.test_session() as sess:
@@ -126,18 +133,20 @@ class AffineBijectorTest(test.TestCase):
       self.assertAllClose([[0., 1]], sess.run(bijector.inverse(x), feed_dict))
       self.assertAllClose(
           -np.log(4),
-          sess.run(bijector.inverse_log_det_jacobian(x), feed_dict))
+          sess.run(bijector.inverse_log_det_jacobian(x, event_ndims=1),
+                   feed_dict))
 
   def testBatchMultivariateIdentity(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value, dtype=np.float32)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [[1., -1]]
@@ -147,19 +156,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
         self.assertAllClose([[[0., 1]]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(4),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(4),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateDiag(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value, dtype=np.float32)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [[1., -1]]
@@ -169,8 +180,9 @@ class AffineBijectorTest(test.TestCase):
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
         self.assertAllClose([[[0., 1]]], run(bijector.inverse, x))
-        self.assertAllClose([-np.log(4)],
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            [-np.log(4)],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateFullDynamic(self):
     with self.test_session() as sess:
@@ -191,20 +203,22 @@ class AffineBijectorTest(test.TestCase):
       bijector = Affine(shift=mu, scale_diag=scale_diag)
       self.assertAllClose([[[3., 1]]], sess.run(bijector.forward(x), feed_dict))
       self.assertAllClose([[[0., 1]]], sess.run(bijector.inverse(x), feed_dict))
-      self.assertAllClose([-np.log(4)],
-                          sess.run(
-                              bijector.inverse_log_det_jacobian(x), feed_dict))
+      self.assertAllClose(
+          [-np.log(4)],
+          sess.run(bijector.inverse_log_det_jacobian(
+              x, event_ndims=1), feed_dict))
 
   def testIdentityWithDiagUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -216,19 +230,21 @@ class AffineBijectorTest(test.TestCase):
         x = [1., 2, 3]  # Three scalar samples (no batches).
         self.assertAllClose([1., 3, 5], run(bijector.forward, x))
         self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.**3),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.**3),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -240,19 +256,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[1., 5]], run(bijector.forward, x))
         self.assertAllClose([[1., 0.5]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(4.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(4.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -262,19 +280,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[1., 7]], run(bijector.forward, x))
         self.assertAllClose([[1., 1 / 3.]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(6.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(6.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityAndDiagWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -287,19 +307,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[2., 9]], run(bijector.forward, x))
         self.assertAllClose([[2 / 3., 5 / 12.]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(12.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(12.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -319,22 +341,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 1.5, 4 / 3.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(60.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(60.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -353,22 +377,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 1., 0.8], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(150.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(150.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -388,22 +414,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 14 / 15., 4 / 25.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(150.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(150.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdateNoDiagonal(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -423,11 +451,12 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([1 / 3., 8 / 9., 4 / 30.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(90.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(90.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateRaisesWhenSingular(self):
     with self.test_session():
@@ -530,6 +559,7 @@ class AffineBijectorTest(test.TestCase):
             backward = np.squeeze(backward, axis=-1)
           self.assertAllClose(backward, bijector.inverse(x).eval())
 
+          scale *= np.ones(shape=x.shape[:-1], dtype=scale.dtype)
           ildj = -np.log(np.abs(np.linalg.det(scale)))
           # TODO(jvdillon): We need to make it so the scale_identity_multiplier
           # case does not deviate in expected shape. Fixing this will get rid of
@@ -540,7 +570,8 @@ class AffineBijectorTest(test.TestCase):
             ildj = np.squeeze(ildj[0])
           elif ildj.ndim < scale.ndim - 2:
             ildj = np.reshape(ildj, scale.shape[0:-2])
-          self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(x).eval())
+          self.assertAllClose(
+              ildj, bijector.inverse_log_det_jacobian(x, event_ndims=1).eval())
 
   def testLegalInputs(self):
     self._testLegalInputs(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
index a215a4a2b1..c832fcaa68 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
@@ -83,10 +83,11 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
           moving_mean = array_ops.identity(batch_norm.batchnorm.moving_mean)
           moving_var = array_ops.identity(batch_norm.batchnorm.moving_variance)
           denorm_x = batch_norm.forward(array_ops.identity(norm_x))
-          fldj = batch_norm.forward_log_det_jacobian(x)
+          fldj = batch_norm.forward_log_det_jacobian(
+              x, event_ndims=len(event_dims))
           # Use identity to invalidate cache.
           ildj = batch_norm.inverse_log_det_jacobian(
-              array_ops.identity(denorm_x))
+              array_ops.identity(denorm_x), event_ndims=len(event_dims))
         variables.global_variables_initializer().run()
         # Update variables.
         norm_x_ = sess.run(norm_x)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index a748acd667..ca20442c39 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -20,21 +20,33 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import Chain
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
+class ShapeChanging(bijector.Bijector):
+  """Only used for op_ndims manipulation."""
+
+  def __init__(self, forward_min_event_ndims=0, inverse_min_event_ndims=3):
+    super(ShapeChanging, self).__init__(
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
+        validate_args=False, name="shape_changer")
+
+
 class ChainBijectorTest(test.TestCase):
   """Tests the correctness of the Y = Chain(bij1, bij2, bij3) transformation."""
 
   def testBijector(self):
     with self.test_session():
-      chain = Chain((Exp(event_ndims=1), Softplus(event_ndims=1)))
+      chain = Chain((Exp(), Softplus()))
       self.assertEqual("chain_of_exp_of_softplus", chain.name)
       x = np.asarray([[[1., 2.],
                        [2., 3.]]])
@@ -42,9 +54,10 @@ class ChainBijectorTest(test.TestCase):
       self.assertAllClose(np.log(x - 1.), chain.inverse(x).eval())
       self.assertAllClose(
           -np.sum(np.log(x - 1.), axis=2),
-          chain.inverse_log_det_jacobian(x).eval())
+          chain.inverse_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          np.sum(x, axis=2), chain.forward_log_det_jacobian(x).eval())
+          np.sum(x, axis=2),
+          chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testBijectorIdentity(self):
     with self.test_session():
@@ -54,31 +67,126 @@ class ChainBijectorTest(test.TestCase):
                        [2., 3.]]])
       self.assertAllClose(x, chain.forward(x).eval())
       self.assertAllClose(x, chain.inverse(x).eval())
-      self.assertAllClose(0., chain.inverse_log_det_jacobian(x).eval())
-      self.assertAllClose(0., chain.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          0., chain.inverse_log_det_jacobian(x, event_ndims=1).eval())
+      self.assertAllClose(
+          0., chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Chain((Exp(), Softplus()))
+      chain = Chain((Exp(), Softplus()))
       assert_scalar_congruency(
-          bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
+          chain, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = Chain([
+      chain = Chain([
           SoftmaxCentered(validate_args=True),
           SoftmaxCentered(validate_args=True),
       ])
       x = tensor_shape.TensorShape([1])
       y = tensor_shape.TensorShape([2 + 1])
-      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y, chain.forward_event_shape(x))
       self.assertAllEqual(
           y.as_list(),
-          bijector.forward_event_shape_tensor(x.as_list()).eval())
-      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+          chain.forward_event_shape_tensor(x.as_list()).eval())
+      self.assertAllEqual(x, chain.inverse_event_shape(y))
       self.assertAllEqual(
           x.as_list(),
-          bijector.inverse_event_shape_tensor(y.as_list()).eval())
+          chain.inverse_event_shape_tensor(y.as_list()).eval())
+
+  def testMinEventNdimsChain(self):
+    chain = Chain([Exp(), Exp(), Exp()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Affine(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Exp(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Exp()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Exp(), Softplus(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingAddDims(self):
+    chain = Chain([ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(3, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(4, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(3, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(), ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(6, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingRemoveDims(self):
+    chain = Chain([ShapeChanging(3, 0)])
+    self.assertEqual(3, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(3, 0), Affine()])
+    self.assertEqual(3, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), ShapeChanging(3, 0)])
+    self.assertEqual(4, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(3, 0), ShapeChanging(3, 0)])
+    self.assertEqual(6, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingAddRemoveDims(self):
+    chain = Chain([
+        ShapeChanging(2, 1),
+        ShapeChanging(3, 0),
+        ShapeChanging(1, 2)])
+    self.assertEqual(4, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+  def testChainExpAffine(self):
+    scale_diag = np.array([1., 2., 3.], dtype=np.float32)
+    chain = Chain([Exp(), Affine(scale_diag=scale_diag)])
+    x = [0., np.log(2., dtype=np.float32), np.log(3., dtype=np.float32)]
+    y = [1., 4., 27.]
+    self.assertAllClose(y, self.evaluate(chain.forward(x)))
+    self.assertAllClose(x, self.evaluate(chain.inverse(y)))
+    self.assertAllClose(
+        np.log(6, dtype=np.float32) + np.sum(scale_diag * x),
+        self.evaluate(chain.forward_log_det_jacobian(x, event_ndims=1)))
+
+    self.assertAllClose(
+        -np.log(6, dtype=np.float32) - np.sum(scale_diag * x),
+        self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
+
+  def testChainAffineExp(self):
+    scale_diag = np.array([1., 2., 3.], dtype=np.float32)
+    chain = Chain([Affine(scale_diag=scale_diag), Exp()])
+    x = [0., np.log(2., dtype=np.float32), np.log(3., dtype=np.float32)]
+    y = [1., 4., 9.]
+    self.assertAllClose(y, self.evaluate(chain.forward(x)))
+    self.assertAllClose(x, self.evaluate(chain.inverse(y)))
+    self.assertAllClose(
+        np.log(6, dtype=np.float32) + np.sum(x),
+        self.evaluate(chain.forward_log_det_jacobian(x, event_ndims=1)))
+
+    self.assertAllClose(
+        -np.log(6, dtype=np.float32) - np.sum(x),
+        self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
index f392e83d2c..e281e81bdf 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
@@ -51,10 +51,13 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          ildj, bijector.inverse_log_det_jacobian(y).eval(), atol=0., rtol=1e-7)
+          ildj, bijector.inverse_log_det_jacobian(
+              y, event_ndims=2).eval(), atol=0., rtol=1e-7)
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(
+              y, event_ndims=2).eval(),
+          bijector.forward_log_det_jacobian(
+              x, event_ndims=2).eval(),
           atol=0.,
           rtol=1e-7)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
index 26e0d2a539..8b279ebcd9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
@@ -27,7 +27,7 @@ class _TestBijector(ConditionalBijector):
 
   def __init__(self):
     super(_TestBijector, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         graph_parents=[],
         is_constant_jacobian=True,
         validate_args=False,
@@ -51,11 +51,15 @@ class ConditionalBijectorTest(test.TestCase):
 
   def testConditionalBijector(self):
     b = _TestBijector()
-    for name in ["forward", "inverse", "inverse_log_det_jacobian",
-                 "forward_log_det_jacobian"]:
+    for name in ["forward", "inverse"]:
       method = getattr(b, name)
       with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"):
-        method(1.0, arg1="b1", arg2="b2")
+        method(1., arg1="b1", arg2="b2")
+
+    for name in ["inverse_log_det_jacobian", "forward_log_det_jacobian"]:
+      method = getattr(b, name)
+      with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"):
+        method(1., event_ndims=0., arg1="b1", arg2="b2")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
index 9970c0b4d8..7be939cd27 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
@@ -31,17 +31,21 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      bijector = Exp(event_ndims=1)
+      bijector = Exp()
       self.assertEqual("exp", bijector.name)
       x = [[[1.], [2.]]]
       y = np.exp(x)
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          -np.sum(np.log(y), axis=-1),
-          bijector.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-bijector.inverse_log_det_jacobian(np.exp(x)).eval(),
-                          bijector.forward_log_det_jacobian(x).eval())
+          -np.squeeze(np.log(y), axis=-1),
+          bijector.inverse_log_det_jacobian(
+              y, event_ndims=1).eval())
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(
+              np.exp(x), event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(
+              x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -51,10 +55,10 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = Exp(event_ndims=0)
+      bijector = Exp()
       x = np.linspace(-10, 10, num=10).astype(np.float32)
       y = np.logspace(-10, 10, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
index 9a905980c7..54e54c3296 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
@@ -34,7 +34,7 @@ class GumbelBijectorTest(test.TestCase):
     with self.test_session():
       loc = 0.3
       scale = 5.
-      bijector = Gumbel(loc=loc, scale=scale, event_ndims=1, validate_args=True)
+      bijector = Gumbel(loc=loc, scale=scale, validate_args=True)
       self.assertEqual("gumbel", bijector.name)
       x = np.array([[[-3.], [0.], [0.5], [4.2], [12.]]], dtype=np.float32)
       # Gumbel distribution
@@ -43,13 +43,11 @@ class GumbelBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          np.squeeze(gumbel_dist.logpdf(x), axis=2),
-          bijector.forward_log_det_jacobian(x).eval())
+          np.squeeze(gumbel_dist.logpdf(x), axis=-1),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -60,10 +58,10 @@ class GumbelBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = Gumbel(loc=0., scale=3.0, event_ndims=0, validate_args=True)
+      bijector = Gumbel(loc=0., scale=3.0, validate_args=True)
       x = np.linspace(-10., 10., num=10).astype(np.float32)
       y = np.linspace(0.01, 0.99, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
index 739fa6d439..7d3bd758cd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
@@ -33,15 +33,13 @@ class InlineBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      exp = Exp(event_ndims=1)
+      exp = Exp()
       inline = Inline(
           forward_fn=math_ops.exp,
           inverse_fn=math_ops.log,
-          inverse_log_det_jacobian_fn=(
-              lambda y: -math_ops.reduce_sum(  # pylint: disable=g-long-lambda
-                  math_ops.log(y), reduction_indices=-1)),
-          forward_log_det_jacobian_fn=(
-              lambda x: math_ops.reduce_sum(x, reduction_indices=-1)),
+          inverse_log_det_jacobian_fn=lambda y: -math_ops.log(y),
+          forward_log_det_jacobian_fn=lambda x: x,
+          forward_min_event_ndims=0,
           name="exp")
 
       self.assertEqual(exp.name, inline.name)
@@ -51,9 +49,10 @@ class InlineBijectorTest(test.TestCase):
       self.assertAllClose(x, inline.inverse(y).eval())
       self.assertAllClose(
           -np.sum(np.log(y), axis=-1),
-          inline.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-inline.inverse_log_det_jacobian(y).eval(),
-                          inline.forward_log_det_jacobian(x).eval())
+          inline.inverse_log_det_jacobian(y, event_ndims=1).eval())
+      self.assertAllClose(
+          -inline.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          inline.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testShapeGetters(self):
     with self.test_session():
@@ -62,6 +61,7 @@ class InlineBijectorTest(test.TestCase):
           forward_event_shape_fn=lambda x: x.as_list() + [1],
           inverse_event_shape_tensor_fn=lambda x: x[:-1],
           inverse_event_shape_fn=lambda x: x[:-1],
+          forward_min_event_ndims=0,
           name="shape_only")
       x = tensor_shape.TensorShape([1, 2, 3])
       y = tensor_shape.TensorShape([1, 2, 3, 1])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 58ba9cedb1..8b14c8327f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -34,9 +34,9 @@ class InvertBijectorTest(test.TestCase):
     with self.test_session():
       for fwd in [
           bijectors.Identity(),
-          bijectors.Exp(event_ndims=1),
+          bijectors.Exp(),
           bijectors.Affine(shift=[0., 1.], scale_diag=[2., 3.]),
-          bijectors.Softplus(event_ndims=1),
+          bijectors.Softplus(),
           bijectors.SoftmaxCentered(),
       ]:
         rev = bijectors.Invert(fwd)
@@ -46,11 +46,11 @@ class InvertBijectorTest(test.TestCase):
         self.assertAllClose(fwd.inverse(x).eval(), rev.forward(x).eval())
         self.assertAllClose(fwd.forward(x).eval(), rev.inverse(x).eval())
         self.assertAllClose(
-            fwd.forward_log_det_jacobian(x).eval(),
-            rev.inverse_log_det_jacobian(x).eval())
+            fwd.forward_log_det_jacobian(x, event_ndims=1).eval(),
+            rev.inverse_log_det_jacobian(x, event_ndims=1).eval())
         self.assertAllClose(
-            fwd.inverse_log_det_jacobian(x).eval(),
-            rev.forward_log_det_jacobian(x).eval())
+            fwd.inverse_log_det_jacobian(x, event_ndims=1).eval(),
+            rev.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
index 074b5f275d..a8089881f6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -34,8 +34,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       a = 2.
       b = 0.3
       bijector = Kumaraswamy(
-          concentration1=a, concentration0=b,
-          event_ndims=0, validate_args=True)
+          concentration1=a, concentration0=b, validate_args=True)
       self.assertEqual("kumaraswamy", bijector.name)
       x = np.array([[[0.1], [0.2], [0.3], [0.4], [0.5]]], dtype=np.float32)
       # Kumaraswamy cdf. This is the same as inverse(x).
@@ -46,13 +45,11 @@ class KumaraswamyBijectorTest(test.TestCase):
                              (b - 1) * np.log1p(-x ** a))
 
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          kumaraswamy_log_pdf,
-          bijector.inverse_log_det_jacobian(x).eval())
+          np.squeeze(kumaraswamy_log_pdf, axis=-1),
+          bijector.inverse_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(x).eval(),
-          bijector.forward_log_det_jacobian(y).eval(),
+          -bijector.inverse_log_det_jacobian(x, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(y, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -73,7 +70,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       # endpoints.
       y = np.linspace(.01, 0.99, num=10).astype(np.float32)
       x = 1 - (1 - y ** concentration1) ** concentration0
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index dcfb0eb051..5ba5a2083b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -79,9 +79,10 @@ class MaskedAutoregressiveFlowTest(test_util.VectorDistributionTestHelpers,
       forward_x = ma.forward(x)
       # Use identity to invalidate cache.
       inverse_y = ma.inverse(array_ops.identity(forward_x))
-      fldj = ma.forward_log_det_jacobian(x)
+      fldj = ma.forward_log_det_jacobian(x, event_ndims=1)
       # Use identity to invalidate cache.
-      ildj = ma.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      ildj = ma.inverse_log_det_jacobian(
+          array_ops.identity(forward_x), event_ndims=1)
       variables.global_variables_initializer().run()
       [
           forward_x_,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
index 54590de373..7eef4ab599 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
@@ -53,8 +53,8 @@ class PermuteBijectorTest(test.TestCase):
           bijector.permutation,
           bijector.inverse(expected_y),
           bijector.forward(expected_x),
-          bijector.forward_log_det_jacobian(expected_x),
-          bijector.inverse_log_det_jacobian(expected_y),
+          bijector.forward_log_det_jacobian(expected_x, event_ndims=1),
+          bijector.inverse_log_det_jacobian(expected_y, event_ndims=1),
       ], feed_dict={permutation_ph: expected_permutation})
       self.assertEqual("permute", bijector.name)
       self.assertAllEqual(expected_permutation, permutation_)
@@ -78,10 +78,9 @@ class PermuteBijectorTest(test.TestCase):
     x = np.random.randn(4, 2, 3)
     y = x[..., permutation]
     with self.test_session():
-      bijector = Permute(
-          permutation=permutation,
-          validate_args=True)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
+      bijector = Permute(permutation=permutation, validate_args=True)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=1, rtol=1e-6, atol=0)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
index de1659aa9f..85d2283013 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
@@ -32,8 +32,7 @@ class PowerTransformBijectorTest(test.TestCase):
   def testBijector(self):
     with self.test_session():
       c = 0.2
-      bijector = PowerTransform(
-          power=c, event_ndims=1, validate_args=True)
+      bijector = PowerTransform(power=c, validate_args=True)
       self.assertEqual("power_transform", bijector.name)
       x = np.array([[[-1.], [2.], [-5. + 1e-4]]])
       y = (1. + x * c)**(1. / c)
@@ -41,27 +40,25 @@ class PowerTransformBijectorTest(test.TestCase):
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
           (c - 1.) * np.sum(np.log(y), axis=-1),
-          bijector.inverse_log_det_jacobian(y).eval())
+          bijector.inverse_log_det_jacobian(y, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = PowerTransform(
-          power=0.2, validate_args=True)
+      bijector = PowerTransform(power=0.2, validate_args=True)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = PowerTransform(
-          power=0.2, event_ndims=0, validate_args=True)
+      bijector = PowerTransform(power=0.2, validate_args=True)
       x = np.linspace(-4.999, 10, num=10).astype(np.float32)
       y = np.logspace(0.001, 10, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
index 46fe779741..2d52895fbe 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
@@ -52,24 +52,28 @@ class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
       forward_x = nvp.forward(x)
       # Use identity to invalidate cache.
       inverse_y = nvp.inverse(array_ops.identity(forward_x))
-      fldj = nvp.forward_log_det_jacobian(x)
+      forward_inverse_y = nvp.forward(inverse_y)
+      fldj = nvp.forward_log_det_jacobian(x, event_ndims=1)
       # Use identity to invalidate cache.
-      ildj = nvp.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      ildj = nvp.inverse_log_det_jacobian(
+          array_ops.identity(forward_x), event_ndims=1)
       variables.global_variables_initializer().run()
       [
           forward_x_,
           inverse_y_,
+          forward_inverse_y_,
           ildj_,
           fldj_,
       ] = sess.run([
           forward_x,
           inverse_y,
+          forward_inverse_y,
           ildj,
           fldj,
       ])
       self.assertEqual("real_nvp", nvp.name)
-      self.assertAllClose(forward_x_, forward_x_, rtol=1e-6, atol=0.)
-      self.assertAllClose(x_, inverse_y_, rtol=1e-5, atol=0.)
+      self.assertAllClose(forward_x_, forward_inverse_y_, rtol=1e-1, atol=0.)
+      self.assertAllClose(x_, inverse_y_, rtol=1e-1, atol=0.)
       self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=0.)
 
   def testMutuallyConsistent(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index e216d88cb1..46f2c63f9b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -65,8 +65,8 @@ class _ReshapeBijectorTest(object):
        ildj_) = sess.run((
            bijector.inverse(expected_y),
            bijector.forward(expected_x),
-           bijector.forward_log_det_jacobian(expected_x),
-           bijector.inverse_log_det_jacobian(expected_y),
+           bijector.forward_log_det_jacobian(expected_x, event_ndims=2),
+           bijector.inverse_log_det_jacobian(expected_y, event_ndims=2),
        ), feed_dict=feed_dict)
       self.assertEqual("reshape", bijector.name)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
@@ -301,7 +301,8 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
           event_shape_in=[2, 3],
           event_shape_out=[1, 2, 3],
           validate_args=True)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=2, rtol=1e-6, atol=0)
 
   def testInvalidDimensionsOpError(self):
     if ops._USE_C_API:
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
index e4f9d72785..cea4a62c22 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
@@ -36,12 +36,13 @@ class SigmoidBijectorTest(test.TestCase):
       x = np.linspace(-10., 10., 100).reshape([2, 5, 10]).astype(np.float32)
       y = special.expit(x)
       ildj = -np.log(y) - np.log1p(-y)
-      self.assertAllClose(y, Sigmoid().forward(x).eval(), atol=0., rtol=1e-2)
-      self.assertAllClose(x, Sigmoid().inverse(y).eval(), atol=0., rtol=1e-4)
-      self.assertAllClose(ildj, Sigmoid().inverse_log_det_jacobian(y).eval(),
-                          atol=0., rtol=1e-6)
-      self.assertAllClose(-ildj, Sigmoid().forward_log_det_jacobian(x).eval(),
-                          atol=0., rtol=1e-4)
+      bijector = Sigmoid()
+      self.assertAllClose(y, bijector.forward(x).eval(), atol=0., rtol=1e-2)
+      self.assertAllClose(x, bijector.inverse(y).eval(), atol=0., rtol=1e-4)
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=0).eval(), atol=0., rtol=1e-6)
+      self.assertAllClose(-ildj, bijector.forward_log_det_jacobian(
+          x, event_ndims=0).eval(), atol=0., rtol=1e-4)
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -52,7 +53,8 @@ class SigmoidBijectorTest(test.TestCase):
       x = np.linspace(-7., 7., 100).astype(np.float32)
       eps = 1e-3
       y = np.linspace(eps, 1. - eps, 100).astype(np.float32)
-      assert_bijective_and_finite(Sigmoid(), x, y, atol=0., rtol=1e-4)
+      assert_bijective_and_finite(
+          Sigmoid(), x, y, event_ndims=0, atol=0., rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
index 172c180a44..45760a29ee 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
@@ -39,7 +39,6 @@ class SinhArcsinhBijectorTest(test.TestCase):
       bijector = SinhArcsinh(
           skewness=skewness,
           tailweight=tailweight,
-          event_ndims=1,
           validate_args=True)
       self.assertEqual("SinhArcsinh", bijector.name)
       x = np.array([[[-2.01], [2.], [1e-4]]]).astype(np.float32)
@@ -50,10 +49,11 @@ class SinhArcsinhBijectorTest(test.TestCase):
           np.sum(
               np.log(np.cosh(np.arcsinh(y) / tailweight - skewness)) -
               np.log(tailweight) - np.log(np.sqrt(y**2 + 1)),
-              axis=-1), bijector.inverse_log_det_jacobian(y).eval())
+              axis=-1),
+          bijector.inverse_log_det_jacobian(y, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -106,14 +106,15 @@ class SinhArcsinhBijectorTest(test.TestCase):
       bijector = SinhArcsinh(skewness=-1., tailweight=0.5, validate_args=True)
       x = np.concatenate((-np.logspace(-2, 10, 1000), [0], np.logspace(
           -2, 10, 1000))).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectiveAndFiniteSkewness1Tailweight3(self):
     with self.test_session():
       bijector = SinhArcsinh(skewness=1., tailweight=3., validate_args=True)
       x = np.concatenate((-np.logspace(-2, 5, 1000), [0], np.logspace(
           -2, 5, 1000))).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+      assert_bijective_and_finite(
+          bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectorEndpoints(self):
     with self.test_session():
@@ -124,7 +125,8 @@ class SinhArcsinhBijectorTest(test.TestCase):
             [np.finfo(dtype).min, np.finfo(dtype).max], dtype=dtype)
         # Note that the above bijector is the identity bijector. Hence, the
         # log_det_jacobian will be 0. Because of this we use atol.
-        assert_bijective_and_finite(bijector, bounds, bounds, atol=2e-6)
+        assert_bijective_and_finite(
+            bijector, bounds, bounds, event_ndims=0, atol=2e-6)
 
   def testBijectorOverRange(self):
     with self.test_session():
@@ -156,12 +158,12 @@ class SinhArcsinhBijectorTest(test.TestCase):
                 np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
                     y_float128**2 + 1)) -
             np.log(tailweight),
-            bijector.inverse_log_det_jacobian(y).eval(),
+            bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
             rtol=1e-4,
             atol=0.)
         self.assertAllClose(
-            -bijector.inverse_log_det_jacobian(y).eval(),
-            bijector.forward_log_det_jacobian(x).eval(),
+            -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+            bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
             rtol=1e-4,
             atol=0.)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index cad4dd1ac8..0f0a2fa531 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -44,12 +44,12 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
       self.assertAllClose(x, softmax.inverse(y).eval())
       self.assertAllClose(
           -np.sum(np.log(y), axis=1),
-          softmax.inverse_log_det_jacobian(y).eval(),
+          softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(),
-          softmax.forward_log_det_jacobian(x).eval(),
+          -softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          softmax.forward_log_det_jacobian(x, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
 
@@ -67,14 +67,14 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
           feed_dict={y: real_y}))
       self.assertAllClose(
           -np.sum(np.log(real_y), axis=1),
-          softmax.inverse_log_det_jacobian(y).eval(
+          softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(
               feed_dict={y: real_y}),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(
+          -softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(
               feed_dict={y: real_y}),
-          softmax.forward_log_det_jacobian(x).eval(
+          softmax.forward_log_det_jacobian(x, event_ndims=1).eval(
               feed_dict={x: real_x}),
           atol=0.,
           rtol=1e-7)
@@ -104,7 +104,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
       y = np.array([y_0, y_1, y_2])
       y /= y.sum(axis=0)
       y = y.T  # y.shape = [5, 3]
-      assert_bijective_and_finite(softmax, x, y)
+      assert_bijective_and_finite(softmax, x, y, event_ndims=1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
index d9af9aec50..3d8a0a32bb 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
@@ -43,13 +43,13 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testHingeSoftnessZeroRaises(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=0., validate_args=True)
+      bijector = Softplus(hinge_softness=0., validate_args=True)
       with self.assertRaisesOpError("must be non-zero"):
         bijector.forward([1., 1.]).eval()
 
   def testBijectorForwardInverseEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -59,7 +59,7 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorForwardInverseWithHingeSoftnessEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.5)
+      bijector = Softplus(hinge_softness=1.5)
       x = 2 * rng.randn(2, 10)
       y = 1.5 * self._softplus(x / 1.5)
 
@@ -68,16 +68,17 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       # No reduction needed if event_dims = 0.
       ildj = self._softplus_ildj_before_reduction(y)
 
-      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=0).eval())
 
   def testBijectorForwardInverseEventDimsOne(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=1)
+      bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -87,58 +88,59 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsOne(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=1)
+      bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       ildj_before = self._softplus_ildj_before_reduction(y)
       ildj = np.sum(ildj_before, axis=1)
 
-      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithPositiveHingeSoftness(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.3)
+      bijector = Softplus(hinge_softness=1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithNegativeHingeSoftness(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=-1.3)
+      bijector = Softplus(hinge_softness=-1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testBijectiveAndFinite32bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithPositiveHingeSoftness32Bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.23)
+      bijector = Softplus(hinge_softness=1.23)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithNegativeHingeSoftness32Bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=-0.7)
+      bijector = Softplus(hinge_softness=-0.7)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = -np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFinite16bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       # softplus(-20) is zero, so we can't use such a large range as in 32bit.
       x = np.linspace(-10., 20., 100).astype(np.float16)
       # Note that float16 is only in the open set (0, inf) for a smaller
@@ -146,7 +148,7 @@ class SoftplusBijectorTest(test.TestCase):
       # for the test.
       y = np.logspace(-6, 3, 100).astype(np.float16)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-1, atol=1e-3)
+          bijector, x, y, event_ndims=0, rtol=1e-1, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
index f03d6f1343..30c7a738c3 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
@@ -41,10 +41,11 @@ class SquareBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          ildj, bijector.inverse_log_det_jacobian(y).eval(), atol=0., rtol=1e-7)
+          ildj, bijector.inverse_log_det_jacobian(
+              y, event_ndims=0).eval(), atol=0., rtol=1e-7)
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
           atol=0.,
           rtol=1e-7)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
index 7a31228d1a..f57adcda89 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
@@ -36,7 +36,7 @@ class WeibullBijectorTest(test.TestCase):
       concentration = 0.3
       bijector = Weibull(
           scale=scale, concentration=concentration,
-          event_ndims=1, validate_args=True)
+          validate_args=True)
       self.assertEqual("weibull", bijector.name)
       x = np.array([[[0.], [1.], [14.], [20.], [100.]]], dtype=np.float32)
       # Weibull distribution
@@ -45,13 +45,11 @@ class WeibullBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          np.squeeze(weibull_dist.logpdf(x), axis=2),
-          bijector.forward_log_det_jacobian(x).eval())
+          weibull_dist.logpdf(x),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -64,12 +62,12 @@ class WeibullBijectorTest(test.TestCase):
   def testBijectiveAndFinite(self):
     with self.test_session():
       bijector = Weibull(
-          scale=20., concentration=2., event_ndims=0, validate_args=True)
+          scale=20., concentration=2., validate_args=True)
       x = np.linspace(1., 8., num=10).astype(np.float32)
       y = np.linspace(
           -np.expm1(-1 / 400.),
           -np.expm1(-16), num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
index 545471907f..4e8989b6c2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
@@ -44,6 +44,7 @@ class _ChooseLocation(ConditionalBijector):
           graph_parents=[self._loc],
           is_constant_jacobian=True,
           validate_args=False,
+          forward_min_event_ndims=0,
           name=name)
 
   def _forward(self, x, z):
@@ -52,7 +53,7 @@ class _ChooseLocation(ConditionalBijector):
   def _inverse(self, x, z):
     return x - self._gather_loc(z)
 
-  def _inverse_log_det_jacobian(self, x, z=None):
+  def _inverse_log_det_jacobian(self, x, event_ndims, z=None):
     return 0.
 
   def _gather_loc(self, z):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index 933756aa8e..9635134b08 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -68,7 +68,7 @@ class MultivariateNormalDiagTest(test.TestCase):
       dist = ds.TransformedDistribution(
           base_dist,
           validate_args=True,
-          bijector=bijectors.Softplus(event_ndims=1))
+          bijector=bijectors.Softplus())
       samps = dist.sample(5)  # Shape [5, 1, 3].
       self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index f0ba1ec3eb..5fe1331d2c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -36,6 +37,35 @@ ds = distributions
 la = linalg
 
 
+class DummyMatrixTransform(bs.Bijector):
+  """Tractable matrix transformation.
+
+  This is a non-sensical bijector that has forward/inverse_min_event_ndims=2.
+  The main use is to check that transformed distribution calculations are done
+  appropriately.
+  """
+
+  def __init__(self):
+    super(DummyMatrixTransform, self).__init__(
+        forward_min_event_ndims=2,
+        is_constant_jacobian=False,
+        validate_args=False,
+        name="dummy")
+
+  def _forward(self, x):
+    return x
+
+  def _inverse(self, y):
+    return y
+
+  # Note: These jacobians don't make sense.
+  def _forward_log_det_jacobian(self, x):
+    return -linalg_ops.matrix_determinant(x)
+
+  def _inverse_log_det_jacobian(self, x):
+    return linalg_ops.matrix_determinant(x)
+
+
 class TransformedDistributionTest(test.TestCase):
 
   def _cls(self):
@@ -55,7 +85,7 @@ class TransformedDistributionTest(test.TestCase):
       # you may or may not need a reduce_sum.
       log_normal = self._cls()(
           distribution=ds.Normal(loc=mu, scale=sigma),
-          bijector=bs.Exp(event_ndims=0))
+          bijector=bs.Exp())
       sp_dist = stats.lognorm(s=sigma, scale=np.exp(mu))
 
       # sample
@@ -87,7 +117,7 @@ class TransformedDistributionTest(test.TestCase):
       sigma = 2.0
       abs_normal = self._cls()(
           distribution=ds.Normal(loc=mu, scale=sigma),
-          bijector=bs.AbsoluteValue(event_ndims=0))
+          bijector=bs.AbsoluteValue())
       sp_normal = stats.norm(mu, sigma)
 
       # sample
@@ -129,7 +159,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(grid, cdf_, rtol=1e-6, atol=0.)
 
   def testCachedSamples(self):
-    exp_forward_only = bs.Exp(event_ndims=0)
+    exp_forward_only = bs.Exp()
     exp_forward_only._inverse = self._make_unimplemented(
         "inverse")
     exp_forward_only._inverse_event_shape_tensor = self._make_unimplemented(
@@ -153,7 +183,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(expected_log_pdf, log_pdf_val, rtol=1e-4, atol=0.)
 
   def testCachedSamplesInvert(self):
-    exp_inverse_only = bs.Exp(event_ndims=0)
+    exp_inverse_only = bs.Exp()
     exp_inverse_only._forward = self._make_unimplemented(
         "forward")
     exp_inverse_only._forward_event_shape_tensor = self._make_unimplemented(
@@ -210,8 +240,11 @@ class TransformedDistributionTest(test.TestCase):
       int_identity = bs.Inline(
           forward_fn=array_ops.identity,
           inverse_fn=array_ops.identity,
-          inverse_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
-          forward_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          inverse_log_det_jacobian_fn=(
+              lambda y: math_ops.cast(0, dtypes.int32)),
+          forward_log_det_jacobian_fn=(
+              lambda x: math_ops.cast(0, dtypes.int32)),
+          forward_min_event_ndims=0,
           is_constant_jacobian=True)
       normal = self._cls()(
           distribution=ds.Normal(loc=0., scale=1.),
@@ -435,6 +468,82 @@ class ScalarToMultiTest(test.TestCase):
             event_shape=[3],
             validate_args=True)
 
+  def testMatrixEvent(self):
+    with self.test_session() as sess:
+      batch_shape = [2]
+      event_shape = [2, 3, 3]
+      batch_shape_pl = array_ops.placeholder(
+          dtypes.int32, name="dynamic_batch_shape")
+      event_shape_pl = array_ops.placeholder(
+          dtypes.int32, name="dynamic_event_shape")
+      feed_dict = {batch_shape_pl: np.array(batch_shape, dtype=np.int32),
+                   event_shape_pl: np.array(event_shape, dtype=np.int32)}
+
+      scale = 2.
+      loc = 0.
+      fake_mvn_dynamic = self._cls()(
+          distribution=ds.Normal(
+              loc=loc,
+              scale=scale),
+          bijector=DummyMatrixTransform(),
+          batch_shape=batch_shape_pl,
+          event_shape=event_shape_pl,
+          validate_args=True)
+
+      fake_mvn_static = self._cls()(
+          distribution=ds.Normal(
+              loc=loc,
+              scale=scale),
+          bijector=DummyMatrixTransform(),
+          batch_shape=batch_shape,
+          event_shape=event_shape,
+          validate_args=True)
+
+      def actual_mvn_log_prob(x):
+        # This distribution is the normal PDF, reduced over the
+        # last 3 dimensions + a jacobian term which corresponds
+        # to the determinant of x.
+        return (np.sum(
+            stats.norm(loc, scale).logpdf(x), axis=(-1, -2, -3)) +
+                np.sum(np.linalg.det(x), axis=-1))
+
+      self.assertAllEqual([2, 3, 3], fake_mvn_static.event_shape)
+      self.assertAllEqual([2], fake_mvn_static.batch_shape)
+
+      self.assertAllEqual(tensor_shape.TensorShape(None),
+                          fake_mvn_dynamic.event_shape)
+      self.assertAllEqual(tensor_shape.TensorShape(None),
+                          fake_mvn_dynamic.batch_shape)
+
+      num_samples = 5e3
+      for fake_mvn, feed_dict in ((fake_mvn_static, {}),
+                                  (fake_mvn_dynamic, feed_dict)):
+        # Ensure sample works by checking first, second moments.
+        y = fake_mvn.sample(int(num_samples), seed=0)
+        x = y[0:5, ...]
+        [
+            x_,
+            fake_event_shape_,
+            fake_batch_shape_,
+            fake_log_prob_,
+            fake_prob_,
+        ] = sess.run([
+            x,
+            fake_mvn.event_shape_tensor(),
+            fake_mvn.batch_shape_tensor(),
+            fake_mvn.log_prob(x),
+            fake_mvn.prob(x),
+        ], feed_dict=feed_dict)
+
+        # Ensure all other functions work as intended.
+        self.assertAllEqual([5, 2, 2, 3, 3], x_.shape)
+        self.assertAllEqual([2, 3, 3], fake_event_shape_)
+        self.assertAllEqual([2], fake_batch_shape_)
+        self.assertAllClose(actual_mvn_log_prob(x_), fake_log_prob_,
+                            atol=0., rtol=1e-6)
+        self.assertAllClose(np.exp(actual_mvn_log_prob(x_)), fake_prob_,
+                            atol=0., rtol=1e-5)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
index c355adeedb..1226c66113 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
@@ -61,7 +61,7 @@ class VectorLaplaceDiagTest(test.TestCase):
       dist = ds.TransformedDistribution(
           base_dist,
           validate_args=True,
-          bijector=bijectors.Softplus(event_ndims=1))
+          bijector=bijectors.Softplus())
       samps = dist.sample(5)  # Shape [5, 1, 3].
       self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
index 0fe9f6aa78..c9e31d7712 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
@@ -18,9 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -72,38 +70,22 @@ class AbsoluteValue(bijector.Bijector):
 
   """
 
-  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
+  def __init__(self, validate_args=False, name="absolute_value"):
     """Instantiates the `AbsoluteValue` bijector.
 
     Args:
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.  Currently only zero is
-        supported.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness, in particular whether inputs to `inverse` and
         `inverse_log_det_jacobian` are non-negative.
       name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
     """
     self._graph_parents = []
     self._name = name
 
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
     with self._name_scope("init"):
       super(AbsoluteValue, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
           validate_args=validate_args,
           name=name)
 
@@ -121,8 +103,7 @@ class AbsoluteValue(bijector.Bijector):
     # If event_ndims = 2,
     # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
     # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
-    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
-    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
+    zeros = constant_op.constant(0., dtype=y.dtype)
     if self.validate_args:
       zeros = control_flow_ops.with_dependencies(
           [check_ops.assert_non_negative(y, message="Argument y was negative")],
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index bef7bbb49b..b4c2939eb9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -184,6 +184,7 @@ class Affine(bijector.Bijector):
     with self._name_scope("init", values=[
         shift, scale_identity_multiplier, scale_diag, scale_tril,
         scale_perturb_diag, scale_perturb_factor]):
+
       # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
       dtype = dtypes.float32
 
@@ -234,7 +235,7 @@ class Affine(bijector.Bijector):
           event_ndims=1,
           validate_args=validate_args)
       super(Affine, self).__init__(
-          event_ndims=1,
+          forward_min_event_ndims=1,
           graph_parents=(
               [self._scale] if tensor_util.is_tensor(self._scale)
               else self._scale.graph_parents +
@@ -360,16 +361,17 @@ class Affine(bijector.Bijector):
         x, sample_shape, expand_batch_dim=False)
     return x
 
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
   def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self._is_only_identity_multiplier:
       # We don't pad in this case and instead let the fldj be applied
       # via broadcast.
       event_size = array_ops.shape(x)[-1]
       event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
       return math_ops.log(math_ops.abs(self._scale)) * event_size
+
     return self.scale.log_abs_determinant()
 
   def _maybe_check_scale(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
index 89043b1410..59f9742d57 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
@@ -22,9 +22,6 @@ from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.linalg import linear_operator
 
@@ -94,7 +91,6 @@ class AffineLinearOperator(bijector.Bijector):
   def __init__(self,
                shift=None,
                scale=None,
-               event_ndims=1,
                validate_args=False,
                name="affine_linear_operator"):
     """Instantiates the `AffineLinearOperator` bijector.
@@ -103,14 +99,11 @@ class AffineLinearOperator(bijector.Bijector):
       shift: Floating-point `Tensor`.
       scale:  Subclass of `LinearOperator`. Represents the (batch) positive
         definite matrix `M` in `R^{k x k}`.
-      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
 
     Raises:
-      ValueError: if `event_ndims` is not 0 or 1.
       TypeError: if `scale` is not a `LinearOperator`.
       TypeError: if `shift.dtype` does not match `scale.dtype`.
       ValueError: if not `scale.is_non_singular`.
@@ -120,20 +113,6 @@ class AffineLinearOperator(bijector.Bijector):
     self._validate_args = validate_args
     graph_parents = []
     with self._name_scope("init", values=[shift]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if tensor_util.constant_value(event_ndims) is not None:
-        event_ndims = tensor_util.constant_value(event_ndims)
-        if event_ndims not in (0, 1):
-          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-        graph_parents += [event_ndims]
-
       # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
       dtype = dtypes.float32
 
@@ -166,10 +145,10 @@ class AffineLinearOperator(bijector.Bijector):
       self._scale = scale
       self._shaper = _DistributionShape(
           batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
+          event_ndims=1,
           validate_args=validate_args)
       super(AffineLinearOperator, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=1,
           graph_parents=graph_parents,
           is_constant_jacobian=True,
           dtype=dtype,
@@ -213,12 +192,13 @@ class AffineLinearOperator(bijector.Bijector):
           x, sample_shape, expand_batch_dim=False)
     return x
 
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
+  def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self.scale is None:
-      return constant_op.constant(0, dtype=x.dtype.base_dtype)
+      return constant_op.constant(0., dtype=x.dtype.base_dtype)
+
     with ops.control_dependencies(self._maybe_collect_assertions() if
                                   self.validate_args else []):
       return self.scale.log_abs_determinant()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
index 8adaa54c84..cd792e2c8c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -99,7 +100,7 @@ class AffineScalar(bijector.Bijector):
               self._scale)
 
       super(AffineScalar, self).__init__(
-          event_ndims=0,
+          forward_min_event_ndims=0,
           is_constant_jacobian=True,
           validate_args=validate_args,
           name=name)
@@ -131,8 +132,10 @@ class AffineScalar(bijector.Bijector):
     return x
 
   def _forward_log_det_jacobian(self, x):
-    log_det_jacobian = array_ops.zeros_like(x)
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self.scale is None:
-      return log_det_jacobian
-    log_det_jacobian += math_ops.log(math_ops.abs(self.scale))
-    return log_det_jacobian
+      return constant_op.constant(0., dtype=x.dtype.base_dtype)
+
+    return math_ops.log(math_ops.abs(self.scale))
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
index 33fdd32d7a..224cec8a63 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
@@ -157,7 +157,12 @@ class BatchNormalization(bijector.Bijector):
         gamma_constraint=g_constraint)
     self._validate_bn_layer(self.batchnorm)
     self._training = training
+    if isinstance(self.batchnorm.axis, int):
+      forward_min_event_ndims = 1
+    else:
+      forward_min_event_ndims = len(self.batchnorm.axis)
     super(BatchNormalization, self).__init__(
+        forward_min_event_ndims=forward_min_event_ndims,
         validate_args=validate_args, name=name)
 
   def _validate_bn_layer(self, layer):
@@ -186,7 +191,6 @@ class BatchNormalization(bijector.Bijector):
     input_shape = np.int32(x.shape.as_list())
 
     ndims = len(input_shape)
-    # event_dims = self._compute_event_dims(x)
     reduction_axes = [i for i in range(ndims) if i not in self.batchnorm.axis]
     # Broadcasting only necessary for single-axis batch norm where the axis is
     # not the last dimension
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 3ce7c26213..85ad23e413 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -21,6 +21,9 @@ from __future__ import print_function
 import itertools
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import bijector
 
 
@@ -29,6 +32,91 @@ __all__ = [
 ]
 
 
+def _use_static_shape(input_tensor, ndims):
+  return input_tensor.shape.is_fully_defined() and isinstance(ndims, int)
+
+
+def _maybe_get_event_ndims_statically(event_ndims):
+  static_event_ndims = (event_ndims if isinstance(event_ndims, int)
+                        else tensor_util.constant_value(event_ndims))
+  if static_event_ndims is not None:
+    return static_event_ndims
+
+  return event_ndims
+
+
+def _compute_min_event_ndims(bijector_list, compute_forward=True):
+  """Computes the min_event_ndims associated with the give list of bijectors.
+
+  Given a list `bijector_list` of bijectors, compute the min_event_ndims that is
+  associated with the composition of bijectors in that list.
+
+  min_event_ndims is the # of right most dimensions for which the bijector has
+  done necessary computation on (i.e. the non-broadcastable part of the
+  computation).
+
+  We can derive the min_event_ndims for a chain of bijectors as follows:
+
+  In the case where there are no rank changing bijectors, this will simply be
+  `max(b.forward_min_event_ndims for b in bijector_list)`. This is because the
+  bijector with the most forward_min_event_ndims requires the most dimensions,
+  and hence the chain also requires operating on those dimensions.
+
+  However in the case of rank changing, more care is needed in determining the
+  exact amount of dimensions. Padding dimensions causes subsequent bijectors to
+  operate on the padded dimensions, and Removing dimensions causes bijectors to
+  operate more left.
+
+  Args:
+    bijector_list: List of bijectors to be composed by chain.
+    compute_forward: Boolean. If True, computes the min_event_ndims associated
+      with a forward call to Chain, and otherwise computes the min_event_ndims
+      associated with an inverse call to Chain. The latter is the same as the
+      min_event_ndims associated with a forward call to Invert(Chain(....)).
+
+  Returns:
+    min_event_ndims
+  """
+  min_event_ndims = 0
+  # This is a mouthful, but what this encapsulates is that if not for rank
+  # changing bijectors, we'd only need to compute the largest of the min
+  # required ndims. Hence "max_min". Due to rank changing bijectors, we need to
+  # account for synthetic rank growth / synthetic rank decrease from a rank
+  # changing bijector.
+  rank_changed_adjusted_max_min_event_ndims = 0
+
+  if compute_forward:
+    bijector_list = reversed(bijector_list)
+
+  for b in bijector_list:
+    if compute_forward:
+      current_min_event_ndims = b.forward_min_event_ndims
+      current_inverse_min_event_ndims = b.inverse_min_event_ndims
+    else:
+      current_min_event_ndims = b.inverse_min_event_ndims
+      current_inverse_min_event_ndims = b.forward_min_event_ndims
+
+    # New dimensions were touched.
+    if rank_changed_adjusted_max_min_event_ndims < current_min_event_ndims:
+      min_event_ndims += (
+          current_min_event_ndims - rank_changed_adjusted_max_min_event_ndims)
+    rank_changed_adjusted_max_min_event_ndims = max(
+        current_min_event_ndims, rank_changed_adjusted_max_min_event_ndims)
+
+    # If the number of dimensions has increased via forward, then
+    # inverse_min_event_ndims > forward_min_event_ndims, and hence the
+    # dimensions we computed on, have moved left (so we have operated
+    # on additional dimensions).
+    # Conversely, if the number of dimensions has decreased via forward,
+    # then we have inverse_min_event_ndims < forward_min_event_ndims,
+    # and so we will have operated on fewer right most dimensions.
+
+    number_of_changed_dimensions = (
+        current_min_event_ndims - current_inverse_min_event_ndims)
+    rank_changed_adjusted_max_min_event_ndims -= number_of_changed_dimensions
+  return min_event_ndims
+
+
 class Chain(bijector.Bijector):
   """Bijector which applies a sequence of bijectors.
 
@@ -93,21 +181,24 @@ class Chain(bijector.Bijector):
       raise ValueError("incompatible dtypes: %s" % dtype)
     elif len(dtype) == 2:
       dtype = dtype[1] if dtype[0] is None else dtype[0]
-      event_ndims = bijectors[0].event_ndims
     elif len(dtype) == 1:
       dtype = dtype[0]
-      event_ndims = bijectors[0].event_ndims
     else:
       dtype = None
-      event_ndims = None
+
+    inverse_min_event_ndims = _compute_min_event_ndims(
+        bijectors, compute_forward=False)
+    forward_min_event_ndims = _compute_min_event_ndims(
+        bijectors, compute_forward=True)
 
     super(Chain, self).__init__(
         graph_parents=list(itertools.chain.from_iterable(
             b.graph_parents for b in bijectors)),
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
         is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
         validate_args=validate_args,
         dtype=dtype,
-        event_ndims=event_ndims,
         name=name or ("identity" if not bijectors else
                       "_of_".join(["chain"] + [b.name for b in bijectors])))
 
@@ -147,10 +238,31 @@ class Chain(bijector.Bijector):
     return y
 
   def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(0., dtype=y.dtype,
-                                name="inverse_log_det_jacobian")
+    ildj = constant_op.constant(
+        0., dtype=y.dtype.base_dtype, name="inverse_log_det_jacobian")
+
+    if not self.bijectors:
+      return ildj
+
+    event_ndims = _maybe_get_event_ndims_statically(
+        self.inverse_min_event_ndims)
+
+    if _use_static_shape(y, event_ndims):
+      event_shape = y.shape[y.shape.ndims - event_ndims:]
+    else:
+      event_shape = array_ops.shape(y)[array_ops.rank(y) - event_ndims:]
+
     for b in self.bijectors:
-      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
+      ildj += b.inverse_log_det_jacobian(
+          y, event_ndims=event_ndims, **kwargs.get(b.name, {}))
+
+      if _use_static_shape(y, event_ndims):
+        event_shape = b.inverse_event_shape(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+      else:
+        event_shape = b.inverse_event_shape_tensor(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(
+            array_ops.rank(event_shape))
       y = b.inverse(y, **kwargs.get(b.name, {}))
     return ildj
 
@@ -160,9 +272,34 @@ class Chain(bijector.Bijector):
     return x
 
   def _forward_log_det_jacobian(self, x, **kwargs):
-    fldj = constant_op.constant(0., dtype=x.dtype,
-                                name="forward_log_det_jacobian")
+    x = ops.convert_to_tensor(x, name="x")
+
+    fldj = constant_op.constant(
+        0., dtype=x.dtype, name="inverse_log_det_jacobian")
+
+    if not self.bijectors:
+      return fldj
+
+    event_ndims = _maybe_get_event_ndims_statically(
+        self.forward_min_event_ndims)
+
+    if _use_static_shape(x, event_ndims):
+      event_shape = x.shape[x.shape.ndims - event_ndims:]
+    else:
+      event_shape = array_ops.shape(x)[array_ops.rank(x) - event_ndims:]
+
     for b in reversed(self.bijectors):
-      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
+      fldj += b.forward_log_det_jacobian(
+          x, event_ndims=event_ndims, **kwargs.get(b.name, {}))
+      if _use_static_shape(x, event_ndims):
+        event_shape = b.forward_event_shape(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+      else:
+        event_shape = b.forward_event_shape_tensor(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(
+            array_ops.rank(event_shape))
+
       x = b.forward(x, **kwargs.get(b.name, {}))
+
     return fldj
+
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 8f09e16058..caae2adcfa 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -80,7 +80,7 @@ class CholeskyOuterProduct(bijector.Bijector):
     self._graph_parents = []
     self._name = name
     super(CholeskyOuterProduct, self).__init__(
-        event_ndims=2,
+        forward_min_event_ndims=2,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
index ccb1f02927..e9e994f839 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
@@ -44,12 +44,16 @@ class ConditionalBijector(bijector.Bijector):
       "**condition_kwargs":
       "Named arguments forwarded to subclass implementation."})
   def inverse_log_det_jacobian(
-      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
-    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
+      self, y, event_ndims, name="inverse_log_det_jacobian",
+      **condition_kwargs):
+    return self._call_inverse_log_det_jacobian(
+        y, event_ndims, name, **condition_kwargs)
 
   @distribution_util.AppendDocstring(kwargs_dict={
       "**condition_kwargs":
       "Named arguments forwarded to subclass implementation."})
   def forward_log_det_jacobian(
-      self, x, name="forward_log_det_jacobian", **condition_kwargs):
-    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
+      self, x, event_ndims, name="forward_log_det_jacobian",
+      **condition_kwargs):
+    return self._call_forward_log_det_jacobian(
+        x, event_ndims, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
index b1ff840d62..9fc1bbf052 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
@@ -33,8 +33,8 @@ class Exp(power_transform.PowerTransform):
 
     ```python
     # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    exp = Exp(event_ndims=2)
+    # batch ndim 2.
+    exp = Exp()
     x = [[[1., 2],
            [3, 4]],
           [[5, 6],
@@ -48,19 +48,17 @@ class Exp(power_transform.PowerTransform):
   """
 
   def __init__(self,
-               event_ndims=0,
                validate_args=False,
                name="exp"):
     """Instantiates the `Exp` bijector.
 
     Args:
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
     """
+    # forward_min_event_ndims = 0.
+    # No forward_min_event_ndims specified as this is done in PowerTransform.
     super(Exp, self).__init__(
-        event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
index 67f3978556..e656a258e5 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
@@ -48,7 +48,6 @@ class Gumbel(bijector.Bijector):
   def __init__(self,
                loc=0.,
                scale=1.,
-               event_ndims=0,
                validate_args=False,
                name="gumbel"):
     """Instantiates the `Gumbel` bijector.
@@ -60,8 +59,6 @@ class Gumbel(bijector.Bijector):
       scale: Positive Float-like `Tensor` that is the same dtype and is
         broadcastable with `loc`.
         This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -80,7 +77,9 @@ class Gumbel(bijector.Bijector):
         ], self._scale)
 
     super(Gumbel, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
+        validate_args=validate_args,
+        forward_min_event_ndims=0,
+        name=name)
 
   @property
   def loc(self):
@@ -102,15 +101,11 @@ class Gumbel(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
+    return math_ops.log(self.scale / (-math_ops.log(y) * y))
 
   def _forward_log_det_jacobian(self, x):
-    event_dims = self._event_dims_tensor(x)
     z = (x - self.loc) / self.scale
-    return math_ops.reduce_sum(
-        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
+    return -z - math_ops.exp(-z) - math_ops.log(self.scale)
 
   def _maybe_assert_valid_y(self, y):
     if not self.validate_args:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
index fab1b22fbf..2bde956d13 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
@@ -40,7 +40,7 @@ class Inline(bijector.Bijector):
     name="exp")
   ```
 
-  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
+  The above example is equivalent to the `Bijector` `Exp()`.
   """
 
   def __init__(self,
@@ -54,6 +54,8 @@ class Inline(bijector.Bijector):
                inverse_event_shape_tensor_fn=None,
                is_constant_jacobian=False,
                validate_args=False,
+               forward_min_event_ndims=None,
+               inverse_min_event_ndims=None,
                name="inline"):
     """Creates a `Bijector` from callables.
 
@@ -76,10 +78,15 @@ class Inline(bijector.Bijector):
         constant for all input arguments.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
+      forward_min_event_ndims: Python `int` indicating the minimal
+        dimensionality this bijector acts on.
+      inverse_min_event_ndims: Python `int` indicating the minimal
+        dimensionality this bijector acts on.
       name: Python `str`, name given to ops managed by this object.
     """
     super(Inline, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -134,8 +141,8 @@ class Inline(bijector.Bijector):
           "inverse_log_det_jacobian_fn is not a callable function.")
     return self._inverse_log_det_jacobian_fn(y, **kwargs)
 
-  def _forward_log_det_jacobian(self, y, **kwargs):
+  def _forward_log_det_jacobian(self, x, **kwargs):
     if not callable(self._forward_log_det_jacobian_fn):
       raise NotImplementedError(
           "forward_log_det_jacobian_fn is not a callable function.")
-    return self._forward_log_det_jacobian_fn(y, **kwargs)
+    return self._forward_log_det_jacobian_fn(x, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index 2c603fe61f..1904239a0e 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -66,8 +66,9 @@ class Invert(bijector_lib.Bijector):
 
     self._bijector = bijector
     super(Invert, self).__init__(
-        event_ndims=bijector.event_ndims,
         graph_parents=bijector.graph_parents,
+        forward_min_event_ndims=bijector.inverse_min_event_ndims,
+        inverse_min_event_ndims=bijector.forward_min_event_ndims,
         is_constant_jacobian=bijector.is_constant_jacobian,
         validate_args=validate_args,
         dtype=bijector.dtype,
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
index f5de052c9e..97000c1726 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -48,7 +47,6 @@ class Kumaraswamy(bijector.Bijector):
   def __init__(self,
                concentration1=None,
                concentration0=None,
-               event_ndims=0,
                validate_args=False,
                name="kumaraswamy"):
     """Instantiates the `Kumaraswamy` bijector.
@@ -60,31 +58,14 @@ class Kumaraswamy(bijector.Bijector):
       concentration0: Python `float` scalar indicating the transform power,
         i.e., `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a)` where `b` is
         `concentration0`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution. Currently only zero is
-        supported.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
     """
     self._graph_parents = []
     self._name = name
     self._validate_args = validate_args
 
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
     with self._name_scope("init", values=[concentration1, concentration0]):
       concentration1 = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration1, name="concentration1"),
@@ -96,7 +77,7 @@ class Kumaraswamy(bijector.Bijector):
     self._concentration1 = concentration1
     self._concentration0 = concentration0
     super(Kumaraswamy, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -123,12 +104,10 @@ class Kumaraswamy(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
+    return (
         math_ops.log(self.concentration1) + math_ops.log(self.concentration0) +
         (self.concentration1 - 1) * math_ops.log(y) +
-        (self.concentration0 - 1) * math_ops.log1p(-y**self.concentration1),
-        axis=event_dims)
+        (self.concentration0 - 1) * math_ops.log1p(-y**self.concentration1))
 
   def _maybe_assert_valid_concentration(self, concentration, validate_args):
     """Checks the validity of a concentration parameter."""
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 84b2340c75..ef56cf6ddd 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -61,7 +61,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   this property by zeroing out weights in its `masked_dense` layers.
 
   In the `tf.distributions` framework, a "normalizing flow" is implemented as a
-  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
+  `tf.contrib.distributions.bijectors.Bijector`. The `forward` "autoregression"
   is implemented using a `tf.while_loop` and a deep neural network (DNN) with
   masked weights such that the autoregressive property is automatically met in
   the `inverse`.
@@ -220,6 +220,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     self._shift_and_log_scale_fn = shift_and_log_scale_fn
     self._unroll_loop = unroll_loop
     super(MaskedAutoregressiveFlow, self).__init__(
+        forward_min_event_ndims=1,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index 8654cc39d0..4978167803 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -114,6 +114,7 @@ class Permute(bijector_lib.Bijector):
         ], permutation)
       self._permutation = permutation
       super(Permute, self).__init__(
+          forward_min_event_ndims=1,
           is_constant_jacobian=True,
           validate_args=validate_args,
           name=name or "permute")
@@ -132,7 +133,10 @@ class Permute(bijector_lib.Bijector):
         axis=-1)
 
   def _inverse_log_det_jacobian(self, y):
-    return constant_op.constant(0., dtype=y.dtype)
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
+    return constant_op.constant(0., dtype=y.dtype.base_dtype)
 
   def _forward_log_det_jacobian(self, x):
-    return constant_op.constant(0., dtype=x.dtype)
+    return constant_op.constant(0., dtype=x.dtype.base_dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
index c37db61720..71f123f2a9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
@@ -43,7 +43,6 @@ class PowerTransform(bijector.Bijector):
 
   def __init__(self,
                power=0.,
-               event_ndims=0,
                validate_args=False,
                name="power_transform"):
     """Instantiates the `PowerTransform` bijector.
@@ -51,8 +50,6 @@ class PowerTransform(bijector.Bijector):
     Args:
       power: Python `float` scalar indicating the transform power, i.e.,
         `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -70,7 +67,7 @@ class PowerTransform(bijector.Bijector):
       raise ValueError("`power` must be a non-negative TF constant.")
     self._power = power
     super(PowerTransform, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -97,18 +94,13 @@ class PowerTransform(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return (self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log(y), axis=event_dims)
+    return (self.power - 1.) * math_ops.log(y)
 
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
     if self.power == 0.:
-      return math_ops.reduce_sum(x, axis=event_dims)
-    return (1. / self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log1p(x * self.power),
-        axis=event_dims)
+      return x
+    return (1. / self.power - 1.) * math_ops.log1p(x * self.power)
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args or self.power == 0.:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
index 71ab369d01..f09ab21bce 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -166,7 +166,7 @@ class RealNVP(bijector_lib.Bijector):
     self._input_depth = None
     self._shift_and_log_scale_fn = shift_and_log_scale_fn
     super(RealNVP, self).__init__(
-        event_ndims=1,
+        forward_min_event_ndims=1,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -224,7 +224,7 @@ class RealNVP(bijector_lib.Bijector):
     _, log_scale = self._shift_and_log_scale_fn(
         x0, self._input_depth - self._num_masked)
     if log_scale is None:
-      return constant_op.constant(0., dtype=x.dtype, name="ildj")
+      return constant_op.constant(0., dtype=x.dtype, name="fldj")
     return math_ops.reduce_sum(log_scale, axis=-1)
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 55eca06312..82210cd6c9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -128,9 +128,11 @@ class Reshape(bijector_lib.Bijector):
       self._event_shape_in = event_shape_in
       self._event_shape_out = event_shape_out
 
-      super(Reshape, self).__init__(is_constant_jacobian=True,
-                                    validate_args=validate_args,
-                                    name=name or "reshape")
+      super(Reshape, self).__init__(
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name or "reshape")
 
   def _maybe_check_valid_shape(self, shape, validate_args):
     """Check that a shape Tensor is int-type and otherwise sane."""
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
index a640dfe7df..5df8c88631 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
@@ -33,7 +33,9 @@ class Sigmoid(bijector.Bijector):
 
   def __init__(self, validate_args=False, name="sigmoid"):
     super(Sigmoid, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
 
   def _forward(self, x):
     return math_ops.sigmoid(x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
index 3a75e4ae94..2a32e8abcd 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -91,7 +91,6 @@ class SinhArcsinh(bijector.Bijector):
   def __init__(self,
                skewness=None,
                tailweight=None,
-               event_ndims=0,
                validate_args=False,
                name="SinhArcsinh"):
     """Instantiates the `SinhArcsinh` bijector.
@@ -101,8 +100,6 @@ class SinhArcsinh(bijector.Bijector):
         of type `float32`.
       tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
         `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -125,7 +122,9 @@ class SinhArcsinh(bijector.Bijector):
                 message="Argument tailweight was not positive")
         ], self._tailweight)
     super(SinhArcsinh, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
 
   @property
   def skewness(self):
@@ -149,31 +148,29 @@ class SinhArcsinh(bijector.Bijector):
     # dx/dy
     # = cosh(arcsinh(y) / tailweight - skewness)
     #     / (tailweight * sqrt(y**2 + 1))
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+
+    # This is computed inside the log to avoid catastrophic cancellations
+    # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+    return (
         math_ops.log(math_ops.cosh(
             math_ops.asinh(y) / self.tailweight - self.skewness)
                      # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
                      # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
                      / _sqrtx2p1(y))
-        - math_ops.log(self.tailweight),
-        axis=event_dims)
+        - math_ops.log(self.tailweight))
 
   def _forward_log_det_jacobian(self, x):
     # y = sinh((arcsinh(x) + skewness) * tailweight)
     # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
     # dy/dx
     # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+
+    # This is computed inside the log to avoid catastrophic cancellations
+    # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+    return (
         math_ops.log(math_ops.cosh(
             (math_ops.asinh(x) + self.skewness) * self.tailweight)
                      # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
                      # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
                      / _sqrtx2p1(x))
-        + math_ops.log(self.tailweight),
-        axis=event_dims)
+        + math_ops.log(self.tailweight))
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index dc94fd0a38..f52b91550e 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -66,7 +66,7 @@ class SoftmaxCentered(bijector.Bijector):
     self._graph_parents = []
     self._name = name
     super(SoftmaxCentered, self).__init__(
-        event_ndims=1,
+        forward_min_event_ndims=1,
         validate_args=validate_args,
         name=name)
 
@@ -105,8 +105,6 @@ class SoftmaxCentered(bijector.Bijector):
       y.shape.assert_is_compatible_with(shape)
       y.set_shape(shape)
 
-    # Since we only support event_ndims in [0, 1] and we do padding, we always
-    # reduce over the last dimension, i.e., dim=-1 (which is the default).
     return nn_ops.softmax(y)
 
   def _inverse(self, y):
@@ -162,8 +160,6 @@ class SoftmaxCentered(bijector.Bijector):
     #   -log_normalization + reduce_sum(logits - log_normalization)
     log_normalization = nn_ops.softplus(
         math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-    fldj = (-log_normalization +
-            math_ops.reduce_sum(x - log_normalization,
-                                axis=-1,
-                                keep_dims=True))
-    return array_ops.squeeze(fldj, squeeze_dims=-1)
+    return array_ops.squeeze(
+        (-log_normalization + math_ops.reduce_sum(
+            x - log_normalization, axis=-1, keepdims=True)), axis=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
index 81957fcf78..96a938c803 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
@@ -62,7 +62,7 @@ class Softplus(bijector.Bijector):
     ```python
     # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
     # batch ndim and 2 event ndims (i.e., vector of matrices).
-    softplus = Softplus(event_ndims=2)
+    softplus = Softplus()
     x = [[[1., 2],
           [3, 4]],
          [[5, 6],
@@ -81,7 +81,6 @@ class Softplus(bijector.Bijector):
               "Nonzero floating point `Tensor`.  Controls the softness of what "
               "would otherwise be a kink at the origin.  Default is 1.0")})
   def __init__(self,
-               event_ndims=0,
                hinge_softness=None,
                validate_args=False,
                name="softplus"):
@@ -101,7 +100,7 @@ class Softplus(bijector.Bijector):
             [nonzero_check], self.hinge_softness)
 
     super(Softplus, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -130,14 +129,12 @@ class Softplus(bijector.Bijector):
     # 1 - exp{-Y} approx Y.
     if self.hinge_softness is not None:
       y /= math_ops.cast(self.hinge_softness, y.dtype)
-    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
-                                axis=self._event_dims_tensor(y))
+    return -math_ops.log(-math_ops.expm1(-y))
 
   def _forward_log_det_jacobian(self, x):
     if self.hinge_softness is not None:
       x /= math_ops.cast(self.hinge_softness, x.dtype)
-    return -math_ops.reduce_sum(nn_ops.softplus(-x),
-                                axis=self._event_dims_tensor(x))
+    return -nn_ops.softplus(-x)
 
   @property
   def hinge_softness(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/square.py b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
index 1e9dbf3509..2ccfdc9597 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/square.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
@@ -59,7 +59,7 @@ class Square(bijector.Bijector):
     """
     self._name = name
     super(Square, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index 00520bcda8..39129cd22c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -50,7 +50,6 @@ class Weibull(bijector.Bijector):
   def __init__(self,
                scale=1.,
                concentration=1.,
-               event_ndims=0,
                validate_args=False,
                name="weibull"):
     """Instantiates the `Weibull` bijector.
@@ -62,8 +61,6 @@ class Weibull(bijector.Bijector):
       concentration: Positive Float-type `Tensor` that is the same dtype and is
         broadcastable with `scale`.
         This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -89,7 +86,7 @@ class Weibull(bijector.Bijector):
         ], self._concentration)
 
     super(Weibull, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -113,22 +110,18 @@ class Weibull(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
+    return (
         -math_ops.log1p(-y) +
         (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
-        math_ops.log(self.scale / self.concentration),
-        axis=event_dims)
+        math_ops.log(self.scale / self.concentration))
 
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
+    return (
         -(x / self.scale) ** self.concentration +
         (self.concentration - 1) * math_ops.log(x) +
         math_ops.log(self.concentration) +
-        -self.concentration * math_ops.log(self.scale),
-        axis=event_dims)
+        -self.concentration * math_ops.log(self.scale))
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args:
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 1d4c5660d8..10b4536135 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.contrib.distributions.python.ops import conditional_distribution
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import transformed_distribution
@@ -105,7 +106,9 @@ class ConditionalTransformedDistribution(
     bijector_kwargs = bijector_kwargs or {}
     distribution_kwargs = distribution_kwargs or {}
     x = self.bijector.inverse(y, **bijector_kwargs)
-    ildj = self.bijector.inverse_log_det_jacobian(y, **bijector_kwargs)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(
+        y, event_ndims=event_ndims, **bijector_kwargs)
     if self.bijector._is_injective:  # pylint: disable=protected-access
       return self._finish_log_prob_for_one_fiber(y, x, ildj,
                                                  distribution_kwargs)
@@ -128,7 +131,9 @@ class ConditionalTransformedDistribution(
     bijector_kwargs = bijector_kwargs or {}
     distribution_kwargs = distribution_kwargs or {}
     x = self.bijector.inverse(y, **bijector_kwargs)
-    ildj = self.bijector.inverse_log_det_jacobian(y, **bijector_kwargs)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(
+        y, event_ndims=event_ndims, **bijector_kwargs)
     if self.bijector._is_injective:  # pylint: disable=protected-access
       return self._finish_prob_for_one_fiber(y, x, ildj, distribution_kwargs)
 
@@ -214,3 +219,15 @@ class ConditionalTransformedDistribution(
     # implies the qth quantile of Y is g(x_q).
     inv_cdf = self.distribution.quantile(value, **distribution_kwargs)
     return self.bijector.forward(inv_cdf, **bijector_kwargs)
+
+  def _maybe_get_event_ndims_statically(self):
+    if self.event_shape.ndims is not None:
+      return self.event_shape.ndims
+
+    event_ndims = array_ops.size(self.event_shape_tensor())
+    static_event_ndims = tensor_util.constant_value(event_ndims)
+
+    if static_event_ndims is not None:
+      return static_event_ndims
+
+    return event_ndims
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 92f2bba182..3314181898 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -114,7 +114,7 @@ def quadrature_scheme_lognormal_quantiles(
     # Create a LogNormal distribution.
     dist = transformed_lib.TransformedDistribution(
         distribution=normal_lib.Normal(loc=loc, scale=scale),
-        bijector=Exp(event_ndims=0),
+        bijector=Exp(),
         validate_args=validate_args)
     batch_ndims = dist.batch_shape.ndims
     if batch_ndims is None:
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index f56ba07816..02cf3c7992 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -409,5 +409,5 @@ class RelaxedOneHotCategorical(
                                        validate_args=validate_args,
                                        allow_nan_stats=allow_nan_stats)
     super(RelaxedOneHotCategorical, self).__init__(dist,
-                                                   bijectors.Exp(event_ndims=1),
+                                                   bijectors.Exp(),
                                                    name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index 0d8a192691..cde6d85500 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -166,13 +166,13 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
 
       # Make the SAS bijector, 'F'.
       f = bijectors.SinhArcsinh(
-          skewness=skewness, tailweight=tailweight, event_ndims=0)
+          skewness=skewness, tailweight=tailweight)
       if has_default_skewness:
         f_noskew = f
       else:
         f_noskew = bijectors.SinhArcsinh(
             skewness=skewness.dtype.as_numpy_dtype(0.),
-            tailweight=tailweight, event_ndims=0)
+            tailweight=tailweight)
 
       # Make the AffineScalar bijector, Z --> loc + scale * Z (2 / F_0(2))
       c = 2 * scale / f_noskew.forward(ops.convert_to_tensor(2, dtype=dtype))
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 971d65c4a6..da271a852d 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -427,7 +427,6 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       self._endpoint_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
-                               event_ndims=1,
                                validate_args=validate_args,
                                name="endpoint_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(loc, scale))]
@@ -467,7 +466,6 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       self._interpolated_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
-                               event_ndims=1,
                                validate_args=validate_args,
                                name="interpolated_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(
@@ -621,9 +619,11 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     log_prob = math_ops.reduce_sum(self.distribution.log_prob(y), axis=-2)
     # Because the affine transformation has a constant Jacobian, it is the case
     # that `affine.fldj(x) = -affine.ildj(x)`. This is not true in general.
-    fldj = array_ops.stack(
-        [aff.forward_log_det_jacobian(x) for aff in self.interpolated_affine],
-        axis=-1)
+    fldj = array_ops.stack([
+        aff.forward_log_det_jacobian(
+            x,
+            event_ndims=array_ops.rank(self.event_shape_tensor())
+        ) for aff in self.interpolated_affine], axis=-1)
     return math_ops.reduce_logsumexp(
         self.mixture_distribution.logits - fldj + log_prob, axis=-1)
 
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 003c66b941..05919be124 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -215,13 +215,13 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
       tailweight = ops.convert_to_tensor(
           tailweight, dtype=dtype, name="tailweight")
       f = bijectors.SinhArcsinh(
-          skewness=skewness, tailweight=tailweight, event_ndims=1)
+          skewness=skewness, tailweight=tailweight)
       if has_default_skewness:
         f_noskew = f
       else:
         f_noskew = bijectors.SinhArcsinh(
             skewness=skewness.dtype.as_numpy_dtype(0.),
-            tailweight=tailweight, event_ndims=0)
+            tailweight=tailweight)
 
       # Make the Affine bijector, Z --> loc + C * Z.
       c = 2 * scale_diag_part / f_noskew.forward(
diff --git a/tensorflow/python/kernel_tests/distributions/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
index 9f9fb5c0bb..18582241e2 100644
--- a/tensorflow/python/kernel_tests/distributions/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import abc
 
+import numpy as np
 import six
 
 from tensorflow.python.framework import constant_op
@@ -43,11 +44,10 @@ class BaseBijectorTest(test.TestCase):
       """Minimal specification of a `Bijector`."""
 
       def __init__(self):
-        super(_BareBonesBijector, self).__init__()
+        super(_BareBonesBijector, self).__init__(forward_min_event_ndims=0)
 
     with self.test_session() as sess:
       bij = _BareBonesBijector()
-      self.assertEqual(None, bij.event_ndims)
       self.assertEqual([], bij.graph_parents)
       self.assertEqual(False, bij.is_constant_jacobian)
       self.assertEqual(False, bij.validate_args)
@@ -67,13 +67,21 @@ class BaseBijectorTest(test.TestCase):
         self.assertAllEqual(shape, inverse_event_shape_)
         self.assertAllEqual(shape, bij.inverse_event_shape(shape))
 
-      for fn in ["forward",
-                 "inverse",
-                 "inverse_log_det_jacobian",
-                 "forward_log_det_jacobian"]:
-        with self.assertRaisesRegexp(
-            NotImplementedError, fn + " not implemented"):
-          getattr(bij, fn)(0)
+      with self.assertRaisesRegexp(
+          NotImplementedError, "inverse not implemented"):
+        bij.inverse(0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "forward not implemented"):
+        bij.forward(0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "inverse_log_det_jacobian not implemented"):
+        bij.inverse_log_det_jacobian(0, event_ndims=0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "forward_log_det_jacobian not implemented"):
+        bij.forward_log_det_jacobian(0, event_ndims=0)
 
 
 class IntentionallyMissingError(Exception):
@@ -85,7 +93,7 @@ class BrokenBijector(bijector.Bijector):
 
   def __init__(self, forward_missing=False, inverse_missing=False):
     super(BrokenBijector, self).__init__(
-        event_ndims=0, validate_args=False, name="broken")
+        validate_args=False, forward_min_event_ndims=0, name="broken")
     self._forward_missing = forward_missing
     self._inverse_missing = inverse_missing
 
@@ -120,35 +128,42 @@ class BijectorCachingTestBase(object):
 
   def testCachingOfForwardResults(self):
     broken_bijector = self.broken_bijector_cls(inverse_missing=True)
-    with self.test_session():
-      x = constant_op.constant(1.1)
+    x = constant_op.constant(1.1)
+
+    # Call forward and forward_log_det_jacobian one-by-one (not together).
+    y = broken_bijector.forward(x)
+    _ = broken_bijector.forward_log_det_jacobian(x, event_ndims=0)
 
-      # Call forward and forward_log_det_jacobian one-by-one (not together).
-      y = broken_bijector.forward(x)
-      _ = broken_bijector.forward_log_det_jacobian(x)
+    # Now, everything should be cached if the argument is y.
+    broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
+    try:
+      broken_bijector.inverse(y)
+      broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
+    except IntentionallyMissingError:
+      raise AssertionError("Tests failed! Cached values not used.")
 
-      # Now, everything should be cached if the argument is y.
-      try:
-        broken_bijector.inverse(y)
-        broken_bijector.inverse_log_det_jacobian(y)
-      except IntentionallyMissingError:
-        raise AssertionError("Tests failed! Cached values not used.")
+    # Different event_ndims should not be cached.
+    with self.assertRaises(IntentionallyMissingError):
+      broken_bijector.inverse_log_det_jacobian(y, event_ndims=1)
 
   def testCachingOfInverseResults(self):
     broken_bijector = self.broken_bijector_cls(forward_missing=True)
-    with self.test_session():
-      y = constant_op.constant(1.1)
+    y = constant_op.constant(1.1)
 
-      # Call inverse and inverse_log_det_jacobian one-by-one (not together).
-      x = broken_bijector.inverse(y)
-      _ = broken_bijector.inverse_log_det_jacobian(y)
+    # Call inverse and inverse_log_det_jacobian one-by-one (not together).
+    x = broken_bijector.inverse(y)
+    _ = broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
 
-      # Now, everything should be cached if the argument is x.
-      try:
-        broken_bijector.forward(x)
-        broken_bijector.forward_log_det_jacobian(x)
-      except IntentionallyMissingError:
-        raise AssertionError("Tests failed! Cached values not used.")
+    # Now, everything should be cached if the argument is x.
+    try:
+      broken_bijector.forward(x)
+      broken_bijector.forward_log_det_jacobian(x, event_ndims=0)
+    except IntentionallyMissingError:
+      raise AssertionError("Tests failed! Cached values not used.")
+
+    # Different event_ndims should not be cached.
+    with self.assertRaises(IntentionallyMissingError):
+      broken_bijector.forward_log_det_jacobian(x, event_ndims=1)
 
 
 class BijectorCachingTest(BijectorCachingTestBase, test.TestCase):
@@ -159,5 +174,107 @@ class BijectorCachingTest(BijectorCachingTestBase, test.TestCase):
     return BrokenBijector
 
 
+class ExpOnlyJacobian(bijector.Bijector):
+  """Only used for jacobian calculations."""
+
+  def __init__(self, forward_min_event_ndims=0):
+    super(ExpOnlyJacobian, self).__init__(
+        validate_args=False,
+        is_constant_jacobian=False,
+        forward_min_event_ndims=forward_min_event_ndims,
+        name="exp")
+
+  def _inverse_log_det_jacobian(self, y):
+    return -math_ops.log(y)
+
+  def _forward_log_det_jacobian(self, x):
+    return math_ops.log(x)
+
+
+class ConstantJacobian(bijector.Bijector):
+  """Only used for jacobian calculations."""
+
+  def __init__(self, forward_min_event_ndims=0):
+    super(ConstantJacobian, self).__init__(
+        validate_args=False,
+        is_constant_jacobian=True,
+        forward_min_event_ndims=forward_min_event_ndims,
+        name="c")
+
+  def _inverse_log_det_jacobian(self, y):
+    return constant_op.constant(2., y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    return constant_op.constant(-2., x.dtype)
+
+
+class BijectorReduceEventDimsTest(test.TestCase):
+  """Test caching with BrokenBijector."""
+
+  def testReduceEventNdimsForward(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian()
+    self.assertAllClose(
+        np.log(x),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        np.sum(np.log(x), axis=-1),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        np.sum(np.log(x), axis=(-1, -2)),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsForwardRaiseError(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian(forward_min_event_ndims=1)
+    with self.assertRaisesRegexp(ValueError, "must be larger than"):
+      bij.forward_log_det_jacobian(x, event_ndims=0)
+
+  def testReduceEventNdimsInverse(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian()
+    self.assertAllClose(
+        -np.log(x),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        np.sum(-np.log(x), axis=-1),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        np.sum(-np.log(x), axis=(-1, -2)),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsInverseRaiseError(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian(forward_min_event_ndims=1)
+    with self.assertRaisesRegexp(ValueError, "must be larger than"):
+      bij.inverse_log_det_jacobian(x, event_ndims=0)
+
+  def testReduceEventNdimsForwardConstJacobian(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ConstantJacobian()
+    self.assertAllClose(
+        -2.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        -4.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        -8.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsInverseConstJacobian(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ConstantJacobian()
+    self.assertAllClose(
+        2.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        4.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        8.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index e8f9d0b728..b347c20db2 100644
--- a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -27,14 +27,19 @@ class IdentityBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = X transformation."""
 
   def testBijector(self):
-    with self.test_session():
-      bijector = identity_bijector.Identity()
-      self.assertEqual("identity", bijector.name)
-      x = [[[0.], [1.]]]
-      self.assertAllEqual(x, bijector.forward(x).eval())
-      self.assertAllEqual(x, bijector.inverse(x).eval())
-      self.assertAllEqual(0., bijector.inverse_log_det_jacobian(x).eval())
-      self.assertAllEqual(0., bijector.forward_log_det_jacobian(x).eval())
+    bijector = identity_bijector.Identity(validate_args=True)
+    self.assertEqual("identity", bijector.name)
+    x = [[[0.], [1.]]]
+    self.assertAllEqual(x, self.evaluate(bijector.forward(x)))
+    self.assertAllEqual(x, self.evaluate(bijector.inverse(x)))
+    self.assertAllEqual(
+        0.,
+        self.evaluate(
+            bijector.inverse_log_det_jacobian(x, event_ndims=3)))
+    self.assertAllEqual(
+        0.,
+        self.evaluate(
+            bijector.forward_log_det_jacobian(x, event_ndims=3)))
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index ed435557fd..4ebc600d03 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -23,7 +23,6 @@ import collections
 import contextlib
 import re
 
-import numpy as np
 import six
 
 from tensorflow.python.framework import dtypes
@@ -31,8 +30,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -41,23 +40,24 @@ __all__ = [
 
 
 class _Mapping(collections.namedtuple(
-    "_Mapping", ["x", "y", "ildj", "kwargs"])):
+    "_Mapping", ["x", "y", "ildj_map", "kwargs"])):
   """Helper class to make it easier to manage caching in `Bijector`."""
 
-  def __new__(cls, x=None, y=None, ildj=None, kwargs=None):
+  def __new__(cls, x=None, y=None, ildj_map=None, kwargs=None):
     """Custom __new__ so namedtuple items have defaults.
 
     Args:
       x: `Tensor`. Forward.
       y: `Tensor`. Inverse.
-      ildj: `Tensor`. Inverse log det Jacobian.
+      ildj_map: `Dictionary`. This is a mapping from event_ndims to a `Tensor`
+        representing the inverse log det jacobian.
       kwargs: Python dictionary. Extra args supplied to
         forward/inverse/etc functions.
 
     Returns:
       mapping: New instance of _Mapping.
     """
-    return super(_Mapping, cls).__new__(cls, x, y, ildj, kwargs)
+    return super(_Mapping, cls).__new__(cls, x, y, ildj_map, kwargs)
 
   @property
   def x_key(self):
@@ -69,13 +69,14 @@ class _Mapping(collections.namedtuple(
     """Returns key used for caching X=g^{-1}(Y)."""
     return (self.y,) + self._deep_tuple(tuple(sorted(self.kwargs.items())))
 
-  def merge(self, x=None, y=None, ildj=None, kwargs=None, mapping=None):
+  def merge(self, x=None, y=None, ildj_map=None, kwargs=None, mapping=None):
     """Returns new _Mapping with args merged with self.
 
     Args:
       x: `Tensor`. Forward.
       y: `Tensor`. Inverse.
-      ildj: `Tensor`. Inverse log det Jacobian.
+      ildj_map: `Dictionary`. This is a mapping from event_ndims to a `Tensor`
+        representing the inverse log det jacobian.
       kwargs: Python dictionary. Extra args supplied to
         forward/inverse/etc functions.
       mapping: Instance of _Mapping to merge. Can only be specified if no other
@@ -88,15 +89,30 @@ class _Mapping(collections.namedtuple(
       ValueError: if mapping and any other arg is not `None`.
     """
     if mapping is None:
-      mapping = _Mapping(x=x, y=y, ildj=ildj, kwargs=kwargs)
-    elif not all(arg is None for arg in [x, y, ildj, kwargs]):
-      raise ValueError("Cannot specify mapping and individual args.")
+      mapping = _Mapping(x=x, y=y, ildj_map=ildj_map, kwargs=kwargs)
+    elif any(arg is not None for arg in [x, y, ildj_map, kwargs]):
+      raise ValueError("Cannot simultaneously specify mapping and individual "
+                       "arguments.")
+
     return _Mapping(
         x=self._merge(self.x, mapping.x),
         y=self._merge(self.y, mapping.y),
-        ildj=self._merge(self.ildj, mapping.ildj),
+        ildj_map=self._merge_dicts(self.ildj_map, mapping.ildj_map),
         kwargs=self._merge(self.kwargs, mapping.kwargs))
 
+  def _merge_dicts(self, old=None, new=None):
+    """Helper to merge two dictionaries."""
+    old = dict() if old is None else old
+    new = dict() if new is None else new
+    for k, v in six.iteritems(new):
+      val = old.get(k, None)
+      if val is not None and val != v:
+        raise ValueError("Found different value for existing key "
+                         "(key:{} old_value:{} new_value:{}".format(
+                             k, old[k], v))
+      old[k] = v
+    return old
+
   def _merge(self, old, new):
     """Helper to merge which handles merging one value."""
     if old is None:
@@ -112,7 +128,6 @@ class _Mapping(collections.namedtuple(
 
 
 @six.add_metaclass(abc.ABCMeta)
-@tf_export("distributions.bijectors.Bijector")
 class Bijector(object):
   r"""Interface for transformations of a `Distribution` sample.
 
@@ -137,11 +152,11 @@ class Bijector(object):
   2. Inverse\
      Useful for "reversing" a transformation to compute one probability in
      terms of another.
-  3. `(log o det o Jacobian o inverse)(x)`\
+  3. `log_det_jacobian(x)`\
      "The log of the determinant of the matrix of all first-order partial
      derivatives of the inverse function."\
      Useful for inverting a transformation to compute one probability in terms
-     of another. Geometrically, the det(Jacobian) is the volume of the
+     of another. Geometrically, the Jacobian determinant is the volume of the
      transformation and is used to scale the probability.
 
   By convention, transformations of random variables are named in terms of the
@@ -164,7 +179,7 @@ class Bijector(object):
 
   ```python
   def transformed_log_prob(bijector, log_prob, x):
-    return (bijector.inverse_log_det_jacobian(x) +
+    return (bijector.inverse_log_det_jacobian(x, event_ndims=0) +
             log_prob(bijector.inverse(x)))
   ```
 
@@ -199,9 +214,11 @@ class Bijector(object):
     ```python
       class Exp(Bijector):
 
-        def __init__(self, event_ndims=0, validate_args=False, name="exp"):
+        def __init__(self, validate_args=False, name="exp"):
           super(Exp, self).__init__(
-              event_ndims=event_ndims, validate_args=validate_args, name=name)
+              validate_args=validate_args,
+              forward_min_event_ndims=0,
+              name=name)
 
         def _forward(self, x):
           return math_ops.exp(x)
@@ -213,10 +230,11 @@ class Bijector(object):
           return -self._forward_log_det_jacobian(self._inverse(y))
 
         def _forward_log_det_jacobian(self, x):
-          if self.event_ndims is None:
-            raise ValueError("Jacobian requires known event_ndims.")
-          event_dims = array_ops.shape(x)[-self.event_ndims:]
-          return math_ops.reduce_sum(x, axis=event_dims)
+          # Notice that we needn't do any reducing, even when`event_ndims > 0`.
+          # The base Bijector class will handle reducing for us; it knows how
+          # to do so because we called `super` `__init__` with
+          # `forward_min_event_ndims = 0`.
+          return x
       ```
 
   - "Affine"
@@ -237,18 +255,50 @@ class Bijector(object):
                   MultivariateNormal(inv(sqrtSigma) * (y - mu); 0, I_d)
       ```
 
-  #### Jacobian
+  #### Min_event_ndims and Naming
+
+  Bijectors are named for the dimensionality of data they act on (i.e. without
+  broadcasting). We can think of bijectors having an intrinsic `min_event_ndims`
+  , which is the minimum number of dimensions for the bijector act on. For
+  instance, a Cholesky decomposition requires a matrix, and hence
+  `min_event_ndims=2`.
+
+  Some examples:
+
+  `AffineScalar:  min_event_ndims=0`
+  `Affine:  min_event_ndims=1`
+  `Cholesky:  min_event_ndims=2`
+  `Exp:  min_event_ndims=0`
+  `Sigmoid:  min_event_ndims=0`
+  `SoftmaxCentered:  min_event_ndims=1`
+
+  Note the difference between `Affine` and `AffineScalar`. `AffineScalar`
+  operates on scalar events, whereas `Affine` operates on vector-valued events.
 
-  The Jacobian is a reduction over event dims. To see this, consider the `Exp`
-  `Bijector` applied to a `Tensor` which has sample, batch, and event (S, B, E)
-  shape semantics. Suppose the `Tensor`'s partitioned-shape is `(S=[4], B=[2],
-  E=[3, 3])`. The shape of the `Tensor` returned by `forward` and `inverse` is
-  unchanged, i.e., `[4, 2, 3, 3]`.  However the shape returned by
-  `inverse_log_det_jacobian` is `[4, 2]` because the Jacobian is a reduction
-  over the event dimensions.
+  More generally, there is a `forward_min_event_ndims` and an
+  `inverse_min_event_ndims`. In most cases, these will be the same.
+  However, for some shape changing bijectors, these will be different
+  (e.g. a bijector which pads an extra dimension at the end, might have
+  `forward_min_event_ndims=0` and `inverse_min_event_ndims=1`.
 
-  It is sometimes useful to implement the inverse Jacobian as the negative
-  forward Jacobian. For example,
+
+  #### Jacobian Determinant
+
+  The Jacobian determinant is a reduction over `event_ndims - min_event_ndims`
+  (`forward_min_event_ndims` for `forward_log_det_jacobian` and
+  `inverse_min_event_ndims` for `inverse_log_det_jacobian`).
+  To see this, consider the `Exp` `Bijector` applied to a `Tensor` which has
+  sample, batch, and event (S, B, E) shape semantics. Suppose the `Tensor`'s
+  partitioned-shape is `(S=[4], B=[2], E=[3, 3])`. The shape of the `Tensor`
+  returned by `forward` and `inverse` is unchanged, i.e., `[4, 2, 3, 3]`.
+  However the shape returned by `inverse_log_det_jacobian` is `[4, 2]` because
+  the Jacobian determinant is a reduction over the event dimensions.
+
+  Another example is the `Affine` `Bijector`. Because `min_event_ndims = 1`, the
+  Jacobian determinant reduction is over `event_ndims - 1`.
+
+  It is sometimes useful to implement the inverse Jacobian determinant as the
+  negative forward Jacobian determinant. For example,
 
   ```python
   def _inverse_log_det_jacobian(self, y):
@@ -279,9 +329,54 @@ class Bijector(object):
       The claim follows from [properties of determinant](
   https://en.wikipedia.org/wiki/Determinant#Multiplicativity_and_matrix_groups).
 
-  Generally its preferable to directly implement the inverse Jacobian. This
-  should have superior numerical stability and will often share subgraphs with
-  the `_inverse` implementation.
+  Generally its preferable to directly implement the inverse Jacobian
+  determinant.  This should have superior numerical stability and will often
+  share subgraphs with the `_inverse` implementation.
+
+  #### Is_constant_jacobian
+
+  Certain bijectors will have constant jacobian matrices. For instance, the
+  `Affine` bijector encodes multiplication by a matrix plus a shift, with
+  jacobian matrix, the same aforementioned matrix.
+
+  `is_constant_jacobian` encodes the fact that the jacobian matrix is constant.
+  The semantics of this argument are the following:
+
+    * Repeated calls to "log_det_jacobian" functions with the same
+      `event_ndims` (but not necessarily same input), will return the first
+      computed jacobian (because the matrix is constant, and hence is input
+      independent).
+    * `log_det_jacobian` implementations are merely broadcastable to the true
+      `log_det_jacobian` (because, again, the jacobian matrix is input
+      independent). Specifically, `log_det_jacobian` is implemented as the
+      log jacobian determinant for a single input.
+
+      ```python
+      class Identity(Bijector):
+
+        def __init__(self, validate_args=False, name="identity"):
+          super(Identity, self).__init__(
+              is_constant_jacobian=True,
+              validate_args=validate_args,
+              forward_min_event_ndims=0,
+              name=name)
+
+        def _forward(self, x):
+          return x
+
+        def _inverse(self, y):
+          return y
+
+        def _inverse_log_det_jacobian(self, y):
+          return -self._forward_log_det_jacobian(self._inverse(y))
+
+        def _forward_log_det_jacobian(self, x):
+          # The full log jacobian determinant would be array_ops.zero_like(x).
+          # However, we circumvent materializing that, since the jacobian
+          # calculation is input independent, and we specify it for one input.
+          return constant_op.constant(0., x.dtype.base_dtype)
+
+      ```
 
   #### Subclass Requirements
 
@@ -364,14 +459,14 @@ class Bijector(object):
   ==> (-1., 1.)
 
   # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
-  abs.inverse_log_det_jacobian(1.)
+  abs.inverse_log_det_jacobian(1., event_ndims=0)
   ==> (0., 0.)
 
   # Special case handling of 0.
   abs.inverse(0.)
   ==> (0., 0.)
 
-  abs.inverse_log_det_jacobian(0.)
+  abs.inverse_log_det_jacobian(0., event_ndims=0)
   ==> (0., 0.)
   ```
 
@@ -379,11 +474,12 @@ class Bijector(object):
 
   @abc.abstractmethod
   def __init__(self,
-               event_ndims=None,
                graph_parents=None,
                is_constant_jacobian=False,
                validate_args=False,
                dtype=None,
+               forward_min_event_ndims=None,
+               inverse_min_event_ndims=None,
                name=None):
     """Constructs Bijector.
 
@@ -392,42 +488,61 @@ class Bijector(object):
     Examples:
 
     ```python
-    # Create the Y = g(X) = X transform which operates on vector events.
-    identity = Identity(event_ndims=1)
+    # Create the Y = g(X) = X transform.
+    identity = Identity()
 
-    # Create the Y = g(X) = exp(X) transform which operates on matrices.
-    exp = Exp(event_ndims=2)
+    # Create the Y = g(X) = exp(X) transform.
+    exp = Exp()
     ```
 
     See `Bijector` subclass docstring for more details and specific examples.
 
     Args:
-      event_ndims: number of dimensions associated with event coordinates.
       graph_parents: Python list of graph prerequisites of this `Bijector`.
-      is_constant_jacobian: Python `bool` indicating that the Jacobian is not a
-        function of the input.
+      is_constant_jacobian: Python `bool` indicating that the Jacobian matrix is
+        not a function of the input.
       validate_args: Python `bool`, default `False`. Whether to validate input
         with asserts. If `validate_args` is `False`, and the inputs are invalid,
         correct behavior is not guaranteed.
       dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not
         enforced.
+      forward_min_event_ndims: Python `integer` indicating the minimum number of
+        dimensions `forward` operates on.
+      inverse_min_event_ndims: Python `integer` indicating the minimum number of
+        dimensions `inverse` operates on. Will be set to
+        `forward_min_event_ndims` by default, if no value is provided.
       name: The name to give Ops created by the initializer.
 
     Raises:
+      ValueError:  If neither `forward_min_event_ndims` and
+        `inverse_min_event_ndims` are specified, or if either of them is
+        negative.
       ValueError:  If a member of `graph_parents` is not a `Tensor`.
     """
-    self._event_ndims = (
-        ops.convert_to_tensor(event_ndims, dtype=dtypes.int32)
-        if event_ndims is not None else None)
     self._graph_parents = graph_parents or []
+
+    if forward_min_event_ndims is None and inverse_min_event_ndims is None:
+      raise ValueError("Must specify at least one of `forward_min_event_ndims` "
+                       "and `inverse_min_event_ndims`.")
+    elif inverse_min_event_ndims is None:
+      inverse_min_event_ndims = forward_min_event_ndims
+    elif forward_min_event_ndims is None:
+      forward_min_event_ndims = inverse_min_event_ndims
+
+    if forward_min_event_ndims < 0:
+      raise ValueError("forward_min_event_ndims must be a non-negative "
+                       "integer.")
+    if inverse_min_event_ndims < 0:
+      raise ValueError("inverse_min_event_ndims must be a non-negative "
+                       "integer.")
+    self._forward_min_event_ndims = forward_min_event_ndims
+    self._inverse_min_event_ndims = inverse_min_event_ndims
     self._is_constant_jacobian = is_constant_jacobian
+    self._constant_ildj_map = {}
     self._validate_args = validate_args
     self._dtype = dtype
     self._from_y = {}
     self._from_x = {}
-    # Using abbreviation ildj for "inverse log det Jacobian."
-    # This variable is not `None` iff is_constant_jacobian is `True`.
-    self._constant_ildj = None
     if name:
       self._name = name
     else:
@@ -442,21 +557,27 @@ class Bijector(object):
       if t is None or not tensor_util.is_tensor(t):
         raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
 
-  @property
-  def event_ndims(self):
-    """Returns then number of event dimensions this bijector operates on."""
-    return self._event_ndims
-
   @property
   def graph_parents(self):
     """Returns this `Bijector`'s graph_parents as a Python list."""
     return self._graph_parents
 
+  @property
+  def forward_min_event_ndims(self):
+    """Returns the minimal number of dimensions bijector.forward operates on."""
+    return self._forward_min_event_ndims
+
+  @property
+  def inverse_min_event_ndims(self):
+    """Returns the minimal number of dimensions bijector.inverse operates on."""
+    return self._inverse_min_event_ndims
+
   @property
   def is_constant_jacobian(self):
-    """Returns true iff the Jacobian is not a function of x.
+    """Returns true iff the Jacobian matrix is not a function of x.
 
-    Note: Jacobian is either constant for both forward and inverse or neither.
+    Note: Jacobian matrix is either constant for both forward and inverse or
+    neither.
 
     Returns:
       is_constant_jacobian: Python `bool`.
@@ -653,36 +774,57 @@ class Bijector(object):
     return self._call_inverse(y, name)
 
   def _inverse_log_det_jacobian(self, y):
-    """Subclass implementation of `inverse_log_det_jacobian` public function."""
+    """Subclass implementation of `inverse_log_det_jacobian` public function.
+
+    In particular, this method differs from the public function, in that it
+    does not take `event_ndims`. Thus, this implements the minimal Jacobian
+    determinant calculation (i.e. over `inverse_min_event_ndims`).
+
+    Args:
+      y: `Tensor`. The input to the "inverse_log_det_jacobian" evaluation.
+    Returns:
+      inverse_log_det_jacobian: `Tensor`, if this bijector is injective.
+        If not injective, returns the k-tuple containing jacobians for the
+        unique `k` points `(x1, ..., xk)` such that `g(xi) = y`.
+    """
     raise NotImplementedError("inverse_log_det_jacobian not implemented.")
 
-  def _call_inverse_log_det_jacobian(self, y, name, **kwargs):
+  def _call_inverse_log_det_jacobian(self, y, event_ndims, name, **kwargs):
     with self._name_scope(name, [y]):
-      if self._constant_ildj is not None:
-        return self._constant_ildj
+      if event_ndims in self._constant_ildj_map:
+        return self._constant_ildj_map[event_ndims]
       y = ops.convert_to_tensor(y, name="y")
       self._maybe_assert_dtype(y)
       if not self._is_injective:  # No caching for non-injective
-        return self._inverse_log_det_jacobian(y, **kwargs)
+        ildjs = self._inverse_log_det_jacobian(y, **kwargs)
+        return tuple(self._reduce_jacobian_det_over_event(
+            y, ildj, self.inverse_min_event_ndims, event_ndims)
+                     for ildj in ildjs)
       mapping = self._lookup(y=y, kwargs=kwargs)
-      if mapping.ildj is not None:
-        return mapping.ildj
+      if mapping.ildj_map is not None and event_ndims in mapping.ildj_map:
+        return mapping.ildj_map[event_ndims]
       try:
         x = None  # Not needed; leave cache as is.
         ildj = self._inverse_log_det_jacobian(y, **kwargs)
+        ildj = self._reduce_jacobian_det_over_event(
+            y, ildj, self.inverse_min_event_ndims, event_ndims)
       except NotImplementedError as original_exception:
         try:
           x = mapping.x if mapping.x is not None else self._inverse(y, **kwargs)
           ildj = -self._forward_log_det_jacobian(x, **kwargs)
+          ildj = self._reduce_jacobian_det_over_event(
+              x, ildj, self.forward_min_event_ndims, event_ndims)
         except NotImplementedError:
           raise original_exception
-      mapping = mapping.merge(x=x, ildj=ildj)
+
+      mapping = mapping.merge(x=x, ildj_map={event_ndims: ildj})
       self._cache(mapping)
       if self.is_constant_jacobian:
-        self._constant_ildj = mapping.ildj
-      return mapping.ildj
+        self._constant_ildj_map[event_ndims] = ildj
+      return ildj
 
-  def inverse_log_det_jacobian(self, y, name="inverse_log_det_jacobian"):
+  def inverse_log_det_jacobian(
+      self, y, event_ndims, name="inverse_log_det_jacobian"):
     """Returns the (log o det o Jacobian o inverse)(y).
 
     Mathematically, returns: `log(det(dX/dY))(Y)`. (Recall that: `X=g^{-1}(Y)`.)
@@ -691,7 +833,12 @@ class Bijector(object):
     evaluated at `g^{-1}(y)`.
 
     Args:
-      y: `Tensor`. The input to the "inverse" Jacobian evaluation.
+      y: `Tensor`. The input to the "inverse" Jacobian determinant evaluation.
+      event_ndims: Number of dimensions in the probabilistic events being
+        transformed. Must be greater than or equal to
+        `self.inverse_min_event_ndims`. The result is summed over the final
+        dimensions to produce a scalar Jacobian determinant for each event,
+        i.e. it has shape `y.shape.ndims - event_ndims` dimensions.
       name: The name to give this op.
 
     Returns:
@@ -705,45 +852,74 @@ class Bijector(object):
         `self.dtype`.
       NotImplementedError: if `_inverse_log_det_jacobian` is not implemented.
     """
-    return self._call_inverse_log_det_jacobian(y, name)
+    with ops.control_dependencies(self._check_valid_event_ndims(
+        min_event_ndims=self.inverse_min_event_ndims, event_ndims=event_ndims)):
+      return self._call_inverse_log_det_jacobian(y, event_ndims, name)
 
   def _forward_log_det_jacobian(self, x):
-    """Subclass implementation of `forward_log_det_jacobian`."""
+    """Subclass implementation of `forward_log_det_jacobian` public function.
+
+    In particular, this method differs from the public function, in that it
+    does not take `event_ndims`. Thus, this implements the minimal Jacobian
+    determinant calculation (i.e. over `forward_min_event_ndims`).
+
+    Args:
+      x: `Tensor`. The input to the "forward_log_det_jacobian" evaluation.
+
+    Returns:
+      forward_log_det_jacobian: `Tensor`, if this bijector is injective.
+        If not injective, returns the k-tuple containing jacobians for the
+        unique `k` points `(x1, ..., xk)` such that `g(xi) = y`.
+    """
+
     raise NotImplementedError(
         "forward_log_det_jacobian not implemented.")
 
-  def _call_forward_log_det_jacobian(self, x, name, **kwargs):
+  def _call_forward_log_det_jacobian(self, x, event_ndims, name, **kwargs):
     with self._name_scope(name, [x]):
-      if self._constant_ildj is not None:
+      if event_ndims in self._constant_ildj_map:
         # Need "-1. *" to avoid invalid-unary-operand-type linter warning.
-        return -1. * self._constant_ildj
+        return -1. * self._constant_ildj_map[event_ndims]
       x = ops.convert_to_tensor(x, name="x")
       self._maybe_assert_dtype(x)
       if not self._is_injective:
-        return self._forward_log_det_jacobian(x, **kwargs)  # No caching.
+        fldjs = self._forward_log_det_jacobian(x, **kwargs)  # No caching.
+        return tuple(self._reduce_jacobian_det_over_event(
+            x, fldj, self.forward_min_event_ndims, event_ndims)
+                     for fldj in fldjs)
       mapping = self._lookup(x=x, kwargs=kwargs)
-      if mapping.ildj is not None:
-        return -mapping.ildj
+      if mapping.ildj_map is not None and event_ndims in mapping.ildj_map:
+        return -mapping.ildj_map[event_ndims]
       try:
         y = None  # Not needed; leave cache as is.
         ildj = -self._forward_log_det_jacobian(x, **kwargs)
+        ildj = self._reduce_jacobian_det_over_event(
+            x, ildj, self.forward_min_event_ndims, event_ndims)
       except NotImplementedError as original_exception:
         try:
           y = mapping.y if mapping.y is not None else self._forward(x, **kwargs)
           ildj = self._inverse_log_det_jacobian(y, **kwargs)
+          ildj = self._reduce_jacobian_det_over_event(
+              y, ildj, self.inverse_min_event_ndims, event_ndims)
         except NotImplementedError:
           raise original_exception
-      mapping = mapping.merge(y=y, ildj=ildj)
+      mapping = mapping.merge(y=y, ildj_map={event_ndims: ildj})
       self._cache(mapping)
       if self.is_constant_jacobian:
-        self._constant_ildj = mapping.ildj
-      return -mapping.ildj
+        self._constant_ildj_map[event_ndims] = ildj
+      return -ildj
 
-  def forward_log_det_jacobian(self, x, name="forward_log_det_jacobian"):
+  def forward_log_det_jacobian(
+      self, x, event_ndims, name="forward_log_det_jacobian"):
     """Returns both the forward_log_det_jacobian.
 
     Args:
-      x: `Tensor`. The input to the "forward" Jacobian evaluation.
+      x: `Tensor`. The input to the "forward" Jacobian determinant evaluation.
+      event_ndims: Number of dimensions in the probabilistic events being
+        transformed. Must be greater than or equal to
+        `self.forward_min_event_ndims`. The result is summed over the final
+        dimensions to produce a scalar Jacobian determinant for each event,
+        i.e. it has shape `x.shape.ndims - event_ndims` dimensions.
       name: The name to give this op.
 
     Returns:
@@ -761,7 +937,9 @@ class Bijector(object):
       raise NotImplementedError(
           "forward_log_det_jacobian cannot be implemented for non-injective "
           "transforms.")
-    return self._call_forward_log_det_jacobian(x, name)
+    with ops.control_dependencies(self._check_valid_event_ndims(
+        min_event_ndims=self.forward_min_event_ndims, event_ndims=event_ndims)):
+      return self._call_forward_log_det_jacobian(x, event_ndims, name)
 
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
@@ -779,9 +957,6 @@ class Bijector(object):
 
   def _cache(self, mapping):
     """Helper which stores mapping info in forward/inverse dicts."""
-    if self._constant_ildj is not None:
-      # Fold in ildj if known constant Jacobian.
-      mapping = mapping.merge(ildj=self._constant_ildj)
     # Merging from lookup is an added check that we're not overwriting anything
     # which is not None.
     mapping = mapping.merge(mapping=self._lookup(
@@ -803,22 +978,66 @@ class Bijector(object):
       return self._from_y.get(mapping.y_key, mapping)
     return mapping
 
-  def _event_dims_tensor(self, sample):
-    """Return a 1D `int32` tensor: `range(rank(sample))[-event_ndims:]`."""
-    if self.event_ndims is None:
-      raise ValueError("Jacobian cannot be computed with unknown event_ndims")
-    static_event_ndims = tensor_util.constant_value(self.event_ndims)
-    static_rank = sample.get_shape().ndims
-    if static_event_ndims is not None and static_rank is not None:
-      return ops.convert_to_tensor(
-          static_rank + np.arange(-static_event_ndims, 0).astype(np.int32))
-
-    if static_event_ndims is not None:
-      event_range = np.arange(-static_event_ndims, 0).astype(np.int32)
-    else:
-      event_range = math_ops.range(-self.event_ndims, 0, dtype=dtypes.int32)
-
-    if static_rank is not None:
-      return event_range + static_rank
+  def _reduce_jacobian_det_over_event(
+      self, y, ildj, min_event_ndims, event_ndims):
+    """Reduce jacobian over event_ndims - min_event_ndims."""
+    if not self.is_constant_jacobian:
+      return math_ops.reduce_sum(
+          ildj,
+          self._get_event_reduce_dims(min_event_ndims, event_ndims))
+
+    # In this case, we need to tile the jacobian over the event and reduce.
+    y_rank = array_ops.rank(y)
+    y_shape = array_ops.shape(y)[
+        y_rank - event_ndims : y_rank - min_event_ndims]
+
+    ones = array_ops.ones(y_shape, ildj.dtype)
+    reduced_ildj = math_ops.reduce_sum(
+        ones * ildj,
+        axis=self._get_event_reduce_dims(min_event_ndims, event_ndims))
+    # The multiplication by ones can change the inferred static shape so we try
+    # to recover as much as possible.
+    if (isinstance(event_ndims, int) and
+        y.get_shape().ndims and ildj.get_shape().ndims):
+      y_shape = y.get_shape()
+      y_shape = y_shape[y_shape.ndims - event_ndims :
+                        y_shape.ndims - min_event_ndims]
+      ildj_shape = ildj.get_shape()
+      broadcast_shape = array_ops.broadcast_static_shape(
+          ildj_shape, y_shape)
+      reduced_ildj.set_shape(
+          broadcast_shape[: broadcast_shape.ndims - (
+              event_ndims - min_event_ndims)])
+
+    return reduced_ildj
+
+  def _get_event_reduce_dims(self, min_event_ndims, event_ndims):
+    """Compute the reduction dimensions given event_ndims."""
+    min_event_ndims_ = (min_event_ndims if isinstance(min_event_ndims, int)
+                        else tensor_util.constant_value(min_event_ndims))
+    event_ndims_ = (event_ndims if isinstance(event_ndims, int)
+                    else tensor_util.constant_value(event_ndims))
+
+    if min_event_ndims_ is not None and event_ndims_ is not None:
+      return [-index for index in range(1, event_ndims_ - min_event_ndims_ + 1)]
     else:
-      return event_range + array_ops.rank(sample)
+      reduce_ndims = event_ndims - min_event_ndims
+      return math_ops.range(-reduce_ndims, 0)
+
+  def _check_valid_event_ndims(self, min_event_ndims, event_ndims):
+    """Check whether event_ndims is atleast min_event_ndims."""
+    min_event_ndims_ = (min_event_ndims if isinstance(min_event_ndims, int)
+                        else tensor_util.constant_value(min_event_ndims))
+    event_ndims_ = (event_ndims if isinstance(event_ndims, int)
+                    else tensor_util.constant_value(event_ndims))
+
+    if min_event_ndims_ is not None and event_ndims_ is not None:
+      if min_event_ndims_ > event_ndims_:
+        raise ValueError("event_ndims ({}) must be larger than "
+                         "min_event_ndims ({})".format(
+                             event_ndims_, min_event_ndims_))
+      return []
+
+    if self.validate_args:
+      return [check_ops.assert_greater_equal(event_ndims, min_event_ndims)]
+    return []
diff --git a/tensorflow/python/ops/distributions/bijector_test_util.py b/tensorflow/python/ops/distributions/bijector_test_util.py
index ff3535c626..784bfd5835 100644
--- a/tensorflow/python/ops/distributions/bijector_test_util.py
+++ b/tensorflow/python/ops/distributions/bijector_test_util.py
@@ -79,9 +79,7 @@ def assert_scalar_congruency(bijector,
   Raises:
     AssertionError:  If tests fail.
   """
-
   # Checks and defaults.
-  assert bijector.event_ndims.eval() == 0
   if sess is None:
     sess = ops.get_default_session()
 
@@ -111,7 +109,10 @@ def assert_scalar_congruency(bijector,
   # (b - a) = \int_a^b dx = \int_{y(a)}^{y(b)} |dx/dy| dy
   # "change_measure_dy_dx" below is a Monte Carlo approximation to the right
   # hand side, which should then be close to the left, which is (b - a).
-  dy_dx = math_ops.exp(bijector.inverse_log_det_jacobian(uniform_y_samps))
+  # We assume event_ndims=0 because we assume scalar -> scalar. The log_det
+  # methods will handle whether they expect event_ndims > 0.
+  dy_dx = math_ops.exp(bijector.inverse_log_det_jacobian(
+      uniform_y_samps, event_ndims=0))
   # E[|dx/dy|] under Uniform[lower_y, upper_y]
   # = \int_{y(a)}^{y(b)} |dx/dy| dP(u), where dP(u) is the uniform measure
   expectation_of_dy_dx_under_uniform = math_ops.reduce_mean(dy_dx)
@@ -121,7 +122,8 @@ def assert_scalar_congruency(bijector,
 
   # We'll also check that dy_dx = 1 / dx_dy.
   dx_dy = math_ops.exp(
-      bijector.forward_log_det_jacobian(bijector.inverse(uniform_y_samps)))
+      bijector.forward_log_det_jacobian(
+          bijector.inverse(uniform_y_samps), event_ndims=0))
 
   [
       forward_on_10_pts_v,
@@ -158,7 +160,8 @@ def assert_scalar_congruency(bijector,
       dy_dx_v, np.divide(1., dx_dy_v), atol=1e-5, rtol=1e-3)
 
 
-def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
+def assert_bijective_and_finite(
+    bijector, x, y, event_ndims, atol=0, rtol=1e-5, sess=None):
   """Assert that forward/inverse (along with jacobians) are inverses and finite.
 
   It is recommended to use x and y values that are very very close to the edge
@@ -168,6 +171,8 @@ def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
     bijector:  A Bijector instance.
     x:  np.array of values in the domain of bijector.forward.
     y:  np.array of values in the domain of bijector.inverse.
+    event_ndims: Integer describing the number of event dimensions this bijector
+      operates on.
     atol:  Absolute tolerance.
     rtol:  Relative tolerance.
     sess:  TensorFlow session.  Defaults to the default session.
@@ -197,10 +202,10 @@ def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
   ] = sess.run([
       bijector.inverse(f_x),
       bijector.forward(g_y),
-      bijector.inverse_log_det_jacobian(f_x),
-      bijector.forward_log_det_jacobian(x),
-      bijector.inverse_log_det_jacobian(y),
-      bijector.forward_log_det_jacobian(g_y),
+      bijector.inverse_log_det_jacobian(f_x, event_ndims=event_ndims),
+      bijector.forward_log_det_jacobian(x, event_ndims=event_ndims),
+      bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims),
+      bijector.forward_log_det_jacobian(g_y, event_ndims=event_ndims),
       f_x,
       g_y,
   ])
diff --git a/tensorflow/python/ops/distributions/bijectors.py b/tensorflow/python/ops/distributions/bijectors.py
deleted file mode 100644
index 69c3a5d4c0..0000000000
--- a/tensorflow/python/ops/distributions/bijectors.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Core module for TensorFlow distribution bijectors."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions.bijector import Bijector
-from tensorflow.python.ops.distributions.identity_bijector import Identity
-
-# pylint: enable=wildcard-import,unused-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Bijector", "Identity"]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/distributions.py b/tensorflow/python/ops/distributions/distributions.py
index 9df7d148a5..7c4b8697d8 100644
--- a/tensorflow/python/ops/distributions/distributions.py
+++ b/tensorflow/python/ops/distributions/distributions.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions import bijectors
 from tensorflow.python.ops.distributions.bernoulli import Bernoulli
 from tensorflow.python.ops.distributions.beta import Beta
 from tensorflow.python.ops.distributions.categorical import Categorical
@@ -40,7 +39,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
-    "bijectors",
     "Bernoulli",
     "Beta",
     "Categorical",
diff --git a/tensorflow/python/ops/distributions/identity_bijector.py b/tensorflow/python/ops/distributions/identity_bijector.py
index 2972c3554b..8628e68f96 100644
--- a/tensorflow/python/ops/distributions/identity_bijector.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -28,7 +27,6 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.bijectors.Identity")
 class Identity(bijector.Bijector):
   """Compute Y = g(X) = X.
 
@@ -37,7 +35,7 @@ class Identity(bijector.Bijector):
     ```python
     # Create the Y=g(X)=X transform which is intended for Tensors with 1 batch
     # ndim and 1 event ndim (i.e., vector of vectors).
-    identity = Identity(event_ndims=1)
+    identity = Identity()
     x = [[1., 2],
          [3, 4]]
     x == identity.forward(x) == identity.inverse(x)
@@ -45,10 +43,10 @@ class Identity(bijector.Bijector):
 
   """
 
-  def __init__(self, validate_args=False, event_ndims=0, name="identity"):
+  def __init__(self, validate_args=False, name="identity"):
     super(Identity, self).__init__(
+        forward_min_event_ndims=0,
         is_constant_jacobian=True,
-        event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1efcf9d32e..1ad63a8cf6 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -197,8 +197,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Affine(
       shift=-1.,
-      scale_identity_multiplier=2.,
-      event_ndims=0),
+      scale_identity_multiplier=2.)
     name="NormalTransformedDistribution")
   ```
 
@@ -419,48 +418,51 @@ class TransformedDistribution(distribution_lib.Distribution):
     # For caching to work, it is imperative that the bijector is the first to
     # modify the input.
     x = self.bijector.inverse(y)
-    ildj = self.bijector.inverse_log_det_jacobian(y)
+    event_ndims = self._maybe_get_event_ndims_statically()
+
+    ildj = self.bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims)
     if self.bijector._is_injective:  # pylint: disable=protected-access
-      return self._finish_log_prob_for_one_fiber(y, x, ildj)
+      return self._finish_log_prob_for_one_fiber(y, x, ildj, event_ndims)
 
     lp_on_fibers = [
-        self._finish_log_prob_for_one_fiber(y, x_i, ildj_i)
+        self._finish_log_prob_for_one_fiber(y, x_i, ildj_i, event_ndims)
         for x_i, ildj_i in zip(x, ildj)]
     return math_ops.reduce_logsumexp(array_ops.stack(lp_on_fibers), axis=0)
 
-  def _finish_log_prob_for_one_fiber(self, y, x, ildj):
+  def _finish_log_prob_for_one_fiber(self, y, x, ildj, event_ndims):
     """Finish computation of log_prob on one element of the inverse image."""
     x = self._maybe_rotate_dims(x, rotate_right=True)
     log_prob = self.distribution.log_prob(x)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
     log_prob += math_ops.cast(ildj, log_prob.dtype)
-    if self._is_maybe_event_override:
+    if self._is_maybe_event_override and isinstance(event_ndims, int):
       log_prob.set_shape(array_ops.broadcast_static_shape(
-          y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
+          x.get_shape().with_rank_at_least(1)[:-event_ndims], self.batch_shape))
     return log_prob
 
   def _prob(self, y):
     x = self.bijector.inverse(y)
-    ildj = self.bijector.inverse_log_det_jacobian(y)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims)
     if self.bijector._is_injective:  # pylint: disable=protected-access
-      return self._finish_prob_for_one_fiber(y, x, ildj)
+      return self._finish_prob_for_one_fiber(y, x, ildj, event_ndims)
 
     prob_on_fibers = [
-        self._finish_prob_for_one_fiber(y, x_i, ildj_i)
+        self._finish_prob_for_one_fiber(y, x_i, ildj_i, event_ndims)
         for x_i, ildj_i in zip(x, ildj)]
     return sum(prob_on_fibers)
 
-  def _finish_prob_for_one_fiber(self, y, x, ildj):
+  def _finish_prob_for_one_fiber(self, y, x, ildj, event_ndims):
     """Finish computation of prob on one element of the inverse image."""
     x = self._maybe_rotate_dims(x, rotate_right=True)
     prob = self.distribution.prob(x)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
     prob *= math_ops.exp(math_ops.cast(ildj, prob.dtype))
-    if self._is_maybe_event_override:
+    if self._is_maybe_event_override and isinstance(event_ndims, int):
       prob.set_shape(array_ops.broadcast_static_shape(
-          y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
+          y.get_shape().with_rank_at_least(1)[:-event_ndims], self.batch_shape))
     return prob
 
   def _log_cdf(self, y):
@@ -545,10 +547,17 @@ class TransformedDistribution(distribution_lib.Distribution):
           _ones_like(self.distribution.batch_shape_tensor())
       ], 0)
       entropy = array_ops.tile(entropy, multiples)
-    dummy = array_ops.zeros([], self.dtype)
-    entropy -= math_ops.cast(
-        self.bijector.inverse_log_det_jacobian(dummy),
-        entropy.dtype)
+    dummy = array_ops.zeros(
+        shape=array_ops.concat(
+            [self.batch_shape_tensor(), self.event_shape_tensor()],
+            0),
+        dtype=self.dtype)
+    event_ndims = (self.event_shape.ndims if self.event_shape.ndims is not None
+                   else array_ops.size(self.event_shape_tensor()))
+    ildj = self.bijector.inverse_log_det_jacobian(
+        dummy, event_ndims=event_ndims)
+
+    entropy -= math_ops.cast(ildj, entropy.dtype)
     entropy.set_shape(self.batch_shape)
     return entropy
 
@@ -610,3 +619,16 @@ class TransformedDistribution(distribution_lib.Distribution):
     n = (ndims - self._rotate_ndims) if rotate_right else self._rotate_ndims
     return array_ops.transpose(
         x, _concat_vectors(math_ops.range(n, ndims), math_ops.range(0, n)))
+
+  def _maybe_get_event_ndims_statically(self):
+    if self.event_shape.ndims is not None:
+      return self.event_shape.ndims
+
+    event_ndims = array_ops.size(self.event_shape_tensor())
+
+    static_event_ndims = tensor_util.constant_value(event_ndims)
+
+    if static_event_ndims is not None:
+      return static_event_ndims
+
+    return event_ndims
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
deleted file mode 100644
index 11565bd3e4..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
+++ /dev/null
@@ -1,65 +0,0 @@
-path: "tensorflow.distributions.bijectors.Bijector"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_ndims"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_parents"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_constant_jacobian"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'event_ndims\', \'graph_parents\', \'is_constant_jacobian\', \'validate_args\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "forward"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
-  }
-  member_method {
-    name: "forward_event_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "forward_event_shape_tensor"
-    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "forward_log_det_jacobian"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
-  }
-  member_method {
-    name: "inverse"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
-  }
-  member_method {
-    name: "inverse_event_shape"
-    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "inverse_event_shape_tensor"
-    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "inverse_log_det_jacobian"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
deleted file mode 100644
index 1e5fe624eb..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
+++ /dev/null
@@ -1,66 +0,0 @@
-path: "tensorflow.distributions.bijectors.Identity"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.identity_bijector.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_ndims"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_parents"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_constant_jacobian"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'validate_args\', \'event_ndims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'identity\'], "
-  }
-  member_method {
-    name: "forward"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
-  }
-  member_method {
-    name: "forward_event_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "forward_event_shape_tensor"
-    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "forward_log_det_jacobian"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
-  }
-  member_method {
-    name: "inverse"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
-  }
-  member_method {
-    name: "inverse_event_shape"
-    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "inverse_event_shape_tensor"
-    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "inverse_log_det_jacobian"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
deleted file mode 100644
index 1d0144f36e..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.distributions.bijectors"
-tf_module {
-  member {
-    name: "Bijector"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-  member {
-    name: "Identity"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
index 2fba7c506e..90b60ef074 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
@@ -68,10 +68,6 @@ tf_module {
     name: "Uniform"
     mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
   }
-  member {
-    name: "bijectors"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "kl_divergence"
     argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-- 
GitLab


From e5201672aa664cf39725f4a52b9774d2bae43ba3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 14:04:09 -0700
Subject: [PATCH 0614/1262] Adds a nodedef_fn parameter to copy_op_handler,
 allowing customization by mutating NodeDef before creating the copied
 operation.

PiperOrigin-RevId: 192505209
---
 .../graph_editor/tests/transform_test.py      | 29 +++++++++++++++++++
 tensorflow/contrib/graph_editor/transform.py  | 11 ++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index 2603de6407..97f38c923f 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -18,9 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import numpy as np
 from tensorflow.contrib import graph_editor as ge
 from tensorflow.contrib.graph_editor.tests import match
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -42,6 +44,7 @@ class TransformTest(test.TestCase):
     self.graph = ops.Graph()
     with self.graph.as_default():
       c0 = constant_op.constant(1.0, shape=[10], name="Const")
+      c0.op._set_attr("_foo", attr_value_pb2.AttrValue(s=b"foo"))
       c1 = constant_op.constant(1.0, shape=[10], name="Const")
       c2 = constant_op.constant(1.0, shape=[10], name="Const")
       i = constant_op.constant(1.0, shape=[10], name="Input")
@@ -112,6 +115,32 @@ class TransformTest(test.TestCase):
     top = ge.select_ops("^AddNoise_2$", graph=graph)[0]
     self.assertTrue(matcher2(top))
 
+  def test_transform_nodedef_fn(self):
+    transformer = ge.Transformer()
+
+    def nodedef_fn(node_def):
+      if "_foo" in node_def.attr:
+        del node_def.attr["_foo"]
+      node_def.attr["_bar"].s = b"bar"
+      return node_def
+
+    my_copy_op_handler = functools.partial(
+        ge.transform.copy_op_handler, nodedef_fn=nodedef_fn)
+    transformer.transform_op_handler = my_copy_op_handler
+
+    graph = ops.Graph()
+    transformer(self.graph, graph, "", "")
+
+    c0_before = self.graph.get_operation_by_name("Const")
+    c0_after = graph.get_operation_by_name("Const")
+    self.assertEquals(c0_before.get_attr("_foo"), b"foo")
+    with self.assertRaises(ValueError):
+      c0_after.get_attr("_foo")
+
+    all_ops = graph.get_operations()
+    for op in all_ops:
+      self.assertEquals(op.get_attr("_bar"), b"bar")
+
   def test_copy_with_input_replacements(self):
     with self.graph.as_default():
       ten = constant_op.constant(10.0, shape=[10], name="Input")
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index d8a48387a7..a320a3f232 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -129,7 +129,7 @@ def transform_op_if_inside_handler(info, op, keep_if_possible=True):
       return None
 
 
-def copy_op_handler(info, op, new_inputs, copy_shape=True):
+def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None):
   """Copy a `tf.Operation`.
 
   Args:
@@ -137,6 +137,11 @@ def copy_op_handler(info, op, new_inputs, copy_shape=True):
     op: the `tf.Operation` to be copied.
     new_inputs: The new inputs for this op.
     copy_shape: also copy the shape of the tensor
+    nodedef_fn: If provided, a function that will be run on the NodeDef
+      and should return a mutated NodeDef before a new Operation is created.
+      This is useful as certain features cannot be set on the Operation and
+      must be modified in NodeDef.
+
   Returns:
     A `(op, op_outputs)` tuple containing the transformed op and its outputs.
   """
@@ -155,6 +160,10 @@ def copy_op_handler(info, op, new_inputs, copy_shape=True):
   name_ = info.graph_.unique_name(name_)
   node_def_.name = name_
 
+  # Mutate NodeDef if requested:
+  if nodedef_fn is not None:
+    node_def_ = nodedef_fn(node_def_)
+
   # Copy the other inputs needed for initialization
   output_types_ = op._output_types[:]
   input_types_ = op._input_types[:]
-- 
GitLab


From 21fb4eeb3e09fb0dea1dd12b0fff7a7bf0a33643 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 14:39:11 -0700
Subject: [PATCH 0615/1262] Adding support for batch_to_space_nd op with crops.

PiperOrigin-RevId: 192511036
---
 .../contrib/lite/kernels/batch_to_space_nd.cc |  2 +
 .../contrib/lite/kernels/internal/BUILD       |  9 ++
 .../internal/batch_to_space_nd_test.cc        | 98 +++++++++++++++++++
 .../internal/optimized/optimized_ops.h        | 66 +++++++++++--
 .../internal/reference/reference_ops.h        | 27 +++--
 .../contrib/lite/testing/generate_examples.py |  5 +-
 .../propagate_fixed_sizes.cc                  | 16 ++-
 tensorflow/contrib/lite/toco/model.h          |  3 +-
 8 files changed, 195 insertions(+), 31 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc

diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index bc438f99c6..90edf4f9e3 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -123,6 +123,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                        GetTensorDims(op_context.input),                \
                        GetTensorData<int32_t>(op_context.block_shape), \
                        GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<int32_t>(op_context.crops),       \
+                       GetTensorDims(op_context.crops),                \
                        GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 32a0acf888..67dd188496 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -432,4 +432,13 @@ cc_library(
     ),
 )
 
+cc_test(
+    name = "batch_to_space_nd_test",
+    srcs = ["batch_to_space_nd_test.cc"],
+    deps = [
+        ":optimized_base",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
diff --git a/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000..5a2901ac8c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+// A light wrapper of GetIndexRange which returns a pair of start / end
+// indices.
+std::pair<int, int> GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                                  int input_dim, int output_dim) {
+  int index_start = 0;
+  int index_end = 0;
+  optimized_ops::GetIndexRange(spatial_index_dim, block_shape_dim, input_dim,
+                               output_dim, &index_start, &index_end);
+  return {index_start, index_end};
+}
+
+TEST(BatchToSpaceNDTest, TestIndexRange) {
+  // Simple test case, no cropping.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/6,
+                          /*input_dim=*/1, /*output_dim=*/6),
+            std::make_pair(0, 1));
+
+  // No cropping and input_dim > 1.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/2, /*block_shape_dim=*/6,
+                          /*input_dim=*/5, /*output_dim=*/30),
+            std::make_pair(0, 5));
+
+  // With small cropping values (can be either at the beginning or at the end).
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(0, 2));
+
+  // With positive cropping values at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-2, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(1, 3));
+
+  // Large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-26, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  // Large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/4, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Rounding up incorrectly will fail this test.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Extreme cropping with output of a single spatial location.
+  // Valid position 1, when large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 1));
+
+  // Valid position 2, when large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 7));
+
+  // Invalid positions.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/1, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 0));
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-29, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 6));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 5f60b2d6a0..fa91db7fe1 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5212,6 +5212,7 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* paddings_data,
                            const Dims<4>& paddings_dims, T* output_data,
                            const Dims<4>& output_dims) {
+  // Unoptimized - Straight copy from reference ops.
   gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
 
   const int output_batch_size = ArraySize(output_dims, 3);
@@ -5253,29 +5254,76 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Helper methods for BatchToSpaceND.
+// `spatial_index_dim` specifies post-crop offset index in this spatial
+// dimension, i.e. spatial offset introduced by flattening batch to spatial
+// dimension minus the crop size at beginning. `block_shape_dim` is the block
+// size in current dimension. `input_dim` and `output_dim` are input and output
+// size of BatchToSpaceND operation in current dimension.
+// Output start index is inclusive and end index is exclusive.
+inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                          int input_dim, int output_dim, int* start_index,
+                          int* end_index) {
+  // (*start_index) * block_shape_dim is effectively rounded up to the next
+  // multiple of block_shape_dim by the integer division.
+  *start_index =
+      std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+  // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
+  // end_index is exclusive).
+  *end_index = std::min(
+      input_dim,
+      (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+}
+
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
 
   const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
   const int input_batch_size = ArraySize(input_dims, 3);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int depth = ArraySize(input_dims, 0);
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
+  const int crops_top = crops_data[0];
+  const int crops_left = crops_data[2];
 
   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
-    for (int in_h = 0; in_h < input_height; ++in_h) {
-      for (int in_w = 0; in_w < input_width; ++in_w) {
-        int out_batch = in_batch % output_batch_size;
-        int out_w = in_w * block_shape_width +
-                    (in_batch / output_batch_size) % block_shape_width;
-        int out_h = in_h * block_shape_height +
-                    (in_batch / output_batch_size) / block_shape_width;
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
+
+    int in_h_start = 0;
+    int in_h_end = 0;
+    // GetIndexRange ensures start and end indices are in [0, output_height).
+    GetIndexRange(spatial_offset / block_shape_width - crops_top,
+                  block_shape_height, input_height, output_height, &in_h_start,
+                  &in_h_end);
+
+    for (int in_h = in_h_start; in_h < in_h_end; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      TFLITE_DCHECK_GE(out_h, 0);
+      TFLITE_DCHECK_LT(out_h, output_height);
+
+      int in_w_start = 0;
+      int in_w_end = 0;
+      // GetIndexRange ensures start and end indices are in [0, output_width).
+      GetIndexRange(spatial_offset % block_shape_width - crops_left,
+                    block_shape_width, input_width, output_width, &in_w_start,
+                    &in_w_end);
+
+      for (int in_w = in_w_start; in_w < in_w_end; ++in_w) {
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+        TFLITE_DCHECK_GE(out_w, 0);
+        TFLITE_DCHECK_LT(out_w, output_width);
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
         memcpy(out, in, depth * sizeof(T));
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 0912f5928c..c6019390f2 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2873,24 +2873,37 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
   const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
   const int input_batch_size = ArraySize(input_dims, 3);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int depth = ArraySize(input_dims, 0);
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
+  const int crops_top = crops_data[0];
+  const int crops_left = crops_data[2];
 
   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
     for (int in_h = 0; in_h < input_height; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      if (out_h < 0 || out_h >= output_height) {
+        continue;
+      }
       for (int in_w = 0; in_w < input_width; ++in_w) {
-        int out_batch = in_batch % output_batch_size;
-        int out_w = in_w * block_shape_width +
-                    (in_batch / output_batch_size) % block_shape_width;
-        int out_h = in_h * block_shape_height +
-                    (in_batch / output_batch_size) / block_shape_width;
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+
+        if (out_w < 0 || out_w >= output_width) {
+          continue;
+        }
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
         memcpy(out, in, depth * sizeof(T));
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 0e6aceeb86..4b4ccc0c37 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -93,9 +93,6 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
-    # BatchToSpaceND doesn't support cropping. This catches test cases with
-    # const tensors as crops.
-    r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
@@ -1595,7 +1592,7 @@ def make_batch_to_space_nd_tests(zip_path):
   test_parameters = [
       {
           "dtype": [tf.float32, tf.int64, tf.int32],
-          "input_shape": [[12, 2, 2, 1]],
+          "input_shape": [[12, 3, 3, 1]],
           "block_shape": [[1, 4], [2, 2], [3, 4]],
           "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
           "constant_block_shape": [True, False],
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index a648b770f8..9191e69662 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1060,17 +1060,15 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   }
   QCHECK(crops_array.data_type == ArrayDataType::kInt32);
   const auto& crops_data = crops_array.GetBuffer<ArrayDataType::kInt32>().data;
-  // We don't support crops now.
-  QCHECK_EQ(crops_data[0], 0);
-  QCHECK_EQ(crops_data[1], 0);
-  QCHECK_EQ(crops_data[2], 0);
-  QCHECK_EQ(crops_data[3], 0);
-
+  const int crops_top = crops_data[0];
+  const int crops_bottom = crops_data[1];
+  const int crops_left = crops_data[2];
+  const int crops_right = crops_data[3];
+  const int output_height =
+      input_height * block_height - crops_top - crops_bottom;
+  const int output_width = input_width * block_width - crops_left - crops_right;
   QCHECK_EQ(input_shape.dims(0) % (block_height * block_width), 0);
 
-  int output_height = input_height * block_height;
-  int output_width = input_width * block_width;
-
   model->GetArray(op->outputs[0])
       .copy_shape(Shape({input_shape.dims(0) / (block_height * block_width),
                          output_height, output_width, input_shape.dims(3)}));
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 616d53ae3e..716a579d22 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -1420,8 +1420,7 @@ struct SpaceToBatchNDOperator : Operator {
 };
 
 // BatchToSpaceND operator. Rearranges data from batch into blocks of
-// spatial data. Currently, only 2-d blocks are supported. Cropping is not
-// supported, either, and the crops array should be all zero.
+// spatial data. Currently, only 2-d blocks are supported.
 //
 // Inputs:
 //   inputs[0]: required: the input array
-- 
GitLab


From 64c3e9f9636c73a5aec11572475f2cd26dbbc87b Mon Sep 17 00:00:00 2001
From: bhavani-subramanian <bhavani1.subramanian@intel.com>
Date: Wed, 11 Apr 2018 15:10:38 -0700
Subject: [PATCH 0616/1262] [INTEL MKL] Skip special nodes inserted by TF and
 MKL (#18077)

* Skip special nodes inserted by TF. This fixes TFDO-178 JIIRA issue.

* Added a comment about skipping nodes with an /_ in them.

* Stripped trailing whitespace.

* Wrapped code such that it is executed only when INTEL_MKL is defined.
---
 tensorflow/core/grappler/clusters/single_machine_test.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index c6352c1448..352f08fede 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -196,10 +196,19 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
   std::set<string> cost_nodes;
   for (const auto& node : metadata.cost_graph().node()) {
+#ifdef INTEL_MKL
+    // Skip the special nodes inserted by TF (and MKL): these are either
+    // prefixed with an underscore or contain "/_".
+    if (node.name()[0] == '_' || node.name().find("/_") != string::npos) {
+      continue;
+    }
+    cost_nodes.insert(node.name());
+#else
     // Skip nodes added by TF internally.
     if (node.name()[0] != '_') {
       cost_nodes.insert(node.name());
     }
+#endif
   }
   const std::set<string> expected_cost_nodes = {
       "zero",      "one",      "add",         "square",
-- 
GitLab


From d2690cf5893cb117ab52f0169fe730736dc22ab7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 15:09:07 -0700
Subject: [PATCH 0617/1262] Extend support to remove transpose/reverse on
 dimensions of size 1.

PiperOrigin-RevId: 192516190
---
 tensorflow/core/grappler/op_types.cc          |  8 +-
 tensorflow/core/grappler/op_types.h           |  1 +
 .../grappler/optimizers/constant_folding.cc   | 95 +++++++++++++++++--
 .../optimizers/constant_folding_test.cc       | 80 +++++++++++++++-
 4 files changed, 168 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 9c45aed62f..cfe1329dbf 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -249,6 +249,10 @@ bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
 
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
+bool IsRandomShuffle(const NodeDef& node) {
+  return node.op() == "RandomShuffle";
+}
+
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
 
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
@@ -298,9 +302,7 @@ bool IsShape(const NodeDef& node) { return node.op() == "Shape"; }
 
 bool IsShapeN(const NodeDef& node) { return node.op() == "ShapeN"; }
 
-bool IsShuffle(const NodeDef& node) {
-  return node.op() == "Shuffle" || node.op() == "RandomShuffle";
-}
+bool IsShuffle(const NodeDef& node) { return node.op() == "Shuffle"; }
 
 bool IsSigmoidGrad(const NodeDef& node) { return node.op() == "SigmoidGrad"; }
 
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 79fd05e187..0573b02604 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -98,6 +98,7 @@ bool IsPolygamma(const NodeDef& node);
 bool IsPrint(const NodeDef& node);
 bool IsProd(const NodeDef& node);
 bool IsPow(const NodeDef& node);
+bool IsRandomShuffle(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
 bool IsRelu6Grad(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index b2a1ce6ab6..17d8b7421c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1574,24 +1574,99 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
       continue;
     }
 
-    // Remove Shuffle or Reverse op over scalar values.
-    if (use_shape_info &&
-        !properties->GetInputProperties(node->name()).empty() &&
-        (IsShuffle(*node) || IsReverse(*node) || IsTranspose(*node))) {
+    // Remove Shuffle or Transpose op over dimensions of size 1.
+    if (use_shape_info && (IsShuffle(*node) || IsTranspose(*node)) &&
+        !properties->GetInputProperties(node->name()).empty()) {
       const auto& shape =
           properties->GetInputProperties(node->name())[0].shape();
-      // The node is replaceable iff
-      // unknown_rank == false && (dim_size == 0 || all dims have size 1)
-      bool replaceable = !shape.unknown_rank();
-      for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
-        replaceable &= shape.dim(j).size() == 1;
+      if (shape.unknown_rank()) {
+        // Not optimizable.
+        continue;
       }
-      if (replaceable) {
+      const auto& p = properties->GetInputProperties(node->name())[1];
+      if (TensorShape::IsValid(p.shape()) && p.has_value()) {
+        Tensor perm(p.dtype(), p.shape());
+        if (!perm.FromProto(p.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         p.value().DebugString());
+        }
+        std::vector<int> permutation;
+        for (int j = 0; j < perm.NumElements(); ++j) {
+          if (perm.dtype() == DT_INT64) {
+            permutation.push_back(perm.vec<int64>()(j));
+          } else {
+            permutation.push_back(perm.vec<int>()(j));
+          }
+        }
+        if (permutation.size() != shape.dim_size()) {
+          // Number of elements in perm should be same as dim_size. Skip if not.
+          continue;
+        }
+        // The node is replaceable iff
+        // dim_size == 0 || all dims have size 1 ||
+        // all dims with > 1 size are not permuted.
+        bool replaceable = true;
+        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+          replaceable &= shape.dim(j).size() == 1 || j == permutation[j];
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
+    // Remove RandomShuffle op if it is scalar or first dimension is of size 1.
+    if (use_shape_info && IsRandomShuffle(*node) &&
+        !properties->GetInputProperties(node->name()).empty()) {
+      const auto& shape =
+          properties->GetInputProperties(node->name())[0].shape();
+      // The node is replaceable iff
+      // unknown_rank == false && (dim_size == 0 || first dim is of size 1)
+      if (!shape.unknown_rank() &&
+          (shape.dim_size() == 0 || shape.dim(0).size() == 1)) {
         ReplaceOperationWithIdentity(0, node, optimized_graph);
         continue;
       }
     }
 
+    // Remove Reverse op over dimensions with size 1.
+    if (use_shape_info && IsReverse(*node) &&
+        !properties->GetInputProperties(node->name()).empty()) {
+      const auto& shape =
+          properties->GetInputProperties(node->name())[0].shape();
+      const auto& a = properties->GetInputProperties(node->name())[1];
+      if (TensorShape::IsValid(a.shape()) && a.has_value()) {
+        Tensor axis(a.dtype(), a.shape());
+        if (!axis.FromProto(a.value())) {
+          return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                         a.value().DebugString());
+        }
+        std::set<int> target_axes;
+        for (int j = 0; j < axis.NumElements(); ++j) {
+          if (axis.dtype() == DT_INT64) {
+            target_axes.insert(axis.vec<int64>()(j));
+          } else {
+            target_axes.insert(axis.vec<int>()(j));
+          }
+        }
+
+        // The node is replaceable iff
+        // unknown_rank == false &&
+        // (dim_size == 0 || all dims have size 1 ||
+        //  all dims with > 1 size are not in target_axes)
+        bool replaceable = !shape.unknown_rank();
+        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+          replaceable &= shape.dim(j).size() == 1 ||
+                         target_axes.find(j) == target_axes.end();
+        }
+        if (replaceable) {
+          ReplaceOperationWithIdentity(0, node, optimized_graph);
+          continue;
+        }
+      }
+    }
+
     if (use_shape_info && IsSlice(*node) &&
         properties->GetInputProperties(node->name()).size() == 3) {
       const auto& input = properties->GetInputProperties(node->name())[0];
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 31abe43846..7453fb6731 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1389,8 +1389,6 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   ops::SplitV s1(scope.WithOpName("s1"), in1, size_splits1, split_dim, 1);
   ops::SplitV s2(scope.WithOpName("s2"), in2, size_splits2, split_dim, 2);
 
-  LOG(INFO) << s1.output.size();
-  LOG(INFO) << s2.output.size();
   ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
 
   GrapplerItem item;
@@ -1418,7 +1416,45 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   CompareGraphs(want, got);
 }
 
-TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
+TEST_F(ConstantFoldingTest, TransposeOnSize1DimsRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 = ops::Variable(scope.WithOpName("in1"), TensorShape({1, 2, 4, 1}),
+                             DT_FLOAT);
+  Output p1 = ops::Const(scope.WithOpName("p1"), {3, 2, 1, 0}, {4});
+  Output in2 = ops::Variable(scope.WithOpName("in2"), TensorShape({1, 4, 2, 1}),
+                             DT_FLOAT);
+  Output p2 = ops::Const(scope.WithOpName("p2"), {3, 1, 2, 0}, {4});
+  ops::Transpose t1(scope.WithOpName("t1"), in1, p1);
+  ops::Transpose t2(scope.WithOpName("t2").WithControlDependencies({in1}), in2,
+                    p2);
+
+  ops::Add out1(scope.WithOpName("out1"), t1, t2);
+
+  GrapplerItem item;
+  item.fetch = {"out1"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("p1", "Const", {}, {}, &want);
+  AddNode("p2", "Const", {}, {}, &want);
+  AddNode("t1", "Transpose", {"in1", "p1"}, {}, &want);
+  AddNode("t2", "Identity",
+          {"in2", AsControlDependency("in1"), AsControlDependency("p2")}, {},
+          &want);
+  AddNode("out1", "Add", {"t1", "t2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
+TEST_F(ConstantFoldingTest, RandomShuffleOnScalarRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
   Output in1 =
@@ -1452,6 +1488,44 @@ TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   CompareGraphs(want, got);
 }
 
+TEST_F(ConstantFoldingTest, ReverseOnSize1DimsRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 = ops::Variable(scope.WithOpName("in1"), TensorShape({1, 2, 4, 1}),
+                             DT_FLOAT);
+  Output a1 = ops::Const(scope.WithOpName("a1"), {3, 2, 1, 0}, {4});
+  Output in2 = ops::Variable(scope.WithOpName("in2"), TensorShape({1, 2, 4, 1}),
+                             DT_FLOAT);
+  Output a2 = ops::Const(scope.WithOpName("a2"), {0, 3}, {2});
+  ops::Reverse r1(scope.WithOpName("r1"), in1, a1);
+  ops::Reverse r2(scope.WithOpName("r2").WithControlDependencies({in1}), in2,
+                  a2);
+
+  ops::Add out1(scope.WithOpName("out1"), r1, r2);
+
+  GrapplerItem item;
+  item.fetch = {"out1"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("a1", "Const", {}, {}, &want);
+  AddNode("a2", "Const", {}, {}, &want);
+  AddNode("r1", "ReverseV2", {"in1", "a1"}, {}, &want);
+  AddNode("r2", "Identity",
+          {"in2", AsControlDependency("in1"), AsControlDependency("a2")}, {},
+          &want);
+  AddNode("out1", "Add", {"r1", "r2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
 TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
   {  // size = {3, 5}
     tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-- 
GitLab


From 0cc518ee98d4caa154f8a7530cb971c00c610905 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 11 Apr 2018 09:34:44 -0700
Subject: [PATCH 0618/1262] Fix Windows GPU TensorFlow Bazel builds.

The configure.py script will error out on Windows GPU builds due
to NCCL attempted to be configured (and is currently Linux only).

PiperOrigin-RevId: 192461362
---
 configure.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configure.py b/configure.py
index 81d5ad77ee..8fb8979111 100644
--- a/configure.py
+++ b/configure.py
@@ -1516,7 +1516,8 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
-    set_tf_nccl_install_path(environ_cp)
+      set_tf_nccl_install_path(environ_cp)
+
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
-- 
GitLab


From 079d63d59b75bdfd25f7371efda25ec5f6739b78 Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Wed, 11 Apr 2018 15:20:11 -0700
Subject: [PATCH 0619/1262] GCS Filesystem should not cache checkpoint file as
 we need to read the updated checkpoints from the contents.

PiperOrigin-RevId: 192517819
---
 .../core/platform/cloud/gcs_file_system.cc    |  8 ++++
 .../platform/cloud/gcs_file_system_test.cc    | 48 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 3c0dc13d75..6ed1d5dad2 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -301,6 +301,14 @@ class GcsRandomAccessFile : public RandomAccessFile {
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
                                                &bytes_transferred));
     *result = StringPiece(scratch, bytes_transferred);
+    string checkpoint_ending = "/checkpoint";
+    // Check if the file is the checkpoint file as we should not be caching
+    // that. As it's contents are updated and used for iterating checkpoints.
+    if (std::equal(checkpoint_ending.rbegin(), checkpoint_ending.rend(),
+                   filename_.rbegin())) {
+      // Remove the checkpoint file from the cache
+      file_block_cache_->RemoveFile(filename_);
+    }
     if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 2fbde9b6a7..e9eca04fef 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -198,6 +198,54 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_CheckpointFile_WithBlockCache) {
+  // Our underlying file in this test changes as new data comes in
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "abcdefghi")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  char scratch[100];
+  StringPiece result;
+  {
+    // We are instantiating this in an enclosed scope to make sure after the
+    // unique ptr goes out of scope, we can still access result.
+    std::unique_ptr<RandomAccessFile> file;
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/checkpoint", &file));
+
+    // Read the first chunk. The cache will be populated with the first block of
+    // 9 bytes.
+    scratch[5] = 'x';
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("0123", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+
+    // The second chunk should not be in cache so we make a new request
+    // As the checkpoint file should not be cached
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("abcd", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+  }
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
   // Our underlying file in this test is a 15 byte file with contents
   // "0123456789abcde".
-- 
GitLab


From 4b08b66ab504e5356f1bf2ecf2f0c9e61f1157e7 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 11 Apr 2018 15:23:17 -0700
Subject: [PATCH 0620/1262] Fixes issue where name scope collisions could lead
 to an invalid variable in the metagraph.

PiperOrigin-RevId: 192518307
---
 .../resource_variable_ops_test.py             | 47 +++++++++++++++----
 .../python/ops/resource_variable_ops.py       | 14 ++++--
 2 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 6d33086936..984192258c 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -36,6 +36,9 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
+from tensorflow.python.training import saver
+from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
@@ -228,16 +231,40 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testScatterMin(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
-    self.evaluate(
-        resource_variable_ops.assign_variable_op(
-            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
-    self.evaluate(
-        resource_variable_ops.resource_scatter_min(
-            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
-    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-    self.assertEqual(self.evaluate(read), [[3]])
+    with ops.device("cpu:0"):
+      handle = resource_variable_ops.var_handle_op(
+          dtype=dtypes.int32, shape=[1, 1])
+      self.evaluate(
+          resource_variable_ops.assign_variable_op(handle,
+                                                   constant_op.constant(
+                                                       [[6]],
+                                                       dtype=dtypes.int32)))
+      self.evaluate(
+          resource_variable_ops.resource_scatter_min(handle, [0],
+                                                     constant_op.constant(
+                                                         [[3]],
+                                                         dtype=dtypes.int32)))
+      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+      self.assertEqual(self.evaluate(read), [[3]])
+
+  def testMetagraph(self):
+    with ops.Graph().as_default():
+      with variable_scope.variable_scope("foo", use_resource=True):
+        a = variable_scope.get_variable("a", initializer=10.0)
+
+      momentum.MomentumOptimizer(
+          learning_rate=0.001, momentum=0.1).minimize(
+              a,
+              colocate_gradients_with_ops=True,
+              global_step=training_util.get_or_create_global_step())
+
+      graph = ops.get_default_graph()
+      meta_graph_def = saver.export_meta_graph(graph=graph)
+
+    with ops.Graph().as_default():
+      saver.import_meta_graph(meta_graph_def, import_scope="")
+      meta_graph_two = saver.export_meta_graph(graph=graph)
+    self.assertEqual(meta_graph_def, meta_graph_two)
 
   @test_util.run_in_graph_and_eager_modes()
   def testScatterMax(self):
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 508ba9bfee..c51d1e467d 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -525,8 +525,15 @@ class ResourceVariable(variables.Variable):
       self._cached_value = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
+      self._graph_element = g.as_graph_element(
+          ops.prepend_name_scope(variable_def.snapshot_name,
+                                 import_scope=import_scope))
     else:
       self._cached_value = None
+      # Legacy case for protos without the snapshot name; assume it's the
+      # following.
+      self._graph_element = g.get_tensor_by_name(
+          self._handle.op.name + "/Read/ReadVariableOp:0")
     if variable_def.HasField("save_slice_info_def"):
       self._save_slice_info = variables.Variable.SaveSliceInfo(
           save_slice_info_def=variable_def.save_slice_info_def,
@@ -535,8 +542,6 @@ class ResourceVariable(variables.Variable):
       self._save_slice_info = None
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
-    self._graph_element = g.get_tensor_by_name(
-        self._handle.op.name + "/Read/ReadVariableOp:0")
     self._constraint = None
     self._cached_shape_as_list = None
 
@@ -745,6 +750,10 @@ class ResourceVariable(variables.Variable):
       if self._cached_value is not None:
         var_def.snapshot_name = ops.strip_name_scope(self._cached_value.name,
                                                      export_scope)
+      else:
+        # Store the graph_element here
+        var_def.snapshot_name = ops.strip_name_scope(self._graph_element.name,
+                                                     export_scope)
       var_def.is_resource = True
       if self._save_slice_info:
         var_def.save_slice_info_def.MergeFrom(
@@ -910,7 +919,6 @@ class ResourceVariable(variables.Variable):
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
     if dtype is not None and dtype != self.dtype:
-      print("trying to switch the dtype to ", dtype, " from ", self.dtype)
       return NotImplemented
     if as_ref:
       return self.read_value().op.inputs[0]
-- 
GitLab


From f029631d65a2209aa3f089cbb980d61ee9d0e7f5 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 11 Apr 2018 15:33:06 -0700
Subject: [PATCH 0621/1262] Increase size of
 //tensorflow/python/kernel_tests:sets_test to "medium".

PiperOrigin-RevId: 192519639
---
 tensorflow/python/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 1827a26902..5738e79b27 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2822,7 +2822,7 @@ sycl_py_test(
 
 tf_py_test(
     name = "sets_test",
-    size = "small",
+    size = "medium",
     srcs = ["sets_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-- 
GitLab


From 9ce7791be6980932c249832dc23d464c1b736cc4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 15:37:49 -0700
Subject: [PATCH 0622/1262] Revealing the range of node ids in the latest layer
 via resource' state

PiperOrigin-RevId: 192520351
---
 ...tedTreesCalculateBestGainsPerFeature.pbtxt |  4 +-
 ...pi_def_BoostedTreesGetEnsembleStates.pbtxt | 12 +++++-
 .../kernels/boosted_trees/boosted_trees.proto |  4 ++
 .../kernels/boosted_trees/resource_ops.cc     | 12 ++++++
 .../core/kernels/boosted_trees/resources.h    | 20 ++++++++++
 .../core/kernels/boosted_trees/stats_ops.cc   |  6 +--
 .../kernels/boosted_trees/training_ops.cc     |  8 ++++
 tensorflow/core/ops/boosted_trees_ops.cc      |  2 +
 .../core/ops/compat/ops_history.v1.pbtxt      |  4 ++
 .../python/estimator/canned/boosted_trees.py  |  9 ++---
 .../estimator/canned/boosted_trees_test.py    | 12 ++++++
 .../boosted_trees/resource_ops_test.py        | 31 +++++++++-----
 .../boosted_trees/stats_ops_test.py           |  8 ++--
 .../boosted_trees/training_ops_test.py        | 40 +++++++++++++++++--
 tensorflow/python/ops/boosted_trees_ops.py    | 15 ++++---
 15 files changed, 150 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index b1921e3507..62876a293c 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "node_id_range"
     description: <<END
-A Rank 1 tensor (shape=[2]) to specify the range [first, last] of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1]+1)` (Note that the last index node_id_range[1] is inclusive).
+A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
 END
   }
   in_arg {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
index ef45a92498..4377125224 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -31,5 +31,13 @@ END
 The number of layers we attempted to build (but not necessarily succeeded).
 END
   }
-  summary: "Retrieves the tree ensemble resource stamp token."
-}
+  out_arg {
+    name: "last_layer_nodes_range"
+    description: <<END
+Rank size 2 tensor that contains start and end ids of the nodes in the latest
+layer.
+END
+
+  }
+  summary: "Retrieves the tree ensemble resource stamp token, number of trees and growing statistics."
+}
\ No newline at end of file
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 106ceedc00..55599de731 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -100,6 +100,10 @@ message GrowingMetadata {
   // Number of layers that we have attempted to build. After pruning, these
   // layers might have been removed.
   int64 num_layers_attempted = 2;
+  // The start (inclusive) and end (exclusive) ids of the nodes in the latest
+  // layer of the latest tree.
+  int32 last_layer_node_start = 3;
+  int32 last_layer_node_end = 4;
 }
 
 // TreeEnsemble describes an ensemble of decision trees.
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
index f49242d856..563f7b8b08 100644
--- a/tensorflow/core/kernels/boosted_trees/resource_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -99,6 +99,7 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     Tensor* output_num_trees_t = nullptr;
     Tensor* output_num_finalized_trees_t = nullptr;
     Tensor* output_num_attempted_layers_t = nullptr;
+    Tensor* output_last_layer_nodes_range_t = nullptr;
 
     OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
                                                      &output_stamp_token_t));
@@ -110,11 +111,22 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(3, TensorShape(),
                                             &output_num_attempted_layers_t));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                4, {2}, &output_last_layer_nodes_range_t));
 
     output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
     output_num_trees_t->scalar<int32>()() = num_trees;
     output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
     output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+
+    int32 range_start;
+    int32 range_end;
+    tree_ensemble_resource->GetLastLayerNodesRange(&range_start, &range_end);
+
+    output_last_layer_nodes_range_t->vec<int32>()(0) = range_start;
+    // For a completely empty ensemble, this will be 0. To make it a valid range
+    // we add this max cond.
+    output_last_layer_nodes_range_t->vec<int32>()(1) = std::max(1, range_end);
   }
 };
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index c82588b950..561ca3a18a 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -93,6 +93,26 @@ class BoostedTreesEnsembleResource : public StampedResource {
         new_num_layers);
   }
 
+  void UpdateLastLayerNodesRange(const int32 node_range_start,
+                                 int32 node_range_end) const {
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+        node_range_start);
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+        node_range_end);
+  }
+
+  void GetLastLayerNodesRange(int32* node_range_start,
+                              int32* node_range_end) const {
+    *node_range_start =
+        tree_ensemble_->growing_metadata().last_layer_node_start();
+    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+  }
+
+  int64 GetNumNodes(const int32 tree_id) {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->trees(tree_id).nodes_size();
+  }
+
   void UpdateGrowingMetadata() const;
 
   int32 GetNumLayersAttempted() {
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 33fdab6a86..16e65cf284 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -42,8 +42,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     const Tensor* node_id_range_t;
     OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
     const auto node_id_range = node_id_range_t->vec<int32>();
-    int32 node_id_first = node_id_range(0);
-    int32 node_id_last = node_id_range(1);  // inclusive.
+    const int32 node_id_first = node_id_range(0);  // inclusive
+    const int32 node_id_last = node_id_range(1);   // exclusive
     // stats_summary_list
     OpInputList stats_summary_list;
     OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
@@ -86,7 +86,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       std::vector<int32> output_thresholds;
       std::vector<float> output_left_node_contribs;
       std::vector<float> output_right_node_contribs;
-      for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) {
+      for (int node_id = node_id_first; node_id < node_id_last; ++node_id) {
         // Calculate gains.
         cum_grad.clear();
         cum_hess.clear();
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index b9ded4054a..67cac14c52 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -101,6 +101,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
             << current_tree << " of ensemble of " << current_tree + 1
             << " trees.";
     bool split_happened = false;
+    int32 node_id_start = ensemble_resource->GetNumNodes(current_tree);
     // Add the splits to the tree.
     for (auto& split_entry : best_splits) {
       const int32 node_id = split_entry.first;
@@ -139,11 +140,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           right_contrib, &left_node_id, &right_node_id);
       split_happened = true;
     }
+    int32 node_id_end = ensemble_resource->GetNumNodes(current_tree);
     if (split_happened) {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
       if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+        // If the tree is finalized, next growing will start from node 0;
+        node_id_start = 0;
+        node_id_end = 1;
         ensemble_resource->SetIsFinalized(current_tree, true);
         if (pruning_mode_ == kPostPruning) {
           ensemble_resource->PostPruneTree(current_tree);
@@ -153,6 +158,9 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
         }
       }
+      // If we managed to split, update the node range. If we didn't, don't
+      // update as we will try to split the same nodes with new instances.
+      ensemble_resource->UpdateLastLayerNodesRange(node_id_start, node_id_end);
     }
   }
 
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 297e94655f..8af4903418 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -128,6 +128,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
     .Output("num_trees: int32")
     .Output("num_finalized_trees: int32")
     .Output("num_attempted_layers: int32")
+    .Output("last_layer_nodes_range: int32")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused_input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
@@ -135,6 +136,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       c->set_output(3, c->Scalar());
+      c->set_output(4, c->Vector(2));
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 12df60a2ae..ba442a0582 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10981,6 +10981,10 @@ op {
     name: "num_attempted_layers"
     type: DT_INT32
   }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
   is_stateful: true
 }
 op {
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index c5d5455b1a..58af59dbb1 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -349,8 +349,8 @@ def _bt_model_fn(
             array_ops.zeros(
                 [batch_size, head.logits_dimension], dtype=dtypes.float32))
       with ops.control_dependencies([ensemble_reload]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = local_tree_ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         last_layer_nodes_range) = local_tree_ensemble.get_states()
         summary.scalar('ensemble/num_trees', num_trees)
         summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
         summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
@@ -393,10 +393,7 @@ def _bt_model_fn(
         (node_ids_per_feature, gains_list, thresholds_list,
          left_node_contribs_list, right_node_contribs_list) = (
              boosted_trees_ops.calculate_best_gains_per_feature(
-                 node_id_range=array_ops.stack([
-                     math_ops.reduce_min(node_ids),
-                     math_ops.reduce_max(node_ids)
-                 ]),
+                 node_id_range=last_layer_nodes_range,
                  stats_summary_list=stats_summary_list,
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 625745a3f9..7823ef8410 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -223,6 +223,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -307,6 +309,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -407,6 +411,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
@@ -444,6 +450,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -528,6 +536,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -628,6 +638,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index a223241e89..d5f0c22d6e 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -36,16 +36,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
       self.assertEqual(0, stamp_token.eval())
-      (_, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (_, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
   def testCreateWithProto(self):
     with self.test_session():
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -141,6 +143,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 6
+          last_layer_node_start: 16
+          last_layer_node_end: 19
         }
       """, ensemble_proto)
       ensemble = boosted_trees_ops.TreeEnsemble(
@@ -148,28 +152,31 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
           stamp_token=7,
           serialized_proto=ensemble_proto.SerializeToString())
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(7, stamp_token.eval())
       self.assertEqual(2, num_trees.eval())
       self.assertEqual(1, num_finalized_trees.eval())
       self.assertEqual(6, num_attempted_layers.eval())
+      self.assertAllEqual([16, 19], nodes_range.eval())
 
   def testSerializeDeserialize(self):
     with self.test_session():
       # Initialize.
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(5, stamp_token.eval())
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -201,6 +208,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 5
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
       """, ensemble_proto)
       with ops.control_dependencies([
@@ -208,13 +217,15 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
               stamp_token=3,
               serialized_proto=ensemble_proto.SerializeToString())
       ]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         nodes_range) = ensemble.get_states()
       self.assertEqual(3, stamp_token.eval())
       self.assertEqual(1, num_trees.eval())
       # This reads from metadata, not really counting the layers.
       self.assertEqual(5, num_attempted_layers.eval())
       self.assertEqual(0, num_finalized_trees.eval())
+      self.assertAllEqual([3, 7], nodes_range.eval())
+
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index a54cc43517..4d09cf94d4 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -29,7 +29,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation without any regularization."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -76,7 +76,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -123,7 +123,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L1."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -173,7 +173,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index 4226ff75c2..d6c0047747 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -132,6 +132,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -314,6 +316,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -461,6 +465,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -615,6 +621,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 5
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -624,7 +632,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test that the metadata is updated even though we can't split."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -655,6 +664,9 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+
         }
       """, tree_ensemble_config)
 
@@ -685,7 +697,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
       # Expect no new splits created, but attempted (global) stats updated. Meta
       # data for this tree should not be updated (we didn't succeed building a
-      # layer.
+      # layer. Node ranges don't change.
       new_stamp, serialized = session.run(tree_ensemble.serialize())
       tree_ensemble = boosted_trees_pb2.TreeEnsemble()
       tree_ensemble.ParseFromString(serialized)
@@ -721,6 +733,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -730,7 +744,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test metadata is updated correctly when no split due to prepruning."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -761,6 +776,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """, tree_ensemble_config)
 
@@ -851,6 +868,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -941,6 +960,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1046,6 +1067,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
        """
       self.assertEqual(new_stamp, 2)
@@ -1179,6 +1202,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 3
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
        """
       self.assertEqual(new_stamp, 3)
@@ -1268,6 +1293,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1307,7 +1334,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       # Expect the ensemble to be empty as post-pruning will prune
       # the entire finalized tree.
       self.assertEqual(new_stamp, 2)
-      self.assertProtoEquals("""
+      self.assertProtoEquals(
+          """
       trees {
         nodes {
           leaf {
@@ -1359,6 +1387,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       growing_metadata {
         num_trees_attempted: 1
         num_layers_attempted: 2
+        last_layer_node_start: 0
+        last_layer_node_end: 1
       }
       """, res_ensemble)
 
@@ -1455,6 +1485,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 174d00987f..2a2bcdd9d6 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -115,7 +115,7 @@ class TreeEnsemble(object):
 
   def get_stamp_token(self):
     """Returns the current stamp token of the resource."""
-    stamp_token, _, _, _ = (
+    stamp_token, _, _, _, _ = (
         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
             self.resource_handle))
     return stamp_token
@@ -124,17 +124,20 @@ class TreeEnsemble(object):
     """Returns states of the tree ensemble.
 
     Returns:
-      stamp_token, num_trees, num_finalized_trees, num_attempted_layers.
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers and
+      range of the nodes in the latest layer.
     """
-    stamp_token, num_trees, num_finalized_trees, num_attempted_layers = (
-        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
-            self.resource_handle))
+    (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+     nodes_range) = (
+         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+             self.resource_handle))
     # Use identity to give names.
     return (array_ops.identity(stamp_token, name='stamp_token'),
             array_ops.identity(num_trees, name='num_trees'),
             array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
             array_ops.identity(
-                num_attempted_layers, name='num_attempted_layers'))
+                num_attempted_layers, name='num_attempted_layers'),
+            array_ops.identity(nodes_range, name='last_layer_nodes_range'))
 
   def serialize(self):
     """Serializes the ensemble into proto and returns the serialized proto.
-- 
GitLab


From acad7022b09b090da0684f209ac8d0feb1c986a2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 15:44:55 -0700
Subject: [PATCH 0623/1262] Adding support of core feature columns and losses
 to gradient boosted trees estimators.

PiperOrigin-RevId: 192521398
---
 .../boosted_trees/estimator_batch/BUILD       |  33 +++++
 .../estimator_batch/custom_export_strategy.py |   5 +-
 .../dnn_tree_combined_estimator.py            |  96 ++----------
 .../estimator_batch/estimator.py              |  19 ++-
 .../estimator_batch/estimator_test.py         | 138 ++++++++++++++++++
 .../estimator_batch/estimator_utils.py        |  71 +++++++++
 .../boosted_trees/estimator_batch/model.py    |  27 +++-
 .../python/training/functions/gbdt_batch.py   |  17 ++-
 .../training/functions/gbdt_batch_test.py     |  45 +++++-
 9 files changed, 346 insertions(+), 105 deletions(-)
 create mode 100644 tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
 create mode 100644 tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 17e20c4b31..0f65881aee 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -51,6 +51,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "estimator_utils",
+    srcs = ["estimator_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/learn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
 py_test(
     name = "trainer_hooks_test",
     size = "small",
@@ -118,6 +130,7 @@ py_library(
     srcs = ["estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":model",
         "//tensorflow/contrib/boosted_trees:losses",
         "//tensorflow/contrib/learn",
@@ -130,6 +143,7 @@ py_library(
     srcs = ["dnn_tree_combined_estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
@@ -159,3 +173,22 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
+
+py_test(
+    name = "estimator_test",
+    size = "medium",
+    srcs = ["estimator_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":estimator",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index d9b0d89a03..62f1f4122b 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -39,7 +39,8 @@ _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE = "%s_%d"
 def make_custom_export_strategy(name,
                                 convert_fn,
                                 feature_columns,
-                                export_input_fn):
+                                export_input_fn,
+                                use_core_columns=False):
   """Makes custom exporter of GTFlow tree format.
 
   Args:
@@ -58,7 +59,7 @@ def make_custom_export_strategy(name,
   input_fn = export_input_fn()
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
-       input_fn.features, feature_columns)
+       input_fn.features, feature_columns, use_core_columns)
 
   def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None):
     """A wrapper to export to SavedModel, and convert it to other formats."""
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 2e7b8cba05..449c130b2d 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -19,25 +19,19 @@ logits of the DNN. The input layer of the DNN (including the embeddings learned
 over sparse features) can optionally be provided to the boosted trees as
 an additional input feature.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import six
 from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.layers.python.layers import optimizers
-from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
-from tensorflow.contrib.learn.python.learn.estimators import model_fn
-from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn_lib
-from tensorflow.contrib.learn.python.learn.estimators import prediction_key
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output
 from tensorflow.python.feature_column import feature_column as feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
@@ -48,56 +42,8 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
 
-
 _DNN_LEARNING_RATE = 0.001
 
-_CORE_MODE_TO_CONTRIB_MODE_ = {
-    model_fn_lib.ModeKeys.TRAIN: contrib_model_fn_lib.ModeKeys.TRAIN,
-    model_fn_lib.ModeKeys.EVAL: contrib_model_fn_lib.ModeKeys.EVAL,
-    model_fn_lib.ModeKeys.PREDICT: contrib_model_fn_lib.ModeKeys.INFER
-}
-
-
-def _core_mode_to_contrib_mode(mode):
-  return _CORE_MODE_TO_CONTRIB_MODE_[mode]
-
-
-def _export_outputs_to_output_alternatives(export_outputs):
-  """Converts EstimatorSpec.export_outputs to output_alternatives.
-
-  Args:
-    export_outputs: export_outputs created by create_estimator_spec.
-  Returns:
-    converted output_alternatives.
-  """
-  output = dict()
-  if export_outputs is not None:
-    for key, value in export_outputs.items():
-      if isinstance(value, export_output.ClassificationOutput):
-        exported_predictions = {
-            prediction_key.PredictionKey.SCORES: value.scores,
-            prediction_key.PredictionKey.CLASSES: value.classes
-        }
-        output[key] = (constants.ProblemType.CLASSIFICATION,
-                       exported_predictions)
-    return output
-  return None
-
-
-def _estimator_spec_to_model_fn_ops(estimator_spec, is_regression):
-  alternatives = []
-  if not is_regression:
-    _export_outputs_to_output_alternatives(estimator_spec.export_outputs)
-
-  return model_fn.ModelFnOps(
-      mode=_core_mode_to_contrib_mode(estimator_spec.mode),
-      predictions=estimator_spec.predictions,
-      loss=estimator_spec.loss,
-      train_op=estimator_spec.train_op,
-      eval_metric_ops=estimator_spec.eval_metric_ops,
-      output_alternatives=alternatives)
-
-
 def _get_optimizer(optimizer):
   if callable(optimizer):
     return optimizer()
@@ -128,8 +74,7 @@ def _dnn_tree_combined_model_fn(features,
                                 dnn_steps_to_train=10000,
                                 tree_feature_columns=None,
                                 tree_center_bias=False,
-                                use_core_versions=False,
-                                is_regression=False):
+                                use_core_versions=False):
   """DNN and GBDT combined model_fn.
 
   Args:
@@ -169,7 +114,6 @@ def _dnn_tree_combined_model_fn(features,
       first fitting the bias.
     use_core_versions: Whether feature columns and loss are from the core (as
       opposed to contrib) version of tensorflow.
-    is_regression: Whether the problem is regression or not.
 
   Returns:
     A `ModelFnOps` object.
@@ -305,8 +249,8 @@ def _dnn_tree_combined_model_fn(features,
         labels=labels,
         train_op_fn=_dnn_train_op_fn,
         logits=dnn_logits)
-    dnn_train_op = _estimator_spec_to_model_fn_ops(dnn_train_op,
-                                                   is_regression).train_op
+    dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+        dnn_train_op).train_op
 
     tree_train_op = head.create_estimator_spec(
         features=tree_features,
@@ -314,10 +258,10 @@ def _dnn_tree_combined_model_fn(features,
         labels=labels,
         train_op_fn=_tree_train_op_fn,
         logits=tree_train_logits)
-    tree_train_op = _estimator_spec_to_model_fn_ops(tree_train_op,
-                                                    is_regression).train_op
+    tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+        tree_train_op).train_op
 
-    model_fn_ops = _estimator_spec_to_model_fn_ops(model_fn_ops, is_regression)
+    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
   else:
     model_fn_ops = head.create_model_fn_ops(
         features=features,
@@ -529,26 +473,12 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
 
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
-          features,
-          labels,
-          mode,
-          head,
-          dnn_hidden_units,
-          dnn_feature_columns,
-          tree_learner_config,
-          num_trees,
-          tree_examples_per_layer,
-          config,
-          dnn_optimizer,
-          dnn_activation_fn,
-          dnn_dropout,
-          dnn_input_layer_partitioner,
-          dnn_input_layer_to_tree,
-          dnn_steps_to_train,
-          tree_feature_columns,
-          tree_center_bias,
-          use_core_versions,
-          is_regression=True)
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedRegressor, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 70454aa6db..89d0d611d2 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -40,7 +40,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
                label_keys=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeClassifier estimator instance.
 
     Args:
@@ -63,7 +64,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
-
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     Raises:
       ValueError: If learner_config is not valid.
     """
@@ -99,6 +101,7 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'center_bias': center_bias,
             'logits_modifier_function': logits_modifier_function,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
@@ -120,7 +123,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
                config=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeRegressor estimator instance.
 
     Args:
@@ -145,6 +149,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.regression_head(
         label_name=label_name,
@@ -166,6 +172,7 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
@@ -189,7 +196,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
                config=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeEstimator estimator instance.
 
     Args:
@@ -210,6 +218,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     super(GradientBoostedDecisionTreeEstimator, self).__init__(
         model_fn=model.model_builder,
@@ -222,6 +232,7 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
new file mode 100644
index 0000000000..0d58317bd5
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for GBDT estimator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+
+
+def _train_input_fn():
+  features = {"x": constant_op.constant([[2.], [1.], [1.]])}
+  label = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+  return features, label
+
+
+def _eval_input_fn():
+  features = {"x": constant_op.constant([[1.], [2.], [2.]])}
+  label = constant_op.constant([[0], [1], [1]], dtype=dtypes.int32)
+  return features, label
+
+
+class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._export_dir_base = tempfile.mkdtemp() + "export/"
+    gfile.MkDir(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowException(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[contrib_feature_column.real_valued_column("x")])
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+    classifier.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForEstimator(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    # Use core head
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    model = estimator.GradientBoostedDecisionTreeEstimator(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    model.fit(input_fn=_train_input_fn, steps=15)
+    model.evaluate(input_fn=_eval_input_fn, steps=1)
+    model.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForClassifier(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+    classifier.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForRegressor(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    regressor = estimator.GradientBoostedDecisionTreeRegressor(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    regressor.fit(input_fn=_train_input_fn, steps=15)
+    regressor.evaluate(input_fn=_eval_input_fn, steps=1)
+    regressor.export(self._export_dir_base)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
new file mode 100644
index 0000000000..c9cf4ae25a
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for converting between core and contrib feature columns."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
+
+_CORE_MODE_TO_CONTRIB_MODE_ = {
+    model_fn_lib.ModeKeys.TRAIN: contrib_model_fn_lib.ModeKeys.TRAIN,
+    model_fn_lib.ModeKeys.EVAL: contrib_model_fn_lib.ModeKeys.EVAL,
+    model_fn_lib.ModeKeys.PREDICT: contrib_model_fn_lib.ModeKeys.INFER
+}
+
+
+def _core_mode_to_contrib_mode(mode):
+  return _CORE_MODE_TO_CONTRIB_MODE_[mode]
+
+
+def _export_outputs_to_output_alternatives(export_outputs):
+  """Converts EstimatorSpec.export_outputs to output_alternatives.
+
+  Args:
+    export_outputs: export_outputs created by create_estimator_spec.
+  Returns:
+    converted output_alternatives.
+  """
+  output = dict()
+  if export_outputs is not None:
+    for key, value in export_outputs.items():
+      if isinstance(value, export_output.ClassificationOutput):
+        exported_predictions = {
+            prediction_key.PredictionKey.SCORES: value.scores,
+            prediction_key.PredictionKey.CLASSES: value.classes
+        }
+        output[key] = (constants.ProblemType.CLASSIFICATION,
+                       exported_predictions)
+    return output
+  return None
+
+
+def estimator_spec_to_model_fn_ops(estimator_spec):
+  alternatives = _export_outputs_to_output_alternatives(
+      estimator_spec.export_outputs)
+
+  return model_fn.ModelFnOps(
+      mode=_core_mode_to_contrib_mode(estimator_spec.mode),
+      predictions=estimator_spec.predictions,
+      loss=estimator_spec.loss,
+      train_op=estimator_spec.train_op,
+      eval_metric_ops=estimator_spec.eval_metric_ops,
+      output_alternatives=alternatives)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index c6455a7ea3..15ab6d8145 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import copy
 
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
@@ -60,6 +61,7 @@ def model_builder(features, labels, mode, params, config):
   feature_columns = params["feature_columns"]
   weight_column_name = params["weight_column_name"]
   num_trees = params["num_trees"]
+  use_core_libs = params["use_core_libs"]
   logits_modifier_function = params["logits_modifier_function"]
   if features is None:
     raise ValueError("At least one feature must be specified.")
@@ -93,7 +95,8 @@ def model_builder(features, labels, mode, params, config):
       learner_config=learner_config,
       feature_columns=feature_columns,
       logits_dimension=head.logits_dimension,
-      features=training_features)
+      features=training_features,
+      use_core_columns=use_core_libs)
   with ops.name_scope("gbdt", "gbdt_optimizer"):
     predictions_dict = gbdt_model.predict(mode)
     logits = predictions_dict["predictions"]
@@ -108,12 +111,22 @@ def model_builder(features, labels, mode, params, config):
         update_op = state_ops.assign_add(global_step, 1).op
         return update_op
 
-  model_fn_ops = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_train_op_fn,
-      logits=logits)
+  create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
+  if use_core_libs and callable(create_estimator_spec_op):
+    model_fn_ops = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
+  else:
+    model_fn_ops = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
   if num_trees:
     if center_bias:
       num_trees += 1
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 85b909e4f2..4bde7f3e33 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -23,7 +23,6 @@ import copy
 
 from tensorflow.contrib import learn
 from tensorflow.contrib import stateless
-
 from tensorflow.contrib.boosted_trees.lib.learner.batch import categorical_split_handler
 from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
@@ -141,7 +140,7 @@ class _OpRoundRobinStrategy(object):
     return task
 
 
-def extract_features(features, feature_columns):
+def extract_features(features, feature_columns, use_core_columns):
   """Extracts columns from a dictionary of features.
 
   Args:
@@ -174,7 +173,11 @@ def extract_features(features, feature_columns):
       transformed_features = collections.OrderedDict()
       for fc in feature_columns:
         # pylint: disable=protected-access
-        if isinstance(fc, feature_column_lib._EmbeddingColumn):
+        if use_core_columns:
+          # pylint: disable=protected-access
+          tensor = fc_core._transform_features(features, [fc])[fc]
+          transformed_features[fc.name] = tensor
+        elif isinstance(fc, feature_column_lib._EmbeddingColumn):
           # pylint: enable=protected-access
           transformed_features[fc.name] = fc_core.input_layer(
               features, [fc],
@@ -265,7 +268,8 @@ class GradientBoostedDecisionTreeModel(object):
                learner_config,
                features,
                logits_dimension,
-               feature_columns=None):
+               feature_columns=None,
+               use_core_columns=False):
     """Construct a new GradientBoostedDecisionTreeModel function.
 
     Args:
@@ -338,8 +342,9 @@ class GradientBoostedDecisionTreeModel(object):
     if not features:
       raise ValueError("Features dictionary must be specified.")
     (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
-     sparse_float_shapes, sparse_int_indices, sparse_int_values,
-     sparse_int_shapes) = extract_features(features, self._feature_columns)
+     sparse_float_shapes, sparse_int_indices,
+     sparse_int_values, sparse_int_shapes) = extract_features(
+         features, self._feature_columns, use_core_columns)
     logging.info("Active Feature Columns: " + str(fc_names))
     self._fc_names = fc_names
     self._dense_floats = dense_floats
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 6411f57a54..17dcb49f47 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -27,9 +27,11 @@ from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.boosted_trees.python.utils import losses
 
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -99,7 +101,8 @@ class GbdtTest(test_util.TensorFlowTestCase):
           array_ops.zeros([2], dtypes.int64))
       (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
        sparse_float_shapes, sparse_int_indices, sparse_int_values,
-       sparse_int_shapes) = (gbdt_batch.extract_features(features, None))
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(features, None, use_core_columns=False))
       self.assertEqual(len(fc_names), 3)
       self.assertAllEqual(fc_names,
                           ["dense_float", "sparse_float", "sparse_int"])
@@ -148,8 +151,9 @@ class GbdtTest(test_util.TensorFlowTestCase):
               "sparse_categorical", hash_bucket_size=1000000))
       (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
        sparse_float_shapes, sparse_int_indices, sparse_int_values,
-       sparse_int_shapes) = (gbdt_batch.extract_features(
-           features, feature_columns))
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(
+               features, feature_columns, use_core_columns=False))
       self.assertEqual(len(fc_names), 3)
       self.assertAllEqual(fc_names,
                           ["dense_float", "sparse_float", "sparse_categorical"])
@@ -174,6 +178,41 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sparse_int_shapes[0].eval(),
                           features["sparse_categorical"].dense_shape.eval())
 
+  def testExtractFeaturesFromCoreFeatureColumns(self):
+    """Tests feature extraction when using core columns."""
+    with self.test_session():
+      features = {}
+      # Sparse float column does not exist in core, so only dense numeric and
+      # categorical.
+      features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32)
+      features["sparse_categorical"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros([2], dtypes.string), array_ops.zeros([2],
+                                                               dtypes.int64))
+
+      feature_columns = set()
+      feature_columns.add(core_feature_column.numeric_column("dense_float"))
+      feature_columns.add(
+          core_feature_column.categorical_column_with_hash_bucket(
+              "sparse_categorical", hash_bucket_size=1000000))
+      (fc_names, dense_floats, _, _, _, sparse_int_indices, sparse_int_values,
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(
+               features, feature_columns, use_core_columns=True))
+      self.assertEqual(len(fc_names), 2)
+      self.assertAllEqual(fc_names, ["dense_float", "sparse_categorical"])
+      self.assertEqual(len(dense_floats), 1)
+      self.assertEqual(len(sparse_int_indices), 1)
+      self.assertEqual(len(sparse_int_values), 1)
+      self.assertEqual(len(sparse_int_shapes), 1)
+      self.assertAllEqual(dense_floats[0].eval(),
+                          features["dense_float"].eval())
+      self.assertAllEqual(sparse_int_indices[0].eval(),
+                          features["sparse_categorical"].indices.eval())
+      self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
+      self.assertAllEqual(sparse_int_shapes[0].eval(),
+                          features["sparse_categorical"].dense_shape.eval())
+
   def testTrainFnChiefNoBiasCentering(self):
     """Tests the train function running on chief without bias centering."""
     with self.test_session() as sess:
-- 
GitLab


From d6e2513d60999bf0cf315c42a14c0e45eb49cda2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 15:59:47 -0700
Subject: [PATCH 0624/1262] support profiling multiple tpu through one grpc and
 one session. data are saved with host prefix.

PiperOrigin-RevId: 192523668
---
 tensorflow/contrib/tpu/profiler/BUILD         |  1 +
 .../tpu/profiler/capture_tpu_profile.cc       | 53 ++++++++++++++++---
 .../contrib/tpu/profiler/dump_tpu_profile.cc  |  3 +-
 .../contrib/tpu/profiler/tpu_profiler.proto   |  7 ++-
 .../tpu/profiler/tpu_profiler_analysis.proto  |  6 ++-
 5 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 1c32993e8e..dbf1ab6bbf 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -46,6 +46,7 @@ tf_cc_binary(
     visibility = ["//visibility:public"],
     deps = [
         ":dump_tpu_profile",
+        ":tpu_profiler_analysis_proto_cc",
         ":tpu_profiler_proto_cc",
         ":version",
         "//tensorflow/core:framework_internal",
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 6b198dbc16..a535884263 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
 #include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -40,6 +41,7 @@ namespace tensorflow {
 namespace tpu {
 namespace {
 
+using ::tensorflow::grpc::TPUProfileAnalysis;
 using ::tensorflow::TPUProfiler;
 
 constexpr uint64 kMaxEvents = 1000000;
@@ -64,11 +66,10 @@ Status ValidateHostPortPair(const string& host_port) {
   return Status::OK();
 }
 
-// Returns whether the returned trace is empty.
-// Failure are handled by CHECK, i.e. abort()
-bool Profile(const string& service_addr, const string& logdir, int duration_ms,
-             const string& repository_root, const string& session_id,
-             const ProfileOptions& opts) {
+ProfileRequest PopulateProfileRequest(int duration_ms,
+                                      const string& repository_root,
+                                      const string& session_id,
+                                      const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
@@ -83,6 +84,17 @@ bool Profile(const string& service_addr, const string& logdir, int duration_ms,
   *request.mutable_opts() = opts;
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
+  return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+             const string& repository_root, const string& session_id,
+             const ProfileOptions& opts) {
+  ProfileRequest request =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
   // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
@@ -120,7 +132,36 @@ bool NewSession(const string& service_addr,
                 const std::vector<tensorflow::string>& hostnames,
                 int duration_ms, const string& repository_root,
                 const string& session_id, const ProfileOptions& opts) {
-  return true;
+  NewProfileSessionRequest new_session_request;
+  *new_session_request.mutable_request() =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+  new_session_request.set_repository_root(repository_root);
+  new_session_request.set_session_id(session_id);
+  std::copy(
+      hostnames.begin(), hostnames.end(),
+      proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts()));
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  // TODO(jiesun): GRPC support following relevant naming scheme:
+  // 1. dns:///host:port
+  // 2. ipv4:host:port or ipv6:[host]:port
+  // We might need to change the prefix which depends on what TPU name resolver
+  // will give us.
+  std::unique_ptr<TPUProfileAnalysis::Stub> stub =
+      TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  NewProfileSessionResponse new_session_response;
+  TF_QCHECK_OK(FromGrpcStatus(
+      stub->NewSession(&context, new_session_request, &new_session_response)));
+
+  std::cout << "Profile session succeed for hosts:"
+            << str_util::Join(hostnames, ",");
+  return new_session_response.empty_trace();
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ae508583f8..b53f9be2e2 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -64,7 +64,8 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
 
 Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
                                const string& encoded_trace, std::ostream* os) {
-  string proto_path = JoinPath(run_dir, kProtoTraceFileName);
+  string proto_path =
+      JoinPath(run_dir, StrCat(host_prefix, kProtoTraceFileName));
   TF_RETURN_IF_ERROR(
       WriteStringToFile(Env::Default(), proto_path, encoded_trace));
   LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index 8505c4bc69..7be694e866 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -96,5 +96,10 @@ message ProfileResponse {
 
   // Data payload for each required tools.
   repeated ProfileToolData tool_data = 6;
-  // next-field: 7
+
+  // When we write profiling data directly to repository directory, we need a
+  // way to figure out whether the captured trace is empty (due to idle TPU).
+  bool empty_trace = 7;
+
+  // next-field: 8
 }
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
index a4fc8d4e87..8b0bbde98e 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
@@ -7,13 +7,15 @@ message NewProfileSessionRequest {
   ProfileRequest request = 1;
   string repository_root = 2;
   repeated string hosts = 3;
+  string session_id = 4;
 }
 
 message NewProfileSessionResponse {
   // Auxiliary error_message.
   string error_message = 1;
-  // If success, return session identifier for future reference.
-  string session_id = 2;
+
+  // Whether all hosts had returned a empty trace.
+  bool empty_trace = 2;
 }
 
 message EnumProfileSessionsAndToolsRequest {
-- 
GitLab


From e7cfede7bb75f22de890f6e94851121c949d8ba9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 16:05:42 -0700
Subject: [PATCH 0625/1262] Speed up computing mean confidence intervals by
 avoiding tf.while_loop.

Implement a vectorized way to compute the same thing instead.

PiperOrigin-RevId: 192524667
---
 .../kernel_tests/statistical_testing_test.py  | 23 +++++++++
 .../python/ops/statistical_testing.py         | 48 +++++++++----------
 2 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
index c4fb669ebb..ce6cf702d5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import statistical_testing as st
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
@@ -215,6 +216,28 @@ class StatisticalTestingTest(test.TestCase):
           samples, [[0., 1.]], [[1., 2.]], error_rate=0.5)
       _ = sess.run(op)
 
+  def test_do_maximum_mean(self):
+    n = 117
+    envelope = 0.02  # > 2 / n, but < 3 / n
+    rng = np.random.RandomState(seed=8)
+    samples = rng.uniform(size=n).astype(np.float32)
+
+    # Compute the answer in TF using the code under test
+    with self.test_session() as sess:
+      envelope_t = ops.convert_to_tensor(envelope)
+      max_mean = st._do_maximum_mean(samples, envelope_t, 1)
+      max_mean = sess.run(max_mean)
+
+    # Compute the correct answer for this case in numpy.  In this
+    # example, `n` and `envelope` are such that `samples[2]` is the
+    # element that should be taken partially, regardless of the
+    # content of the `samples` array (see algorithm description in
+    # `../ops/statistical_testing.py`).
+    samples = sorted(samples)
+    weight = 1. / n - (envelope - 2. / n)
+    answer = samples[2] * weight + sum(samples[3:]) / n + envelope * 1.
+    self.assertAllClose(max_mean, answer, rtol=1e-9)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
index 9b9fff0afa..9c69435fac 100644
--- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -130,7 +130,7 @@ import itertools
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -169,31 +169,27 @@ def _do_maximum_mean(samples, envelope, high, name=None):
     samples = array_ops.transpose(samples, perm)
 
     samples = _batch_sort_vector(samples)
-    batch_shape = array_ops.shape(samples)[:-1]
-    n = array_ops.shape(samples)[-1]
-    step = 1. / math_ops.cast(n, dtype=samples.dtype.base_dtype)
-
-    def _loop_body(iter_, total, to_skip):
-      total = array_ops.where(
-          step <= to_skip,
-          total,
-          array_ops.where(
-              to_skip > 0.,
-              total + (step - to_skip) * samples[..., iter_],
-              total + step * samples[..., iter_]))
-      to_skip = array_ops.where(step <= to_skip, to_skip - step, 0.)
-      return [iter_ + 1, total, to_skip]
-
-    _, total, _ = control_flow_ops.while_loop(
-        cond=lambda iter_, *args: iter_ < n,
-        body=_loop_body,
-        loop_vars=[
-            0,
-            array_ops.zeros(batch_shape, dtype=samples.dtype.base_dtype),
-            envelope,  # to_skip
-        ])
-
-  return total + envelope * high
+
+    # The maximum mean is given by taking `envelope`-worth of
+    # probability from the smallest samples and moving it to the
+    # maximum value.  This amounts to:
+    # - ignoring the smallest k samples, where `k/n < envelope`
+    # - taking a `1/n - (envelope - k/n)` part of the index k sample
+    # - taking all the other samples
+    # - and adding `envelope * high` at the end.
+    # The following is a vectorized and batched way of computing this.
+    # `max_mean_contrib` is a mask implementing the previous.
+    batch_size = array_ops.shape(samples)[-1]
+    batch_size = math_ops.cast(batch_size, dtype=samples.dtype.base_dtype)
+    step = 1. / batch_size
+    cum_steps = step * math_ops.range(
+        1, batch_size + 1, dtype=samples.dtype.base_dtype)
+    max_mean_contrib = clip_ops.clip_by_value(
+        cum_steps - envelope[..., array_ops.newaxis],
+        clip_value_min=0.,
+        clip_value_max=step)
+    return math_ops.reduce_sum(
+        samples * max_mean_contrib, axis=-1) + envelope * high
 
 
 def _maximum_mean(samples, envelope, high, name=None):
-- 
GitLab


From 88fcde66561a8c7a869a4dc57003a30376c4b548 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 11 Apr 2018 16:23:10 -0700
Subject: [PATCH 0626/1262] Remove reference cycle checks from unit tests which
 touch uuid.uuid4()

Should fix the release builds. They're failing because uuid4() creates reference
cycles in Python 2.7.9 (2.7.11+ are fine).
---
 .../contrib/eager/python/checkpointable_utils_test.py     | 8 ++++----
 .../contrib/optimizer_v2/checkpointable_utils_test.py     | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index e6498ddb06..1dd0f21a07 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -116,7 +116,7 @@ class OnlyOneDep(checkpointable.Checkpointable):
 
 class SplitTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testSaveRestoreSplitDep(self):
     save_checkpoint = checkpointable_utils.Checkpoint(
         dep=SaveTensorSlicesAsDeps())
@@ -390,7 +390,7 @@ class CheckpointingTests(test.TestCase):
             optimizer_node.slot_variables[0]
             .slot_variable_node_id].attributes[0].checkpoint_key)
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testMoreComplexSaveableReturned(self):
     v = _OwnsMirroredVariables()
     checkpoint = checkpointable_utils.Checkpoint(v=v)
@@ -976,7 +976,7 @@ class CheckpointingTests(test.TestCase):
         saver.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testCheckpointCleanup(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -996,7 +996,7 @@ class CheckpointingTests(test.TestCase):
         expected_filenames,
         os.listdir(checkpoint_directory))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testCheckpointCleanupChangingVarList(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 08f9699e85..d219795aa1 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -411,7 +411,7 @@ class CheckpointingTests(test.TestCase):
         optimizer.apply_gradients(
             [(g, v) for g, v in zip(grad, model.vars)])
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-- 
GitLab


From 2b94b444d53cfa6875f7874197cbc584a06d7a30 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 11 Apr 2018 16:43:33 -0700
Subject: [PATCH 0627/1262] Move callback into bound function to avoid copying.

PiperOrigin-RevId: 192530231
---
 .../core/common_runtime/rendezvous_mgr.cc     | 51 +++++++++++--------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 60263d1471..93f24a3217 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -121,27 +121,36 @@ void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
   // Recv the tensor from local_.
   local_->RecvAsync(
       parsed, recv_args,
-      [this, parsed, done](
-          const Status& status, const Rendezvous::Args& send_args,
-          const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) {
-        // If "in" is an uninitialized tensor, do copy-construction to preserve
-        // the uninitialized state, along with data type and shape info, which
-        // is useful for debugger purposes.
-        Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
-
-        StatusCallback final_callback = [done, send_args, recv_args, out,
-                                         is_dead](const Status& s) {
-          done(s, send_args, recv_args, *out, is_dead);
-          delete out;
-        };
-
-        if (status.ok() && in.IsInitialized()) {
-          SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
-                             std::move(final_callback));
-        } else {
-          final_callback(status);
-        }
-      });
+      std::bind(
+          [this, parsed](DoneCallback done,
+                         // Begin unbound arguments.
+                         const Status& status,
+                         const Rendezvous::Args& send_args,
+                         const Rendezvous::Args& recv_args, const Tensor& in,
+                         bool is_dead) {
+            // If "in" is an uninitialized tensor, do copy-construction to
+            // preserve the uninitialized state, along with data type and shape
+            // info, which is useful for debugger purposes.
+            Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
+
+            auto final_callback = std::bind(
+                [send_args, recv_args, out, is_dead](DoneCallback done,
+                                                     // Begin unbound arguments.
+                                                     const Status& s) {
+                  done(s, send_args, recv_args, *out, is_dead);
+                  delete out;
+                },
+                std::move(done), std::placeholders::_1);
+
+            if (status.ok() && in.IsInitialized()) {
+              SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
+                                 std::move(final_callback));
+            } else {
+              final_callback(status);
+            }
+          },
+          std::move(done), std::placeholders::_1, std::placeholders::_2,
+          std::placeholders::_3, std::placeholders::_4, std::placeholders::_5));
 }
 
 void IntraProcessRendezvous::StartAbort(const Status& s) {
-- 
GitLab


From 3734bb6ca9f5df8dbbf4bceb80b28d69452bdc61 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Wed, 11 Apr 2018 16:59:45 -0700
Subject: [PATCH 0628/1262] boosted_trees: make sure ensemble deserialization
 happens for the non-TRAIN modes too.

PiperOrigin-RevId: 192532297
---
 .../python/estimator/canned/boosted_trees.py  | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 58af59dbb1..0ecc8c7089 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -317,27 +317,28 @@ def _bt_model_fn(
                                                    head.logits_dimension)
 
     # Create Ensemble resources.
-    if is_single_machine:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      local_tree_ensemble = tree_ensemble
-      ensemble_reload = control_flow_ops.no_op()
-    else:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      with ops.device(worker_device):
-        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-            name=name + '_local', is_local=True)
-      # TODO(soroush): Do partial updates if this becomes a bottleneck.
-      ensemble_reload = local_tree_ensemble.deserialize(
-          *tree_ensemble.serialize())
-
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
     # Create logits.
     if mode != model_fn.ModeKeys.TRAIN:
       logits = boosted_trees_ops.predict(
-          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
           logits_dimension=head.logits_dimension,
           max_depth=tree_hparams.max_depth)
     else:
+      if is_single_machine:
+        local_tree_ensemble = tree_ensemble
+        ensemble_reload = control_flow_ops.no_op()
+      else:
+        # Have a local copy of ensemble for the distributed setting.
+        with ops.device(worker_device):
+          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+              name=name + '_local', is_local=True)
+        # TODO(soroush): Do partial updates if this becomes a bottleneck.
+        ensemble_reload = local_tree_ensemble.deserialize(
+            *tree_ensemble.serialize())
       if cache:
         cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
       else:
-- 
GitLab


From 81a9ceaf7290b2260f636609a83b01b9ab2224d7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 17:19:20 -0700
Subject: [PATCH 0629/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192534931
---
 tensorflow/core/ops/ops.pbtxt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 6af77be148..43fd09fb72 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4135,6 +4135,10 @@ op {
     name: "num_attempted_layers"
     type: DT_INT32
   }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
   is_stateful: true
 }
 op {
-- 
GitLab


From d62a5a11e99b391f2e61e80c4f0a80def6ff6508 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 17:29:32 -0700
Subject: [PATCH 0630/1262] Automated g4 rollback of changelist 192516190

PiperOrigin-RevId: 192536085
---
 tensorflow/core/grappler/op_types.cc          |  8 +-
 tensorflow/core/grappler/op_types.h           |  1 -
 .../grappler/optimizers/constant_folding.cc   | 95 ++-----------------
 .../optimizers/constant_folding_test.cc       | 80 +---------------
 4 files changed, 16 insertions(+), 168 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index cfe1329dbf..9c45aed62f 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -249,10 +249,6 @@ bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
 
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
-bool IsRandomShuffle(const NodeDef& node) {
-  return node.op() == "RandomShuffle";
-}
-
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
 
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
@@ -302,7 +298,9 @@ bool IsShape(const NodeDef& node) { return node.op() == "Shape"; }
 
 bool IsShapeN(const NodeDef& node) { return node.op() == "ShapeN"; }
 
-bool IsShuffle(const NodeDef& node) { return node.op() == "Shuffle"; }
+bool IsShuffle(const NodeDef& node) {
+  return node.op() == "Shuffle" || node.op() == "RandomShuffle";
+}
 
 bool IsSigmoidGrad(const NodeDef& node) { return node.op() == "SigmoidGrad"; }
 
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 0573b02604..79fd05e187 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -98,7 +98,6 @@ bool IsPolygamma(const NodeDef& node);
 bool IsPrint(const NodeDef& node);
 bool IsProd(const NodeDef& node);
 bool IsPow(const NodeDef& node);
-bool IsRandomShuffle(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
 bool IsRelu6Grad(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 17d8b7421c..b2a1ce6ab6 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1574,99 +1574,24 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
       continue;
     }
 
-    // Remove Shuffle or Transpose op over dimensions of size 1.
-    if (use_shape_info && (IsShuffle(*node) || IsTranspose(*node)) &&
-        !properties->GetInputProperties(node->name()).empty()) {
-      const auto& shape =
-          properties->GetInputProperties(node->name())[0].shape();
-      if (shape.unknown_rank()) {
-        // Not optimizable.
-        continue;
-      }
-      const auto& p = properties->GetInputProperties(node->name())[1];
-      if (TensorShape::IsValid(p.shape()) && p.has_value()) {
-        Tensor perm(p.dtype(), p.shape());
-        if (!perm.FromProto(p.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         p.value().DebugString());
-        }
-        std::vector<int> permutation;
-        for (int j = 0; j < perm.NumElements(); ++j) {
-          if (perm.dtype() == DT_INT64) {
-            permutation.push_back(perm.vec<int64>()(j));
-          } else {
-            permutation.push_back(perm.vec<int>()(j));
-          }
-        }
-        if (permutation.size() != shape.dim_size()) {
-          // Number of elements in perm should be same as dim_size. Skip if not.
-          continue;
-        }
-        // The node is replaceable iff
-        // dim_size == 0 || all dims have size 1 ||
-        // all dims with > 1 size are not permuted.
-        bool replaceable = true;
-        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
-          replaceable &= shape.dim(j).size() == 1 || j == permutation[j];
-        }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, node, optimized_graph);
-          continue;
-        }
-      }
-    }
-
-    // Remove RandomShuffle op if it is scalar or first dimension is of size 1.
-    if (use_shape_info && IsRandomShuffle(*node) &&
-        !properties->GetInputProperties(node->name()).empty()) {
+    // Remove Shuffle or Reverse op over scalar values.
+    if (use_shape_info &&
+        !properties->GetInputProperties(node->name()).empty() &&
+        (IsShuffle(*node) || IsReverse(*node) || IsTranspose(*node))) {
       const auto& shape =
           properties->GetInputProperties(node->name())[0].shape();
       // The node is replaceable iff
-      // unknown_rank == false && (dim_size == 0 || first dim is of size 1)
-      if (!shape.unknown_rank() &&
-          (shape.dim_size() == 0 || shape.dim(0).size() == 1)) {
+      // unknown_rank == false && (dim_size == 0 || all dims have size 1)
+      bool replaceable = !shape.unknown_rank();
+      for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
+        replaceable &= shape.dim(j).size() == 1;
+      }
+      if (replaceable) {
         ReplaceOperationWithIdentity(0, node, optimized_graph);
         continue;
       }
     }
 
-    // Remove Reverse op over dimensions with size 1.
-    if (use_shape_info && IsReverse(*node) &&
-        !properties->GetInputProperties(node->name()).empty()) {
-      const auto& shape =
-          properties->GetInputProperties(node->name())[0].shape();
-      const auto& a = properties->GetInputProperties(node->name())[1];
-      if (TensorShape::IsValid(a.shape()) && a.has_value()) {
-        Tensor axis(a.dtype(), a.shape());
-        if (!axis.FromProto(a.value())) {
-          return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                         a.value().DebugString());
-        }
-        std::set<int> target_axes;
-        for (int j = 0; j < axis.NumElements(); ++j) {
-          if (axis.dtype() == DT_INT64) {
-            target_axes.insert(axis.vec<int64>()(j));
-          } else {
-            target_axes.insert(axis.vec<int>()(j));
-          }
-        }
-
-        // The node is replaceable iff
-        // unknown_rank == false &&
-        // (dim_size == 0 || all dims have size 1 ||
-        //  all dims with > 1 size are not in target_axes)
-        bool replaceable = !shape.unknown_rank();
-        for (int j = 0; replaceable && j < shape.dim_size(); ++j) {
-          replaceable &= shape.dim(j).size() == 1 ||
-                         target_axes.find(j) == target_axes.end();
-        }
-        if (replaceable) {
-          ReplaceOperationWithIdentity(0, node, optimized_graph);
-          continue;
-        }
-      }
-    }
-
     if (use_shape_info && IsSlice(*node) &&
         properties->GetInputProperties(node->name()).size() == 3) {
       const auto& input = properties->GetInputProperties(node->name())[0];
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 7453fb6731..31abe43846 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1389,6 +1389,8 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   ops::SplitV s1(scope.WithOpName("s1"), in1, size_splits1, split_dim, 1);
   ops::SplitV s2(scope.WithOpName("s2"), in2, size_splits2, split_dim, 2);
 
+  LOG(INFO) << s1.output.size();
+  LOG(INFO) << s2.output.size();
   ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
 
   GrapplerItem item;
@@ -1416,45 +1418,7 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   CompareGraphs(want, got);
 }
 
-TEST_F(ConstantFoldingTest, TransposeOnSize1DimsRemoval) {
-  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-
-  Output in1 = ops::Variable(scope.WithOpName("in1"), TensorShape({1, 2, 4, 1}),
-                             DT_FLOAT);
-  Output p1 = ops::Const(scope.WithOpName("p1"), {3, 2, 1, 0}, {4});
-  Output in2 = ops::Variable(scope.WithOpName("in2"), TensorShape({1, 4, 2, 1}),
-                             DT_FLOAT);
-  Output p2 = ops::Const(scope.WithOpName("p2"), {3, 1, 2, 0}, {4});
-  ops::Transpose t1(scope.WithOpName("t1"), in1, p1);
-  ops::Transpose t2(scope.WithOpName("t2").WithControlDependencies({in1}), in2,
-                    p2);
-
-  ops::Add out1(scope.WithOpName("out1"), t1, t2);
-
-  GrapplerItem item;
-  item.fetch = {"out1"};
-  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
-  ConstantFolding optimizer(nullptr /* cpu_device */);
-  GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
-  TF_EXPECT_OK(status);
-
-  GraphDef want;
-  AddNode("in1", "VariableV2", {}, {}, &want);
-  AddNode("in2", "VariableV2", {}, {}, &want);
-  AddNode("p1", "Const", {}, {}, &want);
-  AddNode("p2", "Const", {}, {}, &want);
-  AddNode("t1", "Transpose", {"in1", "p1"}, {}, &want);
-  AddNode("t2", "Identity",
-          {"in2", AsControlDependency("in1"), AsControlDependency("p2")}, {},
-          &want);
-  AddNode("out1", "Add", {"t1", "t2"}, {}, &want);
-
-  CompareGraphs(want, got);
-}
-
-TEST_F(ConstantFoldingTest, RandomShuffleOnScalarRemoval) {
+TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
   Output in1 =
@@ -1488,44 +1452,6 @@ TEST_F(ConstantFoldingTest, RandomShuffleOnScalarRemoval) {
   CompareGraphs(want, got);
 }
 
-TEST_F(ConstantFoldingTest, ReverseOnSize1DimsRemoval) {
-  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-
-  Output in1 = ops::Variable(scope.WithOpName("in1"), TensorShape({1, 2, 4, 1}),
-                             DT_FLOAT);
-  Output a1 = ops::Const(scope.WithOpName("a1"), {3, 2, 1, 0}, {4});
-  Output in2 = ops::Variable(scope.WithOpName("in2"), TensorShape({1, 2, 4, 1}),
-                             DT_FLOAT);
-  Output a2 = ops::Const(scope.WithOpName("a2"), {0, 3}, {2});
-  ops::Reverse r1(scope.WithOpName("r1"), in1, a1);
-  ops::Reverse r2(scope.WithOpName("r2").WithControlDependencies({in1}), in2,
-                  a2);
-
-  ops::Add out1(scope.WithOpName("out1"), r1, r2);
-
-  GrapplerItem item;
-  item.fetch = {"out1"};
-  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
-  ConstantFolding optimizer(nullptr /* cpu_device */);
-  GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
-  TF_EXPECT_OK(status);
-
-  GraphDef want;
-  AddNode("in1", "VariableV2", {}, {}, &want);
-  AddNode("in2", "VariableV2", {}, {}, &want);
-  AddNode("a1", "Const", {}, {}, &want);
-  AddNode("a2", "Const", {}, {}, &want);
-  AddNode("r1", "ReverseV2", {"in1", "a1"}, {}, &want);
-  AddNode("r2", "Identity",
-          {"in2", AsControlDependency("in1"), AsControlDependency("a2")}, {},
-          &want);
-  AddNode("out1", "Add", {"r1", "r2"}, {}, &want);
-
-  CompareGraphs(want, got);
-}
-
 TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
   {  // size = {3, 5}
     tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
-- 
GitLab


From 7de7245a7b102107b6f6cd20912db5f5be2c955c Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 17:49:47 -0700
Subject: [PATCH 0631/1262] Update docs of reduce_max/reduce_min for real
 numeric type (#18422)

Both reduce_max and reduce_min only work for real numeric type
as complex numbers do not apply. This fix update the docs
with `numeric type` -> `real numeric type`.

Note that the current kernel registration in
reduction_ops_max.cc and reduction_ops_min.cc
use `TF_CALL_REAL_NUMBER_TYPES` so it is good.
The op registraton for Max and Min inside math_ops.cc should be `.Attr("T: realnumbertype")` instead of `numbertype`.
However, such a change will break API compatibility so leave it alone.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 01d670ea2d..a38ecb2acb 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1632,7 +1632,7 @@ def reduce_min(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1681,7 +1681,7 @@ def reduce_max(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-- 
GitLab


From 1a721ecd9a9992d48c0deb3008b1fc8df297d300 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Wed, 11 Apr 2018 17:46:08 -0700
Subject: [PATCH 0632/1262] Internal testing changes

PiperOrigin-RevId: 192537874
---
 tensorflow/contrib/lite/schema/BUILD  | 3 +++
 tensorflow/contrib/lite/testing/BUILD | 3 +++
 tensorflow/contrib/lite/tools/BUILD   | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index 246ec85fe4..9717a4a1a4 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -63,6 +63,9 @@ cc_test(
         "schema.fbs",
         "schema_v3.fbs",
     ],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         "//tensorflow/core:lib_platform",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 1ce89a25fd..2c226e76d4 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -161,6 +161,9 @@ cc_test(
     size = "small",
     srcs = ["tflite_driver_test.cc"],
     data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         ":tflite_driver",
         "@com_google_googletest//:gtest_main",
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 44fde69a1e..7b3569ea9c 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -78,6 +78,9 @@ cc_test(
         "//tensorflow/contrib/lite:testdata/test_model.bin",
         "//tensorflow/contrib/lite:testdata/test_model_broken.bin",
     ],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         ":gen_op_registration",
         "@com_google_googletest//:gtest",
-- 
GitLab


From 9d7eee0d7fee883ffa3711f4e80b2c93ff5aecbc Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 17:50:45 -0700
Subject: [PATCH 0633/1262] Imporve shape function of RandomUniformInt (#18420)

* Imporve shape function of RandomUniformInt

The input of `minval` and `maxval` of `RandomUniformInt`
should be scalar though it is not checked in the shape
function. This fix improves the shape function with the
rank check, and adds test case for it.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for maxval and minval for RandomUniformInt

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/random_ops.cc                     |  7 ++++++-
 .../python/kernel_tests/random/random_ops_test.py     | 11 +++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index f6c668f5c9..416ce9c0d8 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -43,7 +43,12 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("RandomStandardNormal")
     .Input("shape: T")
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index df37dd98ec..e4b5c3832a 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -228,6 +228,17 @@ class RandomUniformTest(test.TestCase):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  def testUniformIntsWithInvalidShape(self):
+    for dtype in dtypes.int32, dtypes.int64:
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=[1, 2], maxval=3, dtype=dtype)
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=1, maxval=[2, 3], dtype=dtype)
+
   # Check that uniform ints actually follow a uniform distribution.
   def testUniformInts(self):
     minv = -2
-- 
GitLab


From e5e530f91aae3e8cd08a77487bb00d0630413e8a Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 17:51:26 -0700
Subject: [PATCH 0634/1262] Exclude cudnn_version_test from build in
 tf_stream_executor.cmake

---
 tensorflow/contrib/cmake/tf_stream_executor.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 91ca33f4c4..2b32b22a71 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -65,6 +65,10 @@ if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
+    file(GLOB tf_stream_executor_gpu_tests
+        "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
+    )
+    list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
 endif()
 
-- 
GitLab


From da0fed895c2cb8d8f16d0a8083bb635f623cfa75 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 17:51:41 -0700
Subject: [PATCH 0635/1262] Add deprecated_args decoration to expand_dims
 (#18419)

* Add deprecated_args decoration to expand_dims

This fix adds deprecated_args decoration to expand_dims
as `dims` has been deprecated and in favor of `axis`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Enhance deprecated args with deprecation.deprecated_argument_lookup

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_ops.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index fa26e07c85..9e136937f6 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -144,6 +144,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export("expand_dims")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -193,11 +194,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
   Raises:
     ValueError: if both `dim` and `axis` are specified.
   """
-  # TODO(aselle): Remove argument dim
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("can't specify both 'dim' and 'axis'")
-    axis = dim
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   return gen_array_ops.expand_dims(input, axis, name)
 
 
-- 
GitLab


From a75a5e48a4f9240a02a45119e77b28363e772bef Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <vomjom@vomjom.net>
Date: Wed, 11 Apr 2018 17:54:10 -0700
Subject: [PATCH 0636/1262] Improve comment

---
 tensorflow/contrib/lite/toco/model.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 8a936842d9..d0ae8d389f 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -151,9 +151,9 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that does not by itself tell whether the values in the array are
-// real (are literally interpreted as real numbers) or quantized (only acquire
-// a meaning as real numbers in conjunction with QuantizationParams).
+// Note that the type does not by itself tell whether the values in the array
+// are real (are literally interpreted as real numbers) or quantized (only
+// acquire a meaning as real numbers in conjunction with QuantizationParams).
 //
 // In practice though:
 //   float values are always real
-- 
GitLab


From 94768f9a886f85d2e147983907afffa57bc998ff Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 17:57:18 -0700
Subject: [PATCH 0637/1262] Exclude tests from tf_stream_executor build only if
 BUILD_CC_TESTS is OFF

---
 tensorflow/contrib/cmake/tf_stream_executor.cmake | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 2b32b22a71..eaae64e1c6 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -65,10 +65,12 @@ if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
-    file(GLOB tf_stream_executor_gpu_tests
-        "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
-    )
-    list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
+    if (NOT tensorflow_BUILD_CC_TESTS)
+        file(GLOB tf_stream_executor_gpu_tests
+            "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
+        }
+        list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
+    endif()
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
 endif()
 
-- 
GitLab


From 40c40bbc4b52a2036b2f6a504f2b3895d789639f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 18:01:38 -0700
Subject: [PATCH 0638/1262] Add negative axis support for tf.manip.roll
 (#18409)

* Add negative axis support for tf.manip.roll

This fix tries to support negative axis for tf.manip.roll.
The tf.manip.roll is supposed to be compatible with numpy.roll
which does support negative axis.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for negative axis support for tf.manip.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add axis check so that negative axis is within the range

Negative axis should be 0 <= axis + dims < dims

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add additional test cases

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/roll_op.cc               |  7 +++++--
 tensorflow/python/kernel_tests/manip_ops_test.py | 10 ++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index bcbdbee058..4b630809c5 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -254,8 +254,11 @@ class RollOp : public OpKernel {
     // total modulo sum of shifts for each dimension
     gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
     for (int i = 0; i < num_shifts; i++) {
-      const int axis = axis_flat(i);
-      OP_REQUIRES(context, axis < num_dims,
+      int axis = axis_flat(i);
+      if (axis < 0) {
+        axis += num_dims;
+      }
+      OP_REQUIRES(context, 0 <= axis && axis < num_dims,
                   errors::InvalidArgument("axis ", axis, " is out of range"));
       const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
       const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b8200ac0cb..7948a475bb 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -88,6 +88,16 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  def testNegativeAxis(self):
+    self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
+    self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
+    # Make sure negative axis shoudl be 0 <= axis + dims < dims
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
+                       3, -10).eval()
+
   def testRollInputMustVectorHigherRaises(self):
     tensor = 7
     shift = 1
-- 
GitLab


From 7b0b7bbe9519a5dee55d9e83d681411495aad45a Mon Sep 17 00:00:00 2001
From: Mahmoud Abuzaina <mahmoud.abuzaina@intel.com>
Date: Wed, 11 Apr 2018 18:04:26 -0700
Subject: [PATCH 0639/1262] Fixing non-mkl builds (#18401)

---
 tensorflow/core/kernels/BUILD | 36 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1018e8d25c..2bbedfff73 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5943,8 +5943,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5959,8 +5958,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5976,8 +5974,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5997,8 +5994,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6014,8 +6010,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6031,8 +6026,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6040,8 +6034,7 @@ tf_mkl_kernel_library(
     srcs = ["mkl_fused_batch_norm_op.cc"],
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6049,8 +6042,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_aggregate_ops",
     deps = MATH_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6058,8 +6050,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_concat_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6067,8 +6058,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_reshape_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6076,8 +6066,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_identity_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6085,8 +6074,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_lrn_op",
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
-- 
GitLab


From d68ceefaba6972221bc6b3f86a76c4d07565fbdb Mon Sep 17 00:00:00 2001
From: Vadim Markovtsev <gmarkhor@gmail.com>
Date: Thu, 12 Apr 2018 03:08:00 +0200
Subject: [PATCH 0640/1262] Replace print with logging (#18392)

---
 tensorflow/python/framework/graph_util_impl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 910364364c..394fac6c85 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -285,7 +285,7 @@ def convert_variables_to_constants(sess,
     output_graph_def.node.extend([output_node])
 
   output_graph_def.library.CopyFrom(inference_graph.library)
-  print("Converted %d variables to const ops." % how_many_converted)
+  logging.info("Converted %d variables to const ops.", how_many_converted)
   return output_graph_def
 
 
-- 
GitLab


From d1ee6aa01090614ea53bc88ddf5edc1d44215a72 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Thu, 12 Apr 2018 03:08:42 +0200
Subject: [PATCH 0641/1262] unified flip_* and random_flip_* functions (#18364)

---
 tensorflow/python/ops/image_ops_impl.py | 74 ++++++++++++++++---------
 1 file changed, 48 insertions(+), 26 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 3369fe3c9b..601010bce9 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -269,17 +269,7 @@ def random_flip_up_down(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
-    image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-    mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(
-        mirror_cond,
-        lambda: array_ops.reverse(image, [0]),
-        lambda: image,
-        name=scope)
-    return fix_image_flip_shape(image, result)
+  return _random_flip(image, 0, seed, 'random_flip_up_down')
 
 
 @tf_export('image.random_flip_left_right')
@@ -301,14 +291,34 @@ def random_flip_left_right(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
+  return _random_flip(image, 1, seed, 'random_flip_left_right')
+
+
+def _random_flip(image, flip_index, seed, scope_name):
+  """Randomly (50% chance) flip an image along axis `flip_index`.
+    Args:
+      image: A 3-D tensor of shape `[height, width, channels].`
+      flip_index: The dimension along which to flip the image.
+                  Vertical: 0, Horizontal: 1
+      seed: A Python integer. Used to create a random seed. See
+        @{tf.set_random_seed}
+        for behavior.
+      scope_name: Name of the scope in which the ops are added.
+
+    Returns:
+      A 3-D tensor of the same type and shape as `image`.
+
+    Raises:
+      ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(
         mirror_cond,
-        lambda: array_ops.reverse(image, [1]),
+        lambda: array_ops.reverse(image, [flip_index]),
         lambda: image,
         name=scope)
     return fix_image_flip_shape(image, result)
@@ -332,16 +342,7 @@ def flip_left_right(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_left_right', [image]):
-    image = ops.convert_to_tensor(image, name='image')
-    image = _AssertAtLeast3DImage(image)
-    shape = image.get_shape()
-    if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
-    elif shape.ndims == 4:
-      return array_ops.reverse(image, [2])
-    else:
-      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  return _flip(image, 1, 'flip_left_right')
 
 
 @tf_export('image.flip_up_down')
@@ -362,14 +363,35 @@ def flip_up_down(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_up_down', [image]):
+  return _flip(image, 0, 'flip_up_down')
+
+
+def _flip(image, flip_index, scope_name):
+  """Flip an image either horizontally or vertically.
+
+  Outputs the contents of `image` flipped along the dimension `flip_index`.
+
+  See also `reverse()`.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
+    flip_index: 0 For vertical, 1 for horizontal.
+
+  Returns:
+    A tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+      return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index]))
     elif shape.ndims == 4:
-      return array_ops.reverse(image, [1])
+      return array_ops.reverse(image, [flip_index+1])
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
-- 
GitLab


From 85b5d2eeb2dd876cb70b4c053110552553ade44b Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Thu, 12 Apr 2018 09:09:17 +0800
Subject: [PATCH 0642/1262] Fix broken links in /extend/language_bindings
 (#18346)

---
 tensorflow/docs_src/extend/language_bindings.md | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index b9fd72978d..9a968d365b 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -112,11 +112,11 @@ There are a few ways to get a list of the `OpDef`s for the registered ops:
     to interpret the `OpDef` messages.
 -   The C++ function `OpRegistry::Global()->GetRegisteredOps()` returns the same
     list of all registered `OpDef`s (defined in
-    [`tensorflow/core/framework/op.h`]). This can be used to write the generator
+    [`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h)). This can be used to write the generator
     in C++ (particularly useful for languages that do not have protocol buffer
     support).
 -   The ASCII-serialized version of that list is periodically checked in to
-    [`tensorflow/core/ops/ops.pbtxt`] by an automated process.
+    [`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt) by an automated process.
 
 The `OpDef` specifies the following:
 
@@ -159,7 +159,7 @@ between the generated code and the `OpDef`s checked into the repository, but is
 useful for languages where code is expected to be generated ahead of time like
 `go get` for Go and `cargo ops` for Rust. At the other end of the spectrum, for
 some languages the code could be generated dynamically from
-[`tensorflow/core/ops/ops.pbtxt`].
+[`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt).
 
 #### Handling Constants
 
@@ -229,6 +229,3 @@ and "while") is not available in languages other than Python. This will be
 updated when the [C API] provides necessary support.
 
 [C API]: https://www.tensorflow.org/code/tensorflow/c/c_api.h
-[`tensorflow/core/ops/ops.pbtxt`]: https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt
-[`tensorflow/python/BUILD`]: https://www.tensorflow.org/code/tensorflow/python/BUILD
-[`tensorflow/core/framework/op.h`]: https://www.tensorflow.org/code/tensorflow/core/framework/op.h
-- 
GitLab


From 91baf5056f02d235e2516b0c066c473ab77a8955 Mon Sep 17 00:00:00 2001
From: David Norman <DavidNorman@users.noreply.github.com>
Date: Thu, 12 Apr 2018 02:09:31 +0100
Subject: [PATCH 0643/1262] Disable int64 test for backends which don't support
 it (#18344)

---
 tensorflow/compiler/tests/binary_ops_test.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index d1d7379c0a..1e4dd32916 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -360,11 +360,13 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
-    self._testBinary(
-        math_ops.add,
-        np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-        np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-        expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
+    if np.int64 in self.numeric_types:
+      self._testBinary(
+          math_ops.add,
+          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+          expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36],
+                            dtype=np.int64))
 
   def testComplexOps(self):
     for dtype in self.complex_types:
-- 
GitLab


From 70d99359fcb9aa9efa955fab06227373c734728b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 11 Apr 2018 18:09:42 -0700
Subject: [PATCH 0644/1262] Add `tf.contrib.stateless.stateless_multinomial()`.

This is a starting point for Dataset-compatible weighted sampling across a list of datasets.

PiperOrigin-RevId: 192540412
---
 tensorflow/contrib/stateless/__init__.py      |   2 +
 .../kernel_tests/stateless_random_ops_test.py |  46 ++++++
 .../api_def_StatelessMultinomial.pbtxt        |  30 ++++
 tensorflow/core/kernels/BUILD                 |   1 +
 tensorflow/core/kernels/multinomial_op.cc     | 131 +++++++++++++++---
 .../core/kernels/stateless_random_ops.cc      |  68 +++++----
 .../core/kernels/stateless_random_ops.h       |  34 +++++
 tensorflow/core/ops/stateless_random_ops.cc   |  28 +++-
 tensorflow/core/util/guarded_philox_random.cc |   8 ++
 tensorflow/core/util/guarded_philox_random.h  |   2 +
 10 files changed, 296 insertions(+), 54 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
 create mode 100644 tensorflow/core/kernels/stateless_random_ops.h

diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
index ca937546f5..0cca40f071 100644
--- a/tensorflow/contrib/stateless/__init__.py
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -22,6 +22,7 @@ WARNING: These ops are in contrib, and are not stable.  They should be
 consistent across multiple runs on the same hardware, but only for the same
 version of the code.
 
+@@stateless_multinomial
 @@stateless_random_uniform
 @@stateless_random_normal
 @@stateless_truncated_normal
@@ -37,6 +38,7 @@ from tensorflow.contrib.stateless.gen_stateless_random_ops import *
 from tensorflow.python.framework import ops
 from tensorflow.python.util.all_util import remove_undocumented
 
+ops.NotDifferentiable("StatelessMultinomial")
 ops.NotDifferentiable("StatelessRandomNormal")
 ops.NotDifferentiable("StatelessRandomUniform")
 ops.NotDifferentiable("StatelessTruncatedNormal")
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index bea6341cfd..d724a5c014 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -96,6 +96,52 @@ class StatelessOpsTest(test.TestCase):
               for s1, v1 in values:
                 self.assertEqual(s0 == s1, np.all(v0 == v1))
 
+  def testMatchStatefulMultinomial(self):
+    # Stateless ops should be the same as stateful ops on the first call
+    # after seed scrambling.
+    key = 0x3ec8f720, 0x02461e29
+    num_samples = 4
+    for logits_dtype in np.float16, np.float32, np.float64:
+      for output_dtype in dtypes.int32, dtypes.int64:
+        for seed in (7, 17), (11, 5), (2, 3):
+          preseed = invert_philox(key,
+                                  (seed[0], 0, seed[1], 0)).astype(np.uint64)
+          preseed = preseed[::2] | preseed[1::2] << 32
+          random_seed.set_random_seed(seed[0])
+          with self.test_session(use_gpu=True):
+            for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                      [0.25, 0.75]]):
+              logits_t = constant_op.constant(logits, dtype=logits_dtype)
+              stateful = random_ops.multinomial(
+                  logits_t,
+                  num_samples,
+                  seed=seed[1],
+                  output_dtype=output_dtype)
+              pure = stateless.stateless_multinomial(
+                  logits_t,
+                  num_samples,
+                  seed=preseed,
+                  output_dtype=output_dtype)
+              self.assertAllEqual(stateful.eval(), pure.eval())
+
+  def testDeterminismMultinomial(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    num_samples = 10
+    with self.test_session(use_gpu=True):
+      for seed_type in [dtypes.int32, dtypes.int64]:
+        seed_t = array_ops.placeholder(seed_type, shape=[2])
+        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+        for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                  [0.25, 0.75]]):
+          pure = stateless.stateless_multinomial(
+              logits, num_samples, seed=seed_t)
+          values = [
+              (seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds
+          ]
+          for s0, v0 in values:
+            for s1, v1 in values:
+              self.assertEqual(s0 == s1, np.all(v0 == v1))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
new file mode 100644
index 0000000000..c4e6c1fddd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "StatelessMultinomial"
+  in_arg {
+    name: "logits"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+represents the unnormalized log probabilities for all classes.
+END
+  }
+  in_arg {
+    name: "num_samples"
+    description: <<END
+0-D.  Number of independent samples to draw for each row slice.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+contains the drawn class labels with range `[0, num_classes)`.
+END
+  }
+  summary: "Draws samples from a multinomial distribution."
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1018e8d25c..e2af540dac 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4323,6 +4323,7 @@ tf_kernel_library(
     deps = [
         ":random_op",
         ":random_ops",
+        ":stateless_random_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index d086abb247..7a64788448 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/stateless_random_ops.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
@@ -127,18 +128,16 @@ struct MultinomialFunctor<CPUDevice, T, OutputType> {
 
 }  // namespace functor
 
+namespace {
+
 // Samples from a multinomial distribution.
 template <typename Device, typename T, typename OutputType>
 class MultinomialOp : public OpKernel {
  public:
-  explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, generator_.Init(context));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& logits_t = ctx->input(0);
-    const Tensor& num_samples_t = ctx->input(1);
+  explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {}
 
+  void DoCompute(OpKernelContext* ctx, const Tensor& logits_t,
+                 const Tensor& num_samples_t, GuardedPhiloxRandom* generator) {
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_t.shape()),
                 errors::InvalidArgument("logits should be a matrix, got shape ",
                                         logits_t.shape().DebugString()));
@@ -194,7 +193,7 @@ class MultinomialOp : public OpKernel {
       // CPU generates doubles = 2 samples per number.
       if (std::is_same<Device, CPUDevice>::value) num_samples_ceil_4 *= 2;
       auto rng =
-          generator_.ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
+          generator->ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
       functor::MultinomialFunctor<Device, T, OutputType>()(
           ctx, ctx->eigen_device<Device>(), logits_t.matrix<T>(),
           noises.flat<float>(), scores.flat<float>(), scratch.flat<float>(),
@@ -202,24 +201,38 @@ class MultinomialOp : public OpKernel {
           samples_t->matrix<OutputType>());
     }
   }
+};
+
+template <typename Device, typename T, typename OutputType>
+class StatefulMultinomialOp : public MultinomialOp<Device, T, OutputType> {
+ public:
+  explicit StatefulMultinomialOp(OpKernelConstruction* ctx)
+      : MultinomialOp<Device, T, OutputType>(ctx) {
+    OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& logits_t = ctx->input(0);
+    const Tensor& num_samples_t = ctx->input(1);
+    this->DoCompute(ctx, logits_t, num_samples_t, &generator_);
+  }
 
  private:
   GuardedPhiloxRandom generator_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(MultinomialOp);
 };
 
-#define REGISTER(TYPE)                                                   \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<TYPE>("T")                 \
-                              .TypeConstraint("output_dtype", DT_INT32), \
-                          MultinomialOp<CPUDevice, TYPE, int32>);        \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<TYPE>("T")                 \
-                              .TypeConstraint("output_dtype", DT_INT64), \
-                          MultinomialOp<CPUDevice, TYPE, int64>);
+// TODO(b/77906027): Add a TPU implementation.
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                             \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT32),  \
+                          StatefulMultinomialOp<CPUDevice, TYPE, int32>); \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                             \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT64),  \
+                          StatefulMultinomialOp<CPUDevice, TYPE, int64>);
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -233,13 +246,83 @@ TF_CALL_double(REGISTER);
                               .HostMemory("num_samples")                 \
                               .TypeConstraint<TYPE>("T")                 \
                               .TypeConstraint("output_dtype", DT_INT32), \
-                          MultinomialOp<GPUDevice, TYPE, int32>)         \
+                          StatefulMultinomialOp<GPUDevice, TYPE, int32>) \
   REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
                               .Device(DEVICE_GPU)                        \
                               .HostMemory("num_samples")                 \
                               .TypeConstraint<TYPE>("T")                 \
                               .TypeConstraint("output_dtype", DT_INT64), \
-                          MultinomialOp<GPUDevice, TYPE, int64>)
+                          StatefulMultinomialOp<GPUDevice, TYPE, int64>)
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#endif  // GOOGLE_CUDA
+
+template <typename Device, typename T, typename OutputType>
+class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
+ public:
+  explicit StatelessMultinomialOp(OpKernelConstruction* ctx)
+      : MultinomialOp<Device, T, OutputType>(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& logits_t = ctx->input(0);
+    const Tensor& num_samples_t = ctx->input(1);
+
+    const Tensor& seed_t = ctx->input(2);
+    OP_REQUIRES(ctx, seed_t.dims() == 1 && seed_t.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_t.shape().DebugString()));
+
+    random::PhiloxRandom::Key key;
+    random::PhiloxRandom::ResultType counter;
+    OP_REQUIRES_OK(ctx, GenerateKey(seed_t, &key, &counter));
+
+    GuardedPhiloxRandom generator;
+    generator.Init(counter, key);
+
+    this->DoCompute(ctx, logits_t, num_samples_t, &generator);
+  }
+
+ private:
+  GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER(TYPE)                                                     \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                     \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<TYPE>("T")                   \
+                              .TypeConstraint("output_dtype", DT_INT32),   \
+                          StatelessMultinomialOp<CPUDevice, TYPE, int32>); \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                     \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<TYPE>("T")                   \
+                              .TypeConstraint("output_dtype", DT_INT64),   \
+                          StatelessMultinomialOp<CPUDevice, TYPE, int64>);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#if GOOGLE_CUDA
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("num_samples")                  \
+                              .HostMemory("seed")                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT32),  \
+                          StatelessMultinomialOp<GPUDevice, TYPE, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("num_samples")                  \
+                              .HostMemory("seed")                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT64),  \
+                          StatelessMultinomialOp<GPUDevice, TYPE, int64>)
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -248,4 +331,6 @@ TF_CALL_double(REGISTER);
 
 #endif  // GOOGLE_CUDA
 
+}  // end namespace
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 88fcf542fb..eab176c7fb 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -27,6 +27,41 @@ namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
 
+Status GenerateKey(Tensor seed, random::PhiloxRandom::Key* out_key,
+                   random::PhiloxRandom::ResultType* out_counter) {
+  // Grab the two seeds
+  uint64 seed0;
+  uint64 seed1;
+  if (seed.dtype() == DT_INT32) {
+    const auto seed_vals = seed.flat<int32>();
+    seed0 = internal::SubtleMustCopy(seed_vals(0));
+    seed1 = internal::SubtleMustCopy(seed_vals(1));
+  } else if (seed.dtype() == DT_INT64) {
+    const auto seed_vals = seed.flat<int64>();
+    seed0 = internal::SubtleMustCopy(seed_vals(0));
+    seed1 = internal::SubtleMustCopy(seed_vals(1));
+  } else {
+    return errors::InvalidArgument("Invalid seed type: ",
+                                   DataTypeString(seed.dtype()));
+  }
+
+  // Scramble the seeds so that the user doesn't need to worry about which
+  // part of the seed needs to be strong.
+  (*out_key)[0] = 0x3ec8f720;
+  (*out_key)[1] = 0x02461e29;
+  (*out_counter)[0] = static_cast<uint32>(seed0);
+  (*out_counter)[1] = static_cast<uint32>(seed0 >> 32);
+  (*out_counter)[2] = static_cast<uint32>(seed1);
+  (*out_counter)[3] = static_cast<uint32>(seed1 >> 32);
+  const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
+  (*out_key)[0] = mix[0];
+  (*out_key)[1] = mix[1];
+  (*out_counter)[0] = (*out_counter)[1] = 0;
+  (*out_counter)[2] = mix[2];
+  (*out_counter)[3] = mix[3];
+  return Status::OK();
+}
+
 namespace {
 
 class StatelessRandomOpBase : public OpKernel {
@@ -49,36 +84,9 @@ class StatelessRandomOpBase : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));
     if (shape.num_elements() == 0) return;
 
-    // Grab the two seeds
-    uint64 seed0;
-    uint64 seed1;
-    if (context->input_dtype(1) == DT_INT32) {
-      const auto seed = seed_t.flat<int32>();
-      seed0 = internal::SubtleMustCopy(seed(0));
-      seed1 = internal::SubtleMustCopy(seed(1));
-    } else {
-      CHECK_EQ(DT_INT64, context->input_dtype(1));
-      const auto seed = seed_t.flat<int64>();
-      seed0 = internal::SubtleMustCopy(seed(0));
-      seed1 = internal::SubtleMustCopy(seed(1));
-    }
-
-    // Scramble the seeds so that the user doesn't need to worry about which
-    // part of the seed needs to be strong.
     random::PhiloxRandom::Key key;
     random::PhiloxRandom::ResultType counter;
-    key[0] = 0x3ec8f720;
-    key[1] = 0x02461e29;
-    counter[0] = static_cast<uint32>(seed0);
-    counter[1] = static_cast<uint32>(seed0 >> 32);
-    counter[2] = static_cast<uint32>(seed1);
-    counter[3] = static_cast<uint32>(seed1 >> 32);
-    const auto mix = random::PhiloxRandom(counter, key)();
-    key[0] = mix[0];
-    key[1] = mix[1];
-    counter[0] = counter[1] = 0;
-    counter[2] = mix[2];
-    counter[3] = mix[3];
+    OP_REQUIRES_OK(context, GenerateKey(seed_t, &key, &counter));
 
     // Fill in the random numbers
     Fill(context, random::PhiloxRandom(counter, key), output);
@@ -105,8 +113,6 @@ class StatelessRandomOp : public StatelessRandomOpBase {
   }
 };
 
-}  // namespace
-
 #define REGISTER(TYPE)                                                 \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("StatelessRandomUniform")                                   \
@@ -176,4 +182,6 @@ TF_CALL_double(REGISTER);
 
 #endif  // GOOGLE_CUDA
 
+}  // namespace
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stateless_random_ops.h b/tensorflow/core/kernels/stateless_random_ops.h
new file mode 100644
index 0000000000..bcd29c4873
--- /dev/null
+++ b/tensorflow/core/kernels/stateless_random_ops.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+// Generates a key and counter that can be used to seed a PhiloxRandom,
+// generator, based on the seed value in `seed_t`.
+//
+// REQUIRES: `seed_t` must be a length-2 vector of type DT_INT{32,64}.
+// `out_key` and `out_counter` must be non-null.
+Status GenerateKey(Tensor seed_t, random::PhiloxRandom::Key* out_key,
+                   random::PhiloxRandom::ResultType* out_counter);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 553850610a..742709fb18 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -29,7 +29,7 @@ static Status StatelessShape(shape_inference::InferenceContext* context) {
   TF_RETURN_IF_ERROR(context->WithValue(context->Dim(seed, 0), 2, &unused));
 
   // Set output shape
-  shape_inference::ShapeHandle out;
+  ShapeHandle out;
   TF_RETURN_IF_ERROR(context->MakeShapeFromShapeTensor(0, &out));
   context->set_output(0, out);
   return Status::OK();
@@ -54,6 +54,32 @@ REGISTER_STATELESS_OP("StatelessRandomNormal");
 // This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessTruncatedNormal");
 
+// This op is exposed through contrib/stateless only.  The interface may change.
+REGISTER_OP("StatelessMultinomial")
+    .Input("logits: T")
+    .Input("num_samples: int32")
+    .Input("seed: Tseed")
+    .Output("output: output_dtype")
+    .Attr("T: realnumbertype")
+    .Attr("Tseed: {int32, int64} = DT_INT64")
+    .Attr("output_dtype: {int32, int64} = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Check seed shape
+      ShapeHandle seed;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &seed));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(seed, 0), 2, &unused_dim));
+
+      ShapeHandle logits_shape;
+      ShapeHandle unused;
+      DimensionHandle num_samples;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &logits_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &num_samples));
+      c->set_output(0, c->Matrix(c->Dim(logits_shape, 0), num_samples));
+      return Status::OK();
+    });
+
 #undef REGISTER_STATELESS_OP
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/guarded_philox_random.cc b/tensorflow/core/util/guarded_philox_random.cc
index 2d1e9a293e..7c7ba4cef6 100644
--- a/tensorflow/core/util/guarded_philox_random.cc
+++ b/tensorflow/core/util/guarded_philox_random.cc
@@ -43,6 +43,14 @@ void GuardedPhiloxRandom::Init(int64 seed, int64 seed2) {
   initialized_ = true;
 }
 
+void GuardedPhiloxRandom::Init(random::PhiloxRandom::ResultType counter,
+                               random::PhiloxRandom::Key key) {
+  CHECK(!initialized_);
+  mutex_lock lock(mu_);
+  generator_ = random::PhiloxRandom(counter, key);
+  initialized_ = true;
+}
+
 random::PhiloxRandom GuardedPhiloxRandom::ReserveSamples128(int64 samples) {
   CHECK(initialized_);
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/util/guarded_philox_random.h b/tensorflow/core/util/guarded_philox_random.h
index 5b94a76777..44970eb949 100644
--- a/tensorflow/core/util/guarded_philox_random.h
+++ b/tensorflow/core/util/guarded_philox_random.h
@@ -49,6 +49,8 @@ class GuardedPhiloxRandom {
 
   // Initialize with given seeds.
   void Init(int64 seed, int64 seed2);
+  void Init(random::PhiloxRandom::ResultType counter,
+            random::PhiloxRandom::Key key);
 
   // Reserve a certain number of 128-bit samples.
   // This function is thread safe.  The returned generator is valid for the
-- 
GitLab


From 7f39b18febda4513eb9b869396bad3ac9e8f64a8 Mon Sep 17 00:00:00 2001
From: Ivan Zhang <ivan@ivanzhang.ca>
Date: Wed, 11 Apr 2018 21:18:01 -0400
Subject: [PATCH 0645/1262] Fix typo in error message (#18319)

---
 tensorflow/python/estimator/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 4d3eff71ad..301a360636 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -723,7 +723,7 @@ class Estimator(object):
       batch_length = batch_length or value.shape[0]
       if value.shape[0] != batch_length:
         raise ValueError('Batch length of predictions should be same. %s has '
-                         'different batch length then others.' % key)
+                         'different batch length than others.' % key)
     return batch_length
 
   def _extract_keys(self, predictions, predict_keys):
-- 
GitLab


From 41308f454f39d4a5fe5e87b97045d9867a5e7ac2 Mon Sep 17 00:00:00 2001
From: jinghuangintel <jing1.huang@intel.com>
Date: Wed, 11 Apr 2018 18:20:55 -0700
Subject: [PATCH 0646/1262] added missing shapefn to several operators (#18298)

---
 tensorflow/core/ops/nn_ops.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 12d6dc5eaf..18165fb6ed 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1533,6 +1533,7 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
 does not perform anything. It is just created as an intermediate output of
@@ -1559,6 +1560,7 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
 2D convolution and add Bias to the output of convolution.
@@ -1681,6 +1683,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#ifdef INTEL_MKL_ML
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -1697,6 +1700,7 @@ gradients of convolution with respect to the bias.
 NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
+#endif
 
 REGISTER_OP("_MklConv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -2154,6 +2158,7 @@ REGISTER_OP("_MklToTf")
     .Output("output: T")
     .Attr("T: {half, float, double}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
 
@@ -2175,6 +2180,7 @@ REGISTER_OP("_MklInputConversion")
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
 need to be either in TF or in MKL format. This op is added before every
-- 
GitLab


From 58029d1d0b13dbe91db12cb130303bfaaf566d8a Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 11 Apr 2018 18:20:19 -0700
Subject: [PATCH 0647/1262] In model_to_estimator, only run get_weights when
 there are initialized Keras variables(which assumes there exists a session).
 Otherwise create a session so that we can run get_config(). Actually fix
 #18193.

PiperOrigin-RevId: 192541442
---
 .../python/keras/_impl/keras/estimator.py     | 45 +++++++++-----
 .../keras/_impl/keras/estimator_test.py       | 61 ++++++++++---------
 2 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 8043242b70..b922a6c683 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -26,7 +26,6 @@ from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -38,6 +37,7 @@ from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
@@ -55,6 +55,19 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
+def _any_variable_initalized():
+  """Check if any variable has been initialized in the Keras model.
+
+  Returns:
+    boolean, True if at least one variable has been initalized, else False.
+  """
+  variables = variables_module.global_variables()
+  for v in variables:
+    if getattr(v, '_keras_initialized', False):
+      return True
+  return False
+
+
 def _create_ordered_io(keras_model, estimator_io, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
 
@@ -396,7 +409,8 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
                                      custom_objects)
       # save to checkpoint
       with session.Session(config=estimator._session_config) as sess:
-        model.set_weights(keras_weights)
+        if keras_weights:
+          model.set_weights(keras_weights)
         # Make update ops and initialize all variables.
         if not model.train_function:
           # pylint: disable=protected-access
@@ -466,20 +480,21 @@ def model_to_estimator(keras_model=None,
   estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
 
-  old_session = K._SESSION
-  # Pass the config into keras backend's default session.
-  sess = session.Session(config=estimator._session_config)
-  K.set_session(sess)
-  try:
-    keras_weights = keras_model.get_weights()
-  except errors.FailedPreconditionError as e:
-    if old_session is None:
-      raise e
-    logging.warning(
-        'The Keras backend session has already been '
-        'set. The _session_config passed to model_to_estimator is not used.')
-    K.set_session(old_session)
+  # Check if we need to call get_weights:
+  if _any_variable_initalized():
     keras_weights = keras_model.get_weights()
+    # Warn if config passed to estimator tries to update GPUOptions. If a
+    # session has already been created, the GPUOptions passed to the first
+    # session sticks.
+    if estimator._session_config.HasField('gpu_options'):
+      logging.warning(
+          'The Keras backend session has already been set. '
+          'The _session_config passed to model_to_estimator will not be used.')
+  else:
+    # Pass the config into keras backend's default session.
+    sess = session.Session(config=estimator._session_config)
+    K.set_session(sess)
+    keras_weights = None
 
   if keras_model._is_graph_network:
     # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 27b7ec7dd4..653cdc01e2 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -27,10 +27,12 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
+from tensorflow.python.keras._impl.keras.optimizers import SGD
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -443,8 +445,9 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     model = simple_functional_model()
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
-    est_keras = keras.estimator.model_to_estimator(
-        keras_model=model, config=self._config)
+    with self.test_session():
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=model, config=self._config)
 
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -497,20 +500,22 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
 
   def test_gpu_config(self):
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.categorical_accuracy])
+    with ops.Graph().as_default():
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['mse', keras.metrics.categorical_accuracy])
 
-    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
-    sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
-    self._config._session_config = sess_config
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
-    self.assertEqual(keras.backend.get_session()
-                     ._config.gpu_options.per_process_gpu_memory_fraction,
-                     gpu_options.per_process_gpu_memory_fraction)
+      gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
+      sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
+      self._config._session_config = sess_config
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      self.assertEqual(
+          keras.backend.get_session()
+          ._config.gpu_options.per_process_gpu_memory_fraction,
+          gpu_options.per_process_gpu_memory_fraction)
 
   def test_pretrained_weights(self):
     keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
@@ -518,19 +523,19 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         loss='categorical_crossentropy',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         metrics=['mse', keras.metrics.categorical_accuracy])
-
-    keras_model.train_on_batch(
-        np.random.random((10,) + _INPUT_SIZE), np.random.random((10,
-                                                                 _NUM_CLASS)))
-    weights = keras_model.get_weights()
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.set_weights(weights)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.categorical_accuracy])
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
+    with self.test_session():
+      keras_model.train_on_batch(
+          np.random.random((10,) + _INPUT_SIZE),
+          np.random.random((10, _NUM_CLASS)))
+      weights = keras_model.get_weights()
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.set_weights(weights)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=SGD(lr=0.0001, momentum=0.9),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
 
 
 if __name__ == '__main__':
-- 
GitLab


From 9c2e04411ec1dbcf7aaf604dbc218489928bb2cc Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 18:26:12 -0700
Subject: [PATCH 0648/1262] Check input dimension for
 contrib.layers.conv2d/conv3d (#18251)

* Check input dimension for contrib.layers.conv2d/conv3d

This fix tries to fix the issue raised in 14583 where
the input dimension was not checked for contrib.layers.conv2d/conv3d
and contrib.slim.conv2d/conv3d.

The issue was that conv2d/conv3d were just aliases of
convolution. This fix wrap the conv2d/conv3d with the input
dimension check so that incorrect usage will return ValueError.

This fix fixes 14583.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for conv2d/conv3d shape check

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix impacted tests.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update convolution instead of adding _convolution,

based on review feedback

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add convolution1d and additional update

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../contrib/layers/python/layers/layers.py    | 138 +++++++++++++++++-
 .../layers/python/layers/layers_test.py       |  15 +-
 2 files changed, 148 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 10d7f6d076..949e73deff 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality, when set it would use the
+      corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
+      leaved to None it would select the convolution dimensionality based on
+      the input rank (i.e. Conv ND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
 
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 997f910a2a..b01fd5d5c9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
-- 
GitLab


From a843ec33a8fe8feb41f3733d2bea34691bb02a1e Mon Sep 17 00:00:00 2001
From: brett koonce <koonce@hello.com>
Date: Wed, 11 Apr 2018 18:27:16 -0700
Subject: [PATCH 0649/1262] contrib/image: minor spelling tweaks (#18162)

---
 .../contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc      | 2 +-
 tensorflow/contrib/image/ops/distort_image_ops.cc             | 4 ++--
 tensorflow/contrib/image/ops/image_ops.cc                     | 2 +-
 .../image/ops/single_image_random_dot_stereograms_ops.cc      | 4 ++--
 tensorflow/contrib/image/python/ops/image_ops.py              | 2 +-
 .../image/python/ops/single_image_random_dot_stereograms.py   | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index b71ff9cd50..645abbf0b0 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -53,7 +53,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
   OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                           DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
                           &tranformation_matrix));
-  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
   internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/image/ops/distort_image_ops.cc b/tensorflow/contrib/image/ops/distort_image_ops.cc
index b169b0b2b2..ca49635d5d 100644
--- a/tensorflow/contrib/image/ops/distort_image_ops.cc
+++ b/tensorflow/contrib/image/ops/distort_image_ops.cc
@@ -36,9 +36,9 @@ REGISTER_OP("AdjustHsvInYiq")
 Adjust the YIQ hue of one or more images.
 
 `images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
+interpreted as channels, and must be three.
 
-We used linear transfomation described in:
+We used linear transformation described in:
  beesbuzz.biz/code/hsv_color_transforms.php
 The input image is considered in the RGB colorspace. Conceptually, the RGB
 colors are first mapped into YIQ space, rotated around the Y channel by
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d05..ebdcaea7ab 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -93,7 +93,7 @@ row_to_col_match_indices: A vector of length num_rows, which is the number of
   If `row_to_col_match_indices[i]` is not -1, row i is matched to column
   `row_to_col_match_indices[i]`.
 col_to_row_match_indices: A vector of length num_columns, which is the number
-  of columns of the input ditance matrix.
+  of columns of the input distance matrix.
   If `col_to_row_match_indices[j]` is not -1, column j is matched to row
   `col_to_row_match_indices[j]`.
 )doc");
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 8139d4272d..bd784c6bda 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -69,7 +69,7 @@ Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
-encode 3-D data witin the image.
+encode 3-D data within the image.
 
 This Op is based upon:
 'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
@@ -111,7 +111,7 @@ output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale,
 output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
   and use 'convergence_dots_size' for best fit to avoid overlap if possible
 
-image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+image:= A tensor of size 'output_image_shape' with the encoded 'depth_values'
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8..cd984c8054 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -433,7 +433,7 @@ def bipartite_match(distance_mat,
       of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
       is not -1, row i is matched to column `row_to_col_match_indices[i]`.
     col_to_row_match_indices: A vector of length num_columns, which is the
-      number of columns of the input ditance matrix.
+      number of columns of the input distance matrix.
       If `col_to_row_match_indices[j]` is not -1, column j is matched to row
       `col_to_row_match_indices[j]`.
   """
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index d4a6a5bcbb..0ceb683ff4 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -45,7 +45,7 @@ def single_image_random_dot_stereograms(depth_values,
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
   will encode 3-D data into a 2-D image.  The output of this Op is suitable
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
-  corrupt the encode 3-D data witin the image.
+  corrupt the encode 3-D data within the image.
 
   Based upon [this
   paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
-- 
GitLab


From fd934f119deba4543555c3dac2c8c75936ac12d0 Mon Sep 17 00:00:00 2001
From: tamimaddari82 <37008274+tamimaddari82@users.noreply.github.com>
Date: Thu, 12 Apr 2018 09:28:18 +0800
Subject: [PATCH 0650/1262] Add parallel implementation of CTC greedy decoder
 (#17982)

---
 tensorflow/core/kernels/ctc_decoder_ops.cc | 34 ++++++++++++++--------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 96bdb6a241..8cadeac68d 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
@@ -213,20 +214,29 @@ class CTCGreedyDecoderOp : public OpKernel {
 
     // Perform best path decoding
     std::vector<std::vector<std::vector<int> > > sequences(batch_size);
-    for (int b = 0; b < batch_size; ++b) {
-      sequences[b].resize(1);
-      auto& sequence = sequences[b][0];
-      int prev_indices = -1;
-      for (int t = 0; t < seq_len_t(b); ++t) {
-        int max_class_indices;
-        log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
-        if (max_class_indices != blank_index &&
-            !(merge_repeated_ && max_class_indices == prev_indices)) {
-          sequence.push_back(max_class_indices);
+    auto decode = [&](const int64 begin, const int64 end) {
+      for (int b = begin; b < end; ++b) {
+        sequences[b].resize(1);
+        auto &sequence = sequences[b][0];
+        int prev_indices = -1;
+        for (int t = 0; t < seq_len_t(b); ++t) {
+          int max_class_indices;
+          log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
+          if (max_class_indices != blank_index &&
+              !(merge_repeated_ && max_class_indices == prev_indices)) {
+            sequence.push_back(max_class_indices);
+          }
+          prev_indices = max_class_indices;
         }
-        prev_indices = max_class_indices;
       }
-    }
+    };
+
+    const int64 kCostPerUnit = 50 * max_time * num_classes;
+    const int64 total = batch_size;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *ctx->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, total,
+          kCostPerUnit, decode);
 
     OP_REQUIRES_OK(
         ctx, decode_helper_.StoreAllDecodedSequences(
-- 
GitLab


From de72c8cccef2ee77667c041b68a34be6fb61ea65 Mon Sep 17 00:00:00 2001
From: Michal Turek <mixal.turek@gmail.com>
Date: Thu, 12 Apr 2018 03:32:10 +0200
Subject: [PATCH 0651/1262] Add comment to examples to prevent resource leaks
 (#17820)

Issue #17374
---
 tensorflow/docs_src/install/install_java.md                     | 2 ++
 .../java/src/main/java/org/tensorflow/examples/LabelImage.java  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index cdde45a6f4..0dcb059793 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -93,6 +93,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 
               // Execute the "MyConst" operation in a Session.
               try (Session s = new Session(g);
+                   // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
                    Tensor output = s.runner().fetch("MyConst").run().get(0)) {
                 System.out.println(new String(output.bytesValue(), "UTF-8"));
               }
@@ -207,6 +208,7 @@ public class HelloTF {
 
       // Execute the "MyConst" operation in a Session.
       try (Session s = new Session(g);
+           // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
            Tensor output = s.runner().fetch("MyConst").run().get(0)) {
         System.out.println(new String(output.bytesValue(), "UTF-8"));
       }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
index 489e95c310..3948991c84 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
@@ -101,6 +101,7 @@ public class LabelImage {
                   b.constant("mean", mean)),
               b.constant("scale", scale));
       try (Session s = new Session(g)) {
+        // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
         return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class);
       }
     }
@@ -110,6 +111,7 @@ public class LabelImage {
     try (Graph g = new Graph()) {
       g.importGraphDef(graphDef);
       try (Session s = new Session(g);
+          // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
           Tensor<Float> result =
               s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) {
         final long[] rshape = result.shape();
-- 
GitLab


From c98f8c59b924b87bebe991607a5fb7d3cb90c5ee Mon Sep 17 00:00:00 2001
From: "Seungwoo Choi (Biggie)" <seungjooli@snu.ac.kr>
Date: Thu, 12 Apr 2018 10:33:41 +0900
Subject: [PATCH 0652/1262] Replace wrong variable (#17738)

---
 tensorflow/contrib/quantize/python/fold_batch_norms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 4a8f8a04cc..aa0ef64308 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -545,7 +545,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
         gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
 
   if not has_scaling:
-    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)
 
   return _BatchNormMatch(
       layer_op=None,
-- 
GitLab


From b47ff5f95d42d5321864359bd559fec0c1d81a69 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 18:34:20 -0700
Subject: [PATCH 0653/1262] Enhancement with deprecated_argument_lookup
 (#17527)

* Enhancement with deprecated_argument_lookup

The tf.losses.cosine_distance deprecated dim and switched
to axis. This fix adds the enhancement of using
deprecated_argument_lookup, which is used in all other
arguments deprecations.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add missing import

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/losses/losses_impl.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 34ca1adc3e..19a8eaf22c 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -306,11 +307,8 @@ def cosine_distance(
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   if labels is None:
     raise ValueError("labels must not be None.")
-- 
GitLab


From ff6c11008213424b7a1dd77346f996be693b004a Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 11 Apr 2018 18:37:47 -0700
Subject: [PATCH 0654/1262] Increase size of
 //tensorflow/python/kernel_tests:linalg_ops_test to "medium".

PiperOrigin-RevId: 192542956
---
 tensorflow/python/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 5738e79b27..e504a9fd21 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1607,7 +1607,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "linalg_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["linalg_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-- 
GitLab


From 1caeb2086e7e9d7e3cb85883f0af316cddcf1285 Mon Sep 17 00:00:00 2001
From: fo40225 <fo40225@users.noreply.github.com>
Date: Thu, 12 Apr 2018 09:41:48 +0800
Subject: [PATCH 0655/1262] fix tf.GIT_VERSION always 'unknown' on windows
 cmake build (#16730)

---
 tensorflow/contrib/cmake/tf_core_framework.cmake |  2 +-
 tensorflow/tools/git/gen_git_source.py           | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347f..bcfb4f0819 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b80..6a1f126131 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -238,7 +238,7 @@ def generate(arglist):
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, source_dir):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -246,9 +246,10 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(source_dir)
   write_version_info(output_file, git_version)
 
 
@@ -281,6 +282,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -290,7 +296,10 @@ if args.configure is not None:
 elif args.generate is not None:
   generate(args.generate)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
-- 
GitLab


From 4e29ebd67cd4409cbdfa6510b06acd780166aa9d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 18:38:38 -0700
Subject: [PATCH 0656/1262] [XLA] Redesign: test sharding.

Also set the sharding to the instruction when created from proto.

PiperOrigin-RevId: 192543024
---
 .../xla/client/xla_client/xla_builder.h       | 31 +++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.cc   |  6 ++++
 2 files changed, 37 insertions(+)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 24e0be2ac1..e583b4fe48 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -959,6 +959,37 @@ XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
   return ConstantFromArray(values);
 }
 
+// RAII-style object: sets the current sharding assignment in builder on
+// construction, and sets back to the previous assignment on destruction.
+//
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+class XlaScopedShardingAssignment {
+ public:
+  XlaScopedShardingAssignment(xla::XlaBuilder* builder,
+                              tensorflow::gtl::optional<OpSharding> sharding)
+      : builder_(builder), prev_sharding_(builder->sharding()) {
+    SetSharding(sharding);
+  }
+
+  XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
+  XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
+      delete;
+
+  ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
+
+ private:
+  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+    if (sharding.has_value()) {
+      builder_->SetSharding(sharding.value());
+    } else {
+      builder_->ClearSharding();
+    }
+  }
+
+  xla::XlaBuilder* const builder_;
+  tensorflow::gtl::optional<OpSharding> prev_sharding_;
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a986bbd511..5d2d7a9727 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -159,6 +159,12 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->fft_length_.push_back(fft_len);
   }
 
+  if (proto.has_sharding()) {
+    TF_ASSIGN_OR_RETURN(const auto& sharding,
+                        HloSharding::FromProto(proto.sharding()));
+    instruction->set_sharding(sharding);
+  }
+
   if (proto.has_gather_dimension_numbers()) {
     instruction->gather_dimension_numbers_ =
         MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
-- 
GitLab


From 079539b2e7acb1813cbfcdd2ab39f7bb77bc0467 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 11 Apr 2018 18:42:50 -0700
Subject: [PATCH 0657/1262] Correct argument doc for BasicLSTMCell.call
 (#16554)

* Correct argument doc for BasicLSTMCell.call

* change self._num_units to num_units.
---
 tensorflow/python/ops/rnn_cell_impl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index cbc2dcf419..54f4e0f240 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -599,9 +599,9 @@ class BasicLSTMCell(LayerRNNCell):
     Args:
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: An `LSTMStateTuple` of state tensors, each shaped
-        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
+        `[batch_size, num_units]`, if `state_is_tuple` has been set to
         `True`.  Otherwise, a `Tensor` shaped
-        `[batch_size, 2 * self.state_size]`.
+        `[batch_size, 2 * num_units]`.
 
     Returns:
       A pair containing the new hidden state, and the new state (either a
-- 
GitLab


From b52b5a47148b6f05ed9439840dff9e3f189b3b19 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Wed, 11 Apr 2018 18:57:49 -0700
Subject: [PATCH 0658/1262] Switch to WaitForNotification to fix the flaky
 test.

See:
https://source.cloud.google.com/results/invocations/31632a30-3728-4635-a456-f89b9e8b9dfe/log
PiperOrigin-RevId: 192544848
---
 tensorflow/core/platform/cloud/ram_file_block_cache_test.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
index d555b682a6..10203783fc 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
@@ -487,8 +487,7 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) {
         TF_EXPECT_OK(ReadCache(&cache, "", 0, block_size / 2, &out));
         EXPECT_EQ(out.size(), block_size / 2);
       }));
-  EXPECT_TRUE(WaitForNotificationWithTimeout(&notification, 10000))
-      << "Timeout waiting for concurrent thread to start.";
+  notification.WaitForNotification();
   std::vector<char> out;
   TF_EXPECT_OK(ReadCache(&cache, "", block_size / 2, block_size / 2, &out));
   EXPECT_EQ(out.size(), block_size / 2);
-- 
GitLab


From e7e01ac2597346f9dda2fb8fdb155fe784a1eebd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 19:14:47 -0700
Subject: [PATCH 0659/1262] [XLA] Redesign: fix GetComputationGraphStats.

CreateFromProto requires that the config has proper entry_computation_layout, so give the config the program shape.

PiperOrigin-RevId: 192546316
---
 tensorflow/compiler/xla/service/service.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 70af1c44ea..52500e4e79 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -1661,7 +1661,14 @@ tensorflow::Status Service::GetComputationStats(
 
 tensorflow::Status Service::GetComputationGraphStats(
     const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
-  HloModuleConfig config;
+  if (!arg->has_computation()) {
+    return InvalidArgument("Computations may not be empty.");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("Program shape may not be empty.");
+  }
+
+  HloModuleConfig config(arg->computation().program_shape());
   config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       HloModule::CreateFromProto(arg->computation(), config));
-- 
GitLab


From 6f678934828a988ea06caf419dd97b9140f7c022 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 19:18:20 -0700
Subject: [PATCH 0660/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192546579
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 65 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 65 +++++++++++++++++++
 2 files changed, 130 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index ba442a0582..30d4296326 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -65536,6 +65536,71 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomNormal"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 43fd09fb72..0ed039ac2e 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -29832,6 +29832,71 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomNormal"
   input_arg {
-- 
GitLab


From 6c9f8825096a76b395b01e07b8d611b3e2a23489 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 19:45:42 -0700
Subject: [PATCH 0661/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 192548367

---
 tensorflow/go/op/wrappers.go | 4640 +++++++++++++++++-----------------
 1 file changed, 2320 insertions(+), 2320 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 09da8c1892..2d3e369328 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2505,39 +2505,6 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
-//
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Bincount",
-		Input: []tf.Input{
-			arr, size, weights,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes the sum along sparse segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
@@ -6567,6 +6534,85 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 	return op.Output(0)
 }
 
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseExample",
+		Input: []tf.Input{
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
 // Real-valued fast Fourier transform.
 //
 // Computes the 1-dimensional discrete Fourier transform of a real-valued signal
@@ -7333,6 +7379,29 @@ func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment
 	return op.Output(0)
 }
 
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
 type StatelessTruncatedNormalAttr func(optionalAttr)
 
@@ -8414,98 +8483,49 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
-//
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
-//
-// Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+// Computes numerical negative value element-wise.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "Neg",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
+// Execute a sub graph on a remote processor.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
 //
 // Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "RemoteFusedGraphExecute",
 		Input: []tf.Input{
-			data, partitions,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -8516,119 +8536,117 @@ func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_pa
 	var idx int
 	var err error
 	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
 		return
 	}
 	return outputs
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
-//
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
 
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["data_format"] = value
 	}
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["dilations"] = value
 	}
 }
 
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
-//
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			x,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -8636,38 +8654,38 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 	return op.Output(0)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
 
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
 // If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8676,225 +8694,152 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			data, segment_ids,
+			sp_indices, sp_values, sp_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// `data.shape` must start with `partitions.shape`.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// For example:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
+// See `dynamic_stitch` for an example on how to merge partitions back.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
 //
+// Arguments:
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			data, partitions,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "LinSpace",
-		Input: []tf.Input{
-			start, stop, num,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
+// Update '*var' according to the adagrad scheme.
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	resource: handle to the resource to delete.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8903,75 +8848,66 @@ func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyReso
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			resource,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// Return the shape of s0 op s1 with broadcast.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
+	opspec := tf.OpSpec{
+		Type: "BroadcastArgs",
+		Input: []tf.Input{
+			s0, s1,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["src_format"] = value
 	}
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["dst_format"] = value
 	}
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// Returns the dimension index in the destination data format given the one in
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// the source data format.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
+//
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8980,26 +8916,9 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -9007,36 +8926,38 @@ func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.Data
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Update '*var' according to the AddSign update.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9045,87 +8966,100 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// 2D real-valued fast Fourier transform.
+// Computes the mean along segments of a tensor.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			input, fft_length,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Resize `images` to `size` using area interpolation.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// Input images can be of different types but output images are always float.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9134,368 +9068,265 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			images, size,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
-// The padded size of each dimension D of the output is:
+// Arguments:
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
 //
-// For example:
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			input, paddings,
+			input_dataset, batch_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
 // Arguments:
-//	resource: the input resource handle.
+//	input: A complex64 tensor.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "IFFT",
 		Input: []tf.Input{
-			resource,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
-
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// Generates values in an interval.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom random values from a uniform distribution.
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// For example:
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			shape, seed,
+			start, stop, num,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+//
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
+//
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	resource: handle to the resource to delete.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			data,
+			resource,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
+// LRNDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
-//
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
-// ```
+// LRNBias sets the optional bias attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Angle",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
-
-// VarHandleOpContainer sets the optional container attribute to value.
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["alpha"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// LRNBeta sets the optional beta attribute to value.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["beta"] = value
 	}
 }
 
-// Creates a handle to a Variable resource.
+// Local Response Normalization.
+//
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise XOR of `x` and `y`.
-//
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
+		Type: "LRN",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-//
-// Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			serialized_sparse,
+			tf.OutputList(input_datasets),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
+// value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9504,138 +9335,87 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// 2D real-valued fast Fourier transform.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse `updates` to individual values or slices within a given
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
 //
-// variable according to `indices`.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
 //
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
-//
-//     [1, 11, 3, 10, 9, 6, 7, 12]
-//
-// See @{tf.scatter_nd} for more details about how to make updates to
-// slices.
-//
-// Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
-//
-// Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			ref, indices, updates,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// SqueezeAxis sets the optional axis attribute to value.
-//
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
-//
-// For example:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// Resize `images` to `size` using area interpolation.
 //
-// Or, to remove specific size 1 dimensions:
+// Input images can be of different types but output images are always float.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	input: The `input` to squeeze.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9644,9 +9424,9 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			input,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -9654,98 +9434,91 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// Pads a tensor with zeros.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+//	resource: the input resource handle.
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			resource,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["dtype"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9754,9 +9527,9 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -9764,64 +9537,225 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Makes its input available to the next iteration.
+//
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorDataset",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			data,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fact",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
+
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the argument of a complex number.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
+//
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Angle",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
+
+// VarHandleOpContainer sets the optional container attribute to value.
+//
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
+//
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "VarHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise XOR of `x` and `y`.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "BitwiseXor",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// Deserialize `SparseTensor` objects.
+//
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
+//
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeSparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var, ms, and mom tensors is protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
@@ -9849,10 +9783,9 @@ func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSProp
 //
 //	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9861,168 +9794,77 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_locking"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Applies sparse `updates` to individual values or slices within a given
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// variable according to `indices`.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
 //
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
 //
-// For example,
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
 //
 // ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// The resulting update to ref would look like this:
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+//     [1, 11, 3, 10, 9, 6, 7, 12]
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10031,37 +9873,68 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
+
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(m optionalAttr) {
+		m["squeeze_dims"] = value
+	}
+}
+
+// Removes dimensions of size 1 from the shape of a tensor.
+//
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
+//
+// For example:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+//	input: The `input` to squeeze.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "Squeeze",
 		Input: []tf.Input{
 			input,
 		},
@@ -10071,143 +9944,126 @@ func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (o
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Update '*var' according to the adadelta scheme.
 //
-// All elements selected by `indices` must have the same shape.
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise for integer types.
-//
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
-//
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
-//
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
-	}
-	return tensors
+	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			input_dataset, count,
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -10215,235 +10071,248 @@ func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			data, segment_ids,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
+// Update '*var' according to the RMSProp algorithm.
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input: Base64 strings to decode.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
+// Returns the truth value of (x > y) element-wise.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "Greater",
 		Input: []tf.Input{
-			value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
-
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["min_object_covered"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
 // If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Generate a single randomly distorted bounding box for an image.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10452,187 +10321,375 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
+
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Gather specific elements from the TensorArray into output `value`.
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			input, fft_length,
+			handle, indices, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
+// Returns x / y element-wise for integer types.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "TruncateDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Restores tensors from a V2 checkpoint.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
+//
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
+//
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+//
+// Arguments:
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
+//
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	opspec := tf.OpSpec{
+		Type: "RestoreV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
 	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SkipDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Computes the maximum along segments of a tensor.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMax",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
+// Decode web-safe base64-encoded strings.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
-// Accepted values are:
+// Arguments:
+//	input: Base64 strings to decode.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Store the input tensor in the state of the current session.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// Arguments:
+//	value: The tensor to be stored.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of max pooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			contents,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -10640,83 +10697,59 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return op.Output(0)
 }
 
 // VariableShapeAttr is an optional argument to VariableShape.
@@ -10759,6 +10792,82 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 	return op.Output(0)
 }
 
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToSparseSetOperation",
+		Input: []tf.Input{
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Computes softmax cross entropy cost and gradients to backpropagate.
 //
 // Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
@@ -11241,42 +11350,137 @@ func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
 
 // TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
 //
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
-}
-
-// An array of Tensors of given size.
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// An array of Tensors of given size.
+//
+// Write data via Write and read via Read or Pack.
+//
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
+//
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayV3",
+		Input: []tf.Input{
+			size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+	return func(m optionalAttr) {
+		m["fast"] = value
+	}
+}
+
+// Solves one or more linear least-squares problems.
+//
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
+//
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
 //
-// Write data via Write and read via Read or Pack.
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			size,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise OR of `x` and `y`.
+//
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // MaxPool3DAttr is an optional argument to MaxPool3D.
@@ -13490,228 +13694,73 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 // ```
 func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rint",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
-
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the (key, value) element with the smallest
-//
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
-		Input: []tf.Input{
-			indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
-
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "Rint",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the (key, value) element with the smallest
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "OrderedMapUnstageNoKey",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			indices,
 		},
 		Attrs: attrs,
 	}
@@ -13721,11 +13770,12 @@ func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.Dat
 	}
 	var idx int
 	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
 		return
 	}
-	return outputs
+	return key, values
 }
 
 // SerializeManySparseAttr is an optional argument to SerializeManySparse.
@@ -14192,14 +14242,192 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutableDenseHashTableV2",
+		Input: []tf.Input{
+			empty_key,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 2D real-valued fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
+//
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT2D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			empty_key,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -14428,6 +14656,29 @@ func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segm
 	return op.Output(0)
 }
 
+// Returns the set of files matching one or more glob patterns.
+//
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+//
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatchingFiles",
+		Input: []tf.Input{
+			pattern,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the truth value of (x >= y) element-wise.
 //
 // *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
@@ -15157,453 +15408,210 @@ func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 //
 // The advantages of sampling candidates per-batch are simplicity and the
 // possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Save",
-		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
-//
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
-//
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FloorMod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-//
-// Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
-		Input: []tf.Input{
-			serialized_sparse,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns immutable tensor from memory region.
+// Saves the input tensors to disk.
 //
-// The current implementation memmaps the tensor from a file.
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
+//
+// See also `SaveSlices`.
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
+		Type: "Save",
+		Input: []tf.Input{
+			filename, tensor_names, tf.OutputList(data),
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			input, fft_length,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// then the output will be
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// Graphically this is equivalent to doing
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
-//
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// then the output will be
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// if hashed_output=true then the output will be
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 //
+// and
 //
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
@@ -15611,287 +15619,242 @@ func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes [
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Concatenates quantized tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
-		},
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// Joins the strings in the given list of string tensors into one tensor;
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// Returns immutable tensor from memory region.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// The current implementation memmaps the tensor from a file.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// Inverse real-valued fast Fourier transform.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			pattern,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
-	return func(m optionalAttr) {
-		m["fast"] = value
-	}
-}
-
-// Solves one or more linear least-squares problems.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
 //
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
 //
-// Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+// For example, if `concat_dim = 1` and the inputs are
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// For example, if the inputs are
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+//     inputs[2]: Tensor [["f"], ["g"]]
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// then the output will be
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
 		Attrs: attrs,
 	}
@@ -15899,71 +15862,75 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes numerical negative value element-wise.
+// Concatenates quantized tensors along one dimension.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			x,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// For example, if the input is
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			inputs, min, max,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Returns the element-wise min of two SparseTensors.
@@ -18018,6 +17985,39 @@ func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_uppe
 	return op.Output(0)
 }
 
+// Counts the number of occurrences of each value in an integer array.
+//
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
+//
+// Values in `arr` outside of the range [0, size) are ignored.
+//
+// Arguments:
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
+//
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Bincount",
+		Input: []tf.Input{
+			arr, size, weights,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CumsumAttr is an optional argument to Cumsum.
 type CumsumAttr func(optionalAttr)
 
-- 
GitLab


From 902625480b414562e9a4e21e963cacaa4708f9b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 19:50:10 -0700
Subject: [PATCH 0662/1262] Enable a reduce window test case.

PiperOrigin-RevId: 192548652
---
 tensorflow/compiler/xla/tests/reduce_window_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 8ef980ebd9..425fef7da7 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -1063,14 +1063,14 @@ struct R2ReduceWindowTestData {
      /*strides=*/{1, 1}, /*pad_low=*/{0, 130}, /*pad_high=*/{0, 0},
      /*layout=*/{1, 0},
      /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4},
+     /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
 // TODO(b/76025683): These tests fail on TPU.
 #if defined(XLA_TEST_BACKEND_CPU) || defined(XLA_TEST_BACKEND_GPU)
     {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
      /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
      /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
-    {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4},
-     /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
-     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
 #endif
 };
 
-- 
GitLab


From ffebc37eff2e44bbffa2964deeebb7fdaef2e219 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 19:53:21 -0700
Subject: [PATCH 0663/1262] Build fixes

---
 tensorflow/c/c_api_experimental.cc                | 2 +-
 tensorflow/contrib/cmake/tf_stream_executor.cmake | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 9678ee926f..a110770921 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -7088,7 +7088,7 @@ static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
   status->status = tensorflow::errors::Unimplemented(
       "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
       "is not implemented for Windows");
-  return nullptr;
+  return std::vector<UniqueFuncPtr>();
 #else
   const char* func_def = R"PREFIX(
 library {
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index eaae64e1c6..af48ef1fd4 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -68,7 +68,7 @@ if (tensorflow_ENABLE_GPU)
     if (NOT tensorflow_BUILD_CC_TESTS)
         file(GLOB tf_stream_executor_gpu_tests
             "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
-        }
+        )
         list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
     endif()
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
-- 
GitLab


From 5b0cb6c724e12e0d66a11d8043c71d1479f70a47 Mon Sep 17 00:00:00 2001
From: James Wexler <jwexler@google.com>
Date: Wed, 11 Apr 2018 19:58:07 -0700
Subject: [PATCH 0664/1262] Add closure_js_proto_library build for tf.example
 protos.

PiperOrigin-RevId: 192549109
---
 tensorflow/core/BUILD | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c5ca421ced..55b0040b52 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -70,6 +70,10 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load(
+    "@io_bazel_rules_closure//closure:defs.bzl",
+    "closure_js_proto_library",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "full_path",
@@ -244,6 +248,14 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
+closure_js_proto_library(
+    name = "example_js_protos",
+    srcs = [
+        "example/example.proto",
+        "example/feature.proto",
+    ],
+)
+
 exports_files([
     "framework/types.proto",
 ])
-- 
GitLab


From ac9be81b06e9bf93d8ba5f37983c3dd1163a190e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 20:08:30 -0700
Subject: [PATCH 0665/1262] Fix description of DynamicUpdateSlice.

PiperOrigin-RevId: 192550101
---
 .../docs_src/performance/xla/operation_semantics.md      | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 217ab596b7..3963d5faa7 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -854,12 +854,13 @@ calculation of 'start_indices') is currently implementation-defined.
 | `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
 | `update`        | `ComputationDataHandle` | N dimensional array of type T    |
 :                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape    :
+:                 :                         : Each dimension of update shape   :
 :                 :                         : must be strictly greater than    :
 :                 :                         : zero, and start + update must be :
-:                 :                         : less than operand size for each  :
-:                 :                         : dimension to avoid generating    :
-:                 :                         : out-of-bounds update indices.    :
+:                 :                         : less than or equal to the operand:
+:                 :                         : size for each dimension to avoid :
+:                 :                         : generating out-of-bounds update  :
+:                 :                         : indices.                         :
 | `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
 :                 :                         : containing the starting indices  :
 :                 :                         : of the slice for each dimension. :
-- 
GitLab


From 89987f232fd9ff3e6cdab43bc7056f55cb4adf8c Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 20:15:18 -0700
Subject: [PATCH 0666/1262] Added a TODO to cover CreateMNISTDatasetFunctions
 in Windows tests

---
 tensorflow/c/c_api_experimental.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index a110770921..4883e61642 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -7085,6 +7085,7 @@ static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
     const char* file_path, int batch_size, std::string* dataset_name,
     TF_Status* status) {
 #if defined(PLATFORM_WINDOWS)
+  // TODO(ashankar): cover CreateMNISTDatasetFunctions in Windows tests.
   status->status = tensorflow::errors::Unimplemented(
       "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
       "is not implemented for Windows");
-- 
GitLab


From 28fdb0a6b1714a634ead04602732b1c75212fb94 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 11 Apr 2018 20:19:27 -0700
Subject: [PATCH 0667/1262] Fix double linkage of static variables

---
 tensorflow/contrib/tensorrt/BUILD             | 30 +++++++++++++++++--
 .../resources/trt_resource_manager.cc         |  8 +++++
 .../tensorrt/resources/trt_resource_manager.h |  6 +---
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2f316767b3..2a55a49097 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -27,6 +27,11 @@ load(
     "if_tensorrt",
 )
 
+load(
+  "//tensorflow/core:platform/default/build_config_root.bzl",
+  "if_static",
+)
+
 tf_cuda_cc_test(
     name = "tensorrt_test_cc",
     size = "small",
@@ -52,7 +57,7 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        ":trt_engine_op_kernel",
+        # ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -183,16 +188,34 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
 )
 
+tf_cuda_library(
+  name = "trt_resource_manager_impl",
+  srcs = [
+    "resources/trt_resource_manager.cc",
+    ],
+  hdrs = [
+        "resources/trt_resource_manager.h",
+    ],
+    deps = [
+        ":trt_logging",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
 tf_cuda_library(
     name = "trt_resources",
     srcs = [
         "resources/trt_int8_calibrator.cc",
-        "resources/trt_resource_manager.cc",
     ],
     hdrs = [
         "resources/trt_int8_calibrator.h",
@@ -206,6 +229,8 @@ tf_cuda_library(
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
+    ]) + if_static([
+      ":trt_resource_manager_impl",
     ]),
 )
 
@@ -224,6 +249,7 @@ tf_cuda_library(
         ":segment",
         ":trt_logging",
         ":trt_resources",
+        ":trt_resource_manager_impl",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core:framework",
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index e663eed4dd..b9a5a00366 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,6 +19,14 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
+std::shared_ptr<tensorflow::tensorrt::TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance()
+{
+  static std::shared_ptr<tensorflow::tensorrt::TRTResourceManager> instance_(
+    new tensorflow::tensorrt::TRTResourceManager);
+  return instance_;
+}
+
 std::shared_ptr<tensorflow::ResourceMgr>
 tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
   // mutex is held for lookup only. Most instantiations where mutex will be held
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index 5f8ad491d3..bc15b51e05 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -29,11 +29,7 @@ class TRTResourceManager {
   TRTResourceManager() = default;
 
  public:
-  static std::shared_ptr<TRTResourceManager> instance() {
-    static std::shared_ptr<TRTResourceManager> instance_(
-        new TRTResourceManager);
-    return instance_;
-  }
+  static std::shared_ptr<TRTResourceManager> instance();
   // returns a manager for given op, if it doesn't exists it creates one
   std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
 
-- 
GitLab


From f49a5f2aa35a16eab4625fdc4b2a0acef3933e34 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 11 Apr 2018 21:42:48 -0700
Subject: [PATCH 0668/1262] Disable Grappler optimizer for tests

---
 tensorflow/python/framework/test_util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index bf00fa6439..990fa429a1 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -974,6 +974,8 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
+      config.graph_options.rewrite_options.arithmetic_optimization = (
+          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
-- 
GitLab


From 96aba78b0cdb2b9ad316d3c68a52bc2284ea638c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 22:37:57 -0700
Subject: [PATCH 0669/1262] Enable an r2 reduce window test case.

PiperOrigin-RevId: 192560111
---
 tensorflow/compiler/xla/tests/reduce_window_test.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 425fef7da7..6a054a5dd3 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -1066,12 +1066,9 @@ struct R2ReduceWindowTestData {
     {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4},
      /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
      /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
-// TODO(b/76025683): These tests fail on TPU.
-#if defined(XLA_TEST_BACKEND_CPU) || defined(XLA_TEST_BACKEND_GPU)
     {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
      /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
      /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
-#endif
 };
 
 string R2ReduceWindowTestDataToString(
-- 
GitLab


From b79de285e04f995eb0220583d6ed333b33a26d7f Mon Sep 17 00:00:00 2001
From: Hovhannes Harutyunyan <hovhannes.harutyunyan@picsart.com>
Date: Thu, 12 Apr 2018 10:04:40 +0400
Subject: [PATCH 0670/1262] Remove redefined BroadcastDiv function

---
 .../internal/reference/reference_ops.h        | 41 -------------------
 1 file changed, 41 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 4509db06fd..750737a730 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1339,47 +1339,6 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
-// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-template <typename T>
-void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest
-  // stride, typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for
-  // the best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
-}
-
 inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
-- 
GitLab


From 09ab7fc83e3b2b66a2d1ff68ac6ad1b56a61fcd6 Mon Sep 17 00:00:00 2001
From: Hovhannes Harutyunyan <hovhannes.harutyunyan@picsart.com>
Date: Thu, 12 Apr 2018 10:54:41 +0400
Subject: [PATCH 0671/1262] Fixe merge issue

---
 .../lite/kernels/internal/reference/reference_ops.h  | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index e8d7da73a2..0fc88b2b8e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1323,18 +1323,6 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] / input2_data[i], output_activation_min,
-        output_activation_max);
-  }
-}
-
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
-- 
GitLab


From 6ca5554b5a87cc5cb784d359ba03c5860ac8ead2 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 12 Apr 2018 00:24:52 -0700
Subject: [PATCH 0672/1262] Trying to fix Windows release build for
 libtensorflow

---
 tensorflow/c/c_api_experimental.cc | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 4883e61642..073dc019c7 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -190,12 +190,6 @@ library {
 //  be deleted by calling TF_DeleteFunction.
 static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
     const char* file_path, std::string* dataset_name, TF_Status* status) {
-#if defined(PLATFORM_WINDOWS)
-  status->status = tensorflow::errors::Unimplemented(
-      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
-      "is not implemented for Windows");
-  return std::vector<UniqueFuncPtr>();
-#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -7074,7 +7068,6 @@ library {
         DCHECK(found);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
-#endif
 }
 
 //  On success, returns a set of TF_Function instances encoding a dataset
@@ -7084,13 +7077,6 @@ library {
 static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
     const char* file_path, int batch_size, std::string* dataset_name,
     TF_Status* status) {
-#if defined(PLATFORM_WINDOWS)
-  // TODO(ashankar): cover CreateMNISTDatasetFunctions in Windows tests.
-  status->status = tensorflow::errors::Unimplemented(
-      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
-      "is not implemented for Windows");
-  return std::vector<UniqueFuncPtr>();
-#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -8220,7 +8206,6 @@ library {
         DCHECK(found_batch_size);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
-#endif
 }
 
 // Adds the input functions to `graph`.  On success, returns the created
@@ -8315,6 +8300,19 @@ TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
 TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
     TF_Graph* graph, const char* file_path, int batch_size,
     unsigned char is_mnist, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  // TODO(ashankar): get these functions working on Windows.
+  if (is_mnist) {
+    status->status = tensorflow::errors::Unimplemented(
+        "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+        "is not implemented for Windows");
+  } else {
+    status->status = tensorflow::errors::Unimplemented(
+        "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+        "is not implemented for Windows");
+  }
+  return nullptr
+#else
   tensorflow::Status s;
 
   std::string dataset_name;
@@ -8356,4 +8354,5 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
           << graph->graph.ToGraphDefDebug().DebugString();
 
   return getnext_node;
+#endif
 }
-- 
GitLab


From 2e0cc141b7925d9c9e4c359ccf56e7485623c483 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 12 Apr 2018 00:31:20 -0700
Subject: [PATCH 0673/1262] Remove CreateImagenetDatasetFunctions and
 CreateMNISTDatasetFunctions on Windows

---
 tensorflow/c/c_api_experimental.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 073dc019c7..a4af0b721e 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -7070,6 +7070,7 @@ library {
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 }
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads an MNIST file dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -8207,7 +8208,9 @@ library {
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 }
+#endif
 
+#if not defined(PLATFORM_WINDOWS)
 // Adds the input functions to `graph`.  On success, returns the created
 // IteratorGetNext node.
 static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph(
@@ -8272,6 +8275,7 @@ static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph(
   VLOG(1) << "Output graph: " << graph->graph.ToGraphDefDebug().DebugString();
   return ToTF_Operation(getnext_node);
 }
+#endif
 
 TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
                                                      TF_Status* status) {
-- 
GitLab


From 9397987fe1fd8a632286fc1a2c2fe63bb8b4e26b Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 12 Apr 2018 00:39:45 -0700
Subject: [PATCH 0674/1262] Fix removing incorrect function

---
 tensorflow/c/c_api_experimental.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index a4af0b721e..97ec09e225 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -184,6 +184,7 @@ library {
   return std::move(functions[0]);
 }
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -7069,6 +7070,7 @@ library {
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 }
+#endif
 
 #if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
@@ -8210,7 +8212,6 @@ library {
 }
 #endif
 
-#if not defined(PLATFORM_WINDOWS)
 // Adds the input functions to `graph`.  On success, returns the created
 // IteratorGetNext node.
 static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph(
@@ -8275,7 +8276,6 @@ static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph(
   VLOG(1) << "Output graph: " << graph->graph.ToGraphDefDebug().DebugString();
   return ToTF_Operation(getnext_node);
 }
-#endif
 
 TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
                                                      TF_Status* status) {
-- 
GitLab


From e688642372893d9e51be4119342f787560d8e644 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 04:40:42 -0700
Subject: [PATCH 0675/1262] Make DType, TensorShape, and Dimension "reducable"
 for pickling purposes.

PiperOrigin-RevId: 192591402
---
 tensorflow/python/framework/dtypes.py          |  3 +++
 tensorflow/python/framework/dtypes_test.py     |  9 +++++++++
 tensorflow/python/framework/tensor_shape.py    |  6 ++++++
 .../python/framework/tensor_shape_test.py      | 18 ++++++++++++++++++
 4 files changed, 36 insertions(+)

diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index a31c424263..51ff5171a3 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -297,6 +297,9 @@ class DType(object):
   def __hash__(self):
     return self._type_enum
 
+  def __reduce__(self):
+    return as_dtype, (self.name,)
+
   @property
   def size(self):
     if (self._type_enum == types_pb2.DT_VARIANT or
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index e49e2fda5d..e55783bb79 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,15 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testReduce(self):
+    for enum in dtypes._TYPE_TO_STRING:
+      dtype = dtypes.DType(enum)
+      ctor, args = dtype.__reduce__()
+      self.assertEquals(ctor, dtypes.as_dtype)
+      self.assertEquals(args, (dtype.name,))
+      reconstructed = ctor(*args)
+      self.assertEquals(reconstructed, dtype)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index af2a5b1a7e..00f256cd45 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -456,6 +456,9 @@ class Dimension(object):
     else:
       return self._value >= other.value
 
+  def __reduce__(self):
+    return Dimension, (self._value,)
+
 
 def as_dimension(value):
   """Converts the given value to a Dimension.
@@ -928,6 +931,9 @@ class TensorShape(object):
       return True
     return self._dims != other.dims
 
+  def __reduce__(self):
+    return TensorShape, (self._dims,)
+
 
 def as_shape(shape):
   """Converts the given object to a TensorShape."""
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 4e8ce4d889..498574eded 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -192,6 +192,14 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(nine % 4, 1)
     self.assertEqual(4 % nine, 4)
 
+  def testReduce(self):
+    dim = tensor_shape.Dimension(5)
+    ctor, args = dim.__reduce__()
+    self.assertEquals(ctor, tensor_shape.Dimension)
+    self.assertEquals(args, (5,))
+    reconstructed = ctor(*args)
+    self.assertEquals(reconstructed, dim)
+
 
 class ShapeTest(test_util.TensorFlowTestCase):
 
@@ -417,5 +425,15 @@ class ShapeTest(test_util.TensorFlowTestCase):
     self.assertAllEqual([2, None, 4], tensor_shape.TensorShape(
         (2, None, 4)).as_list())
 
+  def testReduce(self):
+    shape = tensor_shape.TensorShape([2, 3])
+    ctor, args = shape.__reduce__()
+    self.assertEquals(ctor, tensor_shape.TensorShape)
+    self.assertEquals(args, ([tensor_shape.Dimension(2),
+                              tensor_shape.Dimension(3)],))
+    reconstructed = ctor(*args)
+    self.assertEquals(reconstructed, shape)
+
+
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From cf542ae4174d954ad21ab255bc0fdb81326e4443 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 06:22:30 -0700
Subject: [PATCH 0676/1262] Special-case the name scoping for operator methods.
 TensorFlow disallows top-level name scopes to begin with underscores. Also
 use the transformer scope information to get to the enclosing function name.

PiperOrigin-RevId: 192600256
---
 .../autograph/converters/name_scopes.py       | 38 ++++++++-----
 .../autograph/converters/name_scopes_test.py  | 55 ++++++++++++++-----
 2 files changed, 65 insertions(+), 28 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/name_scopes.py b/tensorflow/contrib/autograph/converters/name_scopes.py
index 2a3f474360..280bc4c314 100644
--- a/tensorflow/contrib/autograph/converters/name_scopes.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes.py
@@ -28,22 +28,34 @@ from tensorflow.contrib.autograph.pyct import transformer
 class FunctionNameScopeTransformer(transformer.Base):
   """Wrap a function body with a `name_scope` of the function name."""
 
-  def __init__(self, context):
-    super(FunctionNameScopeTransformer, self).__init__(context)
-    self._function_level = 0
+  def _name_for_current_scope(self):
+    innermost = self.enclosing_entities[-1]
+    if len(self.enclosing_entities) > 1:
+      parent = self.enclosing_entities[-2]
+      if isinstance(parent, gast.ClassDef):
+        # Methods also take the name of their class.
+        name = '%s/%s' % (parent.name, innermost.name)
+      else:
+        name = innermost.name
+    else:
+      name = innermost.name
+
+    # Sanitize the name.
+    # See https://www.tensorflow.org/api_docs/python/tf/Graph#name_scope
+    # TensorFlow doesn't like leading underscores at the top level.
+    while name[0] == '_':
+      name = name[1:]
+    return name
 
   def visit_FunctionDef(self, node):
-    self._function_level += 1
-    try:
-      self.generic_visit(node)
-    finally:
-      self._function_level -= 1
-    scope_name = node.name
-    if self._function_level == 0 and self.context.owner_type is not None:
-      scope_name = '{}/{}'.format(self.context.owner_type.__name__, scope_name)
+    self.generic_visit(node)
+    template = """
+      with tf.name_scope(scope_name):
+        body
+    """
     node.body = templates.replace(
-        'with tf.name_scope(scope_name): body',
-        scope_name=gast.Str(scope_name),
+        template,
+        scope_name=gast.Str(self._name_for_current_scope()),
         body=node.body)
     return node
 
diff --git a/tensorflow/contrib/autograph/converters/name_scopes_test.py b/tensorflow/contrib/autograph/converters/name_scopes_test.py
index 61e5db2af8..2c2b6bbbec 100644
--- a/tensorflow/contrib/autograph/converters/name_scopes_test.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes_test.py
@@ -38,29 +38,29 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
     node = name_scopes.transform(node, self.ctx)
 
     with self.compiled(node, ops.name_scope) as result:
-      result_op = result.test_fn(constant_op.constant([1, 2, 3]))
+      result_op = result.test_fn(constant_op.constant(1))
       self.assertIn('test_fn/', result_op.op.name)
 
   def test_nested_name(self):
 
     def test_fn(l):
 
-      def body(i):
-        return i**2
+      def inner_fn(i):
+        return i ** 2
 
-      l += [4]
-      return body(l)
+      l += 4
+      return inner_fn(l)
 
     node = self.parse_and_analyze(test_fn, {})
     node = name_scopes.transform(node, self.ctx)
 
     with self.compiled(node, ops.name_scope) as result:
-      result_op = result.test_fn(constant_op.constant([1, 2, 3]))
+      result_op = result.test_fn(constant_op.constant(1))
       first_result_input_name = result_op.op.inputs[0].name
       second_result_input_name = result_op.op.inputs[1].name
       self.assertIn('test_fn/', first_result_input_name)
-      self.assertNotIn('body/', first_result_input_name)
-      self.assertIn('test_fn/body/', second_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('test_fn/inner_fn/', second_result_input_name)
 
   def test_class_name(self):
 
@@ -68,11 +68,11 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
 
       def test_fn(self, l):
 
-        def body(i):
-          return i**2
+        def inner_fn(i):
+          return i ** 2
 
-        l += [4]
-        return body(l)
+        l += 4
+        return inner_fn(l)
 
     # Note that 'TestClass' was needed in the namespace here.
     node = self.parse_and_analyze(
@@ -80,12 +80,37 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
     node = name_scopes.transform(node, self.ctx)
 
     with self.compiled(node, ops.name_scope) as result:
-      result_op = result.TestClass().test_fn(constant_op.constant([1, 2, 3]))
+      result_op = result.TestClass().test_fn(constant_op.constant(1))
       first_result_input_name = result_op.op.inputs[0].name
       second_result_input_name = result_op.op.inputs[1].name
       self.assertIn('TestClass/test_fn/', first_result_input_name)
-      self.assertNotIn('body/', first_result_input_name)
-      self.assertIn('TestClass/test_fn/body/', second_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('TestClass/test_fn/inner_fn/', second_result_input_name)
+
+  def test_special_name(self):
+
+    class TestClass(object):
+
+      def __call__(self, l):
+
+        def inner_fn(i):
+          return i ** 2
+
+        l += 4
+        return inner_fn(l)
+
+    # Note that 'TestClass' was needed in the namespace here.
+    node = self.parse_and_analyze(
+        TestClass.__call__, {'TestClass': TestClass}, owner_type=TestClass)
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      result_op = result.__call__(TestClass(), constant_op.constant(1))
+      first_result_input_name = result_op.op.inputs[0].name
+      second_result_input_name = result_op.op.inputs[1].name
+      self.assertIn('call__/', first_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('call__/inner_fn/', second_result_input_name)
 
 
 if __name__ == '__main__':
-- 
GitLab


From e52563a43a286042142c98fa1900ed0015d45c3f Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 12 Apr 2018 08:48:19 -0700
Subject: [PATCH 0677/1262] Remove redundant if-statement

---
 tensorflow/c/c_api_experimental.cc | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 97ec09e225..0c3bb680e7 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -8306,15 +8306,9 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
     unsigned char is_mnist, TF_Status* status) {
 #if defined(PLATFORM_WINDOWS)
   // TODO(ashankar): get these functions working on Windows.
-  if (is_mnist) {
-    status->status = tensorflow::errors::Unimplemented(
-        "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
-        "is not implemented for Windows");
-  } else {
-    status->status = tensorflow::errors::Unimplemented(
-        "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
-        "is not implemented for Windows");
-  }
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
   return nullptr
 #else
   tensorflow::Status s;
-- 
GitLab


From b0978aa81d304a52516362432bc467462b4c7520 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 08:49:47 -0700
Subject: [PATCH 0678/1262] Updating tests containing graphs with Variables so
 that they Evaluate the original and optimized graphs and check if the outputs
 are same.

PiperOrigin-RevId: 192616402
---
 .../optimizers/constant_folding_test.cc       | 127 ++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 31abe43846..36625b68b7 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -933,6 +933,17 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
     }
   }
   EXPECT_EQ(1, found);
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>({5, 7});
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>({11, 13});
+
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, item.fetch.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, item.fetch.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
@@ -1095,6 +1106,17 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"v1", v1_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch, {{"v1", v1_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
@@ -1234,6 +1256,18 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
     }
   }
   EXPECT_EQ(2, found);
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+  v_ctrl_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, MergeNodes) {
@@ -1374,6 +1408,16 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SplitVRemoval) {
@@ -1416,6 +1460,16 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
@@ -1450,6 +1504,17 @@ TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   AddNode("out2", "Identity", {"s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(2, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
@@ -1486,6 +1551,16 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
     CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   }
   {  // size = {-1, -1}
     tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
@@ -1524,6 +1599,16 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
     CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   }
 }
 
@@ -1602,6 +1687,16 @@ TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
   AddNode("out", "Add", {"p1", "p2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_INT32>(TensorShape({4, 6}));
+  auto in2_t = GenerateRandomTensor<DT_INT32>(TensorShape({2, 2}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
@@ -1632,6 +1727,16 @@ TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_INT32>(TensorShape({2, 3}));
+  auto in2_t = GenerateRandomTensor<DT_INT32>(TensorShape({1, 2, 3, 1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, NoOpReduction) {
@@ -1666,6 +1771,13 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
     }
   }
   EXPECT_TRUE(found);
+
+  auto v_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 7}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"v", v_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"v", v_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NoOpReshape) {
@@ -1744,6 +1856,21 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({17}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({17, 1}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 5, 5}));
+  auto v4_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 5, 5}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch,
+                    {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}, {"v4", v4_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch,
+                    {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}, {"v4", v4_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, Packing) {
-- 
GitLab


From cbea75338433bd36b22742abed13e36bb8cbdc84 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 09:44:16 -0700
Subject: [PATCH 0679/1262] Fixing dependencies.

PiperOrigin-RevId: 192624191
---
 tensorflow/python/tools/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index cc2884a4f6..84d20f8e36 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -38,6 +38,7 @@ py_library(
     deps = [
         ":saved_model_utils",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
         "//tensorflow/python:parsing_ops",
-- 
GitLab


From 8a247976484173059aedc17bfd8d770b8d1a70e1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 09:46:34 -0700
Subject: [PATCH 0680/1262] Collective Ops Part 3

BaseCollectiveExecutor and RingReducer.

This change is part of a series of changes introducing infrastructure
for collective ops and initial implementations of reduction and broadcast.

PiperOrigin-RevId: 192624521
---
 tensorflow/core/BUILD                         |  33 +
 .../base_collective_executor.cc               | 257 ++++++++
 .../common_runtime/base_collective_executor.h | 144 +++++
 .../common_runtime/collective_executor_mgr.cc |  38 +-
 tensorflow/core/common_runtime/dma_helper.h   |   3 +
 .../core/common_runtime/ring_reducer.cc       | 542 ++++++++++++++++
 tensorflow/core/common_runtime/ring_reducer.h | 146 +++++
 .../core/common_runtime/ring_reducer_test.cc  | 606 ++++++++++++++++++
 .../test_collective_executor_mgr.h            | 116 ++++
 9 files changed, 1851 insertions(+), 34 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/base_collective_executor.cc
 create mode 100644 tensorflow/core/common_runtime/base_collective_executor.h
 create mode 100644 tensorflow/core/common_runtime/ring_reducer.cc
 create mode 100644 tensorflow/core/common_runtime/ring_reducer.h
 create mode 100644 tensorflow/core/common_runtime/ring_reducer_test.cc
 create mode 100644 tensorflow/core/common_runtime/test_collective_executor_mgr.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 55b0040b52..118955219b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1064,6 +1064,7 @@ cc_library(
     hdrs = [
         "common_runtime/function_testlib.h",
         "common_runtime/kernel_benchmark_testlib.h",
+        "common_runtime/test_collective_executor_mgr.h",
         "framework/fake_input.h",
         "framework/function_testlib.h",
         "framework/shape_inference_testutil.h",
@@ -2261,6 +2262,7 @@ tf_cuda_library(
 
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
+    "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
     "common_runtime/buf_rendezvous.h",
     "common_runtime/build_graph_options.h",
@@ -2289,6 +2291,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/renamed_device.h",
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
+    "common_runtime/ring_reducer.h",
     "common_runtime/scoped_allocator.h",
     "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
@@ -2306,6 +2309,7 @@ tf_cuda_library(
     srcs = [
         "common_runtime/accumulate_n_optimizer.cc",
         "common_runtime/allocator_retry.cc",
+        "common_runtime/base_collective_executor.cc",
         "common_runtime/bfc_allocator.cc",
         "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
@@ -2336,6 +2340,7 @@ tf_cuda_library(
         "common_runtime/renamed_device.cc",
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
+        "common_runtime/ring_reducer.cc",
         "common_runtime/scoped_allocator.cc",
         "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/session.cc",
@@ -3101,6 +3106,34 @@ tf_cc_test(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "ring_reducer_test",
+    size = "medium",
+    srcs = [
+        "common_runtime/ring_reducer_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
new file mode 100644
index 0000000000..f6332fabdb
--- /dev/null
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -0,0 +1,257 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+#define VALUE_IN_DEBUG_STRING false
+
+namespace tensorflow {
+/*static*/
+int64 CollectiveAdapter::AlignedChunkElts(int64 elt_bytes, int64 total_elts,
+                                          int64 num_chunks) {
+  DCHECK_GT(num_chunks, 0);
+  int64 base_chunk_elts = (total_elts + (num_chunks - 1)) / num_chunks;
+  if (EIGEN_MAX_ALIGN_BYTES == 0) return base_chunk_elts;
+  if (EIGEN_MAX_ALIGN_BYTES <= elt_bytes) {
+    // Tolerate weird small values of EIGEN_MAX_ALIGN_BYTES
+    DCHECK_EQ(0, elt_bytes % EIGEN_MAX_ALIGN_BYTES);
+    return base_chunk_elts;
+  }
+  // elt_bytes < EIGEN_MAX_ALIGN_BYTES, which
+  // must be a common multiple of the various atomic data types.
+  DCHECK_EQ(0, EIGEN_MAX_ALIGN_BYTES % elt_bytes)
+      << "total_elts=" << total_elts << " num_chunks=" << num_chunks
+      << " EIGEN_MAX_ALIGN_BYTES=" << EIGEN_MAX_ALIGN_BYTES
+      << " elt_bytes=" << elt_bytes;
+  // Round bytes per chunk up to the next multiple of EIGEN_MAX_ALIGN_BYTES.
+  int64 chunk_bytes = base_chunk_elts * elt_bytes;
+  int64 diff =
+      (chunk_bytes < EIGEN_MAX_ALIGN_BYTES)
+          ? (EIGEN_MAX_ALIGN_BYTES - chunk_bytes)
+          : (EIGEN_MAX_ALIGN_BYTES - (chunk_bytes % EIGEN_MAX_ALIGN_BYTES));
+  CHECK_EQ(0, diff % elt_bytes);
+  base_chunk_elts += (diff / elt_bytes);
+  DCHECK_EQ(0, ((base_chunk_elts * elt_bytes) % EIGEN_MAX_ALIGN_BYTES))
+      << "total_elts=" << total_elts << " num_chunks=" << num_chunks
+      << " EIGEN_MAX_ALIGN_BYTES=" << EIGEN_MAX_ALIGN_BYTES
+      << " base_chunk_elts=" << base_chunk_elts << " elt_bytes=" << elt_bytes;
+  return base_chunk_elts;
+}
+
+namespace {
+template <typename T>
+class CollectiveAdapterImpl : public CollectiveAdapter {
+ public:
+  // Takes ownership of output and prepares to properly alias its chunks.
+  // Ownership is taken because the shape may temporarily change.
+  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator)
+      : output_(std::move(*output)),
+        dt_(output_.dtype()),
+        old_shape_(output_.shape()),
+        num_chunks_(num_chunks),
+        allocator_(allocator),
+        total_elts_(output_.NumElements()),
+        chunk_elts_(AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)),
+        data_start_(reinterpret_cast<T*>(DMAHelper::base(&output_))),
+        data_end_(data_start_ + total_elts_) {
+    CHECK_GT(chunk_elts_, 0);
+    Flatten();
+  }
+
+  ~CollectiveAdapterImpl() override {}
+
+  const Tensor& Value() const override { return output_; }
+
+  // If necessary, flatten output.
+  void Flatten() {
+    if (old_shape_.dims() > 1) {
+      TensorShape new_shape = TensorShape({old_shape_.num_elements()});
+      DMAHelper::UnsafeSetShape(&output_, new_shape);
+    }
+  }
+
+  void ConsumeFinalValue(Tensor* output) override {
+    if (old_shape_ != output_.shape()) {
+      DMAHelper::UnsafeSetShape(&output_, old_shape_);
+    }
+    *output = std::move(output_);
+  }
+
+  // Number of T elements in a particular chunk.
+  inline int64 ChunkElts(int i) const {
+    DCHECK_LT(i, num_chunks_);
+    const T* chunk_start = std::min(data_end_, data_start_ + i * chunk_elts_);
+    const T* chunk_end = std::min(data_end_, chunk_start + chunk_elts_);
+    return chunk_end - chunk_start;
+  }
+
+  int64 ChunkBytes(int i) const override { return sizeof(T) * ChunkElts(i); }
+
+  // Returns a new Tensor that aliases the required chunk.
+  Tensor ChunkAlias(int i) override {
+    int64 start = chunk_elts_ * i;
+    int64 num_elts = ChunkElts(i);
+    // If this chunk is empty the prior chunk might also be short
+    // so always take an empty slice from the front of the tensor
+    // to avoid an illegal offset check failure somewhere.
+    return (num_elts > 0) ? output_.Slice(start, start + num_elts)
+                          : output_.Slice(0, 0);
+  }
+
+  Tensor TempChunk(int i) const override {
+    AllocationAttributes empty;
+    return Tensor(allocator_, dt_, {ChunkElts(i)}, empty);
+  }
+
+  string DebugString() const override {
+    return strings::StrCat(
+        "base addr ", reinterpret_cast<int64>(DMAHelper::base(&output_)),
+        " num_chunks ", num_chunks_, " total_elts ", total_elts_, " chunk_elts",
+        chunk_elts_, " value ",
+        VALUE_IN_DEBUG_STRING ? output_.SummarizeValue(1024) : "<hidden>");
+  }
+
+  string TBounds(const Tensor& t) const override {
+    int64 base_addr = reinterpret_cast<int64>(DMAHelper::base(&t));
+    return strings::StrCat("(", base_addr, ", ", (base_addr + t.TotalBytes()),
+                           ")");
+  }
+
+  Tensor Scalar(int v) const override {
+    Tensor t(dt_, TensorShape({}));
+    t.scalar<T>()() = v;
+    return t;
+  }
+
+  Tensor Scalar(Allocator* a) const override {
+    Tensor t(a, dt_, TensorShape({}));
+    return t;
+  }
+
+  Tensor output_;
+  const DataType dt_;
+  const TensorShape old_shape_;
+  const int64 num_chunks_;
+  Allocator* allocator_;
+  const int64 total_elts_;
+  const int64 chunk_elts_;
+  const T* data_start_;
+  const T* data_end_;
+};
+
+}  // namespace
+
+CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
+                                         Allocator* allocator) {
+  switch (output->dtype()) {
+    case DT_FLOAT:
+      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator);
+      break;
+    case DT_DOUBLE:
+      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator);
+      break;
+    case DT_INT32:
+      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator);
+      break;
+    case DT_INT64:
+      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported type " << output->dtype()
+                 << " to MakeCollectiveAdapter";
+      return nullptr;
+  }
+}
+
+BaseCollectiveExecutor::~BaseCollectiveExecutor() {}
+
+void BaseCollectiveExecutor::StartAbort(const Status& s) {
+  LOG(WARNING) << "BaseCollectiveExecutor::StartAbort " << s;
+  remote_access_->StartAbort(s);
+}
+
+void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
+                                          const CollectiveParams& col_params,
+                                          const string& exec_key,
+                                          StatusCallback done) {
+  const Tensor* input = &ctx->input(0);
+  Tensor* output = ctx->mutable_output(0);
+  string error;
+  switch (col_params.instance.type) {
+    case REDUCTION_COLLECTIVE: {
+      // TODO(tucker): support other reduction algorithms,
+      // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc.
+      RingReducer* reducer =
+          CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_,
+                        input, output, &error);
+      if (!reducer) {
+        done(errors::Internal(error));
+        return;
+      }
+      // Run in an I/O thread, so as not to starve the executor threads.
+      // TODO(tucker): Instead of forking every per-device Collective
+      // Op off into its own thread, consider queuing them on a
+      // fixed-size thread-pool dedicated to running CollectiveOps.
+      SchedClosure([reducer, done]() {
+        reducer->Run([reducer, done](const Status& s) {
+          done(s);
+          delete reducer;
+        });
+      });
+    } break;
+    case BROADCAST_COLLECTIVE:
+      done(errors::Internal("Collective Broadcast unimplemented"));
+      break;
+    default:
+      done(errors::Internal("Unimplemented CollectiveType ",
+                            col_params.instance.type));
+  }
+}
+
+RingReducer* BaseCollectiveExecutor::CreateReducer(
+    OpKernelContext* ctx, OpKernelContext::Params* params,
+    const CollectiveParams& col_params, const string& exec_key, int64 step_id,
+    const Tensor* input, Tensor* output, string* error) {
+  switch (col_params.instance.data_type) {
+    case DT_INT32:
+      if (col_params.group.device_type == DEVICE_GPU) {
+        *error =
+            "Collective Reduce does not support datatype DT_INT32 on "
+            "DEVICE_GPU";
+        return nullptr;
+      }
+      TF_FALLTHROUGH_INTENDED;
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT64:
+      return new RingReducer(this, dev_mgr_, ctx, params, col_params, exec_key,
+                             step_id, input, output);
+      break;
+    default:
+      *error = strings::StrCat("Collective Reduce does not support datatype ",
+                               col_params.instance.data_type);
+      return nullptr;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
new file mode 100644
index 0000000000..58eaf31f71
--- /dev/null
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
+
+#include <string>
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+class RingReducer;
+
+// Helper interface that aliases regular subfields of a Tensor as separate
+// Tensors for in-place update.
+class CollectiveAdapter {
+ public:
+  virtual ~CollectiveAdapter() {}
+
+  // Move the backing tensor to 'output' with its original storage and
+  // shape. After this call this CollectiveAdapter object should be
+  // deleted immediately without calling any of its other methods.
+  virtual void ConsumeFinalValue(Tensor* output) = 0;
+
+  // const access to entire intermediate value for debugging
+  virtual const Tensor& Value() const = 0;
+
+  // Returns tensor for chunk i which aliases the backing buffer.
+  virtual Tensor ChunkAlias(int i) = 0;
+
+  // Returns tensor allocated on the same device but with its own
+  // separate backing buffer.  Will have same type and size as
+  // chunk i.
+  virtual Tensor TempChunk(int i) const = 0;
+
+  // Bytes in chunk i
+  virtual int64 ChunkBytes(int i) const = 0;
+
+  // Generate a CPU RAM scalar tensor of the same DataType as the
+  // backing tensor with the given integer value.
+  virtual Tensor Scalar(int v) const = 0;
+
+  // Generate a scalar tensor of same DataType and on the same device
+  // as the backing tensor.
+  virtual Tensor Scalar(Allocator* a) const = 0;
+
+  // Debugging string describing buffer location
+  virtual string TBounds(const Tensor& t) const = 0;
+
+  virtual string DebugString() const = 0;
+
+  // Computes the number of elements per alias chunk tensor.
+  //
+  // A CHECK in tensor.cc expects that the memory buffer backing a
+  // Tensor will be aligned according to EIGEN_MAX_ALIGN_BYTES.  To
+  // ensure that all chunk aliasing Tensors maintain this alignment we
+  // need to pick a chunk size that preserves it.  Note than in extreme
+  // cases (impractical, but possible with very small tensors) one or
+  // more tail chunks can end up emptby.
+  static int64 AlignedChunkElts(int64 elt_bytes, int64 total_elts,
+                                int64 num_chunks);
+};
+
+// Create a CollectiveAdaptor wrapping 'output', specialized to its
+// data-type and shape.
+CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
+                                         Allocator* allocator);
+
+// Default implementation of CollectiveExecutor.  Delegates the actual
+// work of moving data to a class specialized for the operation type,
+// arguments and device+interconnect topology.
+class BaseCollectiveExecutor : public CollectiveExecutor {
+ public:
+  BaseCollectiveExecutor(CollectiveExecutorMgrInterface* cem,
+                         PerStepCollectiveRemoteAccess* remote_access,
+                         int64 step_id, const DeviceMgr* dev_mgr)
+      : CollectiveExecutor(cem),
+        step_id_(step_id),
+        dev_mgr_(dev_mgr),
+        remote_access_(remote_access) {}
+
+  ~BaseCollectiveExecutor() override;
+
+  void StartAbort(const Status& s) override;
+
+  void ExecuteAsync(OpKernelContext* ctx, const CollectiveParams& col_params,
+                    const string& exec_key, StatusCallback done) override;
+
+  PerStepCollectiveRemoteAccess* remote_access() override {
+    return remote_access_.get();
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    remote_access_->RecvFromPeer(peer_device, peer_task, peer_is_local, key,
+                                 to_device, to_device_ctx, to_alloc_attr,
+                                 to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    remote_access_->PostToPeer(peer_device, peer_task, key, from_device,
+                               from_device_ctx, from_alloc_attr, from_tensor,
+                               client_locality, done);
+  }
+
+ protected:
+  const int64 step_id_;
+  const DeviceMgr* dev_mgr_;  // Not owned.
+  std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
+
+ private:
+  RingReducer* CreateReducer(OpKernelContext* ctx,
+                             OpKernelContext::Params* params,
+                             const CollectiveParams& col_params,
+                             const string& exec_key, int64 step_id,
+                             const Tensor* input, Tensor* output,
+                             string* error);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
index a5c4946e58..e07829b286 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/collective_executor_mgr.h"
 
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/collective_rma_local.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -21,39 +22,6 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
-namespace {
-// TODO(tucker): Temporary class just until a real CollectiveExecutor
-// implementation is submitted in a later CL.
-class DummyCollectiveExecutor : public CollectiveExecutor {
- public:
-  explicit DummyCollectiveExecutor(CollectiveExecutorMgr* ce_mgr)
-      : CollectiveExecutor(ce_mgr) {}
-
-  ~DummyCollectiveExecutor() override {}
-
-  void RecvFromPeer(const string& peer_device, const string& peer_task,
-                    bool peer_is_local, const string& key, Device* to_device,
-                    DeviceContext* to_device_ctx,
-                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
-                    const DeviceLocality& client_locality,
-                    const StatusCallback& done) override {
-    done(errors::Internal("Unimplemented"));
-  }
-
-  void PostToPeer(const string& peer_device, const string& peer_task,
-                  const string& key, Device* from_device,
-                  DeviceContext* from_device_ctx,
-                  const AllocatorAttributes& from_alloc_attr,
-                  const Tensor* from_tensor,
-                  const DeviceLocality& client_locality,
-                  const StatusCallback& done) override {
-    done(errors::Internal("Unimplemented"));
-  }
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(DummyCollectiveExecutor);
-};
-}  // namespace
 
 CollectiveExecutorMgr::CollectiveExecutorMgr(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
@@ -77,7 +45,9 @@ CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
     if (it != executor_table_.end()) {
       ce = it->second;
     } else {
-      ce = new DummyCollectiveExecutor(this);
+      CollectiveRemoteAccessLocal* rma = new CollectiveRemoteAccessLocal(
+          dev_mgr_, dev_resolver_.get(), step_id);
+      ce = new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
       executor_table_[step_id] = ce;
     }
     ce->Ref();
diff --git a/tensorflow/core/common_runtime/dma_helper.h b/tensorflow/core/common_runtime/dma_helper.h
index 1cc8b9e723..cdfce1f366 100644
--- a/tensorflow/core/common_runtime/dma_helper.h
+++ b/tensorflow/core/common_runtime/dma_helper.h
@@ -28,6 +28,9 @@ class DMAHelper {
   static void* base(Tensor* t) { return t->base<void>(); }
   static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
   static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
+  static void UnsafeSetShape(Tensor* t, const TensorShape& s) {
+    t->set_shape(s);
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
new file mode 100644
index 0000000000..79d03a24ce
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -0,0 +1,542 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+
+namespace tensorflow {
+namespace {
+// Each CollectiveOp implementation is free to define its own
+// BufRendezvous key format.  This function produces the key used by
+// RingReducer.
+string RingReduceBufKey(const string& exec_key, int pass, int section,
+                        int source_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(",
+                           section, "):srcrank(", source_rank, ")");
+  } else {
+    // TODO(tucker): Try out some kind of denser encoding, e.g. 128 bit hash.
+    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
+  }
+}
+
+}  // namespace
+
+void RingReducer::PCQueue::Enqueue(RingField* rf) {
+  mutex_lock l(pcq_mu_);
+  deque_.push_back(rf);
+  if (waiter_count_ > 0) {
+    cv_.notify_one();
+  }
+}
+
+RingReducer::RingField* RingReducer::PCQueue::Dequeue() {
+  mutex_lock l(pcq_mu_);
+  if (deque_.empty()) {
+    ++waiter_count_;
+    while (deque_.empty()) {
+      cv_.wait(l);
+    }
+    --waiter_count_;
+  }
+  RingField* rf = deque_.front();
+  deque_.pop_front();
+  return rf;
+}
+
+RingReducer::RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx,
+                         OpKernelContext::Params* op_params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id,
+                         const Tensor* input, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      op_params_(op_params),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      input_(input),
+      output_(output),
+      rank_(col_params.subdiv_rank[0]),
+      step_id_(step_id),
+      group_size_(col_params.group.group_size),
+      num_subdivs_(static_cast<int>(
+          col_params.instance.impl_details.subdiv_permutations.size())),
+      done_(nullptr),
+      device_(nullptr),
+      device_name_(
+          col_params_.instance.device_names[col_params_.default_rank]) {
+  CHECK_GT(group_size_, 0);
+  CHECK_GT(num_subdivs_, 0);
+}
+
+string RingReducer::TensorDebugString(Tensor tensor) {
+  const DeviceBase::GpuDeviceInfo* gpu_device_info =
+      ctx_->device()->tensorflow_gpu_device_info();
+  if (gpu_device_info) {
+    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
+    Notification note;
+    gpu_device_info->default_context->CopyDeviceTensorToCPU(
+        &tensor, "" /*tensor_name*/, device_, &cpu_tensor,
+        [&note](const Status& s) {
+          CHECK(s.ok());
+          note.Notify();
+        });
+    note.WaitForNotification();
+    return cpu_tensor.SummarizeValue(64);
+  } else {
+    return tensor.SummarizeValue(64);
+  }
+}
+
+void RingReducer::Run(StatusCallback done) {
+  done_ = std::move(done);
+
+  // Get local execution device.
+  if (VLOG_IS_ON(1)) {
+    string buf;
+    for (int r = 0; r < col_params_.instance.device_names.size(); ++r) {
+      strings::StrAppend(&buf, "dev ", r, " : ",
+                         col_params_.instance.device_names[r], "\n");
+    }
+    for (int sd = 0;
+         sd < col_params_.instance.impl_details.subdiv_permutations.size();
+         ++sd) {
+      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      for (auto x : col_params_.instance.impl_details.subdiv_permutations[sd]) {
+        strings::StrAppend(&buf, x, ", ");
+      }
+    }
+    VLOG(1) << "RingReducer::Run for device " << device_name_
+            << " default_rank " << col_params_.default_rank << "\n"
+            << buf;
+  }
+  CHECK(dev_mgr_);
+  Status status = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to find device "
+               << col_params_.instance.device_names[col_params_.default_rank];
+    for (auto d : dev_mgr_->ListDevices()) {
+      LOG(ERROR) << "Available device " << d->name();
+    }
+    done_(status);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  VLOG(1) << this << " default_rank " << col_params_.default_rank << " cp "
+          << &col_params_ << ": " << col_params_.ToString();
+
+  // Start by copying input to output if they're not already the same, i.e. if
+  // we're not computing in-place on the input tensor.
+  if ((input_ != output_) &&
+      (DMAHelper::base(input_) != DMAHelper::base(output_))) {
+    CollectiveRemoteAccessLocal::MemCpyAsync(
+        ctx_->input_device_context(0), ctx_->op_device_context(), device_,
+        device_, ctx_->input_alloc_attr(0), ctx_->output_alloc_attr(0), input_,
+        output_, [this](const Status& s) {
+          if (!s.ok()) {
+            done_(s);
+          } else {
+            ContinueAfterInputCopy();
+          }
+        });
+  } else {
+    ContinueAfterInputCopy();
+  }
+}
+
+void RingReducer::ContinueAfterInputCopy() {
+  AllocatorAttributes attr = ctx_->output_alloc_attr(0);
+  ca_.reset(MakeCollectiveAdapter(output_, group_size_ * num_subdivs_,
+                                  device_->GetAllocator(attr)));
+
+  if (col_params_.final_op) {
+    // Create an on-device scalar value from group_size_ that may be needed
+    // later.
+    // TODO(tucker): Cache and reuse across invocations? Or maybe the scalar
+    // can be provided to the kernel in host memory?
+    Tensor group_size_val = ca_->Scalar(group_size_);
+    if (col_params_.group.device_type != "CPU") {
+      group_size_tensor_ =
+          ca_->Scalar(device_->GetAllocator(ctx_->input_alloc_attr(0)));
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      op_dev_ctx->CopyCPUTensorToDevice(&group_size_val, device_,
+                                        &group_size_tensor_,
+                                        [this](const Status& s) {
+                                          if (!s.ok()) {
+                                            StartAbort(s);
+                                          }
+                                          group_size_tensor_ready_.Notify();
+                                        });
+    } else {
+      group_size_tensor_ = group_size_val;
+      group_size_tensor_ready_.Notify();
+    }
+  }
+  Finish(RunAsyncParts());
+}
+
+void RingReducer::StartAbort(const Status& s) {
+  // In abort mode we stop issuing additional ProvideBuf
+  // and ConsumeBuf calls, but we need to wait for all of the
+  // outstanding callbacks to be invoked before quitting.
+  bool abort_started = false;
+  {
+    mutex_lock l(status_mu_);
+    if (status_.ok()) {
+      LOG(ERROR) << "Aborting RingReduce with " << s;
+      abort_started = true;
+      status_.Update(s);
+    }
+  }
+  // If this is the initial entry to abort mode then invoke StartAbort
+  // on the CollectiveExecutor that invoked us.  That should start
+  // cancellation on all of the outstanding CollectiveRemoteAccess
+  // actions.
+  if (abort_started) {
+    col_exec_->StartAbort(s);
+  }
+}
+
+void RingReducer::Finish(bool ok) {
+  if (ok) {
+    // Recover the output from the adaptor.
+    ca_->ConsumeFinalValue(output_);
+  }
+  Status s;
+  {
+    mutex_lock l(status_mu_);
+    s = status_;
+  }
+  done_(s);
+}
+
+RingReducer::SubContext::SubContext(OpKernelContext* ctx,
+                                    OpKernelContext::Params* params,
+                                    OpKernel* op, Tensor* output, Tensor* input)
+    : sub_params_(*params),
+      sub_inputs_({output, input}),
+      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
+      sub_input_dc_(
+          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
+  sub_params_.op_kernel = op;
+  sub_params_.inputs = &sub_inputs_;
+  sub_params_.input_alloc_attrs = &sub_input_attr_;
+  sub_params_.input_device_contexts = &sub_input_dc_;
+  sub_params_.eigen_gpu_device = nullptr;
+  sub_params_.ensure_eigen_gpu_device();
+  sub_ctx_ = new OpKernelContext(&sub_params_, 1);
+}
+
+Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
+                                 Tensor* input) {
+  // Prepare an OpKernelContext that is identical to that of the original Op
+  // (i.e. the collective), except for the input output sizes and identities and
+  // the Op itself.
+  // TODO(tucker): Is it possible to cache and reuse these objects?  They're
+  // mostly identical inside one device execution.
+  std::unique_ptr<SubContext> sub_ctx(
+      new SubContext(ctx_, op_params_, op, output, input));
+  device->Compute(op, sub_ctx->sub_ctx_);
+  return sub_ctx->sub_ctx_->status();
+}
+
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
+void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                                int field_idx) {
+  // Note on field indexing: There are group_size_ devices in the
+  // instance, implying the same number of chunks per tensor, where a
+  // chunk is the unit of data transferred in a time step.  However, if
+  // a device can simultaenously send data by 2 or more independent
+  // channels we can speed up the transfer by subdividing chunks and
+  // processing multiple subdivisions at once.  So the actual number
+  // of RingFields is group_size_ * num_subdivs_.
+  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
+  rf->chunk_idx = chunk_idx;
+  rf->subdiv_idx = subdiv_idx;
+  rf->sc_idx = field_idx;
+  rf->rank = col_params_.subdiv_rank[subdiv_idx];
+  rf->second_pass = false;
+  rf->action = RF_INIT;
+  // Recv from the device with preceding rank within the subdivision.
+  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  rf->recv_dev_idx = col_params_.instance.impl_details
+                         .subdiv_permutations[subdiv_idx][recv_from_rank];
+  int send_dev_idx = col_params_.instance.impl_details
+                         .subdiv_permutations[subdiv_idx][send_to_rank];
+  rf->recv_is_remote = !col_params_.task.is_local[rf->recv_dev_idx];
+  rf->send_is_remote = !col_params_.task.is_local[send_dev_idx];
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 0 we skip Recv when rank = chunk_idx
+    rf->do_recv = (rf->chunk_idx != rf->rank);
+    // In pass 0 we skip Send when rank = chunk_idx-1
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  if (rf->do_send || rf->do_recv) {
+    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
+    CHECK(rf->chunk.IsAligned()) << rf->DebugString();
+  }
+  if (rf->do_recv) {
+    rf->tmp_chunk = ca_->TempChunk(rf->sc_idx);
+    CHECK(rf->tmp_chunk.IsAligned()) << rf->DebugString();
+  }
+  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
+          << ca_->TBounds(rf->chunk);
+}
+
+// When a RingField transitions from first to second recompute the
+// do_send and do_recv values.
+void RingReducer::AdvanceToSecondPass(RingField* rf) {
+  VLOG(3) << "IncrRingField old value " << rf->DebugString();
+  CHECK(!rf->second_pass);
+  rf->second_pass = true;
+  rf->action = RF_INIT;
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 1 the send/no-send boundary moves down 1 place.
+    rf->do_recv =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  VLOG(3) << "IncrRingField new value " << rf->DebugString();
+}
+
+string RingReducer::RingField::DebugString() const {
+  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
+                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
+                              " action=", action);
+  strings::StrAppend(&rv, " pass=", second_pass);
+  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
+                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
+                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
+  return rv;
+}
+
+void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) {
+  CHECK(rf->do_send);
+  string send_buf_key =
+      RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx, rf->rank);
+  VLOG(3) << "DispatchSend rank=" << col_params_.default_rank << " send key "
+          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
+          << rf->sc_idx;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  int send_to_dev_idx = col_params_.instance.impl_details
+                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[send_to_dev_idx],
+                        col_params_.instance.task_names[send_to_dev_idx],
+                        send_buf_key, device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), &rf->chunk,
+                        device_locality_, done);
+}
+
+void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) {
+  CHECK(rf->do_recv);
+  string recv_buf_key =
+      RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx,
+                       (rf->rank + (group_size_ - 1)) % group_size_);
+  VLOG(3) << "DispatchRecv rank=" << col_params_.default_rank << " recv key "
+          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
+          << ((col_params_.merge_op != nullptr) ? "tmp_chunk" : "chunk");
+  Tensor* dst_tensor = (!rf->second_pass && (col_params_.merge_op != nullptr))
+                           ? &rf->tmp_chunk
+                           : &rf->chunk;
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[rf->recv_dev_idx],
+                          col_params_.instance.task_names[rf->recv_dev_idx],
+                          col_params_.task.is_local[rf->recv_dev_idx],
+                          recv_buf_key, device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+string RingReducer::FieldState() {
+  string s = strings::StrCat("RingReducer ",
+                             strings::Hex(reinterpret_cast<uint64>(this)),
+                             " exec ", exec_key_, " step_id=", step_id_,
+                             " state of all ", rfv_.size(), " fields:");
+  for (int i = 0; i < rfv_.size(); ++i) {
+    s.append("\n");
+    s.append(rfv_[i].DebugString());
+  }
+  return s;
+}
+
+bool RingReducer::RunAsyncParts() {
+  // This function orchestrates RingReduce actions on behalf of a
+  // single device. It is entered by a blockable thread that
+  // loops within it until all actions assigned to that device
+  // complete. Hence function local variables are accessible only by that
+  // one thread and do not require an explicit mutex.
+  rfv_.clear();
+  rfv_.resize(group_size_ * num_subdivs_);
+  PCQueue ready_queue;
+  int field_done_count = 0;
+  int send_pending_count = 0;
+  int recv_pending_count = 0;
+  std::atomic<bool> aborted(false);
+  field_done_count = 0;
+  send_pending_count = 0;
+  recv_pending_count = 0;
+  for (int chunk_idx = 0; chunk_idx < group_size_; ++chunk_idx) {
+    for (int subdiv_idx = 0; subdiv_idx < num_subdivs_; ++subdiv_idx) {
+      int rf_index = (chunk_idx * num_subdivs_) + subdiv_idx;
+      InitRingField(&rfv_[rf_index], chunk_idx, subdiv_idx, rf_index);
+      ready_queue.Enqueue(&rfv_[rf_index]);
+    }
+  }
+
+  // Loop until all RingFields have advanced to completion.
+  while (field_done_count < rfv_.size()) {
+    VLOG(4) << FieldState();
+    // Wait for a RingField to appear in the ready_queue.
+    RingField* rf = ready_queue.Dequeue();
+    // Advance the RingField to its next action and execute, repeating
+    // until either an async action has been started or the RingField
+    // is done.
+    bool dispatched = false;  // true if async action was initiated
+    do {
+      if (aborted) break;
+      switch (rf->action) {
+        case RF_INIT:
+          if (rf->do_recv) {
+            rf->action = RF_RECV;
+            auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchRecv(rf, requeue);
+            dispatched = true;
+            ++recv_pending_count;
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_RECV:
+          CHECK_GT(recv_pending_count, 0);
+          --recv_pending_count;
+          if (!rf->second_pass) {
+            rf->action = RF_REDUCE;
+            Status s = ComputeBinOp(device_, col_params_.merge_op.get(),
+                                    &rf->chunk, &rf->tmp_chunk);
+            if (!s.ok()) {
+              aborted = true;
+              StartAbort(s);
+            }
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_REDUCE:
+          if (!rf->second_pass && col_params_.final_op.get() && rf->is_final) {
+            rf->action = RF_FINALIZE;
+            group_size_tensor_ready_.WaitForNotification();
+            Status s = ComputeBinOp(device_, col_params_.final_op.get(),
+                                    &rf->chunk, &group_size_tensor_);
+            if (!s.ok()) {
+              aborted = true;
+              StartAbort(s);
+            }
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_FINALIZE:
+          rf->action = RF_DONE;
+          break;
+        case RF_SEND_READY:
+          if (rf->do_send) {
+            rf->action = RF_SEND;
+            auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchSend(rf, send_complete);
+            dispatched = true;
+            ++send_pending_count;
+          } else {
+            rf->action = RF_DONE;
+          }
+          break;
+        case RF_SEND:
+          CHECK_GT(send_pending_count, 0);
+          --send_pending_count;
+          rf->action = RF_DONE;
+          break;
+        case RF_DONE:
+          break;
+      }
+      if (rf->action == RF_DONE) {
+        if (rf->second_pass) {
+          ++field_done_count;
+          break;  // from do while(!dispatched)
+        } else {
+          AdvanceToSecondPass(rf);
+        }
+      }
+    } while (!dispatched);
+    if (aborted) break;
+  }  // while (field_done_count < number of fields)
+
+  if (aborted) {
+    // All of the pending data actions should be aborted; field the
+    // callbacks and clear the queue before quitting.
+    while ((send_pending_count > 0) || (recv_pending_count > 0)) {
+      RingField* rf = ready_queue.Dequeue();
+      switch (rf->action) {
+        case RF_RECV:
+          --recv_pending_count;
+          break;
+        case RF_SEND:
+          --send_pending_count;
+          break;
+        default: {}  // Ignore any other actions
+      }
+    }
+  }
+
+  CHECK_EQ(send_pending_count, 0);
+  CHECK_EQ(recv_pending_count, 0);
+
+  VLOG(2) << this << " rank=" << rank_ << " finish;"
+          << " final value " << TensorDebugString(ca_->Value());
+  return !aborted;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
new file mode 100644
index 0000000000..8fde18dc1c
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
+
+#include <deque>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+
+// Ring-algorithm implementation of collective all-reduce.
+class RingReducer {
+ public:
+  RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* op_params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, const Tensor* input, Tensor* output);
+
+  virtual ~RingReducer() {}
+
+  void Run(StatusCallback done);
+
+ private:
+  // Called when a bad status is received that implies we should terminate
+  // execution and return a bad status.
+  void StartAbort(const Status& s);
+  void ContinueAfterInputCopy();
+  void Finish(bool ok);
+  Status ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
+                      Tensor* input);
+  bool RunAsyncParts();
+
+  // Used for executing a sub-operation, e.g. a merge_op instance, with
+  // an OpKernelContext based on the one passed into this Op.
+  class SubContext {
+   public:
+    OpKernelContext::Params sub_params_;
+    gtl::InlinedVector<TensorValue, 4> sub_inputs_;
+    gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
+    gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
+    // Used only for Binary and Unary Ops for which we require
+    // the calculation to be in-place on the first input.
+    int forward_from_ = 0;
+    OpKernelContext* sub_ctx_;
+    SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+               OpKernel* op, Tensor* output, Tensor* input);
+    ~SubContext() { delete sub_ctx_; }
+  };
+
+  // Current status of a RingField
+  enum RingFieldAction {
+    RF_INIT = 0,    // Just initialized for a pass
+    RF_RECV,        // Recv pending
+    RF_REDUCE,      // Reduce pending
+    RF_FINALIZE,    // FinalOp pending
+    RF_SEND_READY,  // Ready to send
+    RF_SEND,        // Send pending
+    RF_DONE,        // No more work
+  };
+
+  // Tracks progress of actions on a single subfield of the entire tensor.
+  struct RingField {
+    int16 chunk_idx;     // major division index
+    int16 subdiv_idx;    // minor division index
+    int16 sc_idx;        // subchunk index
+    int16 rank;          // rank within subdiv permutation
+    int16 recv_dev_idx;  // dev from which value should be recv'd
+    RingFieldAction action;
+    bool second_pass;
+    bool recv_is_remote = false;
+    bool send_is_remote = false;
+    bool do_send = false;   // is the value sent in this pass?
+    bool do_recv = false;   // is the value recv'd in this pass?
+    bool is_final = false;  // is the last field in the pass for this rank
+    Tensor chunk;           // alias to field values
+    Tensor tmp_chunk;
+    Status status;
+    string DebugString() const;
+  };
+  void AdvanceToSecondPass(RingField* rf);
+  void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                     int field_idx);
+  void DispatchSend(RingField* rf, const StatusCallback& done);
+  void DispatchRecv(RingField* rf, const StatusCallback& done);
+
+  // For constructing log messages for debugging.
+  string FieldState();
+  string TensorDebugString(Tensor tensor);
+
+  // Producer/Consumer Queue of RingField structs.
+  class PCQueue {
+   public:
+    void Enqueue(RingField* rf);
+    RingField* Dequeue();
+
+   private:
+    mutex pcq_mu_;
+    condition_variable cv_;
+    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
+    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
+  };
+
+  CollectiveExecutor* col_exec_;        // Not owned
+  const DeviceMgr* dev_mgr_;            // Not owned
+  OpKernelContext* ctx_;                // Not owned
+  OpKernelContext::Params* op_params_;  // Not owned
+  const CollectiveParams& col_params_;
+  const string exec_key_;
+  const Tensor* input_;  // Not owned
+  Tensor* output_;       // Not owned
+  const int rank_;
+  const int64 step_id_;
+  const int group_size_;
+  const int num_subdivs_;
+  Tensor group_size_tensor_;
+  Notification group_size_tensor_ready_;
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;
+  Device* device_;  // The device for which this instance labors
+  const string device_name_;
+  DeviceLocality device_locality_;
+
+  mutex status_mu_;
+  Status status_ GUARDED_BY(status_mu_);
+
+  std::vector<RingField> rfv_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
new file mode 100644
index 0000000000..e4387a074a
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -0,0 +1,606 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      done(errors::Internal("Deliberate failure"));
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                    const DeviceType& device_type,
+                                    DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) {
+    LOG(FATAL) << status;
+  }
+  return k;
+}
+
+std::unique_ptr<OpKernel> GetAdd(DataType dtype, const DeviceType& device_type,
+                                 DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Add");
+  TF_CHECK_OK(builder.Attr("T", dtype)
+                  .Input(FakeInput(dtype))
+                  .Input(FakeInput(dtype))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device_type, device);
+}
+
+std::unique_ptr<OpKernel> GetDiv(DataType dtype, const DeviceType& device_type,
+                                 DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Div");
+  TF_CHECK_OK(builder.Attr("T", dtype)
+                  .Input(FakeInput(dtype))
+                  .Input(FakeInput(dtype))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device_type, device);
+}
+
+static int64 kStepId = 123;
+
+class RingReducerTest : public ::testing::Test {
+ protected:
+  RingReducerTest() : device_type_(DEVICE_CPU) {}
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  ~RingReducerTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int num_subdivs, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name =
+              strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
+                 << " devices: ";
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.data_type = dtype;
+    col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs);
+    col_params_.subdiv_rank.resize(num_subdivs);
+    int subdiv_stride = num_devices / num_subdivs;
+    for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/cpu:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name =
+              strings::StrCat(task_name, "/gpu:", di % gpu_devices_.size());
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  void Reduce() {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoReduce();
+        ++done;
+      });
+    }
+    while (done < static_cast<int>(instances_.size())) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int num_subdivs, int tensor_len,
+               int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, num_subdivs, fail_after);
+    std::vector<T> expected(tensor_len, 0.0);
+    for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [&expected, dtype, di](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
+              float value = pow(10, static_cast<double>(di)) * i;
+              if (dtype == DT_INT32 || dtype == DT_INT64) {
+                value = di * 10 + i;
+              }
+              t->flat<T>()(i) = static_cast<T>(value);
+              expected[i] += value;
+            }
+          });
+    }
+    Reduce();
+    if (fail_after > 0) {
+      // Confirm that every device terminated with the expected error status.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        EXPECT_EQ("Deliberate failure",
+                  instances_[di]->status_.error_message());
+      }
+    } else {
+      // Confirm that every device computed the same correct reduction value.
+      for (int i = 0; i < tensor_len; ++i) {
+        expected[i] /= (num_workers * num_devices);
+      }
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        TF_EXPECT_OK(instances_[di]->status_);
+        Tensor* inst = &instances_[di]->tensor_;
+        CHECK(inst);
+        Tensor actual(dtype, TensorShape({tensor_len}));
+        if (device_type_ == DEVICE_CPU) {
+          CHECK(actual.CopyFrom(*inst, inst->shape()));
+          VLOG(1) << "actual " << actual.SummarizeValue(100);
+        } else if (device_type_ == DEVICE_GPU) {
+          Notification note;
+          Device* dev = instances_[di]->device_;
+          auto* dev_info = dev->tensorflow_gpu_device_info();
+          CHECK(dev_info);
+          dev_info->default_context->CopyDeviceTensorToCPU(
+              inst, "" /*tensor_name*/, dev, &actual, [&note](const Status& s) {
+                CHECK(s.ok());
+                note.Notify();
+              });
+          note.WaitForNotification();
+        }
+
+        for (int i = 0; i < tensor_len; ++i) {
+          switch (dtype) {
+            case DT_FLOAT:
+              EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_DOUBLE:
+              EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_INT32:
+            case DT_INT64:
+              EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            default:
+              LOG(FATAL) << "unimplemented";
+          }
+        }
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
+                                                Tensor* input,
+                                                const DeviceType& device_type,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_reduce_", reduce_counter_++),
+        "CollectiveReduce");
+    TF_CHECK_OK(
+        builder.Attr("T", params.instance.data_type)
+            .Attr("merge_op", "Add")
+            .Attr("final_op", "Id")
+            .Attr("group_size", params.group.group_size)
+            .Attr("group_key", params.group.group_key)
+            .Attr("instance_key", params.instance.instance_key)
+            .Attr("subdiv_offsets", params.instance.impl_details.subdiv_offsets)
+            .Input(FakeInput(params.instance.data_type))
+            .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, RingReducerTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_))
+          << "Couldn't find device " << dev_name
+          << " existing devices: " << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int num_subdivs = static_cast<int>(col_params_.subdiv_rank.size());
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size,
+               static_cast<int>(col_params_.instance.device_names.size()));
+      // Id of this device is at rank position in first subdiv perm.
+      int my_device_id =
+          col_params_.instance.impl_details.subdiv_permutations[0][rank];
+      col_params_.default_rank = my_device_id;
+      // Set rank for all other subdivs by finding that device_id.
+      for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+        for (int r = 0; r < static_cast<int>(col_params_.instance.impl_details
+                                                 .subdiv_permutations[sdi]
+                                                 .size());
+             ++r) {
+          if (my_device_id ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            break;
+          }
+        }
+      }
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        init_f(&tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        init_f(&cpu_tensor);
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        Notification note;
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [&note](const Status& s) {
+              CHECK(s.ok());
+              note.Notify();
+            });
+        note.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoReduce() {
+      col_params_.merge_op =
+          GetAdd(col_params_.instance.data_type, device_type_, device_);
+      col_params_.final_op =
+          GetDiv(col_params_.instance.data_type, device_type_, device_);
+
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from = 0;
+      op_params.forward_from_array = &forward_from;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op = parent_->GetCollectiveReduce(
+          col_params_, &tensor_, DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the
+      // output allocation that it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(),
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a RingReducer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      RingReducer rr(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                     &op_params, col_params_, exec_key, kStepId, &tensor_,
+                     &tensor_);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      SchedClosure([this, &notification, &rr]() {
+        rr.Run([this, &notification](Status s) {
+          status_ = s;
+          notification.Notify();
+        });
+      });
+      notification.WaitForNotification();
+      CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+
+      dev_ctx->Unref();
+    }
+
+    const Tensor& tensor() { return tensor_; }
+
+    RingReducerTest* parent_;
+    string dev_name_;
+    DeviceType device_type_;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };
+
+  bool stop_ = false;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<tensorflow::Device*> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+};
+
+#define DEF_TEST(B, T, W, D, S, L, A)                                         \
+  TEST_F(RingReducerTest,                                                     \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \
+    DataType dtype = DT_##B;                                                  \
+    switch (dtype) {                                                          \
+      case DT_FLOAT: {                                                        \
+        RunTest<float>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_DOUBLE: {                                                       \
+        RunTest<double>(dtype, DEVICE_##T, W, D, S, L, A);                    \
+      } break;                                                                \
+      case DT_INT32: {                                                        \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_INT64: {                                                        \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      default:                                                                \
+        LOG(FATAL) << "Unimplemented";                                        \
+    }                                                                         \
+  }
+
+#ifndef GOOGLE_CUDA
+// Success tests
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 3, 1045991, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 4, 1045991, 0)
+DEF_TEST(DOUBLE, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(DOUBLE, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(INT32, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+#endif
+
+#ifdef GOOGLE_CUDA
+// GPU tests.  So long as the device names are all in a single tasks we
+// bypass inter-worker routing code and can fake multiple GPUs with a single
+// GPU, from the perspective of the RingReducer logic.  So these tests
+// are all single-worker.
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 3, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 3, 1045991, 0)
+DEF_TEST(FLOAT, GPU, 1, 4, 4, 1045991, 0)
+DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
+// INT32 values are never on the GPU.
+// DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/test_collective_executor_mgr.h b/tensorflow/core/common_runtime/test_collective_executor_mgr.h
new file mode 100644
index 0000000000..d0d4f24b11
--- /dev/null
+++ b/tensorflow/core/common_runtime/test_collective_executor_mgr.h
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+
+// Mock objects that can't actually execute a Collective, but satisfy
+// general infrastructure expectations within tests that don't require
+// full functionality.
+
+class TestCollectiveExecutor : public CollectiveExecutor {
+ public:
+  explicit TestCollectiveExecutor(CollectiveExecutorMgrInterface* cem)
+      : CollectiveExecutor(cem) {}
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,  //???
+                    const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+};
+
+class TestCollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
+ public:
+  TestCollectiveExecutorMgr() {}
+
+  ~TestCollectiveExecutorMgr() override {
+    for (auto& iter : table_) {
+      iter.second->Unref();
+    }
+  }
+
+  CollectiveExecutor* FindOrCreate(int64 step_id) override {
+    mutex_lock l(mu_);
+    CollectiveExecutor* ce = nullptr;
+    auto iter = table_.find(step_id);
+    if (iter != table_.end()) {
+      ce = iter->second;
+    } else {
+      ce = new TestCollectiveExecutor(this);
+      table_[step_id] = ce;
+    }
+    ce->Ref();
+    return ce;
+  }
+
+  void Cleanup(int64 step_id) override {
+    mutex_lock l(mu_);
+    auto iter = table_.find(step_id);
+    if (iter != table_.end()) {
+      iter->second->Unref();
+      table_.erase(iter);
+    }
+  }
+
+  ParamResolverInterface* GetParamResolver() const override {
+    LOG(FATAL);
+    return nullptr;
+  }
+
+  DeviceResolverInterface* GetDeviceResolver() const override {
+    LOG(FATAL);
+    return nullptr;
+  }
+
+  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                            GetStepSequenceResponse* response,
+                            const StatusCallback& done) override {
+    done(errors::Internal("unimplemented"));
+  }
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override {
+    done(errors::Internal("unimplemented"));
+  }
+
+  int64 NextStepId(int64 graph_key) override {
+    return CollectiveExecutor::kInvalidId;
+  }
+
+  void RetireStepId(int64 graph_key, int64 step_id) override {}
+
+  mutex mu_;
+  gtl::FlatMap<int64, CollectiveExecutor*> table_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
-- 
GitLab


From ef2111b8ba3016c958d496dbe541c5f7157b26a9 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 12 Apr 2018 10:04:21 -0700
Subject: [PATCH 0681/1262] Install absl before building

---
 tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index 97829892b1..3b437d3c58 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -31,6 +31,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 :: Set ctest binary location.
 IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -40,9 +43,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
-- 
GitLab


From 20f2b863de1c3d0a8c49f642dbb3c009b50886eb Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 12 Apr 2018 10:13:06 -0700
Subject: [PATCH 0682/1262] Add missing semicolon

---
 tensorflow/c/c_api_experimental.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 0c3bb680e7..581f5743eb 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -8309,7 +8309,7 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
   status->status = tensorflow::errors::Unimplemented(
       "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
       "is not implemented for Windows");
-  return nullptr
+  return nullptr;
 #else
   tensorflow::Status s;
 
-- 
GitLab


From ffbf77de81d0b7b4b169c92d0d9fbbdef5b8842a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 10:14:02 -0700
Subject: [PATCH 0683/1262] Introduced tool to run an HLO module in replicated
 fashion, by infeeding random data and outfeeding the data generated at each
 step. The arguments of the computation can be either read from the session
 module, or randomly generated. The tool uses the raw transfer manager API to
 infeed and outfeed the data.

PiperOrigin-RevId: 192628605
---
 tensorflow/compiler/xla/service/BUILD         |   2 +
 tensorflow/compiler/xla/service/hlo_runner.cc | 189 ++++++++++++++----
 tensorflow/compiler/xla/service/hlo_runner.h  |  66 +++++-
 tensorflow/compiler/xla/shape_util.h          |   5 +
 tensorflow/compiler/xla/tests/test_utils.cc   |   4 +-
 tensorflow/compiler/xla/tests/test_utils.h    |   3 +-
 6 files changed, 221 insertions(+), 48 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index db91e80407..65203fa2a0 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2535,6 +2535,7 @@ cc_library(
     srcs = ["hlo_runner.cc"],
     hdrs = ["hlo_runner.h"],
     deps = [
+        ":computation_placer",
         ":executable",
         ":hlo",
         ":transfer_manager",
@@ -2551,6 +2552,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index ec7d8210a7..2e834a79d9 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -16,21 +16,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
 
-#include <set>
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -91,15 +86,6 @@ HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
   return tools::Parse(hlo_string, config);
 }
 
-// Define this in .cc file to avoid having to include eigen or forward declare
-// these types in the header.
-struct HloRunner::EigenThreadPoolWrapper {
-  std::unique_ptr<EigenThreadPoolWrapper> pool;
-  std::unique_ptr<Eigen::ThreadPoolDevice> device;
-};
-
-HloRunner::HloRunner() {}
-
 HloRunner::HloRunner(se::Platform* platform) {
   BackendOptions backend_options;
   backend_options.set_platform(platform);
@@ -113,32 +99,14 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
     const tensorflow::gtl::ArraySlice<Literal*> arguments,
     bool run_hlo_passes) {
-  if (run_hlo_passes) {
-    TF_ASSIGN_OR_RETURN(
-        module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor(),
-                    /*device_allocator=*/nullptr));
-  }
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      backend().compiler()->RunBackend(std::move(module),
-                                       backend().default_stream_executor(),
-                                       /*device_allocator=*/nullptr));
-
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      CreateExecutable(std::move(module), run_hlo_passes));
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
 
-  ExecutableRunOptions run_options;
-  run_options.set_device_ordinal(backend().default_device_ordinal());
-  run_options.set_stream(&stream);
-  run_options.set_allocator(backend().memory_allocator());
-  run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
-  run_options.set_intra_op_thread_pool(
-      backend().eigen_intra_op_thread_pool_device());
-
-  ServiceExecutableRunOptions service_run_options(
-      run_options, backend().StreamBorrower(),
-      backend().inter_op_thread_pool());
+  ServiceExecutableRunOptions service_run_options(GetServiceRunOptionsForDevice(
+      backend().default_device_ordinal(), &stream, nullptr));
+  const ExecutableRunOptions& run_options = service_run_options.run_options();
 
   // Copy arguments to device.
   std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
@@ -178,10 +146,153 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
   return result_literal;
 }
 
+StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
+    std::unique_ptr<HloModule> module,
+    const ReplicatedExecuteOptions& options) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      CreateExecutable(std::move(module), options.run_hlo_passes));
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+  std::vector<std::unique_ptr<se::Stream>> streams;
+  std::vector<ServiceExecutableRunOptions> service_run_options;
+  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+  // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are
+  // no arguments.
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs(
+      options.num_replicas * options.arguments.size() + 1);
+  std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
+      argument_buffer_slices;
+  int64 index = 0;
+  for (int64 i = 0; i < options.num_replicas; ++i) {
+    int64 device = device_assignment(i, 0);
+    TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                        backend().stream_executor(device));
+    streams.push_back(absl::make_unique<se::Stream>(executor));
+    streams.back()->Init();
+    service_run_options.emplace_back(GetServiceRunOptionsForDevice(
+        device, streams.back().get(), &device_assignment));
+
+    // Copy arguments to device.
+    for (const Literal* argument : options.arguments) {
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+          backend().transfer_manager()->AllocateScopedShapedBuffer(
+              argument->shape(), backend().memory_allocator(), device));
+      TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+          executor, *argument, *argument_buffer));
+      argument_buffers.push_back(std::move(argument_buffer));
+      argument_buffer_ptrs[index++] = argument_buffers.back().get();
+    }
+    argument_buffer_slices.emplace_back(
+        &argument_buffer_ptrs[index - options.arguments.size()],
+        options.arguments.size());
+  }
+
+  std::unique_ptr<tensorflow::thread::ThreadPool> pool;
+  int64 num_threads = (options.infeed != nullptr) ? options.num_replicas : 0;
+  if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
+    num_threads += options.num_replicas;
+  }
+  if (num_threads > 0) {
+    pool = absl::make_unique<tensorflow::thread::ThreadPool>(
+        tensorflow::Env::Default(), "infeed_outfeed",
+        /*num_threads=*/num_threads);
+  }
+  if (options.infeed != nullptr) {
+    for (int64 i = 0; i < options.num_replicas; ++i) {
+      int64 device = device_assignment(i, 0);
+      pool->Schedule([this, device, &options]() {
+        se::StreamExecutor* executor =
+            backend().stream_executor(device).ValueOrDie();
+        VLOG(1) << "Starting infeed on device " << device;
+        for (int64 step = 1;
+             options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
+          TF_CHECK_OK(backend().transfer_manager()->TransferLiteralToInfeed(
+              executor, *options.infeed));
+          if (step % 100 == 0) {
+            VLOG(1) << "Infeed step " << step;
+          }
+        }
+      });
+    }
+  }
+  if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
+    for (int64 i = 0; i < options.num_replicas; ++i) {
+      int64 device = device_assignment(i, 0);
+      pool->Schedule([this, device, &options]() {
+        se::StreamExecutor* executor =
+            backend().stream_executor(device).ValueOrDie();
+        VLOG(1) << "Starting outfeed on device " << device;
+        for (int64 step = 1;
+             options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
+          auto literal = absl::make_unique<Literal>();
+          TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
+              executor, options.outfeed_shape, literal.get()));
+          if (options.outfeed_values != nullptr) {
+            options.outfeed_values->push_back(std::move(literal));
+          }
+          if (step % 100 == 0) {
+            VLOG(1) << "Outfeed step " << step;
+          }
+        }
+      });
+    }
+  }
+
+  LOG(INFO) << "Replicated execution started";
+  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<ShapedBuffer>> results,
+                      executable->ExecuteOnStreams(service_run_options,
+                                                   argument_buffer_slices));
+  LOG(INFO) << "Replicated execution terminated";
+
+  std::vector<std::unique_ptr<Literal>> exec_results;
+  for (int64 i = 0; i < options.num_replicas; ++i) {
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<ScopedShapedBuffer> result,
+                        ScopedShapedBuffer::MakeScoped(
+                            results[i].get(), backend().memory_allocator()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                        backend().transfer_manager()->TransferLiteralFromDevice(
+                            streams[i]->parent(), *result));
+    exec_results.push_back(std::move(literal));
+  }
+  return std::move(exec_results);
+}
+
+StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
+    std::unique_ptr<HloModule> module, bool run_hlo_passes) {
+  if (run_hlo_passes) {
+    TF_ASSIGN_OR_RETURN(
+        module, backend().compiler()->RunHloPasses(
+                    std::move(module), backend().default_stream_executor(),
+                    backend().memory_allocator()));
+  }
+  return backend().compiler()->RunBackend(std::move(module),
+                                          backend().default_stream_executor(),
+                                          backend().memory_allocator());
+}
+
+ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice(
+    int64 device, se::Stream* stream, DeviceAssignment* device_assignment) {
+  ExecutableRunOptions run_options;
+  run_options.set_device_ordinal(device);
+  run_options.set_stream(stream);
+  run_options.set_allocator(backend().memory_allocator());
+  run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
+  run_options.set_intra_op_thread_pool(
+      backend().eigen_intra_op_thread_pool_device());
+  if (device_assignment != nullptr) {
+    run_options.set_device_assignment(device_assignment);
+  }
+  return ServiceExecutableRunOptions(run_options, backend().StreamBorrower(),
+                                     backend().inter_op_thread_pool());
+}
+
 Backend& HloRunner::backend() {
   if (!backend_) {
     backend_ = Backend::CreateDefaultBackend().ConsumeValueOrDie();
-    VLOG(1) << "executing on platform " << backend().platform()->Name();
+    VLOG(1) << "Executing on platform " << backend().platform()->Name();
   }
   return *backend_;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 06ce22a5b9..f54fb44766 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -16,12 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
 
+#include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -40,9 +44,43 @@ namespace xla {
 // file), or parsed from a hlo textual IR string.
 class HloRunner {
  public:
-  HloRunner();
-
-  HloRunner(::perftools::gputools::Platform* platform);
+  // The options used to configure a ExecuteReplicated() call.
+  struct ReplicatedExecuteOptions {
+    // The number of devices the HLO module should be replicated onto.
+    int64 num_replicas = 1;
+
+    // The arguments to be fed to each replica. Since this is used for a
+    // replicated execution, all the arguments are the same for all replicas.
+    std::vector<const Literal*> arguments;
+
+    // If the HLO module being run has an infeed instruction, this will be the
+    // data which will be fed to it, for as many as infeed_steps steps.
+    const Literal* infeed = nullptr;
+
+    // The number of times the infeed literal should be fed to the HLO module.
+    // For a clean exit, this should match the iterations-per-loop parameter
+    // used when generating the HLO module proto (that is usually the main
+    // while bounary counter). A value higher then iterations-per-loop would
+    // lead to infeed threads feeding to a gone computation, while a lower
+    // value would trigger a stuck ExecuteReplicated() call (the computation
+    // will be trying to infeed data which will never come).
+    int64 infeed_steps = -1;
+
+    // The shape of the outfeed operation. If empty, the HLO module does not
+    // generate any outfeed.
+    Shape outfeed_shape;
+
+    // A pointer to a vector where the outfeed values will be stored. If
+    // nullptr, the values will be read and discarded.
+    std::vector<std::unique_ptr<Literal>>* outfeed_values = nullptr;
+
+    // Whether the HLO passes should be run on the input module. Usually
+    // saved modules are coming from after the HLO pass pipeline, so triggering
+    // another run will likely cause errors.
+    bool run_hlo_passes = false;
+  };
+
+  explicit HloRunner(::perftools::gputools::Platform* platform);
 
   ~HloRunner();
 
@@ -86,6 +124,13 @@ class HloRunner {
     return Execute(std::move(module), argument_pointers, run_hlo_passes);
   }
 
+  // Executes a given HLO module into a set of replicas, and returns a map
+  // with the replica number as key, and the corresponding returned literal as
+  // value.
+  StatusOr<std::vector<std::unique_ptr<Literal>>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module,
+      const ReplicatedExecuteOptions& options);
+
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
   //
@@ -94,9 +139,18 @@ class HloRunner {
   Backend& backend();
 
  private:
-  struct EigenThreadPoolWrapper;
-
-  std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
+  // Creates an executable object given an HLO module. If run_hlo_passes is
+  // true, the HLO passes will be run before.
+  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
+      std::unique_ptr<HloModule> module, bool run_hlo_passes);
+
+  // Creates a ServiceExecutableRunOptions object to configure a run on device,
+  // using the provided stream object. If device_assignment is not nullptr, it
+  // will be used to configure the replication parameters. Replicated executions
+  // should pass the device_assignment parameter.
+  ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
+      int64 device, ::perftools::gputools::Stream* stream,
+      DeviceAssignment* device_assignment);
 
   std::unique_ptr<Backend> backend_;
 };
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 1375f981a8..6d228eff46 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -319,6 +319,11 @@ class ShapeUtil {
   // Returns an empty tuple shape. Can be used to indicate side-effects.
   static Shape MakeNil() { return MakeTupleShape({}); }
 
+  // Checks whether the shape is initialized.
+  static bool IsInitialized(const Shape& shape) {
+    return shape.element_type() != PRIMITIVE_TYPE_INVALID;
+  }
+
   // Constructs a new shape with the given element type and sequence of
   // dimensions.
   static Shape MakeShape(PrimitiveType element_type,
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index e30d115fae..cda1989fad 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -340,8 +340,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 }
 
 Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module) {
-  return HloVerifier().Run(module).status();
+                       HloModule* const module, bool allow_mixed_precision) {
+  return HloVerifier(allow_mixed_precision).Run(module).status();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 0fb024ffb0..b5ab779574 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -69,7 +69,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 // Check that a given module satisfies various constraints before trying to
 // execute it.
 Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module);
+                       HloModule* const module,
+                       bool allow_mixed_precision = false);
 
 }  // namespace xla
 
-- 
GitLab


From 844b8cae970d835850a75f8063324224b2de0df0 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Thu, 12 Apr 2018 10:35:41 -0700
Subject: [PATCH 0684/1262] [TF] Add TensorListPushBackBatch.

Also modify code to ensure aliased forwarding happens whenever
possible with DT_VARIANT objects in ResourceVariables and in the new op.

PiperOrigin-RevId: 192632202
---
 .../base_api/api_def_TensorListGetItem.pbtxt  |   3 +
 .../api_def_TensorListPushBackBatch.pbtxt     |   3 +
 .../base_api/api_def_TensorListSetItem.pbtxt  |   3 +
 tensorflow/core/kernels/list_kernels.cc       |  16 +++
 tensorflow/core/kernels/list_kernels.cu.cc    |  15 +++
 tensorflow/core/kernels/list_kernels.h        | 121 ++++++++++++++++++
 .../core/kernels/resource_variable_ops.cc     |   7 +-
 tensorflow/core/ops/list_ops.cc               |  44 +++++++
 .../python/kernel_tests/list_ops_test.py      |  42 ++++++
 9 files changed, 252 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
new file mode 100644
index 0000000000..2c47208fa0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGetItem"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
new file mode 100644
index 0000000000..1f33d49260
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBackBatch"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt
new file mode 100644
index 0000000000..002e2a9bd3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListSetItem"
+}
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 9e7786f25e..d1e481d7cc 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -475,6 +475,22 @@ REGISTER_KERNEL_BUILDER(
 
 #endif  // GOOGLE_CUDA
 
+#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListPushBackBatch<CPUDevice, T>)
+
+TF_CALL_ALL_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint8);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint8);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint16);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint16);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint32);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
+
+#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU
+
 #define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
   REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 935f892dd0..0ea9362cbe 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -51,6 +51,21 @@ REGISTER_TENSOR_LIST_STACK_GPU(bool);
 
 #undef REGISTER_TENSOR_LIST_STACK_GPU
 
+#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(T)               \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU),                \
+                          TensorListPushBackBatch<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU
+
 #define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
   REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index f3bbf3b6e3..42871c6113 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -34,6 +34,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
 // Variant compatible type for a list of tensors. This is mutable but instances
 // should never be mutated after stored in a variant tensor.
 struct TensorList {
@@ -146,6 +148,10 @@ class TensorListFromTensor : public OpKernel {
     TensorList output_list;
     const Tensor& t = c->input(0);
     output_list.element_dtype = t.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    t.shape().DebugString()));
     TensorShape output_shape(t.shape());
     output_shape.RemoveDim(0);
     OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
@@ -267,6 +273,121 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
   return Status::OK();
 }
 
+template <typename Device, typename T>
+class TensorListPushBackBatch : public OpKernel {
+ public:
+  explicit TensorListPushBackBatch(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListPushBackBatch() override {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input = c->input(1);
+    OP_REQUIRES(c, element_dtype_ == input.dtype(),
+                errors::InvalidArgument("Invalid data types; list elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but tried to append ",
+                                        DataTypeString(input.dtype())));
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument(
+                    "Expected tensor to be at least a vector, but saw shape: ",
+                    input.shape().DebugString()));
+
+    const TensorShape& tls_shape = c->input(0).shape();
+
+    // For purposes of input forwarding, we want the least restrictive
+    // AllocatorAttributes possible.  If we need to allocate later,
+    // we'll request the DT_VARIANT be allocated on host.
+    AllocatorAttributes attr;
+
+    std::unique_ptr<Tensor> tls_alias = c->forward_input(
+        0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tls_shape,
+        DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr);
+
+    const Tensor& tls = tls_alias ? *tls_alias : c->input(0);
+
+    OP_REQUIRES(c, tls.dtype() == DT_VARIANT,
+                errors::InvalidArgument(
+                    "Expected input_handles dtype to be Variant, but saw: ",
+                    DataTypeString(tls.dtype())));
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(tls_shape),
+                errors::InvalidArgument(
+                    "Expected input_handles to be a vector, but saw shape: ",
+                    tls_shape.DebugString()));
+    const int64 batch_size = tls.NumElements();
+    OP_REQUIRES(c, input.dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "Expected tensor.shape[0] == input_handles.size, but saw ",
+                    input.dim_size(0), " vs. ", batch_size));
+    auto tls_t = tls.vec<Variant>();
+
+    TensorShape input_element_shape = input.shape();
+    input_element_shape.RemoveDim(0);
+    std::vector<const TensorList*> tl_batch;
+    for (int64 b = 0; b < batch_size; ++b) {
+      const TensorList* l = tls_t(b).get<TensorList>();
+      OP_REQUIRES(c, l != nullptr,
+                  errors::InvalidArgument("Input handle at index ", b,
+                                          " is not a list. Saw: '",
+                                          tls_t(b).DebugString(), "'"));
+      OP_REQUIRES(
+          c, l->element_shape.IsCompatibleWith(input_element_shape),
+          errors::InvalidArgument(
+              "Tried to append a tensor with incompatible shape to a "
+              "list at index ",
+              b, ". Op element shape: ", input_element_shape.DebugString(),
+              " list shape: ", l->element_shape.DebugString()));
+      OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                  errors::InvalidArgument(
+                      "Invalid data type at index ", b, "; op elements ",
+                      DataTypeString(element_dtype_), " but list elements ",
+                      DataTypeString(l->element_dtype)));
+      tl_batch.push_back(l);
+    }
+
+    Tensor* result;
+
+    if (tls_alias) {
+      result = tls_alias.get();
+      c->set_output(0, *result);
+    } else {
+      // DT_VARIANT tensors always allocated on host.
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(
+          c, c->allocate_output(0, TensorShape{batch_size}, &result, attr));
+    }
+
+    if (batch_size == 0) {
+      return;
+    }
+
+    auto input_t = input.flat_outer_dims<T, 2>();
+    auto result_t = result->vec<Variant>();
+
+    for (int64 b = 0; b < batch_size; ++b) {
+      if (!tls_alias) {
+        result_t(b) = *tl_batch[b];
+      }
+      TensorList* output = result_t(b).get<TensorList>();
+      DCHECK(output != nullptr);
+      Tensor* frame;
+      PersistentTensor tmp;
+      OP_REQUIRES_OK(c, c->allocate_persistent(
+                            element_dtype_, input_element_shape, &tmp, &frame));
+      if (input_element_shape.num_elements() > 0) {
+        auto frame_t = frame->flat<T>();
+        frame_t.device(c->eigen_device<Device>()) = input_t.template chip<0>(b);
+      }
+      output->tensors.push_back(std::move(*frame));
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 72504200cc..916869fb56 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -306,8 +306,9 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(DT_VARIANT)));
 
+    // For purposes of forwarding DT_VARIANT, we want the least
+    // restrictive attr; we already know the input is on host.
     AllocatorAttributes attr;
-    attr.set_on_host(true);
 
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
@@ -320,7 +321,7 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
     std::unique_ptr<Tensor> input_alias = context->forward_input(
         1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
         value.shape(),
-        std::is_same<Device, CPUDevice>::value ? HOST_MEMORY : DEVICE_MEMORY,
+        DEVICE_MEMORY /* HOST_MEMORY is only reserved for special cases */,
         attr);
 
     mutex_lock ml(*variable->mu());
@@ -337,6 +338,8 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
         !variable->tensor()->shape().IsSameSize(value.shape())) {
       PersistentTensor unused;
       Tensor* tmp;
+      // Allocation of DT_VARIANT is always on host.
+      attr.set_on_host(true);
       OP_REQUIRES_OK(context,
                      context->allocate_persistent(DT_VARIANT, value.shape(),
                                                   &unused, &tmp, attr));
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index c151055ee6..7af70110b7 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -71,6 +71,50 @@ REGISTER_OP("TensorListPushBack")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListPushBackBatch")
+    .Input("input_handles: variant")
+    .Input("tensor: element_dtype")
+    .Output("output_handles: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle input_handles;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input_handles));
+
+      shape_inference::ShapeHandle tensor;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &tensor));
+
+      TF_RETURN_IF_ERROR(
+          c->MergePrefix(tensor, input_handles, &tensor, &input_handles));
+
+      c->set_output(0, input_handles);
+
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to push to list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument(
+              "Trying to push to list with wrong element dtype. List has type ",
+              DataTypeString(list_shape_type.dtype),
+              " but trying to push element with type ", DataTypeString(t));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        s = list_shape_type.shape;
+      }
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListLength")
     .Input("input_handle: variant")
     .Output("length: int32")
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 6173a1def3..2084599760 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -318,6 +318,48 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                 [[1.0, 2.0]] * 4)
     self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testPushBackBatch(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l_batch = array_ops.stack([l0, l1])
+    l_push = list_ops.tensor_list_push_back_batch(l_batch, [3.0, 4.0])
+    l_unstack = array_ops.unstack(l_push)
+    l0_ret = list_ops.tensor_list_stack(l_unstack[0], dtypes.float32)
+    l1_ret = list_ops.tensor_list_stack(l_unstack[1], dtypes.float32)
+    self.assertAllClose([1.0, 2.0, 3.0], self.evaluate(l0_ret))
+    self.assertAllClose([-1.0, 4.0], self.evaluate(l1_ret))
+
+    with ops.control_dependencies([l_push]):
+      l_unstack_orig = array_ops.unstack(l_batch)
+      l0_orig_ret = list_ops.tensor_list_stack(l_unstack_orig[0],
+                                               dtypes.float32)
+      l1_orig_ret = list_ops.tensor_list_stack(l_unstack_orig[1],
+                                               dtypes.float32)
+
+    # Check that without aliasing, push_back_batch still works; and
+    # that it doesn't modify the input.
+    l0_r_v, l1_r_v, l0_orig_v, l1_orig_v = self.evaluate(
+        (l0_ret, l1_ret, l0_orig_ret, l1_orig_ret))
+    self.assertAllClose([1.0, 2.0, 3.0], l0_r_v)
+    self.assertAllClose([-1.0, 4.0], l1_r_v)
+    self.assertAllClose([1.0, 2.0], l0_orig_v)
+    self.assertAllClose([-1.0], l1_orig_v)
+
+    # Pushing back mismatched shapes fails.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, []))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "incompatible shape to a list at index 0"):
+      self.evaluate(
+          list_ops.tensor_list_push_back_batch(l_batch, [[3.0], [4.0]]))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Invalid data type at index 0"):
+      self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 151c31ce75f4370fd3749f3b07ac8297d3b2e203 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 10:47:26 -0700
Subject: [PATCH 0685/1262] Make default weights initializer in
 `base_layers.Layer` suitable for their dtype.

PiperOrigin-RevId: 192634133
---
 .../keras/_impl/keras/engine/base_layer.py    | 20 ++++++++++++++++---
 tensorflow/python/layers/base_test.py         |  6 ++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index 3b3af7d092..6c68d25127 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -473,16 +473,30 @@ class Layer(checkpointable.CheckpointableBase):
     Raises:
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
+      ValueError: When giving unsupported dtype and no initializer.
     """
     if dtype is None:
       dtype = self.dtype or backend.floatx()
+    else:
+      dtype = dtypes.as_dtype(dtype)
     initializer = initializers.get(initializer)
-    if initializer is None:
-      # Default TensorFlow initializer.
-      initializer = initializers.glorot_uniform()
     regularizer = regularizers.get(regularizer)
     constraint = constraints.get(constraint)
 
+    # Initialize variable when no initializer provided
+    if initializer is None:
+      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+      if dtype.is_floating:
+        initializer = initializers.glorot_uniform()
+      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+      # If dtype is DT_BOOL, provide a default value `FALSE`
+      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+        initializer = initializers.zeros()
+      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
+      else:
+        raise ValueError('An initializer for variable %s of type %s is required'
+                         ' for layer %s' % (name, dtype.base_dtype, self.name))
+
     variable = self._add_variable_with_custom_getter(
         name=name,
         shape=shape,
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index c05c675263..f08b552840 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -52,6 +52,12 @@ class BaseLayerTest(test.TestCase):
     layer = base_layers.Layer(name='my_layer', trainable=False)
     self.assertEqual(layer.trainable, False)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testInt64Layer(self):
+    layer = base_layers.Layer(name='my_layer', dtype='int64')
+    layer.add_variable('my_var', [2, 2])
+    self.assertEqual(layer.name, 'my_layer')
+
   @test_util.run_in_graph_and_eager_modes()
   def testAddWeight(self):
     layer = base_layers.Layer(name='my_layer')
-- 
GitLab


From dc2d1c297a1e577151249d953a003357b4962b26 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 11:04:55 -0700
Subject: [PATCH 0686/1262] Fix shape inference for outside_compilation
 clusters that include cycles.

PiperOrigin-RevId: 192637289
---
 tensorflow/compiler/jit/BUILD                 |   8 ++
 .../jit/encapsulate_subgraphs_pass.cc         | 103 +++++++++++++++---
 .../compiler/jit/shape_inference_helpers.cc   |  66 +++++++++++
 .../compiler/jit/shape_inference_helpers.h    |  65 +++++++++++
 4 files changed, 228 insertions(+), 14 deletions(-)
 create mode 100644 tensorflow/compiler/jit/shape_inference_helpers.cc
 create mode 100644 tensorflow/compiler/jit/shape_inference_helpers.h

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 4cefc08645..6edeb7047f 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -183,6 +183,13 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "shape_inference_helpers",
+    srcs = ["shape_inference_helpers.cc"],
+    hdrs = ["shape_inference_helpers.h"],
+    deps = ["//tensorflow/core:graph"],
+)
+
 # Internal targets below this point.
 
 cc_library(
@@ -293,6 +300,7 @@ cc_library(
     deps = [
         ":common",
         ":graph_to_functiondef",
+        ":shape_inference_helpers",
         ":union_find",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/kernels:parallel_check_op",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index b04b333141..9465385b58 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
@@ -576,7 +578,8 @@ class Encapsulator {
   // satisfied, e.g., because send_node depends on a node that doesn't have a
   // registered shape inference function.
   Status DoStaticShapeInferenceForOutsideCompilationSend(
-      const Graph& graph_in, const ShapeRefiner& shape_refiner,
+      const Graph& graph_in, const BackEdgeHelper& back_edge_helper,
+      const ShapeRefiner& shape_refiner,
       const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
       FunctionLibraryDefinition* library,
       std::vector<TensorShapeProto>* static_shape_out,
@@ -599,7 +602,7 @@ class Encapsulator {
   // to nodes in pruned_graph.
   Status MakeGraphForOutsideCompilationSends(
       const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-      ShapeRefiner* shape_refiner,
+      BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner,
       std::unordered_map<const Node*, Node*>* node_images,
       FunctionLibraryDefinition* library);
 
@@ -1712,9 +1715,13 @@ namespace {
 // matter because it will only be used subsequently for shape inference. (It
 // would be possible to add a switch statement over data_type to create a value
 // for the constant, but that would entail maintaining the logic as new types
-// are added, and is not necessary.)
-Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
-                         Graph* graph_out) {
+// are added, and is not necessary.) If the node being replaced was within a
+// control flow frame, adds appropriate Enter nodes so that the use of the Const
+// is well-formed.
+Node* AddDummyShapedNode(const Node* src_node, int src_port,
+                         const std::vector<ControlFlowInfo>& control_flow_info,
+                         const TensorShapeProto& shape, Graph* graph_out) {
+  DataType data_type = src_node->output_type(src_port);
   TensorProto dummy_proto;
   dummy_proto.set_dtype(data_type);
   *dummy_proto.mutable_tensor_shape() = shape;
@@ -1725,7 +1732,23 @@ Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
   NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
                            options.op_registry());
   node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
-  return options.FinalizeBuilder(&node_builder);
+  Node* node = options.FinalizeBuilder(&node_builder);
+  // Add any Enter nodes required to bring the constant to the correct control
+  // flow frame.
+  while (!control_flow_info[src_node->id()].frame_name.empty()) {
+    NodeBuilder enter_builder(options.GetNameForOp("Enter"), "Enter",
+                              options.op_registry());
+    enter_builder.Attr("frame_name",
+                       control_flow_info[src_node->id()].frame_name);
+    enter_builder.Attr("is_constant", true);
+    enter_builder.Input(node, 0);
+    Node* enter_node = options.FinalizeBuilder(&enter_builder);
+    // Adopt the new Enter node as the value in the current frame.
+    node = enter_node;
+    // Recurse to the parent frame to see if more Enter nodes need to be added.
+    src_node = control_flow_info[src_node->id()].parent_frame;
+  }
+  return node;
 }
 
 // Adds a copy of node_in to graph_out and adds the mapping to
@@ -1767,17 +1790,30 @@ Status CopyShapeInferenceNodeToGraph(
       }
     }
   }
+  // Work around the fact that Enter nodes refuse to propagate shape information
+  // unless they are marked loop invariant. Since we are never going to execute
+  // this graph, marking them all loop invariant is fine.
+  if (node_out->type_string() == "Enter") {
+    node_out->ClearAttr("is_constant");
+    node_out->AddAttr("is_constant", true);
+  }
   return Status::OK();
 }
 
 }  // namespace
 
 Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
-    const Graph& graph_in, const ShapeRefiner& shape_refiner,
+    const Graph& graph_in, const BackEdgeHelper& back_edge_helper,
+    const ShapeRefiner& shape_refiner,
     const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
     FunctionLibraryDefinition* library,
     std::vector<TensorShapeProto>* static_shape_out,
     std::unique_ptr<Graph>* graph_out) {
+  // Get the control flow structure of the input graph so we can build
+  // well-formed output graphs.
+  std::vector<ControlFlowInfo> control_flow_info;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(&graph_in, &control_flow_info));
+
   // Maps from nodes in graph_in to nodes in graph_out.
   //
   // When an edge has fully defined shape the source node in graph_in is
@@ -1802,7 +1838,6 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
 
   // We don't use the standard ReverseDFS because we want to cut off traversal
   // whenever we find an output with fully defined shape.
-  // TODO(misard) make this work properly in the presence of control flow.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving node?
@@ -1840,8 +1875,9 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
             TensorShapeProto proto;
             context->ShapeHandleToProto(shape, &proto);
             if (dummy_node_images.find(src_node) == dummy_node_images.end()) {
-              dummy_node_images[src_node] = AddDummyShapedNode(
-                  src_node->output_type(src_port), proto, graph_out->get());
+              dummy_node_images[src_node] =
+                  AddDummyShapedNode(src_node, src_port, control_flow_info,
+                                     proto, graph_out->get());
             }
             // The final input to the send node is the dynamic key, which we
             // don't include in the static shapes.
@@ -1889,6 +1925,38 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     }
   }
 
+  for (const auto edge : back_edge_helper.RemovedEdges()) {
+    if (copied_node_images.find(edge.dst) != copied_node_images.end()) {
+      // The destination of this back edge was added to the inference graph, so
+      // fix it up.
+      Node* dst = copied_node_images[edge.dst];
+      if (dst->type_string() != "Merge") {
+        return errors::InvalidArgument(
+            "outside_compilation cluster contains a back-edge to node ",
+            dst->name(), " of type ", dst->type_string(),
+            ". The analysis pass only supports back-edges to Merge nodes.");
+      }
+      const Edge* existing_input_edge;
+      if (edge.dst_input != 1 || dst->num_inputs() != 2 ||
+          !dst->input_edge(0, &existing_input_edge).ok()) {
+        // TODO(misard) if we see graphs built with a different structure, relax
+        // this constraint. Leaving it here for now to avoid writing unnecessary
+        // complex code since we believe graphs generated by front ends all have
+        // the back edge as the second input to the merge node.
+        return errors::Internal(
+            "Internal assumption failed while rewriting an outside_compilation "
+            "cluster that contains a while loop. Logic assumes back-edge is to "
+            "port 1 of a 2-input "
+            "Merge node.");
+      }
+      // Connect the existing edge to both inputs of the Merge node so that the
+      // graph will be well-formed.
+      (*graph_out)
+          ->AddEdge(existing_input_edge->src(),
+                    existing_input_edge->src_output(), dst, edge.dst_input);
+    }
+  }
+
   return Status::OK();
 }
 
@@ -1956,7 +2024,7 @@ Status Encapsulator::MakePrunedGraphCopyAndInline(
 
 Status Encapsulator::MakeGraphForOutsideCompilationSends(
     const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-    ShapeRefiner* shape_refiner,
+    BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner,
     std::unordered_map<const Node*, Node*>* node_images,
     FunctionLibraryDefinition* library) {
   // Find all the send_from_host nodes in all subgraphs, to use as roots for the
@@ -1978,10 +2046,15 @@ Status Encapsulator::MakeGraphForOutsideCompilationSends(
   // nodes, inlining any functions as needed.
   TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
       graph, send_from_host_nodes, pruned_graph, node_images, library));
+  FixupSourceAndSinkEdges(pruned_graph->get());
+
+  // Remove back edges from any cycles in the pruned graph to simplify shape
+  // inference traversal. They will be fixed up in the per-subgraph shape
+  // inference graphs stored in the function library.
+  TF_RETURN_IF_ERROR(back_edge_helper->Remove(pruned_graph->get()));
 
   // Perform shape inference on the pruned graph.
   shape_refiner->set_require_shape_inference_fns(false);
-  FixupSourceAndSinkEdges(pruned_graph->get());
   std::vector<Node*> post_order;
   GetReversePostOrder(*(*pruned_graph), &post_order);
   for (auto node : post_order) {
@@ -1999,11 +2072,13 @@ Status Encapsulator::MakeGraphForOutsideCompilationSends(
 
 Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
     Graph* graph_out, FunctionLibraryDefinition* library) {
+  BackEdgeHelper back_edge_helper;
   std::unique_ptr<Graph> pruned_graph;
   ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
   std::unordered_map<const Node*, Node*> node_images;
   TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
-      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
+      *graph_out, &pruned_graph, &back_edge_helper, &shape_refiner,
+      &node_images, library));
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("pruned_graph_for_shape_inference",
@@ -2033,7 +2108,7 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
       std::unique_ptr<Graph> graph;
       if (send_node != nullptr) {
         TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
-            *pruned_graph, shape_refiner, recv_at_host_names,
+            *pruned_graph, back_edge_helper, shape_refiner, recv_at_host_names,
             node_images[send_node], library, &static_shape, &graph));
         if (graph == nullptr) {
           VLOG(2) << "Send node  " << send_node->name() << " shapes";
diff --git a/tensorflow/compiler/jit/shape_inference_helpers.cc b/tensorflow/compiler/jit/shape_inference_helpers.cc
new file mode 100644
index 0000000000..d9cfa16526
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_helpers.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains helpers for use in shape inference.
+
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
+
+#include <vector>
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+Status BackEdgeHelper::Remove(Graph* graph) {
+  if (graph_ != nullptr) {
+    return errors::Internal("BackEdgeHelper duplicate call to Remove.");
+  }
+  graph_ = graph;
+  for (Node* n : graph_->nodes()) {
+    if (n->IsMerge()) {
+      for (const Edge* e : n->in_edges()) {
+        if (e->src()->IsNextIteration()) {
+          back_edges_.push_back(
+              BackEdge{e, e->src(), e->src_output(), e->dst(), e->dst_input()});
+        }
+      }
+    }
+  }
+  for (const BackEdge& be : back_edges_) {
+    graph_->RemoveEdge(be.edge);
+  }
+  return Status::OK();
+}
+
+const std::vector<BackEdgeHelper::BackEdge>& BackEdgeHelper::RemovedEdges()
+    const {
+  return back_edges_;
+}
+
+Status BackEdgeHelper::Replace() {
+  if (graph_ == nullptr) {
+    return errors::Internal("BackEdgeHelper Replace called before Remove.");
+  }
+  if (replaced_) {
+    return errors::Internal("BackEdgeHelper Replace called more than once.");
+  }
+  replaced_ = true;
+  for (const BackEdge& be : back_edges_) {
+    graph_->AddEdge(be.src, be.src_output, be.dst, be.dst_input);
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/shape_inference_helpers.h b/tensorflow/compiler/jit/shape_inference_helpers.h
new file mode 100644
index 0000000000..2f053c9a45
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_helpers.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
+#define TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
+
+#include <vector>
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Helper class to temporarily remove, then replace, the back edges in a
+// graph. Simple algorithms for shape inference don't work with cycles, and this
+// class can be used to remove cycles before running inference and replace them
+// after. Correct usage requires exactly one call to Remove(), followed by any
+// number of calls to RemovedEdges() and at most one call to Replace(). The call
+// to Replace() is optional if the graph will be discarded without being
+// executed, e.g., if it is being used purely for a shape inference pass.
+class BackEdgeHelper {
+ public:
+  struct BackEdge {
+    const Edge* edge;
+    Node* src;
+    int src_output;
+    Node* dst;
+    int dst_input;
+  };
+
+  BackEdgeHelper() = default;
+  // Disallows copy and assign.
+  BackEdgeHelper(const BackEdgeHelper& other) = delete;
+  BackEdgeHelper& operator=(const BackEdgeHelper& other) = delete;
+
+  // Temporarily removes all the back edges in graph.
+  Status Remove(Graph* graph);
+
+  // Gets the list of removed edges.
+  const std::vector<BackEdge>& RemovedEdges() const;
+
+  // Replaces the back edges removed by a prior call to Remove.
+  Status Replace();
+
+ private:
+  Graph* graph_ = nullptr;  // not owned
+  std::vector<BackEdge> back_edges_;
+  // Set once Replace has been called.
+  bool replaced_ = false;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
-- 
GitLab


From 4a405bc2d7398a0641632439652ec26e310d3359 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 11:23:44 -0700
Subject: [PATCH 0687/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192640621
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 19 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 19 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 30d4296326..a45a95ae09 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -68696,6 +68696,25 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListReserve"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0ed039ac2e..afb3dab3fe 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -32047,6 +32047,25 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListReserve"
   input_arg {
-- 
GitLab


From 3ebe39c6152e587137ab580b7e1ec6861f1f22cc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 11:35:39 -0700
Subject: [PATCH 0688/1262] Fix lost dependency

PiperOrigin-RevId: 192643127
---
 .../boosted_trees/estimator_batch/dnn_tree_combined_estimator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 449c130b2d..9994c84ebd 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -32,6 +32,7 @@ from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batc
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.python.feature_column import feature_column as feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-- 
GitLab


From 677156e7e857893fdf4acb8a9931fe2c97ab3246 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 12 Apr 2018 11:40:02 -0700
Subject: [PATCH 0689/1262] Make changes as per reviewer request

---
 tensorflow/contrib/tensorrt/BUILD                     | 11 +++++------
 .../tensorrt/resources/trt_resource_manager.cc        |  8 +++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2a55a49097..2ee0c4589c 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -57,7 +57,6 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        # ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -195,11 +194,11 @@ tf_py_wrap_cc(
 )
 
 tf_cuda_library(
-  name = "trt_resource_manager_impl",
-  srcs = [
-    "resources/trt_resource_manager.cc",
+    name = "trt_resource_manager_impl",
+    srcs = [
+        "resources/trt_resource_manager.cc",
     ],
-  hdrs = [
+    hdrs = [
         "resources/trt_resource_manager.h",
     ],
     deps = [
@@ -230,7 +229,7 @@ tf_cuda_library(
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
     ]) + if_static([
-      ":trt_resource_manager_impl",
+        ":trt_resource_manager_impl",
     ]),
 )
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index b9a5a00366..9c3698e5d1 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,11 +19,9 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
-std::shared_ptr<tensorflow::tensorrt::TRTResourceManager>
-tensorflow::tensorrt::TRTResourceManager::instance()
-{
-  static std::shared_ptr<tensorflow::tensorrt::TRTResourceManager> instance_(
-    new tensorflow::tensorrt::TRTResourceManager);
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
   return instance_;
 }
 
-- 
GitLab


From 024b037e9ad430c4023e3c9d250f3934f38de5cf Mon Sep 17 00:00:00 2001
From: Pete Warden <petewarden@google.com>
Date: Thu, 12 Apr 2018 11:45:02 -0700
Subject: [PATCH 0690/1262] Fixed error where no background audio samples were
 being used when testing no-speech clips

PiperOrigin-RevId: 192644704
---
 tensorflow/examples/speech_commands/input_data.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index e7db9cddf0..63dd18457f 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -457,7 +457,7 @@ class AudioProcessor(object):
           self.time_shift_offset_placeholder_: time_shift_offset,
       }
       # Choose a section of background noise to mix in.
-      if use_background:
+      if use_background or sample['label'] == SILENCE_LABEL:
         background_index = np.random.randint(len(self.background_data))
         background_samples = self.background_data[background_index]
         background_offset = np.random.randint(
@@ -465,7 +465,9 @@ class AudioProcessor(object):
         background_clipped = background_samples[background_offset:(
             background_offset + desired_samples)]
         background_reshaped = background_clipped.reshape([desired_samples, 1])
-        if np.random.uniform(0, 1) < background_frequency:
+        if sample['label'] == SILENCE_LABEL:
+          background_volume = np.random.uniform(0, 1)
+        elif np.random.uniform(0, 1) < background_frequency:
           background_volume = np.random.uniform(0, background_volume_range)
         else:
           background_volume = 0
-- 
GitLab


From 10e60219b71fc48e07b0afaa6edeec2d9afac24d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 11:46:26 -0700
Subject: [PATCH 0691/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 192644946

---
 tensorflow/go/op/wrappers.go | 184 +++++++++++++++++------------------
 1 file changed, 92 insertions(+), 92 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 2d3e369328..1d5ebf6687 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -4932,6 +4932,70 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// var: Should be from a Variable().
+//
+// Arguments:
+//
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
@@ -12327,34 +12391,6 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd
 	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT3D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Adds `bias` to `value`.
 //
 // This is a deprecated version of BiasAdd and will be soon removed.
@@ -19183,6 +19219,34 @@ func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Inverse 3D fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT3D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Deprecated. Disallowed in GraphDef version >= 2.
 //
 // DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
@@ -22625,70 +22689,6 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Return substrings from `Tensor` of strings.
 //
 // For each string in the input `Tensor`, creates a substring starting at index
-- 
GitLab


From 454a22aa29dc2dba355094aabe733cd8419f2788 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 11:51:34 -0700
Subject: [PATCH 0692/1262] Construct Orthogonal kernels for 2d convolutions.

PiperOrigin-RevId: 192645769
---
 tensorflow/contrib/framework/__init__.py      |   2 +
 .../python/kernel_tests/init_ops_test.py      |  99 +++++++++-
 tensorflow/python/ops/init_ops.py             | 186 +++++++++++++++++-
 3 files changed, 282 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index cbb68bd3eb..a52907f163 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -72,6 +72,7 @@ See the @{$python/contrib.framework} guide.
 @@variable
 @@VariableDeviceChooser
 @@convolutional_delta_orthogonal
+@@convolutional_orthogonal_2d
 @@zero_initializer
 
 @@load_checkpoint
@@ -116,6 +117,7 @@ from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['nest']
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 1e5c118cbc..f7a7119b34 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -551,7 +551,6 @@ class OrthogonalInitializerTest(test.TestCase):
       init2 = init_ops.orthogonal_initializer(gain=3.14, seed=1, dtype=dtype)
       with self.test_session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
-      with self.test_session(graph=ops.Graph(), use_gpu=True):
         t2 = init2(shape).eval()
       return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
 
@@ -610,7 +609,6 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
                                                       seed=1, dtype=dtype)
       with self.test_session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
-      with self.test_session(graph=ops.Graph(), use_gpu=True):
         t2 = init2(shape).eval()
       return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
 
@@ -674,6 +672,103 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
 
+class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_2d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_2d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_2d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Pad input_ for computing (circular) convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+      beg = kernel_size // 2
+      end = kernel_size - 1 - beg
+
+      tmp_up = array_ops.slice(input_, [0, width - beg, 0, 0],
+                               [-1, beg, width, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0, 0], [-1, end, width, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      new_width = width + kernel_size - 1
+      tmp_left = array_ops.slice(tmp, [0, 0, width - beg, 0],
+                                 [-1, new_width, beg, -1])
+      tmp_right = array_ops.slice(tmp, [0, 0, 0, 0], [-1, new_width, end, -1])
+
+      final = array_ops.concat([tmp_left, tmp, tmp_right], 2)
+      return final
+
+    cout = 45
+    shape = [64, 28, 28, 32]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]:
+      convolution = convolutional.conv2d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size, use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_2d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(
+            sess.run(inputs_2norm)/np.sqrt(np.prod(shape)),
+            sess.run(outputs_2norm)/(np.sqrt(np.prod(shape))*np.sqrt(gain)),
+            rtol=tol, atol=tol)
+
+
 class IdentityInitializerTest(test.TestCase):
 
   def testInvalidDataType(self):
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 9dfe5ffbf4..5ded3f7cc2 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -499,10 +499,10 @@ class Orthogonal(Initializer):
 
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix
-    dtype: The type of the output.
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
+    dtype: The data type.
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -552,10 +552,10 @@ class ConvolutionDeltaOrthogonal(Initializer):
     gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
-    dtype: The type of the output.
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
+    dtype: The data type.
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -581,7 +581,6 @@ class ConvolutionDeltaOrthogonal(Initializer):
     q, r = linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
-    # ph = d / math_ops.abs(d)
     q *= math_ops.sign(d)
     q = q[:shape[-2], :]
     q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
@@ -601,6 +600,186 @@ class ConvolutionDeltaOrthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
+class ConvolutionOrthogonal2D(Initializer):
+  """Initializer that generates a 2D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 2. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed}
+      for behavior.
+    dtype: The data type.
+  """
+
+  def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
+    self.gain = gain
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    # Check the shape
+    if len(shape) != 4:
+      raise ValueError("The tensor to initialize must be four-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    if shape[0] != shape[1]:
+      raise ValueError("Kernel sizes must be equal.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def get_config(self):
+    return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
+
+  # Helper functions.
+  def _orthogonal_matrix(self, n):
+    """Construct an n x n orthogonal matrix.
+
+    Args:
+      n: dimension.
+    Returns:
+      a n x n orthogonal matrix.
+    """
+    a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
+    if self.seed:
+      self.seed += 1
+    q, r = linalg_ops.qr(a)
+    d = array_ops.diag_part(r)
+    # make q uniform
+    q *= math_ops.sign(d)
+    return q
+
+  def _symmetric_projection(self, n):
+    """Compute a n x n symmetric projection matrix.
+
+    Args:
+      n: dimension.
+    Returns:
+      a n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
+    """
+    q = self._orthogonal_matrix(n)
+    # randomly zeroing out some columns
+    mask = math_ops.cast(random_ops.random_normal([n], seed=self.seed) > 0,
+                         self.dtype)
+    if self.seed:
+      self.seed += 1
+    c = math_ops.multiply(q, mask)
+    return math_ops.matmul(c, array_ops.matrix_transpose(c))
+
+  def _dict_to_tensor(self, x, k1, k2):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: a k1 * k2 dictionary.
+      k1: first dimension of x.
+      k2: second dimension of x.
+    Returns:
+      a k1 * k2 tensor.
+    """
+
+    return array_ops.stack([array_ops.stack([x[i, j] for j in range(k2)])
+                            for i in range(k1)])
+
+  def _block_orth(self, p1, p2):
+    """Construct a 2 x 2 kernel. Used to construct orthgonal kernel.
+
+    Args:
+      p1: a symmetric projection matrix
+      p2: a symmetric projection matrix
+    Returns:
+      a 2 x 2 kernel [[p1p2,         p1(1-p2)],
+                      [(1-p1)p2, (1-p1)(1-p2)]].
+    Raises:
+      ValueError: if the dimensions of p1 and p2 are different.
+    """
+    if p1.shape.as_list() != p2.shape.as_list():
+      raise ValueError("The dimension of the matrices must be the same.")
+    n = p1.shape.as_list()[0]
+    kernel2x2 = {}
+    eye = linalg_ops.eye(n, dtype=self.dtype)
+    kernel2x2[0, 0] = math_ops.matmul(p1, p2)
+    kernel2x2[0, 1] = math_ops.matmul(p1, (eye - p2))
+    kernel2x2[1, 0] = math_ops.matmul((eye - p1), p2)
+    kernel2x2[1, 1] = math_ops.matmul((eye - p1), (eye - p2))
+
+    return kernel2x2
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: is a k x k dictionary, each element is a n x n matrix.
+      m2: is a l x l dictionary, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1) * (k + l - 1) dictionary each element is a n x n matrix.
+    Raises:
+      ValueError: if the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0, 0]).shape.as_list()[0]
+    if n != (m2[0, 0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = int(np.sqrt(len(m1)))
+    l = int(np.sqrt(len(m2)))
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      for j in range(size):
+        result[i, j] = array_ops.zeros([n, n], self.dtype)
+        for index1 in range(min(k, i + 1)):
+          for index2 in range(min(k, j + 1)):
+            if (i - index1) < l and (j - index2) < l:
+              result[i, j] += math_ops.matmul(m1[index1, index2],
+                                              m2[i - index1, j - index2])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: kernel size
+      cin: number of input channels
+      cout: number of output channels
+    Returns:
+      an [ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: if cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(array_ops.expand_dims(orth, 0), 0)
+
+    p = self._block_orth(self._symmetric_projection(cout),
+                         self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout),
+                              self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      for j in range(ksize):
+        p[i, j] = math_ops.matmul(orth, p[i, j])
+
+    return self._dict_to_tensor(p, ksize, ksize)
+
+
 @tf_export("keras.initializers.Identity", "initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
@@ -646,6 +825,7 @@ variance_scaling_initializer = VarianceScaling
 orthogonal_initializer = Orthogonal
 identity_initializer = Identity
 convolutional_delta_orthogonal = ConvolutionDeltaOrthogonal
+convolutional_orthogonal_2d = ConvolutionOrthogonal2D
 # pylint: enable=invalid-name
 
 
-- 
GitLab


From 583ee0eabfb1bebd0eb533d2ab7a5c17af7e664e Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Thu, 12 Apr 2018 11:54:21 -0700
Subject: [PATCH 0693/1262] Add testCompileTimeConstantsInDefun in xla

PiperOrigin-RevId: 192646199
---
 tensorflow/compiler/tests/function_test.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index 11d8a99ffe..fbc3c994d1 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -105,6 +105,28 @@ class FunctionTest(XLATestCase):
       result = sess.run(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
+  def testCompileTimeConstantsInDefun(self):
+    """Tests that XLA handles compile-time constants in defuns."""
+    with self.test_session() as sess:
+
+      @function.Defun(dtypes.float32, dtypes.int32, dtypes.int32)
+      def Foo(a, c, d):
+        # c and d must be known at compile time
+        x = array_ops.slice(a, c, d)
+        return x
+
+      a = array_ops.placeholder(dtypes.float32)
+      c = array_ops.placeholder(dtypes.int32, shape=[4])
+      d = array_ops.placeholder(dtypes.int32, shape=[4])
+      with self.test_scope():
+        call_f = Foo(a, c, d)
+      result = sess.run(call_f, feed_dict={
+          a: np.ones([1, 4, 4, 1]),
+          c: [0, 0, 0, 0],
+          d: [1, 2, 2, 1]})
+
+    self.assertAllEqual(np.ones([1, 2, 2, 1]), result)
+
   # TODO(b/36139787): Re-enable this test when noinline works again.
   def DISABLED_testFunctionsNoInline(self):
 
-- 
GitLab


From d1ee67c03a29d93fecd427f1a4693cb3fd6e6e38 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 12 Apr 2018 11:59:08 -0700
Subject: [PATCH 0694/1262] Start moving Checkpointable utilities toward core

Doesn't add to the public API yet, just shifts code around. Changes:
  - A tiny bit of renaming (to avoid having _Checkpoint and Checkpoint in the same file)
  - Removed the garbage collection decorator from a few tests due to the uuid4() garbage issue (apparently core tests get run on Python 2.7.9?)
  - Renamed "Object" to "CheckpointableObject" in the proto, since core protos have Java bindings and apparently Java had something else in mind for the keyword "Object" :)
but otherwise this is a pure move.

After this CL I'll propose adding tf.train.Checkpoint to the API (currently tf.contrib.eager.Checkpoint), move the utilities that are still in contrib/eager to their own contrib directory (there will be a few more misc. utilities for inspecting checkpoints and managing dependencies), get tf.train.Saver to read object-based checkpoints for compatibility, and work on Model.save_weights/load_weights.

PiperOrigin-RevId: 192646890
---
 tensorflow/contrib/cmake/python_modules.txt   |    1 -
 tensorflow/contrib/cmake/python_protos.txt    |    1 -
 .../python/kernel_tests/cudnn_rnn_test.py     |    2 +-
 tensorflow/contrib/eager/proto/BUILD          |   13 -
 tensorflow/contrib/eager/python/BUILD         |   13 -
 .../eager/python/checkpointable_utils.py      |  846 -----------
 .../eager/python/checkpointable_utils_test.py | 1284 +---------------
 .../contrib/eager/python/datasets_test.py     |    2 +-
 .../eager/python/examples/spinn/spinn_test.py |    2 +-
 .../contrib/eager/python/metrics_test.py      |    2 +-
 tensorflow/contrib/eager/python/tfe.py        |    4 +-
 .../optimizer_v2/checkpointable_utils_test.py |    2 +-
 tensorflow/core/BUILD                         |    1 +
 .../checkpointable_object_graph.proto         |   12 +-
 tensorflow/python/BUILD                       |   35 +
 .../python/training/checkpointable_utils.py   |  850 ++++++++++-
 .../training/checkpointable_utils_test.py     | 1308 +++++++++++++++++
 17 files changed, 2207 insertions(+), 2171 deletions(-)
 delete mode 100644 tensorflow/contrib/eager/proto/BUILD
 rename tensorflow/{contrib/eager/proto => core/protobuf}/checkpointable_object_graph.proto (85%)
 create mode 100644 tensorflow/python/training/checkpointable_utils_test.py

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index de84af866b..91839194c7 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -170,7 +170,6 @@ tensorflow/contrib/distributions/python
 tensorflow/contrib/distributions/python/ops
 tensorflow/contrib/distributions/python/ops/bijectors
 tensorflow/contrib/eager
-tensorflow/contrib/eager/proto
 tensorflow/contrib/eager/python
 tensorflow/contrib/estimator
 tensorflow/contrib/estimator/python
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 0c80d529af..d63c41db84 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -5,7 +5,6 @@ tensorflow/python
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/cloud/kernels
 tensorflow/contrib/decision_trees/proto
-tensorflow/contrib/eager/proto
 tensorflow/contrib/gdr
 tensorflow/contrib/lite/toco
 tensorflow/contrib/mpi
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 9cc6ca09ad..6fb56b0858 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -29,7 +29,6 @@ import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -55,6 +54,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
deleted file mode 100644
index b016d2dcb5..0000000000
--- a/tensorflow/contrib/eager/proto/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-
-tf_proto_library(
-    name = "checkpointable_object_graph_proto",
-    srcs = [
-        "checkpointable_object_graph.proto",
-    ],
-    visibility = ["//tensorflow/contrib/eager/python:__subpackages__"],
-)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index d97048405d..04e2d99048 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -230,21 +230,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
     ],
 )
 
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py
index 34cb8d0e08..30c4103c5a 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils.py
@@ -17,857 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-import collections
 import functools
-import weakref
 
-from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable as core_checkpointable
-from tensorflow.python.training import checkpointable_utils as core_checkpointable_utils
-from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.util import deprecation
-
-
-_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
-
-# Keyword for identifying that the next bit of a checkpoint variable name is a
-# slot name. Checkpoint names for slot variables look like:
-#
-#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
-#
-# Where <path to variable> is a full path from the checkpoint root to the
-# variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
-# Keyword for separating the path to an object from the name of an
-# attribute in checkpoint names. Used like:
-#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
-_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-# Key where the object graph proto is saved in a TensorBundle
-_OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
-
-
-# TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
-# or consolidating the implementation with get_variable.
-def _default_getter(name, shape, dtype, initializer=None,
-                    partition_info=None, **kwargs):
-  """A pared-down version of get_variable which does not reuse variables."""
-  dtype = dtypes.as_dtype(dtype)
-  shape_object = tensor_shape.as_shape(shape)
-  with ops.init_scope():
-    if initializer is None:
-      initializer, initializing_from_value = (
-          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
-              name=name, shape=shape_object, dtype=dtype))
-    else:
-      initializing_from_value = not callable(initializer)
-    # Same logic as get_variable
-    variable_dtype = dtype.base_dtype
-    if initializing_from_value:
-      if shape is not None:
-        raise ValueError("If initializer is a constant, do not specify shape.")
-      initial_value = initializer
-    else:
-      # Instantiate initializer if provided initializer is a type object.
-      if isinstance(initializer, type(init_ops.Initializer)):
-        initializer = initializer(dtype=dtype)
-      def initial_value():
-        return initializer(
-            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
-    return resource_variable_ops.ResourceVariable(
-        initial_value=initial_value,
-        name=name,
-        dtype=variable_dtype,
-        **kwargs
-    )
-
-
-def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
-                 initializer=None):
-  """Add a variable to a Checkpointable with no scope influence."""
-  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
-      name=name, shape=shape, dtype=dtype,
-      initializer=initializer, getter=_default_getter)
-
-
-def _breadth_first_checkpointable_traversal(root_checkpointable):
-  """Find shortest paths to all variables owned by dependencies of root."""
-  bfs_sorted = []
-  to_visit = collections.deque([root_checkpointable])
-  path_to_root = {root_checkpointable: ()}
-  while to_visit:
-    current_checkpointable = to_visit.popleft()
-    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-    bfs_sorted.append(current_checkpointable)
-    for child_checkpointable in (
-        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
-      if child_checkpointable.ref not in path_to_root:
-        path_to_root[child_checkpointable.ref] = (
-            path_to_root[current_checkpointable] + (child_checkpointable,))
-        to_visit.append(child_checkpointable.ref)
-  return bfs_sorted, path_to_root
-
-
-def _escape_local_name(name):
-  # We need to support slashes in local names for compatibility, since this
-  # naming scheme is being patched in to things like Layer.add_variable where
-  # slashes were previously accepted. We also want to use slashes to indicate
-  # edges traversed to reach the variable, so we escape forward slashes in
-  # names.
-  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
-          .replace(r"/", _ESCAPE_CHAR + "S"))
-
-
-def _object_prefix_from_path(path_to_root):
-  return "/".join(
-      (_escape_local_name(checkpointable.name)
-       for checkpointable in path_to_root))
-
-
-def _slot_variable_naming_for_optimizer(optimizer_path):
-  """Make a function for naming slot variables in an optimizer."""
-  # Name slot variables:
-  #
-  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
-  #
-  # where <variable name> is exactly the checkpoint name used for the original
-  # variable, including the path from the checkpoint root and the local name in
-  # the object which owns it. Note that we only save slot variables if the
-  # variable it's slotting for is also being saved.
-
-  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
-
-  def _name_slot_variable(variable_path, slot_name):
-    """With an optimizer specified, name a slot variable."""
-    return (variable_path
-            + optimizer_identifier
-            + _escape_local_name(slot_name))
-
-  return _name_slot_variable
-
-
-def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
-  """Gather and name slot variables."""
-  non_slot_objects = list(checkpointable_objects)
-  slot_variables = {}
-  for checkpointable in non_slot_objects:
-    if isinstance(checkpointable, optimizer_lib.Optimizer):
-      naming_scheme = _slot_variable_naming_for_optimizer(
-          optimizer_path=object_names[checkpointable])
-      slot_names = checkpointable.get_slot_names()
-      for slot_name in slot_names:
-        for original_variable_node_id, original_variable in enumerate(
-            non_slot_objects):
-          try:
-            slot_variable = checkpointable.get_slot(
-                original_variable, slot_name)
-          except AttributeError:
-            slot_variable = None
-          if slot_variable is None:
-            continue
-          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
-            # TODO(allenl): Gather dependencies of slot variables.
-            raise NotImplementedError(
-                "Currently only variables with no dependencies can be saved as "
-                "slot variables. File a feature request if this limitation "
-                "bothers you.")
-          if slot_variable in node_ids:
-            raise NotImplementedError(
-                "A slot variable was re-used as a dependency of a "
-                "Checkpointable object. This is not currently allowed. File a "
-                "feature request if this limitation bothers you.")
-          checkpoint_name = naming_scheme(
-              variable_path=object_names[original_variable],
-              slot_name=slot_name)
-          object_names[slot_variable] = checkpoint_name
-          slot_variable_node_id = len(checkpointable_objects)
-          node_ids[slot_variable] = slot_variable_node_id
-          checkpointable_objects.append(slot_variable)
-          slot_variable_proto = (
-              checkpointable_object_graph_pb2.CheckpointableObjectGraph
-              .Object.SlotVariableReference(
-                  slot_name=slot_name,
-                  original_variable_node_id=original_variable_node_id,
-                  slot_variable_node_id=slot_variable_node_id))
-          slot_variables.setdefault(checkpointable, []).append(
-              slot_variable_proto)
-  return slot_variables
-
-
-def _serialize_checkpointables(
-    checkpointable_objects, node_ids, object_names, slot_variables):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  named_saveables = {}
-
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
-    object_name = object_names[checkpointable]
-    for name, saveable_factory in (
-        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
-      attribute = object_proto.attributes.add()
-      attribute.name = name
-      attribute.checkpoint_key = "%s/%s/%s" % (
-          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      if callable(saveable_factory):
-        saveable = saveable_factory(name=attribute.checkpoint_key)
-      else:
-        saveable = saveable_factory
-      # Figure out the name-based Saver's name for this variable.
-      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
-          [saveable], convert_variable_to_tensor=False)
-      attribute.full_name, = saver_dict.keys()
-      named_saveables[attribute.checkpoint_key] = saveable
-
-    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      child_proto = object_proto.children.add()
-      child_proto.node_id = node_ids[child.ref]
-      child_proto.local_name = child.name
-
-  return named_saveables, object_graph_proto
-
-
-def _serialize_object_graph(root_checkpointable):
-  """Determine checkpoint keys for variables and build a serialized graph.
-
-  Non-slot variables are keyed based on a shortest path from the root saveable
-  to the object which owns the variable (i.e. the one which called
-  `Checkpointable._add_variable` to create it).
-
-  Slot variables are keyed based on a shortest path to the variable being
-  slotted for, a shortest path to their optimizer, and the slot name.
-
-  Args:
-    root_checkpointable: A `Checkpointable` object whose variables (including
-      the variables of dependencies, recursively) should be saved.
-
-  Returns:
-    A tuple of (named_variables, object_graph_proto):
-      named_variables: A dictionary mapping names to variable objects.
-      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
-        the serialized object graph and variable references.
-
-  Raises:
-    ValueError: If there are invalid characters in an optimizer's slot names.
-  """
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = {
-      obj: _object_prefix_from_path(path)
-      for obj, path in path_to_root.items()}
-  node_ids = {node: node_id for node_id, node
-              in enumerate(checkpointable_objects)}
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return _serialize_checkpointables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names,
-      slot_variables=slot_variables)
-
-
-def gather_initializers(root_checkpointable):
-  """Traverse the object graph and find initialization ops.
-
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable` and which have an `initializer` property. Includes
-  initializers for slot variables only if the variable they are slotting for and
-  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
-  saved with a checkpoint).
-
-  Args:
-    root_checkpointable: A `Checkpointable` object to gather initializers for.
-  Returns:
-    A list of initialization ops.
-  """
-  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
-  # to run.
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = {
-      obj: _object_prefix_from_path(path)
-      for obj, path in path_to_root.items()}
-  node_ids = {node: node_id for node_id, node
-              in enumerate(checkpointable_objects)}
-  _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return [c.initializer for c in checkpointable_objects
-          if hasattr(c, "initializer") and c.initializer is not None]
-
-
-class _NoRestoreSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
-
-  def __init__(self, tensor, name):
-    spec = saver_lib.BaseSaverBuilder.SaveSpec(tensor, "", name)
-    super(_NoRestoreSaveable, self).__init__(tensor, [spec], name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    return control_flow_ops.no_op()
-
-
-class _LoadStatus(object):
-  """Abstract base for load status callbacks."""
-
-  @abc.abstractmethod
-  def assert_consumed(self):
-    """Raises an exception unless a non-trivial restoration has completed."""
-    pass
-
-  @abc.abstractmethod
-  def run_restore_ops(self, session=None):
-    """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
-    pass
-
-  @abc.abstractmethod
-  def initialize_or_restore(self, session=None):
-    """Runs restore ops from the checkpoint, or initializes variables."""
-    pass
-
-
-class CheckpointLoadStatus(_LoadStatus):
-  """Checks the status of checkpoint loading and manages restore ops.
-
-  Returned from `Saver.restore`. Since `restore` may defer the loading of values
-  in the checkpoint which don't yet have corresponding Python objects,
-  `CheckpointLoadStatus` provides a callback to verify that checkpoint loading
-  is complete (`assert_consumed`).
-
-  When graph building, `restore` does not run restore ops itself since their
-  creation may be deferred. The `run_restore_ops` method must be called once all
-  Python objects with values to restore have been created and added to the
-  dependency graph (this does not necessarily have to be the whole checkpoint;
-  calling `run_restore_ops` while `assert_consumed` fails is supported and will
-  partially restore the checkpoint).
-
-  See `Saver.restore` for usage examples.
-  """
-
-  def __init__(self, checkpoint, feed_dict):
-    self._checkpoint = checkpoint
-    self._feed_dict = feed_dict
-
-  def assert_consumed(self):
-    """Asserts that all objects in the checkpoint have been created/matched.
-
-    Returns:
-      `self` for chaining.
-    Raises:
-      AssertionError: If there are any Python objects in the dependency graph
-        which have not been restored from this checkpoint or a later `restore`,
-        or if there are any checkpointed values which have not been matched to
-        Python objects.
-    """
-    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if checkpointable is None:
-        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
-      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
-        raise AssertionError(
-            "Object not assigned a value from checkpoint: %s" % (node,))
-    if self._checkpoint.slot_restorations:
-      # Sanity check; this collection should be clear if everything has been
-      # restored.
-      raise AssertionError("Unresolved slot restorations: %s" % (
-          self._checkpoint.slot_restorations,))
-    if self._checkpoint.unused_attributes:
-      raise AssertionError(
-          ("Unused attributes in these objects (the attributes exist in the "
-           "checkpoint but not in the objects): %s") % (
-               self._checkpoint.unused_attributes.items(),))
-    return self
-
-  def run_restore_ops(self, session=None):
-    """Run operations to restore objects in the dependency graph."""
-    if context.executing_eagerly():
-      return  # Run eagerly
-    if session is None:
-      session = ops.get_default_session()
-    session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
-
-  def initialize_or_restore(self, session=None):
-    """Alias for `run_restore_ops`.
-
-    This method has a sibling in `InitializationOnlyStatus` which instead
-    initializes variables. That type is returned if no checkpoint is specified
-    in `Saver.restore`.
-
-    Args:
-      session: The session to run restore ops in. If `None`, uses the default
-        session.
-    """
-    self.run_restore_ops(session=session)
-
-
-class InitializationOnlyStatus(_LoadStatus):
-  """Returned from `Saver.restore` when no checkpoint has been specified.
-
-  Objects of this type have the same `assert_consumed` method as
-  `CheckpointLoadStatus`, but it always fails. However,
-  `initialize_or_restore` works on objects of both types, and will
-  initialize variables in `InitializationOnlyStatus` objects or restore them
-  otherwise.
-  """
-
-  def __init__(self, root_checkpointable):
-    self._root_checkpointable = root_checkpointable
-
-  def assert_consumed(self):
-    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
-    raise AssertionError(
-        "No checkpoint specified (save_path=None); nothing is being restored.")
-
-  def run_restore_ops(self, session=None):
-    """For consistency with `CheckpointLoadStatus`.
-
-    Use `initialize_or_restore` for initializing if no checkpoint was passed
-    to `Saver.restore` and restoring otherwise.
-
-    Args:
-      session: Not used.
-    """
-    raise AssertionError(
-        "No checkpoint specified, so no restore ops are available "
-        "(save_path=None to Saver.restore).")
-
-  def initialize_or_restore(self, session=None):
-    """Runs initialization ops for variables.
-
-    Only objects which would be saved by `Saver.save` will be initialized. See
-    `gather_initializers` for details.
-
-    This method does nothing when executing eagerly (initializers get run
-    eagerly).
-
-    Args:
-      session: The session to run initialization ops in. If `None`, uses the
-        default session.
-    """
-    if context.executing_eagerly():
-      return  # run eagerly
-    if session is None:
-      session = ops.get_default_session()
-    session.run(gather_initializers(self._root_checkpointable))
-
-
-_DEPRECATED_RESTORE_INSTRUCTIONS = (
-    "Restoring a name-based tf.train.Saver checkpoint using the object-based "
-    "restore API. This mode uses global names to match variables, and so is "
-    "somewhat fragile. It also adds new restore ops to the graph each time it "
-    "is called. Prefer re-encoding training checkpoints in the object-based "
-    "format: run save() on the object-based saver (the same one this message "
-    "is coming from) and use that checkpoint in the future.")
-
-
-class NameBasedSaverStatus(_LoadStatus):
-  """Status for loading a name-based training checkpoint."""
-
-  def __init__(self, object_saver, save_path):
-    self._object_saver = object_saver
-    self._save_path = save_path
-
-  def assert_consumed(self):
-    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
-    raise AssertionError(
-        "Restoring a name-based checkpoint. No load status is available.")
-
-  @deprecation.deprecated(
-      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
-  def run_restore_ops(self, session=None):
-    """Load the name-based training checkpoint using a new `tf.train.Saver`."""
-    if session is None and not context.executing_eagerly():
-      session = ops.get_default_session()
-    with ops.device("/cpu:0"):
-      saver_lib.Saver(self._object_saver._global_variable_names()).restore(  # pylint: disable=protected-access
-          sess=session, save_path=self._save_path)
-
-  def initialize_or_restore(self, session=None):
-    """Alias for `run_restore_ops`."""
-    self.run_restore_ops(session=session)
-
-
-class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
-  """Pretends to be a session, inserts extra feeds on run()."""
-
-  def __init__(self, session, feed_additions):
-    self._wrapped_session = session
-    self._feed_additions = feed_additions
-
-  def run(self, fetches, feed_dict=None, **kwargs):
-    if feed_dict is None:
-      feed_dict = {}
-    else:
-      feed_dict = feed_dict.copy()
-    feed_dict.update(self._feed_additions)
-    return self._wrapped_session.run(
-        fetches=fetches, feed_dict=feed_dict, **kwargs)
-
-
-def _copy_saver_with_new_var_list(old_saver, new_var_list):
-  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
-  new_saver = saver_lib.Saver(var_list=new_var_list)
-  # TODO(allenl): Move to copying functionality to Saver?
-  # pylint: disable=protected-access
-  new_saver._last_checkpoints = old_saver._last_checkpoints
-  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
-  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
-  # pylint: enable=protected-access
-  return new_saver
-
-
-class CheckpointableSaver(object):
-  """Saves and restores a `Checkpointable` object and its dependencies.
-
-  See `Checkpointable` for details of dependency management. `Saver` wraps
-  `tf.train.Saver` for saving, including extra information about the graph of
-  dependencies between Python objects. When restoring, it uses this information
-  about the save-time dependency graph to more robustly match objects with their
-  checkpointed values. When executing eagerly, it supports restoring variables
-  on object creation (see `Saver.restore`).
-
-  Values in a checkpoint are mapped to `Checkpointable` Python objects
-  (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
-  checkpoint was written. To avoid breaking existing checkpoints when modifying
-  a class, dependency names (the names of attributes to which `Checkpointable`
-  objects are assigned) may not change. These names are local to objects, in
-  contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
-  so allow additional program transformations.
-  """
-
-  def __init__(self, root_checkpointable):
-    """Configure saving.
-
-    Args:
-      root_checkpointable: The root of the object graph to save/restore. This
-        object and all of its dependencies are saved in the checkpoint. When
-        restoring, objects are matched and restored starting from this root.
-    """
-    # Allow passing in a weak reference to avoid reference cycles when
-    # `Checkpointable` objects save themselves.
-    self._root_checkpointable_ref = root_checkpointable
-    if not context.executing_eagerly():
-      with ops.device("/cpu:0"):
-        self._file_prefix_placeholder = constant_op.constant("model")
-    else:
-      self._file_prefix_placeholder = None
-
-    # Op caching for save
-    self._object_graph_feed_tensor = None
-    self._last_save_object_graph = None
-    self._last_save_saver = None
-
-    # Op caching for restore
-    self._last_restore_object_graph = None
-    self._last_restore_checkpoint = None
-
-  @property
-  def _root_checkpointable(self):
-    if isinstance(self._root_checkpointable_ref, weakref.ref):
-      derefed = self._root_checkpointable_ref()
-      assert derefed is not None
-      return derefed
-    else:
-      return self._root_checkpointable_ref
-
-  def save(self, file_prefix, checkpoint_number=None, session=None):
-    """Save a training checkpoint.
-
-    The saved checkpoint includes variables created by this object and any
-    Checkpointable objects it depends on at the time `Saver.save()` is called.
-
-    Args:
-      file_prefix: A prefix to use for the checkpoint filenames
-        (/path/to/directory/and_a_prefix). Names are generated based on this
-        prefix and `checkpoint_number`, if provided.
-      checkpoint_number: An integer variable or Tensor, used to number
-        checkpoints. Typically this value is saved along with other variables in
-        training checkpoints, which will happen automatically if it was created
-        by `root_checkpointable` or one of its dependencies (via
-        `Checkpointable._add_variable`).
-      session: The session to evaluate variables in. Ignored when executing
-        eagerly. If not provided when graph building, the default session is
-        used.
-
-    Returns:
-      The full path to the checkpoint.
-    """
-    named_variables, graph_proto = _serialize_object_graph(
-        self._root_checkpointable)
-    if not context.executing_eagerly():
-      if session is None:
-        session = ops.get_default_session()
-      if self._object_graph_feed_tensor is None:
-        with ops.device("/cpu:0"):
-          self._object_graph_feed_tensor = constant_op.constant(
-              "", dtype=dtypes.string)
-      object_graph_tensor = self._object_graph_feed_tensor
-      feed_additions = {object_graph_tensor: graph_proto.SerializeToString()}
-    else:
-      session = None
-      with ops.device("/cpu:0"):
-        object_graph_tensor = constant_op.constant(
-            graph_proto.SerializeToString(), dtype=dtypes.string)
-      feed_additions = None
-    assert _OBJECT_GRAPH_PROTO_KEY not in named_variables
-    named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
-        tensor=object_graph_tensor,
-        name=_OBJECT_GRAPH_PROTO_KEY)
-    if (self._last_save_object_graph != graph_proto
-        # When executing eagerly, we need to re-create SaveableObjects each time
-        # save() is called so they pick up new Tensors passed to their
-        # constructors. That means the Saver needs to be copied with a new
-        # var_list.
-        or context.executing_eagerly()):
-      if self._last_save_object_graph is not None:
-        self._last_save_saver = _copy_saver_with_new_var_list(
-            old_saver=self._last_save_saver, new_var_list=named_variables)
-      else:
-        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
-      self._last_save_object_graph = graph_proto
-    with ops.device("/cpu:0"):
-      save_path = self._last_save_saver.save(
-          sess=_SessionWithFeedDictAdditions(
-              session=session, feed_additions=feed_additions),
-          save_path=file_prefix,
-          write_meta_graph=False,
-          global_step=checkpoint_number)
-    return save_path
-
-  def _global_variable_names(self):
-    """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s."""
-    named_saveables, graph_proto = _serialize_object_graph(
-        self._root_checkpointable)
-    saver_names = {}
-    for object_proto in graph_proto.nodes:
-      for attribute_proto in object_proto.attributes:
-        saver_names[attribute_proto.full_name] = named_saveables[
-            attribute_proto.checkpoint_key]
-    return saver_names
-
-  def restore(self, save_path):
-    """Restore a training checkpoint.
-
-    Restores `root_checkpointable` and any objects that it tracks
-    (transitive). Either assigns values immediately if variables to restore have
-    been created already, or defers restoration until the variables are
-    created. Dependencies added to the `root_checkpointable` passed to the
-    constructor after this call will be matched if they have a corresponding
-    object in the checkpoint.
-
-    When building a graph, restorations are added to the graph but not run.
-
-    To disallow deferred loading, assert immediately that all checkpointed
-    variables have been matched to variable objects:
-
-    ```python
-    saver = Saver(root)
-    saver.restore(path).assert_consumed()
-    ```
-
-    An exception will be raised unless every object was matched and its
-    variables already exist.
-
-    When graph building, `assert_consumed()` indicates that all of the restore
-    ops which will be created for this checkpoint have been created. They can be
-    run via the `run_restore_ops()` function of the status object:
-
-    ```python
-    saver.restore(path).assert_consumed().run_restore_ops()
-    ```
-
-    If the checkpoint has not been consumed completely, then the list of restore
-    ops will grow as more objects are added to the dependency graph.
-
-    Name-based `tf.train.Saver` checkpoints can be loaded using this
-    method. There is no deferred loading, and names are used to match
-    variables. No restore ops are created/run until `run_restore_ops()` or
-    `initialize_or_restore()` are called on the returned status object, even
-    when executing eagerly. Re-encode name-based checkpoints using this
-    object-based `Saver.save` as soon as possible.
-
-    Args:
-      save_path: The path to the checkpoint, as returned by `save` or
-        `tf.train.latest_checkpoint`. If None (as when there is no latest
-        checkpoint for `tf.train.latest_checkpoint` to return), returns an
-        object which may run initializers for objects in the dependency
-        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
-        names are used to match variables.
-
-    Returns:
-      A load status object, which can be used to make assertions about the
-      status of checkpoint restoration and run initialization/restore ops
-      (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if
-      `save_path` is `None`).
-
-      If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus`
-      object is returned which runs restore ops from a name-based saver.
-    """
-    if save_path is None:
-      return InitializationOnlyStatus(self._root_checkpointable)
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
-      file_prefix_tensor = self._file_prefix_placeholder
-      file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
-    else:
-      with ops.device("/cpu:0"):
-        file_prefix_tensor = constant_op.constant(save_path)
-      file_prefix_feed_dict = None
-    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-    try:
-      object_graph_string = reader.get_tensor(_OBJECT_GRAPH_PROTO_KEY)
-    except errors_impl.NotFoundError:
-      # The object graph proto does not exist in this checkpoint. Try again with
-      # name-based saving.
-      return NameBasedSaverStatus(self, save_path)
-
-    object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-    object_graph_proto.ParseFromString(object_graph_string)
-    if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
-      checkpoint = self._last_restore_checkpoint
-    else:
-      if in_graph_mode:
-        dtype_map = None
-      else:
-        dtype_map = reader.get_variable_to_dtype_map()
-      checkpoint = core_checkpointable_utils._Checkpoint(  # pylint: disable=protected-access
-          object_graph_proto=object_graph_proto,
-          save_path=file_prefix_tensor,
-          dtype_map=dtype_map)
-      if in_graph_mode:
-        if self._last_restore_object_graph is not None:
-          raise NotImplementedError(
-              "Using a single Saver to restore different object graphs is not "
-              "currently supported when graph building. Use a different Saver "
-              "for each object graph (restore ops will be duplicated), or "
-              "file a feature request if this limitation bothers you.")
-        self._last_restore_checkpoint = checkpoint
-        self._last_restore_object_graph = object_graph_proto
-    core_checkpointable._CheckpointPosition(  # pylint: disable=protected-access
-        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
-    load_status = CheckpointLoadStatus(
-        checkpoint, feed_dict=file_prefix_feed_dict)
-    return load_status
-
-
-class Checkpoint(core_checkpointable.Checkpointable):
-  """A utility class which groups `Checkpointable` objects.
-
-  Accepts arbitrary keyword arguments to its constructor and saves those values
-  with a checkpoint. Maintains a `save_counter` for numbering checkpoints.
-
-  Example usage:
-
-  ```python
-  import tensorflow as tf
-  import tensorflow.contrib.eager as tfe
-  import os
-
-  checkpoint_directory = "/tmp/training_checkpoints"
-  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-
-  root = tfe.Checkpoint(optimizer=optimizer, model=model)
-  root.restore(tf.train.latest_checkpoint(checkpoint_directory))
-  for _ in range(num_training_steps):
-    optimizer.minimize( ... )
-  root.save(file_prefix=checkpoint_prefix)
-  ```
-
-  For more manual control over saving, use `tfe.CheckpointableSaver` directly.
-
-  Attributes:
-    save_counter: Incremented when `save()` is called. Used to number
-      checkpoints.
-  """
-
-  def __init__(self, **kwargs):
-    """Group objects into a training checkpoint.
-
-    Args:
-      **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Attribute values must derive from
-        `CheckpointableBase`.
-    Raises:
-      ValueError: If objects in `kwargs` are not Checkpointable.
-    """
-    super(Checkpoint, self).__init__()
-    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, core_checkpointable.CheckpointableBase):
-        raise ValueError(
-            ("`Checkpoint` was expecting an object derived from "
-             "`CheckpointableBase`, got %s.") % (v,))
-      setattr(self, k, v)
-    self._save_counter = None  # Created lazily for restore-on-create.
-    self._saver = CheckpointableSaver(weakref.ref(self))
-
-  def _maybe_create_save_counter(self):
-    """Create a save counter if it does not yet exist."""
-    if self._save_counter is None:
-      # Initialized to 0 and incremented before saving.
-      with ops.device("/cpu:0"):
-        self._save_counter = add_variable(
-            self, name="save_counter", initializer=0, dtype=dtypes.int64)
-
-  @property
-  def save_counter(self):
-    """An integer variable which starts at zero and is incremented on save.
-
-    Used to number checkpoints.
-
-    Returns:
-      The save counter variable.
-    """
-    self._maybe_create_save_counter()
-    return self._save_counter
-
-  def save(self, file_prefix, session=None):
-    """Save a checkpoint. Wraps `tfe.CheckpointableSaver.save`."""
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
-      if session is None:
-        session = ops.get_default_session()
-      if self._save_counter is None:
-        # When graph building, if this is a new save counter variable then it
-        # needs to be initialized before assign_add. This is only an issue if
-        # restore() has not been called first.
-        session.run(self.save_counter.initializer)
-    with ops.colocate_with(self.save_counter):
-      assign_op = self.save_counter.assign_add(1)
-    if in_graph_mode:
-      session.run(assign_op)
-    return self._saver.save(
-        file_prefix=file_prefix,
-        checkpoint_number=self.save_counter,
-        session=session)
-
-  def restore(self, save_path):
-    """Restore a checkpoint. Wraps `tfe.CheckpointableSaver.restore`."""
-    status = self._saver.restore(save_path=save_path)
-    # Create the save counter now so it gets initialized with other variables
-    # when graph building. Creating it earlier would lead to double
-    # initialization when executing eagerly.
-    self._maybe_create_save_counter()
-    return status
 
 
 class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index b344d50e7f..da04199aaa 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -16,59 +16,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
 import os
 
-import six
-
-from tensorflow.contrib.eager.python import checkpointable_utils
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
-from tensorflow.python.eager import function
+from tensorflow.contrib.eager.python import checkpointable_utils as contrib_checkpointable_utils
 from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl.keras.engine import sequential
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import template
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import adam
 from tensorflow.python.training import checkpointable
-from tensorflow.python.training import saver as core_saver
-from tensorflow.python.training import training_util
-
-
-class NonLayerCheckpointable(checkpointable.Checkpointable):
-
-  def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
-        self, name="a_variable", shape=[])
-
-
-# pylint: disable=not-callable
-class MyModel(training.Model):
-  """A concrete Model for testing."""
-
-  def __init__(self):
-    super(MyModel, self).__init__()
-    self._named_dense = core.Dense(1, use_bias=True)
-    self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
-
-  def call(self, values):
-    ret = self._second(self._named_dense(values))
-    return ret
+from tensorflow.python.training import checkpointable_utils
 
 
 def _split_variable_closure(variable):
@@ -91,7 +47,7 @@ class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
-    split_dependencies = checkpointable_utils.split_dependency(
+    split_dependencies = contrib_checkpointable_utils.split_dependency(
         component_names=("first_half", "second_half"),
         component_dtypes=(self.combined.dtype,) * 2,
         fill_save_buffer_fn=_split_variable_closure(
@@ -152,1239 +108,5 @@ class SplitTests(test.TestCase):
         self.evaluate(restore_checkpoint.dep.combined))
 
 
-class InterfaceTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testAddVariable(self):
-    obj = NonLayerCheckpointable()
-    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
-      checkpointable_utils.add_variable(
-          obj, name="shape_specified_twice", shape=[], initializer=1)
-    constant_initializer = checkpointable_utils.add_variable(
-        obj, name="constant_initializer", initializer=1)
-    with variable_scope.variable_scope("some_variable_scope"):
-      ones_initializer = checkpointable_utils.add_variable(
-          obj,
-          name="ones_initializer",
-          shape=[2],
-          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
-    bare_initializer = checkpointable_utils.add_variable(
-        obj,
-        name="bare_initializer",
-        shape=[2, 2],
-        dtype=dtypes.float64,
-        initializer=init_ops.zeros_initializer)
-
-    # Even in graph mode, there are no naming conflicts between objects, only
-    # naming conflicts within an object.
-    other_duplicate = resource_variable_ops.ResourceVariable(
-        name="duplicate", initial_value=1.)
-    duplicate = checkpointable_utils.add_variable(
-        obj, name="duplicate", shape=[])
-    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
-      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
-
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    self.assertEqual("constant_initializer:0", constant_initializer.name)
-    self.assertEqual(1, self.evaluate(constant_initializer))
-    self.assertEqual("some_variable_scope/ones_initializer:0",
-                     ones_initializer.name)
-    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
-    self.assertAllEqual([[0., 0.],
-                         [0., 0.]], self.evaluate(bare_initializer))
-    self.assertEqual("a_variable:0", obj.a_variable.name)
-    self.assertEqual("duplicate:0", other_duplicate.name)
-    if context.executing_eagerly():
-      # When executing eagerly, there's no uniquification of variable names. The
-      # checkpoint name will be the same.
-      self.assertEqual("duplicate:0", duplicate.name)
-    else:
-      # The .name attribute may be globally influenced, but the checkpoint name
-      # won't be (tested below).
-      self.assertEqual("duplicate_1:0", duplicate.name)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
-    expected_checkpoint_names = (
-        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
-        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
-        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-    )
-    six.assertCountEqual(
-        self, expected_checkpoint_names, named_variables.keys())
-
-  def testInitNotCalled(self):
-
-    class NoInit(checkpointable.Checkpointable):
-
-      def __init__(self):
-        pass
-
-    # __init__ for Checkpointable will be called implicitly.
-    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
-
-  def testShapeDtype(self):
-    root = checkpointable.Checkpointable()
-    v1 = checkpointable_utils.add_variable(
-        root, name="v1", initializer=3., dtype=dtypes.float64)
-    self.assertEqual(dtypes.float64, v1.dtype)
-    v2 = checkpointable_utils.add_variable(
-        root,
-        name="v2",
-        shape=[3],
-        initializer=init_ops.ones_initializer,
-        dtype=dtypes.float64)
-    self.assertEqual(dtypes.float64, v2.dtype)
-    self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
-
-
-class _MirroringSaveable(core_saver.BaseSaverBuilder.SaveableObject):
-
-  def __init__(self, primary_variable, mirrored_variable, name):
-    self._primary_variable = primary_variable
-    self._mirrored_variable = mirrored_variable
-    tensor = self._primary_variable.read_value()
-    spec = core_saver.BaseSaverBuilder.SaveSpec(
-        tensor=tensor,
-        slice_spec="",
-        name=name)
-    super(_MirroringSaveable, self).__init__(
-        tensor, [spec], name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    """Restore the same value into both variables."""
-    tensor, = restored_tensors
-    return control_flow_ops.group(
-        self._primary_variable.assign(tensor),
-        self._mirrored_variable.assign(tensor))
-
-
-class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
-  """A Checkpointable object which returns a more complex SaveableObject."""
-
-  def __init__(self):
-    self.non_dep_variable = variable_scope.get_variable(
-        name="non_dep_variable", initializer=6., use_resource=True)
-    self.mirrored = variable_scope.get_variable(
-        name="mirrored", initializer=15., use_resource=True)
-
-  def _gather_saveables_for_checkpoint(self):
-    def _saveable_factory(name=self.non_dep_variable.name):
-      return _MirroringSaveable(
-          primary_variable=self.non_dep_variable,
-          mirrored_variable=self.mirrored,
-          name=name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
-
-  # The Saver sorts by name before parsing, so we need a name property.
-  @property
-  def name(self):
-    return self.non_dep_variable.name
-
-
-class CheckpointingTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testNamingWithOptimizer(self):
-    input_value = constant_op.constant([[3.]])
-    model = MyModel()
-    # A nuisance Model using the same optimizer. Its slot variables should not
-    # go in the checkpoint, since it is never depended on.
-    other_model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value),
-          global_step=optimizer_step)
-      optimizer.minimize(
-          lambda: other_model(input_value),
-          global_step=optimizer_step)
-    else:
-      train_op = optimizer.minimize(
-          model(input_value), global_step=optimizer_step)
-      optimizer.minimize(
-          other_model(input_value),
-          global_step=optimizer_step)
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
-    named_variables, serialized_graph = (
-        checkpointable_utils._serialize_object_graph(root_checkpointable))
-    expected_checkpoint_names = (
-        # Created in the root node, so no prefix.
-        "optimizer_step",
-        "model/_second/kernel",
-        "model/_named_dense/kernel",
-        "model/_named_dense/bias",
-        # non-Layer dependency of the model
-        "model/_non_layer/a_variable",
-        # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
-        # Slot variables
-        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
-        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
-        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
-        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
-    )
-    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
-    expected_checkpoint_names = [
-        name + suffix for name in expected_checkpoint_names]
-    six.assertCountEqual(self, expected_checkpoint_names,
-                         named_variables.keys())
-    # Check that we've mapped to the right variable objects (not exhaustive)
-    self.assertEqual(
-        "global_step:0",
-        named_variables["optimizer_step" + suffix].name)
-    self.assertEqual(
-        "my_model/dense_1/kernel:0",
-        named_variables["model/_second/kernel" + suffix].name)
-    self.assertEqual(
-        "my_model/dense/kernel:0",
-        named_variables["model/_named_dense/kernel" + suffix].name)
-    self.assertEqual(
-        "beta1_power:0",
-        named_variables["optimizer/beta1_power" + suffix].name)
-    self.assertEqual(
-        "beta2_power:0",
-        named_variables["optimizer/beta2_power" + suffix].name)
-    # Spot check the generated protocol buffers.
-    self.assertEqual("optimizer",
-                     serialized_graph.nodes[0].children[1].local_name)
-    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
-        1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .original_variable_node_id]
-        .attributes[0].full_name)
-    # We strip off the :0 suffix, as variable.name-based saving does.
-    self.assertEqual(
-        "my_model/dense/kernel/Adam",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .slot_variable_node_id]
-        .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel/Adam:0",
-        optimizer.get_slot(
-            var=named_variables["model/_named_dense/kernel" + suffix],
-            name="m").name)
-    self.assertEqual(
-        "model/_named_dense/kernel" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .original_variable_node_id].attributes[0].checkpoint_key)
-    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
-    self.assertEqual(
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .slot_variable_node_id].attributes[0].checkpoint_key)
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testMoreComplexSaveableReturned(self):
-    v = _OwnsMirroredVariables()
-    checkpoint = checkpointable_utils.Checkpoint(v=v)
-    test_dir = self.get_temp_dir()
-    prefix = os.path.join(test_dir, "ckpt")
-    self.evaluate(v.non_dep_variable.assign(42.))
-    save_path = checkpoint.save(prefix)
-    self.evaluate(v.non_dep_variable.assign(43.))
-    self.evaluate(v.mirrored.assign(44.))
-    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
-    self.assertEqual(42., self.evaluate(v.non_dep_variable))
-    self.assertEqual(42., self.evaluate(v.mirrored))
-    self.evaluate(v.non_dep_variable.assign(44.))
-    save_path = checkpoint.save(prefix)
-    self.evaluate(v.non_dep_variable.assign(45.))
-    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
-    self.assertEqual(44., self.evaluate(v.non_dep_variable))
-    self.assertEqual(44., self.evaluate(v.mirrored))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testMoreComplexSaveableReturnedWithGlobalName(self):
-    # The same object can also be saved using the name-based saver.
-    v = _OwnsMirroredVariables()
-    saver = core_saver.Saver(var_list=[v])
-    test_dir = self.get_temp_dir()
-    prefix = os.path.join(test_dir, "ckpt")
-    self.evaluate(v.non_dep_variable.assign(42.))
-    with self.test_session() as sess:
-      save_path = saver.save(sess, prefix)
-      self.evaluate(v.non_dep_variable.assign(43.))
-      self.evaluate(v.mirrored.assign(44.))
-      saver.restore(sess, save_path)
-      self.assertEqual(42., self.evaluate(v.non_dep_variable))
-      self.assertEqual(42., self.evaluate(v.mirrored))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testSaveRestore(self):
-    model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model)
-    input_value = constant_op.constant([[3.]])
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value))
-    else:
-      train_op = optimizer.minimize(model(input_value))
-      # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
-    prefix = os.path.join(self.get_temp_dir(), "ckpt")
-    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
-    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
-    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
-    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
-    optimizer_variables = self.evaluate(optimizer.variables())
-    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
-    # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
-    status.run_restore_ops()
-    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
-    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
-    if not context.executing_eagerly():
-      return  # Restore-on-create is only supported when executing eagerly
-    on_create_model = MyModel()
-    on_create_optimizer = adam.AdamOptimizer(
-        0.001,
-        # Preserve beta1_power and beta2_power when appying gradients so we can
-        # test that they've been restored correctly.
-        beta1=1.0, beta2=1.0)
-    on_create_root = checkpointable_utils.Checkpoint(
-        optimizer=on_create_optimizer, model=on_create_model)
-    # Deferred restoration
-    status = on_create_root.restore(save_path=save_path)
-    on_create_model(constant_op.constant([[3.]]))  # create variables
-    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
-    self.assertAllEqual([42.],
-                        self.evaluate(
-                            on_create_model._named_dense.variables[1]))
-    on_create_m_bias_slot = on_create_optimizer.get_slot(
-        on_create_model._named_dense.variables[1], "m")
-    # Optimizer slot variables are created when the original variable is
-    # restored.
-    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
-    self.assertAllEqual(optimizer_variables[2:],
-                        self.evaluate(on_create_optimizer.variables()))
-    dummy_var = resource_variable_ops.ResourceVariable([1.])
-    on_create_optimizer.minimize(loss=dummy_var.read_value)
-    status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
-
-  # TODO(allenl): Debug garbage created by this test in python3.
-  def testDeferredRestorationUsageEager(self):
-    """An idiomatic eager execution example."""
-    num_training_steps = 10
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    for training_continuation in range(3):
-      model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          optimizer_step=training_util.get_or_create_global_step())
-      root.restore(core_saver.latest_checkpoint(checkpoint_directory))
-      for _ in range(num_training_steps):
-        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
-        input_value = constant_op.constant([[3.]])
-        optimizer.minimize(
-            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
-            global_step=root.optimizer_step)
-      root.save(file_prefix=checkpoint_prefix)
-      self.assertEqual((training_continuation + 1) * num_training_steps,
-                       root.optimizer_step.numpy())
-
-  def testUsageGraph(self):
-    """Expected usage when graph building."""
-    with context.graph_mode():
-      num_training_steps = 10
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      for training_continuation in range(3):
-        with ops.Graph().as_default():
-          model = MyModel()
-          optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
-              optimizer=optimizer, model=model,
-              global_step=training_util.get_or_create_global_step())
-          input_value = constant_op.constant([[3.]])
-          train_op = optimizer.minimize(
-              model(input_value),
-              global_step=root.global_step)
-          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
-          with self.test_session(graph=ops.get_default_graph()) as session:
-            status = root.restore(save_path=checkpoint_path)
-            status.initialize_or_restore(session=session)
-            if checkpoint_path is None:
-              self.assertEqual(0, training_continuation)
-              with self.assertRaises(AssertionError):
-                status.assert_consumed()
-            else:
-              status.assert_consumed()
-            for _ in range(num_training_steps):
-              session.run(train_op)
-            root.save(file_prefix=checkpoint_prefix, session=session)
-            self.assertEqual((training_continuation + 1) * num_training_steps,
-                             session.run(root.global_step))
-            self.assertEqual(training_continuation + 1,
-                             session.run(root.save_counter))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testAgnosticUsage(self):
-    """Graph/eager agnostic usage."""
-    # Does create garbage when executing eagerly due to ops.Graph() creation.
-    num_training_steps = 10
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    for training_continuation in range(3):
-      with ops.Graph().as_default(), self.test_session(
-          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
-        model = MyModel()
-        optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
-        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
-        status = root.restore(save_path=checkpoint_path)
-        input_value = constant_op.constant([[3.]])
-        train_fn = functools.partial(
-            optimizer.minimize,
-            functools.partial(model, input_value),
-            global_step=root.global_step)
-        if not context.executing_eagerly():
-          train_fn = functools.partial(self.evaluate, train_fn())
-        status.initialize_or_restore()
-        for _ in range(num_training_steps):
-          train_fn()
-        root.save(file_prefix=checkpoint_prefix)
-        self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
-        self.assertEqual(training_continuation + 1,
-                         self.evaluate(root.save_counter))
-
-  # pylint: disable=cell-var-from-loop
-  @test_util.run_in_graph_and_eager_modes()
-  def testWithDefun(self):
-    num_training_steps = 2
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    for training_continuation in range(3):
-      with ops.Graph().as_default(), self.test_session(
-          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
-        model = MyModel()
-        # Don't actually train so we can test variable values
-        optimizer = adam.AdamOptimizer(0.)
-        root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
-        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
-        status = root.restore(save_path=checkpoint_path)
-        def train_fn():
-          @function.defun
-          def _call_model(x):
-            return model(x)
-          with backprop.GradientTape() as tape:
-            loss = _call_model(constant_op.constant([[3.]]))
-          gradients = tape.gradient(loss, model.variables)
-          return optimizer.apply_gradients(zip(gradients, model.variables),
-                                           global_step=root.global_step)
-        if not context.executing_eagerly():
-          train_fn = functools.partial(
-              self.evaluate, train_fn())
-        status.initialize_or_restore()
-        for _ in range(num_training_steps):
-          train_fn()
-        if training_continuation > 0:
-          status.assert_consumed()
-          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
-        else:
-          self.evaluate(model.variables[0].assign([[42.]]))
-        root.save(file_prefix=checkpoint_prefix)
-        self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
-        self.assertEqual(training_continuation + 1,
-                         self.evaluate(root.save_counter))
-  # pylint: enable=cell-var-from-loop
-
-  def _get_checkpoint_name(self, name):
-    root = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    checkpoint_name, = named_variables.keys()
-    with ops.name_scope("root/" + checkpoint_name):
-      pass  # Make sure we can use this as an op name if we prefix it.
-    return checkpoint_name
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testVariableNameEscaping(self):
-    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
-    self.assertEqual(r"a.Sb.Sc" + suffix, self._get_checkpoint_name(r"a/b/c"))
-    self.assertEqual(r"b" + suffix, self._get_checkpoint_name(r"b"))
-    self.assertEqual(r"c.S" + suffix, self._get_checkpoint_name(r"c/"))
-    self.assertEqual(r"d.S..S" + suffix, self._get_checkpoint_name(r"d/.S"))
-    self.assertEqual(r"d.S..ATTRIBUTES.Sf" + suffix,
-                     self._get_checkpoint_name(r"d/.ATTRIBUTES/f"))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testNumberedPath(self):
-    root = checkpointable.Checkpointable()
-    leaf = checkpointable.Checkpointable()
-    root.leaf = leaf
-    checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    variable_name, = named_variables.keys()
-    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLocalNameValidation(self):
-    root = checkpointable.Checkpointable()
-    leaf = checkpointable.Checkpointable()
-    # Dots are escaped, which avoids conflicts with reserved names.
-    root._track_checkpointable(leaf, name=".ATTRIBUTES")
-    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    name, = named_variables.keys()
-    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
-
-  def testAnonymousVarsInInit(self):
-
-    class Model(training.Model):
-
-      def __init__(self):
-        super(Model, self).__init__()
-        self.w = resource_variable_ops.ResourceVariable(0.0)
-        self.b = resource_variable_ops.ResourceVariable(0.0)
-        self.vars = [self.w, self.b]
-
-      def call(self, x):
-        return x * self.w + self.b
-
-    with context.eager_mode():
-      model = Model()
-      optimizer = adam.AdamOptimizer(learning_rate=0.05)
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
-          model=model, optimizer=optimizer)
-      for _ in range(2):
-        checkpoint.save(checkpoint_prefix)
-        with backprop.GradientTape() as tape:
-          loss = (constant_op.constant(1.)
-                  - model(constant_op.constant(1.))) ** 2
-        grad = tape.gradient(loss, model.vars)
-        optimizer.apply_gradients(
-            [(g, v) for g, v in zip(grad, model.vars)])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLateDependencyTracking(self):
-
-    class Dependency(checkpointable.Checkpointable):
-
-      def build(self):
-        self.var = checkpointable_utils.add_variable(
-            self, "var", initializer=0.)
-
-    class LateDependencies(checkpointable.Checkpointable):
-
-      def add_dep(self):
-        self.dep = Dependency()
-        self.dep.build()
-
-    original = LateDependencies()
-    original.add_dep()
-    self.evaluate(state_ops.assign(original.dep.var, 123.))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(
-        original).save(checkpoint_prefix)
-    load_into = LateDependencies()
-    status = checkpointable_utils.CheckpointableSaver(
-        load_into).restore(save_path)
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
-    load_into.add_dep()
-    status.assert_consumed()
-    status.run_restore_ops()
-    self.assertEqual(123., self.evaluate(load_into.dep.var))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDepAfterVar(self):
-
-    class Dependency(checkpointable.Checkpointable):
-
-      def build(self):
-        self.var = checkpointable_utils.add_variable(
-            self, "var", initializer=0.)
-
-    class DepAfterVar(checkpointable.Checkpointable):
-
-      def add_dep(self):
-        dep = Dependency()
-        dep.build()
-        self.dep = dep
-
-    dep_after_var = DepAfterVar()
-    dep_after_var.add_dep()
-    self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
-        checkpoint_prefix)
-
-    loaded_dep_after_var = DepAfterVar()
-    status = checkpointable_utils.CheckpointableSaver(
-        loaded_dep_after_var).restore(save_path)
-    loaded_dep_after_var.add_dep()
-    status.assert_consumed()
-    status.run_restore_ops()
-    self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDeferredSlotRestoration(self):
-    checkpoint_directory = self.get_temp_dir()
-
-    root = checkpointable.Checkpointable()
-    root.var = checkpointable_utils.add_variable(
-        root, name="var", initializer=0.)
-    optimizer = adam.AdamOptimizer(0.1)
-    if context.executing_eagerly():
-      optimizer.minimize(root.var.read_value)
-    else:
-      train_op = optimizer.minimize(root.var)
-      # Note that `optimizer` has not been added as a dependency of
-      # `root`. Create a one-off grouping so that slot variables for `root.var`
-      # get initialized too.
-      self.evaluate(checkpointable_utils.gather_initializers(
-          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
-      self.evaluate(train_op)
-    self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
-    root.optimizer = optimizer
-    self.evaluate(state_ops.assign(root.var, 13.))
-    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
-                                   14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = checkpointable.Checkpointable()
-    # Load the slot-containing checkpoint (deferred), then immediately overwrite
-    # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
-    with self.assertRaises(AssertionError):
-      no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
-        new_root, name="var", shape=[])
-    no_slot_status.assert_consumed()
-    no_slot_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(new_root.var))
-    new_root.optimizer = adam.AdamOptimizer(0.1)
-    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
-      slot_status.assert_consumed()
-    self.assertEqual(12., self.evaluate(new_root.var))
-    if context.executing_eagerly():
-      # Slot variables are only created with restoring initializers when
-      # executing eagerly.
-      self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-    else:
-      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
-                    None)
-    if context.executing_eagerly():
-      new_root.optimizer.minimize(new_root.var.read_value)
-    else:
-      train_op = new_root.optimizer.minimize(new_root.var)
-      # The slot variable now exists; restore() didn't create it, but we should
-      # now have a restore op for it.
-      slot_status.run_restore_ops()
-      self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-      self.evaluate(train_op)
-    slot_status.assert_consumed()
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testOverlappingRestores(self):
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep = checkpointable.Checkpointable()
-    save_root.dep.var = checkpointable_utils.add_variable(
-        save_root.dep, name="var", initializer=0.)
-    self.evaluate(state_ops.assign(save_root.dep.var, 12.))
-    saver = checkpointable_utils.CheckpointableSaver(save_root)
-    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
-    self.evaluate(state_ops.assign(save_root.dep.var, 13.))
-    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
-
-    first_root = checkpointable.Checkpointable()
-    second_root = checkpointable.Checkpointable()
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    load_dep = checkpointable.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
-        load_dep, name="var", shape=[])
-    first_root.dep = load_dep
-    first_status.assert_consumed()
-    first_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(load_dep.var))
-    second_root.dep = load_dep
-    second_status.assert_consumed()
-    second_status.run_restore_ops()
-    self.assertEqual(13., self.evaluate(load_dep.var))
-
-    # Try again with the order of the restore() reversed. The last restore
-    # determines the final value.
-    first_root = checkpointable.Checkpointable()
-    second_root = checkpointable.Checkpointable()
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    load_dep = checkpointable.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
-        load_dep, name="var", shape=[])
-    first_root.dep = load_dep
-    first_status.assert_consumed()
-    first_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(load_dep.var))
-    second_root.dep = load_dep
-    second_status.assert_consumed()
-    second_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(load_dep.var))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testAmbiguousLoad(self):
-    # Not OK to split one checkpoint object into two
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep_one = checkpointable.Checkpointable()
-    save_root.dep_two = checkpointable.Checkpointable()
-    dep_three = checkpointable.Checkpointable()
-    save_root.dep_one.dep_three = dep_three
-    save_root.dep_two.dep_three = dep_three
-    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = checkpointable.Checkpointable()
-    checkpointable_utils.CheckpointableSaver(load_root).restore(save_path)
-    load_root.dep_one = checkpointable.Checkpointable()
-    load_root.dep_two = checkpointable.Checkpointable()
-    load_root.dep_one.dep_three = checkpointable.Checkpointable()
-    with self.assertRaisesRegexp(AssertionError,
-                                 "resolved to different objects"):
-      load_root.dep_two.dep_three = checkpointable.Checkpointable()
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testObjectsCombined(self):
-    # Currently fine to load two checkpoint objects into one Python object
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep_one = checkpointable.Checkpointable()
-    save_root.dep_two = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
-    checkpointable_utils.add_variable(
-        save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = checkpointable.Checkpointable()
-    load_root.dep_one = checkpointable.Checkpointable()
-    load_root.dep_two = load_root.dep_one
-    v1 = checkpointable_utils.add_variable(
-        load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
-    v2 = checkpointable_utils.add_variable(
-        load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
-        save_path).assert_consumed()
-    status.run_restore_ops()
-    self.assertEqual(32., self.evaluate(v1))
-    self.assertEqual(64., self.evaluate(v2))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDependencyLoop(self):
-    # Note: this test creates garbage during eager execution because it
-    # purposefully creates a reference cycle.
-    first = checkpointable.Checkpointable()
-    second = checkpointable.Checkpointable()
-    first.second = second
-    second.first = first
-    first.v = checkpointable_utils.add_variable(
-        first, "v1", initializer=[3., 1., 4.])
-    second.v = checkpointable_utils.add_variable(
-        second, "v2", initializer=[1., 1., 2., 3.])
-    self.evaluate(checkpointable_utils.gather_initializers(first))
-    checkpoint_directory = self.get_temp_dir()
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-
-    # Test deferred loading
-    first_load = checkpointable.Checkpointable()
-    status = checkpointable_utils.CheckpointableSaver(
-        first_load).restore(save_path)
-    second_load = checkpointable.Checkpointable()
-    first_load.second = second_load
-    second_load.first = first_load
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
-    first_load.v = checkpointable_utils.add_variable(
-        first_load, "v1", shape=[3])
-    second_load.v = checkpointable_utils.add_variable(
-        second_load, "v2", shape=[4])
-    status.assert_consumed()
-    status.run_restore_ops()
-    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
-    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
-
-    # Test loading when variables have already been created
-    self.evaluate(first_load.v.assign([2., 7., 1.]))
-    self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
-    self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
-    self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
-    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
-        save_path).assert_consumed()
-    status.run_restore_ops()
-    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
-    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testRestoreOnAssign(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_graph = ops.Graph()
-    with save_graph.as_default(), self.test_session(save_graph):
-      first = checkpointable.Checkpointable()
-      first.var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      first.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      self.evaluate(first.var1.assign(4.))
-      self.evaluate(first.var2.assign(8.))
-      save_path = checkpointable_utils.CheckpointableSaver(first).save(
-          checkpoint_prefix)
-    restore_graph = ops.Graph()
-    with restore_graph.as_default(), self.test_session(restore_graph):
-      second = checkpointable.Checkpointable()
-      second.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      status = checkpointable_utils.CheckpointableSaver(
-          second).restore(save_path)
-      recreated_var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      status.run_restore_ops()
-      self.assertEqual(8., self.evaluate(second.var2))
-      self.evaluate(recreated_var1.assign(-2.))
-      self.assertEqual(-2., self.evaluate(recreated_var1))
-      second.var1 = recreated_var1
-      status.run_restore_ops()
-      self.assertEqual(4., self.evaluate(recreated_var1))
-
-  def testManySavesGraph(self):
-    """Saves after the first should not modify the graph."""
-    with context.graph_mode():
-      graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
-        checkpoint_directory = self.get_temp_dir()
-        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = checkpointable.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
-        before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testCheckpointCleanup(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = checkpointable.Checkpointable()
-    obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    saver = checkpointable_utils.Checkpoint(obj=obj)
-    for _ in range(10):
-      saver.save(checkpoint_prefix)
-    expected_filenames = ["checkpoint"]
-    for checkpoint_number in range(6, 11):
-      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
-      expected_filenames.append(
-          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
-    six.assertCountEqual(
-        self,
-        expected_filenames,
-        os.listdir(checkpoint_directory))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testCheckpointCleanupChangingVarList(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = checkpointable.Checkpointable()
-    obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
-    looped_variables = []
-    for iteration in range(10):
-      new_variable = resource_variable_ops.ResourceVariable(iteration)
-      self.evaluate(new_variable.initializer)
-      setattr(checkpoint, "var_%d" % iteration, new_variable)
-      checkpoint.save(checkpoint_prefix)
-      looped_variables.append(new_variable)
-    expected_filenames = ["checkpoint"]
-    # We've copied the saver each time, but checkpoint management should still
-    # be consistent.
-    for checkpoint_number in range(6, 11):
-      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
-      expected_filenames.append(
-          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
-    six.assertCountEqual(
-        self,
-        expected_filenames,
-        os.listdir(checkpoint_directory))
-    for v in looped_variables:
-      self.evaluate(v.assign(314))
-    checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
-    self.assertEqual(314, self.evaluate(checkpoint.var_9))
-    self.assertEqual(314, self.evaluate(checkpoint.var_8))
-    self.assertEqual(314, self.evaluate(checkpoint.var_6))
-    self.assertEqual(5, self.evaluate(checkpoint.var_5))
-    self.assertEqual(1, self.evaluate(checkpoint.var_1))
-    self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    if context.executing_eagerly():
-      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
-      self.assertEqual(9, self.evaluate(checkpoint.var_9))
-      self.assertEqual(8, self.evaluate(checkpoint.var_8))
-      self.assertEqual(1, self.evaluate(checkpoint.var_1))
-      self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    else:
-      # Restoring into modified graphs is an error while graph building.
-      with self.assertRaises(NotImplementedError):
-        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
-
-  def testManyRestoresGraph(self):
-    """Restores after the first should not modify the graph."""
-    with context.graph_mode():
-      graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
-        checkpoint_directory = self.get_temp_dir()
-        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = checkpointable.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
-        before_ops = graph.get_operations()
-        saver.restore(save_path)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  def testMultipleGraphsNonSlotVariables(self):
-    with context.graph_mode():
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      optimizer = adam.AdamOptimizer(0.001)
-      # Construct a model in one graph
-      first_graph = ops.Graph()
-      first_session = session_lib.Session(graph=first_graph)
-      with first_graph.as_default(), first_session.as_default():
-        first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=first_variable)
-        train_op = optimizer.minimize(first_variable.read_value)
-        self.evaluate(checkpointable_utils.gather_initializers(
-            first_root_checkpointable))
-        self.evaluate(train_op)
-        self.evaluate(first_variable.assign([1.]))
-        self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m").assign([2.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(3.))
-
-      # Save and load in a second graph
-      second_graph = ops.Graph()
-      with second_graph.as_default(), session_lib.Session(graph=second_graph):
-        second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=second_variable)
-        train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
-        self.evaluate(train_op)
-        self.evaluate(second_variable.assign([4.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([5.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
-        self.evaluate(second_variable.assign([7.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([8.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-        status = second_root_checkpointable.restore(save_path)
-        status.assert_consumed().run_restore_ops()
-        self.assertAllEqual([4.], self.evaluate(second_variable))
-        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-
-      # Check that the first graph is unmolested
-      with first_graph.as_default(), first_session.as_default():
-        self.assertAllEqual([1.], self.evaluate(first_variable))
-        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(3., self.evaluate(beta1_power))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_sequential(self):
-    model = sequential.Sequential()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
-    model.add(core.Dense(4))
-    second_dense = core.Dense(5)
-    model.add(second_dense)
-    model(constant_op.constant([[1.]]))
-    checkpoint.restore(None).initialize_or_restore()
-    self.evaluate(second_dense.bias.assign(
-        constant_op.constant([1., 2., 3., 4., 5.])))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.evaluate(second_dense.bias.assign(
-        constant_op.constant([5., 6., 7., 8., 9.])))
-    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
-    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
-
-    deferred_sequential = sequential.Sequential()
-    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
-        model=deferred_sequential)
-    status = deferred_sequential_checkpoint.restore(save_path)
-    deferred_sequential.add(core.Dense(4))
-    deferred_sequential(constant_op.constant([[1.]]))
-    deferred_second_dense = core.Dense(5)
-    deferred_sequential.add(deferred_second_dense)
-    deferred_sequential(constant_op.constant([[1.]]))
-    status.run_restore_ops()
-    self.assertAllEqual([1., 2., 3., 4., 5.],
-                        self.evaluate(deferred_second_dense.bias))
-
-
-class TemplateTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_checkpointable_save_restore(self):
-
-    def _templated():
-      v = variable_scope.get_variable(
-          "v", shape=[1], initializer=init_ops.zeros_initializer())
-      v2 = variable_scope.get_variable(
-          "v2", shape=[1], initializer=init_ops.zeros_initializer())
-      return v, v + 1., v2
-
-    save_template = template.make_template("s1", _templated)
-    save_root = checkpointable_utils.Checkpoint(my_template=save_template)
-    v1_save, _, v2_save = save_template()
-    self.evaluate(v1_save.assign([12.]))
-    self.evaluate(v2_save.assign([14.]))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = save_root.save(checkpoint_prefix)
-
-    load_template = template.make_template("s2", _templated)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
-    status = load_root.restore(save_path)
-    var, var_plus_one, var2 = load_template()
-    self.assertEqual(2, len(load_template._checkpoint_dependencies))
-    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
-    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
-    status.assert_consumed().run_restore_ops()
-    self.assertAllEqual([12.], self.evaluate(var))
-    self.assertAllEqual([13.], self.evaluate(var_plus_one))
-    self.assertAllEqual([14.], self.evaluate(var2))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_checkpointable_save_restore_nested(self):
-
-    def _inner_template():
-      v = variable_scope.get_variable(
-          "v", shape=[1], initializer=init_ops.zeros_initializer())
-      return v
-
-    def _outer_template():
-      first_inner = template.make_template("i1", _inner_template)
-      second_inner = template.make_template("i2", _inner_template)
-      v1 = first_inner()
-      v2 = second_inner()
-      v3 = second_inner()
-      return (first_inner, second_inner), (v1, v2, v3)
-
-    with variable_scope.variable_scope("ignored"):
-      save_template = template.make_template("s1", _outer_template)
-      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
-      (inner_template_one, inner_template_two), _ = save_template()
-    self.evaluate(inner_template_one.variables[0].assign([20.]))
-    self.evaluate(inner_template_two.variables[0].assign([25.]))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = save_root.save(checkpoint_prefix)
-
-    load_template = template.make_template("s2", _outer_template)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
-    status = load_root.restore(save_path)
-    (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
-    outer_template_dependencies = load_root.my_template._checkpoint_dependencies
-    self.assertEqual(2, len(outer_template_dependencies))
-    self.assertEqual("i1", outer_template_dependencies[0].name)
-    self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
-    self.assertEqual("i2", outer_template_dependencies[1].name)
-    self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
-    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
-    self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
-    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
-    self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
-    status.assert_consumed().run_restore_ops()
-    self.assertAllEqual([20.], self.evaluate(v1))
-    self.assertAllEqual([25.], self.evaluate(v2))
-    self.assertAllEqual([25.], self.evaluate(v3))
-
-
-class CheckpointCompatibilityTests(test.TestCase):
-
-  def _initialized_model(self):
-    input_value = constant_op.constant([[3.]])
-    model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    train_op = optimizer.minimize(
-        functools.partial(model, input_value),
-        global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
-    self.evaluate(train_op)
-    # A regular variable, a slot variable, and a non-slot Optimizer variable
-    # with known values to check when loading.
-    self.evaluate(model._named_dense.bias.assign([1.]))
-    self.evaluate(optimizer.get_slot(
-        var=model._named_dense.bias, name="m").assign([2.]))
-    beta1_power, _ = optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
-
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
-    self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
-        .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(103.))
-
-  def _check_sentinels(self, root_checkpointable):
-    self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
-    self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.assertAllEqual(3., self.evaluate(beta1_power))
-
-  def _write_name_based_checkpoint(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    with context.graph_mode():
-      save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph) as session:
-        root = self._initialized_model()
-        name_saver = core_saver.Saver()
-        return name_saver.save(
-            sess=session, save_path=checkpoint_prefix,
-            global_step=root.optimizer_step)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLoadFromNameBasedSaver(self):
-    """Save a name-based checkpoint, load it using the object-based API."""
-    with test_util.device(use_gpu=True):
-      save_path = self._write_name_based_checkpoint()
-      root = self._initialized_model()
-      self._set_sentinels(root)
-      with self.assertRaises(AssertionError):
-        self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
-      status = object_saver.restore(save_path)
-      with self.assertRaises(AssertionError):
-        status.assert_consumed()
-      status.run_restore_ops()
-      self._check_sentinels(root)
-      self._set_sentinels(root)
-      status.initialize_or_restore()
-      self._check_sentinels(root)
-
-  # TODO(allenl): Test for the core name-based saver loading object-based
-  # checkpoints once object-based checkpointing is in core.
-
-  def testSaveGraphLoadEager(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    with context.graph_mode():
-      save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph) as session:
-        root = self._initialized_model()
-        object_saver = checkpointable_utils.CheckpointableSaver(root)
-        save_path = object_saver.save(
-            session=session, file_prefix=checkpoint_prefix)
-    with context.eager_mode():
-      root = self._initialized_model()
-      self._set_sentinels(root)
-      root.restore(save_path).assert_consumed()
-      self._check_sentinels(root)
-
-  def testSaveEagerLoadGraph(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    with context.eager_mode():
-      root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
-      save_path = object_saver.save(file_prefix=checkpoint_prefix)
-    with context.graph_mode():
-      save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph):
-        root = self._initialized_model()
-        self._set_sentinels(root)
-        root.restore(save_path).assert_consumed().run_restore_ops()
-        self._check_sentinels(root)
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index f76a896d3d..7b123707cc 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -27,7 +27,6 @@ from tensorflow.contrib import lookup
 from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
 from tensorflow.python.eager import test
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.training import checkpointable_utils
 
 
 class IteratorTest(test.TestCase):
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 9adf47d505..f825a2a736 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -33,8 +33,8 @@ import tensorflow as tf
 import tensorflow.contrib.eager as tfe
 from tensorflow.contrib.eager.python.examples.spinn import data
 from third_party.examples.eager.spinn import spinn
-from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
 from tensorflow.contrib.summary import summary_test_util
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.training import checkpoint_utils
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 28f5f286eb..f0fe4ce8c5 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import os
 import tempfile
 
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import metrics
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
@@ -31,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import training_util
 
 
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index c6f3f20e78..79dd117854 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -84,8 +84,6 @@ from __future__ import print_function
 # pylint:disable=g-bad-import-order,g-import-not-at-top,unused-import
 #
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.eager.python.checkpointable_utils import CheckpointableSaver
-from tensorflow.contrib.eager.python.checkpointable_utils import Checkpoint
 from tensorflow.contrib.eager.python.datasets import Iterator
 from tensorflow.contrib.eager.python.network import Network
 from tensorflow.contrib.eager.python.network import Sequential
@@ -123,6 +121,8 @@ from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
 from tensorflow.python.training.checkpointable import Checkpointable
+from tensorflow.python.training.checkpointable_utils import CheckpointableSaver
+from tensorflow.python.training.checkpointable_utils import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 54bc23cdef..6ade4ccd52 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -24,7 +24,6 @@ import os
 
 import six
 
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.optimizer_v2 import adam
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
@@ -42,6 +41,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 118955219b..97e0095e05 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -212,6 +212,7 @@ CORE_PROTO_SRCS = [
 # ones with individual proto_library targets.
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
+    "protobuf/checkpointable_object_graph.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/core/protobuf/checkpointable_object_graph.proto
similarity index 85%
rename from tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
rename to tensorflow/core/protobuf/checkpointable_object_graph.proto
index 024765acb2..651f692f6d 100644
--- a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/checkpointable_object_graph.proto
@@ -2,14 +2,14 @@ syntax = "proto3";
 
 option cc_enable_arenas = true;
 
-package tensorflow.contrib.eager;
+package tensorflow;
 
-// Prototype format which saves extra information about the objects which own
-// variables, allowing for more robust checkpoint loading into modified
-// programs. Currently stored in its own entry in a TensorBundle.
+// A TensorBundle addition which saves extra information about the objects which
+// own variables, allowing for more robust checkpoint loading into modified
+// programs.
 
 message CheckpointableObjectGraph {
-  message Object {
+  message CheckpointableObject {
     message ObjectReference {
       // An index into `CheckpointableObjectGraph.nodes`, indicating the object
       // being referenced.
@@ -51,5 +51,5 @@ message CheckpointableObjectGraph {
     repeated SlotVariableReference slot_variables = 3;
   }
 
-  repeated Object nodes = 1;
+  repeated CheckpointableObject nodes = 1;
 }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9707b370c0..559926d415 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2943,6 +2943,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
+        ":array_ops_gen",
         ":checkpoint_ops_gen",
         ":client",
         ":control_flow_ops",
@@ -2978,6 +2979,7 @@ py_library(
         ":variables",
         "//third_party/py/numpy",
         "@six_archive//:six",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/ops/losses",
@@ -3010,6 +3012,39 @@ py_test(
     ],
 )
 
+py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["training/checkpointable_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/74395663
+    ],
+    deps = [
+        ":checkpointable",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":framework_test_lib",
+        ":init_ops",
+        ":resource_variable_ops",
+        ":session",
+        ":state_ops",
+        ":template",
+        ":training",
+        ":training_util",
+        ":variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "distribute_test",
     size = "small",
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 32123f87ef..da99d2ec31 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -17,14 +17,48 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
+import collections
 import weakref
 
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
-from tensorflow.python.training import checkpointable
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable as checkpointable_lib
+from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.util import deprecation
 
 
-class _Checkpoint(object):
+_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. Checkpoint names for slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
+# Keyword for separating the path to an object from the name of an
+# attribute in checkpoint names. Used like:
+#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
+_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
+# Key where the object graph proto is saved in a TensorBundle
+_OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
+
+
+class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
   def __init__(self, object_graph_proto, save_path, dtype_map=None):
@@ -72,7 +106,817 @@ class _Checkpoint(object):
         # `node` refers to an `Optimizer`, since only these have slot variables.
         self.slot_restorations.setdefault(
             slot_reference.original_variable_node_id, []).append(
-                checkpointable._SlotVariableRestoration(  # pylint: disable=protected-access
+                checkpointable_lib._SlotVariableRestoration(  # pylint: disable=protected-access
                     optimizer_id=node_index,
                     slot_variable_id=slot_reference.slot_variable_node_id,
                     slot_name=slot_reference.slot_name))
+
+
+# TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
+# or consolidating the implementation with get_variable.
+def _default_getter(name, shape, dtype, initializer=None,
+                    partition_info=None, **kwargs):
+  """A pared-down version of get_variable which does not reuse variables."""
+  dtype = dtypes.as_dtype(dtype)
+  shape_object = tensor_shape.as_shape(shape)
+  with ops.init_scope():
+    if initializer is None:
+      initializer, initializing_from_value = (
+          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
+              name=name, shape=shape_object, dtype=dtype))
+    else:
+      initializing_from_value = not callable(initializer)
+    # Same logic as get_variable
+    variable_dtype = dtype.base_dtype
+    if initializing_from_value:
+      if shape is not None:
+        raise ValueError("If initializer is a constant, do not specify shape.")
+      initial_value = initializer
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      def initial_value():
+        return initializer(
+            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
+    return resource_variable_ops.ResourceVariable(
+        initial_value=initial_value,
+        name=name,
+        dtype=variable_dtype,
+        **kwargs
+    )
+
+
+def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
+                 initializer=None):
+  """Add a variable to a Checkpointable with no scope influence."""
+  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
+      name=name, shape=shape, dtype=dtype,
+      initializer=initializer, getter=_default_getter)
+
+
+def _breadth_first_checkpointable_traversal(root_checkpointable):
+  """Find shortest paths to all variables owned by dependencies of root."""
+  bfs_sorted = []
+  to_visit = collections.deque([root_checkpointable])
+  path_to_root = {root_checkpointable: ()}
+  while to_visit:
+    current_checkpointable = to_visit.popleft()
+    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    bfs_sorted.append(current_checkpointable)
+    for child_checkpointable in (
+        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
+      if child_checkpointable.ref not in path_to_root:
+        path_to_root[child_checkpointable.ref] = (
+            path_to_root[current_checkpointable] + (child_checkpointable,))
+        to_visit.append(child_checkpointable.ref)
+  return bfs_sorted, path_to_root
+
+
+def _escape_local_name(name):
+  # We need to support slashes in local names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # names.
+  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
+          .replace(r"/", _ESCAPE_CHAR + "S"))
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (_escape_local_name(checkpointable.name)
+       for checkpointable in path_to_root))
+
+
+def _slot_variable_naming_for_optimizer(optimizer_path):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+    return (variable_path
+            + optimizer_identifier
+            + _escape_local_name(slot_name))
+
+  return _name_slot_variable
+
+
+def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
+  """Gather and name slot variables."""
+  non_slot_objects = list(checkpointable_objects)
+  slot_variables = {}
+  for checkpointable in non_slot_objects:
+    if isinstance(checkpointable, optimizer_lib.Optimizer):
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer_path=object_names[checkpointable])
+      slot_names = checkpointable.get_slot_names()
+      for slot_name in slot_names:
+        for original_variable_node_id, original_variable in enumerate(
+            non_slot_objects):
+          try:
+            slot_variable = checkpointable.get_slot(
+                original_variable, slot_name)
+          except AttributeError:
+            slot_variable = None
+          if slot_variable is None:
+            continue
+          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
+            # TODO(allenl): Gather dependencies of slot variables.
+            raise NotImplementedError(
+                "Currently only variables with no dependencies can be saved as "
+                "slot variables. File a feature request if this limitation "
+                "bothers you.")
+          if slot_variable in node_ids:
+            raise NotImplementedError(
+                "A slot variable was re-used as a dependency of a "
+                "Checkpointable object. This is not currently allowed. File a "
+                "feature request if this limitation bothers you.")
+          checkpoint_name = naming_scheme(
+              variable_path=object_names[original_variable],
+              slot_name=slot_name)
+          object_names[slot_variable] = checkpoint_name
+          slot_variable_node_id = len(checkpointable_objects)
+          node_ids[slot_variable] = slot_variable_node_id
+          checkpointable_objects.append(slot_variable)
+          slot_variable_proto = (
+              checkpointable_object_graph_pb2.CheckpointableObjectGraph
+              .CheckpointableObject.SlotVariableReference(
+                  slot_name=slot_name,
+                  original_variable_node_id=original_variable_node_id,
+                  slot_variable_node_id=slot_variable_node_id))
+          slot_variables.setdefault(checkpointable, []).append(
+              slot_variable_proto)
+  return slot_variables
+
+
+def _serialize_checkpointables(
+    checkpointable_objects, node_ids, object_names, slot_variables):
+  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
+  object_graph_proto = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  named_saveables = {}
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    assert node_ids[checkpointable] == checkpoint_id
+    object_proto = object_graph_proto.nodes.add()
+    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
+    object_name = object_names[checkpointable]
+    for name, saveable_factory in (
+        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+      attribute = object_proto.attributes.add()
+      attribute.name = name
+      attribute.checkpoint_key = "%s/%s/%s" % (
+          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+      if callable(saveable_factory):
+        saveable = saveable_factory(name=attribute.checkpoint_key)
+      else:
+        saveable = saveable_factory
+      # Figure out the name-based Saver's name for this variable.
+      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          [saveable], convert_variable_to_tensor=False)
+      attribute.full_name, = saver_dict.keys()
+      named_saveables[attribute.checkpoint_key] = saveable
+
+    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
+      child_proto = object_proto.children.add()
+      child_proto.node_id = node_ids[child.ref]
+      child_proto.local_name = child.name
+
+  return named_saveables, object_graph_proto
+
+
+def _serialize_object_graph(root_checkpointable):
+  """Determine checkpoint keys for variables and build a serialized graph.
+
+  Non-slot variables are keyed based on a shortest path from the root saveable
+  to the object which owns the variable (i.e. the one which called
+  `Checkpointable._add_variable` to create it).
+
+  Slot variables are keyed based on a shortest path to the variable being
+  slotted for, a shortest path to their optimizer, and the slot name.
+
+  Args:
+    root_checkpointable: A `Checkpointable` object whose variables (including
+      the variables of dependencies, recursively) should be saved.
+
+  Returns:
+    A tuple of (named_variables, object_graph_proto):
+      named_variables: A dictionary mapping names to variable objects.
+      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
+        the serialized object graph and variable references.
+
+  Raises:
+    ValueError: If there are invalid characters in an optimizer's slot names.
+  """
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = {
+      obj: _object_prefix_from_path(path)
+      for obj, path in path_to_root.items()}
+  node_ids = {node: node_id for node_id, node
+              in enumerate(checkpointable_objects)}
+  slot_variables = _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return _serialize_checkpointables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names,
+      slot_variables=slot_variables)
+
+
+def gather_initializers(root_checkpointable):
+  """Traverse the object graph and find initialization ops.
+
+  Looks for `Checkpointable` objects which are dependencies of
+  `root_checkpointable` and which have an `initializer` property. Includes
+  initializers for slot variables only if the variable they are slotting for and
+  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  saved with a checkpoint).
+
+  Args:
+    root_checkpointable: A `Checkpointable` object to gather initializers for.
+  Returns:
+    A list of initialization ops.
+  """
+  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
+  # to run.
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = {
+      obj: _object_prefix_from_path(path)
+      for obj, path in path_to_root.items()}
+  node_ids = {node: node_id for node_id, node
+              in enumerate(checkpointable_objects)}
+  _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return [c.initializer for c in checkpointable_objects
+          if hasattr(c, "initializer") and c.initializer is not None]
+
+
+class _NoRestoreSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, tensor, name):
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(tensor, "", name)
+    super(_NoRestoreSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    return control_flow_ops.no_op()
+
+
+class _LoadStatus(object):
+  """Abstract base for load status callbacks."""
+
+  @abc.abstractmethod
+  def assert_consumed(self):
+    """Raises an exception unless a non-trivial restoration has completed."""
+    pass
+
+  @abc.abstractmethod
+  def run_restore_ops(self, session=None):
+    """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
+    pass
+
+  @abc.abstractmethod
+  def initialize_or_restore(self, session=None):
+    """Runs restore ops from the checkpoint, or initializes variables."""
+    pass
+
+
+class CheckpointLoadStatus(_LoadStatus):
+  """Checks the status of checkpoint loading and manages restore ops.
+
+  Returned from `Saver.restore`. Since `restore` may defer the loading of values
+  in the checkpoint which don't yet have corresponding Python objects,
+  `CheckpointLoadStatus` provides a callback to verify that checkpoint loading
+  is complete (`assert_consumed`).
+
+  When graph building, `restore` does not run restore ops itself since their
+  creation may be deferred. The `run_restore_ops` method must be called once all
+  Python objects with values to restore have been created and added to the
+  dependency graph (this does not necessarily have to be the whole checkpoint;
+  calling `run_restore_ops` while `assert_consumed` fails is supported and will
+  partially restore the checkpoint).
+
+  See `Saver.restore` for usage examples.
+  """
+
+  def __init__(self, checkpoint, feed_dict):
+    self._checkpoint = checkpoint
+    self._feed_dict = feed_dict
+
+  def assert_consumed(self):
+    """Asserts that all objects in the checkpoint have been created/matched.
+
+    Returns:
+      `self` for chaining.
+    Raises:
+      AssertionError: If there are any Python objects in the dependency graph
+        which have not been restored from this checkpoint or a later `restore`,
+        or if there are any checkpointed values which have not been matched to
+        Python objects.
+    """
+    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
+      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if checkpointable is None:
+        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
+      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
+        raise AssertionError(
+            "Object not assigned a value from checkpoint: %s" % (node,))
+    if self._checkpoint.slot_restorations:
+      # Sanity check; this collection should be clear if everything has been
+      # restored.
+      raise AssertionError("Unresolved slot restorations: %s" % (
+          self._checkpoint.slot_restorations,))
+    if self._checkpoint.unused_attributes:
+      raise AssertionError(
+          ("Unused attributes in these objects (the attributes exist in the "
+           "checkpoint but not in the objects): %s") % (
+               self._checkpoint.unused_attributes.items(),))
+    return self
+
+  def run_restore_ops(self, session=None):
+    """Run operations to restore objects in the dependency graph."""
+    if context.executing_eagerly():
+      return  # Run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
+
+  def initialize_or_restore(self, session=None):
+    """Alias for `run_restore_ops`.
+
+    This method has a sibling in `InitializationOnlyStatus` which instead
+    initializes variables. That type is returned if no checkpoint is specified
+    in `Saver.restore`.
+
+    Args:
+      session: The session to run restore ops in. If `None`, uses the default
+        session.
+    """
+    self.run_restore_ops(session=session)
+
+
+class InitializationOnlyStatus(_LoadStatus):
+  """Returned from `Saver.restore` when no checkpoint has been specified.
+
+  Objects of this type have the same `assert_consumed` method as
+  `CheckpointLoadStatus`, but it always fails. However,
+  `initialize_or_restore` works on objects of both types, and will
+  initialize variables in `InitializationOnlyStatus` objects or restore them
+  otherwise.
+  """
+
+  def __init__(self, root_checkpointable):
+    self._root_checkpointable = root_checkpointable
+
+  def assert_consumed(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "No checkpoint specified (save_path=None); nothing is being restored.")
+
+  def run_restore_ops(self, session=None):
+    """For consistency with `CheckpointLoadStatus`.
+
+    Use `initialize_or_restore` for initializing if no checkpoint was passed
+    to `Saver.restore` and restoring otherwise.
+
+    Args:
+      session: Not used.
+    """
+    raise AssertionError(
+        "No checkpoint specified, so no restore ops are available "
+        "(save_path=None to Saver.restore).")
+
+  def initialize_or_restore(self, session=None):
+    """Runs initialization ops for variables.
+
+    Only objects which would be saved by `Saver.save` will be initialized. See
+    `gather_initializers` for details.
+
+    This method does nothing when executing eagerly (initializers get run
+    eagerly).
+
+    Args:
+      session: The session to run initialization ops in. If `None`, uses the
+        default session.
+    """
+    if context.executing_eagerly():
+      return  # run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    session.run(gather_initializers(self._root_checkpointable))
+
+
+_DEPRECATED_RESTORE_INSTRUCTIONS = (
+    "Restoring a name-based tf.train.Saver checkpoint using the object-based "
+    "restore API. This mode uses global names to match variables, and so is "
+    "somewhat fragile. It also adds new restore ops to the graph each time it "
+    "is called. Prefer re-encoding training checkpoints in the object-based "
+    "format: run save() on the object-based saver (the same one this message "
+    "is coming from) and use that checkpoint in the future.")
+
+
+class NameBasedSaverStatus(_LoadStatus):
+  """Status for loading a name-based training checkpoint."""
+
+  def __init__(self, object_saver, save_path):
+    self._object_saver = object_saver
+    self._save_path = save_path
+
+  def assert_consumed(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "Restoring a name-based checkpoint. No load status is available.")
+
+  @deprecation.deprecated(
+      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
+  def run_restore_ops(self, session=None):
+    """Load the name-based training checkpoint using a new `tf.train.Saver`."""
+    if session is None and not context.executing_eagerly():
+      session = ops.get_default_session()
+    with ops.device("/cpu:0"):
+      saver_lib.Saver(self._object_saver._global_variable_names()).restore(  # pylint: disable=protected-access
+          sess=session, save_path=self._save_path)
+
+  def initialize_or_restore(self, session=None):
+    """Alias for `run_restore_ops`."""
+    self.run_restore_ops(session=session)
+
+
+class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
+  """Pretends to be a session, inserts extra feeds on run()."""
+
+  def __init__(self, session, feed_additions):
+    self._wrapped_session = session
+    self._feed_additions = feed_additions
+
+  def run(self, fetches, feed_dict=None, **kwargs):
+    if feed_dict is None:
+      feed_dict = {}
+    else:
+      feed_dict = feed_dict.copy()
+    feed_dict.update(self._feed_additions)
+    return self._wrapped_session.run(
+        fetches=fetches, feed_dict=feed_dict, **kwargs)
+
+
+def _copy_saver_with_new_var_list(old_saver, new_var_list):
+  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
+  new_saver = saver_lib.Saver(var_list=new_var_list)
+  # TODO(allenl): Move to copying functionality to Saver?
+  # pylint: disable=protected-access
+  new_saver._last_checkpoints = old_saver._last_checkpoints
+  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
+  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
+  # pylint: enable=protected-access
+  return new_saver
+
+
+class CheckpointableSaver(object):
+  """Saves and restores a `Checkpointable` object and its dependencies.
+
+  See `Checkpointable` for details of dependency management. `Saver` wraps
+  `tf.train.Saver` for saving, including extra information about the graph of
+  dependencies between Python objects. When restoring, it uses this information
+  about the save-time dependency graph to more robustly match objects with their
+  checkpointed values. When executing eagerly, it supports restoring variables
+  on object creation (see `Saver.restore`).
+
+  Values in a checkpoint are mapped to `Checkpointable` Python objects
+  (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
+  checkpoint was written. To avoid breaking existing checkpoints when modifying
+  a class, dependency names (the names of attributes to which `Checkpointable`
+  objects are assigned) may not change. These names are local to objects, in
+  contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
+  so allow additional program transformations.
+  """
+
+  def __init__(self, root_checkpointable):
+    """Configure saving.
+
+    Args:
+      root_checkpointable: The root of the object graph to save/restore. This
+        object and all of its dependencies are saved in the checkpoint. When
+        restoring, objects are matched and restored starting from this root.
+    """
+    # Allow passing in a weak reference to avoid reference cycles when
+    # `Checkpointable` objects save themselves.
+    self._root_checkpointable_ref = root_checkpointable
+    if not context.executing_eagerly():
+      with ops.device("/cpu:0"):
+        self._file_prefix_placeholder = constant_op.constant("model")
+    else:
+      self._file_prefix_placeholder = None
+
+    # Op caching for save
+    self._object_graph_feed_tensor = None
+    self._last_save_object_graph = None
+    self._last_save_saver = None
+
+    # Op caching for restore
+    self._last_restore_object_graph = None
+    self._last_restore_checkpoint = None
+
+  @property
+  def _root_checkpointable(self):
+    if isinstance(self._root_checkpointable_ref, weakref.ref):
+      derefed = self._root_checkpointable_ref()
+      assert derefed is not None
+      return derefed
+    else:
+      return self._root_checkpointable_ref
+
+  def save(self, file_prefix, checkpoint_number=None, session=None):
+    """Save a training checkpoint.
+
+    The saved checkpoint includes variables created by this object and any
+    Checkpointable objects it depends on at the time `Saver.save()` is called.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `checkpoint_number`, if provided.
+      checkpoint_number: An integer variable or Tensor, used to number
+        checkpoints. Typically this value is saved along with other variables in
+        training checkpoints, which will happen automatically if it was created
+        by `root_checkpointable` or one of its dependencies (via
+        `Checkpointable._add_variable`).
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint.
+    """
+    named_variables, graph_proto = _serialize_object_graph(
+        self._root_checkpointable)
+    if not context.executing_eagerly():
+      if session is None:
+        session = ops.get_default_session()
+      if self._object_graph_feed_tensor is None:
+        with ops.device("/cpu:0"):
+          self._object_graph_feed_tensor = constant_op.constant(
+              "", dtype=dtypes.string)
+      object_graph_tensor = self._object_graph_feed_tensor
+      feed_additions = {object_graph_tensor: graph_proto.SerializeToString()}
+    else:
+      session = None
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      feed_additions = None
+    assert _OBJECT_GRAPH_PROTO_KEY not in named_variables
+    named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
+        tensor=object_graph_tensor,
+        name=_OBJECT_GRAPH_PROTO_KEY)
+    if (self._last_save_object_graph != graph_proto
+        # When executing eagerly, we need to re-create SaveableObjects each time
+        # save() is called so they pick up new Tensors passed to their
+        # constructors. That means the Saver needs to be copied with a new
+        # var_list.
+        or context.executing_eagerly()):
+      if self._last_save_object_graph is not None:
+        self._last_save_saver = _copy_saver_with_new_var_list(
+            old_saver=self._last_save_saver, new_var_list=named_variables)
+      else:
+        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
+      self._last_save_object_graph = graph_proto
+    with ops.device("/cpu:0"):
+      save_path = self._last_save_saver.save(
+          sess=_SessionWithFeedDictAdditions(
+              session=session, feed_additions=feed_additions),
+          save_path=file_prefix,
+          write_meta_graph=False,
+          global_step=checkpoint_number)
+    return save_path
+
+  def _global_variable_names(self):
+    """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s."""
+    named_saveables, graph_proto = _serialize_object_graph(
+        self._root_checkpointable)
+    saver_names = {}
+    for object_proto in graph_proto.nodes:
+      for attribute_proto in object_proto.attributes:
+        saver_names[attribute_proto.full_name] = named_saveables[
+            attribute_proto.checkpoint_key]
+    return saver_names
+
+  def restore(self, save_path):
+    """Restore a training checkpoint.
+
+    Restores `root_checkpointable` and any objects that it tracks
+    (transitive). Either assigns values immediately if variables to restore have
+    been created already, or defers restoration until the variables are
+    created. Dependencies added to the `root_checkpointable` passed to the
+    constructor after this call will be matched if they have a corresponding
+    object in the checkpoint.
+
+    When building a graph, restorations are added to the graph but not run.
+
+    To disallow deferred loading, assert immediately that all checkpointed
+    variables have been matched to variable objects:
+
+    ```python
+    saver = Saver(root)
+    saver.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised unless every object was matched and its
+    variables already exist.
+
+    When graph building, `assert_consumed()` indicates that all of the restore
+    ops which will be created for this checkpoint have been created. They can be
+    run via the `run_restore_ops()` function of the status object:
+
+    ```python
+    saver.restore(path).assert_consumed().run_restore_ops()
+    ```
+
+    If the checkpoint has not been consumed completely, then the list of restore
+    ops will grow as more objects are added to the dependency graph.
+
+    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    method. There is no deferred loading, and names are used to match
+    variables. No restore ops are created/run until `run_restore_ops()` or
+    `initialize_or_restore()` are called on the returned status object, even
+    when executing eagerly. Re-encode name-based checkpoints using this
+    object-based `Saver.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of checkpoint restoration and run initialization/restore ops
+      (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if
+      `save_path` is `None`).
+
+      If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus`
+      object is returned which runs restore ops from a name-based saver.
+    """
+    if save_path is None:
+      return InitializationOnlyStatus(self._root_checkpointable)
+    in_graph_mode = not context.executing_eagerly()
+    if in_graph_mode:
+      file_prefix_tensor = self._file_prefix_placeholder
+      file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
+    else:
+      with ops.device("/cpu:0"):
+        file_prefix_tensor = constant_op.constant(save_path)
+      file_prefix_feed_dict = None
+    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+    try:
+      object_graph_string = reader.get_tensor(_OBJECT_GRAPH_PROTO_KEY)
+    except errors_impl.NotFoundError:
+      # The object graph proto does not exist in this checkpoint. Try again with
+      # name-based saving.
+      return NameBasedSaverStatus(self, save_path)
+
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+    object_graph_proto.ParseFromString(object_graph_string)
+    if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
+      checkpoint = self._last_restore_checkpoint
+    else:
+      if in_graph_mode:
+        dtype_map = None
+      else:
+        dtype_map = reader.get_variable_to_dtype_map()
+      checkpoint = _CheckpointRestoreCoordinator(
+          object_graph_proto=object_graph_proto,
+          save_path=file_prefix_tensor,
+          dtype_map=dtype_map)
+      if in_graph_mode:
+        if self._last_restore_object_graph is not None:
+          raise NotImplementedError(
+              "Using a single Saver to restore different object graphs is not "
+              "currently supported when graph building. Use a different Saver "
+              "for each object graph (restore ops will be duplicated), or "
+              "file a feature request if this limitation bothers you.")
+        self._last_restore_checkpoint = checkpoint
+        self._last_restore_object_graph = object_graph_proto
+    checkpointable_lib._CheckpointPosition(  # pylint: disable=protected-access
+        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
+    load_status = CheckpointLoadStatus(
+        checkpoint, feed_dict=file_prefix_feed_dict)
+    return load_status
+
+
+class Checkpoint(checkpointable_lib.Checkpointable):
+  """A utility class which groups `Checkpointable` objects.
+
+  Accepts arbitrary keyword arguments to its constructor and saves those values
+  with a checkpoint. Maintains a `save_counter` for numbering checkpoints.
+
+  Example usage:
+
+  ```python
+  import tensorflow as tf
+  import tensorflow.contrib.eager as tfe
+  import os
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  root = tfe.Checkpoint(optimizer=optimizer, model=model)
+  root.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  for _ in range(num_training_steps):
+    optimizer.minimize( ... )
+  root.save(file_prefix=checkpoint_prefix)
+  ```
+
+  For more manual control over saving, use `tfe.CheckpointableSaver` directly.
+
+  Attributes:
+    save_counter: Incremented when `save()` is called. Used to number
+      checkpoints.
+  """
+
+  def __init__(self, **kwargs):
+    """Group objects into a training checkpoint.
+
+    Args:
+      **kwargs: Keyword arguments are set as attributes of this object, and are
+        saved with the checkpoint. Attribute values must derive from
+        `CheckpointableBase`.
+    Raises:
+      ValueError: If objects in `kwargs` are not Checkpointable.
+    """
+    super(Checkpoint, self).__init__()
+    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
+      if not isinstance(v, checkpointable_lib.CheckpointableBase):
+        raise ValueError(
+            ("`Checkpoint` was expecting an object derived from "
+             "`CheckpointableBase`, got %s.") % (v,))
+      setattr(self, k, v)
+    self._save_counter = None  # Created lazily for restore-on-create.
+    self._saver = CheckpointableSaver(weakref.ref(self))
+
+  def _maybe_create_save_counter(self):
+    """Create a save counter if it does not yet exist."""
+    if self._save_counter is None:
+      # Initialized to 0 and incremented before saving.
+      with ops.device("/cpu:0"):
+        self._save_counter = add_variable(
+            self, name="save_counter", initializer=0, dtype=dtypes.int64)
+
+  @property
+  def save_counter(self):
+    """An integer variable which starts at zero and is incremented on save.
+
+    Used to number checkpoints.
+
+    Returns:
+      The save counter variable.
+    """
+    self._maybe_create_save_counter()
+    return self._save_counter
+
+  def save(self, file_prefix, session=None):
+    """Save a checkpoint. Wraps `tfe.CheckpointableSaver.save`."""
+    in_graph_mode = not context.executing_eagerly()
+    if in_graph_mode:
+      if session is None:
+        session = ops.get_default_session()
+      if self._save_counter is None:
+        # When graph building, if this is a new save counter variable then it
+        # needs to be initialized before assign_add. This is only an issue if
+        # restore() has not been called first.
+        session.run(self.save_counter.initializer)
+    with ops.colocate_with(self.save_counter):
+      assign_op = self.save_counter.assign_add(1)
+    if in_graph_mode:
+      session.run(assign_op)
+    return self._saver.save(
+        file_prefix=file_prefix,
+        checkpoint_number=self.save_counter,
+        session=session)
+
+  def restore(self, save_path):
+    """Restore a checkpoint. Wraps `tfe.CheckpointableSaver.restore`."""
+    status = self._saver.restore(save_path=save_path)
+    # Create the save counter now so it gets initialized with other variables
+    # when graph building. Creating it earlier would lead to double
+    # initialization when executing eagerly.
+    self._maybe_create_save_counter()
+    return status
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable_utils_test.py
new file mode 100644
index 0000000000..ddf9820616
--- /dev/null
+++ b/tensorflow/python/training/checkpointable_utils_test.py
@@ -0,0 +1,1308 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import six
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import sequential
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import training_util
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class InterfaceTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testAddVariable(self):
+    obj = NonLayerCheckpointable()
+    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
+      checkpointable_utils.add_variable(
+          obj, name="shape_specified_twice", shape=[], initializer=1)
+    constant_initializer = checkpointable_utils.add_variable(
+        obj, name="constant_initializer", initializer=1)
+    with variable_scope.variable_scope("some_variable_scope"):
+      ones_initializer = checkpointable_utils.add_variable(
+          obj,
+          name="ones_initializer",
+          shape=[2],
+          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
+    bare_initializer = checkpointable_utils.add_variable(
+        obj,
+        name="bare_initializer",
+        shape=[2, 2],
+        dtype=dtypes.float64,
+        initializer=init_ops.zeros_initializer)
+
+    # Even in graph mode, there are no naming conflicts between objects, only
+    # naming conflicts within an object.
+    other_duplicate = resource_variable_ops.ResourceVariable(
+        name="duplicate", initial_value=1.)
+    duplicate = checkpointable_utils.add_variable(
+        obj, name="duplicate", shape=[])
+    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
+      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
+
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    self.assertEqual("constant_initializer:0", constant_initializer.name)
+    self.assertEqual(1, self.evaluate(constant_initializer))
+    self.assertEqual("some_variable_scope/ones_initializer:0",
+                     ones_initializer.name)
+    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
+    self.assertAllEqual([[0., 0.],
+                         [0., 0.]], self.evaluate(bare_initializer))
+    self.assertEqual("a_variable:0", obj.a_variable.name)
+    self.assertEqual("duplicate:0", other_duplicate.name)
+    if context.executing_eagerly():
+      # When executing eagerly, there's no uniquification of variable names. The
+      # checkpoint name will be the same.
+      self.assertEqual("duplicate:0", duplicate.name)
+    else:
+      # The .name attribute may be globally influenced, but the checkpoint name
+      # won't be (tested below).
+      self.assertEqual("duplicate_1:0", duplicate.name)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
+    expected_checkpoint_names = (
+        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
+        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
+        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+    )
+    six.assertCountEqual(
+        self, expected_checkpoint_names, named_variables.keys())
+
+  def testInitNotCalled(self):
+
+    class NoInit(checkpointable.Checkpointable):
+
+      def __init__(self):
+        pass
+
+    # __init__ for Checkpointable will be called implicitly.
+    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
+
+  def testShapeDtype(self):
+    root = checkpointable.Checkpointable()
+    v1 = checkpointable_utils.add_variable(
+        root, name="v1", initializer=3., dtype=dtypes.float64)
+    self.assertEqual(dtypes.float64, v1.dtype)
+    v2 = checkpointable_utils.add_variable(
+        root,
+        name="v2",
+        shape=[3],
+        initializer=init_ops.ones_initializer,
+        dtype=dtypes.float64)
+    self.assertEqual(dtypes.float64, v2.dtype)
+    self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
+
+
+class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    tensor = self._primary_variable.read_value()
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name)
+    super(_MirroringSaveable, self).__init__(
+        tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph = (
+        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step:0",
+        named_variables["optimizer_step" + suffix].name)
+    self.assertEqual(
+        "my_model/dense_1/kernel:0",
+        named_variables["model/_second/kernel" + suffix].name)
+    self.assertEqual(
+        "my_model/dense/kernel:0",
+        named_variables["model/_named_dense/kernel" + suffix].name)
+    self.assertEqual(
+        "beta1_power:0",
+        named_variables["optimizer/beta1_power" + suffix].name)
+    self.assertEqual(
+        "beta2_power:0",
+        named_variables["optimizer/beta2_power" + suffix].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=named_variables["model/_named_dense/kernel" + suffix],
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturned(self):
+    v = _OwnsMirroredVariables()
+    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(43.))
+    self.evaluate(v.mirrored.assign(44.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(42., self.evaluate(v.non_dep_variable))
+    self.assertEqual(42., self.evaluate(v.mirrored))
+    self.evaluate(v.non_dep_variable.assign(44.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(45.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(44., self.evaluate(v.non_dep_variable))
+    self.assertEqual(44., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturnedWithGlobalName(self):
+    # The same object can also be saved using the name-based saver.
+    v = _OwnsMirroredVariables()
+    saver = saver_lib.Saver(var_list=[v])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    with self.test_session() as sess:
+      save_path = saver.save(sess, prefix)
+      self.evaluate(v.non_dep_variable.assign(43.))
+      self.evaluate(v.mirrored.assign(44.))
+      saver.restore(sess, save_path)
+      self.assertEqual(42., self.evaluate(v.non_dep_variable))
+      self.assertEqual(42., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(saver_lib.latest_checkpoint(checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableNameEscaping(self):
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    self.assertEqual(r"a.Sb.Sc" + suffix, self._get_checkpoint_name(r"a/b/c"))
+    self.assertEqual(r"b" + suffix, self._get_checkpoint_name(r"b"))
+    self.assertEqual(r"c.S" + suffix, self._get_checkpoint_name(r"c/"))
+    self.assertEqual(r"d.S..S" + suffix, self._get_checkpoint_name(r"d/.S"))
+    self.assertEqual(r"d.S..ATTRIBUTES.Sf" + suffix,
+                     self._get_checkpoint_name(r"d/.ATTRIBUTES/f"))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNumberedPath(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    root.leaf = leaf
+    checkpointable_utils.add_variable(leaf, name="v", shape=[])
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    variable_name, = named_variables.keys()
+    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLocalNameValidation(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    # Dots are escaped, which avoids conflicts with reserved names.
+    root._track_checkpointable(leaf, name=".ATTRIBUTES")
+    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    name, = named_variables.keys()
+    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLateDependencyTracking(self):
+
+    class Dependency(checkpointable.Checkpointable):
+
+      def build(self):
+        self.var = checkpointable_utils.add_variable(
+            self, "var", initializer=0.)
+
+    class LateDependencies(checkpointable.Checkpointable):
+
+      def add_dep(self):
+        self.dep = Dependency()
+        self.dep.build()
+
+    original = LateDependencies()
+    original.add_dep()
+    self.evaluate(state_ops.assign(original.dep.var, 123.))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpointable_utils.CheckpointableSaver(
+        original).save(checkpoint_prefix)
+    load_into = LateDependencies()
+    status = checkpointable_utils.CheckpointableSaver(
+        load_into).restore(save_path)
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    load_into.add_dep()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(123., self.evaluate(load_into.dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDepAfterVar(self):
+
+    class Dependency(checkpointable.Checkpointable):
+
+      def build(self):
+        self.var = checkpointable_utils.add_variable(
+            self, "var", initializer=0.)
+
+    class DepAfterVar(checkpointable.Checkpointable):
+
+      def add_dep(self):
+        dep = Dependency()
+        dep.build()
+        self.dep = dep
+
+    dep_after_var = DepAfterVar()
+    dep_after_var.add_dep()
+    self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
+        checkpoint_prefix)
+
+    loaded_dep_after_var = DepAfterVar()
+    status = checkpointable_utils.CheckpointableSaver(
+        loaded_dep_after_var).restore(save_path)
+    loaded_dep_after_var.add_dep()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = checkpointable.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = checkpointable.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOverlappingRestores(self):
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep = checkpointable.Checkpointable()
+    save_root.dep.var = checkpointable_utils.add_variable(
+        save_root.dep, name="var", initializer=0.)
+    self.evaluate(state_ops.assign(save_root.dep.var, 12.))
+    saver = checkpointable_utils.CheckpointableSaver(save_root)
+    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
+    self.evaluate(state_ops.assign(save_root.dep.var, 13.))
+    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
+
+    first_root = checkpointable.Checkpointable()
+    second_root = checkpointable.Checkpointable()
+    first_status = checkpointable_utils.CheckpointableSaver(
+        first_root).restore(first_path)
+    second_status = checkpointable_utils.CheckpointableSaver(
+        second_root).restore(second_path)
+    load_dep = checkpointable.Checkpointable()
+    load_dep.var = checkpointable_utils.add_variable(
+        load_dep, name="var", shape=[])
+    first_root.dep = load_dep
+    first_status.assert_consumed()
+    first_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+    second_root.dep = load_dep
+    second_status.assert_consumed()
+    second_status.run_restore_ops()
+    self.assertEqual(13., self.evaluate(load_dep.var))
+
+    # Try again with the order of the restore() reversed. The last restore
+    # determines the final value.
+    first_root = checkpointable.Checkpointable()
+    second_root = checkpointable.Checkpointable()
+    second_status = checkpointable_utils.CheckpointableSaver(
+        second_root).restore(second_path)
+    first_status = checkpointable_utils.CheckpointableSaver(
+        first_root).restore(first_path)
+    load_dep = checkpointable.Checkpointable()
+    load_dep.var = checkpointable_utils.add_variable(
+        load_dep, name="var", shape=[])
+    first_root.dep = load_dep
+    first_status.assert_consumed()
+    first_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+    second_root.dep = load_dep
+    second_status.assert_consumed()
+    second_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAmbiguousLoad(self):
+    # Not OK to split one checkpoint object into two
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep_one = checkpointable.Checkpointable()
+    save_root.dep_two = checkpointable.Checkpointable()
+    dep_three = checkpointable.Checkpointable()
+    save_root.dep_one.dep_three = dep_three
+    save_root.dep_two.dep_three = dep_three
+    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(save_root))
+    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+    load_root = checkpointable.Checkpointable()
+    checkpointable_utils.CheckpointableSaver(load_root).restore(save_path)
+    load_root.dep_one = checkpointable.Checkpointable()
+    load_root.dep_two = checkpointable.Checkpointable()
+    load_root.dep_one.dep_three = checkpointable.Checkpointable()
+    with self.assertRaisesRegexp(AssertionError,
+                                 "resolved to different objects"):
+      load_root.dep_two.dep_three = checkpointable.Checkpointable()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testObjectsCombined(self):
+    # Currently fine to load two checkpoint objects into one Python object
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep_one = checkpointable.Checkpointable()
+    save_root.dep_two = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
+    checkpointable_utils.add_variable(
+        save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
+    self.evaluate(checkpointable_utils.gather_initializers(save_root))
+    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+    load_root = checkpointable.Checkpointable()
+    load_root.dep_one = checkpointable.Checkpointable()
+    load_root.dep_two = load_root.dep_one
+    v1 = checkpointable_utils.add_variable(
+        load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
+    v2 = checkpointable_utils.add_variable(
+        load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
+    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+        save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(32., self.evaluate(v1))
+    self.assertEqual(64., self.evaluate(v2))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDependencyLoop(self):
+    # Note: this test creates garbage during eager execution because it
+    # purposefully creates a reference cycle.
+    first = checkpointable.Checkpointable()
+    second = checkpointable.Checkpointable()
+    first.second = second
+    second.first = first
+    first.v = checkpointable_utils.add_variable(
+        first, "v1", initializer=[3., 1., 4.])
+    second.v = checkpointable_utils.add_variable(
+        second, "v2", initializer=[1., 1., 2., 3.])
+    self.evaluate(checkpointable_utils.gather_initializers(first))
+    checkpoint_directory = self.get_temp_dir()
+    save_path = checkpointable_utils.CheckpointableSaver(first).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+
+    # Test deferred loading
+    first_load = checkpointable.Checkpointable()
+    status = checkpointable_utils.CheckpointableSaver(
+        first_load).restore(save_path)
+    second_load = checkpointable.Checkpointable()
+    first_load.second = second_load
+    second_load.first = first_load
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    first_load.v = checkpointable_utils.add_variable(
+        first_load, "v1", shape=[3])
+    second_load.v = checkpointable_utils.add_variable(
+        second_load, "v2", shape=[4])
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
+    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
+
+    # Test loading when variables have already been created
+    self.evaluate(first_load.v.assign([2., 7., 1.]))
+    self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
+    self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
+    self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
+    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
+        save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
+    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRestoreOnAssign(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(save_graph):
+      first = checkpointable.Checkpointable()
+      first.var1 = variable_scope.get_variable(
+          name="outside_var", initializer=0.)
+      first.var2 = variable_scope.get_variable(
+          name="blah", initializer=0.)
+      self.evaluate(first.var1.assign(4.))
+      self.evaluate(first.var2.assign(8.))
+      save_path = checkpointable_utils.CheckpointableSaver(first).save(
+          checkpoint_prefix)
+    restore_graph = ops.Graph()
+    with restore_graph.as_default(), self.test_session(restore_graph):
+      second = checkpointable.Checkpointable()
+      second.var2 = variable_scope.get_variable(
+          name="blah", initializer=0.)
+      status = checkpointable_utils.CheckpointableSaver(
+          second).restore(save_path)
+      recreated_var1 = variable_scope.get_variable(
+          name="outside_var", initializer=0.)
+      status.run_restore_ops()
+      self.assertEqual(8., self.evaluate(second.var2))
+      self.evaluate(recreated_var1.assign(-2.))
+      self.assertEqual(-2., self.evaluate(recreated_var1))
+      second.var1 = recreated_var1
+      status.run_restore_ops()
+      self.assertEqual(4., self.evaluate(recreated_var1))
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCheckpointCleanup(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    saver = checkpointable_utils.Checkpoint(obj=obj)
+    for _ in range(10):
+      saver.save(checkpoint_prefix)
+    expected_filenames = ["checkpoint"]
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCheckpointCleanupChangingVarList(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    looped_variables = []
+    for iteration in range(10):
+      new_variable = resource_variable_ops.ResourceVariable(iteration)
+      self.evaluate(new_variable.initializer)
+      setattr(checkpoint, "var_%d" % iteration, new_variable)
+      checkpoint.save(checkpoint_prefix)
+      looped_variables.append(new_variable)
+    expected_filenames = ["checkpoint"]
+    # We've copied the saver each time, but checkpoint management should still
+    # be consistent.
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+    for v in looped_variables:
+      self.evaluate(v.assign(314))
+    checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
+    self.assertEqual(314, self.evaluate(checkpoint.var_9))
+    self.assertEqual(314, self.evaluate(checkpoint.var_8))
+    self.assertEqual(314, self.evaluate(checkpoint.var_6))
+    self.assertEqual(5, self.evaluate(checkpoint.var_5))
+    self.assertEqual(1, self.evaluate(checkpoint.var_1))
+    self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    if context.executing_eagerly():
+      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+      self.assertEqual(9, self.evaluate(checkpoint.var_9))
+      self.assertEqual(8, self.evaluate(checkpoint.var_8))
+      self.assertEqual(1, self.evaluate(checkpoint.var_1))
+      self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    else:
+      # Restoring into modified graphs is an error while graph building.
+      with self.assertRaises(NotImplementedError):
+        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sequential(self):
+    model = sequential.Sequential()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    model.add(core.Dense(4))
+    second_dense = core.Dense(5)
+    model.add(second_dense)
+    model(constant_op.constant([[1.]]))
+    checkpoint.restore(None).initialize_or_restore()
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([1., 2., 3., 4., 5.])))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([5., 6., 7., 8., 9.])))
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
+
+    deferred_sequential = sequential.Sequential()
+    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+        model=deferred_sequential)
+    status = deferred_sequential_checkpoint.restore(save_path)
+    deferred_sequential.add(core.Dense(4))
+    deferred_sequential(constant_op.constant([[1.]]))
+    deferred_second_dense = core.Dense(5)
+    deferred_sequential.add(deferred_second_dense)
+    deferred_sequential(constant_op.constant([[1.]]))
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.],
+                        self.evaluate(deferred_second_dense.bias))
+
+
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer())
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer())
+      return v, v + 1., v2
+
+    save_template = template.make_template("s1", _templated)
+    save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+    v1_save, _, v2_save = save_template()
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2 = load_template()
+    self.assertEqual(2, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore_nested(self):
+
+    def _inner_template():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer())
+      return v
+
+    def _outer_template():
+      first_inner = template.make_template("i1", _inner_template)
+      second_inner = template.make_template("i2", _inner_template)
+      v1 = first_inner()
+      v2 = second_inner()
+      v3 = second_inner()
+      return (first_inner, second_inner), (v1, v2, v3)
+
+    with variable_scope.variable_scope("ignored"):
+      save_template = template.make_template("s1", _outer_template)
+      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+      (inner_template_one, inner_template_two), _ = save_template()
+    self.evaluate(inner_template_one.variables[0].assign([20.]))
+    self.evaluate(inner_template_two.variables[0].assign([25.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _outer_template)
+    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    status = load_root.restore(save_path)
+    (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
+    outer_template_dependencies = load_root.my_template._checkpoint_dependencies
+    self.assertEqual(2, len(outer_template_dependencies))
+    self.assertEqual("i1", outer_template_dependencies[0].name)
+    self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
+    self.assertEqual("i2", outer_template_dependencies[1].name)
+    self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
+    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
+    self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
+    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
+    self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([20.], self.evaluate(v1))
+    self.assertAllEqual([25.], self.evaluate(v2))
+    self.assertAllEqual([25.], self.evaluate(v3))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = saver_lib.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  # TODO(allenl): Test for the core name-based saver loading object-based
+  # checkpoints once object-based checkpointing is in core.
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        object_saver = checkpointable_utils.CheckpointableSaver(root)
+        save_path = object_saver.save(
+            session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From 6d2316d4a75be1c603e4edd08a33e1098a28b070 Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Thu, 12 Apr 2018 12:04:48 -0700
Subject: [PATCH 0695/1262] Add FunctionTest.testLayerInDefun

PiperOrigin-RevId: 192647818
---
 tensorflow/python/eager/BUILD            |  2 ++
 tensorflow/python/eager/function_test.py | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 8c0d3feece..b3268c9047 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -142,6 +142,8 @@ cuda_py_test(
         ":tape",
         ":test",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 9af197981b..65dde75e60 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -29,9 +29,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
@@ -104,6 +106,7 @@ class FunctionTest(test.TestCase):
     matmul = function.defun(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
+
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
@@ -312,6 +315,7 @@ class FunctionTest(test.TestCase):
         x = variable_scope.get_variable(
             'v', initializer=constant_op.constant(1.0))
         return x * constant_op.constant(2.0)
+
       with self.assertRaisesRegexp(ValueError,
                                    'No trainable variables were accessed'):
         backprop.implicit_val_and_grad(f)()
@@ -581,6 +585,7 @@ class FunctionTest(test.TestCase):
       with ops.name_scope('foo'):
         v = resource_variable_ops.ResourceVariable(0.0, name='bar')
       self.assertEqual(v.name, 'foo/bar:0')
+
     create_variable()
 
   def testVariableNamesRespectNameScopesWithDefunInGraph(self):
@@ -590,9 +595,25 @@ class FunctionTest(test.TestCase):
         with ops.name_scope('foo'):
           v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
         self.assertEqual(v.name, 'foo/bar:0')
+
       with ops.get_default_graph().as_default():
         create_variable()
 
+  def testLayerInDefun(self):
+    conv = convolutional.Conv2D(
+        filters=1,
+        kernel_size=2,
+        kernel_initializer=init_ops.ones_initializer(),
+        bias_initializer=init_ops.zeros_initializer())
+
+    @function.defun
+    def model(x):
+      return conv(x)
+
+    x = array_ops.ones([1, 2, 2, 1])
+    y = model(x)
+    self.assertAllEqual([[[[4.0]]]], y.numpy())
+
 
 class AutomaticControlDependenciesTest(test.TestCase):
 
-- 
GitLab


From 6308e58e32e0d238e7df35b4c8a5935c3327d79a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 12:09:43 -0700
Subject: [PATCH 0696/1262] Add softsign bijector.

PiperOrigin-RevId: 192648596
---
 tensorflow/contrib/distributions/BUILD        |  19 +++
 .../kernel_tests/bijectors/softsign_test.py   | 111 ++++++++++++++++++
 .../python/ops/bijectors/__init__.py          |   2 +
 .../python/ops/bijectors/softsign.py          |  86 ++++++++++++++
 4 files changed, 218 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/softsign.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index fec6eafd4a..20e432b88d 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -1174,6 +1174,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "softsign_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/softsign_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "square_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
new file mode 100644
index 0000000000..2ac06fce55
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
@@ -0,0 +1,111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.softsign import Softsign
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+
+class SoftsignBijectorTest(test.TestCase):
+  """Tests the correctness of the Y = g(X) = X / (1 + |X|) transformation."""
+
+  def _softsign(self, x):
+    return x / (1. + np.abs(x))
+
+  def _softsign_ildj_before_reduction(self, y):
+    """Inverse log det jacobian, before being reduced."""
+    return -2. * np.log1p(-np.abs(y))
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorBounds(self):
+    bijector = Softsign(validate_args=True)
+    with self.test_session():
+      with self.assertRaisesOpError("greater than -1"):
+        bijector.inverse(-3.).eval()
+      with self.assertRaisesOpError("greater than -1"):
+        bijector.inverse_log_det_jacobian(-3., event_ndims=0).eval()
+
+      with self.assertRaisesOpError("less than 1"):
+        bijector.inverse(3.).eval()
+      with self.assertRaisesOpError("less than 1"):
+        bijector.inverse_log_det_jacobian(3., event_ndims=0).eval()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorForwardInverse(self):
+    bijector = Softsign(validate_args=True)
+    self.assertEqual("softsign", bijector.name)
+    x = 2. * self._rng.randn(2, 10)
+    y = self._softsign(x)
+
+    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
+    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorLogDetJacobianEventDimsZero(self):
+    bijector = Softsign(validate_args=True)
+    y = self._rng.rand(2, 10)
+    # No reduction needed if event_dims = 0.
+    ildj = self._softsign_ildj_before_reduction(y)
+
+    self.assertAllClose(ildj, self.evaluate(
+        bijector.inverse_log_det_jacobian(y, event_ndims=0)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorForwardInverseEventDimsOne(self):
+    bijector = Softsign(validate_args=True)
+    self.assertEqual("softsign", bijector.name)
+    x = 2. * self._rng.randn(2, 10)
+    y = self._softsign(x)
+    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
+    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorLogDetJacobianEventDimsOne(self):
+    bijector = Softsign(validate_args=True)
+    y = self._rng.rand(2, 10)
+    ildj_before = self._softsign_ildj_before_reduction(y)
+    ildj = np.sum(ildj_before, axis=1)
+    self.assertAllClose(
+        ildj, self.evaluate(
+            bijector.inverse_log_det_jacobian(y, event_ndims=1)))
+
+  def testScalarCongruency(self):
+    with self.test_session():
+      bijector = Softsign(validate_args=True)
+      assert_scalar_congruency(bijector, lower_x=-20., upper_x=20.)
+
+  def testBijectiveAndFinite(self):
+    with self.test_session():
+      bijector = Softsign(validate_args=True)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = np.linspace(-0.99, 0.99, 100).astype(np.float32)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=0, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index bc6b02542e..babce80396 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -38,6 +38,7 @@
 @@SinhArcsinh
 @@SoftmaxCentered
 @@Softplus
+@@Softsign
 @@Square
 @@Weibull
 
@@ -74,6 +75,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
+from tensorflow.contrib.distributions.python.ops.bijectors.softsign import *
 from tensorflow.contrib.distributions.python.ops.bijectors.square import *
 from tensorflow.python.ops.distributions.bijector import *
 from tensorflow.python.ops.distributions.identity_bijector import Identity
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py b/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py
new file mode 100644
index 0000000000..b4a658c171
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py
@@ -0,0 +1,86 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Softsign bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "Softsign",
+]
+
+
+class Softsign(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = X / (1 + |X|)`.
+
+  The softsign `Bijector` has the following two useful properties:
+
+  * The domain is all real numbers
+  * `softsign(x) approx sgn(x)`, for large `|x|`.
+
+  #### Examples
+
+  ```python
+  # Create the Y = softsign(X) transform.
+  softsign = Softsign()
+  x = [[[1., 2],
+        [3, 4]],
+       [[5, 6],
+        [7, 8]]]
+  x / (1 + abs(x)) == softsign.forward(x)
+  x / (1 - abs(x)) == softsign.inverse(x)
+  ```
+  """
+
+  def __init__(self, validate_args=False, name="softsign"):
+    super(Softsign, self).__init__(
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    return x / (1. + math_ops.abs(x))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return y / (1. - math_ops.abs(y))
+
+  def _forward_log_det_jacobian(self, x):
+    return -2. * math_ops.log1p(math_ops.abs(x))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return -2. * math_ops.log1p(-math_ops.abs(y))
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_valid = [
+        check_ops.assert_greater(
+            y, math_ops.cast(-1., dtype=y.dtype.base_dtype),
+            message="Inverse transformation input must be greater than -1."),
+        check_ops.assert_less(
+            y, math_ops.cast(1., dtype=y.dtype.base_dtype),
+            message="Inverse transformation input must be less than 1.")
+    ]
+
+    return control_flow_ops.with_dependencies(is_valid, y)
-- 
GitLab


From ecacd206c44811baa75bef07b2ce99cd1021163c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 12:12:16 -0700
Subject: [PATCH 0697/1262] [XLA] Redesign: add XlaComputation::IsNull.

PiperOrigin-RevId: 192649052
---
 tensorflow/compiler/xla/client/xla_client/xla_computation.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index 2a3c695266..7182908666 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -44,6 +44,9 @@ class XlaComputation {
 
   const HloModuleProto& proto() const { return proto_; }
 
+  // Returns true if this object is a null Computation.
+  bool IsNull() const { return unique_id_ == -1; }
+
  private:
   XlaComputation(const int64 unique_id) : unique_id_(unique_id) {}
   HloModuleProto* mutable_proto() { return &proto_; }
-- 
GitLab


From 1a014c6d62bad0e58e3c8a1e31beb396daa19c13 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 12:29:48 -0700
Subject: [PATCH 0698/1262] Restore dependency on estimator utils from model.

PiperOrigin-RevId: 192651583
---
 tensorflow/contrib/boosted_trees/estimator_batch/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 0f65881aee..8cff1a3bb1 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -28,12 +28,13 @@ py_library(
     srcs = ["model.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
     ],
 )
 
-- 
GitLab


From 7bf6efa2d8e1172df47c1c4a8a09a007a1a09e8f Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Fri, 13 Apr 2018 03:36:52 +0800
Subject: [PATCH 0699/1262] Replace all COMPILER_MSVC to _MSC_VER and _WIN32
 accordingly (#18448)

* Replace all COMPILER_MSVC to _MSC_VER and _WIN32 accordingly

* One more ARRAYSIZE to TF_ARRAYSIZE

* Delete non-existing include
---
 tensorflow/c/c_api.h                               |  4 ++--
 tensorflow/c/c_api_experimental.h                  |  4 ++--
 tensorflow/c/eager/c_api.h                         |  4 ++--
 tensorflow/compiler/aot/runtime.cc                 |  4 ++--
 tensorflow/contrib/cmake/CMakeLists.txt            |  2 +-
 tensorflow/core/framework/numeric_types.h          |  4 ++--
 tensorflow/core/lib/gtl/manual_constructor.h       |  2 +-
 tensorflow/core/lib/strings/stringprintf.cc        | 10 ++--------
 tensorflow/core/lib/strings/stringprintf_test.cc   |  4 ++--
 tensorflow/core/util/memmapped_file_system.cc      |  2 +-
 tensorflow/core/util/memmapped_file_system.h       |  4 ++--
 tensorflow/stream_executor/cuda/cuda_driver.cc     | 14 +++-----------
 .../stream_executor/cuda/cuda_gpu_executor.cc      |  2 +-
 tensorflow/stream_executor/platform/port.h         |  6 ------
 14 files changed, 23 insertions(+), 43 deletions(-)

diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index fe85f8ee0e..c859434745 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -72,7 +72,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -80,7 +80,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 666342974e..88cb173cd2 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -35,7 +35,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -43,7 +43,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 3926c22ce1..c06ce84a8c 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -30,7 +30,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -38,7 +38,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 5772776666..5e74079fc1 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,7 +31,7 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
-#elif defined(COMPILER_MSVC)
+#elif defined(_WIN32)
   return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
@@ -48,7 +48,7 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 }
 
 inline void aligned_free(void* aligned_memory) {
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
   _aligned_free(aligned_memory);
 #else
   free(aligned_memory);
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 23b31ae1dc..a7944ea74a 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -124,7 +124,7 @@ endif()
 
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11)
   add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index dab53cba3e..b1d0127809 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -111,7 +111,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 abs(
 }  // namespace numext
 }  // namespace Eigen
 
-#if defined(COMPILER_MSVC) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 namespace std {
 template <>
 struct hash<Eigen::half> {
@@ -120,6 +120,6 @@ struct hash<Eigen::half> {
   }
 };
 }  // namespace std
-#endif  // COMPILER_MSVC
+#endif  // _MSC_VER
 
 #endif  // TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h
index 0a76e0962e..0176cdc94d 100644
--- a/tensorflow/core/lib/gtl/manual_constructor.h
+++ b/tensorflow/core/lib/gtl/manual_constructor.h
@@ -53,7 +53,7 @@ template <int size>
 struct AlignType<0, size> {
   typedef char result[size];
 };
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 #define TF_LIB_GTL_ALIGN_ATTRIBUTE(X) __declspec(align(X))
 #define TF_LIB_GTL_ALIGN_OF(T) __alignof(T)
 #elif defined(COMPILER_GCC3) || __GNUC__ >= 3 || defined(__APPLE__) || \
diff --git a/tensorflow/core/lib/strings/stringprintf.cc b/tensorflow/core/lib/strings/stringprintf.cc
index 03eba4c851..bbffa062a9 100644
--- a/tensorflow/core/lib/strings/stringprintf.cc
+++ b/tensorflow/core/lib/strings/stringprintf.cc
@@ -22,12 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-#ifdef COMPILER_MSVC
-enum { IS_COMPILER_MSVC = 1 };
-#else
-enum { IS_COMPILER_MSVC = 0 };
-#endif
-
 void Appendv(string* dst, const char* format, va_list ap) {
   // First try with a small fixed size buffer
   static const int kSpaceLength = 1024;
@@ -48,13 +42,13 @@ void Appendv(string* dst, const char* format, va_list ap) {
       return;
     }
 
-    if (IS_COMPILER_MSVC) {
+#ifdef _MSC_VER
       // Error or MSVC running out of space.  MSVC 8.0 and higher
       // can be asked about space needed with the special idiom below:
       va_copy(backup_ap, ap);
       result = vsnprintf(nullptr, 0, format, backup_ap);
       va_end(backup_ap);
-    }
+#endif
 
     if (result < 0) {
       // Just an error.
diff --git a/tensorflow/core/lib/strings/stringprintf_test.cc b/tensorflow/core/lib/strings/stringprintf_test.cc
index d61a1a945a..02cf4cbcad 100644
--- a/tensorflow/core/lib/strings/stringprintf_test.cc
+++ b/tensorflow/core/lib/strings/stringprintf_test.cc
@@ -30,9 +30,9 @@ TEST(PrintfTest, Empty) {
 
 TEST(PrintfTest, Misc) {
 // MSVC does not support $ format specifier.
-#if !defined(COMPILER_MSVC)
+#if !defined(_MSC_VER)
   EXPECT_EQ("123hello w", Printf("%3$d%2$s %1$c", 'w', "hello", 123));
-#endif  // !COMPILER_MSVC
+#endif  // !_MSC_VER
 }
 
 TEST(AppendfTest, Empty) {
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index 1fa6b8bec0..d3439cbc93 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -185,7 +185,7 @@ const void* MemmappedFileSystem::GetMemoryWithOffset(uint64 offset) const {
   return reinterpret_cast<const uint8*>(mapped_memory_->data()) + offset;
 }
 
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 constexpr char* MemmappedFileSystem::kMemmappedPackagePrefix;
 constexpr char* MemmappedFileSystem::kMemmappedPackageDefaultGraphDef;
 #else
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 76cc4911f5..958e23d28e 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -53,7 +53,7 @@ class MemmappedFileSystem : public FileSystem {
  public:
   // Memmapped regions use this prefix to distinguish from
   // the filesystem.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackagePrefix =
 #else
   static constexpr char kMemmappedPackagePrefix[] =
@@ -61,7 +61,7 @@ class MemmappedFileSystem : public FileSystem {
       "memmapped_package://";
 
 // The default graphdef in the package.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackageDefaultGraphDef =
 #else
   static constexpr char kMemmappedPackageDefaultGraphDef[] =
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 58e1e58c59..b06be69b64 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -37,14 +37,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-#if defined(PLATFORM_WINDOWS)
-// TODO: in windows ARRAYSIZE is defined in winnt.h but including it
-//  here creates a conflict with cuda.h - for now define it here.
-#define ARRAYSIZE(a) \
-  ((sizeof(a) / sizeof(*(a))) / \
-  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
 bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
 bool FLAGS_gpuexec_cuda_device_0_only = false;
@@ -720,15 +712,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
         port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
         port::bit_cast<void *>(info_log_buffer.data()),
         port::bit_cast<void *>(uintptr_t(log_verbose))};
-    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
     {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
 
-      res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
-                               option_values);
+      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options),
+                               options, option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 5ecaf46b8c..58ca0d3a97 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -1127,7 +1127,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_name(device_name);
   }
 
-  for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
+  for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
     const auto &params = kAllUnqueryableDeviceParams[i];
     if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
       builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 6603df4878..db62100435 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -39,12 +39,6 @@ using tensorflow::uint64;
 using std::string;
 #endif
 
-#if !defined(COMPILER_MSVC)
-#define ARRAYSIZE(a)              \
-    ((sizeof(a) / sizeof(*(a))) / \
-    static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 using tensorflow::LinkerInitialized;
 using tensorflow::LINKER_INITIALIZED;
 
-- 
GitLab


From f95906527e92a151a424b60a109d2361e20d610b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 12:39:48 -0700
Subject: [PATCH 0700/1262] Fix comment of bucket_by_sequence_length about
 return type of element_length_func. Current code requires tf.int32 in order
 to compare with buckets_min which is int32.

PiperOrigin-RevId: 192652917
---
 tensorflow/contrib/data/python/ops/grouping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 36591c055a..0531f9cbb9 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -108,7 +108,7 @@ def bucket_by_sequence_length(element_length_func,
   fraction of padding in a batch which increases training step efficiency.
 
   Args:
-    element_length_func: function from element in `Dataset` to `tf.int64`,
+    element_length_func: function from element in `Dataset` to `tf.int32`,
       determines the length of the element, which will determine the bucket it
       goes into.
     bucket_boundaries: `list<int>`, upper length boundaries of the buckets.
-- 
GitLab


From 3add17c999e7a50442fb5c97d2bb2d88597d5039 Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Fri, 13 Apr 2018 03:57:26 +0800
Subject: [PATCH 0701/1262] [MSVC] Remove -D__VERSION__ flag and implement
 tf_compiler_version properly (#18445)

---
 tensorflow/contrib/cmake/tf_core_framework.cmake |  6 ------
 tensorflow/tensorflow.bzl                        |  1 -
 tensorflow/tools/git/gen_git_source.py           | 10 +++++++++-
 tensorflow/tools/git/gen_git_source.sh           | 10 +++++++++-
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index bcfb4f0819..f7cb186c7c 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -341,9 +341,3 @@ add_dependencies(tf_core_framework
     tf_core_lib
     proto_text
 )
-
-if(WIN32)
-  # Cmake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on.
-  # Instead of defining this global, limit it to tf_core_framework where its used.
-  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
-endif()
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 528f811b40..bfb28d22a9 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -163,7 +163,6 @@ def if_override_eigen_strong_inline(a):
 
 def get_win_copts(is_external=False):
     WINDOWS_COPTS = [
-        "/D__VERSION__=\\\"MSVC\\\"",
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
         "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 6a1f126131..372329b70c 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -178,7 +178,15 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index db20bb00e8..cd128af6b3 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -28,7 +28,15 @@ fi
 cat <<EOF > ${OUTPUT_FILENAME}
 #include <string>
 const char* tf_git_version() {return "${GIT_VERSION}";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
-- 
GitLab


From 393a65caac76f5b4a3fa4c3edc98000a4a62b2e4 Mon Sep 17 00:00:00 2001
From: Rholais Lii <rholais@gmail.com>
Date: Fri, 13 Apr 2018 03:57:39 +0800
Subject: [PATCH 0702/1262] Reorder section `Using SavedModel with Estimators`
 (#18412)

Outputs should be specified before performing an export.
---
 .../docs_src/programmers_guide/saved_model.md | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 55ee42dd64..c6ef87c54a 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -485,31 +485,7 @@ portion of the signature.  That is, when writing a
 to expect and how to map them to your model's expected inputs.
 By contrast, the *output* portion of the signature is determined by the model.
 
-
-### Perform the export
-
-To export your trained Estimator, call
-@{tf.estimator.Estimator.export_savedmodel} with the export base path and
-the `serving_input_receiver_fn`.
-
-```py
-estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                            strip_default_attrs=True)
-```
-
-This method builds a new graph by first calling the
-`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
-this `Estimator`'s `model_fn()` to generate the model graph based on those
-features. It starts a fresh `Session`, and, by default, restores the most recent
-checkpoint into it.  (A different checkpoint may be passed, if needed.)
-Finally it creates a time-stamped export directory below the given
-`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
-SavedModel into it containing a single `MetaGraphDef` saved from this
-Session.
-
-> Note: It is your responsibility to garbage-collect old exports.
-> Otherwise, successive exports will accumulate under `export_dir_base`.
-
+<a name="specify_outputs"></a>
 ### Specify the outputs of a custom model
 
 When writing a custom `model_fn`, you must populate the `export_outputs` element
@@ -541,6 +517,30 @@ using [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tens
 indicating which `SignatureDef` will be served when an inference request
 does not specify one.
 
+<a name="perform_export"></a>
+### Perform the export
+
+To export your trained Estimator, call
+@{tf.estimator.Estimator.export_savedmodel} with the export base path and
+the `serving_input_receiver_fn`.
+
+```py
+estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                            strip_default_attrs=True)
+```
+
+This method builds a new graph by first calling the
+`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
+this `Estimator`'s `model_fn()` to generate the model graph based on those
+features. It starts a fresh `Session`, and, by default, restores the most recent
+checkpoint into it.  (A different checkpoint may be passed, if needed.)
+Finally it creates a time-stamped export directory below the given
+`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
+SavedModel into it containing a single `MetaGraphDef` saved from this
+Session.
+
+> Note: It is your responsibility to garbage-collect old exports.
+> Otherwise, successive exports will accumulate under `export_dir_base`.
 
 ### Serve the exported model locally
 
-- 
GitLab


From 9e3077475cf86d8ed615a478984818d84b37d29c Mon Sep 17 00:00:00 2001
From: brett koonce <koonce@hello.com>
Date: Thu, 12 Apr 2018 12:57:48 -0700
Subject: [PATCH 0703/1262] contrib: minor spelling tweaks (#18330)

* contrib: minor spelling tweaks

* Fix lint error
---
 .../estimator/python/estimator/replicate_model_fn.py   |  4 ++--
 .../python/ops/fused_conv2d_bias_activation_op.py      |  2 +-
 .../python/ops/fused_conv2d_bias_activation_op_test.py | 10 +++++-----
 .../kernel_tests/sparse_feature_cross_op_test.py       |  2 +-
 .../contrib/layers/python/layers/feature_column.py     |  2 +-
 .../contrib/layers/python/layers/feature_column_ops.py |  4 ++--
 tensorflow/contrib/layers/python/layers/layers.py      |  4 ++--
 .../meta_graph_transform/meta_graph_transform.py       |  2 +-
 tensorflow/contrib/optimizer_v2/optimizer_v2.py        |  2 +-
 9 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index fa2697800e..a8774d6dab 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -456,7 +456,7 @@ def _get_local_devices(device_type):
 
 
 def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labes into batches."""
+  """Split input features and labels into batches."""
 
   def ensure_divisible_by_shards(sequence):
     batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
@@ -602,7 +602,7 @@ def _local_device_setter(worker_device, ps_devices, ps_strategy):
 
 
 def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with approproriately scaled loss."""
+  """Produce an EstimatorSpec with appropriately scaled loss."""
   if tower_spec.loss is None:
     return tower_spec
 
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index a97adf622e..983b6dc8e5 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -65,7 +65,7 @@ def fused_conv2d_bias_activation(conv_input,
     side_input_scale: A scalar `float32` that will be multiplied by side_input.
         This is optional and defaults to 0.
     side_input: A `Tensor` of the format specified by `data_format`.
-        This is useful for imlementing ResNet blocks.
+        This is useful for implementing ResNet blocks.
     activation_mode: (optional) currently must be the default "Relu".
         Note that in qint8 mode, it also clips to 127, so acts like ReluX.
     data_format: Specifies the data format.
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index bb155aa249..3d0ed89932 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -566,7 +566,7 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
-def CalculateCovolvedOutputDim(input_dim, filter_dim, stride, padding_type):
+def CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
   """Calculates the size of an output dimension of a strided convolution.
 
   Given the sizes of the corresponding dimension of the input and filter shapes,
@@ -827,10 +827,10 @@ class FusedConvInt8Tests(test.TestCase):
             maxval=1.0,
             dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
 
-    output_height = CalculateCovolvedOutputDim(input_height, filter_height,
-                                               vertical_stride, padding_type)
-    output_width = CalculateCovolvedOutputDim(input_width, filter_width,
-                                              horizontal_stride, padding_type)
+    output_height = CalculateConvolvedOutputDim(input_height, filter_height,
+                                                vertical_stride, padding_type)
+    output_width = CalculateConvolvedOutputDim(input_width, filter_width,
+                                               horizontal_stride, padding_type)
     print("output_height=", output_height, ", output_width=", output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
diff --git a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
index f701647c2b..28ddaa69a1 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
+++ b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
@@ -200,7 +200,7 @@ class SparseCrossOpTest(test.TestCase):
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_large_batch(self):
-    """Tests with large batch size to force multithreding.
+    """Tests with large batch size to force multithreading.
     """
     batch_size = 5000
     col1 = []
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 9ccb589d69..3ae07cedab 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -48,7 +48,7 @@ you should choose depends on (1) the feature type and (2) the model type.
    recommended.
 
      embedded_dept_column = embedding_column(
-       sparse_column_with_keys("department", ["math", "philosphy", ...]),
+       sparse_column_with_keys("department", ["math", "philosophy", ...]),
        dimension=10)
 
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 78affea44c..06060b99e7 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -815,7 +815,7 @@ class _Transformer(object):
   """
 
   def __init__(self, columns_to_tensors):
-    """Initializes transfomer.
+    """Initializes transformer.
 
     Args:
       columns_to_tensors: A mapping from feature columns to tensors. 'string'
@@ -908,7 +908,7 @@ def _gather_feature_columns(feature_columns):
 
 
 def _check_forbidden_sequence_columns(feature_columns):
-  """Recursively cecks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
+  """Recursively checks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
   all_feature_columns = _gather_feature_columns(feature_columns)
   for feature_column in all_feature_columns:
     if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS):
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 949e73deff..151fc7a0d7 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1542,7 +1542,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signfies the end of a sentence.
+       It is part of the target label that signifies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1686,7 +1686,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index ff88b4fa84..4fe4e8d044 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -348,7 +348,7 @@ def _freeze_graph_with_def_protos(input_graph_def, output_node_names,
                                   input_saver_def, input_checkpoint):
   """Converts all variables in a graph and checkpoint into constants.
 
-  During this process, we need to retain certain initialzer nodes (e.g. table
+  During this process, we need to retain certain initializer nodes (e.g. table
   initializer nodes). Instead of determining which dependencies
   of the shared initializer node (e.g. group_deps) to keep, we
   reconstruct the connections between the individual initializer nodes and
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 25d19578ea..ce15db6f1e 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -579,7 +579,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   ### State
 
-  Internal methods apre passed a `state` argument with the correct
+  Internal methods are passed a `state` argument with the correct
   values to use for the slot and non-slot variables, and the hyper
   parameters.
   """
-- 
GitLab


From 5592a96a5195dc4e5f49a1e3ca4243faa094ff85 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 12 Apr 2018 12:58:04 -0700
Subject: [PATCH 0704/1262] Fix WARNING in BatchNormalization (#18315)

The keep_dims for reduce_mean has been deprecated and replaced
with keepdims. This casues the following WARNING in BatchNormalization:
```
normalization.py:584: calling reduce_mean (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
```

This fix fixes the warning in BatchNormalization.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/keras/_impl/keras/layers/normalization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index b73025a5a8..69332c21e1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -592,9 +592,9 @@ class BatchNormalization(Layer):
         # used during evaluation, it is more efficient to just update in one
         # step and should not make a significant difference in the result.
         new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keep_dims=True)
+                                        axis=1, keepdims=True)
         new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keep_dims=True)
+                                            axis=1, keepdims=True)
 
       def _do_update(var, value):
         if in_eager_mode and not self.trainable:
-- 
GitLab


From 9efffac056fd2e01755a0bc1059f20ff6448f35d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 13 Apr 2018 03:58:22 +0800
Subject: [PATCH 0705/1262] remove the misleading n_class information (#18305)

* DOC: modify the misleading n_class info

* DOC: add suggested fix
---
 tensorflow/python/estimator/canned/head.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 189b81aeea..5e61c30ea2 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -263,9 +263,12 @@ def _check_dense_labels_match_logits_and_reshape(
         if (dim1 is not None) and (dim1 != expected_labels_dimension):
           raise ValueError(
               'Mismatched label shape. '
-              'Classifier configured with n_classes=%s.  Received %s. '
-              'Suggested Fix: check your n_classes argument to the estimator '
-              'and/or the shape of your label.' %
+              'Expected labels dimension=%s.  Received %s. '
+              'Suggested Fix:'
+              'If your classifier expects one-hot encoding label,'
+              'check your n_classes argument to the estimator'
+              'and/or the shape of your label.'
+              'Otherwise, check the shape of your label.' %
               (expected_labels_dimension, dim1))
       expected_labels_shape = array_ops.concat(
           [logits_shape[:-1], [expected_labels_dimension]], axis=0)
-- 
GitLab


From 12da1017c6182afefd53d707dadd0ea76ce658a1 Mon Sep 17 00:00:00 2001
From: brett koonce <koonce@hello.com>
Date: Thu, 12 Apr 2018 12:58:36 -0700
Subject: [PATCH 0706/1262] contrib/autograph: minor spelling tweaks (#18284)

---
 tensorflow/contrib/autograph/converters/call_trees.py     | 2 +-
 .../contrib/autograph/converters/call_trees_test.py       | 2 +-
 .../contrib/autograph/converters/decorators_test.py       | 2 +-
 tensorflow/contrib/autograph/impl/api.py                  | 4 ++--
 tensorflow/contrib/autograph/impl/conversion.py           | 2 +-
 .../contrib/autograph/pyct/static_analysis/activity.py    | 6 +++---
 .../autograph/pyct/static_analysis/activity_test.py       | 2 +-
 .../contrib/autograph/pyct/static_analysis/annos.py       | 8 ++++----
 tensorflow/contrib/autograph/utils/builtins.py            | 2 +-
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 61f6bfd7e7..e22895ed6a 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -147,7 +147,7 @@ class CallTreeTransformer(transformer.Base):
       # Inspect the target function decorators. If any include a @convert
       # or @graph_ready annotation, then they must be called as they are.
       # TODO(mdan): This may be quite heavy.
-      # To parse and re-analize each function for every call site could be quite
+      # To parse and re-analyze each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
         target_node, _ = parser.parse_entity(target_entity)
diff --git a/tensorflow/contrib/autograph/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
index c666dcb73b..303dd54a4e 100644
--- a/tensorflow/contrib/autograph/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -34,7 +34,7 @@ class CallTreesTest(converter_test_base.TestCase):
   def test_basic(self):
 
     def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled verison.')
+      raise ValueError('This should not be called in the compiled version.')
 
     def renamed_test_fn_1(a):
       return a + 1
diff --git a/tensorflow/contrib/autograph/converters/decorators_test.py b/tensorflow/contrib/autograph/converters/decorators_test.py
index e67ab1cd6a..9c01f68912 100644
--- a/tensorflow/contrib/autograph/converters/decorators_test.py
+++ b/tensorflow/contrib/autograph/converters/decorators_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 # The Python parser only briefly captures decorators into the AST.
 # The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is notmally what you would expect, since
+# trace of the decorator (which is normally what you would expect, since
 # they are meant to be transparent).
 # However, decorators are still visible when you analyze the function
 # from inside a decorator, before it was applied - as is the case
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index dce994e50d..b1731480be 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -49,7 +49,7 @@ def convert(recursive=False, verbose=False, arg_types=None):
   function is called. This means the parameter values are known at compilation.
 
   Args:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_types: See to_graph.
@@ -215,7 +215,7 @@ def to_graph(e,
 
   Args:
     e: A Python entity.
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_values: A dict containing value hints for symbols like function
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 3bacc94300..240e070368 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -56,7 +56,7 @@ class ConversionMap(object):
   This object is mutable, and is updated as functions are converted.
 
   Attributes:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     nocompile_decorators: tuple of decorator functions that toggle compilation
         off.
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index 6dd53091fa..b6817e9d75 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -162,11 +162,11 @@ class Scope(object):
       self.parent.mark_returned(name)
 
 
-class ActivityAnalizer(transformer.Base):
+class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
   def __init__(self, context, parent_scope):
-    super(ActivityAnalizer, self).__init__(context)
+    super(ActivityAnalyzer, self).__init__(context)
     self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
@@ -323,4 +323,4 @@ class ActivityAnalizer(transformer.Base):
 
 
 def resolve(node, context, parent_scope=None):
-  return ActivityAnalizer(context, parent_scope).visit(node)
+  return ActivityAnalyzer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index 1e6c686b01..65e1a8f0ea 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -108,7 +108,7 @@ class ScopeTest(test.TestCase):
     self.assertFalse(QN('a') in child.referenced)
 
 
-class ActivityAnalizerTest(test.TestCase):
+class ActivityAnalyzerTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
     node, source = parser.parse_entity(test_fn)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index d6d9f7e1a6..b929b35b79 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Annotations used by the static analizer."""
+"""Annotations used by the static analyzer."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,15 +28,15 @@ class NoValue(Enum):
 
 
 class NodeAnno(NoValue):
-  """Additionnal annotations used by the static analyzer.
+  """Additional annotations used by the static analyzer.
 
   These are in addition to the basic annotations declared in anno.py.
   """
 
   # Symbols
   # These flags are boolean.
-  IS_LOCAL = 'Symbol is local to the function scope being analized.'
-  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  IS_LOCAL = 'Symbol is local to the function scope being analyzed.'
+  IS_PARAM = 'Symbol is a parameter to the function being analyzed.'
   IS_MODIFIED_SINCE_ENTRY = (
       'Symbol has been explicitly replaced in the current function scope.')
 
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 7fbb7c09d8..0a0e72d70e 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -84,7 +84,7 @@ def is_tf_print_compatible(value):
 
 
 def dynamic_print(*values):
-  """Implementartion of print using dynamic dispatch.
+  """Implementation of print using dynamic dispatch.
 
   The function attempts to use tf.Print if all the values are compatible.
   Otherwise, it will fall back to py_func.
-- 
GitLab


From 462b5d28061d7983aa852f09c9ee94e5957f58dd Mon Sep 17 00:00:00 2001
From: Wai Hon Law <whhone@gmail.com>
Date: Thu, 12 Apr 2018 12:58:44 -0700
Subject: [PATCH 0707/1262] Change --output_png to --output_image (#18273)

The argument is incorrect.

When running the given command, we get
```
E tensorflow/examples/wav_to_spectrogram/main.cc:54] Unknown argument
--output_png=/tmp/spectrogram.png
```

TESTED:Rerun the updated command and verify that the flag is correct.
```
bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram --
--input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav
--output_image=/tmp/spectrogram.png
```
---
 tensorflow/docs_src/tutorials/audio_recognition.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 7d79f433c4..372ab47df7 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -280,7 +280,7 @@ tool:
 ```
 bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram -- \
 --input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav \
---output_png=/tmp/spectrogram.png
+--output_image=/tmp/spectrogram.png
 ```
 
 If you open up `/tmp/spectrogram.png` you should see something like this:
-- 
GitLab


From 5c237ddfcc0e54427e4fc31cccff809d65e66321 Mon Sep 17 00:00:00 2001
From: Shaoning Zeng <shaoning.zeng@gmail.com>
Date: Fri, 13 Apr 2018 03:58:59 +0800
Subject: [PATCH 0708/1262] give some writing updates to
 tensorflow/contrib/slim/README.md (#18259)

* add missed right bracket in ### Scopes

* change one , to . in ### Scopes

* refine one sentence
---
 tensorflow/contrib/slim/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 40f484fd78..746b955642 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -290,9 +290,9 @@ slim.stack(x, slim.conv2d, [(32, [3, 3]), (32, [1, 1]), (64, [3, 3]), (64, [1, 1
 
 In addition to the types of scope mechanisms in TensorFlow
 ([name_scope](https://www.tensorflow.org/api_docs/python/tf/name_scope),
-[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope),
+[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope)),
 TF-Slim adds a new scoping mechanism called
-[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope),
+[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope).
 This new scope allows a user to specify one or more operations and a set of
 arguments which will be passed to each of the operations defined in the
 `arg_scope`. This functionality is best illustrated by example. Consider the
@@ -761,8 +761,8 @@ parts:
 3. Finalization: (optionally) perform any final operation to compute metric
 values. For example, computing means, mins, maxes, etc.
 
-For example, to compute `mean_absolute_error`, two variables, a `count` and
-`total` variable are *initialized* to zero. During *aggregation*, we observed
+For example, to compute `mean_absolute_error`, two variables (`count` and
+`total`) are *initialized* to zero. During *aggregation*, we observed
 some set of predictions and labels, compute their absolute differences and add
 the total to `total`. Each time we observe another value,
 `count` is incremented. Finally, during *finalization*, `total` is divided
-- 
GitLab


From 4c7fe9e83f206fc177dd6deaa6a1338b6192f263 Mon Sep 17 00:00:00 2001
From: Quanlong <cybertk@users.noreply.github.com>
Date: Fri, 13 Apr 2018 03:59:26 +0800
Subject: [PATCH 0709/1262] Latest nngraph cannot build with Hexagon SDK 3.0
 (#17963)

* fix: latest nngraph cannot build with Hexagon SDK 3.0

* Fix typo
---
 tensorflow/contrib/hvx/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 163993a3f6..68e34f3b09 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -42,11 +42,12 @@ If you've finished walking through the quick start guide, you may want to try bu
 
 ### Build libhexagon\_nn\_skel.so
 
-Download Hexagon NN library from codeaurora.org and build it.
+Download Hexagon NN library from codeaurora.org and build it. For Hexagon SDK 3.0, we need use the compatible version([721b2d58f](https://source.codeaurora.org/quic/hexagon_nn/nnlib/commit/?id=721b2d58f0f4e2d5b182f41e6b7c4db5356bf0fb)) of nnlib.
 
 ```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
+git reset 721b2d58f --hard
 ```
 
 Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
-- 
GitLab


From ace33c76636ed908958888243131524091085f96 Mon Sep 17 00:00:00 2001
From: Yihong Wang <yh.wang@ibm.com>
Date: Thu, 12 Apr 2018 12:59:48 -0700
Subject: [PATCH 0710/1262] Link to gcc_s and gcc if compiler is GCC version 5
 (#17849)

When using cmake and GCC 5.4 to build tensorflow in Ubuntu 16.04,
the following error message would show up when loading
_pywrap_tensorflow_internal.so:
```
_pywrap_tensorflow_internal.so: undefined symbol: __cpu_model
```

The root cause is the same to this issue:
https://github.com/tensorflow/tensorflow/issues/9593

Signed-off-by: Yihong Wang <yh.wang@ibm.com>
---
 tensorflow/contrib/cmake/tf_python.cmake | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index ded15b4b66..1c3206f1a2 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -586,6 +586,12 @@ add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_deffile}
 )
 
+# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Adding the following libraries fixes it.
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(pywrap_tensorflow_internal PRIVATE gcc_s gcc)
+endif()
+
 if(WIN32)
     add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
 endif(WIN32)
-- 
GitLab


From d68ef84dc9bc99bb4d06a48ad847f13f0c8d0396 Mon Sep 17 00:00:00 2001
From: David Norman <DavidNorman@users.noreply.github.com>
Date: Thu, 12 Apr 2018 21:00:12 +0100
Subject: [PATCH 0711/1262] Allow for devices which have F16, no F64, no
 Complex (#17473)

---
 tensorflow/compiler/xla/tests/dot_operation_test.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 7b994a4c17..c4031dfee5 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -50,6 +50,13 @@ using TypesF16F32 = ::testing::Types<Eigen::half, float>;
 using TypesF16F32F64 = ::testing::Types<Eigen::half, float, double>;
 using TypesF16F32F64CF64 =
     ::testing::Types<Eigen::half, float, double, complex64>;
+#elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX)
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64CF64 =
+    ::testing::Types<Eigen::half, float>;
 #else
 #error "Situation not handled yet"
 #endif
-- 
GitLab


From 0253b68db7ccb0537b46052cbcac7715861ac22b Mon Sep 17 00:00:00 2001
From: Seyed Majid Azimi <m_azimi@web.de>
Date: Thu, 12 Apr 2018 22:00:33 +0200
Subject: [PATCH 0712/1262] Update nn.py (#17247)

adding missing quantized_relu which was missing before.
---
 tensorflow/python/ops/nn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 244702d13b..1d0d9a52a1 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -98,6 +98,7 @@ See the @{$python/nn} guide.
 @@fixed_unigram_candidate_sampler
 @@compute_accidental_hits
 @@quantized_conv2d
+@@quantized_relu
 @@quantized_relu_x
 @@quantized_max_pool
 @@quantized_avg_pool
-- 
GitLab


From e40fec4a9563cfe021243f63beda51afcc6d13ef Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Thu, 12 Apr 2018 12:58:05 -0700
Subject: [PATCH 0713/1262] Upgrade libjpeg-turbo

NOTE: libjpeg-turbo 1.5.90 also exists, which adds AVX2 SIMD support. However
it also comes with a CMake build rewrite and 35 micro-architecture specialized
nasm files for x86_64 alone. We do not have the cycles to update jpeg.BUILD to
incorporate those changes at this time. If anyone wants to try, please note
we'd need predicates such as the following:

config_setting(
    name = "haswell_opt",
    values = {
        "cpu": "haswell",  # First Intel chip with AVX2
        "compilation_mode": "opt",
    },
    visibility = ["//visibility:public"],
)

config_setting(
    name = "excavator_opt",
    values = {
        "cpu": "excavator",  # First AMD chip with AVX2
        "compilation_mode": "opt",
    },
    visibility = ["//visibility:public"],
)

PiperOrigin-RevId: 192655533
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 52168a89c5..72f446d359 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -210,11 +210,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "jpeg",
       urls = [
-          "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
-          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+          "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.3.tar.gz",
+          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.3.tar.gz",
       ],
-      sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
-      strip_prefix = "libjpeg-turbo-1.5.1",
+      sha256 = "1a17020f859cb12711175a67eab5c71fc1904e04b587046218e36106e07eabde",
+      strip_prefix = "libjpeg-turbo-1.5.3",
       build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
   )
 
-- 
GitLab


From 9a9a90e9f170045e752805b390064c25fcc69573 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 12 Apr 2018 13:01:05 -0700
Subject: [PATCH 0714/1262] Add tensor support for num_spectrogram_bins in
 linear_to_mel_weight_matrix (#17404)

* Add tensor support for num_spectrogram_bins in linear_to_mel_weight_matrix

This fix tries to address the issue raised in 16553 where it was not possible
to provide num_spectrogram_bins as a tensor or placeholder for
linear_to_mel_weight_matrix.

The reason comes from the implementation of `_validate_arguments`
which requires num_spectrogram_bins to be a python.

However, the validation here is not necessary as `num_spectrogram_bins`
will be passed to `math_ops.linspace`, which performs the validation
anyway. The validation in `math_ops.linspace` is done in shape function
and in kernel's `Compute()`. For that it makes sense to remove the
validation of `num_spectrogram_bins` in `_validate_arguments` so that
the issue raised in 16553 could be addressed.
This fix adds a test case to cover the changes. Also, the
error case of `num_spectrogram_bins < 0` has already been covered in the
existing test case:
https://github.com/tensorflow/tensorflow/blob/013a6c7b3112573ba4d932c8a22bfaf45f648c77/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py#L149-L165

This fix fixes 16553.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for tensor support of num_spectrogram_bins in mel_ops.linear_to_mel_weight_matrix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add comment for removing validation of num_spectrogram_bins

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update docstring

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update test case for num_spectrogram_bins

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove unused constant_op import to pass sanity check

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../signal/python/kernel_tests/mel_ops_test.py   | 13 +++++++++++++
 tensorflow/contrib/signal/python/ops/mel_ops.py  | 16 ++++++++--------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 35c4b5bec1..345eb6cfaa 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.signal.python.kernel_tests import test_util
 from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -173,6 +174,18 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  def test_num_spectrogram_bins_dynamic(self):
+    with self.test_session(use_gpu=True):
+      num_spectrogram_bins = array_ops.placeholder(shape=(),
+                                                   dtype=dtypes.int32)
+      mel_matrix_np = spectrogram_to_mel_matrix(
+          20, 129, 8000.0, 125.0, 3800.0)
+      mel_matrix = mel_ops.linear_to_mel_weight_matrix(
+          20, num_spectrogram_bins, 8000.0, 125.0, 3800.0)
+      self.assertAllClose(
+          mel_matrix_np,
+          mel_matrix.eval(feed_dict={num_spectrogram_bins: 129}), atol=3e-6)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index d1a36548d9..1e84006116 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -64,14 +64,11 @@ def _hertz_to_mel(frequencies_hertz, name=None):
         1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
 
 
-def _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+def _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype):
   """Checks the inputs to linear_to_mel_weight_matrix."""
   if num_mel_bins <= 0:
     raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
-  if num_spectrogram_bins <= 0:
-    raise ValueError('num_spectrogram_bins must be positive. Got: %s' %
-                     num_spectrogram_bins)
   if sample_rate <= 0.0:
     raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
   if lower_edge_hertz < 0.0:
@@ -122,9 +119,9 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
   Args:
     num_mel_bins: Python int. How many bands in the resulting mel spectrum.
-    num_spectrogram_bins: Python int. How many bins there are in the source
-      spectrogram data, which is understood to be `fft_size // 2 + 1`, i.e. the
-      spectrogram only contains the nonredundant FFT bins.
+    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
+      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
+      i.e. the spectrogram only contains the nonredundant FFT bins.
     sample_rate: Python float. Samples per second of the input signal used to
       create the spectrogram. We need this to figure out the actual frequencies
       for each spectrogram bin, which dictates how they are mapped into the mel
@@ -148,7 +145,10 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
   [mel]: https://en.wikipedia.org/wiki/Mel_scale
   """
   with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
-    _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
+    # and the validation is already done in linspace (both in shape function
+    # and in kernel), there is no need to validate num_spectrogram_bins here.
+    _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype)
 
     # To preserve accuracy, we compute the matrix at float64 precision and then
-- 
GitLab


From 18f8568ca2e2efedd482e1120d4a5b73aab7841c Mon Sep 17 00:00:00 2001
From: Russell Klopfer <russell@klopfer.us>
Date: Thu, 12 Apr 2018 16:01:25 -0400
Subject: [PATCH 0715/1262] crf_decode fails when sequence_length is 0 (#17755)

* updating documentation

* crf_decode fails when sequence_length is 0

* fixing line length

* more pylint fixes
---
 .../contrib/crf/python/kernel_tests/crf_test.py   | 15 +++++++++++++++
 tensorflow/contrib/crf/python/ops/crf.py          |  8 +++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 721dc4d080..a5e065b93a 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -281,6 +281,21 @@ class CrfTest(test.TestCase):
         self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
                          expected_max_sequence[:sequence_lengths])
 
+  def testCrfDecodeZeroSeqLength(self):
+    """
+    Test that crf_decode works when sequence_length contains one or more zeros.
+    """
+    with self.test_session() as sess:
+      inputs = constant_op.constant(np.ones([2, 10, 5],
+                                            dtype=np.float32))
+      transition_params = constant_op.constant(np.ones([5, 5],
+                                                       dtype=np.float32))
+      sequence_lengths = constant_op.constant(np.zeros([2],
+                                                       dtype=np.int32))
+      values = crf.crf_decode(inputs, transition_params, sequence_lengths)
+      tags, scores = sess.run(values)
+      self.assertEqual(len(tags.shape), 2)
+      self.assertEqual(len(scores.shape), 1)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 1233c8f251..e37c029ceb 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -479,15 +479,17 @@ def crf_decode(potentials, transition_params, sequence_length):
     initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
     initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
     inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    # sequence length is not allowed to be less than zero
+    sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
     backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
         crf_fwd_cell,
         inputs=inputs,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
     backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
-        backpointers, sequence_length - 1, seq_dim=1)
+        backpointers, sequence_length_less_one, seq_dim=1)
 
     # Computes backward decoding. Extract tag indices from backpointers.
     crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
@@ -497,7 +499,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
         crf_bwd_cell,
         inputs=backpointers,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
-- 
GitLab


From 64eb9b445a79d571c26c3e63920402d3b0940c12 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Thu, 12 Apr 2018 13:06:28 -0700
Subject: [PATCH 0716/1262] Separate out distribute dependency out of training,
 as it needs to be used in summary utils (which training depends on, thus
 causing circular dependency).

PiperOrigin-RevId: 192656997
---
 tensorflow/contrib/distribute/python/BUILD | 12 ++++++--
 tensorflow/contrib/optimizer_v2/BUILD      |  1 +
 tensorflow/python/BUILD                    | 33 +++++++++++++++++++++-
 tensorflow/python/estimator/BUILD          |  5 ++++
 tensorflow/python/keras/BUILD              |  1 +
 5 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 78b2b0054a..51b4fbacd1 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -27,6 +27,8 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
@@ -51,6 +53,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python:device_util",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:model_fn",
     ],
@@ -66,6 +69,8 @@ py_library(
         ":values",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:device",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:training",
@@ -84,9 +89,9 @@ py_library(
         ":values",
         "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -104,6 +109,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers",
         "//tensorflow/python:training",
@@ -156,8 +162,8 @@ py_test(
     deps = [
         ":mirrored_strategy",
         ":strategy_test_lib",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -186,10 +192,10 @@ cuda_py_test(
         ":mirrored_strategy",
         ":values",
         ":strategy_test_lib",
+        "//tensorflow/python:distribute",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:layers",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 26ea9135f5..86e5f4a437 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -48,6 +48,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 559926d415..72284fd50b 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2955,6 +2955,7 @@ py_library(
         ":framework_ops",
         ":gradients",
         ":init_ops",
+        ":distribute",
         ":io_ops",
         ":io_ops_gen",
         ":layers_base",
@@ -3012,6 +3013,35 @@ py_test(
     ],
 )
 
+py_library(
+    name = "device_util",
+    srcs = ["training/device_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":device",
+        ":framework_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "distribute",
+    srcs = ["training/distribute.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":device_util",
+        ":framework_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
 py_test(
     name = "checkpointable_utils_test",
     srcs = ["training/checkpointable_utils_test.py"],
@@ -3052,7 +3082,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":client_testlib",
-        ":training",
+        ":distribute",
         ":variable_scope",
     ],
 )
@@ -4316,6 +4346,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":distribute",
         ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 5d8b19223f..a34405c702 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -251,6 +251,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:boosted_trees_ops",
         "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lookup_ops",
@@ -327,6 +328,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -383,6 +385,7 @@ py_library(
         ":model_fn",
         ":optimizers",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
@@ -466,6 +469,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:metrics",
         "//tensorflow/python:platform",
@@ -743,6 +747,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index da5bc3e6f1..024a8cd3d1 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -205,6 +205,7 @@ py_library(
     deps = [
         ":engine",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
-- 
GitLab


From 322580a5b704f0db72cd2bfa4e5a08f6b8c3b664 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 12 Apr 2018 13:24:32 -0700
Subject: [PATCH 0717/1262] Fix build breakage on metagraph exporting when
 caching_device is set

PiperOrigin-RevId: 192659701
---
 tensorflow/python/ops/resource_variable_ops.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index c51d1e467d..49dd7f9948 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -522,12 +522,13 @@ class ResourceVariable(variables.Variable):
     else:
       self._initial_value = None
     if variable_def.snapshot_name:
-      self._cached_value = g.as_graph_element(
+      snapshot = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
-      self._graph_element = g.as_graph_element(
-          ops.prepend_name_scope(variable_def.snapshot_name,
-                                 import_scope=import_scope))
+      self._cached_value = snapshot
+      while snapshot.op.type != "ReadVariableOp":
+        snapshot = snapshot.op.inputs[0]
+      self._graph_element = snapshot
     else:
       self._cached_value = None
       # Legacy case for protos without the snapshot name; assume it's the
-- 
GitLab


From 111ee9ba4c7bcc736db9b79f967f380052a091e0 Mon Sep 17 00:00:00 2001
From: James Wexler <jwexler@google.com>
Date: Thu, 12 Apr 2018 13:24:51 -0700
Subject: [PATCH 0718/1262] Make new build target public.

PiperOrigin-RevId: 192659759
---
 tensorflow/core/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 97e0095e05..c461f9ed2f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -255,6 +255,7 @@ closure_js_proto_library(
         "example/example.proto",
         "example/feature.proto",
     ],
+    visibility = ["//visibility:public"],
 )
 
 exports_files([
-- 
GitLab


From 7c0172e0853f3262e1d85aa6bc37cf70d718cca0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 14:31:08 -0700
Subject: [PATCH 0719/1262] ResolveConstantReshape transformation and fix for
 ResolveConstantTranspose.

PiperOrigin-RevId: 192670991
---
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 .../graph_transformations.h                   |   1 +
 .../remove_trivial_reshape.cc                 |   5 +-
 .../resolve_constant_reshape.cc               | 124 ++++++++++++++++++
 .../resolve_constant_transpose.cc             |   6 +
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   1 +
 tensorflow/contrib/lite/toco/tooling_util.cc  |  48 +++----
 tensorflow/contrib/lite/toco/tooling_util.h   |  17 +++
 8 files changed, 171 insertions(+), 32 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index a05d71985f..4c8652d62e 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -266,6 +266,7 @@ cc_library(
         "graph_transformations/resolve_constant_gather.cc",
         "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
+        "graph_transformations/resolve_constant_reshape.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
         "graph_transformations/resolve_constant_strided_slice.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 80463ce8f8..384bd85b81 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -165,6 +165,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantReshape)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTranspose)
 DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
index 61477d59ae..e28d8cf01e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
@@ -41,8 +41,8 @@ bool IsReshapeTrivial(const Model& model, const Operator& op,
         ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
       transformation->AddMessageF(
           "%s is trivial because its input and output shapes are equal up to "
-          "extending "
-          "by 1's, and we are told to aggressively discard such Reshape ops.",
+          "extending by 1's, and we are told to aggressively discard such "
+          "Reshape ops.",
           LogName(op));
       return true;
     }
@@ -80,6 +80,7 @@ bool RemoveTrivialReshape::Run(Model* model, std::size_t op_index) {
   }
 
   if (!IsReshapeTrivial(*model, *reshape_op, this)) {
+    AddMessageF("%s is not trivial", LogName(*reshape_op));
     return false;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
new file mode 100644
index 0000000000..7e7ad383e7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Resolves a constant reshape operation by copying the buffer.
+bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+  const auto* op = static_cast<const TensorFlowReshapeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  const Array& input_array = model->GetArray(op->inputs[0]);
+  if (!ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
+    AddMessageF("Constant reshape is non-trivial (%s -> %s)",
+                ShapeToString(input_array.shape()),
+                ShapeToString(output_array.shape()));
+    return false;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (input_array.data_type) {
+    case ArrayDataType::kBool:
+      CopyArrayBuffer<ArrayDataType::kBool>(input_array, &output_array);
+      break;
+    case ArrayDataType::kFloat:
+      CopyArrayBuffer<ArrayDataType::kFloat>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt8:
+      CopyArrayBuffer<ArrayDataType::kInt8>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      CopyArrayBuffer<ArrayDataType::kUint8>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt16:
+      CopyArrayBuffer<ArrayDataType::kInt16>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint16:
+      CopyArrayBuffer<ArrayDataType::kUint16>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      CopyArrayBuffer<ArrayDataType::kInt32>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint32:
+      CopyArrayBuffer<ArrayDataType::kUint32>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      CopyArrayBuffer<ArrayDataType::kInt64>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint64:
+      CopyArrayBuffer<ArrayDataType::kUint64>(input_array, &output_array);
+      break;
+    case ArrayDataType::kString:
+      CopyArrayBuffer<ArrayDataType::kString>(input_array, &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type: "
+                 << ArrayDataTypeName(input_array.data_type);
+      return false;
+  }
+
+  AddMessageF("Resolving constant reshape of %s", LogName(*op));
+
+  if (input_array.minmax) {
+    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  }
+  if (input_array.quantization_params) {
+    output_array.GetOrCreateQuantizationParams() =
+        input_array.GetQuantizationParams();
+  }
+
+  // Erase input arrays if no longer used.
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index 4f984bfde5..1fd20314b1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -131,6 +131,10 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
   if (input_array.minmax) {
     output_array.GetOrCreateMinMax() = input_array.GetMinMax();
   }
+  if (input_array.quantization_params) {
+    output_array.GetOrCreateQuantizationParams() =
+        input_array.GetQuantizationParams();
+  }
 
   if (op->perm.empty()) {
     // Yield until perm has been populated by ResolveTransposeAttributes.
@@ -164,6 +168,8 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
       break;
   }
 
+  AddMessageF("Resolving constant transpose of %s", LogName(*op));
+
   // Erase input arrays if no longer used.
   for (const auto& input : op->inputs) {
     if (IsDiscardableArray(*model, input) &&
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 1ab0a6f058..5ba093a830 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -83,6 +83,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveConstantGather);
   transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
+  transformations->Add(new ResolveConstantReshape);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
   transformations->Add(new ResolveConstantTranspose);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index bd2d5f7df0..224df9973e 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1084,23 +1084,30 @@ void InsertCopyOperator(Model* model, const string& source_array_name,
   model->operators.emplace_back(copy_op);
 }
 
-namespace {
-template <ArrayDataType A>
-void CopyArrayBuffer(const Array& source_array, Array* target_array) {
-  if (source_array.buffer) {
-    const auto& source_buffer = source_array.GetBuffer<A>();
-    auto& target_buffer = target_array->GetMutableBuffer<A>();
-    target_buffer.data = source_buffer.data;
-  }
-}
-}  // namespace
-
 void CloneArray(Model* model, const string& source_array_name,
                 const string& target_array_name) {
   CHECK(!model->HasArray(target_array_name));
   const Array& source_array = model->GetArray(source_array_name);
   Array& target_array = model->GetOrCreateArray(target_array_name);
 
+  if (source_array.minmax) {
+    const auto& smm = source_array.GetMinMax();
+    auto& tmm = target_array.GetOrCreateMinMax();
+    tmm.min = smm.min;
+    tmm.max = smm.max;
+  }
+
+  if (source_array.quantization_params) {
+    const auto& sqp = source_array.GetQuantizationParams();
+    auto& tqp = target_array.GetOrCreateQuantizationParams();
+    tqp.zero_point = sqp.zero_point;
+    tqp.scale = sqp.scale;
+  }
+
+  target_array.data_type = source_array.data_type;
+  target_array.final_data_type = source_array.final_data_type;
+  target_array.copy_shape(source_array.shape());
+
   switch (source_array.data_type) {
     case ArrayDataType::kBool:
       CopyArrayBuffer<ArrayDataType::kBool>(source_array, &target_array);
@@ -1140,25 +1147,6 @@ void CloneArray(Model* model, const string& source_array_name,
                  << ArrayDataTypeName(source_array.data_type);
       return;
   }
-
-  if (source_array.minmax) {
-    const auto& smm = source_array.GetMinMax();
-    auto& tmm = target_array.GetOrCreateMinMax();
-    tmm.min = smm.min;
-    tmm.max = smm.max;
-  }
-
-  if (source_array.quantization_params) {
-    const auto& sqp = source_array.GetQuantizationParams();
-    auto& tqp = target_array.GetOrCreateQuantizationParams();
-    tqp.zero_point = sqp.zero_point;
-    tqp.scale = sqp.scale;
-  }
-
-  target_array.data_type = source_array.data_type;
-  target_array.final_data_type = source_array.final_data_type;
-
-  target_array.copy_shape(source_array.shape());
 }
 
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index dfd81173c3..ed0ecd4d0f 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -147,6 +147,23 @@ void FixNoOrphanedArray(Model* model);
 // Fixes input/output arrays that may have issues during export or inference.
 void FixEdgeArrays(Model* model);
 
+// Copies the contents of an array into another.
+// Expects that the shape and data type match.
+template <ArrayDataType A>
+void CopyArrayBuffer(const Array& source_array, Array* target_array) {
+  int source_buffer_size = RequiredBufferSizeForShape(source_array.shape());
+  int target_buffer_size = RequiredBufferSizeForShape(target_array->shape());
+  CHECK_EQ(source_buffer_size, target_buffer_size)
+      << "Buffer sizes must match in element count";
+  CHECK(source_array.data_type == target_array->data_type)
+      << "Data types must match";
+  if (source_array.buffer) {
+    const auto& source_buffer = source_array.GetBuffer<A>();
+    auto& target_buffer = target_array->GetMutableBuffer<A>();
+    target_buffer.data = source_buffer.data;
+  }
+}
+
 // Inserts a no-op reshape operator between the source array and the target
 // array. This effectively just copies the data.
 void InsertCopyOperator(Model* model, const string& source_array_name,
-- 
GitLab


From 0161bb77accc64d3742098feb7f438752a83ff32 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 14:33:16 -0700
Subject: [PATCH 0720/1262] K-FAC: Deprecate tf.contrib.kfac.

As LayerCollections are required to instantiate KfacOptimizer and FisherEstimator, a deprecation warning is printed upon instantiating LayerCollection.

PiperOrigin-RevId: 192671370
---
 tensorflow/contrib/kfac/python/ops/layer_collection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 411da033c3..366e2a82d5 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -28,6 +28,7 @@ from collections import defaultdict
 from collections import OrderedDict
 from contextlib import contextmanager
 from functools import partial
+import warnings
 
 import math
 import six
@@ -171,6 +172,9 @@ class LayerCollection(object):
   def __init__(self,
                graph=None,
                name="LayerCollection"):
+    warnings.warn(
+        "tf.contrib.kfac is deprecated and will be removed by 2018-11-01. "
+        "Use https://pypi.python.org/pypi/kfac instead.")
     self.fisher_blocks = LayerParametersDict()
     self.fisher_factors = OrderedDict()
     self._linked_parameters = dict(
-- 
GitLab


From 69edcec4746cc4260fd40079f1d72c2b23cdc297 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 12 Apr 2018 15:04:35 -0700
Subject: [PATCH 0721/1262] Merge libraries back

---
 tensorflow/contrib/tensorrt/BUILD | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2ee0c4589c..8dc6e8fae6 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -193,28 +193,11 @@ tf_py_wrap_cc(
     ],
 )
 
-tf_cuda_library(
-    name = "trt_resource_manager_impl",
-    srcs = [
-        "resources/trt_resource_manager.cc",
-    ],
-    hdrs = [
-        "resources/trt_resource_manager.h",
-    ],
-    deps = [
-        ":trt_logging",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
 tf_cuda_library(
     name = "trt_resources",
     srcs = [
         "resources/trt_int8_calibrator.cc",
+        "resources/trt_resource_manager.cc",
     ],
     hdrs = [
         "resources/trt_int8_calibrator.h",
@@ -228,8 +211,6 @@ tf_cuda_library(
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ]) + if_static([
-        ":trt_resource_manager_impl",
     ]),
 )
 
@@ -248,7 +229,6 @@ tf_cuda_library(
         ":segment",
         ":trt_logging",
         ":trt_resources",
-        ":trt_resource_manager_impl",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core:framework",
-- 
GitLab


From 504a2ee3d82ac04a813b0bf18b0f972ce6bab2db Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 12 Apr 2018 15:17:03 -0700
Subject: [PATCH 0722/1262] Remove if_static import

---
 tensorflow/contrib/tensorrt/BUILD | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 8dc6e8fae6..c792587733 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -27,10 +27,6 @@ load(
     "if_tensorrt",
 )
 
-load(
-  "//tensorflow/core:platform/default/build_config_root.bzl",
-  "if_static",
-)
 
 tf_cuda_cc_test(
     name = "tensorrt_test_cc",
-- 
GitLab


From fffbe5a26da2d6fab5a3eb648cefef49db4d38de Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 12 Apr 2018 15:20:18 -0700
Subject: [PATCH 0723/1262] Check if the session has been deleted before
 releasing a callable.

In some versions of Python, the Session._session field may be cleared
(in `Session.__del__()`) before a callable that has a reference to
that Session is deleted. Add a defensive check in the
`Session._Callable.__del__()` method.

PiperOrigin-RevId: 192679796
---
 tensorflow/python/client/session.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 4c84d78f2e..5507d011bb 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1454,7 +1454,10 @@ class BaseSession(SessionInterface):
               self._session._session, self._handle, args, status, None)
 
     def __del__(self):
-      if self._handle is not None:
+      # NOTE(mrry): It is possible that `self._session.__del__()` could be
+      # called before this destructor, in which case `self._session._session`
+      # will be `None`.
+      if self._handle is not None and self._session._session is not None:
         with errors.raise_exception_on_not_ok_status() as status:
           if self._session._created_with_new_api:
             tf_session.TF_SessionReleaseCallable(
-- 
GitLab


From d49cbc232ed5cd8c14159b7f3760348d10aa6206 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 12 Apr 2018 15:20:34 -0700
Subject: [PATCH 0724/1262] [tf.data] Clean up
 //tensorflow/contrib/data/python/ops/BUILD.

Create spearate targets for each submodule, so that each test can depend on
the appropriate subset of Python files.

PiperOrigin-RevId: 192679856
---
 tensorflow/contrib/data/BUILD                 |   6 -
 tensorflow/contrib/data/__init__.py           |   2 -
 .../contrib/data/python/kernel_tests/BUILD    |  58 +++--
 tensorflow/contrib/data/python/ops/BUILD      | 214 +++++++++++++++---
 tensorflow/contrib/distribute/python/BUILD    |   2 +-
 tensorflow/contrib/eager/python/BUILD         |   4 +-
 tensorflow/contrib/tpu/BUILD                  |   3 +-
 7 files changed, 218 insertions(+), 71 deletions(-)

diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 7bb0dc1c0f..8bdbba83ef 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -22,13 +22,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
-        "//tensorflow/contrib/data/python/ops:prefetching_ops",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/contrib/data/python/ops:shuffle_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 17048314a4..f58e5ec1f0 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -78,8 +78,6 @@ from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
 from tensorflow.contrib.data.python.ops.sliding import sliding_window_batch
-from tensorflow.python.data.ops.iterator_ops import Iterator
-from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_single_example
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 5d6dbdcbdf..a8481dc90a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -14,8 +14,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -37,8 +36,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -59,10 +57,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
@@ -79,8 +77,7 @@ py_test(
     ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -127,13 +124,13 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -145,7 +142,7 @@ tf_py_test(
     additional_deps = [
         ":dataset_serialization_test",
         "//third_party/py/numpy",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -175,8 +172,7 @@ py_test(
     ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -187,6 +183,7 @@ py_test(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -197,7 +194,8 @@ tf_py_test(
     srcs = ["get_single_element_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:get_single_element",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -215,8 +213,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -261,8 +258,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:counter",
+        "//tensorflow/contrib/data/python/ops:enumerate_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -274,6 +271,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -309,12 +307,12 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["noasan"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -327,7 +325,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:scan_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -346,11 +344,11 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -378,7 +376,6 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -415,10 +412,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:stats_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -429,10 +426,11 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:threadpool",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -444,13 +442,13 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/contrib/stateless",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -463,11 +461,11 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -497,8 +495,8 @@ tf_py_test(
     size = "small",
     srcs = ["slide_dataset_op_test.py"],
     additional_deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:sliding",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 236792bb98..7c28d1f005 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -12,18 +12,26 @@ load(
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 py_library(
-    name = "dataset_ops",
-    srcs = [
-        "counter.py",
-        "get_single_element.py",
+    name = "counter",
+    srcs = ["counter.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":scan_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+)
+
+py_library(
+    name = "get_single_element",
+    srcs = ["get_single_element.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":transformation_ops",
         "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
@@ -66,7 +74,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":dataset_ops",
+        ":batching",
+        ":interleave_ops",
         ":shuffle_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
@@ -94,50 +103,169 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":random_ops",
-        ":transformation_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 py_library(
-    name = "transformation_ops",
-    srcs = [
-        "batching.py",
-        "enumerate_ops.py",
-        "error_ops.py",
-        "grouping.py",
-        "interleave_ops.py",
-        "resampling.py",
-        "scan_ops.py",
-        "sliding.py",
-        "stats_ops.py",
-        "threadpool.py",
-        "unique.py",
+    name = "batching",
+    srcs = ["batching.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "enumerate_ops",
+    srcs = ["enumerate_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+)
+
+py_library(
+    name = "error_ops",
+    srcs = ["error_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":contrib_op_loader",
         ":gen_dataset_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "grouping",
+    srcs = ["grouping.py"],
+    srcs_version = "PY2AND3",
+    deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "interleave_ops",
+    srcs = ["interleave_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+py_library(
+    name = "resampling",
+    srcs = ["resampling.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batching",
+        ":scan_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "scan_ops",
+    srcs = ["scan_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "sliding",
+    srcs = ["sliding.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "threadpool",
+    srcs = ["threadpool.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:tensor_util",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
-        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "unique",
+    srcs = [
+        "unique.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
@@ -183,3 +311,29 @@ py_library(
         "//tensorflow/python/data/util:sparse",
     ],
 )
+
+py_library(
+    name = "dataset_ops",
+    deps = [
+        ":batching",
+        ":counter",
+        ":enumerate_ops",
+        ":error_ops",
+        ":get_single_element",
+        ":grouping",
+        ":interleave_ops",
+        ":prefetching_ops",
+        ":readers",
+        ":resampling",
+        ":scan_ops",
+        ":shuffle_ops",
+        ":sliding",
+        ":stats_ops",
+        ":threadpool",
+        ":unique",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 51b4fbacd1..5aad21cccd 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -22,7 +22,7 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":prefetching_ops_v2",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:checkpointable",
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 04e2d99048..e2744a430d 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -71,7 +71,9 @@ cuda_py_test(
     additional_deps = [
         ":datasets",
         ":checkpointable_utils",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/contrib/data/python/ops:threadpool",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 3e489d38b6..9646d15486 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -198,7 +198,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
+        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:function",
         "//tensorflow/python:functional_ops",
-- 
GitLab


From f768aa0bb3d16edfdd1ac11733fac09c97c48f74 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 12 Apr 2018 15:33:09 -0700
Subject: [PATCH 0725/1262] Fix buildifier issues

---
 tensorflow/contrib/tensorrt/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index c792587733..fd3582e175 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -27,7 +27,6 @@ load(
     "if_tensorrt",
 )
 
-
 tf_cuda_cc_test(
     name = "tensorrt_test_cc",
     size = "small",
-- 
GitLab


From 0195d6b4fbbe948914d0045d19eec8fcef1211f5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 12 Apr 2018 15:41:41 -0700
Subject: [PATCH 0726/1262] Added a utility to compute a topo ordering of a
 graph

PiperOrigin-RevId: 192683166
---
 .../core/grappler/utils/topological_sort.cc   | 35 ++++++++++++++-----
 .../core/grappler/utils/topological_sort.h    |  4 +++
 .../grappler/utils/topological_sort_test.cc   | 24 +++++++++++--
 3 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 8d8ff4da3a..a8e464d09d 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -26,24 +26,24 @@ namespace grappler {
 
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
-Status TopologicalSort(GraphDef* graph) {
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<int>* ready_nodes) {
   SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*graph));
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
 
-  std::vector<int> ready_nodes;
-  ready_nodes.reserve(graph_view.num_nodes());
+  ready_nodes->reserve(graph_view.num_nodes());
 
   int front = 0;
   int back = 0;
   std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
   for (int i = 0; i < graph_view.num_nodes(); i++) {
     if (graph_view.inputs(i).empty()) {
-      ready_nodes.push_back(i);
+      ready_nodes->push_back(i);
       back++;
     }
-    if (IsMerge(graph->node(i))) {
+    if (IsMerge(graph.node(i))) {
       for (int input : graph_view.inputs(i)) {
-        if (IsNextIteration(graph->node(input))) {
+        if (IsNextIteration(graph.node(input))) {
           num_ready_inputs[i]++;
         }
       }
@@ -51,11 +51,11 @@ Status TopologicalSort(GraphDef* graph) {
   }
 
   while (front != back) {
-    int ready_node = ready_nodes[front];
+    int ready_node = (*ready_nodes)[front];
     for (int fanout : graph_view.outputs(ready_node)) {
       ++num_ready_inputs[fanout];
       if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
-        ready_nodes.push_back(fanout);
+        ready_nodes->push_back(fanout);
         ++back;
       }
     }
@@ -66,7 +66,24 @@ Status TopologicalSort(GraphDef* graph) {
     return errors::InvalidArgument(
         "The graph couldn't be sorted in topological order.");
   }
+  return Status::OK();
+}
 
+Status ComputeTopologicalOrder(
+    const GraphDef& graph,
+    std::unordered_map<const NodeDef*, int>* topo_order) {
+  std::vector<int> ready_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(graph, &ready_nodes));
+  topo_order->reserve(graph.node_size());
+  for (int i = 0; i < ready_nodes.size(); ++i) {
+    (*topo_order)[&graph.node(ready_nodes[i])] = i;
+  }
+  return Status::OK();
+}
+
+Status TopologicalSort(GraphDef* graph) {
+  std::vector<int> ready_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes));
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index 7700fe41e4..668c88dc75 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -22,6 +22,10 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+// Compute a topological ordering for the graph nodes.
+Status ComputeTopologicalOrder(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order);
+
 // Sort a graph in topological order.
 Status TopologicalSort(GraphDef* graph);
 
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index c96f15b0e8..f5c95009d2 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -52,8 +52,19 @@ TEST_F(TopologicalSortTest, NoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
+
+  const std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
+  for (const auto& topo : topo_order) {
+    const string& node_name = topo.first->name();
+    const int topo_order = topo.second;
+    std::cout << "Node " << node_name << " at order " << topo_order
+              << std::endl;
+    EXPECT_EQ(node_name, order[topo_order]);
+  }
+
   TF_EXPECT_OK(TopologicalSort(&graph));
-  std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
@@ -68,8 +79,17 @@ TEST_F(TopologicalSortTest, WithLoop) {
   *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
   *graph.add_node() = CreateNode("1", {});
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
+
+  const std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (const auto& topo : topo_order) {
+    const string& node_name = topo.first->name();
+    const int topo_order = topo.second;
+    EXPECT_EQ(node_name, order[topo_order]);
+  }
+
   TF_EXPECT_OK(TopologicalSort(&graph));
-  std::vector<string> order = {"1", "2", "3", "4", "5"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
-- 
GitLab


From cc108a73af35b407bf9bf51e679e5884b309964b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 16:26:13 -0700
Subject: [PATCH 0727/1262] Add support for RNN state array of type
 tf.identity.

PiperOrigin-RevId: 192689747
---
 .../lite/toco/graph_transformations/remove_unused_op.cc        | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
index aa2c293382..8e6aaf544a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -47,7 +47,8 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     bool found_output_as_rnn_state_array = false;
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.state_array()) {
-        CHECK(op->type == OperatorType::kFill);
+        CHECK(op->type == OperatorType::kFill ||
+              op->type == OperatorType::kTensorFlowIdentity);
         found_output_as_rnn_state_array = true;
         break;
       }
-- 
GitLab


From dde6aaf321d7f73fb31578fb044b783fb449d017 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 12 Apr 2018 16:35:47 -0700
Subject: [PATCH 0728/1262] Exposing tensorflow.contrib.proto in the pip
 package.

PiperOrigin-RevId: 192691078
---
 tensorflow/contrib/BUILD                          |  1 +
 tensorflow/contrib/__init__.py                    |  1 +
 tensorflow/contrib/cmake/tf_python.cmake          |  6 ++++--
 .../python/kernel_tests/decode_proto_fail_test.py |  4 ++--
 .../python/kernel_tests/decode_proto_op_test.py   |  4 ++--
 .../python/kernel_tests/encode_proto_op_test.py   | 15 ++++++++-------
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 9bef0d8b61..ae68f4aec4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -77,6 +77,7 @@ py_library(
         "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
+        "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index aaddb06fa0..e27ece8fa5 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -64,6 +64,7 @@ from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
+from tensorflow.contrib import proto
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
 from tensorflow.contrib import recurrent
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index ded15b4b66..21f59d2563 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,8 +330,10 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
-GENERATE_PYTHON_OP_LIB("decode_proto_ops")
-GENERATE_PYTHON_OP_LIB("encode_proto_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("encode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
index f019833905..f8969b0bd5 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib import proto
+from tensorflow.contrib.proto import decode_proto
 from tensorflow.contrib.proto.python.kernel_tests import test_case
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -46,7 +46,7 @@ class DecodeProtoFailTest(test_case.ProtoOpTestCase):
     field_types = [dtypes.int32]
 
     with self.test_session() as sess:
-      ctensor, vtensor = proto.decode_proto(
+      ctensor, vtensor = decode_proto(
           batch,
           message_type=msg_type,
           field_names=field_names,
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
index 30ceac5f5f..cd5121cdba 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -27,7 +27,7 @@ import numpy as np
 
 from google.protobuf import text_format
 
-from tensorflow.contrib import proto
+from tensorflow.contrib.proto import decode_proto
 from tensorflow.contrib.proto.python.kernel_tests import test_case
 from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
 from tensorflow.python.framework import dtypes
@@ -175,7 +175,7 @@ class DecodeProtoOpTest(test_case.ProtoOpTestCase):
     output_types = [f.dtype for f in fields]
 
     with self.test_session() as sess:
-      sizes, vtensor = proto.decode_proto(
+      sizes, vtensor = decode_proto(
           batch,
           message_type=message_type,
           field_names=field_names,
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
index 2a24c3b8ce..a289ff290a 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -30,7 +30,8 @@ import numpy as np
 
 from google.protobuf import text_format
 
-from tensorflow.contrib import proto
+from tensorflow.contrib.proto import decode_proto
+from tensorflow.contrib.proto import encode_proto
 from tensorflow.contrib.proto.python.kernel_tests import test_case
 from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
 from tensorflow.python.framework import dtypes
@@ -50,7 +51,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
     # Invalid field name
     with self.test_session():
       with self.assertRaisesOpError('Unknown field: non_existent_field'):
-        proto.encode_proto(
+        encode_proto(
             sizes=[[1]],
             values=[np.array([[0.0]], dtype=np.int32)],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -60,7 +61,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
     with self.test_session():
       with self.assertRaisesOpError(
           'Incompatible type for field double_value.'):
-        proto.encode_proto(
+        encode_proto(
             sizes=[[1]],
             values=[np.array([[0.0]], dtype=np.int32)],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -72,7 +73,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
           r'sizes should be batch_size \+ \[len\(field_names\)\]'):
         sizes = array_ops.placeholder(dtypes.int32)
         values = array_ops.placeholder(dtypes.float64)
-        proto.encode_proto(
+        encode_proto(
             sizes=sizes,
             values=[values],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -88,7 +89,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
         sizes = array_ops.placeholder(dtypes.int32)
         values1 = array_ops.placeholder(dtypes.float64)
         values2 = array_ops.placeholder(dtypes.int32)
-        (proto.encode_proto(
+        (encode_proto(
             sizes=[[1, 1]],
             values=[values1, values2],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -103,13 +104,13 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
     out_types = [f.dtype for f in fields]
 
     with self.test_session() as sess:
-      sizes, field_tensors = proto.decode_proto(
+      sizes, field_tensors = decode_proto(
           in_bufs,
           message_type=message_type,
           field_names=field_names,
           output_types=out_types)
 
-      out_tensors = proto.encode_proto(
+      out_tensors = encode_proto(
           sizes,
           field_tensors,
           message_type=message_type,
-- 
GitLab


From 9908cb16746a2c1a5b4c28950debc0b5964447ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 16:51:43 -0700
Subject: [PATCH 0729/1262] Change assertions to use the tensor 'x' rather than
 'x.op.name'. This enables eager execution in validate_args=True contexts.

PiperOrigin-RevId: 192693458
---
 .../python/ops/bijectors/reshape.py             | 14 +++++++-------
 tensorflow/python/ops/distributions/util.py     | 17 ++++++++---------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 82210cd6c9..f21b982ba6 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -138,7 +138,7 @@ class Reshape(bijector_lib.Bijector):
     """Check that a shape Tensor is int-type and otherwise sane."""
     if not shape.dtype.is_integer:
       raise TypeError("{} dtype ({}) should be `int`-like.".format(
-          shape.op.name, shape.dtype.name))
+          shape, shape.dtype.name))
 
     assertions = []
 
@@ -146,10 +146,10 @@ class Reshape(bijector_lib.Bijector):
     ndims_ = tensor_util.constant_value(ndims)
     if ndims_ is not None and ndims_ > 1:
       raise ValueError("`{}` rank ({}) should be <= 1.".format(
-          shape.op.name, ndims_))
+          shape, ndims_))
     elif validate_args:
       assertions.append(check_ops.assert_less_equal(
-          ndims, 1, message="`{}` rank should be <= 1.".format(shape.op.name)))
+          ndims, 1, message="`{}` rank should be <= 1.".format(shape)))
 
     shape_ = tensor_util.constant_value_as_shape(shape)
     if shape_.is_fully_defined():
@@ -157,12 +157,12 @@ class Reshape(bijector_lib.Bijector):
       if sum(es == -1) > 1:
         raise ValueError(
             "`{}` must have at most one `-1` (given {})"
-            .format(shape.op.name, es))
+            .format(shape, es))
       if np.any(es < -1):
         raise ValueError(
             "`{}` elements must be either positive integers or `-1`"
             "(given {})."
-            .format(shape.op.name, es))
+            .format(shape, es))
     elif validate_args:
       assertions.extend([
           check_ops.assert_less_equal(
@@ -170,11 +170,11 @@ class Reshape(bijector_lib.Bijector):
                   math_ops.cast(math_ops.equal(shape, -1), dtypes.int32)),
               1,
               message="`{}` elements must have at most one `-1`."
-              .format(shape.op.name)),
+              .format(shape)),
           check_ops.assert_greater_equal(
               shape, -1,
               message="`{}` elements must be either positive integers or `-1`."
-              .format(shape.op.name)),
+              .format(shape)),
       ])
     return assertions
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 0fe6aa30f9..2e067eab45 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -58,8 +58,7 @@ def assert_close(
   if data is None:
     data = [
         message,
-        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
-        y.name, y
+        "Condition x ~= y did not hold element-wise: x = ", x, "y = ", y
     ]
 
   if x.dtype.is_integer:
@@ -95,7 +94,7 @@ def assert_integer_form(
     x = ops.convert_to_tensor(x, name="x")
     if x.dtype.is_integer:
       return control_flow_ops.no_op()
-    message = message or "{} has non-integer components".format(x.op.name)
+    message = message or "{} has non-integer components".format(x)
     if int_dtype is None:
       try:
         int_dtype = {
@@ -123,13 +122,13 @@ def embed_check_nonnegative_integer_form(
     x = ops.convert_to_tensor(x, name="x")
     assertions = [
         check_ops.assert_non_negative(
-            x, message="'{}' must be non-negative.".format(x.op.name)),
+            x, message="'{}' must be non-negative.".format(x)),
     ]
     if not x.dtype.is_integer:
       assertions += [
           assert_integer_form(
               x, message="'{}' cannot contain fractional components.".format(
-                  x.op.name)),
+                  x)),
       ]
     return control_flow_ops.with_dependencies(assertions, x)
 
@@ -434,7 +433,7 @@ def embed_check_integer_casting_closed(
         and not _is_integer_like_by_dtype(target_dtype)):
       raise TypeError("At least one of {}.dtype ({}) and target_dtype ({}) "
                       "must be integer-type.".format(
-                          x.op.name, x.dtype.name, target_dtype.name))
+                          x, x.dtype.name, target_dtype.name))
 
     assertions = []
     if assert_nonnegative:
@@ -683,7 +682,7 @@ def pick_vector(cond,
     cond = ops.convert_to_tensor(cond, name="cond")
     if cond.dtype != dtypes.bool:
       raise TypeError("%s.dtype=%s which is not %s" %
-                      (cond.name, cond.dtype, dtypes.bool))
+                      (cond, cond.dtype, dtypes.bool))
     cond_value_static = tensor_util.constant_value(cond)
     if cond_value_static is not None:
       return true_vector if cond_value_static else false_vector
@@ -692,8 +691,8 @@ def pick_vector(cond,
     if true_vector.dtype != false_vector.dtype:
       raise TypeError(
           "%s.dtype=%s does not match %s.dtype=%s"
-          % (true_vector.name, true_vector.dtype,
-             false_vector.name, false_vector.dtype))
+          % (true_vector, true_vector.dtype,
+             false_vector, false_vector.dtype))
     n = array_ops.shape(true_vector)[0]
     return array_ops.slice(
         array_ops.concat([true_vector, false_vector], 0),
-- 
GitLab


From 5d442bea19fd8f7f945248fb55f1ca2a6f6205c5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 16:55:30 -0700
Subject: [PATCH 0730/1262] Propagate sharding of the source instruction to the
 copies added by layout assignment.

PiperOrigin-RevId: 192693972
---
 .../compiler/xla/service/hlo_instruction.cc   |  15 +-
 .../compiler/xla/service/hlo_instruction.h    |   7 +
 .../compiler/xla/service/hlo_sharding.cc      |  10 ++
 .../compiler/xla/service/hlo_sharding.h       |   4 +
 .../compiler/xla/service/layout_assignment.cc | 163 ++++++++++--------
 .../compiler/xla/service/layout_assignment.h  |  23 +++
 6 files changed, 148 insertions(+), 74 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 5d2d7a9727..56cb241087 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -838,6 +838,16 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
   return instruction;
 }
 
+void HloInstruction::SetupDerivedInstruction(
+    HloInstruction* derived_instruction) const {
+  if (sharding_ != nullptr) {
+    derived_instruction->set_sharding(*sharding_);
+  } else {
+    derived_instruction->clear_sharding();
+  }
+  derived_instruction->set_metadata(metadata_);
+}
+
 HloInstruction* HloInstruction::AddFusionOperand(HloInstruction* new_operand) {
   CHECK_EQ(opcode(), HloOpcode::kFusion);
   CHECK_EQ(operand_count(),
@@ -1480,10 +1490,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
-  clone->set_metadata(metadata_);
-  if (has_sharding()) {
-    clone->set_sharding(sharding());
-  }
+  SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
   return clone;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 9a9de07883..49aa075029 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -949,6 +949,13 @@ class HloInstruction {
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
 
+  // When creating a new instruction which either replaces, or shifts up (kCopy
+  // insertion case), another instruction, we need to make sure the certain
+  // properties of the new instruction are copied into the derived one. As of
+  // today, the metadata and sharding will be propagated to the derived
+  // instruction.
+  void SetupDerivedInstruction(HloInstruction* derived_instruction) const;
+
   // Adds a new operand the fusion instruction.
   HloInstruction* AddFusionOperand(HloInstruction* new_operand);
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index e8e45f1ee9..1b42349b0b 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -376,6 +376,16 @@ HloSharding HloSharding::TransformShardedTileShape(
   return HloSharding::Tile(new_tile_shape, tile_assignment());
 }
 
+HloSharding HloSharding::GetSubSharding(const Shape& shape,
+                                        const ShapeIndex& index) const {
+  CHECK(IsTuple());
+
+  ShapeTree<HloSharding> sub_shape_tree(ShapeUtil::GetSubshape(shape, index),
+                                        Replicate());
+  sub_shape_tree.CopySubtreeFrom(GetAsShapeTree(shape), index, {});
+  return Tuple(sub_shape_tree);
+}
+
 std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) {
   out << sharding.ToString();
   return out;
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 06204acbca..2b8e757f42 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -175,6 +175,10 @@ class HloSharding {
     }
   }
 
+  // Retrieves the sub sharding at a given index, out of a tuple sharding.
+  // REQUIRES: IsTuple()
+  HloSharding GetSubSharding(const Shape& shape, const ShapeIndex& index) const;
+
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
            ShapeUtil::Compatible(tile_shape_, other.tile_shape_) &&
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 39f9120e55..2494569db5 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -57,76 +57,6 @@ namespace xla {
 // anonymous namespace, instead of three or four spread all over this file.
 namespace {
 
-// Creates and returns a copy of the given instruction with a different
-// layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
-// instruction producing the copy is returned.
-StatusOr<HloInstruction*> CreateCopyWithNewLayout(
-    const Shape& shape_with_layout, HloInstruction* instruction) {
-  TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
-  DCHECK(ShapeUtil::Compatible(shape_with_layout, instruction->shape()))
-      << ShapeUtil::HumanString(shape_with_layout) << " "
-      << ShapeUtil::HumanString(instruction->shape())
-      << " instruction: " << instruction->ToString();
-
-  if (ShapeUtil::IsTuple(instruction->shape())) {
-    // Deep-copy tuples.
-    std::vector<HloInstruction*> element_copies;
-    for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
-         ++i) {
-      HloInstruction* gte = instruction->parent()->AddInstruction(
-          HloInstruction::CreateGetTupleElement(
-              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
-              i));
-
-      // Recurse to copy each elements.
-      TF_ASSIGN_OR_RETURN(
-          HloInstruction * element_copy,
-          CreateCopyWithNewLayout(
-              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
-      element_copies.push_back(element_copy);
-    }
-    // Gather element copies into a tuple with a new Tuple instruction.
-    HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
-        HloInstruction::CreateTuple(element_copies));
-    LayoutUtil::ClearLayout(tuple_copy->mutable_shape());
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        shape_with_layout, tuple_copy->mutable_shape()));
-    return tuple_copy;
-  } else if (ShapeUtil::IsArray(instruction->shape())) {
-    HloInstruction* copy =
-        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
-            instruction->shape(), HloOpcode::kCopy, instruction));
-    LayoutUtil::ClearLayout(copy->mutable_shape());
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        shape_with_layout, copy->mutable_shape()));
-
-    return copy;
-  } else {
-    return FailedPrecondition(
-        "Can only copy array and tuple shaped instructions");
-  }
-}
-
-// Creates a copy of the given operand if the operand's layout does not match
-// the given layout. This copy replaces the use in the given instruction. Tuple
-// operands will be deep-copied.
-Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                  HloInstruction* instruction,
-                                  int64 operand_no) {
-  HloInstruction* operand = instruction->mutable_operand(operand_no);
-  TF_RET_CHECK(operand_layout.LayoutIsSet());
-  TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
-
-  if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
-    // Operand layout already matches our constraint. Nothing to do.
-    return Status::OK();
-  }
-
-  TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
-                      CreateCopyWithNewLayout(operand_layout.shape(), operand));
-
-  return instruction->ReplaceOperandWith(operand_no, operand_copy);
-}
 
 }  // namespace
 
@@ -793,6 +723,99 @@ Status CheckConstantLayout(HloInstruction* constant) {
 
 }  // namespace
 
+StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
+    const Shape& shape_with_layout, HloInstruction* instruction) {
+  TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
+  DCHECK(ShapeUtil::Compatible(shape_with_layout, instruction->shape()))
+      << ShapeUtil::HumanString(shape_with_layout) << " "
+      << ShapeUtil::HumanString(instruction->shape())
+      << " instruction: " << instruction->ToString();
+
+  if (ShapeUtil::IsTuple(instruction->shape())) {
+    // Deep-copy tuples.
+    std::vector<HloInstruction*> element_copies;
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
+         ++i) {
+      HloInstruction* gte = instruction->parent()->AddInstruction(
+          HloInstruction::CreateGetTupleElement(
+              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
+              i));
+      SetupCopiedInstruction(*instruction, gte, {i});
+      // Recurse to copy each elements.
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * element_copy,
+          CreateCopyWithNewLayout(
+              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
+      element_copies.push_back(element_copy);
+    }
+    // Gather element copies into a tuple with a new Tuple instruction.
+    HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
+        HloInstruction::CreateTuple(element_copies));
+    SetupCopiedInstruction(*instruction, tuple_copy, {});
+    LayoutUtil::ClearLayout(tuple_copy->mutable_shape());
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        shape_with_layout, tuple_copy->mutable_shape()));
+    return tuple_copy;
+  } else if (ShapeUtil::IsArray(instruction->shape())) {
+    HloInstruction* copy =
+        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
+            instruction->shape(), HloOpcode::kCopy, instruction));
+    SetupCopiedInstruction(*instruction, copy, {});
+    LayoutUtil::ClearLayout(copy->mutable_shape());
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        shape_with_layout, copy->mutable_shape()));
+
+    return copy;
+  } else {
+    return FailedPrecondition(
+        "Can only copy array and tuple shaped instructions");
+  }
+}
+
+// Creates a copy of the given operand if the operand's layout does not match
+// the given layout. This copy replaces the use in the given instruction. Tuple
+// operands will be deep-copied.
+Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
+    const ShapeLayout& operand_layout, HloInstruction* instruction,
+    int64 operand_no) {
+  HloInstruction* operand = instruction->mutable_operand(operand_no);
+  TF_RET_CHECK(operand_layout.LayoutIsSet());
+  TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
+
+  if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
+    // Operand layout already matches our constraint. Nothing to do.
+    return Status::OK();
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
+                      CreateCopyWithNewLayout(operand_layout.shape(), operand));
+
+  return instruction->ReplaceOperandWith(operand_no, operand_copy);
+}
+
+void LayoutAssignment::SetupCopiedInstruction(const HloInstruction& instruction,
+                                              HloInstruction* copy,
+                                              const ShapeIndex& index) {
+  if (instruction.has_sharding()) {
+    // If the index is empty, we want to copy the whole sharding, in case the
+    // sharding is a tuple sharding.
+    HloSharding sharding =
+        !index.empty() && instruction.sharding().IsTuple()
+            ? instruction.sharding().GetSubSharding(instruction.shape(), index)
+            : instruction.sharding();
+    // We propagate the sharding to the copied instruction only if it is a
+    // special sharding, like tiled ones, or special devices like the
+    // HostCompute module.
+    // Otherwise it is preferable to leave the new instruction without device,
+    // and let the automatic device placer to choose the best location.
+    if (!sharding.HasUniqueDevice() ||
+        HloSharding::IsReservedDevice(sharding.UniqueDevice().ValueOrDie())) {
+      copy->set_sharding(sharding);
+    }
+  }
+  copy->set_metadata(instruction.metadata());
+}
+
 Status LayoutAssignment::CheckLayouts(HloModule* module) {
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 680f88048a..ae4986d6ad 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -405,6 +405,29 @@ class LayoutAssignment : public HloPassInterface {
   ComputationLayout* entry_computation_layout_;
 
  protected:
+  // Sets up the copy instruction according to the characteristic (sharding,
+  // metadata, ...) of the reference instruction. The index argument is used
+  // when the instruction is a tuple, and in such case the index represents
+  // the location from where the copy instruction was created from.
+  // If the index is empty, the whole sharding will be propagated, even in case
+  // the intruction has a tuple sharding.
+  static void SetupCopiedInstruction(const HloInstruction& instruction,
+                                     HloInstruction* copy,
+                                     const ShapeIndex& index);
+
+  // Creates and returns a copy of the given instruction with a different
+  // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
+  // instruction producing the copy is returned.
+  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+      const Shape& shape_with_layout, HloInstruction* instruction);
+
+  // Creates a copy of the given operand if the operand's layout does not match
+  // the given layout. This copy replaces the use in the given instruction.
+  // Tuple operands will be deep-copied.
+  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                           HloInstruction* instruction,
+                                           int64 operand_no);
+
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
-- 
GitLab


From 5f7929b8c340b579f859396677c17f010f94d984 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 12 Apr 2018 16:56:45 -0700
Subject: [PATCH 0731/1262] [XLA:GPU] Pass all four args to custom-call convs
 when they're created.

A custom-call-conv should have four arguments: lhs, rhs, algorithm, and
use-tensor-cores.  CudnnAlgorithmPicker did the right thing, and that
path is exercised 99% of the time.  But CudnnAlgorithmPicker can fail,
and if it does, we're stuck with whatever we had before.  What we had
before only had three of the four args, which is bad.

In addition to fixing it, added an e2e test that catches the bug.

PiperOrigin-RevId: 192694119
---
 .../xla/service/gpu/ir_emission_utils.cc      | 13 +++++++----
 .../compiler/xla/tests/convolution_test.cc    | 23 +++++++++++++++++++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 32413f975a..532d436ee8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -160,14 +160,19 @@ static HloInstruction* CreateCudnnConv(
   Shape call_shape =
       ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
 
-  // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn
-  // algorithm to use.  It's up to a later pass to choose the algorithm, so to
-  // indicate that we haven't yet made a choice, we speicfy -1 for that arg.
+  // Our CustomCall takes four arguments: The conv lhs and rhs, the cudnn
+  // algorithm to use, and a boolean indicating whether to use tensor cores.
+  //
+  // It's up to a later pass to choose the algorithm and decide whether to use
+  // tensor cores, so to indicate that we haven't yet made a choice, we speicfy
+  // -1 and false for those args.
   HloInstruction* negative_one = computation->AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<int64>(-1)));
+  HloInstruction* false_constant = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloInstruction* custom_call =
       computation->AddInstruction(HloInstruction::CreateCustomCall(
-          call_shape, {lhs, rhs, negative_one}, call_target));
+          call_shape, {lhs, rhs, negative_one, false_constant}, call_target));
   custom_call->set_window(window);
   custom_call->set_convolution_dimension_numbers(dnums);
   return custom_call;
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 5eb3136abe..947959beb1 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -745,5 +745,28 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
                     error_spec_);
 }
 
+// Check that GPU convs still work if the CudnnAlgorithmPicker pass is disabled.
+// (We run this test on all platforms, because, what the heck.)
+XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) {
+  execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+      "cudnn-convolution-algorithm-picker");
+
+  XlaBuilder builder(TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
+
+  Array4D<float> input_data(1, 1, 1, 2);
+  input_data.FillIota(0);
+  Array4D<float> filter_data(1, 1, 1, 2);
+  filter_data.FillIota(10);
+
+  ComputeAndCompare(&builder,
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))});
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 59c828c5f0d040f6461534d7760e2ff6e89b3f1a Mon Sep 17 00:00:00 2001
From: Stephan Hoyer <shoyer@google.com>
Date: Thu, 12 Apr 2018 16:57:40 -0700
Subject: [PATCH 0732/1262] Document support for boolean values in
 tf.contrib.training.HParams.

PiperOrigin-RevId: 192694244
---
 tensorflow/contrib/training/python/training/hparam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 185f70a86d..6c59b68053 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -315,7 +315,7 @@ class HParams(object):
 
   Hyperparameters have type, which is inferred from the type of their value
   passed at construction type.   The currently supported types are: integer,
-  float, string, and list of integer, float, or string.
+  float, boolean, string, and list of integer, float, boolean, or string.
 
   You can override hyperparameter values by calling the
   [`parse()`](#HParams.parse) method, passing a string of comma separated
-- 
GitLab


From 4d568133aade026bfc3bcee3a444682a349058b6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 16:59:57 -0700
Subject: [PATCH 0733/1262] Misc. small optimizations in Grappler and shape
 inference code.

Impact on time per optimizer on inception graph:

model_pruner:          590 ms -> 550 ms   (-7%)
function_optimizer:    130 ms -> 130 ms   (-0%)
constant_folding:     7600 ms -> 7550 ms  (-0.7%)
arithmetic_optimizer: 1860 ms -> 1550 ms  (-20%)
loop_optimizer:        320 ms -> 320 ms   (-0%)
dependency_optimizer: 1300 ms -> 720 ms   (-45%)
layout:               1400 ms -> 1400 ms  (-0%)
memory_optimizer:     4200 ms -> 3540 ms  (-16%)
PiperOrigin-RevId: 192694528
---
 tensorflow/core/framework/shape_inference.cc  | 18 +++---
 tensorflow/core/graph/graph_constructor.cc    | 48 +++++++++------
 .../core/grappler/costs/graph_memory.cc       | 34 +++++++----
 tensorflow/core/grappler/grappler_item.cc     |  4 +-
 tensorflow/core/grappler/grappler_item.h      |  6 +-
 .../optimizers/arithmetic_optimizer.cc        | 15 +++--
 .../grappler/optimizers/constant_folding.cc   |  6 +-
 .../optimizers/dependency_optimizer.cc        | 45 ++++++--------
 .../grappler/optimizers/memory_optimizer.cc   |  2 +-
 .../grappler/optimizers/meta_optimizer.cc     | 61 ++++++++-----------
 tensorflow/core/grappler/utils.cc             | 41 ++++---------
 tensorflow/core/grappler/utils.h              | 34 ++++++++++-
 12 files changed, 169 insertions(+), 145 deletions(-)

diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index cc1ec47a83..229b4a45fa 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -40,6 +40,7 @@ InferenceContext::InferenceContext(
     : graph_def_version_(graph_def_version),
       node_def_(CHECK_NOTNULL(node_def)) {
   std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  input_tensors_as_shape_handles.reserve(input_tensors_as_shapes.size());
   for (const TensorShapeProto& p : input_tensors_as_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromShapeProto(p, &shape));
@@ -50,6 +51,7 @@ InferenceContext::InferenceContext(
   }
   PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
   if (!construction_status_.ok()) return;
+  inputs_.reserve(input_shapes.size());
   for (const TensorShapeProto& p : input_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromShapeProto(p, &shape));
@@ -93,6 +95,7 @@ InferenceContext::InferenceContext(
     : graph_def_version_(graph_def_version),
       node_def_(CHECK_NOTNULL(node_def)) {
   std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  input_tensors_as_shape_handles.reserve(input_tensors_as_shapes.size());
   for (const PartialTensorShape& p : input_tensors_as_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
@@ -103,6 +106,7 @@ InferenceContext::InferenceContext(
   }
   PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
   if (!construction_status_.ok()) return;
+  inputs_.reserve(input_shapes.size());
   for (const PartialTensorShape& p : input_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
@@ -229,9 +233,7 @@ void InferenceContext::PreInputInit(
   for (const auto& e : output_name_map_) {
     num_outputs = std::max(num_outputs, e.second.second);
   }
-  for (int i = 0; i < num_outputs; ++i) {
-    outputs_.push_back(nullptr);
-  }
+  outputs_.assign(num_outputs, nullptr);
   output_handle_shapes_and_types_.resize(num_outputs);
 }
 
@@ -469,13 +471,15 @@ Status InferenceContext::MergePrefix(ShapeHandle s, ShapeHandle prefix,
   TF_RETURN_IF_ERROR(WithRankAtLeast(s, rank, &s));
 
   // Merge the prefix dims and create the new output shapes.
+  const int32 rank_s = Rank(s);
   std::vector<DimensionHandle> dims;
+  dims.reserve(std::max(rank, rank_s));
   dims.resize(rank);
   for (int i = 0; i < rank; ++i) {
     TF_RETURN_IF_ERROR(Merge(Dim(s, i), Dim(prefix, i), &dims[i]));
   }
   *prefix_out = MakeShape(dims);
-  for (int i = rank; i < Rank(s); ++i) dims.push_back(Dim(s, i));
+  for (int i = rank; i < rank_s; ++i) dims.push_back(Dim(s, i));
   *s_out = MakeShape(dims);
   return Status::OK();
 }
@@ -1105,6 +1109,7 @@ Status InferenceContext::Max(DimensionHandle first, DimensionOrConstant second,
 
 Status InferenceContext::AttachContext(const Status& status) {
   std::vector<string> input_shapes;
+  input_shapes.reserve(inputs_.size());
   for (const ShapeHandle& input_shape : inputs_) {
     input_shapes.emplace_back(DebugString(input_shape));
   }
@@ -1112,6 +1117,7 @@ Status InferenceContext::AttachContext(const Status& status) {
   // Add information about the input tensors and partial tensor shapes used.
   std::vector<string> input_from_tensors_str;
   std::vector<string> input_from_tensors_as_shape_str;
+  input_from_tensors_as_shape_str.reserve(inputs_.size());
   for (int i = 0; i < inputs_.size(); ++i) {
     if (requested_input_tensor_as_partial_shape_[i] &&
         i < input_tensors_as_shapes_.size() &&
@@ -1233,9 +1239,7 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
   if (!refined) {
     return false;
   }
-  for (int i = 0; i < new_values.size(); ++i) {
-    (*to_update)[i] = new_values[i];
-  }
+  to_update->swap(new_values);
   return true;
 }
 
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 250992fb7a..c678283fce 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -666,20 +666,17 @@ Status GraphConstructor::ModifyNodeDefForImport(NodeDef* node_def) {
 void RemoveInputs(const std::vector<int>& inputs_to_remove, NodeDef* node_def,
                   std::vector<bool>* input_already_exists) {
   // Remove 'inputs_to_remove' from 'node_def'
-  // TODO(skyewm): is there a better way to do this?
-  std::vector<string> inputs;
-  inputs.reserve(node_def->input_size());
-  for (int i = 0; i < node_def->input_size(); ++i) {
-    inputs.push_back(node_def->input(i));
-  }
-  node_def->clear_input();
-  for (int i = 0, j = 0; i < inputs.size(); ++i) {
+  NodeDef copy;
+  copy.mutable_input()->Reserve(node_def->input_size() -
+                                inputs_to_remove.size());
+  for (int i = 0, j = 0; i < node_def->input_size(); ++i) {
     if (j < inputs_to_remove.size() && i == inputs_to_remove[j]) {
       ++j;
     } else {
-      node_def->add_input(inputs[i]);
+      copy.add_input()->swap(*node_def->mutable_input(i));
     }
   }
+  node_def->mutable_input()->Swap(copy.mutable_input());
   // Remove 'inputs_to_remove' from 'input_already_exists'
   for (int idx : inputs_to_remove) {
     input_already_exists->erase(input_already_exists->begin() + idx);
@@ -745,9 +742,21 @@ void GraphConstructor::AddControlDependencies(
   // dependencies
   for (const string& control_dep : opts_.control_dependencies) {
     string input = TensorId(control_dep, Graph::kControlSlot).ToString();
-    const protobuf::RepeatedPtrField<string>& inputs = node_def->input();
-    if (std::find(inputs.begin(), inputs.end(), input) != inputs.end()) {
-      // Control dependency already exists
+    bool found = false;
+    for (int i = node_def->input_size() - 1; i >= 0; --i) {
+      const string& node_input = node_def->input(i);
+      if (node_input[0] != '^') {
+        // Control inputs are at the end. Break when we reach the non-control
+        // inputs.
+        break;
+      }
+      if (node_input == input) {
+        // Control dependency already exists
+        found = true;
+        break;
+      }
+    }
+    if (found) {
       continue;
     }
     node_def->add_input(input);
@@ -761,10 +770,10 @@ void GraphConstructor::AddPrefixToNodeDef(
   node_def->set_name(strings::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
-    StringPiece input(node_def->input(i));
     // Skip remapped inputs (which already exist in g_ and are not being
     // imported).
     if (input_already_exists[i]) continue;
+    StringPiece input(node_def->input(i));
     if (str_util::ConsumePrefix(&input, "^")) {
       node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
@@ -933,10 +942,10 @@ Status GraphConstructor::Convert() {
         }
       }
 
-      // TODO(ashankar): The line below means an additional copy of the NodeDef,
-      // which can be expensive if the NodeDef contains large tensors in it.
-      // Might make sense to change the API for ImportGraphDef to take a mutable
-      // GraphDef* and avoid the copying.
+      // TODO(ashankar): The line below means an additional copy of the
+      // NodeDef, which can be expensive if the NodeDef contains large tensors
+      // in it. Might make sense to change the API for ImportGraphDef to take
+      // a mutable GraphDef* and avoid the copying.
       imported_node_def = original_node_def;
       if (!opts_.input_map.empty()) {
         // Note that input_already_exists can shrink here
@@ -980,7 +989,7 @@ Status GraphConstructor::Convert() {
             src_node->num_outputs(), " outputs");
       }
 
-      inputs.push_back(InputInfo(id.first.ToString(), src_node, src_index));
+      inputs.emplace_back(id.first.ToString(), src_node, src_index);
     }
 
     if (has_data_back_edge && !IsMerge(*node_def)) {
@@ -1010,8 +1019,7 @@ Status GraphConstructor::Convert() {
       if (inputs[i].node == nullptr) {
         // Record this back edge, which will be added after all nodes
         // are created.
-        back_edges_.push_back(
-            EdgeInfo(inputs[i].name, inputs[i].index, node, i));
+        back_edges_.emplace_back(inputs[i].name, inputs[i].index, node, i);
       } else if (inputs[i].index == Graph::kControlSlot) {
         g_->AddControlEdge(inputs[i].node, node);
       } else {
diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc
index 3604de392f..a5736d40b1 100644
--- a/tensorflow/core/grappler/costs/graph_memory.cc
+++ b/tensorflow/core/grappler/costs/graph_memory.cc
@@ -14,7 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/graph_memory.h"
-#include <list>
+
+#include <deque>
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -120,7 +121,7 @@ int64 GraphMemory::InferMemUsageForNeighbors(
 static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
     const string& node_name, int output_id,
     std::unordered_map<string, GraphMemory::LiveTensor*>* live_tensors,
-    std::list<GraphMemory::LiveTensor>* device_tensors) {
+    std::deque<GraphMemory::LiveTensor>* device_tensors) {
   string name = strings::StrCat(node_name, ":", output_id);
   GraphMemory::LiveTensor* live;
   auto it = live_tensors->find(name);
@@ -141,6 +142,10 @@ static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
 
 namespace {
 struct Event {
+  Event(int64 _timestamp, bool _allocated,
+        const GraphMemory::LiveTensor* _tensor)
+      : timestamp(_timestamp), allocated(_allocated), tensor(_tensor) {}
+
   int64 timestamp;
   bool allocated;
   const GraphMemory::LiveTensor* tensor;
@@ -160,13 +165,15 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
   }
 
   std::unordered_map<string, LiveTensor*> live_tensors;
-  std::unordered_map<string, std::list<LiveTensor>> live_tensors_per_device;
-
-  NodeMap node_map(&item_.graph);
+  std::unordered_map<string, std::deque<LiveTensor>> live_tensors_per_device;
+  std::unordered_map<string, const NodeDef*> node_map;
+  for (const NodeDef& node : item_.graph.node()) {
+    node_map[node.name()] = &node;
+  }
   for (const auto& dev_stats : timeline.dev_stats()) {
     const string& device_name = dev_stats.device();
     const bool is_gpu = (device_name.find("GPU:") || device_name.find("gpu:"));
-    std::list<LiveTensor>& device_tensors =
+    std::deque<LiveTensor>& device_tensors =
         live_tensors_per_device[dev_stats.device()];
     for (const auto& node_stats : dev_stats.node_stats()) {
       for (int i = 0; i < node_stats.output_size(); ++i) {
@@ -191,12 +198,13 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
                                     node_stats.op_end_rel_micros()));
       }
 
-      const NodeDef* node = node_map.GetNode(node_stats.node_name());
-      if (!node) {
+      auto it = node_map.find(node_stats.node_name());
+      if (it == node_map.end()) {
         // Skip nodes inserted by TF since they don't exist in the original
         // graph (e.g _Send/_Recv nodes).
         continue;
       }
+      const NodeDef* node = it->second;
       std::unordered_set<int> swapped_inputs;
       if (is_gpu) {
         auto it = node->attr().find("_swap_to_host");
@@ -237,14 +245,16 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
     std::vector<Event> events;
     events.reserve(2 * live_per_device.second.size());
     for (const auto& live : live_per_device.second) {
-      events.push_back(Event{live.allocation_time.count(), true, &live});
-      events.push_back(Event{live.deallocation_time.count(), false, &live});
+      events.emplace_back(static_cast<int64>(live.allocation_time.count()),
+                          true, &live);
+      events.emplace_back(static_cast<int64>(live.deallocation_time.count()),
+                          false, &live);
     }
     std::stable_sort(events.begin(), events.end());
     size_t peak = 0;
-    std::set<const LiveTensor*> live_at_peak;
+    std::unordered_set<const LiveTensor*> live_at_peak;
     size_t current = 0;
-    std::set<const LiveTensor*> currently_live;
+    std::unordered_set<const LiveTensor*> currently_live;
     for (int i = 0; i < events.size(); ++i) {
       const auto& event = events[i];
 
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index ad86356504..bbc0fedd22 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -27,7 +27,7 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef) {
+GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
   id = other.id;
   feed = other.feed;
   fetch = other.fetch;
@@ -38,7 +38,7 @@ GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef) {
   restore_op = other.restore_op;
   save_restore_loc_tensor = other.save_restore_loc_tensor;
   queue_runners = other.queue_runners;
-  graph.Swap(&graphDef);
+  graph.Swap(graph_def);
 }
 
 std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 45eed47b50..cd165ac3d4 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -33,10 +33,12 @@ namespace grappler {
 // A TensorFlow model to optimize.
 // Models are represented by the combination of a graph, one of more fetch
 // nodes, and potentially a set of nodes to feed.
-// TODO(volunteer_needed): turn this struct into a class.
 struct GrapplerItem {
   GrapplerItem() = default;
-  GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef);
+  GrapplerItem(const GrapplerItem& other, GraphDef&& graph_def)
+      : GrapplerItem(other, &graph_def) {}
+  // Swaps *graph_def with an empty GraphDef.
+  GrapplerItem(const GrapplerItem& other, GraphDef* graph_def);
   virtual ~GrapplerItem() = default;
 
   string id;  // A unique id for this item
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 463c332858..60b1af48ec 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -253,9 +253,8 @@ NodeDef* GetTailOfValuePreservingChain(
     const NodeDef& node, const NodeMap& node_map,
     const std::unordered_set<string>& nodes_to_preserve) {
   auto is_value_preserving_non_branching = [&](const NodeDef& node) {
-    return IsValuePreserving(node) &&
-           NumNonControlOutputs(node, node_map) == 1 &&
-           nodes_to_preserve.count(node.name()) == 0;
+    return nodes_to_preserve.find(node.name()) == nodes_to_preserve.end() &&
+           IsValuePreserving(node) && NumNonControlOutputs(node, node_map) == 1;
   };
   return GetTailOfChain(node, node_map, /*follow_control_input=*/false,
                         is_value_preserving_non_branching);
@@ -2023,12 +2022,11 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  GrapplerItem optimized_item(item);
-  optimized_graph_ = &optimized_item.graph;
-
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
+  *optimized_graph = item.graph;
+  optimized_graph_ = optimized_graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
   DedupComputations();
@@ -2037,8 +2035,9 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   // optimize larger subgraphs starting from the roots with more inputs.
   TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
 
-  // Shapes are only needed in aggressive mode.
-  graph_properties_.reset(new GraphProperties(item));
+  GrapplerItem optimized_item(item, optimized_graph);
+  optimized_graph_ = &optimized_item.graph;
+  graph_properties_.reset(new GraphProperties(optimized_item));
   const Status status = graph_properties_->InferStatically(false);
   const bool can_use_shapes = status.ok();
   if (!can_use_shapes) {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index b2a1ce6ab6..e29aaa25fe 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1004,7 +1004,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
 
   for (const auto& input : node.input()) {
     int port = 0;
-    ParseNodeName(input, &port);
+    ParseNodeNameAsStringPiece(input, &port);
     if (port < 0) {
       // Control dependency
       break;
@@ -2084,9 +2084,9 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
           left_child_is_constant ? left_child : right_child;
       // Make sure that it is safe to change the value of the child node->
       if (op_child_node->input_size() < 2 ||
-          NumNonControlOutputs(*op_child_node, *node_map_) > 1 ||
           nodes_to_preserve_.find(op_child_node->name()) !=
-              nodes_to_preserve_.end()) {
+              nodes_to_preserve_.end() ||
+          NumNonControlOutputs(*op_child_node, *node_map_) > 1) {
         continue;
       }
 
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index ed9bce439c..7b7fd81155 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -109,23 +109,12 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
 }
 
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
-  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
-    return false;
-  }
-  if (!fetch_nodes_known_ || NumNonControlOutputs(node, *node_map_) > 0) {
-    // The output values of this node may be needed.
-    return false;
-  }
-  if (IsMerge(node) || IsSwitch(node)) {
-    return false;
-  }
-  if (ModifiesFrameInfo(node)) {
-    return false;
-  }
-  if (!IsFreeOfSideEffect(node)) {
+  if (!fetch_nodes_known_ ||
+      nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-  if (node.op() == "ControlTrigger") {
+  if (IsMerge(node) || IsSwitch(node) || ModifiesFrameInfo(node) ||
+      !IsFreeOfSideEffect(node)) {
     return false;
   }
   if (node.op().rfind("Submodel", 0) == 0) {
@@ -136,16 +125,21 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (!status.ok() || op_def->output_arg_size() == 0) {
     return false;
   }
-
+  const std::unordered_set<string> do_not_rewrite_ops{
+      "Assert",      "CheckNumerics",         "_Retval",
+      "_Arg",        "_ParallelConcatUpdate", "_TPUExecute",
+      "_TPUCompile", "ControlTrigger"};
+  if (do_not_rewrite_ops.find(node.op()) != do_not_rewrite_ops.end()) {
+    return false;
+  }
   if (!SafeToRemoveIdentity(node)) {
     return false;
   }
-
-  const std::unordered_set<string> do_not_rewrite_ops{
-      "Assert",     "CheckNumerics",         "_Retval",
-      "_Arg",       "_ParallelConcatUpdate", "_TPUExecute",
-      "_TPUCompile"};
-  return do_not_rewrite_ops.find(node.op()) == do_not_rewrite_ops.end();
+  if (NumNonControlOutputs(node, *node_map_) > 0) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  return true;
 }
 
 void DependencyOptimizer::OptimizeNode(int node_idx,
@@ -164,7 +158,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
         int pos;
-        string input_name = ParseNodeName(fanout->input(i), &pos);
+        StringPiece input_name =
+            ParseNodeNameAsStringPiece(fanout->input(i), &pos);
         if (input_name == node_name) {
           if (pos < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
@@ -358,8 +353,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
           for (int j = 0; j < consumer->input_size(); ++j) {
             const string& old_input = consumer->input(j);
             int old_input_pos;
-            string old_input_node_name =
-                ParseNodeName(old_input, &old_input_pos);
+            StringPiece old_input_node_name =
+                ParseNodeNameAsStringPiece(old_input, &old_input_pos);
             if (old_input_node_name == node_name) {
               if (old_input_pos >= 0) {
                 // Regular input
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 27e9d2c78d..c1fee0e993 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -1227,7 +1227,7 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                              recomputation_targets_name_scope_, optimized_graph,
                              item);
 
-  GrapplerItem optimized_item(item, std::move(*optimized_graph));
+  GrapplerItem optimized_item(item, optimized_graph);
   std::unordered_set<string> skip_list;
   // Bound the number of rewrite passes to avoid long processing times on graphs
   // that simply won't fit in memory.
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 5723e397ab..558b8a77e8 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -178,45 +178,41 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
           ? 1
           : cfg_.meta_optimizer_iterations();
+  GrapplerItem optimized_item = item;
+  optimized_graph->Swap(&optimized_item.graph);
   for (int iteration = 0; iteration < num_iterations; ++iteration) {
     VLOG(1) << "Starting optimization iteration " << iteration + 1;
     for (const auto& optimizer : optimizers) {
+      // Invariant: optimized_graph contains the most recently optimized
+      // version of the graph.
       if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
         continue;
       }
-      if (!already_optimized) {
-        Status status = optimizer->Optimize(cluster, item, optimized_graph);
-        string result;
-        if (!status.ok()) {
-          VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                  << ". Return status: " << status.ToString();
-          result = status.ToString();
-        } else {
-          already_optimized = true;
-          result = strings::StrCat(
-              "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
-        }
-        result_.push_back(std::make_pair(optimizer->name(), result));
-        VLOG(1) << "Optimizer " << optimizer->name()
-                << " return status: " << result;
+      uint64 start_us = Env::Default()->NowMicros();
+      // This swaps the current optimized_graph into optimized item and
+      // resets optimized_graph to an empty graph.
+      optimized_graph->Swap(&optimized_item.graph);
+      *optimized_graph = GraphDef();
+      Status status =
+          optimizer->Optimize(cluster, optimized_item, optimized_graph);
+
+      uint64 end_us = Env::Default()->NowMicros();
+      float duration_ms = (end_us - start_us) / 1000.0f;
+      string result;
+      if (!status.ok()) {
+        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
+                << status.ToString();
+        optimized_graph->Swap(&optimized_item.graph);
+        result = status.ToString();
       } else {
-        GrapplerItem optimized_item(item, std::move(*optimized_graph));
-        Status status =
-            optimizer->Optimize(cluster, optimized_item, optimized_graph);
-        string result;
-        if (!status.ok()) {
-          VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
-                  << status.ToString();
-          optimized_graph->Swap(&optimized_item.graph);
-          result = status.ToString();
-        } else {
-          result = strings::StrCat(
-              optimizer->name(), ": ",
-              PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
-        }
-        result_.push_back(std::make_pair(optimizer->name(), result));
-        VLOG(1) << result;
+        already_optimized = true;
+        result = strings::StrCat(
+            optimizer->name(), ": ",
+            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
+            ", time = ", duration_ms, "ms.");
       }
+      result_.emplace_back(optimizer->name(), result);
+      VLOG(1) << result;
     }
   }
 
@@ -230,10 +226,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
               item.graph.library().gradient_size());
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
-  } else {
-    *optimized_graph = item.graph;
   }
-
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 534fe670e0..7398d2c896 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -142,38 +142,12 @@ bool IsSameInput(const string& name1, const string& name2) {
     return true;
   }
   int position1;
-  string node1 = ParseNodeName(name1, &position1);
+  StringPiece node1 = ParseNodeNameAsStringPiece(name1, &position1);
   int position2;
-  string node2 = ParseNodeName(name2, &position2);
+  StringPiece node2 = ParseNodeNameAsStringPiece(name2, &position2);
   return (position1 == position2) && (node1 == node2);
 }
 
-string ParseNodeName(const string& name, int* position) {
-  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
-  // to get a node name.
-  strings::Scanner scan(name);
-  scan.ZeroOrOneLiteral("^")
-      .RestartCapture()
-      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
-      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
-  StringPiece capture;
-  StringPiece remaining;
-  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
-    *position = 0;
-    return "";
-  } else {
-    if (name[0] == '^') {
-      *position = -1;
-    } else if (remaining.empty()) {
-      *position = 0;
-    } else {
-      // Skip the first ':' character.
-      CHECK(strings::safe_strto32(remaining.substr(1), position));
-    }
-    return capture.ToString();
-  }
-}
-
 bool IsControlInput(const string& name) {
   return !name.empty() && name[0] == '^';
 }
@@ -185,7 +159,7 @@ string NodeName(const string& name) {
 
 int NodePosition(const string& name) {
   int position;
-  ParseNodeName(name, &position);
+  ParseNodeNameAsStringPiece(name, &position);
   return position;
 }
 
@@ -275,13 +249,20 @@ int NumNonControlInputs(const NodeDef& node) {
 
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
   int num_outputs = 0;
+  int pos;
   for (const NodeDef* output : node_map.GetOutputs(node.name())) {
     for (const string& node_as_input : output->input()) {
       if (IsControlInput(node_as_input)) {
         break;
       }
-      if (NodeName(node_as_input) == node.name()) {
+      if (node_as_input == node.name()) {
         ++num_outputs;
+      } else {
+        const StringPiece name =
+            ParseNodeNameAsStringPiece(node_as_input, &pos);
+        if (name == node.name()) {
+          ++num_outputs;
+        }
       }
     }
   }
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 11555d712a..b15667dca2 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -26,8 +26,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -107,8 +109,38 @@ string NodeName(const string& name);
 // Get the trailing position number ":{digits}" (if any) of a node name.
 int NodePosition(const string& name);
 
+inline StringPiece ParseNodeNameAsStringPiece(const string& name,
+                                              int* position) {
+  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
+  // to get a node name.
+  strings::Scanner scan(name);
+  scan.ZeroOrOneLiteral("^")
+      .RestartCapture()
+      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
+      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
+  StringPiece capture;
+  StringPiece remaining;
+  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
+    *position = 0;
+    static const string empty;
+    return StringPiece(empty);
+  } else {
+    if (name[0] == '^') {
+      *position = -1;
+    } else if (remaining.empty()) {
+      *position = 0;
+    } else {
+      // Skip the first ':' character.
+      CHECK(strings::safe_strto32(remaining.substr(1), position));
+    }
+    return capture;
+  }
+}
+
 // Returns the node name and position in a single call.
-string ParseNodeName(const string& name, int* position);
+inline string ParseNodeName(const string& name, int* position) {
+  return ParseNodeNameAsStringPiece(name, position).ToString();
+}
 
 // Add a prefix to a node name with a custom delimiter.
 string AddPrefixToNodeName(const string& name, const string& prefix,
-- 
GitLab


From 3755128f3a83fea84c5a90d71d5b684157a99ac7 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Thu, 12 Apr 2018 17:01:55 -0700
Subject: [PATCH 0734/1262] Fix a typo in cross_tower_ops.

PiperOrigin-RevId: 192694794
---
 tensorflow/contrib/distribute/python/cross_tower_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index bbe5e877d5..cff717db80 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -488,7 +488,8 @@ class AllReduceCrossTowerOps(CrossTowerOps):
           "agg_small_grads_max_group = %d", len(per_device_values),
           self.all_reduce_alg, self.agg_small_grads_max_bytes,
           self.agg_small_grads_max_group)
-      tensor_packer = AggregateSmallTensorPacker(100, 10)
+      tensor_packer = AggregateSmallTensorPacker(
+          self.agg_small_grads_max_bytes, self.agg_small_grads_max_group)
       device_grad_packs = tensor_packer.pack(grouped)
     else:
       logging.info(
-- 
GitLab


From fffd3ca4fcf1f54f97a7be6f225fe183ad82b0ea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 17:07:35 -0700
Subject: [PATCH 0735/1262] Move dummy AssertOp and CheckNumericsOp to
 //third_party/tensorflow/compiler/tf2xla/kernels. Enable type DT_STRING for
 AssertOp and ConstOp, in order to make dummy Assert compile with a const
 string (assert message) as its input.

PiperOrigin-RevId: 192695938
---
 tensorflow/compiler/aot/BUILD                 |  1 +
 tensorflow/compiler/aot/tests/BUILD           | 15 ++++++
 .../compiler/aot/tests/make_test_graphs.py    | 10 ++++
 .../tests/test_graph_tfassert_eq.config.pbtxt | 16 ++++++
 .../compiler/aot/tests/tfcompile_test.cc      | 18 +++++++
 .../compiler/jit/mark_for_compilation_pass.cc |  9 ++++
 .../jit/mark_for_compilation_pass_test.cc     | 24 +++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      | 17 +++++++
 .../compiler/tf2xla/kernels/assert_op.cc      | 49 ++++++++++++++++++
 .../tf2xla/kernels/check_numerics_op.cc       | 50 +++++++++++++++++++
 tensorflow/compiler/tf2xla/tf2xla_util.cc     |  9 ++++
 tensorflow/compiler/tf2xla/tf2xla_util.h      |  5 ++
 tensorflow/compiler/tf2xla/xla_cpu_backend.cc |  7 +++
 tensorflow/compiler/tf2xla/xla_gpu_backend.cc |  7 +++
 14 files changed, 237 insertions(+)
 create mode 100644 tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
 create mode 100644 tensorflow/compiler/tf2xla/kernels/assert_op.cc
 create mode 100644 tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc

diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index fa03b1f3c2..19e6bf68e7 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -60,6 +60,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index b053dad1b5..bb73cb19c5 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -14,6 +14,7 @@ test_suite(
         ":test_graph_tfadd_test",
         ":test_graph_tfadd_with_ckpt_saver_test",
         ":test_graph_tfadd_with_ckpt_test",
+        ":test_graph_tfassert_eq_test",
         ":test_graph_tffunction_test",
         ":test_graph_tfgather_test",
         ":test_graph_tfmatmul_test",
@@ -33,6 +34,7 @@ py_binary(
         "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
@@ -52,6 +54,7 @@ genrule(
         "test_graph_tfadd_with_ckpt_saver.ckpt",
         "test_graph_tfadd_with_ckpt_saver.pb",
         "test_graph_tfadd_with_ckpt_saver.saver",
+        "test_graph_tfassert_eq.pb",
         "test_graph_tffunction.pb",
         "test_graph_tfgather.pb",
         "test_graph_tfmatmul.pb",
@@ -104,6 +107,17 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfassert_eq",
+    testonly = 1,
+    config = "test_graph_tfassert_eq.config.pbtxt",
+    cpp_class = "AssertComp",
+    graph = "test_graph_tfassert_eq.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_library(
     name = "test_graph_tffunction",
     testonly = 1,
@@ -170,6 +184,7 @@ tf_cc_test(
         ":test_graph_tfadd",
         ":test_graph_tfadd_with_ckpt",
         ":test_graph_tfadd_with_ckpt_saver",
+        ":test_graph_tfassert_eq",
         ":test_graph_tffunction",
         ":test_graph_tfgather",
         ":test_graph_tfmatmul",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 89c7cd4507..67767f55da 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import app
@@ -125,6 +126,14 @@ def tfsplits(_):
   array_ops.identity(y, name='result')
 
 
+def tfassert_eq(_):
+  x = array_ops.placeholder(dtypes.int32, name='x_hold')
+  y = array_ops.placeholder(dtypes.int32, name='y_hold')
+  control_flow_ops.Assert(
+      math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq')
+  math_ops.add(x, math_ops.negative(y), name='x_y_diff')
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -144,6 +153,7 @@ def main(_):
   write_graph(tfmatmulandadd, FLAGS.out_dir)
   write_graph(tffunction, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
+  write_graph(tfassert_eq, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
new file mode 100644
index 0000000000..8732d1709e
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
@@ -0,0 +1,16 @@
+# Text form of tensorflow.tf2xla.Config proto.
+feed {
+  id { node_name: "x_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "x_y_diff" }
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 413efd9cea..67dbd643bf 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfassert_eq.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tffunction.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h"
@@ -413,6 +414,23 @@ TEST(TFCompileTest, Splits) {
   EXPECT_NEAR(expected[3], fn.result0(1, 1), 1e4);
 }
 
+TEST(TFCompileTest, AssertEqAndReturnDiff) {
+  // Assert is converted into a no-op in XLA, so there is no failure even if the
+  // two args are different.
+  AssertComp assert;
+  EXPECT_EQ(assert.arg0_data(), assert.args()[0]);
+  EXPECT_EQ(assert.arg1_data(), assert.args()[1]);
+
+  assert.arg0() = 2;
+  assert.arg1() = 1;
+  const int32 expected_result = assert.arg0() - assert.arg1();
+  EXPECT_TRUE(assert.Run());
+  EXPECT_EQ(assert.error_msg(), "");
+  EXPECT_EQ(assert.result0(), expected_result);
+  EXPECT_EQ(assert.result0_data()[0], expected_result);
+  EXPECT_EQ(assert.result0_data(), assert.results()[0]);
+}
+
 TEST(TFCompileTest, LookupNameIndex) {
   // add doesn't have any names defined in its config.
   AddComp add;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index f32c0f4ba8..0c9fbf3d54 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -50,6 +50,15 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // is really a kind of function call and will be handled by
   // IsCompilableCall().
   if (node.type_string() == "SymbolicGradient") return false;
+  if (node.type_string() == "Const") {
+    // Skip Const op with type DT_STRING, since XLA doesn't support it, but the
+    // registered Const KernelDef says that it does, to support no-op Assert for
+    // tfcompile.
+    const AttrValue* attr = node.attrs().Find("dtype");
+    if (attr != nullptr && attr->type() == DT_STRING) {
+      return false;
+    }
+  }
   return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok();
 }
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 80edaf28b8..703d8825d7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -609,5 +609,29 @@ TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
   EXPECT_TRUE(clusters.empty());
 }
 
+TEST(XlaCompilationTest, ConstOp) {
+  // valid data type
+  {
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    Scope root = Scope::NewRootScope().ExitOnError();
+    auto c = ops::Const(root.WithOpName("const"), 0.5f);
+    c.node()->AddAttr(kXlaCompileAttr, true);
+    TF_ASSERT_OK(root.ToGraph(graph.get()));
+    TF_ASSERT_OK(MarkForCompilation(&graph));
+    EXPECT_EQ(1, GetClusters(*graph).size());
+  }
+
+  // invalid data type
+  {
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    Scope root = Scope::NewRootScope().ExitOnError();
+    auto c = ops::Const(root.WithOpName("const"), string("string"));
+    c.node()->AddAttr(kXlaCompileAttr, true);
+    TF_ASSERT_OK(root.ToGraph(graph.get()));
+    TF_ASSERT_OK(MarkForCompilation(&graph));
+    EXPECT_TRUE(GetClusters(*graph).empty());
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index f1bc7d6af4..3ba37b0383 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -171,6 +171,23 @@ tf_kernel_library(
     ],
 )
 
+# Kernels that have a dummy (no-op) implementation.
+tf_kernel_library(
+    name = "xla_dummy_ops",
+    srcs = [
+        "assert_op.cc",
+        "check_numerics_op.cc",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:logging_ops_op_lib",
+    ],
+    alwayslink = 1,
+)
+
 # Kernels that only work on CPU, because they use XLA custom calls.
 # Only link this when using the CPU backend for XLA.
 tf_kernel_library(
diff --git a/tensorflow/compiler/tf2xla/kernels/assert_op.cc b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
new file mode 100644
index 0000000000..af4ab5e8ef
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+// This TensorFlow op supports the Assert primitve.
+class AssertOp : public XlaOpKernel {
+ public:
+  explicit AssertOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  ~AssertOp() override {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    static mutex mu(tensorflow::LINKER_INITIALIZED);
+    static int log_counter = 0;
+
+    mutex_lock l(mu);
+    if (log_counter < 20) {
+      ++log_counter;
+      LOG(WARNING) << "Ignoring Assert operator " << name();
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(AssertOp);
+};
+
+REGISTER_XLA_OP(Name("Assert"), AssertOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc b/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc
new file mode 100644
index 0000000000..6061e822d8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace {
+
+class CheckNumericsOp : public XlaOpKernel {
+ public:
+  explicit CheckNumericsOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // TODO(b/32223192): add a real implementation of CheckNumerics
+    {
+      static mutex mu(tensorflow::LINKER_INITIALIZED);
+      static int log_counter = 0;
+      mutex_lock l(mu);
+      if (log_counter < 20) {
+        ++log_counter;
+        LOG(WARNING) << "Ignoring CheckNumerics operator " << name();
+      }
+    }
+    ctx->SetOutput(0, ctx->Input(0));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CheckNumericsOp);
+};
+
+REGISTER_XLA_OP(Name("CheckNumerics"), CheckNumericsOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 2fc77cc4bc..7ec85aa3cd 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -288,4 +288,13 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
   return Status::OK();
 }
 
+void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
+                                   KernelDef* kdef) {
+  for (KernelDef::AttrConstraint& constraint : *kdef->mutable_constraint()) {
+    if (constraint.name() == name) {
+      constraint.mutable_allowed_values()->mutable_list()->add_type(dtype);
+    }
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index e5fba8ede7..745beb39c1 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -51,6 +52,10 @@ string TensorIdToString(const tf2xla::TensorId& id);
 // edges are considered.
 Status SetNodeShardingFromNeighbors(Node* n, bool out_edges);
 
+// Add an allowed data type to the AttrConstraint with the given name.
+void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
+                                   KernelDef* kdef);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
index 8286480e0e..ead229aacc 100644
--- a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 
@@ -30,6 +31,12 @@ bool CpuOpFilter(KernelDef* kdef) {
         DT_FLOAT);
     return true;
   }
+  if (kdef->op() == "Const") {
+    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+  }
+  if (kdef->op() == "Assert") {
+    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+  }
   return true;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index 8ca757e723..62168b6483 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 
@@ -25,6 +26,12 @@ bool GpuOpFilter(KernelDef* kdef) {
       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
     return false;
   }
+  if (kdef->op() == "Const") {
+    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+  }
+  if (kdef->op() == "Assert") {
+    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+  }
   return true;
 }
 
-- 
GitLab


From d42e4bde7ace9bb757b0fdf0e2a48c97cabe938b Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 12 Apr 2018 17:32:36 -0700
Subject: [PATCH 0736/1262] Porting tests for `rpc_op` to OS.

PiperOrigin-RevId: 192698931
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   3 +-
 tensorflow/contrib/rpc/BUILD                  |  16 +
 .../contrib/rpc/python/kernel_tests/BUILD     |  76 ++++
 .../rpc/python/kernel_tests/rpc_op_test.py    |  71 ++++
 .../python/kernel_tests/rpc_op_test_base.py   | 337 ++++++++++++++++++
 .../kernel_tests/rpc_op_test_servicer.py      | 101 ++++++
 .../python/kernel_tests/test_example.proto    | 171 +++++++++
 .../core/platform/default/build_config.bzl    |  86 ++++-
 tensorflow/tools/pip_package/BUILD            |   1 +
 tensorflow/workspace.bzl                      |   4 +
 12 files changed, 864 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index ae68f4aec4..7e47516550 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -87,6 +87,7 @@ py_library(
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/rpc",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
         "//tensorflow/contrib/signal:signal_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e27ece8fa5..36cc5144d0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -71,6 +71,7 @@ from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
+from tensorflow.contrib import rpc
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
 from tensorflow.contrib import signal
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 21f59d2563..f6aaf41f73 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -347,7 +347,8 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
-GENERATE_PYTHON_OP_LIB("rpc_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
index 597f18c771..dbd311a276 100644
--- a/tensorflow/contrib/rpc/BUILD
+++ b/tensorflow/contrib/rpc/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "rpc",
     srcs = [
@@ -11,3 +13,17 @@ py_library(
     ],
     deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
 )
+
+py_library(
+    name = "rpc_pip",
+    data = if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/rpc/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":rpc",
+        "//tensorflow/contrib/rpc/python/kernel_tests:py_test_deps",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_base",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_servicer",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..08ec1e61a4
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -0,0 +1,76 @@
+# TODO(b/76425722): Port everything in here to OS (currently excluded).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Placeholder for loading internal BUILD rule.
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [":test_example_proto_py"],
+)
+
+py_library(
+    name = "rpc_op_test_base",
+    srcs = ["rpc_op_test_base.py"],
+    deps = [
+        ":test_example_proto_py",
+        "//tensorflow/contrib/proto",
+        "//tensorflow/contrib/rpc",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "rpc_op_test_servicer",
+    srcs = ["rpc_op_test_servicer.py"],
+    deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
+
+tf_py_test(
+    name = "rpc_op_test",
+    size = "small",
+    srcs = ["rpc_op_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        ":rpc_op_test_servicer",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
new file mode 100644
index 0000000000..e2e0dbc7a2
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for RpcOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+import grpc
+from grpc.framework.foundation import logging_pool
+import portpicker
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_servicer
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+from tensorflow.python.platform import test
+
+
+class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
+  _protocol = 'grpc'
+
+  invalid_method_string = 'Method not found'
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(RpcOpTest, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
+
+  def get_method_name(self, suffix):
+    return '/tensorflow.contrib.rpc.TestCaseService/%s' % suffix
+
+  def setUp(self):
+    super(RpcOpTest, self).setUp()
+
+    service_port = portpicker.pick_unused_port()
+
+    server = grpc.server(logging_pool.pool(max_workers=25))
+    servicer = rpc_op_test_servicer.RpcOpTestServicer()
+    test_example_pb2_grpc.add_TestCaseServiceServicer_to_server(
+        servicer, server)
+    self._address = 'localhost:%d' % service_port
+    server.add_insecure_port(self._address)
+    server.start()
+    self._server = server
+
+  def tearDown(self):
+    # TODO(ebrevdo): Figure out why this sometimes times out.
+    #    self._service.ExitLoop()
+    #    self._service_thread.join()
+    # self._server.stop()
+    super(RpcOpTest, self).tearDown()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
new file mode 100644
index 0000000000..aa03a103ed
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -0,0 +1,337 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Base class for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.contrib.proto import decode_proto
+from tensorflow.contrib.proto import encode_proto
+from tensorflow.contrib.rpc import rpc
+from tensorflow.contrib.rpc import try_rpc
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+
+__all__ = ['I_WARNED_YOU', 'RpcOpTestBase']
+
+I_WARNED_YOU = 'I warned you!'
+
+
+class RpcOpTestBase(object):
+  # pylint: disable=missing-docstring,invalid-name
+  """Base class for RpcOp tests."""
+
+  def get_method_name(self, suffix):
+    raise NotImplementedError
+
+  def rpc(self, *args, **kwargs):
+    return rpc(*args, protocol=self._protocol, **kwargs)
+
+  def try_rpc(self, *args, **kwargs):
+    return try_rpc(*args, protocol=self._protocol, **kwargs)
+
+  def testScalarHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, ())
+      response_values = sess.run(response_tensors)
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+
+  def testScalarHostPortTryRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      self.assertEqual(response_tensors.shape, ())
+      response_values, status_code_values, status_message_values = (
+          sess.run((response_tensors, status_code, status_message)))
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+    # For the base Rpc op, don't expect to get error status back.
+    self.assertEqual(errors.OK, status_code_values)
+    self.assertEqual(b'', status_message_values)
+
+  def testEmptyHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = []
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertAllEqual(response_tensors.shape, [0])
+      response_values = sess.run(response_tensors)
+    self.assertAllEqual(response_values.shape, [0])
+
+  def testInvalidAddresses(self):
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method='/InvalidService.IncrementTestShapes',
+                address=self._address,
+                request=''))
+
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('InvalidMethodName'),
+                address=self._address,
+                request=''))
+
+      # This also covers the case of address=''
+      # and address='localhost:293874293874'
+      with self.assertRaises(errors.UnavailableError):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('IncrementTestShapes'),
+                address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@',
+                request=''))
+
+      # Test invalid method with the TryRpc op
+      _, status_code_value, status_message_value = sess.run(
+          self.try_rpc(
+              method=self.get_method_name('InvalidMethodName'),
+              address=self._address,
+              request=''))
+      self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+      self.assertTrue(
+          self.invalid_method_string in status_message_value.decode('ascii'))
+
+  def testAlwaysFailingMethod(self):
+    with self.test_session() as sess:
+      response_tensors = self.rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+  def testSometimesFailingMethodWithManyRequests(self):
+    with self.test_session() as sess:
+      # Fail hard by default.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+      # Don't fail hard, use TryRpc - return the failing status instead.
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values, status_message_values = sess.run((status_code,
+                                                            status_message))
+      self.assertTrue([
+          x in (errors.OK, errors.INVALID_ARGUMENT) for x in status_code_values
+      ])
+      expected_message_values = np.where(
+          status_code_values == errors.INVALID_ARGUMENT,
+          I_WARNED_YOU.encode('ascii'), b'')
+      self.assertAllEqual(expected_message_values, status_message_values)
+
+  def testVecHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, (20,))
+      response_values = sess.run(response_tensors)
+    self.assertEqual(response_values.shape, (20,))
+    for i in range(20):
+      response_message = test_example_pb2.TestCase()
+      self.assertTrue(response_message.ParseFromString(response_values[i]))
+      self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortManyParallelRpcs(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      many_response_tensors = [
+          self.rpc(
+              method=self.get_method_name('IncrementTestShapes'),
+              address=self._address,
+              request=request_tensors) for _ in range(10)
+      ]
+      # Launch parallel 10 calls to the RpcOp, each containing
+      # 20 rpc requests.
+      many_response_values = sess.run(many_response_tensors)
+    self.assertEqual(10, len(many_response_values))
+    for response_values in many_response_values:
+      self.assertEqual(response_values.shape, (20,))
+      for i in range(20):
+        response_message = test_example_pb2.TestCase()
+        self.assertTrue(response_message.ParseFromString(response_values[i]))
+        self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortRpcUsingEncodeAndDecodeProto(self):
+    with self.test_session() as sess:
+      request_tensors = encode_proto(
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          sizes=[[3]] * 20,
+          values=[
+              [[i, i + 1, i + 2] for i in range(20)],
+          ])
+      response_tensor_strings = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      _, (response_shape,) = decode_proto(
+          bytes=response_tensor_strings,
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          output_types=[dtypes.int32])
+      response_shape_values = sess.run(response_shape)
+    self.assertAllEqual([[i + 1, i + 2, i + 3]
+                         for i in range(20)], response_shape_values)
+
+  def testVecHostPortRpcCancelsUponSessionTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          request=request_tensors)
+      for timeout_ms in [1, 500, 1000]:
+        options = config_pb2.RunOptions(timeout_in_ms=timeout_ms)
+        with self.assertRaises((errors.UnavailableError,
+                                errors.DeadlineExceededError)):
+          sess.run(response_tensors, options=options)
+
+  def testVecHostPortRpcCancelsUponConfiguredTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          timeout_in_ms=1000,
+          request=request_tensors)
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(response_tensors)
+
+  def testTryRpcPropagatesDeadlineErrorWithSometimesTimingOutRequests(self):
+    with self.test_session() as sess:
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesSleepForever'),
+          timeout_in_ms=1000,
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values = sess.run(status_code)
+      self.assertTrue([
+          x in (errors.OK, errors.DEADLINE_EXCEEDED) for x in status_code_values
+      ])
+
+  def testTryRpcWithMultipleAddressesSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleMethodsSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      methods = flatten(
+          [[self.get_method_name('IncrementTestShapes'), 'InvalidMethodName']
+           for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=methods, address=self._address, request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNIMPLEMENTED] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleAddressesAndRequests(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      requests = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=requests)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(20):
+        if i % 2 == 1:
+          self.assertFalse(response_tensors_values[i])
+        else:
+          response_message = test_example_pb2.TestCase()
+          self.assertTrue(
+              response_message.ParseFromString(response_tensors_values[i]))
+          self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
new file mode 100644
index 0000000000..7cbd636cb1
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Test servicer for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import time
+
+import grpc
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+
+
+class RpcOpTestServicer(test_example_pb2_grpc.TestCaseServiceServicer):
+  """Test servicer for RpcOp tests."""
+
+  def IncrementTestShapes(self, request, context):
+    """Increment the entries in the shape attribute of request.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    for i in range(len(request.shape)):
+      request.shape[i] += 1
+    return request
+
+  def AlwaysFailWithInvalidArgument(self, request, context):
+    """Always fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    del request
+    context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+    context.set_details(rpc_op_test_base.I_WARNED_YOU)
+
+  def SometimesFailWithInvalidArgument(self, request, context):
+    """Sometimes fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+      context.set_details(rpc_op_test_base.I_WARNED_YOU)
+    return request
+
+  def SleepForever(self, request, context):
+    """Sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    # TODO(ebrevdo): Make this async wait like the stubby version.
+    time.sleep(5)
+
+  def SometimesSleepForever(self, request, context):
+    """Sometimes sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      time.sleep(5)
+    return request
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..96f4550f62
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
@@ -0,0 +1,171 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.rpc;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+service TestCaseService {
+  // Copy input, and increment each entry in 'shape' by 1.
+  rpc IncrementTestShapes(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever.
+  rpc SleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever 50% of the time, return immediately the other 50%.
+  rpc SometimesSleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Always fails with InvalidArgument.
+  rpc AlwaysFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+
+  // Fails with InvalidArgument 50% of the time.
+  rpc SometimesFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 4cfa25bf66..44356e3438 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,7 +1,6 @@
 # Platform-specific build configurations.
 
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
-load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
@@ -110,6 +109,12 @@ def _proto_cc_srcs(srcs, use_grpc_plugin=False):
     ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
   return ret
 
+def _proto_py_outs(srcs, use_grpc_plugin=False):
+  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+  if use_grpc_plugin:
+    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+  return ret
+
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
@@ -217,6 +222,80 @@ def cc_proto_library(
       hdrs=gen_hdrs,
       **kargs)
 
+# Re-defined protocol buffer rule to bring in the change introduced in commit
+# https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
+# which was not part of a stable protobuf release in 04/2018.
+# TODO(jsimsa): Remove this once the protobuf dependency version is updated
+# to include the above commit.
+def py_proto_library(
+        name,
+        srcs=[],
+        deps=[],
+        py_libs=[],
+        py_extra_srcs=[],
+        include=None,
+        default_runtime="@protobuf_archive//:protobuf_python",
+        protoc="@protobuf_archive//:protoc",
+        use_grpc_plugin=False,
+        **kargs):
+  """Bazel rule to create a Python protobuf library from proto source files
+
+  NOTE: the rule is only an internal workaround to generate protos. The
+  interface may change and the rule may be removed when bazel has introduced
+  the native rule.
+
+  Args:
+    name: the name of the py_proto_library.
+    srcs: the .proto files of the py_proto_library.
+    deps: a list of dependency labels; must be py_proto_library.
+    py_libs: a list of other py_library targets depended by the generated
+        py_library.
+    py_extra_srcs: extra source files that will be added to the output
+        py_library. This attribute is used for internal bootstrapping.
+    include: a string indicating the include path of the .proto files.
+    default_runtime: the implicitly default runtime which will be depended on by
+        the generated py_library target.
+    protoc: the label of the protocol compiler to generate the sources.
+    use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+        when processing the proto files.
+    **kargs: other keyword arguments that are passed to cc_library.
+  """
+  outs = _proto_py_outs(srcs, use_grpc_plugin)
+
+  includes = []
+  if include != None:
+    includes = [include]
+
+  grpc_python_plugin = None
+  if use_grpc_plugin:
+    grpc_python_plugin = "//external:grpc_python_plugin"
+    # Note: Generated grpc code depends on Python grpc module. This dependency
+    # is not explicitly listed in py_libs. Instead, host system is assumed to
+    # have grpc installed.
+
+  proto_gen(
+      name=name + "_genproto",
+      srcs=srcs,
+      deps=[s + "_genproto" for s in deps],
+      includes=includes,
+      protoc=protoc,
+      gen_py=1,
+      outs=outs,
+      visibility=["//visibility:public"],
+      plugin=grpc_python_plugin,
+      plugin_language="grpc"
+  )
+
+  if default_runtime and not default_runtime in py_libs + deps:
+    py_libs = py_libs + [default_runtime]
+
+  native.py_library(
+      name=name,
+      srcs=outs+py_extra_srcs,
+      deps=py_libs+deps,
+      imports=includes,
+      **kargs)
+
 def tf_proto_library_cc(name, srcs = [], has_services = None,
                         protodeps = [],
                         visibility = [], testonly = 0,
@@ -261,8 +340,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   )
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0,
-                        srcs_version="PY2AND3"):
+                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
   py_proto_library(
       name = name + "_py",
       srcs = srcs,
@@ -272,6 +350,7 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
+      use_grpc_plugin = use_grpc_plugin,
   )
 
 def tf_jspb_proto_library(**kwargs):
@@ -310,6 +389,7 @@ def tf_proto_library(name, srcs = [], has_services = None,
       srcs_version = "PY2AND3",
       testonly = testonly,
       visibility = visibility,
+      use_grpc_plugin = has_services,
   )
 
 def tf_additional_lib_hdrs(exclude = []):
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index a0bae23a7c..2ef105755f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -76,6 +76,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/predictor:predictor_pip",
     "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
     "//tensorflow/contrib/signal:test_util",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 72f446d359..dee2fcd0e1 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -763,6 +763,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "grpc_cpp_plugin",
       actual = "@grpc//:grpc_cpp_plugin",
   )
+  native.bind(
+      name = "grpc_python_plugin",
+      actual = "@grpc//:grpc_python_plugin",
+  )
 
   # gRPC has three empty C++ functions which it wants the user to define
   # at build time. https://github.com/grpc/grpc/issues/13590
-- 
GitLab


From 457e8b3a78d4b31de4113168422786412f8771fc Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Thu, 12 Apr 2018 17:35:56 -0700
Subject: [PATCH 0737/1262] Print error msg in CUDATimer.Init() when
 CreateEvent() is not ok().

PiperOrigin-RevId: 192699277
---
 tensorflow/stream_executor/cuda/cuda_timer.cc | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
index 4bd5503348..7d78601fb9 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -27,16 +27,18 @@ namespace cuda {
 bool CUDATimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
   CudaContext* context = parent_->cuda_context();
-  if (!CUDADriver::CreateEvent(context, &start_event_,
-                               CUDADriver::EventFlags::kDefault)
-           .ok()) {
+  port::Status status = CUDADriver::CreateEvent(
+      context, &start_event_, CUDADriver::EventFlags::kDefault);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
     return false;
   }
 
-  if (!CUDADriver::CreateEvent(context, &stop_event_,
-                               CUDADriver::EventFlags::kDefault)
-           .ok()) {
-    port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+  status = CUDADriver::CreateEvent(context, &stop_event_,
+                                   CUDADriver::EventFlags::kDefault);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    status = CUDADriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
       LOG(ERROR) << status;
     }
-- 
GitLab


From 5a53c9b54d8781032ebf2cf26f93da3b2a33d1e4 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 12 Apr 2018 18:02:58 -0700
Subject: [PATCH 0738/1262] Reintroducing support for constants as outputs of
 tf.data.map(). This fixes a regression introduced by cl/176147440.

PiperOrigin-RevId: 192702279
---
 .../data/kernel_tests/map_dataset_op_test.py  | 14 +++++++
 tensorflow/python/data/ops/dataset_ops.py     | 42 +++++++++----------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 0791c614fa..1ad0b9de5e 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -624,6 +624,20 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testConstantOutput(self):
+    iterator = (
+        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, b"hello", 10), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index c28de3d054..406f172e59 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1155,10 +1155,12 @@ class _GeneratorDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._state_classes = sparse.get_classes(ret)
@@ -1167,11 +1169,9 @@ class _GeneratorDataset(Dataset):
       self._state_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._init_func = tf_init_func
@@ -1214,10 +1214,12 @@ class _GeneratorDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._output_classes = sparse.get_classes(ret)
@@ -1226,11 +1228,9 @@ class _GeneratorDataset(Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._next_func = tf_next_func
@@ -1816,10 +1816,12 @@ class MapDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._output_classes = sparse.get_classes(ret)
@@ -1828,11 +1830,9 @@ class MapDataset(Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._map_func = tf_map_func
-- 
GitLab


From e489b600f388ae345387881a85368af3cd373ba2 Mon Sep 17 00:00:00 2001
From: Sami Kama <samikama@users.noreply.github.com>
Date: Thu, 12 Apr 2018 18:07:50 -0700
Subject: [PATCH 0739/1262] Replace tuple<int,int,int> for version info with a
 class in DnnSupportr::GetVersion() (#18434)

* Replace tuple<int,int,int> for version info with a class

* Removed clang-format modifications on non-edited code

* Update dnn.h

Update the comment as per request of reviewer
---
 .../gpu/cudnn_convolution_algorithm_picker.cc |  4 ++--
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  7 ++++---
 tensorflow/stream_executor/cuda/cuda_dnn.h    |  2 +-
 tensorflow/stream_executor/dnn.h              | 20 +++++++++++++++++--
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index d6b457a91b..1eccfe8571 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -99,9 +99,9 @@ bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
                                        const ConvolutionDimensionNumbers& dnums,
                                        se::StreamExecutor* stream_exec) {
   // Skip this check for cudnn7 and newer.
-  se::port::StatusOr<std::tuple<int, int, int>> version =
+  auto version =
       stream_exec->AsDnn()->GetVersion();
-  if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+  if (version.ok() && version.ValueOrDie().major_version() >= 7) {
     return true;
   }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 1dc7f991b3..a11b644ab1 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -477,11 +477,12 @@ port::Status CudnnSupport::Init() {
                                    ToString(status))};
 }
 
-port::StatusOr<std::tuple<int, int, int>> CudnnSupport::GetVersion() {
+port::StatusOr<perftools::gputools::dnn::VersionInfo>
+CudnnSupport::GetVersion() {
   CudnnVersion version;
   TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
-  return std::make_tuple(version.major_version, version.minor_version,
-                         version.patch_level);
+  return perftools::gputools::dnn::VersionInfo(
+      version.major_version, version.minor_version, version.patch_level);
 }
 
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 0e5368aca8..09d248f137 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -46,7 +46,7 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
-  port::StatusOr<std::tuple<int, int, int>> GetVersion() override;
+  port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 3c47d2c2e8..47dcd80218 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -876,6 +876,22 @@ enum class ElementwiseOperation { kAdd, kMultiply };
 
 string ElementwiseOperationString(ElementwiseOperation op);
 
+// A simple class representing the version of the backing library, to 
+// workaround the "too perfect forwarding" issue in gcc6+ compilers. 
+// See PR#16309 and issue #18402 for links discussing the issue.
+class VersionInfo {
+ public:
+  VersionInfo(int major = 0, int minor = 0, int patch = 0)
+      : major_(major), minor_(minor), patch_(patch) {}
+  int major_version() { return major_; }
+  int minor_version() { return minor_; }
+  int patch() { return patch_; }
+ private:
+  int major_;
+  int minor_;
+  int patch_;
+};
+
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
@@ -886,8 +902,8 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
-  // Gets the version of the backing library, as a {major, minor, patch} tuple.
-  virtual port::StatusOr<std::tuple<int, int, int>> GetVersion() {
+  // Gets the version of the backing library, as a VersionInfo object.
+  virtual port::StatusOr<VersionInfo> GetVersion() {
     return port::UnimplementedError(
         "DnnSupport::GetVersion not implemented on this platform.");
   }
-- 
GitLab


From 7d89bfcd72bef4c5c9328a88ee520d81642b5284 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 18:19:05 -0700
Subject: [PATCH 0740/1262] Adding autograph built-in function checker.

PiperOrigin-RevId: 192703924
---
 .../contrib/autograph/converters/call_trees.py      |  3 +--
 tensorflow/contrib/autograph/impl/api.py            |  2 +-
 tensorflow/contrib/autograph/pyct/inspect_utils.py  | 13 +++++++++++++
 .../contrib/autograph/pyct/inspect_utils_test.py    |  7 +++++++
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 61f6bfd7e7..9424966696 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -23,7 +23,6 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import namedtuple
-import types
 
 import gast
 
@@ -114,7 +113,7 @@ class CallTreeTransformer(transformer.Base):
   def _function_is_compilable(self, target_entity):
     """Determines whether an entity can be compiled at all."""
     # TODO(mdan): This is just a placeholder. Implement.
-    return not isinstance(target_entity, types.BuiltinFunctionType)
+    return not inspect_utils.isbuiltin(target_entity)
 
   def _should_compile(self, node, fqn):
     """Determines whether an entity should be compiled in the context."""
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index dce994e50d..a553813e19 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -137,7 +137,7 @@ def converted_call(f, recursive, verbose, arg_types, *args, **kwargs):
 
   unknown_arg_value = object()  # Sentinel for arguments of unknown value
 
-  if tf_inspect.isbuiltin(f):
+  if inspect_utils.isbuiltin(f):
     return builtins.dynamic_builtin(f, *args, **kwargs)
 
   if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index 386a6d21ec..63361cc4f2 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -22,12 +22,25 @@ from __future__ import division
 from __future__ import print_function
 
 import itertools
+import types
 
 import six
 
 from tensorflow.python.util import tf_inspect
 
 
+def isbuiltin(f):
+  # Note these return false for isinstance(f, types.BuiltinFunctionType) so we
+  # need to specifically check for them.
+  if f in (range, int, float):
+    return True
+  if isinstance(f, types.BuiltinFunctionType):
+    return True
+  if tf_inspect.isbuiltin(f):
+    return True
+  return False
+
+
 def getnamespace(f):
   """Returns the complete namespace of a function.
 
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index 58f827b79a..cf841dae81 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -258,6 +258,13 @@ class InspectUtilsTest(test.TestCase):
     self.assertTrue(
         inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
 
+  def test_isbuiltin(self):
+    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(float))
+    self.assertTrue(inspect_utils.isbuiltin(int))
+    self.assertTrue(inspect_utils.isbuiltin(len))
+    self.assertFalse(inspect_utils.isbuiltin(function_decorator))
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 93afca507ec09ff3b5cdf05cbd5eb265e83fc8cb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 18:29:05 -0700
Subject: [PATCH 0741/1262] Convert GrapplerFunctionItem to
 (Specialized)FunctionDef.

PiperOrigin-RevId: 192704808
---
 tensorflow/core/grappler/utils/BUILD          |   3 +
 tensorflow/core/grappler/utils/functions.cc   | 328 +++++++++++++++---
 tensorflow/core/grappler/utils/functions.h    |  92 +++--
 .../core/grappler/utils/functions_test.cc     | 179 ++++++++--
 4 files changed, 504 insertions(+), 98 deletions(-)

diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 05d9cbaa2b..b473f32c45 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -165,6 +165,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
@@ -177,6 +178,8 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index dd0d918e72..e8d423a759 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -23,27 +23,82 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+Status OutputNameRange(const FunctionLibraryDefinition& flib,
+                       const NodeDef& node,
+                       tensorflow::NameRangeMap* outputs_range_map) {
+  const OpRegistrationData* registration;
+  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
+  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(node, registration->op_def,
+                                                   nullptr, outputs_range_map));
+  return Status::OK();
+}
+
+Status RegisterFunctionBodyOutputs(const FunctionLibraryDefinition& flib,
+                                   const NodeDef& node,
+                                   GrapplerFunctionConnectivity* connectivity) {
+  tensorflow::NameRangeMap outputs_range_map;
+  TF_RETURN_IF_ERROR(OutputNameRange(flib, node, &outputs_range_map));
+  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
+  return Status::OK();
+}
+
+// Replace the placeholder attribute values with the values specified in
+// instantiation attributes.
+Status ResolveFunctionBodyNodeAttrPlaceholders(
+    const AttrValueMap& func_instantiation_attr, NodeDef* node) {
+  for (auto& attr : *node->mutable_attr()) {
+    const string& placeholder = attr.second.placeholder();
+    if (placeholder.empty()) continue;
+
+    auto it = func_instantiation_attr.find(placeholder);
+    if (it != func_instantiation_attr.end()) {
+      attr.second = it->second;
+    } else {
+      return errors::InvalidArgument("Can't resolve placeholder: ",
+                                     placeholder);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
     const InputArgExpansion& input_arg_expansion) {
-  input_arg_expansions_.insert(
-      {input_arg_expansion.input_name, input_arg_expansion});
+  const auto& input_name = input_arg_expansion.input_name;
+  const auto& placeholders = input_arg_expansion.placeholders;
+  input_arg_expansions_.emplace(input_name, input_arg_expansion);
+  for (int i = 0; i < placeholders.size(); ++i) {
+    const string& placeholder = input_arg_expansion.placeholders[i];
+    input_arg_placeholders_.emplace(
+        placeholder, InputArgPlaceholder{input_name, /*position=*/i});
+  }
 }
 
 void GrapplerFunctionConnectivity::RegisterFunctionBodyOutputs(
     const string& node_name, const tensorflow::NameRangeMap& outputs) {
-  function_body_outputs_.insert({node_name, outputs});
+  function_body_outputs_[node_name] = outputs;
 }
 
 Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
     const string& func_def_input, std::vector<string>* graph_def_inputs) const {
   using ::tensorflow::strings::Scanner;
 
+  if (IsControlInput(func_def_input)) {
+    graph_def_inputs->push_back(func_def_input);
+    return Status::OK();
+  }
+
   // Parse input format: "node_name[:node_output][:position]"
   string node_name;
   string node_output;
@@ -150,11 +205,8 @@ Status GrapplerFunctionConnectivity::ExpandNodeInputs(
   std::vector<string> expanded_inputs;
 
   for (const string& function_def_input : function_body_node->input()) {
-    if (!IsControlInput(function_def_input))
-      TF_RETURN_IF_ERROR(
-          ExpandFunctionDefInput(function_def_input, &expanded_inputs));
-    else
-      expanded_inputs.push_back(function_def_input);
+    TF_RETURN_IF_ERROR(
+        ExpandFunctionDefInput(function_def_input, &expanded_inputs));
   }
 
   function_body_node->clear_input();
@@ -163,10 +215,66 @@ Status GrapplerFunctionConnectivity::ExpandNodeInputs(
   return Status::OK();
 }
 
-Status GrapplerFunctionItemBuilder::GetTypeAttr(const string& type_attr_name,
-                                                DataType* data_type) const {
-  auto it = func_attr_->find(type_attr_name);
-  if (it == func_attr_->end()) {
+Status GrapplerFunctionConnectivity::AsFunctionDefInput(
+    const string& graph_def_input, string* func_def_input) const {
+  using gtl::FindOrNull;
+
+  if (IsControlInput(graph_def_input)) {
+    *func_def_input = graph_def_input;
+    return Status::OK();
+  }
+
+  int position;
+  string node_name = ParseNodeName(graph_def_input, &position);
+  CHECK_GE(position, 0);
+
+  // Check if it's an input arg placeholder
+  if (position == 0) {
+    const InputArgPlaceholder* placeholder =
+        FindOrNull(input_arg_placeholders_, node_name);
+    if (placeholder != nullptr) {
+      *func_def_input =
+          strings::StrCat(placeholder->input_name, ":", placeholder->position);
+      return Status::OK();
+    }
+  }
+
+  // It must be output from one of the function body nodes
+  const tensorflow::NameRangeMap* outputs_range_map =
+      FindOrNull(function_body_outputs_, node_name);
+  if (outputs_range_map != nullptr) {
+    for (const auto& el : *outputs_range_map) {
+      const auto& output_name = el.first;
+      const auto& output_range = el.second;
+      if (position >= output_range.first && position < output_range.second) {
+        int pos = position - output_range.first;
+        *func_def_input =
+            strings::StrCat(node_name, ":", output_name, ":", pos);
+        return Status::OK();
+      }
+    }
+  }
+
+  return errors::InvalidArgument("Unknown graph def input: ", graph_def_input);
+}
+
+Status GrapplerFunctionConnectivity::AsFunctionDefNode(
+    NodeDef* function_body_node) const {
+  string func_def_input;
+
+  for (int i = 0; i < function_body_node->input_size(); ++i) {
+    TF_RETURN_IF_ERROR(
+        AsFunctionDefInput(function_body_node->input(i), &func_def_input));
+    function_body_node->set_input(i, func_def_input);
+  }
+
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemInstantiation::GetTypeAttr(
+    const string& type_attr_name, DataType* data_type) const {
+  auto it = func_instantiation_attr_->find(type_attr_name);
+  if (it == func_instantiation_attr_->end()) {
     return errors::InvalidArgument("Type attribute ", type_attr_name,
                                    " is not defined");
   } else if (it->second.type() == DT_INVALID) {
@@ -178,31 +286,48 @@ Status GrapplerFunctionItemBuilder::GetTypeAttr(const string& type_attr_name,
   return Status::OK();
 }
 
-Status GrapplerFunctionItemBuilder::GetArgType(const OpDef::ArgDef& arg,
-                                               DataType* data_type) const {
+Status GrapplerFunctionItemInstantiation::GetArgType(
+    const OpDef::ArgDef& arg, DataType* data_type) const {
   if (arg.type() != DT_INVALID) {
     *data_type = arg.type();
   } else {
+    if (!arg.type_list_attr().empty() || !arg.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Arguments with sequence of tensors are not supported. Unsupported "
+          "argument name: ",
+          arg.name());
+    }
     TF_RETURN_IF_ERROR(GetTypeAttr(arg.type_attr(), data_type));
   }
   return Status::OK();
 }
 
 GrapplerFunctionItem::GrapplerFunctionItem(
-    const string& function_name,
+    const string& func_name, const AttrValueMap& func_attr,
     const std::vector<InputArgExpansion>& input_arg_expansions,
     const std::vector<OutputArgExpansion>& output_arg_expansions,
     GraphDef&& function_body)
-    : function_name_(function_name),
+    : func_attr_(func_attr),
       input_arg_expansions_(input_arg_expansions),
       output_arg_expansions_(output_arg_expansions) {
+  id = func_name;
+  // Fill the feed nodes with input placeholders
+  for (const InputArgExpansion& input_arg : input_arg_expansions_) {
+    for (const string& placeholder : input_arg.placeholders) {
+      feed.emplace_back(placeholder, Tensor());
+      input_arg_placeholders_.insert(placeholder);
+    }
+  }
+  // Fill the fetch nodes with outputs
+  for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
+    for (const string& output_tensor : output_arg.output_tensors) {
+      fetch.push_back(output_tensor);
+    }
+  }
+  // Swap the graph body
   graph.Swap(&function_body);
 }
 
-const string& GrapplerFunctionItem::function_name() const {
-  return function_name_;
-}
-
 const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
   return input_arg_expansions_;
 }
@@ -215,6 +340,11 @@ const std::size_t GrapplerFunctionItem::input_size() const {
   return input_arg_expansions_.size();
 }
 
+bool GrapplerFunctionItem::IsInputPlaceholder(const string& node_name) const {
+  return input_arg_placeholders_.find(node_name) !=
+         input_arg_placeholders_.end();
+}
+
 const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
   return output_arg_expansions_;
 }
@@ -227,10 +357,19 @@ const std::size_t GrapplerFunctionItem::output_size() const {
   return output_arg_expansions_.size();
 }
 
+const AttrValueMap& GrapplerFunctionItem::func_attr() const {
+  return func_attr_;
+}
+
 const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
 
 GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; }
 
+GrapplerFunctionItem& GrapplerFunctionItem::SwapFunctionBody(GraphDef&& other) {
+  graph.Swap(&other);
+  return *this;
+}
+
 std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
   std::vector<string> output_tensors;
   for (const OutputArgExpansion& output : item.outputs()) {
@@ -241,18 +380,27 @@ std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
   return output_tensors;
 }
 
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionLibraryDefinition& func_library, GrapplerFunctionItem* item) {
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const AttrValueMap& func_instantiation_attr,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
   const OpDef& signature = func.signature();
 
   if (signature.name().empty()) {
     return errors::InvalidArgument("Function name must be specified");
   }
 
-  // Helper methods to lookup function attributes
-  GrapplerFunctionItemBuilder builder(&func_attr);
+  // Function types will be resolved from function instantiation attributes. All
+  // other attributes will be lost during conversion to FunctionDef.
+  for (const OpDef::AttrDef& attr : signature.attr()) {
+    if (attr.type() != "type") {
+      return errors::InvalidArgument(
+          "Function signature must have only type attributes");
+    }
+  }
+
+  // Helper methods to lookup function instantiation attributes
+  GrapplerFunctionItemInstantiation instantiation(&func_instantiation_attr);
 
   // Mapping from FunctionDef input format (name[:output][:position]) to
   // GraphDef input format (name[:position])
@@ -260,7 +408,10 @@ Status MakeGrapplerFunctionItem(
 
   std::vector<InputArgExpansion> inputs;
   std::vector<OutputArgExpansion> outputs;
+
+  // Function body shares the library with the graph that instantiated it.
   GraphDef function_body;
+  *function_body.mutable_library() = flib.ToProto();
 
   // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
 
@@ -284,7 +435,7 @@ Status MakeGrapplerFunctionItem(
     }
 
     DataType input_data_type;
-    TF_RETURN_IF_ERROR(builder.GetArgType(input, &input_data_type));
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(input, &input_data_type));
 
     NodeDef* placeholder = function_body.add_node();
     placeholder->set_name(input.name());
@@ -292,6 +443,7 @@ Status MakeGrapplerFunctionItem(
     (*placeholder->mutable_attr())["T"].set_type(input_data_type);
 
     InputArgExpansion input_expansion{/*input_name=*/input.name(),
+                                      /*data_type=*/input_data_type,
                                       /*placeholders=*/{input.name()}};
     connectivity.RegisterInputArgExpansion(input_expansion);
     inputs.push_back(input_expansion);
@@ -302,24 +454,12 @@ Status MakeGrapplerFunctionItem(
     NodeDef* new_node = function_body.add_node();
     *new_node = func_def_node;
 
-    // Replace the placeholder attribute values with the specified value
-    for (auto& attr : *new_node->mutable_attr()) {
-      const string& ph_name = attr.second.placeholder();
-      auto it = func_attr.find(ph_name);
-      if (it != func_attr.end()) {
-        attr.second = it->second;
-      }
-    }
-
-    // Functions use a custom format to encode connectivity. Map these custom
-    // strings to regular ones.
-    tensorflow::NameRangeMap outputs_range_map;
-    const OpRegistrationData* registration;
-    TF_RETURN_IF_ERROR(func_library.LookUp(func_def_node.op(), &registration));
-    TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(
-        func_def_node, registration->op_def, nullptr, &outputs_range_map));
-    connectivity.RegisterFunctionBodyOutputs(func_def_node.name(),
-                                             outputs_range_map);
+    // Resolve all placeholder values using function instantiation attributes.
+    TF_RETURN_IF_ERROR(ResolveFunctionBodyNodeAttrPlaceholders(
+        func_instantiation_attr, new_node));
+    // Register node output range in a function connectivity.
+    TF_RETURN_IF_ERROR(
+        RegisterFunctionBodyOutputs(flib, func_def_node, &connectivity));
   }
 
   // Rewrite inputs to use GraphDef format
@@ -331,20 +471,96 @@ Status MakeGrapplerFunctionItem(
   for (const OpDef::ArgDef& out : signature.output_arg()) {
     std::vector<string> output_tensors;
     auto ret = func.ret().find(out.name());
-    if (ret != func.ret().end()) {
-      // Expand outputs using provided output mapping
-      TF_RETURN_IF_ERROR(
-          connectivity.ExpandFunctionDefInput(ret->second, &output_tensors));
-    } else {
-      // Otherwise output must be one of the function inputs
-      TF_RETURN_IF_ERROR(
-          connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
+    TF_RETURN_IF_ERROR(
+        ret != func.ret().end()
+            // Expand outputs using provided output mapping
+            ? connectivity.ExpandFunctionDefInput(ret->second, &output_tensors)
+            // Otherwise output must be one of the function inputs
+            : connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
+
+    DataType output_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+
+    OutputArgExpansion output{/*output_name=*/out.name(),
+                              /*data_type=*/output_data_type,
+                              /*output_tensors=*/output_tensors};
+    outputs.push_back(output);
+  }
+
+  *item = GrapplerFunctionItem(
+      /*func_name=*/signature.name(),
+      /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
+      inputs, outputs, std::move(function_body));
+  return Status::OK();
+}
+
+// Register GrapplerFunctionItem input arg expansion and function body outputs
+// in the GrapplerFunctionConnectivity
+Status RegisterGrapplerFunctionConnectivity(
+    const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
+    GrapplerFunctionConnectivity* connectivity) {
+  for (const InputArgExpansion& input : item.inputs()) {
+    connectivity->RegisterInputArgExpansion(input);
+  }
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    TF_RETURN_IF_ERROR(
+        RegisterFunctionBodyOutputs(flib, func_body_node, connectivity));
+  }
+  return Status::OK();
+}
+
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
+                                  const FunctionLibraryDefinition& flib,
+                                  FunctionDef* func) {
+  func->mutable_signature()->set_name(item.id);
+
+  // Build a GrapplerFunctionConnectivity from inputs and new function body.
+  GrapplerFunctionConnectivity connectivity;
+  TF_RETURN_IF_ERROR(
+      RegisterGrapplerFunctionConnectivity(item, flib, &connectivity));
+
+  // Add function input arguments.
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    OpDef::ArgDef arg_def;
+    arg_def.set_name(input_arg.input_name);
+    arg_def.set_type(input_arg.data_type);
+    *func->mutable_signature()->add_input_arg() = arg_def;
+  }
+
+  // Add function output arguments.
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    OpDef::ArgDef arg_def;
+    arg_def.set_name(output_arg.output_name);
+    arg_def.set_type(output_arg.data_type);
+    *func->mutable_signature()->add_output_arg() = arg_def;
+
+    CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
+        << "Outputs of tensor sequences are not supported";
+
+    string ret;
+    for (const string& output_tensor : output_arg.output_tensors) {
+      TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(output_tensor, &ret));
+      (*func->mutable_ret())[output_arg.output_name] = ret;
     }
-    outputs.push_back({out.name(), output_tensors});
   }
 
-  *item = GrapplerFunctionItem(signature.name(), inputs, outputs,
-                               std::move(function_body));
+  // Copy function definition specific attributes.
+  for (const auto& attr : item.func_attr()) {
+    const auto& attr_name = attr.first;
+    const auto& attr_value = attr.second;
+    (*func->mutable_attr())[attr_name] = attr_value;
+  }
+
+  // Copy function body nodes to the FunctionDef and update input format
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    // Do not copy input placeholders
+    if (item.IsInputPlaceholder(func_body_node.name())) continue;
+
+    NodeDef* func_def_node = func->add_node_def();
+    *func_def_node = func_body_node;
+    TF_RETURN_IF_ERROR(connectivity.AsFunctionDefNode(func_def_node));
+  }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 60ea8857c0..2ac3917a66 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -28,14 +28,19 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+using AttrValueMap = std::unordered_map<string, AttrValue>;
+
 // Depending on the function instantiation attributes, input argument to the
 // function might be a single tensor, list of tensors of the same type, or a
 // list of tensors of different types.
 //
 // InputArgExpansion keeps track of the placeholders that were added to the
-// function body in place of function inputs.
+// function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
+  // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
+  // different data types
   string input_name;                 // name of the function input argument
+  DataType data_type;                // input data type
   std::vector<string> placeholders;  // names of placeholder nodes in the
                                      // function body
 };
@@ -44,11 +49,14 @@ struct InputArgExpansion {
 // to one or more outputs of one of the function body nodes.
 //
 // OutputArgExpansion keeps mapping from a function output arg to the output
-// tensors of a function body nodes, that compute function outputs.
+// tensors of a function body nodes and a resolved output data type
 struct OutputArgExpansion {
+  // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
+  // different data types
   string output_name;                  // name of the function output argument
-  std::vector<string> output_tensors;  // names of output tensors from the
-                                       // function body graph nodes
+  DataType data_type;                  // output data type
+  std::vector<string> output_tensors;  // names of output tensor from the
+                                       // function body nodes
 };
 
 // FunctionDef uses different connectivity encoding for the function body nodes,
@@ -67,26 +75,46 @@ class GrapplerFunctionConnectivity {
   Status ExpandFunctionDefInput(const string& func_def_input,
                                 std::vector<string>* graph_def_inputs) const;
 
-  // Update Node inputs from FunctionDef to GraphDef format
+  // Update Node inputs from FunctionDef to GraphDef format.
   Status ExpandNodeInputs(NodeDef* function_body_node) const;
 
-  // TODO(ezhulenev): fold GraphDef inputs back to FunctionDef format
-  // Status FoldGraphDefInputs(const std::vector<sting> graph_def_inputs,
-  //                          std::vector<string>* function_def_inputs) const;
+  // When expanding inputs in function def format, single input might be
+  // expanded into multiple tensors. When converting back to the function def
+  // format from graph def format, it's always a 1-to-1 relationship.
+  // FunctionDef built from GrapplerFunctionItem is always specialized to it's
+  // instantiation attributes and length of input args (and node def outputs) is
+  // known.
+
+  // Map from GraphDef input format to FunctionDef input format using registered
+  // input arg expansion and function body outputs.
+  Status AsFunctionDefInput(const string& graph_def_input,
+                            string* func_def_input) const;
+
+  // Update Node inputs from GraphDef to FunctionDef format.
+  Status AsFunctionDefNode(NodeDef* function_body_node) const;
 
  private:
+  // Mapping from input name to input arg expansion.
   std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  // Mapping from function body node name to output names range map.
   std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+
+  struct InputArgPlaceholder {
+    string input_name;
+    int position;
+  };
+
+  // Mapping from input arg placeholder to the function input tensor.
+  std::unordered_map<string, InputArgPlaceholder> input_arg_placeholders_;
 };
 
-// Helper methods to build GrapplerFunctionItem from a function def and function
-// attributes.
-class GrapplerFunctionItemBuilder {
+// Get Function type attributes using attributes of a node that instantiated
+// a function.
+class GrapplerFunctionItemInstantiation {
  public:
-  using FunctionAttr = std::unordered_map<string, AttrValue>;
-
-  explicit GrapplerFunctionItemBuilder(const FunctionAttr* func_attr)
-      : func_attr_(func_attr) {}
+  explicit GrapplerFunctionItemInstantiation(
+      const AttrValueMap* func_instantiation_attr)
+      : func_instantiation_attr_(func_instantiation_attr) {}
 
   // Get DataType from attributes by name. Return error if attribute is missing,
   // or it doesn't define a valid data type.
@@ -97,20 +125,20 @@ class GrapplerFunctionItemBuilder {
   Status GetArgType(const OpDef::ArgDef& arg, DataType* data_type) const;
 
  private:
-  const FunctionAttr* func_attr_;  // do not own
+  const AttrValueMap* func_instantiation_attr_;  // do not own
 };
 
 // A special case of GrapplerItem, constructed from a TensorFlow Function.
 class GrapplerFunctionItem : public GrapplerItem {
  public:
-  GrapplerFunctionItem() {}
+  GrapplerFunctionItem() = default;
   GrapplerFunctionItem(
-      const string& function_name,
+      const string& func_name, const AttrValueMap& func_attr,
       const std::vector<InputArgExpansion>& input_arg_expansions,
       const std::vector<OutputArgExpansion>& output_arg_expansions,
       GraphDef&& function_body);
 
-  const string& function_name() const;
+  bool IsInputPlaceholder(const string& node_name) const;
 
   const std::vector<InputArgExpansion>& inputs() const;
   const InputArgExpansion& input(int i) const;
@@ -120,13 +148,20 @@ class GrapplerFunctionItem : public GrapplerItem {
   const OutputArgExpansion& output(int i) const;
   const std::size_t output_size() const;
 
+  const AttrValueMap& func_attr() const;
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
 
+  GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
+
  private:
-  string function_name_;
+  AttrValueMap func_attr_;  // Attributes specific to function definition that
+                            // produced this item (FuncDef.attr field).
+
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
+
+  std::set<string> input_arg_placeholders_;
 };
 
 // Return all output tensors referenced by item output args.
@@ -136,8 +171,21 @@ std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
 // Return error if the given function def cannot be converted.
 Status MakeGrapplerFunctionItem(
     const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionLibraryDefinition& func_library, GrapplerFunctionItem* item);
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Register GrapplerFunctionItem input arg expansion and function body outputs
+// in the GrapplerFunctionConnectivity.  Use function library definition to
+// lookup function body nodes output names and ranges.
+Status RegisterGrapplerFunctionConnectivity(
+    const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
+    GrapplerFunctionConnectivity* connectivity);
+
+// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
+// library definition to lookup function body nodes output names and ranges.
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
+                                  const FunctionLibraryDefinition& flib,
+                                  FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 1eb3298e89..a9a708bf67 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
@@ -32,8 +33,9 @@ class FunctionsTest : public ::testing::Test {};
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", {"inputA"}});
-  connectivity.RegisterInputArgExpansion({"inputB", {"inputB_0", "inputB_1"}});
+  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
 
   connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
   connectivity.RegisterFunctionBodyOutputs("Func",
@@ -93,11 +95,50 @@ TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
   EXPECT_EQ("Func:3", inputs[0]);
 }
 
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+
+  connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
+  connectivity.RegisterFunctionBodyOutputs("Func",
+                                           {{"o1", {0, 2}}, {"o2", {2, 4}}});
+
+  string input;
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputA", &input));
+  EXPECT_EQ("inputA:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputB_0", &input));
+  EXPECT_EQ("inputB:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputB_1", &input));
+  EXPECT_EQ("inputB:1", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Add", &input));
+  EXPECT_EQ("Add:z:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func", &input));
+  EXPECT_EQ("Func:o1:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:1", &input));
+  EXPECT_EQ("Func:o1:1", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:2", &input));
+  EXPECT_EQ("Func:o2:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:3", &input));
+  EXPECT_EQ("Func:o2:1", input);
+}
+
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandNodeInputs) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", {"inputA"}});
-  connectivity.RegisterInputArgExpansion({"inputB", {"inputB_0", "inputB_1"}});
+  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
 
   NodeDef node;
   node.add_input("inputA:0");
@@ -131,12 +172,12 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
-  EXPECT_EQ("XTimesTwo", item.function_name());
+  EXPECT_EQ("XTimesTwo", item.id);
   EXPECT_EQ(4, item.function_body().node_size());
 
   EXPECT_EQ(1, item.input_size());
@@ -206,12 +247,12 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
-  EXPECT_EQ("SubGrad", item.function_name());
+  EXPECT_EQ("SubGrad", item.id);
   EXPECT_EQ(12, item.function_body().node_size());
 
   ASSERT_EQ(3, item.input_size());
@@ -251,8 +292,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
-  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
-  TF_ASSERT_OK(library.AddFunctionDef(FunctionDefHelper::Define(
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+  TF_ASSERT_OK(flib.AddFunctionDef(FunctionDefHelper::Define(
       // Name
       "Swap",
       // Args
@@ -290,7 +331,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
   func_attr["T"].set_type(DT_FLOAT);
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -348,10 +389,10 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       {{"out", "Exp:y:0"}});
 
   std::unordered_map<string, AttrValue> func_attr;
-  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   EXPECT_EQ(1, item.output_size());
   EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
@@ -391,12 +432,12 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
       {{"out0", "in0"}});
 
   std::unordered_map<string, AttrValue> func_attr;
-  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
-  EXPECT_EQ("ForwardInputs", item.function_name());
+  EXPECT_EQ("ForwardInputs", item.id);
   EXPECT_EQ(5, item.function_body().node_size());
 
   EXPECT_EQ(3, item.output_size());
@@ -437,10 +478,10 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionLibraryDefinition library(OpRegistry::Global(), FunctionDefLibrary());
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, library, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   EXPECT_EQ(0, item.input_size());
   EXPECT_EQ(1, item.output_size());
@@ -456,6 +497,104 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
+TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "XTimesTwo",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
+      });
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+
+  // Input and output types are resolved based on instantiation attributes.
+  EXPECT_EQ("x", specialized.signature().input_arg(0).name());
+  EXPECT_EQ(DT_FLOAT, specialized.signature().input_arg(0).type());
+  EXPECT_EQ("y", specialized.signature().output_arg(0).name());
+  EXPECT_EQ(DT_FLOAT, specialized.signature().output_arg(0).type());
+
+  // Function body specialized for instantiation types
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "scale" && count++) {
+      EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
+    } else if (node.name() == "y" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("x:0", node.input(0));
+      EXPECT_EQ("scale:y:0", node.input(1));
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+    }
+  }
+  EXPECT_EQ(2, count);
+}
+
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
+  using test::function::NDef;
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GraphDef id_func_body = test::function::GDef(
+      {/* pass input to output through identity */
+       NDef("output", "Identity", {"x"}, {{"T", "float"}})});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+
+  FunctionDefLibrary lib_def;
+  *lib_def.add_function() = func;
+  *lib_def.add_function() = mul_func;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def);
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  // Replace function body with identity function
+  item.SwapFunctionBody(std::move(id_func_body));
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+
+  // Check that graph body was updated.
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "output" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ("x:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(1, count);
+
+  // And return tensor mapping was updated with a new output name (z->output).
+  EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From c4526e50b2ac2d6819c8eb67db5423af103a1bb7 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 12 Apr 2018 18:36:13 -0700
Subject: [PATCH 0742/1262] Avoid calling K.learning_phase() when not necessary
 in Dropout layer since it instantiates a placeholder_with_default, which is
 not supported by TPU compilation.

PiperOrigin-RevId: 192705478
---
 tensorflow/python/keras/_impl/keras/layers/core.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index f64174a23f..9c4cb0f4fd 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -130,6 +130,7 @@ class Dropout(Layer):
     return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
 
   def call(self, inputs, training=None):
+    original_training_value = training
     if training is None:
       training = K.learning_phase()
 
@@ -141,7 +142,7 @@ class Dropout(Layer):
                                  dropped_inputs,
                                  lambda: array_ops.identity(inputs))
     # EagerTensor object has no attribute _uses_learning_phase
-    if not context.executing_eagerly() and training is K.learning_phase():
+    if not context.executing_eagerly() and original_training_value is None:
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
-- 
GitLab


From 5a6d5a1b3982e59548340422f831ada6f5d5e0be Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 12 Apr 2018 19:01:10 -0700
Subject: [PATCH 0743/1262] Enable efficient feeding of symbolic tensors to
 placeholders in the Keras backend.

PiperOrigin-RevId: 192707345
---
 .../python/keras/_impl/keras/backend.py       | 110 ++++++++++++++----
 .../python/keras/_impl/keras/backend_test.py  |  43 ++++++-
 .../keras/_impl/keras/integration_test.py     |   2 +-
 3 files changed, 124 insertions(+), 31 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 096db8db32..6647cc5b79 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -2760,8 +2760,7 @@ class Function(object):
       outputs: Output tensors to fetch.
       updates: Additional update ops to be run at function call.
       name: A name to help users identify what this function does.
-      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`,
-        `options`, `run_metadata`
+      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`.
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None,
@@ -2795,19 +2794,74 @@ class Function(object):
     self.fetches = session_kwargs.pop('fetches', [])
     if not isinstance(self.fetches, list):
       self.fetches = [self.fetches]
+    # The main use case of `fetches` being passed to a model is the ability
+    # to run custom updates (since the outputs of fetches are never returned).
+    # This requires us to wrap fetches in `identity` ops.
+    self.fetches = [array_ops.identity(x) for x in self.fetches]
     self.session_kwargs = session_kwargs
 
+    if session_kwargs:
+      raise ValueError('Some keys in session_kwargs are not supported at this '
+                       'time: %s', session_kwargs.keys())
+
+    self._callable_fn = None
+    self._feed_arrays = None
+    self._feed_symbols = None
+    self._symbol_vals = None
+    self._session = None
+
+  def _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session):
+    """Generates a callable that runs the graph.
+
+    Arguments:
+      feed_arrays: List of input tensors to be fed Numpy arrays at runtime.
+      feed_symbols: List of input tensors to be fed symbolic tensors at runtime.
+      symbol_vals: List of symbolic tensors to be fed to `feed_symbols`.
+      session: Session to use to generate the callable.
+
+    Returns:
+      Function that runs the graph according to the above options.
+    """
+    # Prepare callable options.
+    callable_opts = config_pb2.CallableOptions()
+    # Handle external-data feed.
+    for x in feed_arrays:
+      callable_opts.feed.append(x.name)
+    if self.feed_dict:
+      for key in sorted(self.feed_dict.keys()):
+        callable_opts.feed.append(key.name)
+    # Handle symbolic feed.
+    for x, y in zip(feed_symbols, symbol_vals):
+      connection = callable_opts.tensor_connection.add()
+      from_tensor = ops._as_graph_element(y)
+      if from_tensor is None:
+        from_tensor = y
+      connection.from_tensor = from_tensor.name  # Data tensor
+      connection.to_tensor = x.name  # Placeholder
+    # Handle fetches.
+    for x in self.outputs + self.fetches:
+      callable_opts.fetch.append(x.name)
+    # Handle updates.
+    callable_opts.target.append(self.updates_op.name)
+    # Create callable.
+    callable_fn = session._make_callable_from_options(callable_opts)
+    # Cache parameters corresponding to the generated callable, so that
+    # we can detect future mismatches and refresh the callable.
+    self._callable_fn = callable_fn
+    self._feed_arrays = feed_arrays
+    self._feed_symbols = feed_symbols
+    self._symbol_vals = symbol_vals
+    self._session = session
+
   def __call__(self, inputs):
     if not isinstance(inputs, (list, tuple)):
       raise TypeError('`inputs` should be a list or tuple.')
 
-    if self.feed_dict:
-      feed_dict = self.feed_dict.copy()
-    else:
-      feed_dict = {}
-
     session = get_session()
-    data_tensors_to_feed = []
+    feed_arrays = []
+    array_vals = []
+    feed_symbols = []
+    symbol_vals = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
         continue
@@ -2816,23 +2870,31 @@ class Function(object):
         indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
-      elif tensor_util.is_tensor(value):
-        data_tensors_to_feed.append((tensor, value))
+      if tensor_util.is_tensor(value):
+        # Case: feeding symbolic tensor.
+        feed_symbols.append(tensor)
+        symbol_vals.append(value)
       else:
-        feed_dict[tensor] = value
-
-    if data_tensors_to_feed:
-      # This is a *temporary* workaround (i.e. hack) to feed a symbolic tensor
-      # to `feed_dict`. It is very inefficient. It will be removed as soon
-      # as it becomes possible to pass symbolic tensors to `feed_dict`.
-      data_tensor_values = session.run([x[1] for x in data_tensors_to_feed])
-      for i, v in enumerate(data_tensor_values):
-        feed_dict[data_tensors_to_feed[i][0]] = v
-
-    fetches = self.outputs + [self.updates_op] + self.fetches
-    updated = session.run(
-        fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
-    return updated[:len(self.outputs)]
+        # Case: feeding Numpy array.
+        feed_arrays.append(tensor)
+        # We need to do array conversion and type casting at this level, since
+        # `callable_fn` only supports exact matches.
+        array_vals.append(np.asarray(value, dtype=tensor.dtype.base_dtype.name))
+    if self.feed_dict:
+      for key in sorted(self.feed_dict.keys()):
+        array_vals.append(
+            np.asarray(self.feed_dict[key], dtype=key.dtype.base_dtype.name))
+
+    # Refresh callable if anything has changed.
+    if (self._callable_fn is None or
+        feed_arrays != self._feed_arrays or
+        symbol_vals != self._symbol_vals or
+        feed_symbols != self._feed_symbols or
+        session != self._session):
+      self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
+
+    fetched = self._callable_fn(*array_vals)
+    return fetched[:len(self.outputs)]
 
 
 @tf_export('keras.backend.function')
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index fb4b2a0e1d..0193fc6976 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -189,6 +189,34 @@ class BackendUtilsTest(test.TestCase):
     for y in ys:
       self.assertEqual(y.op.name[:12], 'StopGradient')
 
+  def test_function_tf_feed_symbols(self):
+    with self.test_session():
+      # Test feeding a resource variable to `function`.
+      x1 = keras.backend.placeholder(shape=())
+      x2 = keras.backend.placeholder(shape=())
+      lr = keras.backend.learning_phase()  # Include a placeholder_with_default.
+
+      y1 = keras.backend.variable(10.)
+      y2 = 3
+
+      f = keras.backend.function(
+          inputs=[x1, x2, lr],
+          outputs=[x1 + 1,
+                   keras.backend.in_train_phase(x2 + 2, x2 - 1)])
+      outs = f([y1, y2, None])  # Use default learning_phase value.
+      self.assertEqual(outs, [11., 2.])
+      outs = f([y1, y2, 1])  # Set learning phase value.
+      self.assertEqual(outs, [11., 5.])
+
+      # Test triggering a callable refresh by changing the input.
+      y3 = keras.backend.constant(20.)  # Test with tensor
+      outs = f([y3, y2, None])
+      self.assertEqual(outs, [21., 2.])
+
+      y4 = 4  # Test with non-symbol
+      outs = f([y4, y2, None])
+      self.assertEqual(outs, [5., 2.])
+
   def test_function_tf_fetches(self):
     # Additional operations can be passed to tf.Session().run() via its
     # `fetches` arguments. In contrast to `updates` argument of
@@ -206,8 +234,9 @@ class BackendUtilsTest(test.TestCase):
                                  updates=[(x, x_placeholder + 1.)],
                                  fetches=[keras.backend.update(y, 5.)])
       output = f([10., 20.])
-      assert output == [30.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [11., 5.]
+      self.assertEqual(output, [30.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [11., 5.])
 
   def test_function_tf_feed_dict(self):
     # Additional substitutions can be passed to `tf.Session().run()` via its
@@ -229,14 +258,16 @@ class BackendUtilsTest(test.TestCase):
                                  feed_dict=feed_dict,
                                  fetches=fetches)
       output = f([10.])
-      assert output == [11.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [20., 30.]
+      self.assertEqual(output, [11.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [20., 30.])
 
       # updated value in feed_dict will be modified within the K.function()
       feed_dict[y_placeholder] = 4.
       output = f([20.])
-      assert output == [21.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [30., 40.]
+      self.assertEqual(output, [21.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [30., 40.])
 
 
 class BackendVariableTest(test.TestCase):
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index c44808421f..43aff67ef9 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -95,7 +95,7 @@ class KerasIntegrationTest(test.TestCase):
       model.compile(loss='categorical_crossentropy',
                     optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
                           validation_data=(x_train, y_train),
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
-- 
GitLab


From 4f615adc1d7875f9fbe592619dc6b0f31cc7fd9e Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 12 Apr 2018 19:13:18 -0700
Subject: [PATCH 0744/1262] Automated g4 rollback of changelist 192691078

PiperOrigin-RevId: 192708480
---
 tensorflow/contrib/BUILD                          |  1 -
 tensorflow/contrib/__init__.py                    |  1 -
 tensorflow/contrib/cmake/tf_python.cmake          |  6 ++----
 .../python/kernel_tests/decode_proto_fail_test.py |  4 ++--
 .../python/kernel_tests/decode_proto_op_test.py   |  4 ++--
 .../python/kernel_tests/encode_proto_op_test.py   | 15 +++++++--------
 6 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 7e47516550..192d053683 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -77,7 +77,6 @@ py_library(
         "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
-        "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 36cc5144d0..e02dd5e759 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -64,7 +64,6 @@ from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
-from tensorflow.contrib import proto
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
 from tensorflow.contrib import recurrent
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index f6aaf41f73..9d9db82513 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,10 +330,8 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
-GENERATE_PYTHON_OP_LIB("decode_proto_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
-GENERATE_PYTHON_OP_LIB("encode_proto_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("decode_proto_ops")
+GENERATE_PYTHON_OP_LIB("encode_proto_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
index f8969b0bd5..f019833905 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.proto import decode_proto
+from tensorflow.contrib import proto
 from tensorflow.contrib.proto.python.kernel_tests import test_case
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -46,7 +46,7 @@ class DecodeProtoFailTest(test_case.ProtoOpTestCase):
     field_types = [dtypes.int32]
 
     with self.test_session() as sess:
-      ctensor, vtensor = decode_proto(
+      ctensor, vtensor = proto.decode_proto(
           batch,
           message_type=msg_type,
           field_names=field_names,
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
index cd5121cdba..30ceac5f5f 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -27,7 +27,7 @@ import numpy as np
 
 from google.protobuf import text_format
 
-from tensorflow.contrib.proto import decode_proto
+from tensorflow.contrib import proto
 from tensorflow.contrib.proto.python.kernel_tests import test_case
 from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
 from tensorflow.python.framework import dtypes
@@ -175,7 +175,7 @@ class DecodeProtoOpTest(test_case.ProtoOpTestCase):
     output_types = [f.dtype for f in fields]
 
     with self.test_session() as sess:
-      sizes, vtensor = decode_proto(
+      sizes, vtensor = proto.decode_proto(
           batch,
           message_type=message_type,
           field_names=field_names,
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
index a289ff290a..2a24c3b8ce 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -30,8 +30,7 @@ import numpy as np
 
 from google.protobuf import text_format
 
-from tensorflow.contrib.proto import decode_proto
-from tensorflow.contrib.proto import encode_proto
+from tensorflow.contrib import proto
 from tensorflow.contrib.proto.python.kernel_tests import test_case
 from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
 from tensorflow.python.framework import dtypes
@@ -51,7 +50,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
     # Invalid field name
     with self.test_session():
       with self.assertRaisesOpError('Unknown field: non_existent_field'):
-        encode_proto(
+        proto.encode_proto(
             sizes=[[1]],
             values=[np.array([[0.0]], dtype=np.int32)],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -61,7 +60,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
     with self.test_session():
       with self.assertRaisesOpError(
           'Incompatible type for field double_value.'):
-        encode_proto(
+        proto.encode_proto(
             sizes=[[1]],
             values=[np.array([[0.0]], dtype=np.int32)],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -73,7 +72,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
           r'sizes should be batch_size \+ \[len\(field_names\)\]'):
         sizes = array_ops.placeholder(dtypes.int32)
         values = array_ops.placeholder(dtypes.float64)
-        encode_proto(
+        proto.encode_proto(
             sizes=sizes,
             values=[values],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -89,7 +88,7 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
         sizes = array_ops.placeholder(dtypes.int32)
         values1 = array_ops.placeholder(dtypes.float64)
         values2 = array_ops.placeholder(dtypes.int32)
-        (encode_proto(
+        (proto.encode_proto(
             sizes=[[1, 1]],
             values=[values1, values2],
             message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
@@ -104,13 +103,13 @@ class EncodeProtoOpTest(test_case.ProtoOpTestCase):
     out_types = [f.dtype for f in fields]
 
     with self.test_session() as sess:
-      sizes, field_tensors = decode_proto(
+      sizes, field_tensors = proto.decode_proto(
           in_bufs,
           message_type=message_type,
           field_names=field_names,
           output_types=out_types)
 
-      out_tensors = encode_proto(
+      out_tensors = proto.encode_proto(
           sizes,
           field_tensors,
           message_type=message_type,
-- 
GitLab


From 3c9870524b86fe7e3cff5a49daa692cd52e7f0c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 12 Apr 2018 19:52:18 -0700
Subject: [PATCH 0745/1262] Add boolean type to tflite in favor of comparison
 implementations.

PiperOrigin-RevId: 192711203
---
 tensorflow/contrib/lite/context.h                 |  2 ++
 tensorflow/contrib/lite/interpreter.cc            |  8 ++++++--
 tensorflow/contrib/lite/interpreter.h             |  4 ++++
 tensorflow/contrib/lite/kernels/internal/tensor.h |  5 +++++
 tensorflow/contrib/lite/model.cc                  |  3 +++
 tensorflow/contrib/lite/optional_debug_tools.cc   |  2 ++
 .../interpreter_wrapper/interpreter_wrapper.cc    |  4 ++++
 tensorflow/contrib/lite/schema/schema.fbs         |  1 +
 tensorflow/contrib/lite/schema/schema_generated.h |  9 ++++++---
 tensorflow/contrib/lite/testing/split.h           | 10 ++++++++++
 tensorflow/contrib/lite/testing/split_test.cc     |  5 +++++
 tensorflow/contrib/lite/testing/tflite_driver.cc  | 15 +++++++++++++++
 12 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 45184b05ec..0b38f43cd3 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -137,6 +137,7 @@ typedef enum {
   kTfLiteUInt8 = 3,
   kTfLiteInt64 = 4,
   kTfLiteString = 5,
+  kTfLiteBool = 6,
 } TfLiteType;
 
 // Parameters for asymmetric quantization. Quantized values can be converted
@@ -155,6 +156,7 @@ typedef union {
   char* raw;
   const char* raw_const;
   uint8_t* uint8;
+  bool* b;
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 4575fe884d..f258654608 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -337,9 +337,13 @@ TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
     case kTfLiteInt64:
       *bytes = sizeof(int64_t) * count;
       break;
+    case kTfLiteBool:
+      *bytes = sizeof(bool) * count;
+      break;
     default:
-      ReportError(&context_,
-                  "Only float32, int32, int64, uint8 supported currently.");
+      ReportError(
+          &context_,
+          "Only float32, int32, int64, uint8, bool supported currently.");
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index a6d582a813..df67cce9de 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -48,6 +48,10 @@ template <>
 constexpr TfLiteType typeToTfLiteType<unsigned char>() {
   return kTfLiteUInt8;
 }
+template <>
+constexpr TfLiteType typeToTfLiteType<bool>() {
+  return kTfLiteBool;
+}
 
 // Forward declare since NNAPIDelegate uses Interpreter.
 class NNAPIDelegate;
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 4bce2ffaaf..62cea143e6 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -44,6 +44,11 @@ inline int64_t* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.i64 : nullptr;
 }
 
+template <>
+inline bool* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.b : nullptr;
+}
+
 inline int RemapDim(int max_dimensions, int d) {
   return max_dimensions - d - 1;
 }
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 87af953061..0b65884025 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -57,6 +57,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
     case TensorType_STRING:
       *type = kTfLiteString;
       break;
+    case TensorType_BOOL:
+      *type = kTfLiteBool;
+      break;
     default:
       error_reporter->Report("Unimplemented data type %s (%d) in tensor\n",
                              EnumNameTensorType(tensor_type), tensor_type);
diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
index 1f762e6688..e1366639c7 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -48,6 +48,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteInt64";
     case kTfLiteString:
       return "kTfLiteString";
+    case kTfLiteBool:
+      return "kTfLiteBool";
   }
   return "(invalid)";
 }
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 4b34969356..04fc098129 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -72,6 +72,8 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
       return NPY_INT64;
     case kTfLiteString:
       return NPY_OBJECT;
+    case kTfLiteBool:
+      return NPY_BOOL;
     case kTfLiteNoType:
       return -1;
   }
@@ -90,6 +92,8 @@ TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
       return kTfLiteUInt8;
     case NPY_INT64:
       return kTfLiteInt64;
+    case NPY_BOOL:
+      return kTfLiteBool;
     case NPY_OBJECT:
     case NPY_STRING:
     case NPY_UNICODE:
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 357493755d..fa825500fd 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -33,6 +33,7 @@ enum TensorType : byte {
   UINT8 = 3,
   INT64 = 4,
   STRING = 5,
+  BOOL = 6,
 }
 
 // Parameters for converting a quantized tensor back to float. Given a
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index c638daf66e..909c4ccb3b 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -173,18 +173,20 @@ enum TensorType {
   TensorType_UINT8 = 3,
   TensorType_INT64 = 4,
   TensorType_STRING = 5,
+  TensorType_BOOL = 6,
   TensorType_MIN = TensorType_FLOAT32,
-  TensorType_MAX = TensorType_STRING
+  TensorType_MAX = TensorType_BOOL
 };
 
-inline TensorType (&EnumValuesTensorType())[6] {
+inline TensorType (&EnumValuesTensorType())[7] {
   static TensorType values[] = {
     TensorType_FLOAT32,
     TensorType_FLOAT16,
     TensorType_INT32,
     TensorType_UINT8,
     TensorType_INT64,
-    TensorType_STRING
+    TensorType_STRING,
+    TensorType_BOOL
   };
   return values;
 }
@@ -197,6 +199,7 @@ inline const char **EnumNamesTensorType() {
     "UINT8",
     "INT64",
     "STRING",
+    "BOOL",
     nullptr
   };
   return names;
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/contrib/lite/testing/split.h
index 428cfda4f2..896f2949ef 100644
--- a/tensorflow/contrib/lite/testing/split.h
+++ b/tensorflow/contrib/lite/testing/split.h
@@ -80,6 +80,16 @@ inline std::vector<uint8_t> Split(const string& s, const string& delimiter) {
   return fields;
 }
 
+template <>
+inline std::vector<bool> Split(const string& s, const string& delimiter) {
+  std::vector<bool> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(
+        static_cast<bool>(strtol(s.data() + p.first, nullptr, 10)));
+  }
+  return fields;
+}
+
 }  // namespace testing
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/split_test.cc b/tensorflow/contrib/lite/testing/split_test.cc
index 3d1e25d9c7..76b918cbcd 100644
--- a/tensorflow/contrib/lite/testing/split_test.cc
+++ b/tensorflow/contrib/lite/testing/split_test.cc
@@ -52,6 +52,11 @@ TEST(SplitTest, SplitUint8) {
   EXPECT_THAT(Split<uint8_t>("1,-1,258", ","), ElementsAre(1, 255, 2));
 }
 
+TEST(SplitTest, SplitBool) {
+  EXPECT_THAT(Split<bool>("1, 0, 0, 1", ","),
+              ElementsAre(true, false, false, true));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 3764bab035..58fe5bd6e4 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -42,6 +42,10 @@ template <>
 uint8_t Value(const TfLitePtrUnion& data, int index) {
   return data.uint8[index];
 }
+template <>
+bool Value(const TfLitePtrUnion& data, int index) {
+  return data.b[index];
+}
 
 template <typename T>
 void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
@@ -79,6 +83,8 @@ class TfLiteDriver::Expectation {
         return TypedCheck<int64_t>(verbose, tensor);
       case kTfLiteUInt8:
         return TypedCheck<uint8_t>(verbose, tensor);
+      case kTfLiteBool:
+        return TypedCheck<bool>(verbose, tensor);
       default:
         fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
         return false;
@@ -203,6 +209,12 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       SetTensorData(values, &tensor->data);
       break;
     }
+    case kTfLiteBool: {
+      const auto& values = testing::Split<bool>(csv_values, ",");
+      if (!CheckSizes<bool>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
       Invalidate("Unsupported tensor data type");
@@ -231,6 +243,9 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
     case kTfLiteUInt8:
       expected_output_[id]->SetData<uint8_t>(csv_values);
       break;
+    case kTfLiteBool:
+      expected_output_[id]->SetData<bool>(csv_values);
+      break;
     default:
       fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
       Invalidate("Unsupported tensor data type");
-- 
GitLab


From 3438c3f4f18e2057aee38d38537d96cc485b8fab Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 12 Apr 2018 19:56:38 -0700
Subject: [PATCH 0746/1262] Automated g4 rollback of changelist 192504411

PiperOrigin-RevId: 192711501
---
 tensorflow/contrib/proto/BUILD                |  16 -
 .../contrib/proto/python/kernel_tests/BUILD   |  81 -----
 .../proto/python/kernel_tests/build_defs.bzl  |  78 -----
 .../kernel_tests/decode_proto_fail_test.py    |  68 ----
 .../kernel_tests/decode_proto_op_test.py      | 300 ------------------
 .../kernel_tests/encode_proto_op_test.py      | 179 -----------
 .../python/kernel_tests/minmax.TestCase.pbtxt | 161 ----------
 .../python/kernel_tests/nested.TestCase.pbtxt |  16 -
 .../kernel_tests/optional.TestCase.pbtxt      |  20 --
 .../promote_unsigned.TestCase.pbtxt           |  21 --
 .../python/kernel_tests/ragged.TestCase.pbtxt |  32 --
 .../kernel_tests/shaped_batch.TestCase.pbtxt  |  62 ----
 .../python/kernel_tests/simple.TestCase.pbtxt |  21 --
 .../proto/python/kernel_tests/test_case.py    |  35 --
 .../python/kernel_tests/test_example.proto    | 149 ---------
 tensorflow/tools/pip_package/BUILD            |   1 -
 16 files changed, 1240 deletions(-)
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/BUILD
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_case.py
 delete mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index 3e9b1a0b8d..046652cbc5 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -4,8 +4,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-
 py_library(
     name = "proto",
     srcs = [
@@ -16,17 +14,3 @@ py_library(
         "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
     ],
 )
-
-py_library(
-    name = "proto_pip",
-    data = [
-        "//tensorflow/contrib/proto/python/kernel_tests:test_messages",
-    ] + if_static(
-        [],
-        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
-    ),
-    deps = [
-        ":proto",
-        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
-    ],
-)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
deleted file mode 100644
index 4125ea8a2a..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/BUILD
+++ /dev/null
@@ -1,81 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-# Much of the work in this BUILD file actually happens in the corresponding
-# build_defs.bzl, which creates an individual testcase for each example .pbtxt
-# file in this directory.
-#
-load(":build_defs.bzl", "decode_proto_test_suite")
-load(":build_defs.bzl", "encode_proto_test_suite")
-
-# This expands to a tf_py_test for each test file.
-# It defines the test_suite :decode_proto_op_tests.
-decode_proto_test_suite(
-    name = "decode_proto_tests",
-    examples = glob(["*.pbtxt"]),
-)
-
-# This expands to a tf_py_test for each test file.
-# It defines the test_suite :encode_proto_op_tests.
-encode_proto_test_suite(
-    name = "encode_proto_tests",
-    examples = glob(["*.pbtxt"]),
-)
-
-# Below here are tests that are not tied to an example text proto.
-filegroup(
-    name = "test_messages",
-    srcs = glob(["*.pbtxt"]),
-)
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
-load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-
-tf_py_test(
-    name = "decode_proto_fail_test",
-    size = "small",
-    srcs = ["decode_proto_fail_test.py"],
-    additional_deps = [
-        ":py_test_deps",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/proto:proto",
-    ],
-    data = if_static(
-        [],
-        otherwise = [":libtestexample.so"],
-    ),
-)
-
-py_library(
-    name = "test_case",
-    srcs = ["test_case.py"],
-    deps = ["//tensorflow/python:client_testlib"],
-)
-
-py_library(
-    name = "py_test_deps",
-    deps = [
-        ":test_case",
-        ":test_example_proto_py",
-    ],
-)
-
-tf_proto_library(
-    name = "test_example_proto",
-    srcs = ["test_example.proto"],
-    cc_api_version = 2,
-    protodeps = ["//tensorflow/core:protos_all"],
-)
-
-tf_cc_shared_object(
-    name = "libtestexample.so",
-    linkstatic = 1,
-    deps = [
-        ":test_example_proto_cc",
-    ],
-)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
deleted file mode 100644
index 6fe48ae807..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
+++ /dev/null
@@ -1,78 +0,0 @@
-"""BUILD rules for generating file-driven proto test cases.
-
-The decode_proto_test_suite() and encode_proto_test_suite() rules take a list
-of text protos and generates a tf_py_test() for each one.
-"""
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-load("//tensorflow:tensorflow.bzl", "register_extension_info")
-load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-
-def _test_name(test, path):
-  return "%s_%s_test" % (test, path.split("/")[-1].split(".")[0])
-
-def decode_proto_test_suite(name, examples):
-  """Build the decode_proto py_test for each test filename."""
-  for test_filename in examples:
-    tf_py_test(
-        name = _test_name("decode_proto", test_filename),
-        srcs = ["decode_proto_op_test.py"],
-        size = "small",
-        data = [test_filename] + if_static(
-            [],
-            otherwise = [":libtestexample.so"],
-        ),
-        main = "decode_proto_op_test.py",
-        args = [
-            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
-        ],
-        additional_deps = [
-            ":py_test_deps",
-            "//third_party/py/numpy",
-            "//tensorflow/contrib/proto:proto",
-        ],
-    )
-  native.test_suite(
-      name = name,
-      tests = [":" + _test_name("decode_proto", test_filename)
-               for test_filename in examples],
-  )
-
-def encode_proto_test_suite(name, examples):
-  """Build the encode_proto py_test for each test filename."""
-  for test_filename in examples:
-    tf_py_test(
-        name = _test_name("encode_proto", test_filename),
-        srcs = ["encode_proto_op_test.py"],
-        size = "small",
-        data = [test_filename] + if_static(
-            [],
-            otherwise = [":libtestexample.so"],
-        ),
-        main = "encode_proto_op_test.py",
-        args = [
-            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
-        ],
-        additional_deps = [
-            ":py_test_deps",
-            "//third_party/py/numpy",
-            "//tensorflow/contrib/proto:proto",
-        ],
-    )
-  native.test_suite(
-      name = name,
-      tests = [":" + _test_name("encode_proto", test_filename)
-               for test_filename in examples],
-  )
-
-register_extension_info(
-    extension_name = "decode_proto_test_suite",
-    label_regex_map = {
-        "deps": "deps:decode_example_.*",
-    })
-
-register_extension_info(
-    extension_name = "encode_proto_test_suite",
-    label_regex_map = {
-        "deps": "deps:encode_example_.*",
-    })
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
deleted file mode 100644
index f019833905..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# =============================================================================
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# Python3 preparedness imports.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import proto
-from tensorflow.contrib.proto.python.kernel_tests import test_case
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-
-
-class DecodeProtoFailTest(test_case.ProtoOpTestCase):
-  """Test failure cases for DecodeToProto."""
-
-  def _TestCorruptProtobuf(self, sanitize):
-    """Test failure cases for DecodeToProto."""
-
-    # The goal here is to check the error reporting.
-    # Testing against a variety of corrupt protobufs is
-    # done by fuzzing.
-    corrupt_proto = 'This is not a binary protobuf'
-
-    # Numpy silently truncates the strings if you don't specify dtype=object.
-    batch = np.array(corrupt_proto, dtype=object)
-    msg_type = 'tensorflow.contrib.proto.TestCase'
-    field_names = ['sizes']
-    field_types = [dtypes.int32]
-
-    with self.test_session() as sess:
-      ctensor, vtensor = proto.decode_proto(
-          batch,
-          message_type=msg_type,
-          field_names=field_names,
-          output_types=field_types,
-          sanitize=sanitize)
-      with self.assertRaisesRegexp(errors.DataLossError,
-                                   'Unable to parse binary protobuf'
-                                   '|Failed to consume entire buffer'):
-        _ = sess.run([ctensor] + vtensor)
-
-  def testCorrupt(self):
-    self._TestCorruptProtobuf(sanitize=False)
-
-  def testSanitizerCorrupt(self):
-    self._TestCorruptProtobuf(sanitize=True)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
deleted file mode 100644
index 30ceac5f5f..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# =============================================================================
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Table-driven test for decode_proto op.
-
-This test is run once with each of the *.TestCase.pbtxt files
-in the test directory.
-"""
-# Python3 preparedness imports.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from google.protobuf import text_format
-
-from tensorflow.contrib import proto
-from tensorflow.contrib.proto.python.kernel_tests import test_case
-from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.platform import flags
-from tensorflow.python.platform import test
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_string('message_text_file', None,
-                    'A file containing a text serialized TestCase protobuf.')
-
-
-class DecodeProtoOpTest(test_case.ProtoOpTestCase):
-
-  def _compareValues(self, fd, vs, evs):
-    """Compare lists/arrays of field values."""
-
-    if len(vs) != len(evs):
-      self.fail('Field %s decoded %d outputs, expected %d' %
-                (fd.name, len(vs), len(evs)))
-    for i, ev in enumerate(evs):
-      # Special case fuzzy match for float32. TensorFlow seems to mess with
-      # MAX_FLT slightly and the test doesn't work otherwise.
-      # TODO(nix): ask on TF list about why MAX_FLT doesn't pass through.
-      if fd.cpp_type == fd.CPPTYPE_FLOAT:
-        # Numpy isclose() is better than assertIsClose() which uses an absolute
-        # value comparison.
-        self.assertTrue(
-            np.isclose(vs[i], ev), 'expected %r, actual %r' % (ev, vs[i]))
-      elif fd.cpp_type == fd.CPPTYPE_STRING:
-        # In Python3 string tensor values will be represented as bytes, so we
-        # reencode the proto values to match that.
-        self.assertEqual(vs[i], ev.encode('ascii'))
-      else:
-        # Doubles and other types pass through unscathed.
-        self.assertEqual(vs[i], ev)
-
-  def _compareRepeatedPrimitiveValue(self, batch_shape, sizes, fields,
-                                     field_dict):
-    """Compare protos of type RepeatedPrimitiveValue.
-
-    Args:
-      batch_shape: the shape of the input tensor of serialized messages.
-      sizes: int matrix of repeat counts returned by decode_proto
-      fields: list of test_example_pb2.FieldSpec (types and expected values)
-      field_dict: map from field names to decoded numpy tensors of values
-    """
-
-    # Check that expected values match.
-    for field in fields:
-      values = field_dict[field.name]
-      self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
-
-      fd = field.expected.DESCRIPTOR.fields_by_name[field.name]
-
-      # Values has the same shape as the input plus an extra
-      # dimension for repeats.
-      self.assertEqual(list(values.shape)[:-1], batch_shape)
-
-      # Nested messages are represented as TF strings, requiring
-      # some special handling.
-      if field.name == 'message_value':
-        vs = []
-        for buf in values.flat:
-          msg = test_example_pb2.PrimitiveValue()
-          msg.ParseFromString(buf)
-          vs.append(msg)
-        evs = getattr(field.expected, field.name)
-        if len(vs) != len(evs):
-          self.fail('Field %s decoded %d outputs, expected %d' %
-                    (fd.name, len(vs), len(evs)))
-        for v, ev in zip(vs, evs):
-          self.assertEqual(v, ev)
-        continue
-
-      # This can be a little confusing. For testing we are using
-      # RepeatedPrimitiveValue in two ways: it's the proto that we
-      # decode for testing, and it's used in the expected value as a
-      # union type. The two cases are slightly different: this is the
-      # second case.
-      # We may be fetching the uint64_value from the test proto, but
-      # in the expected proto we store it in the int64_value field
-      # because TensorFlow doesn't support unsigned int64.
-      tf_type_to_primitive_value_field = {
-          dtypes.float32:
-              'float_value',
-          dtypes.float64:
-              'double_value',
-          dtypes.int32:
-              'int32_value',
-          dtypes.uint8:
-              'uint8_value',
-          dtypes.int8:
-              'int8_value',
-          dtypes.string:
-              'string_value',
-          dtypes.int64:
-              'int64_value',
-          dtypes.bool:
-              'bool_value',
-          # Unhandled TensorFlow types:
-          # DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
-          # DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
-      }
-      tf_field_name = tf_type_to_primitive_value_field.get(field.dtype)
-      if tf_field_name is None:
-        self.fail('Unhandled tensorflow type %d' % field.dtype)
-
-      self._compareValues(fd, values.flat,
-                          getattr(field.expected, tf_field_name))
-
-  def _runDecodeProtoTests(self, fields, case_sizes, batch_shape, batch,
-                           message_type, message_format, sanitize,
-                           force_disordered=False):
-    """Run decode tests on a batch of messages.
-
-    Args:
-      fields: list of test_example_pb2.FieldSpec (types and expected values)
-      case_sizes: expected sizes array
-      batch_shape: the shape of the input tensor of serialized messages
-      batch: list of serialized messages
-      message_type: descriptor name for messages
-      message_format: format of messages, 'text' or 'binary'
-      sanitize: whether to sanitize binary protobuf inputs
-      force_disordered: whether to force fields encoded out of order.
-    """
-
-    if force_disordered:
-      # Exercise code path that handles out-of-order fields by prepending extra
-      # fields with tag numbers higher than any real field. Note that this won't
-      # work with sanitization because that forces reserialization using a
-      # trusted decoder and encoder.
-      assert not sanitize
-      extra_fields = test_example_pb2.ExtraFields()
-      extra_fields.string_value = 'IGNORE ME'
-      extra_fields.bool_value = False
-      extra_msg = extra_fields.SerializeToString()
-      batch = [extra_msg + msg for msg in batch]
-
-    # Numpy silently truncates the strings if you don't specify dtype=object.
-    batch = np.array(batch, dtype=object)
-    batch = np.reshape(batch, batch_shape)
-
-    field_names = [f.name for f in fields]
-    output_types = [f.dtype for f in fields]
-
-    with self.test_session() as sess:
-      sizes, vtensor = proto.decode_proto(
-          batch,
-          message_type=message_type,
-          field_names=field_names,
-          output_types=output_types,
-          message_format=message_format,
-          sanitize=sanitize)
-
-      vlist = sess.run([sizes] + vtensor)
-      sizes = vlist[0]
-      # Values is a list of tensors, one for each field.
-      value_tensors = vlist[1:]
-
-      # Check that the repeat sizes are correct.
-      self.assertTrue(
-          np.all(np.array(sizes.shape) == batch_shape + [len(field_names)]))
-
-      # Check that the decoded sizes match the expected sizes.
-      self.assertEqual(len(sizes.flat), len(case_sizes))
-      self.assertTrue(
-          np.all(sizes.flat == np.array(
-              case_sizes, dtype=np.int32)))
-
-      field_dict = dict(zip(field_names, value_tensors))
-
-      self._compareRepeatedPrimitiveValue(batch_shape, sizes, fields,
-                                          field_dict)
-
-  def testBinary(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    batch = [primitive.SerializeToString() for primitive in case.primitive]
-    self._runDecodeProtoTests(
-        case.field,
-        case.sizes,
-        list(case.shape),
-        batch,
-        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
-        'binary',
-        sanitize=False)
-
-  def testBinaryDisordered(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    batch = [primitive.SerializeToString() for primitive in case.primitive]
-    self._runDecodeProtoTests(
-        case.field,
-        case.sizes,
-        list(case.shape),
-        batch,
-        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
-        'binary',
-        sanitize=False,
-        force_disordered=True)
-
-  def testPacked(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    # Now try with the packed serialization.
-    # We test the packed representations by loading the same test cases
-    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
-    # To do this we rely on the text format being the same for packed and
-    # unpacked fields, and reparse the test message using the packed version
-    # of the proto.
-    packed_batch = [
-        # Note: float_format='.17g' is necessary to ensure preservation of
-        # doubles and floats in text format.
-        text_format.Parse(
-            text_format.MessageToString(
-                primitive, float_format='.17g'),
-            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
-        for primitive in case.primitive
-    ]
-
-    self._runDecodeProtoTests(
-        case.field,
-        case.sizes,
-        list(case.shape),
-        packed_batch,
-        'tensorflow.contrib.proto.PackedPrimitiveValue',
-        'binary',
-        sanitize=False)
-
-  def testText(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    # Note: float_format='.17g' is necessary to ensure preservation of
-    # doubles and floats in text format.
-    text_batch = [
-        text_format.MessageToString(
-            primitive, float_format='.17g') for primitive in case.primitive
-    ]
-
-    self._runDecodeProtoTests(
-        case.field,
-        case.sizes,
-        list(case.shape),
-        text_batch,
-        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
-        'text',
-        sanitize=False)
-
-  def testSanitizerGood(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    batch = [primitive.SerializeToString() for primitive in case.primitive]
-    self._runDecodeProtoTests(
-        case.field,
-        case.sizes,
-        list(case.shape),
-        batch,
-        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
-        'binary',
-        sanitize=True)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
deleted file mode 100644
index 2a24c3b8ce..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# =============================================================================
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Table-driven test for encode_proto op.
-
-This test is run once with each of the *.TestCase.pbtxt files
-in the test directory.
-
-It tests that encode_proto is a lossless inverse of decode_proto
-(for the specified fields).
-"""
-# Python3 readiness boilerplate
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from google.protobuf import text_format
-
-from tensorflow.contrib import proto
-from tensorflow.contrib.proto.python.kernel_tests import test_case
-from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import flags
-from tensorflow.python.platform import test
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_string('message_text_file', None,
-                    'A file containing a text serialized TestCase protobuf.')
-
-
-class EncodeProtoOpTest(test_case.ProtoOpTestCase):
-
-  def testBadInputs(self):
-    # Invalid field name
-    with self.test_session():
-      with self.assertRaisesOpError('Unknown field: non_existent_field'):
-        proto.encode_proto(
-            sizes=[[1]],
-            values=[np.array([[0.0]], dtype=np.int32)],
-            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
-            field_names=['non_existent_field']).eval()
-
-    # Incorrect types.
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'Incompatible type for field double_value.'):
-        proto.encode_proto(
-            sizes=[[1]],
-            values=[np.array([[0.0]], dtype=np.int32)],
-            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
-            field_names=['double_value']).eval()
-
-    # Incorrect shapes of sizes.
-    with self.test_session():
-      with self.assertRaisesOpError(
-          r'sizes should be batch_size \+ \[len\(field_names\)\]'):
-        sizes = array_ops.placeholder(dtypes.int32)
-        values = array_ops.placeholder(dtypes.float64)
-        proto.encode_proto(
-            sizes=sizes,
-            values=[values],
-            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
-            field_names=['double_value']).eval(feed_dict={
-                sizes: [[[0, 0]]],
-                values: [[0.0]]
-            })
-
-    # Inconsistent shapes of values.
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'Values must match up to the last dimension'):
-        sizes = array_ops.placeholder(dtypes.int32)
-        values1 = array_ops.placeholder(dtypes.float64)
-        values2 = array_ops.placeholder(dtypes.int32)
-        (proto.encode_proto(
-            sizes=[[1, 1]],
-            values=[values1, values2],
-            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
-            field_names=['double_value', 'int32_value']).eval(feed_dict={
-                values1: [[0.0]],
-                values2: [[0], [0]]
-            }))
-
-  def _testRoundtrip(self, in_bufs, message_type, fields):
-
-    field_names = [f.name for f in fields]
-    out_types = [f.dtype for f in fields]
-
-    with self.test_session() as sess:
-      sizes, field_tensors = proto.decode_proto(
-          in_bufs,
-          message_type=message_type,
-          field_names=field_names,
-          output_types=out_types)
-
-      out_tensors = proto.encode_proto(
-          sizes,
-          field_tensors,
-          message_type=message_type,
-          field_names=field_names)
-
-      out_bufs, = sess.run([out_tensors])
-
-      # Check that the re-encoded tensor has the same shape.
-      self.assertEqual(in_bufs.shape, out_bufs.shape)
-
-      # Compare the input and output.
-      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
-        in_obj = test_example_pb2.RepeatedPrimitiveValue()
-        in_obj.ParseFromString(in_buf)
-
-        out_obj = test_example_pb2.RepeatedPrimitiveValue()
-        out_obj.ParseFromString(out_buf)
-
-        # Check that the deserialized objects are identical.
-        self.assertEqual(in_obj, out_obj)
-
-        # Check that the input and output serialized messages are identical.
-        # If we fail here, there is a difference in the serialized
-        # representation but the new serialization still parses. This could
-        # be harmless (a change in map ordering?) or it could be bad (e.g.
-        # loss of packing in the encoding).
-        self.assertEqual(in_buf, out_buf)
-
-  def testRoundtrip(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    in_bufs = [primitive.SerializeToString() for primitive in case.primitive]
-
-    # np.array silently truncates strings if you don't specify dtype=object.
-    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
-    return self._testRoundtrip(
-        in_bufs, 'tensorflow.contrib.proto.RepeatedPrimitiveValue', case.field)
-
-  def testRoundtripPacked(self):
-    with open(FLAGS.message_text_file, 'r') as fp:
-      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
-
-    # Now try with the packed serialization.
-    # We test the packed representations by loading the same test cases
-    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
-    # To do this we rely on the text format being the same for packed and
-    # unpacked fields, and reparse the test message using the packed version
-    # of the proto.
-    in_bufs = [
-        # Note: float_format='.17g' is necessary to ensure preservation of
-        # doubles and floats in text format.
-        text_format.Parse(
-            text_format.MessageToString(
-                primitive, float_format='.17g'),
-            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
-        for primitive in case.primitive
-    ]
-
-    # np.array silently truncates strings if you don't specify dtype=object.
-    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
-    return self._testRoundtrip(
-        in_bufs, 'tensorflow.contrib.proto.PackedPrimitiveValue', case.field)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
deleted file mode 100644
index b170f89c0f..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
+++ /dev/null
@@ -1,161 +0,0 @@
-primitive {
-  double_value: -1.7976931348623158e+308
-  double_value: 2.2250738585072014e-308
-  double_value: 1.7976931348623158e+308
-  float_value: -3.402823466e+38
-  float_value: 1.175494351e-38
-  float_value: 3.402823466e+38
-  int64_value: -9223372036854775808
-  int64_value: 9223372036854775807
-  uint64_value: 0
-  uint64_value: 18446744073709551615
-  int32_value: -2147483648
-  int32_value: 2147483647
-  fixed64_value: 0
-  fixed64_value: 18446744073709551615
-  fixed32_value: 0
-  fixed32_value: 4294967295
-  bool_value: false
-  bool_value: true
-  string_value: ""
-  string_value: "I refer to the infinite."
-  uint32_value: 0
-  uint32_value: 4294967295
-  sfixed32_value: -2147483648
-  sfixed32_value: 2147483647
-  sfixed64_value: -9223372036854775808
-  sfixed64_value: 9223372036854775807
-  sint32_value: -2147483648
-  sint32_value: 2147483647
-  sint64_value: -9223372036854775808
-  sint64_value: 9223372036854775807
-}
-shape: 1
-sizes: 3
-sizes: 3
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-sizes: 2
-field {
-  name: "double_value"
-  dtype: DT_DOUBLE
-  expected {
-    double_value: -1.7976931348623158e+308
-    double_value: 2.2250738585072014e-308
-    double_value: 1.7976931348623158e+308
-  }
-}
-field {
-  name: "float_value"
-  dtype: DT_FLOAT
-  expected {
-    float_value: -3.402823466e+38
-    float_value: 1.175494351e-38
-    float_value: 3.402823466e+38
-  }
-}
-field {
-  name: "int64_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: -9223372036854775808
-    int64_value: 9223372036854775807
-  }
-}
-field {
-  name: "uint64_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: 0
-    int64_value: -1
-  }
-}
-field {
-  name: "int32_value"
-  dtype: DT_INT32
-  expected {
-    int32_value: -2147483648
-    int32_value: 2147483647
-  }
-}
-field {
-  name: "fixed64_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: 0
-    int64_value: -1  # unsigned is 18446744073709551615
-  }
-}
-field {
-  name: "fixed32_value"
-  dtype: DT_INT32
-  expected {
-    int32_value: 0
-    int32_value: -1  # unsigned is 4294967295
-  }
-}
-field {
-  name: "bool_value"
-  dtype: DT_BOOL
-  expected {
-    bool_value: false
-    bool_value: true
-  }
-}
-field {
-  name: "string_value"
-  dtype: DT_STRING
-  expected {
-    string_value: ""
-    string_value: "I refer to the infinite."
-  }
-}
-field {
-  name: "uint32_value"
-  dtype: DT_INT32
-  expected {
-    int32_value: 0
-    int32_value: -1  # unsigned is 4294967295
-  }
-}
-field {
-  name: "sfixed32_value"
-  dtype: DT_INT32
-  expected {
-    int32_value: -2147483648
-    int32_value: 2147483647
-  }
-}
-field {
-  name: "sfixed64_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: -9223372036854775808
-    int64_value: 9223372036854775807
-  }
-}
-field {
-  name: "sint32_value"
-  dtype: DT_INT32
-  expected {
-    int32_value: -2147483648
-    int32_value: 2147483647
-  }
-}
-field {
-  name: "sint64_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: -9223372036854775808
-    int64_value: 9223372036854775807
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
deleted file mode 100644
index c664e52851..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-primitive {
-  message_value {
-    double_value: 23.5
-  }
-}
-shape: 1
-sizes: 1
-field {
-  name: "message_value"
-  dtype: DT_STRING
-  expected {
-    message_value {
-      double_value: 23.5
-    }
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
deleted file mode 100644
index 125651d7ea..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
+++ /dev/null
@@ -1,20 +0,0 @@
-primitive {
-  bool_value: true
-}
-shape: 1
-sizes: 1
-sizes: 0
-field {
-  name: "bool_value"
-  dtype: DT_BOOL
-  expected {
-    bool_value: true
-  }
-}
-field {
-  name: "double_value"
-  dtype: DT_DOUBLE
-  expected {
-    double_value: 0.0
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
deleted file mode 100644
index db7555bf2d..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-primitive {
-  fixed32_value: 4294967295
-  uint32_value: 4294967295
-}
-shape: 1
-sizes: 1
-sizes: 1
-field {
-  name: "fixed32_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: 4294967295
-  }
-}
-field {
-  name: "uint32_value"
-  dtype: DT_INT64
-  expected {
-    int64_value: 4294967295
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
deleted file mode 100644
index 61c7ac53f7..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
+++ /dev/null
@@ -1,32 +0,0 @@
-primitive {
-  double_value: 23.5
-  double_value: 123.0
-  bool_value: true
-}
-primitive {
-  double_value: 3.1
-  bool_value: false
-}
-shape: 2
-sizes: 2
-sizes: 1
-sizes: 1
-sizes: 1
-field {
-  name: "double_value"
-  dtype: DT_DOUBLE
-  expected {
-    double_value: 23.5
-    double_value: 123.0
-    double_value: 3.1
-    double_value: 0.0
-  }
-}
-field {
-  name: "bool_value"
-  dtype: DT_BOOL
-  expected {
-    bool_value: true
-    bool_value: false
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
deleted file mode 100644
index f4828076d5..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
+++ /dev/null
@@ -1,62 +0,0 @@
-primitive {
-  double_value: 23.5
-  bool_value: true
-}
-primitive {
-  double_value: 44.0
-  bool_value: false
-}
-primitive {
-  double_value: 3.14159
-  bool_value: true
-}
-primitive {
-  double_value: 1.414
-  bool_value: true
-}
-primitive {
-  double_value: -32.2
-  bool_value: false
-}
-primitive {
-  double_value: 0.0001
-  bool_value: true
-}
-shape: 3
-shape: 2
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-sizes: 1
-field {
-  name: "double_value"
-  dtype: DT_DOUBLE
-  expected {
-    double_value: 23.5
-    double_value: 44.0
-    double_value: 3.14159
-    double_value: 1.414
-    double_value: -32.2
-    double_value: 0.0001
-  }
-}
-field {
-  name: "bool_value"
-  dtype: DT_BOOL
-  expected {
-    bool_value: true
-    bool_value: false
-    bool_value: true
-    bool_value: true
-    bool_value: false
-    bool_value: true
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
deleted file mode 100644
index dc20ac147b..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-primitive {
-  double_value: 23.5
-  bool_value: true
-}
-shape: 1
-sizes: 1
-sizes: 1
-field {
-  name: "double_value"
-  dtype: DT_DOUBLE
-  expected {
-    double_value: 23.5
-  }
-}
-field {
-  name: "bool_value"
-  dtype: DT_BOOL
-  expected {
-    bool_value: true
-  }
-}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_case.py b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
deleted file mode 100644
index b95202c5df..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/test_case.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# =============================================================================
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Test case base for testing proto operations."""
-
-# Python3 preparedness imports.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ctypes as ct
-import os
-
-from tensorflow.python.platform import test
-
-
-class ProtoOpTestCase(test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    super(ProtoOpTestCase, self).__init__(methodName)
-    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
-    if os.path.isfile(lib):
-      ct.cdll.LoadLibrary(lib)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
deleted file mode 100644
index dc495034ff..0000000000
--- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
+++ /dev/null
@@ -1,149 +0,0 @@
-// Test description and protos to work with it.
-//
-// Many of the protos in this file are for unit tests that haven't been written yet.
-
-syntax = "proto2";
-
-import "tensorflow/core/framework/types.proto";
-
-package tensorflow.contrib.proto;
-
-// A TestCase holds a proto and a bunch of assertions
-// about how it should decode.
-message TestCase {
-  // A batch of primitives to be serialized and decoded.
-  repeated RepeatedPrimitiveValue primitive = 1;
-  // The shape of the batch.
-  repeated int32 shape = 2;
-  // Expected sizes for each field.
-  repeated int32 sizes = 3;
-  // Expected values for each field.
-  repeated FieldSpec field = 4;
-};
-
-// FieldSpec describes the expected output for a single field.
-message FieldSpec {
-  optional string name = 1;
-  optional tensorflow.DataType dtype = 2;
-  optional RepeatedPrimitiveValue expected = 3;
-};
-
-message TestValue {
-  optional PrimitiveValue primitive_value = 1;
-  optional EnumValue enum_value = 2;
-  optional MessageValue message_value = 3;
-  optional RepeatedMessageValue repeated_message_value = 4;
-  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
-}
-
-message PrimitiveValue {
-  optional double double_value = 1;
-  optional float float_value = 2;
-  optional int64 int64_value = 3;
-  optional uint64 uint64_value = 4;
-  optional int32 int32_value = 5;
-  optional fixed64 fixed64_value = 6;
-  optional fixed32 fixed32_value = 7;
-  optional bool bool_value = 8;
-  optional string string_value = 9;
-  optional bytes bytes_value = 12;
-  optional uint32 uint32_value = 13;
-  optional sfixed32 sfixed32_value = 15;
-  optional sfixed64 sfixed64_value = 16;
-  optional sint32 sint32_value = 17;
-  optional sint64 sint64_value = 18;
-}
-
-// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
-message RepeatedPrimitiveValue {
-  repeated double double_value = 1;
-  repeated float float_value = 2;
-  repeated int64 int64_value = 3;
-  repeated uint64 uint64_value = 4;
-  repeated int32 int32_value = 5;
-  repeated fixed64 fixed64_value = 6;
-  repeated fixed32 fixed32_value = 7;
-  repeated bool bool_value = 8;
-  repeated string string_value = 9;
-  repeated bytes bytes_value = 12;
-  repeated uint32 uint32_value = 13;
-  repeated sfixed32 sfixed32_value = 15;
-  repeated sfixed64 sfixed64_value = 16;
-  repeated sint32 sint32_value = 17;
-  repeated sint64 sint64_value = 18;
-  repeated PrimitiveValue message_value = 19;
-}
-
-// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
-// in the text format, but the binary serializion is different.
-// We test the packed representations by loading the same test cases
-// using this definition instead of RepeatedPrimitiveValue.
-// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
-// in every way except the packed=true declaration.
-message PackedPrimitiveValue {
-  repeated double double_value = 1 [packed = true];
-  repeated float float_value = 2 [packed = true];
-  repeated int64 int64_value = 3 [packed = true];
-  repeated uint64 uint64_value = 4 [packed = true];
-  repeated int32 int32_value = 5 [packed = true];
-  repeated fixed64 fixed64_value = 6 [packed = true];
-  repeated fixed32 fixed32_value = 7 [packed = true];
-  repeated bool bool_value = 8 [packed = true];
-  repeated string string_value = 9;
-  repeated bytes bytes_value = 12;
-  repeated uint32 uint32_value = 13 [packed = true];
-  repeated sfixed32 sfixed32_value = 15 [packed = true];
-  repeated sfixed64 sfixed64_value = 16 [packed = true];
-  repeated sint32 sint32_value = 17 [packed = true];
-  repeated sint64 sint64_value = 18 [packed = true];
-  repeated PrimitiveValue message_value = 19;
-}
-
-message EnumValue {
-  enum Color {
-    RED = 0;
-    ORANGE = 1;
-    YELLOW = 2;
-    GREEN = 3;
-    BLUE = 4;
-    INDIGO = 5;
-    VIOLET = 6;
-  };
-  optional Color enum_value = 14;
-  repeated Color repeated_enum_value = 15;
-}
-
-
-message InnerMessageValue {
-  optional float float_value = 2;
-  repeated bytes bytes_values = 8;
-}
-
-message MiddleMessageValue {
-  repeated int32 int32_values = 5;
-  optional InnerMessageValue message_value = 11;
-  optional uint32 uint32_value = 13;
-}
-
-message MessageValue {
-  optional double double_value = 1;
-  optional MiddleMessageValue message_value = 11;
-}
-
-message RepeatedMessageValue {
-  message NestedMessageValue {
-    optional float float_value = 2;
-    repeated bytes bytes_values = 8;
-  }
-
-  repeated NestedMessageValue message_values = 11;
-}
-
-// Message containing fields with field numbers higher than any field above. An
-// instance of this message is prepended to each binary message in the test to
-// exercise the code path that handles fields encoded out of order of field
-// number.
-message ExtraFields {
-  optional string string_value = 1776;
-  optional bool bool_value = 1777;
-}
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2ef105755f..679d2735f9 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -74,7 +74,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
-    "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
-- 
GitLab


From 1c88fac05afbce5aa1131c87f0594f9f0f1b6706 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 12 Apr 2018 21:39:26 -0700
Subject: [PATCH 0747/1262] Automated g4 rollback of changelist 192698931

PiperOrigin-RevId: 192718697
---
 tensorflow/contrib/BUILD                      |   1 -
 tensorflow/contrib/__init__.py                |   1 -
 tensorflow/contrib/cmake/tf_python.cmake      |   3 +-
 tensorflow/contrib/rpc/BUILD                  |  16 -
 .../contrib/rpc/python/kernel_tests/BUILD     |  76 ----
 .../rpc/python/kernel_tests/rpc_op_test.py    |  71 ----
 .../python/kernel_tests/rpc_op_test_base.py   | 337 ------------------
 .../kernel_tests/rpc_op_test_servicer.py      | 101 ------
 .../python/kernel_tests/test_example.proto    | 171 ---------
 .../core/platform/default/build_config.bzl    |  86 +----
 tensorflow/tools/pip_package/BUILD            |   1 -
 tensorflow/workspace.bzl                      |   4 -
 12 files changed, 4 insertions(+), 864 deletions(-)
 delete mode 100644 tensorflow/contrib/rpc/python/kernel_tests/BUILD
 delete mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
 delete mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
 delete mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
 delete mode 100644 tensorflow/contrib/rpc/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 192d053683..9bef0d8b61 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -86,7 +86,6 @@ py_library(
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
-        "//tensorflow/contrib/rpc",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
         "//tensorflow/contrib/signal:signal_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e02dd5e759..aaddb06fa0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -70,7 +70,6 @@ from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
-from tensorflow.contrib import rpc
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
 from tensorflow.contrib import signal
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 9d9db82513..ded15b4b66 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -345,8 +345,7 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
-GENERATE_PYTHON_OP_LIB("rpc_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
+GENERATE_PYTHON_OP_LIB("rpc_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
index dbd311a276..597f18c771 100644
--- a/tensorflow/contrib/rpc/BUILD
+++ b/tensorflow/contrib/rpc/BUILD
@@ -4,8 +4,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-
 py_library(
     name = "rpc",
     srcs = [
@@ -13,17 +11,3 @@ py_library(
     ],
     deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
 )
-
-py_library(
-    name = "rpc_pip",
-    data = if_static(
-        [],
-        otherwise = ["//tensorflow/contrib/rpc/python/kernel_tests:libtestexample.so"],
-    ),
-    deps = [
-        ":rpc",
-        "//tensorflow/contrib/rpc/python/kernel_tests:py_test_deps",
-        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_base",
-        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_servicer",
-    ],
-)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
deleted file mode 100644
index 08ec1e61a4..0000000000
--- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD
+++ /dev/null
@@ -1,76 +0,0 @@
-# TODO(b/76425722): Port everything in here to OS (currently excluded).
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-# Placeholder for loading internal BUILD rule.
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
-load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-
-tf_proto_library(
-    name = "test_example_proto",
-    srcs = ["test_example.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    protodeps = ["//tensorflow/core:protos_all"],
-)
-
-py_library(
-    name = "py_test_deps",
-    deps = [":test_example_proto_py"],
-)
-
-py_library(
-    name = "rpc_op_test_base",
-    srcs = ["rpc_op_test_base.py"],
-    deps = [
-        ":test_example_proto_py",
-        "//tensorflow/contrib/proto",
-        "//tensorflow/contrib/rpc",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "rpc_op_test_servicer",
-    srcs = ["rpc_op_test_servicer.py"],
-    deps = [
-        ":py_test_deps",
-        ":rpc_op_test_base",
-        "//tensorflow/core:protos_all_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-tf_cc_shared_object(
-    name = "libtestexample.so",
-    linkstatic = 1,
-    deps = [
-        ":test_example_proto_cc",
-    ],
-)
-
-tf_py_test(
-    name = "rpc_op_test",
-    size = "small",
-    srcs = ["rpc_op_test.py"],
-    additional_deps = [
-        ":py_test_deps",
-        ":rpc_op_test_base",
-        ":rpc_op_test_servicer",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-    ],
-    data = if_static(
-        [],
-        otherwise = [":libtestexample.so"],
-    ),
-)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
deleted file mode 100644
index e2e0dbc7a2..0000000000
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Tests for RpcOp."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ctypes as ct
-import os
-
-import grpc
-from grpc.framework.foundation import logging_pool
-import portpicker
-
-from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
-from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_servicer
-from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
-from tensorflow.python.platform import test
-
-
-class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
-  _protocol = 'grpc'
-
-  invalid_method_string = 'Method not found'
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    super(RpcOpTest, self).__init__(methodName)
-    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
-    if os.path.isfile(lib):
-      ct.cdll.LoadLibrary(lib)
-
-  def get_method_name(self, suffix):
-    return '/tensorflow.contrib.rpc.TestCaseService/%s' % suffix
-
-  def setUp(self):
-    super(RpcOpTest, self).setUp()
-
-    service_port = portpicker.pick_unused_port()
-
-    server = grpc.server(logging_pool.pool(max_workers=25))
-    servicer = rpc_op_test_servicer.RpcOpTestServicer()
-    test_example_pb2_grpc.add_TestCaseServiceServicer_to_server(
-        servicer, server)
-    self._address = 'localhost:%d' % service_port
-    server.add_insecure_port(self._address)
-    server.start()
-    self._server = server
-
-  def tearDown(self):
-    # TODO(ebrevdo): Figure out why this sometimes times out.
-    #    self._service.ExitLoop()
-    #    self._service_thread.join()
-    # self._server.stop()
-    super(RpcOpTest, self).tearDown()
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
deleted file mode 100644
index aa03a103ed..0000000000
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
+++ /dev/null
@@ -1,337 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Base class for RpcOp tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-import numpy as np
-
-from tensorflow.contrib.proto import decode_proto
-from tensorflow.contrib.proto import encode_proto
-from tensorflow.contrib.rpc import rpc
-from tensorflow.contrib.rpc import try_rpc
-from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-
-__all__ = ['I_WARNED_YOU', 'RpcOpTestBase']
-
-I_WARNED_YOU = 'I warned you!'
-
-
-class RpcOpTestBase(object):
-  # pylint: disable=missing-docstring,invalid-name
-  """Base class for RpcOp tests."""
-
-  def get_method_name(self, suffix):
-    raise NotImplementedError
-
-  def rpc(self, *args, **kwargs):
-    return rpc(*args, protocol=self._protocol, **kwargs)
-
-  def try_rpc(self, *args, **kwargs):
-    return try_rpc(*args, protocol=self._protocol, **kwargs)
-
-  def testScalarHostPortRpc(self):
-    with self.test_session() as sess:
-      request_tensors = (
-          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
-      response_tensors = self.rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=self._address,
-          request=request_tensors)
-      self.assertEqual(response_tensors.shape, ())
-      response_values = sess.run(response_tensors)
-    response_message = test_example_pb2.TestCase()
-    self.assertTrue(response_message.ParseFromString(response_values))
-    self.assertAllEqual([2, 3, 4], response_message.shape)
-
-  def testScalarHostPortTryRpc(self):
-    with self.test_session() as sess:
-      request_tensors = (
-          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
-      response_tensors, status_code, status_message = self.try_rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=self._address,
-          request=request_tensors)
-      self.assertEqual(status_code.shape, ())
-      self.assertEqual(status_message.shape, ())
-      self.assertEqual(response_tensors.shape, ())
-      response_values, status_code_values, status_message_values = (
-          sess.run((response_tensors, status_code, status_message)))
-    response_message = test_example_pb2.TestCase()
-    self.assertTrue(response_message.ParseFromString(response_values))
-    self.assertAllEqual([2, 3, 4], response_message.shape)
-    # For the base Rpc op, don't expect to get error status back.
-    self.assertEqual(errors.OK, status_code_values)
-    self.assertEqual(b'', status_message_values)
-
-  def testEmptyHostPortRpc(self):
-    with self.test_session() as sess:
-      request_tensors = []
-      response_tensors = self.rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=self._address,
-          request=request_tensors)
-      self.assertAllEqual(response_tensors.shape, [0])
-      response_values = sess.run(response_tensors)
-    self.assertAllEqual(response_values.shape, [0])
-
-  def testInvalidAddresses(self):
-    with self.test_session() as sess:
-      with self.assertRaisesOpError(self.invalid_method_string):
-        sess.run(
-            self.rpc(
-                method='/InvalidService.IncrementTestShapes',
-                address=self._address,
-                request=''))
-
-      with self.assertRaisesOpError(self.invalid_method_string):
-        sess.run(
-            self.rpc(
-                method=self.get_method_name('InvalidMethodName'),
-                address=self._address,
-                request=''))
-
-      # This also covers the case of address=''
-      # and address='localhost:293874293874'
-      with self.assertRaises(errors.UnavailableError):
-        sess.run(
-            self.rpc(
-                method=self.get_method_name('IncrementTestShapes'),
-                address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@',
-                request=''))
-
-      # Test invalid method with the TryRpc op
-      _, status_code_value, status_message_value = sess.run(
-          self.try_rpc(
-              method=self.get_method_name('InvalidMethodName'),
-              address=self._address,
-              request=''))
-      self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
-      self.assertTrue(
-          self.invalid_method_string in status_message_value.decode('ascii'))
-
-  def testAlwaysFailingMethod(self):
-    with self.test_session() as sess:
-      response_tensors = self.rpc(
-          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
-          address=self._address,
-          request='')
-      self.assertEqual(response_tensors.shape, ())
-      with self.assertRaisesOpError(I_WARNED_YOU):
-        sess.run(response_tensors)
-
-  def testSometimesFailingMethodWithManyRequests(self):
-    with self.test_session() as sess:
-      # Fail hard by default.
-      response_tensors = self.rpc(
-          method=self.get_method_name('SometimesFailWithInvalidArgument'),
-          address=self._address,
-          request=[''] * 20)
-      self.assertEqual(response_tensors.shape, (20,))
-      with self.assertRaisesOpError(I_WARNED_YOU):
-        sess.run(response_tensors)
-
-      # Don't fail hard, use TryRpc - return the failing status instead.
-      response_tensors, status_code, status_message = self.try_rpc(
-          method=self.get_method_name('SometimesFailWithInvalidArgument'),
-          address=self._address,
-          request=[''] * 20)
-      self.assertEqual(response_tensors.shape, (20,))
-      self.assertEqual(status_code.shape, (20,))
-      self.assertEqual(status_message.shape, (20,))
-      status_code_values, status_message_values = sess.run((status_code,
-                                                            status_message))
-      self.assertTrue([
-          x in (errors.OK, errors.INVALID_ARGUMENT) for x in status_code_values
-      ])
-      expected_message_values = np.where(
-          status_code_values == errors.INVALID_ARGUMENT,
-          I_WARNED_YOU.encode('ascii'), b'')
-      self.assertAllEqual(expected_message_values, status_message_values)
-
-  def testVecHostPortRpc(self):
-    with self.test_session() as sess:
-      request_tensors = [
-          test_example_pb2.TestCase(
-              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
-      ]
-      response_tensors = self.rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=self._address,
-          request=request_tensors)
-      self.assertEqual(response_tensors.shape, (20,))
-      response_values = sess.run(response_tensors)
-    self.assertEqual(response_values.shape, (20,))
-    for i in range(20):
-      response_message = test_example_pb2.TestCase()
-      self.assertTrue(response_message.ParseFromString(response_values[i]))
-      self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
-
-  def testVecHostPortManyParallelRpcs(self):
-    with self.test_session() as sess:
-      request_tensors = [
-          test_example_pb2.TestCase(
-              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
-      ]
-      many_response_tensors = [
-          self.rpc(
-              method=self.get_method_name('IncrementTestShapes'),
-              address=self._address,
-              request=request_tensors) for _ in range(10)
-      ]
-      # Launch parallel 10 calls to the RpcOp, each containing
-      # 20 rpc requests.
-      many_response_values = sess.run(many_response_tensors)
-    self.assertEqual(10, len(many_response_values))
-    for response_values in many_response_values:
-      self.assertEqual(response_values.shape, (20,))
-      for i in range(20):
-        response_message = test_example_pb2.TestCase()
-        self.assertTrue(response_message.ParseFromString(response_values[i]))
-        self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
-
-  def testVecHostPortRpcUsingEncodeAndDecodeProto(self):
-    with self.test_session() as sess:
-      request_tensors = encode_proto(
-          message_type='tensorflow.contrib.rpc.TestCase',
-          field_names=['shape'],
-          sizes=[[3]] * 20,
-          values=[
-              [[i, i + 1, i + 2] for i in range(20)],
-          ])
-      response_tensor_strings = self.rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=self._address,
-          request=request_tensors)
-      _, (response_shape,) = decode_proto(
-          bytes=response_tensor_strings,
-          message_type='tensorflow.contrib.rpc.TestCase',
-          field_names=['shape'],
-          output_types=[dtypes.int32])
-      response_shape_values = sess.run(response_shape)
-    self.assertAllEqual([[i + 1, i + 2, i + 3]
-                         for i in range(20)], response_shape_values)
-
-  def testVecHostPortRpcCancelsUponSessionTimeOutWhenSleepingForever(self):
-    with self.test_session() as sess:
-      request_tensors = [''] * 25  # This will launch 25 RPC requests.
-      response_tensors = self.rpc(
-          method=self.get_method_name('SleepForever'),
-          address=self._address,
-          request=request_tensors)
-      for timeout_ms in [1, 500, 1000]:
-        options = config_pb2.RunOptions(timeout_in_ms=timeout_ms)
-        with self.assertRaises((errors.UnavailableError,
-                                errors.DeadlineExceededError)):
-          sess.run(response_tensors, options=options)
-
-  def testVecHostPortRpcCancelsUponConfiguredTimeOutWhenSleepingForever(self):
-    with self.test_session() as sess:
-      request_tensors = [''] * 25  # This will launch 25 RPC requests.
-      response_tensors = self.rpc(
-          method=self.get_method_name('SleepForever'),
-          address=self._address,
-          timeout_in_ms=1000,
-          request=request_tensors)
-      with self.assertRaises(errors.DeadlineExceededError):
-        sess.run(response_tensors)
-
-  def testTryRpcPropagatesDeadlineErrorWithSometimesTimingOutRequests(self):
-    with self.test_session() as sess:
-      response_tensors, status_code, status_message = self.try_rpc(
-          method=self.get_method_name('SometimesSleepForever'),
-          timeout_in_ms=1000,
-          address=self._address,
-          request=[''] * 20)
-      self.assertEqual(response_tensors.shape, (20,))
-      self.assertEqual(status_code.shape, (20,))
-      self.assertEqual(status_message.shape, (20,))
-      status_code_values = sess.run(status_code)
-      self.assertTrue([
-          x in (errors.OK, errors.DEADLINE_EXCEEDED) for x in status_code_values
-      ])
-
-  def testTryRpcWithMultipleAddressesSingleRequest(self):
-    flatten = lambda x: list(itertools.chain.from_iterable(x))
-    with self.test_session() as sess:
-      addresses = flatten([[
-          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
-      ] for _ in range(10)])
-      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
-      response_tensors, status_code, _ = self.try_rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=addresses,
-          request=request)
-      response_tensors_values, status_code_values = sess.run((response_tensors,
-                                                              status_code))
-      self.assertAllEqual(
-          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
-          status_code_values)
-      for i in range(10):
-        self.assertTrue(response_tensors_values[2 * i])
-        self.assertFalse(response_tensors_values[2 * i + 1])
-
-  def testTryRpcWithMultipleMethodsSingleRequest(self):
-    flatten = lambda x: list(itertools.chain.from_iterable(x))
-    with self.test_session() as sess:
-      methods = flatten(
-          [[self.get_method_name('IncrementTestShapes'), 'InvalidMethodName']
-           for _ in range(10)])
-      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
-      response_tensors, status_code, _ = self.try_rpc(
-          method=methods, address=self._address, request=request)
-      response_tensors_values, status_code_values = sess.run((response_tensors,
-                                                              status_code))
-      self.assertAllEqual(
-          flatten([errors.OK, errors.UNIMPLEMENTED] for _ in range(10)),
-          status_code_values)
-      for i in range(10):
-        self.assertTrue(response_tensors_values[2 * i])
-        self.assertFalse(response_tensors_values[2 * i + 1])
-
-  def testTryRpcWithMultipleAddressesAndRequests(self):
-    flatten = lambda x: list(itertools.chain.from_iterable(x))
-    with self.test_session() as sess:
-      addresses = flatten([[
-          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
-      ] for _ in range(10)])
-      requests = [
-          test_example_pb2.TestCase(
-              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
-      ]
-      response_tensors, status_code, _ = self.try_rpc(
-          method=self.get_method_name('IncrementTestShapes'),
-          address=addresses,
-          request=requests)
-      response_tensors_values, status_code_values = sess.run((response_tensors,
-                                                              status_code))
-      self.assertAllEqual(
-          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
-          status_code_values)
-      for i in range(20):
-        if i % 2 == 1:
-          self.assertFalse(response_tensors_values[i])
-        else:
-          response_message = test_example_pb2.TestCase()
-          self.assertTrue(
-              response_message.ParseFromString(response_tensors_values[i]))
-          self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
deleted file mode 100644
index 7cbd636cb1..0000000000
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Test servicer for RpcOp tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import time
-
-import grpc
-
-from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
-from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
-
-
-class RpcOpTestServicer(test_example_pb2_grpc.TestCaseServiceServicer):
-  """Test servicer for RpcOp tests."""
-
-  def IncrementTestShapes(self, request, context):
-    """Increment the entries in the shape attribute of request.
-
-    Args:
-      request: input TestCase.
-      context: the rpc context.
-
-    Returns:
-      output TestCase.
-    """
-    for i in range(len(request.shape)):
-      request.shape[i] += 1
-    return request
-
-  def AlwaysFailWithInvalidArgument(self, request, context):
-    """Always fails with an InvalidArgument status.
-
-    Args:
-      request: input TestCase.
-      context: the rpc context.
-
-    Returns:
-      output TestCase.
-    """
-    del request
-    context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
-    context.set_details(rpc_op_test_base.I_WARNED_YOU)
-
-  def SometimesFailWithInvalidArgument(self, request, context):
-    """Sometimes fails with an InvalidArgument status.
-
-    Args:
-      request: input TestCase.
-      context: the rpc context.
-
-    Returns:
-      output TestCase.
-    """
-    if random.randint(0, 1) == 1:
-      context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
-      context.set_details(rpc_op_test_base.I_WARNED_YOU)
-    return request
-
-  def SleepForever(self, request, context):
-    """Sleeps forever.
-
-    Args:
-      request: input TestCase.
-      context: the rpc context.
-
-    Returns:
-      output TestCase.
-    """
-    # TODO(ebrevdo): Make this async wait like the stubby version.
-    time.sleep(5)
-
-  def SometimesSleepForever(self, request, context):
-    """Sometimes sleeps forever.
-
-    Args:
-      request: input TestCase.
-      context: the rpc context.
-
-    Returns:
-      output TestCase.
-    """
-    if random.randint(0, 1) == 1:
-      time.sleep(5)
-    return request
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
deleted file mode 100644
index 96f4550f62..0000000000
--- a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
+++ /dev/null
@@ -1,171 +0,0 @@
-// Test description and protos to work with it.
-//
-// Many of the protos in this file are for unit tests that haven't been written yet.
-
-syntax = "proto2";
-
-import "tensorflow/core/framework/types.proto";
-
-package tensorflow.contrib.rpc;
-
-// A TestCase holds a proto and a bunch of assertions
-// about how it should decode.
-message TestCase {
-  // A batch of primitives to be serialized and decoded.
-  repeated RepeatedPrimitiveValue primitive = 1;
-  // The shape of the batch.
-  repeated int32 shape = 2;
-  // Expected sizes for each field.
-  repeated int32 sizes = 3;
-  // Expected values for each field.
-  repeated FieldSpec field = 4;
-};
-
-service TestCaseService {
-  // Copy input, and increment each entry in 'shape' by 1.
-  rpc IncrementTestShapes(TestCase) returns (TestCase) {
-  }
-
-  // Sleep forever.
-  rpc SleepForever(TestCase) returns (TestCase) {
-  }
-
-  // Sleep forever 50% of the time, return immediately the other 50%.
-  rpc SometimesSleepForever(TestCase) returns (TestCase) {
-  }
-
-  // Always fails with InvalidArgument.
-  rpc AlwaysFailWithInvalidArgument(TestCase) returns (TestCase) {
-  }
-
-  // Fails with InvalidArgument 50% of the time.
-  rpc SometimesFailWithInvalidArgument(TestCase) returns (TestCase) {
-  }
-};
-
-// FieldSpec describes the expected output for a single field.
-message FieldSpec {
-  optional string name = 1;
-  optional tensorflow.DataType dtype = 2;
-  optional RepeatedPrimitiveValue expected = 3;
-};
-
-message TestValue {
-  optional PrimitiveValue primitive_value = 1;
-  optional EnumValue enum_value = 2;
-  optional MessageValue message_value = 3;
-  optional RepeatedMessageValue repeated_message_value = 4;
-  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
-}
-
-message PrimitiveValue {
-  optional double double_value = 1;
-  optional float float_value = 2;
-  optional int64 int64_value = 3;
-  optional uint64 uint64_value = 4;
-  optional int32 int32_value = 5;
-  optional fixed64 fixed64_value = 6;
-  optional fixed32 fixed32_value = 7;
-  optional bool bool_value = 8;
-  optional string string_value = 9;
-  optional bytes bytes_value = 12;
-  optional uint32 uint32_value = 13;
-  optional sfixed32 sfixed32_value = 15;
-  optional sfixed64 sfixed64_value = 16;
-  optional sint32 sint32_value = 17;
-  optional sint64 sint64_value = 18;
-}
-
-// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
-message RepeatedPrimitiveValue {
-  repeated double double_value = 1;
-  repeated float float_value = 2;
-  repeated int64 int64_value = 3;
-  repeated uint64 uint64_value = 4;
-  repeated int32 int32_value = 5;
-  repeated fixed64 fixed64_value = 6;
-  repeated fixed32 fixed32_value = 7;
-  repeated bool bool_value = 8;
-  repeated string string_value = 9;
-  repeated bytes bytes_value = 12;
-  repeated uint32 uint32_value = 13;
-  repeated sfixed32 sfixed32_value = 15;
-  repeated sfixed64 sfixed64_value = 16;
-  repeated sint32 sint32_value = 17;
-  repeated sint64 sint64_value = 18;
-  repeated PrimitiveValue message_value = 19;
-}
-
-// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
-// in the text format, but the binary serializion is different.
-// We test the packed representations by loading the same test cases
-// using this definition instead of RepeatedPrimitiveValue.
-// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
-// in every way except the packed=true declaration.
-message PackedPrimitiveValue {
-  repeated double double_value = 1 [packed = true];
-  repeated float float_value = 2 [packed = true];
-  repeated int64 int64_value = 3 [packed = true];
-  repeated uint64 uint64_value = 4 [packed = true];
-  repeated int32 int32_value = 5 [packed = true];
-  repeated fixed64 fixed64_value = 6 [packed = true];
-  repeated fixed32 fixed32_value = 7 [packed = true];
-  repeated bool bool_value = 8 [packed = true];
-  repeated string string_value = 9;
-  repeated bytes bytes_value = 12;
-  repeated uint32 uint32_value = 13 [packed = true];
-  repeated sfixed32 sfixed32_value = 15 [packed = true];
-  repeated sfixed64 sfixed64_value = 16 [packed = true];
-  repeated sint32 sint32_value = 17 [packed = true];
-  repeated sint64 sint64_value = 18 [packed = true];
-  repeated PrimitiveValue message_value = 19;
-}
-
-message EnumValue {
-  enum Color {
-    RED = 0;
-    ORANGE = 1;
-    YELLOW = 2;
-    GREEN = 3;
-    BLUE = 4;
-    INDIGO = 5;
-    VIOLET = 6;
-  };
-  optional Color enum_value = 14;
-  repeated Color repeated_enum_value = 15;
-}
-
-
-message InnerMessageValue {
-  optional float float_value = 2;
-  repeated bytes bytes_values = 8;
-}
-
-message MiddleMessageValue {
-  repeated int32 int32_values = 5;
-  optional InnerMessageValue message_value = 11;
-  optional uint32 uint32_value = 13;
-}
-
-message MessageValue {
-  optional double double_value = 1;
-  optional MiddleMessageValue message_value = 11;
-}
-
-message RepeatedMessageValue {
-  message NestedMessageValue {
-    optional float float_value = 2;
-    repeated bytes bytes_values = 8;
-  }
-
-  repeated NestedMessageValue message_values = 11;
-}
-
-// Message containing fields with field numbers higher than any field above. An
-// instance of this message is prepended to each binary message in the test to
-// exercise the code path that handles fields encoded out of order of field
-// number.
-message ExtraFields {
-  optional string string_value = 1776;
-  optional bool bool_value = 1777;
-}
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 44356e3438..4cfa25bf66 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,6 +1,7 @@
 # Platform-specific build configurations.
 
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
+load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
@@ -109,12 +110,6 @@ def _proto_cc_srcs(srcs, use_grpc_plugin=False):
     ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
   return ret
 
-def _proto_py_outs(srcs, use_grpc_plugin=False):
-  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
-  if use_grpc_plugin:
-    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
-  return ret
-
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
@@ -222,80 +217,6 @@ def cc_proto_library(
       hdrs=gen_hdrs,
       **kargs)
 
-# Re-defined protocol buffer rule to bring in the change introduced in commit
-# https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
-# which was not part of a stable protobuf release in 04/2018.
-# TODO(jsimsa): Remove this once the protobuf dependency version is updated
-# to include the above commit.
-def py_proto_library(
-        name,
-        srcs=[],
-        deps=[],
-        py_libs=[],
-        py_extra_srcs=[],
-        include=None,
-        default_runtime="@protobuf_archive//:protobuf_python",
-        protoc="@protobuf_archive//:protoc",
-        use_grpc_plugin=False,
-        **kargs):
-  """Bazel rule to create a Python protobuf library from proto source files
-
-  NOTE: the rule is only an internal workaround to generate protos. The
-  interface may change and the rule may be removed when bazel has introduced
-  the native rule.
-
-  Args:
-    name: the name of the py_proto_library.
-    srcs: the .proto files of the py_proto_library.
-    deps: a list of dependency labels; must be py_proto_library.
-    py_libs: a list of other py_library targets depended by the generated
-        py_library.
-    py_extra_srcs: extra source files that will be added to the output
-        py_library. This attribute is used for internal bootstrapping.
-    include: a string indicating the include path of the .proto files.
-    default_runtime: the implicitly default runtime which will be depended on by
-        the generated py_library target.
-    protoc: the label of the protocol compiler to generate the sources.
-    use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
-        when processing the proto files.
-    **kargs: other keyword arguments that are passed to cc_library.
-  """
-  outs = _proto_py_outs(srcs, use_grpc_plugin)
-
-  includes = []
-  if include != None:
-    includes = [include]
-
-  grpc_python_plugin = None
-  if use_grpc_plugin:
-    grpc_python_plugin = "//external:grpc_python_plugin"
-    # Note: Generated grpc code depends on Python grpc module. This dependency
-    # is not explicitly listed in py_libs. Instead, host system is assumed to
-    # have grpc installed.
-
-  proto_gen(
-      name=name + "_genproto",
-      srcs=srcs,
-      deps=[s + "_genproto" for s in deps],
-      includes=includes,
-      protoc=protoc,
-      gen_py=1,
-      outs=outs,
-      visibility=["//visibility:public"],
-      plugin=grpc_python_plugin,
-      plugin_language="grpc"
-  )
-
-  if default_runtime and not default_runtime in py_libs + deps:
-    py_libs = py_libs + [default_runtime]
-
-  native.py_library(
-      name=name,
-      srcs=outs+py_extra_srcs,
-      deps=py_libs+deps,
-      imports=includes,
-      **kargs)
-
 def tf_proto_library_cc(name, srcs = [], has_services = None,
                         protodeps = [],
                         visibility = [], testonly = 0,
@@ -340,7 +261,8 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   )
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
+                        testonly=0,
+                        srcs_version="PY2AND3"):
   py_proto_library(
       name = name + "_py",
       srcs = srcs,
@@ -350,7 +272,6 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
-      use_grpc_plugin = use_grpc_plugin,
   )
 
 def tf_jspb_proto_library(**kwargs):
@@ -389,7 +310,6 @@ def tf_proto_library(name, srcs = [], has_services = None,
       srcs_version = "PY2AND3",
       testonly = testonly,
       visibility = visibility,
-      use_grpc_plugin = has_services,
   )
 
 def tf_additional_lib_hdrs(exclude = []):
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 679d2735f9..376644718f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -75,7 +75,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
-    "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
     "//tensorflow/contrib/signal:test_util",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index dee2fcd0e1..72f446d359 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -763,10 +763,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "grpc_cpp_plugin",
       actual = "@grpc//:grpc_cpp_plugin",
   )
-  native.bind(
-      name = "grpc_python_plugin",
-      actual = "@grpc//:grpc_python_plugin",
-  )
 
   # gRPC has three empty C++ functions which it wants the user to define
   # at build time. https://github.com/grpc/grpc/issues/13590
-- 
GitLab


From 68f0f1aadb07ed1e7449b969d8807b5f662be33a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 12 Apr 2018 23:05:35 -0700
Subject: [PATCH 0748/1262] [XLA] Rename Interpreter{Executor,Platform} ->
 XlaInterpreter{Executor,Platform}.

These types live inside StreamExecutor's namespace, but they are
specific to XLA.  Therefore they either shouldn't live in SE's namespace
or should have "XLA" in the name.

Moving them out of SE's namespace is ugly, because almost every type
used inside of these headers then needs to be qualified.  So name-change
it is.

This patch was generated by a mechanical find/replace.

PiperOrigin-RevId: 192724238
---
 .../xla/service/interpreter/compiler.cc       |  8 ++--
 .../xla/service/interpreter/executor.cc       | 47 ++++++++++---------
 .../xla/service/interpreter/executor.h        | 10 ++--
 .../interpreter_transfer_manager.cc           |  4 +-
 .../xla/service/interpreter/platform.cc       | 33 +++++++------
 .../xla/service/interpreter/platform.h        |  8 ++--
 .../xla/service/interpreter/platform_id.cc    |  2 +-
 .../xla/service/interpreter/platform_id.h     |  2 +-
 8 files changed, 59 insertions(+), 55 deletions(-)

diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 9171e859c6..5b9bf5faf3 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -96,7 +96,7 @@ InterpreterCompiler::CompileAheadOfTime(
 }
 
 se::Platform::Id InterpreterCompiler::PlatformId() const {
-  return sep::kInterpreterPlatformId;
+  return sep::kXlaInterpreterPlatformId;
 }
 
 HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
@@ -109,11 +109,11 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(sep::kInterpreterPlatformId, []() {
+  xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
     return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
   });
-  xla::ComputationPlacer::RegisterComputationPlacer(sep::kInterpreterPlatformId,
-                                                    &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 68371910d7..3caf9e7b82 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -28,84 +28,85 @@ host::HostStream *AsExecutorStream(Stream *stream) {
   return dynamic_cast<host::HostStream *>(stream->implementation());
 }
 
-InterpreterExecutor::InterpreterExecutor(const PluginConfig &plugin_config)
+XlaInterpreterExecutor::XlaInterpreterExecutor(
+    const PluginConfig &plugin_config)
     : plugin_config_(plugin_config) {}
 
-InterpreterExecutor::~InterpreterExecutor() {}
+XlaInterpreterExecutor::~XlaInterpreterExecutor() {}
 
-void *InterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
+void *XlaInterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
 
-void *InterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
-                                             uint64 offset_bytes,
-                                             uint64 /*size_bytes*/) {
+void *XlaInterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
+                                                uint64 offset_bytes,
+                                                uint64 /*size_bytes*/) {
   return parent + offset_bytes;
 }
 
-void InterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
+void XlaInterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
   if (!mem->is_sub_buffer()) {
     delete[] static_cast<char *>(mem->opaque());
   }
 }
 
-bool InterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
-                                 const DeviceMemoryBase &dev_src, uint64 size) {
+bool XlaInterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
+                                    const DeviceMemoryBase &dev_src,
+                                    uint64 size) {
   AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
     port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
   });
   return true;
 }
 
-bool InterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
-                                 const void *host_src, uint64 size) {
+bool XlaInterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
+                                    const void *host_src, uint64 size) {
   AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
     port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
   });
   return true;
 }
 
-port::Status InterpreterExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
-                                                    const void *host_src,
-                                                    uint64 size) {
+port::Status XlaInterpreterExecutor::SynchronousMemcpy(
+    DeviceMemoryBase *dev_dst, const void *host_src, uint64 size) {
   memcpy(dev_dst->opaque(), host_src, size);
   return port::Status::OK();
 }
 
-port::Status InterpreterExecutor::SynchronousMemcpy(
+port::Status XlaInterpreterExecutor::SynchronousMemcpy(
     void *host_dst, const DeviceMemoryBase &dev_src, uint64 size) {
   memcpy(host_dst, dev_src.opaque(), size);
   return port::Status::OK();
 }
 
-bool InterpreterExecutor::HostCallback(Stream *stream,
-                                       std::function<void()> callback) {
+bool XlaInterpreterExecutor::HostCallback(Stream *stream,
+                                          std::function<void()> callback) {
   AsExecutorStream(stream)->EnqueueTask(callback);
   return true;
 }
 
-bool InterpreterExecutor::CreateStreamDependency(Stream *dependent,
-                                                 Stream *other) {
+bool XlaInterpreterExecutor::CreateStreamDependency(Stream *dependent,
+                                                    Stream *other) {
   AsExecutorStream(dependent)->EnqueueTask(
       [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsExecutorStream(dependent)->BlockUntilDone();
   return true;
 }
 
-bool InterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
+bool XlaInterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
   dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
   return true;
 }
 
-bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
+bool XlaInterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
   dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
   return true;
 }
 
-port::Status InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status XlaInterpreterExecutor::BlockHostUntilDone(Stream *stream) {
   AsExecutorStream(stream)->BlockUntilDone();
   return port::Status::OK();
 }
 
-DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
+DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
 
   builder.set_device_address_bits(64);
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index c5d07e906d..77426b0820 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Declares the InterpreterExecutor class, which is a CPU-only implementation of
-// the StreamExecutor interface. For now, this is used for testing and to
+// Declares the XlaInterpreterExecutor class, which is a CPU-only implementation
+// of the StreamExecutor interface. For now, this is used for testing and to
 // examine the performance of host-based StreamExecutor code.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
@@ -50,10 +50,10 @@ namespace interpreter {
 
 using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
 
-class InterpreterExecutor : public internal::StreamExecutorInterface {
+class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
  public:
-  explicit InterpreterExecutor(const PluginConfig &plugin_config);
-  ~InterpreterExecutor() override;
+  explicit XlaInterpreterExecutor(const PluginConfig &plugin_config);
+  ~XlaInterpreterExecutor() override;
 
   port::Status Init(int device_ordinal, DeviceOptions device_options) override {
     return port::Status::OK();
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
index cf98ecd774..3cf8506d1c 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
@@ -26,7 +26,7 @@ namespace sei = ::perftools::gputools::interpreter;
 namespace xla {
 
 InterpreterTransferManager::InterpreterTransferManager()
-    : GenericTransferManager(sei::kInterpreterPlatformId,
+    : GenericTransferManager(sei::kXlaInterpreterPlatformId,
                              /*pointer_size=*/sizeof(void*)) {}
 
 }  // namespace xla
@@ -38,7 +38,7 @@ CreateInterpreterTransferManager() {
 
 static bool InitModule() {
   xla::TransferManager::RegisterTransferManager(
-      sei::kInterpreterPlatformId, &CreateInterpreterTransferManager);
+      sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index a60e7fc59f..015e00e1e8 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -35,17 +35,19 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-InterpreterPlatform::InterpreterPlatform() : name_("Interpreter") {}
+XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
 
-InterpreterPlatform::~InterpreterPlatform() {}
+XlaInterpreterPlatform::~XlaInterpreterPlatform() {}
 
-Platform::Id InterpreterPlatform::id() const { return kInterpreterPlatformId; }
+Platform::Id XlaInterpreterPlatform::id() const {
+  return kXlaInterpreterPlatformId;
+}
 
-int InterpreterPlatform::VisibleDeviceCount() const { return 1; }
+int XlaInterpreterPlatform::VisibleDeviceCount() const { return 1; }
 
-const string& InterpreterPlatform::Name() const { return name_; }
+const string& XlaInterpreterPlatform::Name() const { return name_; }
 
-port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
+port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::ExecutorForDevice(
     int ordinal) {
   StreamExecutorConfig config;
   config.ordinal = ordinal;
@@ -55,7 +57,7 @@ port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
 }
 
 port::StatusOr<StreamExecutor*>
-InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
+XlaInterpreterPlatform::ExecutorForDeviceWithPluginConfig(
     int device_ordinal, const PluginConfig& plugin_config) {
   StreamExecutorConfig config;
   config.ordinal = device_ordinal;
@@ -64,16 +66,17 @@ InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
   return GetExecutor(config);
 }
 
-port::StatusOr<StreamExecutor*> InterpreterPlatform::GetExecutor(
+port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::GetExecutor(
     const StreamExecutorConfig& config) {
   return executor_cache_.GetOrCreate(
       config, [&]() { return GetUncachedExecutor(config); });
 }
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
-InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+XlaInterpreterPlatform::GetUncachedExecutor(
+    const StreamExecutorConfig& config) {
   auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<InterpreterExecutor>(config.plugin_config));
+      this, port::MakeUnique<XlaInterpreterExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
@@ -86,17 +89,17 @@ InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
   return std::move(executor);
 }
 
-void InterpreterPlatform::RegisterTraceListener(
+void XlaInterpreterPlatform::RegisterTraceListener(
     std::unique_ptr<TraceListener> listener) {
   LOG(FATAL) << "not yet implemented: register executor trace listener";
 }
 
-void InterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
+void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
   LOG(FATAL) << "not yet implemented: unregister executor trace listener";
 }
 
-static void InitializeInterpreterPlatform() {
-  std::unique_ptr<se::Platform> platform(new sep::InterpreterPlatform);
+static void InitializeXlaInterpreterPlatform() {
+  std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
   SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
@@ -105,7 +108,7 @@ static void InitializeInterpreterPlatform() {
 }  // namespace perftools
 
 REGISTER_MODULE_INITIALIZER(interpreter_platform,
-                            sep::InitializeInterpreterPlatform());
+                            sep::InitializeXlaInterpreterPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index c66ddb907d..2f71b29be4 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -27,10 +27,10 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-class InterpreterPlatform : public Platform {
+class XlaInterpreterPlatform : public Platform {
  public:
-  InterpreterPlatform();
-  ~InterpreterPlatform() override;
+  XlaInterpreterPlatform();
+  ~XlaInterpreterPlatform() override;
 
   Platform::Id id() const override;
 
@@ -60,7 +60,7 @@ class InterpreterPlatform : public Platform {
   // Cache of created StreamExecutors.
   ExecutorCache executor_cache_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(InterpreterPlatform);
+  SE_DISALLOW_COPY_AND_ASSIGN(XlaInterpreterPlatform);
 };
 
 }  // namespace interpreter
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
index 1a0373cf86..b7fb365b70 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
@@ -18,7 +18,7 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-PLATFORM_DEFINE_ID(kInterpreterPlatformId);
+PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);
 
 }  // namespace interpreter
 }  // namespace gputools
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h
index 905efef169..292f958449 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h
@@ -22,7 +22,7 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-extern const Platform::Id kInterpreterPlatformId;
+extern const Platform::Id kXlaInterpreterPlatformId;
 
 }  // namespace interpreter
 }  // namespace gputools
-- 
GitLab


From 73cc1d5b6f95ff56207e4c42b62d383c2427fb75 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 00:03:48 -0700
Subject: [PATCH 0749/1262] -- Add a new histogram/cdf computation method
 compatible with the TPU. -- Refactor utility functions into pruning_utils.py
 and add tests

PiperOrigin-RevId: 192727737
---
 tensorflow/contrib/model_pruning/BUILD        |  24 +-
 tensorflow/contrib/model_pruning/README.md    |   2 +-
 .../contrib/model_pruning/python/pruning.py   | 237 +++------------
 .../model_pruning/python/pruning_test.py      |  15 +-
 .../model_pruning/python/pruning_utils.py     | 269 ++++++++++++++++++
 .../python/pruning_utils_test.py              |  86 ++++++
 6 files changed, 430 insertions(+), 203 deletions(-)
 create mode 100644 tensorflow/contrib/model_pruning/python/pruning_utils.py
 create mode 100644 tensorflow/contrib/model_pruning/python/pruning_utils_test.py

diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index f50575b2cf..54bd39afac 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -71,6 +71,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "pruning_utils",
+    srcs = ["python/pruning_utils.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "pruning",
     srcs = ["python/pruning.py"],
@@ -78,9 +89,20 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":core_layers",
+        ":pruning_utils",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/python:platform",
-        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "pruning_utils_test",
+    size = "small",
+    srcs = ["python/pruning_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pruning_utils",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 52b659c69f..86f4fd6adf 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -45,7 +45,7 @@ The pruning library allows for specification of the following hyper parameters:
 | do_not_prune | list of strings | [""] | list of layers names that are not pruned |
 | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
-| nbins | integer | 255 | Number of bins to use for histogram computation |
+| nbins | integer | 256 | Number of bins to use for histogram computation |
 | block_height|integer | 1 | Number of rows in a block for block sparse matrices|
 | block_width |integer | 1 | Number of cols in a block for block sparse matrices|
 | block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)|
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 5146a4a2de..ea6032e588 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -33,12 +33,14 @@
   # Returns a list of all the weight tensors that have been masked
   get_weights()
 
-  The Pruning class uses a proto (defined in pruning.proto) to set up the
-  parameters for a pruning specification. Here's a typical usage:
+  The Pruning class uses a tf.hparams object to set up the
+  parameters for a model pruning. Here's a typical usage:
 
-  # Initialize a pruning spec from a proto
-  pruning_spec = '/tmp/pruning.pb'
-  p = Pruning(pruning_spec)
+  # Parse pruning hyperparameters
+  pruning_hparams = pruning.get_pruning_hparams().parse(FLAGS.pruning_hparams)
+
+  # Create a pruning object using the pruning_hparams
+  p = pruning.Pruning(pruning_hparams)
 
   # Add mask update ops to the graph
   mask_update_op = p.conditional_mask_update_op()
@@ -51,24 +53,20 @@
 
   # An object of the pruning also accepts externally defined sparsity:
   sparsity = tf.Variable(0.5, name = "ConstantSparsity")
-  pruning_spec = '/tmp/pruning.pb'
-  p = Pruning(pruning_spec, sparsity=sparsity)
-
+  p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
 """
 # pylint: disable=missing-docstring
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
+from tensorflow.contrib.model_pruning.python import pruning_utils
 from tensorflow.contrib.model_pruning.python.layers import core_layers as core
 from tensorflow.contrib.training.python.training import hparam
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
@@ -87,172 +85,18 @@ _WEIGHT_COLLECTION = core.WEIGHT_COLLECTION
 _MASKED_WEIGHT_NAME = core.MASKED_WEIGHT_NAME
 
 
-def _weight_mask_variable(var, scope):
-  """Create a mask for the weights.
-
-  This function adds a variable 'mask' to the graph.
-
-  Args:
-    var: the weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    the mask variable of the same size and shape as var, initialized to all 1s.
-  """
-  with variable_scope.variable_scope(scope):
-    mask = variable_scope.get_variable(
-        'mask',
-        var.get_shape(),
-        initializer=init_ops.ones_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-  return mask
-
-
-def _weight_threshold_variable(var, scope):
-  """Create a scalar threshold for the weights.
-
-  This function adds a variable
-  'threshold' to the graph.
-
-  Args:
-    var: The weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    a scalar threshold variable initialized to 0.
-  """
-  with variable_scope.variable_scope(scope):
-    threshold = variable_scope.get_variable(
-        'threshold', [],
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-    return threshold
-
-
-def _kronecker_product(mat1, mat2):
-  """Computes the Kronecker product of two matrices mat1 and mat2.
-
-  Args:
-    mat1: A matrix of size m x n
-    mat2: A matrix of size p x q
-  Returns:
-    Kronecker product of matrices mat1 and mat2 of size mp x nq
-  """
-
-  m1, n1 = mat1.get_shape().as_list()
-  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
-  m2, n2 = mat2.get_shape().as_list()
-  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
-  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
-
-
-def _histogram(values, value_range, nbins=100, dtype=np.int32, name=None):
-  """Return histogram of values.
-
-  Given the tensor `values`, this operation returns a rank 1 histogram counting
-  the number of entries in `values` that fell into every bin.  The bins are
-  equal width and determined by the arguments `value_range` and `nbins`.
-
-  Args:
-    values:  Numeric `Tensor`.
-    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
-      values <= value_range[0] will be mapped to hist[0],
-      values >= value_range[1] will be mapped to hist[-1].
-    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
-    dtype:  dtype for returned histogram.
-    name:  A name for this operation (defaults to 'histogram').
-
-  Returns:
-    A 1-D `Tensor` holding histogram of values.
-
-  """
-  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
-    values = ops.convert_to_tensor(values, name='values')
-    values = gen_array_ops.reshape(values, [-1])
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
-    nbins = ops.convert_to_tensor(nbins, dtype=np.int32, name='nbins')
-    nbins_float = math_ops.cast(nbins, values.dtype)
-
-    # Map tensor values that fall within value_range to [0, 1].
-    scaled_values = math_ops.truediv(
-        values - value_range[0],
-        value_range[1] - value_range[0],
-        name='scaled_values')
-
-    # map tensor values within the open interval value_range to {0,.., nbins-1},
-    # values outside the open interval will be zero or less, or nbins or more.
-    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
-
-    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
-    indices = math_ops.cast(
-        clip_ops.clip_by_value(indices, 0, nbins_float - 1), np.int32)
-
-    return math_ops.unsorted_segment_sum(
-        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
-
-
-def _determine_partitioned_axis(partitioned_variable):
-  partitioned_axis = 0
-  concatenated_variable_shape = partitioned_variable.get_shape()
-  for partition in partitioned_variable:
-    partition_shape = partition.get_shape()
-    maybe_partitioned_axis = np.less(partition_shape,
-                                     concatenated_variable_shape)
-    # Sanity check: make sure number of partitioned axis == 1
-    if np.count_nonzero(maybe_partitioned_axis) != 1:
-      raise ValueError('Number of partitioned axes %s not equal to 1' %
-                       np.count_nonzero(maybe_partitioned_axis))
-    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
-  return partitioned_axis
-
-
-def _variable_assign(var, new_value):
-  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
-
-
-def _partitioned_variable_assign(partitioned_var, new_value):
-  """Assign op for partitioned variables.
-
-  Args:
-    partitioned_var: A partitioned tensorflow variable
-    new_value: Value to be assigned to the variable var
-
-  Returns:
-    A tensorflow op that groups the assign ops for each of the variable slices
-  """
-  # Determine which axis was used to partition the variable. Currently
-  # tensorflow allows partitioning variable only along 1 axis.
-  axis = 0 if len(partitioned_var) == 1 else _determine_partitioned_axis(
-      partitioned_var)
-
-  partition_sizes = np.array(
-      [partition.get_shape()[axis] for partition in partitioned_var])
-  new_partitioned_values = array_ops.split(
-      new_value,
-      ops.convert_to_tensor(partition_sizes, dtype=np.int32),
-      axis=axis)
-  op_list = []
-  for partition in partitioned_var:
-    op_list.append(
-        _variable_assign(partition, new_partitioned_values[len(op_list)]))
-  return control_flow_ops.group(
-      *op_list, name=partitioned_var.name + '_group_assign')
-
-
 def apply_mask(x, scope=''):
   """Apply mask to a given weight tensor.
 
   Args:
     x: Input weight tensor
-    scope: The current variable scope. Defaults to ""
+    scope: The current variable scope. Defaults to "".
   Returns:
     Tensor representing masked_weights
   """
 
-  mask = _weight_mask_variable(x, scope)
-  threshold = _weight_threshold_variable(x, scope)
+  mask = pruning_utils.weight_mask_variable(x, scope)
+  threshold = pruning_utils.weight_threshold_variable(x, scope)
   # Add masked_weights in the weights namescope so as to make it easier
   # for the quantization library to add quant ops.
   masked_weights = math_ops.multiply(mask, x, _MASKED_WEIGHT_NAME)
@@ -335,6 +179,8 @@ def get_pruning_hparams():
     sparsity_function_exponent: float
       exponent = 1 is linearly varying sparsity between initial and final.
       exponent > 1 varies more slowly towards the end than the beginning
+    use_tpu: False
+      Indicates whether to use TPU
 
     We use the following sparsity function:
 
@@ -357,7 +203,7 @@ def get_pruning_hparams():
       do_not_prune=[''],
       threshold_decay=0.9,
       pruning_frequency=10,
-      nbins=255,
+      nbins=256,
       block_height=1,
       block_width=1,
       block_pooling_function='AVG',
@@ -365,7 +211,8 @@ def get_pruning_hparams():
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
       sparsity_function_end_step=100,
-      sparsity_function_exponent=3)
+      sparsity_function_exponent=3,
+      use_tpu=False)
 
 
 class Pruning(object):
@@ -414,7 +261,7 @@ class Pruning(object):
     if graph_global_step is None:
       graph_global_step = training_util.get_global_step()
 
-    return math_ops.cast(graph_global_step, np.int32)
+    return math_ops.cast(graph_global_step, dtypes.int32)
 
   def _setup_sparsity(self):
     begin_step = self._spec.sparsity_function_begin_step
@@ -429,13 +276,13 @@ class Pruning(object):
           (begin_step, end_step))
 
     with ops.name_scope(self._spec.name):
-      p = math_ops.minimum(1.0,
-                           math_ops.maximum(
-                               0.0,
-                               math_ops.div(
-                                   math_ops.cast(self._global_step - begin_step,
-                                                 np.float32),
-                                   end_step - begin_step)))
+      p = math_ops.minimum(
+          1.0,
+          math_ops.maximum(
+              0.0,
+              math_ops.div(
+                  math_ops.cast(self._global_step - begin_step, dtypes.float32),
+                  end_step - begin_step)))
       sparsity = math_ops.add(
           math_ops.multiply(initial_sparsity - target_sparsity,
                             math_ops.pow(1 - p, exponent)),
@@ -445,17 +292,18 @@ class Pruning(object):
     return sparsity
 
   def _setup_last_update_step(self):
-    with variable_scope.variable_scope(self._spec.name) as scope:
+    with variable_scope.variable_scope(
+        self._spec.name, use_resource=self._spec.use_tpu) as scope:
       try:
         last_update_step = variable_scope.get_variable(
             'last_mask_update_step', [],
             initializer=init_ops.zeros_initializer(),
             trainable=False,
-            dtype=np.int32)
+            dtype=dtypes.int32)
       except ValueError:
         scope.reuse_variables()
         last_update_step = variable_scope.get_variable(
-            'last_mask_update_step', dtype=np.int32)
+            'last_mask_update_step', dtype=dtypes.int32)
     return last_update_step
 
   def _exists_in_do_not_prune_list(self, tensor_name):
@@ -497,18 +345,16 @@ class Pruning(object):
     with ops.name_scope(weights.op.name + '_pruning_ops'):
       abs_weights = math_ops.abs(weights)
       max_value = math_ops.reduce_max(abs_weights)
-      histogram = _histogram(
-          abs_weights, [0.0, max_value],
-          nbins=self._spec.nbins,
-          dtype=np.float32)
+      cdf_fn = pruning_utils.compute_cdf_from_histogram
+      if self._spec.use_tpu:
+        cdf_fn = pruning_utils.compute_cdf
 
-      cdf = math_ops.cumsum(histogram)
-      norm_cdf = math_ops.div(cdf, math_ops.reduce_sum(histogram))
+      norm_cdf = cdf_fn(abs_weights, [0.0, max_value], nbins=self._spec.nbins)
       current_threshold = math_ops.multiply(
           math_ops.div(
               math_ops.reduce_sum(
                   math_ops.cast(
-                      math_ops.less(norm_cdf, self._sparsity), np.float32)),
+                      math_ops.less(norm_cdf, self._sparsity), dtypes.float32)),
               float(self._spec.nbins)), max_value)
 
       smoothed_threshold = math_ops.add_n([
@@ -516,7 +362,7 @@ class Pruning(object):
           math_ops.multiply(threshold, self._spec.threshold_decay)
       ])
       new_mask = math_ops.cast(
-          math_ops.greater(abs_weights, smoothed_threshold), np.float32)
+          math_ops.greater(abs_weights, smoothed_threshold), dtypes.float32)
     return smoothed_threshold, new_mask
 
   def _maybe_update_block_mask(self, weights, threshold):
@@ -572,8 +418,8 @@ class Pruning(object):
           new_mask,
           [pooled_weights.get_shape()[1],
            pooled_weights.get_shape()[2]])
-      updated_mask = _kronecker_product(reshaped_mask,
-                                        array_ops.ones(self._block_dim))
+      updated_mask = pruning_utils.kronecker_product(
+          reshaped_mask, array_ops.ones(self._block_dim))
       sliced_mask = array_ops.slice(
           updated_mask, [0, 0],
           [squeezed_weights.get_shape()[0],
@@ -608,11 +454,12 @@ class Pruning(object):
           continue
 
       new_threshold, new_mask = self._maybe_update_block_mask(weight, threshold)
-      self._assign_ops.append(_variable_assign(threshold, new_threshold))
+      self._assign_ops.append(
+          pruning_utils.variable_assign(threshold, new_threshold))
 
       self._assign_ops.append(
-          _partitioned_variable_assign(mask, new_mask)
-          if is_partitioned else _variable_assign(mask, new_mask))
+          pruning_utils.partitioned_variable_assign(mask, new_mask)
+          if is_partitioned else pruning_utils.variable_assign(mask, new_mask))
 
   def mask_update_op(self):
     with ops.name_scope(self._spec.name):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index 89e6571319..f80b7c52c0 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -110,12 +110,12 @@ class PruningTest(test.TestCase):
       self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
 
   def _blockMasking(self, hparams, weights, expected_mask):
 
     threshold = variables.Variable(0.0, name="threshold")
-    sparsity = variables.Variable(0.51, name="sparsity")
+    sparsity = variables.Variable(0.5, name="sparsity")
     test_spec = ",".join(hparams)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
 
@@ -138,7 +138,8 @@ class PruningTest(test.TestCase):
     weights_max = constant_op.constant(
         [[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
          [0.0, -0.3, 0.0, -0.4]])
-    expected_mask = [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]
+    expected_mask = [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
+                     [1., 1., 1., 1.], [1., 1., 1., 1.]]
 
     self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
                        expected_mask)
@@ -155,7 +156,8 @@ class PruningTest(test.TestCase):
     weights_max = constant_op.constant(
         [[[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
           [0.0, -0.3, 0.0, -0.4]]])
-    expected_mask = [[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]]
+    expected_mask = [[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
+                      [1., 1., 1., 1.], [1., 1., 1., 1.]]]
 
     self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
                        expected_mask)
@@ -178,11 +180,12 @@ class PruningTest(test.TestCase):
       masked_weights_val = masked_weights.eval()
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
 
   def testConditionalMaskUpdate(self):
     param_list = [
-        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
+        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6",
+        "nbins=100"
     ]
     test_spec = ",".join(param_list)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
new file mode 100644
index 0000000000..56d3dcef20
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -0,0 +1,269 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions for adding pruning related ops to the graph.
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
+_NBINS = 256
+
+
+def weight_mask_variable(var, scope):
+  """Create a mask for the weights.
+
+  This function adds a variable 'mask' to the graph.
+
+  Args:
+    var: the weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    the mask variable of the same size and shape as var, initialized to all 1s.
+  """
+  with variable_scope.variable_scope(scope):
+    mask = variable_scope.get_variable(
+        'mask',
+        var.get_shape(),
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+  return mask
+
+
+def weight_threshold_variable(var, scope):
+  """Create a scalar threshold for the weights.
+
+  This function adds a variable
+  'threshold' to the graph.
+
+  Args:
+    var: The weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    a scalar threshold variable initialized to 0.
+  """
+  with variable_scope.variable_scope(scope):
+    threshold = variable_scope.get_variable(
+        'threshold', [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+    return threshold
+
+
+def kronecker_product(mat1, mat2):
+  """Computes the Kronecker product of two matrices mat1 and mat2.
+
+  Args:
+    mat1: A matrix of size m x n
+    mat2: A matrix of size p x q
+  Returns:
+    Kronecker product of matrices mat1 and mat2 of size mp x nq
+  """
+
+  m1, n1 = mat1.get_shape().as_list()
+  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
+  m2, n2 = mat2.get_shape().as_list()
+  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
+  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
+
+
+def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
+  """Return histogram of values.
+
+  Given the tensor `values`, this operation returns a rank 1 histogram counting
+  the number of entries in `values` that fell into every bin.  The bins are
+  equal width and determined by the arguments `value_range` and `nbins`.
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+      values <= value_range[0] will be mapped to hist[0],
+      values >= value_range[1] will be mapped to hist[-1].
+    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
+    dtype:  dtype for returned histogram.
+    name:  A name for this operation (defaults to 'histogram').
+
+  Returns:
+    A 1-D `Tensor` holding histogram of values.
+
+  """
+  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
+    values = ops.convert_to_tensor(values, name='values')
+    values = array_ops.reshape(values, [-1])
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins_float = np.float32(nbins)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+
+    return math_ops.unsorted_segment_sum(
+        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
+
+
+def compute_cdf_from_histogram(values, value_range, **kwargs):
+  """Returns the normalized cumulative distribution of the given values tensor.
+
+  Computes the histogram and uses tf.cumsum to arrive at cdf
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+    **kwargs: keyword arguments: nbins, name
+
+  Returns:
+    A 1-D `Tensor` holding normalized cdf of values.
+
+  """
+  nbins = kwargs.get('nbins', _NBINS)
+  name = kwargs.get('name', None)
+  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
+    histogram = _histogram(
+        values, value_range, dtype=dtypes.float32, nbins=nbins)
+    cdf = math_ops.cumsum(histogram)
+    return math_ops.div(cdf, math_ops.reduce_max(cdf))
+
+
+def compute_cdf(values, value_range, **kwargs):
+  """Returns the normalized cumulative distribution of the given values tensor.
+
+  Uses tf.while_loop to directly compute the cdf of the values. Number of bins
+  for histogram is fixed at _NBINS=255
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`
+    **kwargs: keyword arguments: name
+
+  Returns:
+    A 1-D `Tensor` holding normalized cdf of values.
+
+  """
+  nbins = _NBINS
+  name = kwargs.get('name', None)
+  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
+    values = ops.convert_to_tensor(values, name='values')
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins_float = np.float32(nbins)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+
+    cdf = array_ops.zeros(nbins)
+    i = constant_op.constant(0)
+
+    def loop_cond(loop_count, _):
+      return math_ops.less(loop_count, nbins)
+
+    def loop_body(loop_count, cdf):
+      temp = math_ops.reduce_sum(
+          math_ops.cast(
+              math_ops.less_equal(indices, loop_count), dtypes.float32))
+      cdf = math_ops.add(
+          cdf,
+          array_ops.one_hot(
+              loop_count, depth=_NBINS, on_value=temp, off_value=0.0))
+      return [loop_count + 1, cdf]
+
+    _, cdf = control_flow_ops.while_loop(
+        loop_cond, loop_body, [i, cdf], maximum_iterations=nbins)
+
+    return math_ops.div(cdf, math_ops.reduce_max(cdf))
+
+
+def determine_partitioned_axis(partitioned_variable):
+  partitioned_axis = 0
+  concatenated_variable_shape = partitioned_variable.get_shape()
+  for partition in partitioned_variable:
+    partition_shape = partition.get_shape()
+    maybe_partitioned_axis = np.less(partition_shape,
+                                     concatenated_variable_shape)
+    # Sanity check: make sure number of partitioned axis == 1
+    if np.count_nonzero(maybe_partitioned_axis) != 1:
+      raise ValueError('Number of partitioned axes %s not equal to 1' %
+                       np.count_nonzero(maybe_partitioned_axis))
+    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
+  return partitioned_axis
+
+
+def variable_assign(var, new_value):
+  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
+
+
+def partitioned_variable_assign(partitioned_var, new_value):
+  """Assign op for partitioned variables.
+
+  Args:
+    partitioned_var: A partitioned tensorflow variable
+    new_value: Value to be assigned to the variable var
+
+  Returns:
+    A tensorflow op that groups the assign ops for each of the variable slices
+  """
+  # Determine which axis was used to partition the variable. Currently
+  # tensorflow allows partitioning variable only along 1 axis.
+  axis = 0 if len(partitioned_var) == 1 else determine_partitioned_axis(
+      partitioned_var)
+
+  partition_sizes = np.array(
+      [partition.get_shape()[axis] for partition in partitioned_var])
+  new_partitioned_values = array_ops.split(
+      new_value,
+      ops.convert_to_tensor(partition_sizes, dtype=dtypes.int32),
+      axis=axis)
+  op_list = []
+  for partition in partitioned_var:
+    op_list.append(
+        variable_assign(partition, new_partitioned_values[len(op_list)]))
+  return control_flow_ops.group(
+      *op_list, name=partitioned_var.name + '_group_assign')
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
new file mode 100644
index 0000000000..10e1dd0a8e
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -0,0 +1,86 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utility functions in pruning_utils.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.model_pruning.python import pruning_utils
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class PruningUtilsTest(test.TestCase):
+
+  def testHistogram(self):
+    width = 10
+    height = 10
+    nbins = 100
+    expected_histogram = np.full(nbins, 1.0)
+    init = init_ops.constant_initializer(np.linspace(0.0, 1.0, width * height))
+    weights = variable_scope.get_variable(
+        "weights", [width, height], initializer=init)
+    histogram = pruning_utils._histogram(
+        weights, [0, 1.0], nbins, dtype=np.float32)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      computed_histogram = histogram.eval()
+    self.assertAllEqual(expected_histogram, computed_histogram)
+
+  def testCDF(self):
+    nbins = 5
+    weights = constant_op.constant([-1, 0, 1, 1.5, 2, 3, 4, 5, 10, 100])
+    abs_weights = math_ops.abs(weights)
+    norm_cdf = pruning_utils.compute_cdf_from_histogram(
+        abs_weights, [0.0, 5.0], nbins=nbins)
+    expected_cdf = np.array([0.1, 0.4, 0.5, 0.6, 1.0], dtype=np.float32)
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      norm_cdf_val = sess.run(norm_cdf)
+      self.assertAllEqual(len(norm_cdf_val), nbins)
+      self.assertAllEqual(expected_cdf, norm_cdf_val)
+
+  def _compare_cdf(self, values):
+    abs_values = math_ops.abs(values)
+    max_value = math_ops.reduce_max(abs_values)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
+          abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
+      cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
+      return cdf.eval(), cdf_from_histogram.eval()
+
+  def testCDFEquivalence2D(self):
+    width = 100
+    height = 100
+    weights = variable_scope.get_variable("weights", shape=[width, height])
+    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
+    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+
+  def testCDFEquivalence4D(self):
+    weights = variable_scope.get_variable("weights", shape=[5, 5, 128, 128])
+    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
+    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From 1b0c277405171a34c7f41e17cd76459dc36f7f82 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 00:12:41 -0700
Subject: [PATCH 0750/1262] Implementation of Less

PiperOrigin-RevId: 192728635
---
 tensorflow/contrib/lite/builtin_ops.h         |   1 +
 .../lite/g3doc/tf_ops_compatibility.md        |  13 ++
 tensorflow/contrib/lite/kernels/BUILD         |  19 +++
 .../contrib/lite/kernels/comparisons.cc       | 119 +++++++++++++++++
 .../contrib/lite/kernels/comparisons_test.cc  |  98 ++++++++++++++
 .../internal/reference/reference_ops.h        |  45 +++++++
 tensorflow/contrib/lite/kernels/register.cc   |   2 +
 tensorflow/contrib/lite/model.cc              |   3 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |   5 +
 .../contrib/lite/schema/schema_generated.h    | 124 +++++++++++++++++-
 tensorflow/contrib/lite/testing/BUILD         |   1 +
 .../contrib/lite/testing/generate_examples.py |  33 +++++
 .../testing/generated_examples_zip_test.cc    |   1 +
 .../contrib/lite/toco/tflite/operator.cc      |   2 +
 .../contrib/lite/toco/tflite/operator_test.cc |   2 +
 16 files changed, 463 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/comparisons.cc
 create mode 100644 tensorflow/contrib/lite/kernels/comparisons_test.cc

diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 1ceefafc56..859bc7ab70 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -82,6 +82,7 @@ typedef enum {
   kTfLiteBuiltinMaximum = 55,
   kTfLiteBuiltinArgMax = 56,
   kTfLiteBuiltinMinimum = 57,
+  kTfLiteBuiltinLess = 58,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 61ea5231e3..203924f03d 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -302,6 +302,19 @@ Options {
 }
 ```
 
+**LESS**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is less
+  than the corresponding element of the second tensor.
+}
+```
+
 **LOCAL_RESPONSE_NORMALIZATION**
 
 ```
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 914893cd90..800e2a9558 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -136,6 +136,7 @@ cc_library(
         "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
         "cast.cc",
+        "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
@@ -818,6 +819,24 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "comparisons_test",
+    size = "small",
+    srcs = [
+        "comparisons_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios_arm64",
+        "tflite_not_portable_ios_x86_64",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc
new file mode 100644
index 0000000000..87c413cb98
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/comparisons.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace comparisons {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus LessPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Don't support string and bool.
+  TF_LITE_ENSURE(context,
+                 input1->type != kTfLiteString || input1->type != kTfLiteBool);
+  // Currently only support tensors have the same type.
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = kTfLiteBool;
+
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+
+#define TF_LITE_LESS(type, opname)                                          \
+  reference_ops::opname(GetTensorData<type>(input1), GetTensorDims(input1), \
+                        GetTensorData<type>(input2), GetTensorDims(input2), \
+                        GetTensorData<bool>(output), GetTensorDims(output));
+
+  // TODO(renjieliu): Support quantized data.
+  if (requires_broadcast) {
+    switch (input1->type) {
+      case kTfLiteFloat32:
+        TF_LITE_LESS(float, BroadcastLess);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_LESS(int32_t, BroadcastLess);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_LESS(int64_t, BroadcastLess);
+        break;
+      default:
+        context->ReportError(context,
+                             "Does not support type other than float|int");
+        return kTfLiteError;
+    }
+  } else {
+    switch (input1->type) {
+      case kTfLiteFloat32:
+        TF_LITE_LESS(float, Less);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_LESS(int32_t, Less);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_LESS(int64_t, Less);
+        break;
+      default:
+        context->ReportError(context,
+                             "Does not support type other than float|int");
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_LESS
+  return kTfLiteOk;
+}
+
+}  // namespace comparisons
+
+TfLiteRegistration* Register_LESS() {
+  static TfLiteRegistration r = {nullptr, nullptr, comparisons::LessPrepare,
+                                 comparisons::LessEval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc
new file mode 100644
index 0000000000..da2d7f8589
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LessOpModel : public SingleOpModel {
+ public:
+  LessOpModel(std::initializer_list<int> input1_shape,
+              std::initializer_list<int> input2_shape, TensorType input_type) {
+    input1_ = AddInput(input_type);
+    input2_ = AddInput(input_type);
+    output_ = AddOutput(TensorType_BOOL);
+    SetBuiltinOp(BuiltinOperator_LESS, BuiltinOptions_LessOptions,
+                 CreateLessOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, LessFloat) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessInt) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 6, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessBroadcast) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessBroadcastTwoD) {
+  LessOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 6, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true,
+                                                   true, false, false, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index c6019390f2..6a89dbc803 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3378,6 +3378,51 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Less(int64_t num_elements, const T* input1, const T* input2,
+                 bool* output) {
+  for (int64_t i = 0; i < num_elements; ++i) {
+    output[i] = input1[i] < input2[i];
+  }
+}
+
+template <typename T>
+inline void Less(const T* input1_data, const Dims<4>& input1_dims,
+                 const T* input2_data, const Dims<4>& input2_dims,
+                 bool* output_data, const Dims<4>& output_dims) {
+  const int64_t batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int64_t height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int64_t width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int64_t depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  Less(batches * height * width * depth, input1_data, input2_data, output_data);
+}
+
+template <typename T1, typename T2>
+inline void BroadcastLess(T1* input1_data, const Dims<4>& input1_dims,
+                          T2* input2_data, const Dims<4>& input2_dims,
+                          bool* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastLess");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] <
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 67ba8d0f39..b07e7b6ff3 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -79,6 +79,7 @@ TfLiteRegistration* Register_PRELU();
 TfLiteRegistration* Register_MAXIMUM();
 TfLiteRegistration* Register_MINIMUM();
 TfLiteRegistration* Register_ARG_MAX();
+TfLiteRegistration* Register_LESS();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -139,6 +140,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
   AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
   AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 0b65884025..54b1460173 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -665,6 +665,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_LESS: {
+      break;
+    }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 08fb820767..eab82ea8ef 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -353,6 +353,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_MAXIMUM:
       case tflite::BuiltinOperator_MINIMUM:
       case tflite::BuiltinOperator_ARG_MAX:
+      case tflite::BuiltinOperator_LESS:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index fa825500fd..93980b15f0 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -135,6 +135,7 @@ enum BuiltinOperator : byte {
   MAXIMUM = 55,
   ARG_MAX = 56,
   MINIMUM = 57,
+  LESS = 58,
 }
 
 // Options for the builtin operators.
@@ -179,6 +180,7 @@ union BuiltinOptions {
   DequantizeOptions,
   MaximumMinimumOptions,
   ArgMaxOptions,
+  LessOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -399,6 +401,9 @@ table ArgMaxOptions {
   output_type : TensorType;
 }
 
+table LessOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 909c4ccb3b..b2a799d0ef 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -151,6 +151,9 @@ struct MaximumMinimumOptionsT;
 struct ArgMaxOptions;
 struct ArgMaxOptionsT;
 
+struct LessOptions;
+struct LessOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -267,11 +270,12 @@ enum BuiltinOperator {
   BuiltinOperator_MAXIMUM = 55,
   BuiltinOperator_ARG_MAX = 56,
   BuiltinOperator_MINIMUM = 57,
+  BuiltinOperator_LESS = 58,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_MINIMUM
+  BuiltinOperator_MAX = BuiltinOperator_LESS
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[56] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[57] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -328,7 +332,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[56] {
     BuiltinOperator_PRELU,
     BuiltinOperator_MAXIMUM,
     BuiltinOperator_ARG_MAX,
-    BuiltinOperator_MINIMUM
+    BuiltinOperator_MINIMUM,
+    BuiltinOperator_LESS
   };
   return values;
 }
@@ -393,6 +398,7 @@ inline const char **EnumNamesBuiltinOperator() {
     "MAXIMUM",
     "ARG_MAX",
     "MINIMUM",
+    "LESS",
     nullptr
   };
   return names;
@@ -445,11 +451,12 @@ enum BuiltinOptions {
   BuiltinOptions_DequantizeOptions = 38,
   BuiltinOptions_MaximumMinimumOptions = 39,
   BuiltinOptions_ArgMaxOptions = 40,
+  BuiltinOptions_LessOptions = 41,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_ArgMaxOptions
+  BuiltinOptions_MAX = BuiltinOptions_LessOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[41] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[42] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -491,7 +498,8 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[41] {
     BuiltinOptions_CastOptions,
     BuiltinOptions_DequantizeOptions,
     BuiltinOptions_MaximumMinimumOptions,
-    BuiltinOptions_ArgMaxOptions
+    BuiltinOptions_ArgMaxOptions,
+    BuiltinOptions_LessOptions
   };
   return values;
 }
@@ -539,6 +547,7 @@ inline const char **EnumNamesBuiltinOptions() {
     "DequantizeOptions",
     "MaximumMinimumOptions",
     "ArgMaxOptions",
+    "LessOptions",
     nullptr
   };
   return names;
@@ -713,6 +722,10 @@ template<> struct BuiltinOptionsTraits<ArgMaxOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions;
 };
 
+template<> struct BuiltinOptionsTraits<LessOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LessOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1064,6 +1077,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_ArgMaxOptions ?
       reinterpret_cast<const ArgMaxOptionsT *>(value) : nullptr;
   }
+  LessOptionsT *AsLessOptions() {
+    return type == BuiltinOptions_LessOptions ?
+      reinterpret_cast<LessOptionsT *>(value) : nullptr;
+  }
+  const LessOptionsT *AsLessOptions() const {
+    return type == BuiltinOptions_LessOptions ?
+      reinterpret_cast<const LessOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -3927,6 +3948,46 @@ inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(
 
 flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct LessOptionsT : public flatbuffers::NativeTable {
+  typedef LessOptions TableType;
+  LessOptionsT() {
+  }
+};
+
+struct LessOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LessOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  LessOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LessOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LessOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LessOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit LessOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LessOptionsBuilder &operator=(const LessOptionsBuilder &);
+  flatbuffers::Offset<LessOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LessOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LessOptions> CreateLessOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  LessOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -4164,6 +4225,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const {
     return builtin_options_type() == BuiltinOptions_ArgMaxOptions ? static_cast<const ArgMaxOptions *>(builtin_options()) : nullptr;
   }
+  const LessOptions *builtin_options_as_LessOptions() const {
+    return builtin_options_type() == BuiltinOptions_LessOptions ? static_cast<const LessOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -4350,6 +4414,10 @@ template<> inline const ArgMaxOptions *Operator::builtin_options_as<ArgMaxOption
   return builtin_options_as_ArgMaxOptions();
 }
 
+template<> inline const LessOptions *Operator::builtin_options_as<LessOptions>() const {
+  return builtin_options_as_LessOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -5933,6 +6001,29 @@ inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatB
       _output_type);
 }
 
+inline LessOptionsT *LessOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LessOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LessOptions::UnPackTo(LessOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<LessOptions> LessOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLessOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LessOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateLessOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -6273,6 +6364,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -6451,6 +6546,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -6617,6 +6716,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const ArgMaxOptionsT *>(value);
       return CreateArgMaxOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptionsT *>(value);
+      return CreateLessOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -6783,6 +6886,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new ArgMaxOptionsT(*reinterpret_cast<ArgMaxOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_LessOptions: {
+      value = new LessOptionsT(*reinterpret_cast<LessOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -6990,6 +7097,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<LessOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 2c226e76d4..bd888a415b 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -34,6 +34,7 @@ gen_zipped_test_files(
         "global_batch_norm.zip",
         "l2_pool.zip",
         "l2norm.zip",
+        "less.zip",
         "local_response_norm.zip",
         "log_softmax.zip",
         "max_pool.zip",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 4b4ccc0c37..53b41d2358 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1997,6 +1997,39 @@ def make_arg_max_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_less_tests(zip_path):
+  """Make a set of tests to do less."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the less op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.less(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 84ae1d58fe..9da8bd7a28 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -280,6 +280,7 @@ INSTANTIATE_TESTS(squeeze)
 INSTANTIATE_TESTS(strided_slice)
 INSTANTIATE_TESTS(sub)
 INSTANTIATE_TESTS(transpose)
+INSTANTIATE_TESTS(less)
 
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 0e057fd252..f41a312b47 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -895,6 +895,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "MAXIMUM", OperatorType::kTensorFlowMaximum));
   ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
       "MINIMUM", OperatorType::kTensorFlowMinimum));
+  ops.emplace_back(new SimpleOperator<TensorFlowLessOperator>(
+      "LESS", OperatorType::kTensorFlowLess));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index a947630e28..36ed741541 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -113,6 +113,8 @@ TEST_F(OperatorTest, SimpleOperators) {
       "MAXIMUM", OperatorType::kTensorFlowMaximum);
   CheckSimpleOperator<TensorFlowMinimumOperator>(
       "MINIMUM", OperatorType::kTensorFlowMinimum);
+  CheckSimpleOperator<TensorFlowLessOperator>("LESS",
+                                              OperatorType::kTensorFlowLess);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
-- 
GitLab


From 786668c8300f8f88c21493ecfa500a097a80ccd8 Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Fri, 13 Apr 2018 04:21:15 -0700
Subject: [PATCH 0751/1262] updated installation instructions for
 Tensowflow-TensorRT integration (#18135)

* updated installation instructions for Tensowflow-TensorRT integration

* Minor format changes to clean it up.

* Adding the python symlink command for devel packages too.

* Forcing the symlink creation.

* Updating the sed command for docker parameterized build.
---
 tensorflow/contrib/tensorrt/README.md        | 60 +++++---------------
 tensorflow/docs_src/install/install_linux.md | 36 +++++++++---
 2 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 6eafc1754c..687dee07e1 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,59 +1,29 @@
 # Using TensorRT in TensorFlow
 
-
-This module provides necessary bindings and introduces TRT_engine_op
-operator that wraps a subgraph in TensorRT. This is still a work in progress
-but should be useable with most common graphs.
+This module provides necessary bindings and introduces TRT_engine_op operator
+that wraps a subgraph in TensorRT. This is still a work in progress but should
+be useable with most common graphs.
 
 ## Compilation
 
-
-In order to compile the module, you need to have a local TensorRT
-installation ( libnvinfer.so and respective include files ). During the
-configuration step, TensorRT should be enabled and installation path
-should be set. If installed through package managers (deb,rpm),
-configure script should find the necessary components from the system
-automatically. If installed from tar packages, user has to set path to
-location where the library is installed during configuration.
+In order to compile the module, you need to have a local TensorRT installation
+(libnvinfer.so and respective include files). During the configuration step,
+TensorRT should be enabled and installation path should be set. If installed
+through package managers (deb,rpm), configure script should find the necessary
+components from the system automatically. If installed from tar packages, user
+has to set path to location where the library is installed during configuration.
 
 ```shell
 bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
-After the installation of tensorflow package, TensorRT transformation
-will be available. An example use can be found in test/test_tftrt.py script
+After the installation of tensorflow package, TensorRT transformation will be
+available. An example use can be found in test/test_tftrt.py script
 
 ## Installing TensorRT 3.0.4
 
-In order to make use of TensorRT integration, you will need a local installation of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt). Due to compiler compatibility, you will need to download and install the TensorRT 3.0.4 tarball for _Ubuntu 14.04_, i.e., **_TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz_**, even if you are using Ubuntu 16.04 or later.
-
-### Preparing TensorRT installation
-
-Once you have downloaded TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz, you will need to unpack it to an installation directory, which will be referred to as <install_dir>. Please replace <install_dir> with the full path of actual installation directory you choose in commands below.
-
-```shell
-cd <install_dir> && tar -zxf /path/to/TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz
-```
-
-After unpacking the binaries, you have several options to use them:
-
-#### To run TensorFlow as a user without superuser privileges
-
-For a regular user without any sudo rights, you should add TensorRT to your `$LD_LIBRARY_PATH`:
-
-  ```shell
-   export LD_LIBRARY_PATH=<install_dir>/TensorRT-3.0.4/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
-  ```
-
-Then you are ready to use TensorFlow-TensorRT integration. `$LD_LIBRARY_PATH` must contain the path to TensorRT installation for TensorFlow-TensorRT integration to work. If you are using a VirtualEnv-like setup, you can add the command above to your `bin/activate` script or to your `.bashrc` script.
-
-#### To run TensorFlow as a superuser
-
- When running as a superuser, such as in a container or via sudo, the `$LD_LIBRARY_PATH` approach above may not work. The following is preferred when the user has superuser privileges:
-
-  ```shell
-  echo "<install_dir>/TensorRT-3.0.4/lib" | sudo tee /etc/ld.so.conf.d/tensorrt304.conf && sudo ldconfig
-  ```
-
-  Please ensure that any existing deb package installation of TensorRT is removed before following these instructions to avoid package conflicts.
\ No newline at end of file
+In order to make use of TensorRT integration, you will need a local installation
+of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow Installation page](https://www.tensorflow.org/install/install_linux#nvidia_requirements_to_run_tensorflow_with_gpu_support).
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 04e4242b0f..58f6c60287 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -65,16 +65,38 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
+
   * **[OPTIONAL]**  For optimized inferencing performance, you can also install
-    NVIDIA TensorRT 3.0. For details, see
-    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
-    Only steps 1-4 in the TensorRT Tar File installation instructions are
-    required for compatibility with TensorFlow; the Python package installation
-    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+    **NVIDIA TensorRT 3.0**. The minimal set of TensorRT runtime components needed
+    for use with the pre-built `tensorflow-gpu` package can be installed as follows:
+
+    <pre>
+    $ <b>wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
+    $ <b>sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
+    $ <b>sudo apt-get update</b>
+    $ <b>sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</b>
+    </pre>
 
     **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
-    package, please use the Ubuntu **14.04** tar file package of TensorRT
-    even when installing onto an Ubuntu 16.04 system.   
+    package, please use the Ubuntu **14.04** package of TensorRT as shown above,
+    even when installing onto an Ubuntu 16.04 system.<br/>
+    <br/>
+    To build the TensorFlow-TensorRT integration module from source rather than
+    using pre-built binaries, see the [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
+    For detailed TensorRT installation instructions, see [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).<br/>
+    <br/>
+    To avoid cuDNN version conflicts during later system upgrades, you can hold
+    the cuDNN version at 7.0.5:
+
+    <pre>
+    $ <b> sudo apt-mark hold libcudnn7 libcudnn7-dev</b>
+    </pre>
+
+    To later allow upgrades, you can remove the hold:
+
+    <pre>
+    $ <b> sudo apt-mark unhold libcudnn7 libcudnn7-dev</b>
+    </pre>
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
-- 
GitLab


From bb804104e27400b5e0497cf6c60f4a46a7402d23 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 04:44:10 -0700
Subject: [PATCH 0752/1262] Fix bug in converted_call, and add tests for it.

PiperOrigin-RevId: 192751211
---
 tensorflow/contrib/autograph/impl/api.py      |  2 +-
 tensorflow/contrib/autograph/impl/api_test.py | 86 +++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index a553813e19..a00d9c68dc 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -156,7 +156,7 @@ def converted_call(f, recursive, verbose, arg_types, *args, **kwargs):
     # Constructors
     target_entity = f
     arg_map_target = f.__init__
-    effective_args = (unknown_arg_value,) + args
+    effective_args = args
     partial_types = ()
 
   elif hasattr(f, '__call__') and hasattr(f, '__class__'):
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index f9db07778a..2e09d19621 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -179,6 +179,92 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
+  def test_converted_call_builtin(self):
+    x = api.converted_call(range, False, False, {}, 3)
+    self.assertEqual((0, 1, 2), tuple(x))
+
+  def test_converted_call_function(self):
+
+    def test_fn(x):
+      if x < 0:
+        return -x
+      return x
+
+    with self.test_session() as sess:
+      x = api.converted_call(
+          test_fn, False, False, {}, constant_op.constant(-1))
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_method(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(tc.test_method, False, False, {}, tc)
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_method_by_class(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(TestClass.test_method, False, False, {}, tc)
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_callable_object(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def __call__(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(tc, False, False, {})
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_constructor(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = api.converted_call(
+          TestClass, False, False, {}, constant_op.constant(-1))
+      # tc is now a converted object.
+      x = tc.test_method()
+      self.assertEqual(1, sess.run(x))
+
   def test_to_graph_basic(self):
 
     def test_fn(x, s):
-- 
GitLab


From b520022c95b246749fa3f63ca818058c22944720 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 05:05:12 -0700
Subject: [PATCH 0753/1262] Update for upstream LLVM *.def -> *.inc rename

PiperOrigin-RevId: 192752798
---
 .../xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index defd281d74..df9d9be889 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
-- 
GitLab


From 345414cb4fa43af5906adc64a380986eaade4f53 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 05:47:21 -0700
Subject: [PATCH 0754/1262] - Fixed small bug in example script

PiperOrigin-RevId: 192756152
---
 tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
index 3aa52aff19..2c1f099360 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
@@ -32,7 +32,7 @@ flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
 
 
 def main(unused_argv):
-  convnet.train_mnist_single_gpu(FLAGS.data_dir, num_epochs=200)
+  convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
 
 
 if __name__ == "__main__":
-- 
GitLab


From bb8fcd516ebd0a11e1768d308d3aa265b9ad50d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 06:53:54 -0700
Subject: [PATCH 0755/1262] Keep function doc string at the top of the
 function.

PiperOrigin-RevId: 192761604
---
 .../autograph/converters/name_scopes.py       | 20 +++++++++----
 .../autograph/converters/name_scopes_test.py  | 30 ++++++++++++++++---
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/name_scopes.py b/tensorflow/contrib/autograph/converters/name_scopes.py
index 280bc4c314..dfee529aba 100644
--- a/tensorflow/contrib/autograph/converters/name_scopes.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Wraps a function body with a `name_scope` of the function name.
-"""
+"""Wraps a function body with a `name_scope` of the function name."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -48,15 +47,26 @@ class FunctionNameScopeTransformer(transformer.Base):
     return name
 
   def visit_FunctionDef(self, node):
-    self.generic_visit(node)
+    node = self.generic_visit(node)
+
+    unscoped_body = []
+    scoped_body = node.body
+    if scoped_body:
+      first = scoped_body[0]
+      if isinstance(first, gast.Expr) and isinstance(first.value, gast.Str):
+        # Skip any docstring.
+        unscoped_body = scoped_body[:1]
+        scoped_body = scoped_body[1:]
+
     template = """
       with tf.name_scope(scope_name):
         body
     """
-    node.body = templates.replace(
+    scoped_body = templates.replace(
         template,
         scope_name=gast.Str(self._name_for_current_scope()),
-        body=node.body)
+        body=scoped_body)
+    node.body = unscoped_body + scoped_body
     return node
 
 
diff --git a/tensorflow/contrib/autograph/converters/name_scopes_test.py b/tensorflow/contrib/autograph/converters/name_scopes_test.py
index 2c2b6bbbec..17692cbd88 100644
--- a/tensorflow/contrib/autograph/converters/name_scopes_test.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes_test.py
@@ -27,9 +27,10 @@ from tensorflow.python.platform import test
 
 class FunctionNameScopeTransformer(converter_test_base.TestCase):
 
-  def test_basic_name(self):
+  def test_basic(self):
 
     def test_fn(l):
+      """This should stay here."""
       a = 5
       l += a
       return l
@@ -41,7 +42,28 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
       result_op = result.test_fn(constant_op.constant(1))
       self.assertIn('test_fn/', result_op.op.name)
 
-  def test_nested_name(self):
+      self.assertEqual('This should stay here.', result.test_fn.__doc__)
+
+  def test_long_docstring(self):
+
+    def test_fn(l):
+      """Multi-line docstring.
+
+      Args:
+        l: A thing.
+      Returns:
+        l
+      """
+      return l
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      self.assertIn('Multi-line', result.test_fn.__doc__)
+      self.assertIn('Returns:', result.test_fn.__doc__)
+
+  def test_nested_functions(self):
 
     def test_fn(l):
 
@@ -62,7 +84,7 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
       self.assertNotIn('inner_fn', first_result_input_name)
       self.assertIn('test_fn/inner_fn/', second_result_input_name)
 
-  def test_class_name(self):
+  def test_method(self):
 
     class TestClass(object):
 
@@ -87,7 +109,7 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
       self.assertNotIn('inner_fn', first_result_input_name)
       self.assertIn('TestClass/test_fn/inner_fn/', second_result_input_name)
 
-  def test_special_name(self):
+  def test_operator(self):
 
     class TestClass(object):
 
-- 
GitLab


From 8c47ec384eb28639934f8aee1a179b5b3d814af8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 07:24:15 -0700
Subject: [PATCH 0756/1262] Demo: RNN colorbot with Estimators.

PiperOrigin-RevId: 192765203
---
 .../notebooks/rnn_colorbot_estimator.ipynb    | 1421 +++++++++++++++++
 1 file changed, 1421 insertions(+)
 create mode 100644 tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb

diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
new file mode 100644
index 0000000000..7f5e4d4ac1
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
@@ -0,0 +1,1421 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "LqNpENf-ec0X",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -U tf-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "Pa2qpEmoVOGe",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HNqUFL4deCsL",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "# Case study: building an RNN\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "YkC1k4HEQ7rw",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "In this section, we show how you can use AutoGraph to build RNNColorbot, an RNN that takes as input names of colors and predicts their corresponding RGB tuples. The model will be trained by a [custom Estimator](https://www.tensorflow.org/get_started/custom_estimators)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "7nkPDl5CTCNb",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "To get started, set up the dataset. The following cells defines methods that download and format the data needed for RNNColorbot; the details aren't important (read them in the privacy of your own home if you so wish), but make sure to run the cells before proceeding."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "A0uREmVXCQEw",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(\n",
+        "      batch_size, padded_shapes=([None], [None, None], []))\n",
+        "  return dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "waZ89t3DTUla",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n",
+        "\n",
+        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "9v8AJouiC44V",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "class RnnColorbot(object):\n",
+        "  \"\"\"Holds the parameters of the colorbot model.\"\"\"\n",
+        "\n",
+        "  def __init__(self):\n",
+        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "\n",
+        "    self.lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "    self.upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "    self.relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "\n",
+        "\n",
+        "def rnn_layer(chars, cell, batch_size, training):\n",
+        "  \"\"\"A simple RNN layer.\n",
+        "  \n",
+        "  Args:\n",
+        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "  Returns:\n",
+        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "  \"\"\"\n",
+        "  hidden_outputs = []\n",
+        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "  for ch in chars:\n",
+        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "    hidden_outputs.append(cell_output)\n",
+        "  hidden_outputs = hidden_outputs.stack()\n",
+        "  if training:\n",
+        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "  return hidden_outputs\n",
+        "\n",
+        "\n",
+        "@autograph.convert(recursive=True)\n",
+        "def model(inputs, colorbot, batch_size, training):\n",
+        "  \"\"\"RNNColorbot model.\n",
+        "  \n",
+        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "  followed by a fully connected layer with ReLU activation.\n",
+        "  \n",
+        "  Args:\n",
+        "    inputs: A tuple (chars, length)\n",
+        "    colorbot: An object of type RnnColorbot\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "    \n",
+        "  Returns:\n",
+        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "  \"\"\"\n",
+        "  (chars, length) = inputs\n",
+        "  seq = tf.transpose(chars, [1, 0, 2])\n",
+        "  seq.set_shape((None, batch_size, 256))\n",
+        "\n",
+        "  seq = rnn_layer(seq, colorbot.lower_cell, batch_size, training)\n",
+        "  seq = rnn_layer(seq, colorbot.upper_cell, batch_size, training)\n",
+        "\n",
+        "  # Grab just the end-of-sequence from each output.\n",
+        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "  sequence_ends = tf.gather_nd(seq, indices)\n",
+        "  return colorbot.relu_layer(sequence_ends)\n",
+        "\n",
+        "@autograph.convert()\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "JjK4gXFvFsf4",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "We will now create the model function for the estimator.\n",
+        "\n",
+        "In the model function, we simply call the converted functions that we defined above - that's it!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "-yso_Nx23Gy1",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def model_fn(features, labels, mode, params):\n",
+        "  \"\"\"Estimator model function.\"\"\"\n",
+        "  chars = features['chars']\n",
+        "  sequence_length = features['sequence_length']\n",
+        "  inputs = (chars, sequence_length)\n",
+        "\n",
+        "  # Create the model components.\n",
+        "  # Simply calling the AutoGraph-ed functions and objects just works!\n",
+        "  colorbot = RnnColorbot()\n",
+        "  \n",
+        "  batch_size = params['batch_size']\n",
+        "\n",
+        "  if mode == tf.estimator.ModeKeys.TRAIN:\n",
+        "    predictions = model(inputs, colorbot, batch_size, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "\n",
+        "    learning_rate = params['learning_rate']\n",
+        "    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
+        "    global_step = tf.train.get_global_step()\n",
+        "    train_op = optimizer.minimize(loss, global_step=global_step)\n",
+        "    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
+        "\n",
+        "  elif mode == tf.estimator.ModeKeys.EVAL:\n",
+        "    predictions = model(inputs, colorbot, batch_size, training=False)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "\n",
+        "    return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
+        "  \n",
+        "  elif mode == tf.estimator.ModeKeys.PREDICT:\n",
+        "    # For prediction, we expect single tensors.\n",
+        "    predictions = model(inputs, colorbot, 1, training=False)\n",
+        "\n",
+        "    predictions = tf.minimum(predictions, 1.0)\n",
+        "    return tf.estimator.EstimatorSpec(mode, predictions=predictions)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HOQfoBnHC9CP",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "We'll create an input function that will feed our training and eval data."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "FJZlx7yG2MP0",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def input_fn(data_dir, data_url, params, training=True):\n",
+        "  \"\"\"An input function for training\"\"\"\n",
+        "  batch_size = params['batch_size']\n",
+        "  \n",
+        "  # load_dataset defined above\n",
+        "  dataset = load_dataset(data_dir, data_url, batch_size, training=training)\n",
+        "\n",
+        "  # Package the pipeline end in a format suitable for the estimator.\n",
+        "  labels, chars, sequence_length = dataset.make_one_shot_iterator().get_next()\n",
+        "  features = {\n",
+        "      'chars': chars,\n",
+        "      'sequence_length': sequence_length\n",
+        "  }\n",
+        "\n",
+        "  return features, labels"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "qsvv-lzbDqXd",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "We now have everything in place to build our custom estimator and use it for training and eval!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 35
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 10064,
+          "status": "ok",
+          "timestamp": 1523580419240,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "2pg1AfbxBJQq",
+        "outputId": "41894b16-3d3a-4e30-f6e4-5a9c837a2210",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Eval loss at step 100: 0.0665446\n"
+          ]
+        }
+      ],
+      "source": [
+        "params = {\n",
+        "    'batch_size': 64,\n",
+        "    'learning_rate': 0.01,\n",
+        "}\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\"\n",
+        "\n",
+        "regressor = tf.estimator.Estimator(\n",
+        "    model_fn=model_fn,\n",
+        "    params=params)\n",
+        "\n",
+        "regressor.train(\n",
+        "    input_fn=lambda: input_fn(data_dir, train_url, params),\n",
+        "    steps=100)\n",
+        "eval_results = regressor.evaluate(\n",
+        "    input_fn=lambda: input_fn(data_dir, test_url, params, training=False),\n",
+        "    steps=2\n",
+        ")\n",
+        "\n",
+        "print('Eval loss at step %d: %s' % (eval_results['global_step'], eval_results['loss']))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "zG1YAjB_cUnQ",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "And here's the same estimator used for inference."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 343
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 31286,
+          "status": "ok",
+          "timestamp": 1523580450579,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "dxHex2tUN_10",
+        "outputId": "b3dc558d-b800-4e9b-e60e-3441124e80d8",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "\u003clink rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'\u003e\u003c/link\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f4112527e90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cscript src='/nbextensions/google.colab/tabbar_main.min.js'\u003e\u003c/script\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f4112527f10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cdiv id=\"id1\"\u003e\u003c/div\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f4112527f50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f474-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"borderColor\": [\"#a7a7a7\"], \"contentBorder\": [\"0px\"], \"tabNames\": [\"RNN Colorbot\"], \"elementId\": \"id1\"});\n",
+              "//# sourceURL=js_a0db480422"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd1d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f475-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_d2a46ea291"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd0d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_0a8262c6e9"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd390\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_e32f85ccd2"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f478-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_eaee748b21"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd550\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f479-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_2befe06587"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4112527f10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1a-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_8ec4aeeb25"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd690\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_9f9f4574f1"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd350\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_bcccd8f300"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd6d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_2c056cee72"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_c853c3f58b"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd610\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_e5730ab00d"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2050\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_a897ef7e24"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2250\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_565fa3d154"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4113124d90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b22-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_222e0dc6af"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4113124c10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b23-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_831db7458f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4113124310\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab4-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_adb576c6eb"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_9418f2d32f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_3fad25f306"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4112527ed0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab7-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_45b9340e7b"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990c90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab8-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_bec9896d44"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990a10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab9-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_460b91ad4a"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3a10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_7dedd0b037"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3890\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_4b1c977dc7"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3bd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fabc-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_d64fedfcf9"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3410\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fabd-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_3e8c929c3f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3c50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_9f9cf2b76f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_b402e6b587"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3d90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_9b7d66db72"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3b10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_11ec213a3f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3950\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9870-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_9c055e4bc0"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3850\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMRJREFUeJzt3F+IlfW+x/Gvp3FECyIqU4PCO7EgZnQtnUJ0JJGoTDoY\n/dGrMBJhosggIgK7KwwiMdxRF11F/0AJvIisLBqcguxCjEAkmNQGcRvVwIzm71zsc4Yje7P3x9h7\nz97u1+tqrYdnPeu7nos3v2f9m9FaawUQ+K/pHgD49yEYQEwwgJhgADHBAGKCAcQEg2nx9NNPV7fb\nrfvuu69GRkZq5cqV0z0SAcG4xK1evbqGh4ene4wLfPXVVzU8PFyfffZZvf3221VVNWPGjGmeioRg\n8E/122+/1Q8//FDXX399zZo1a7rH4SIJxiXsqaeeqhMnTtSWLVuqv7+/Xn/99frmm2/q/vvvr06n\nU+vXr6+RkZGp/Tdt2lQvv/xyPfDAA9Xf318PP/xwnTlzpqqqJicna9u2bbVs2bLqdDq1YcOGOn36\ndFVVjY2N1ZYtW2rZsmW1du3aeuedd6aOuXPnzhoaGqpt27bV0qVL67333qtnn322Dh06VP39/bVz\n584/m/vo0aO1adOm6nQ6dffdd9f+/furqmp0dLQ6nc7Ufs8880zdeuutU/e3bdtWb7755t/3JHKh\nxiVtcHCwDQ8Pt9ZaO3nyZOt2u+3AgQOttda++OKL1u122+nTp1trrW3cuLGtWbOmff/9921iYqJt\n3Lix7dixo7XW2ltvvdUeffTRNjEx0c6fP98OHz7cfvnll9Zaaw899FDbvn17m5ycbEeOHGnLly+f\nes5XXnml3XTTTe2jjz5qrbU2MTHR3n///fbggw9OzXjw4MG2cuXK1lprZ8+ebWvWrGm7d+9uZ8+e\nbcPDw62vr68dO3Zs6vUcPny4tdba2rVr2+23396OHj3aWmtt1apV7ciRI/+oU0lrzQrjP0D7358L\n7d27t1atWlUrVqyoqqqBgYG6+eab69NPP53a9957760bbrihent764477qgjR45UVVVPT0+dOXOm\njh07VjNmzKjFixfX5ZdfXidPnqyvv/66nnzyyZo5c2YtWrSoNmzYUHv27Jk6Zl9fX61evbqqqnp7\ne//qrIcOHarx8fF65JFHqqenp5YvX16Dg4P1wQcfVFXV0qVLa2RkpE6dOlVVVWvXrq0vv/yyRkdH\n69dff61Fixb9nc4af0nPdA/AP8/x48dr37599fHHH1fVn0Jy7ty5GhgYmNrnmmuumbo9e/bsGh8f\nr6qqe+65p06ePFlPPPFE/fzzz7Vu3bp6/PHHa2xsrK688sqaPXv21OMWLFhQhw8fnro/b968eMax\nsbGaP3/+BdsWLFhQY2NjVVXV6XRq//79dd1111W3261ut1t79uyp3t7eWrJkyUWcDX4PwbjE/f9P\nH+bPn1/r16+v7du3X/Rxenp6auvWrbV169Y6fvx4bd68uRYuXFi33XZb/fTTTzU+Pl5z5sypqqoT\nJ07U3Llz/+IMf8vcuXPrxIkTF2w7fvx4LVy4sKqqut1uvfjiizV//vzqdDrV399fzz33XPX29la3\n273o18XFcUlyibv22mtrdHS0qqrWrVtX+/fvr88//7zOnz9fExMTNTIyUj/++OPfPM7Bgwfru+++\nq/Pnz9ecOXOqp6enLrvsspo3b1719fXVSy+9VJOTk/Xtt9/Wu+++W+vWrftd895yyy01Z86ceu21\n1+rcuXN18ODB+uSTT+rOO++sqqobb7yxZs2aVXv37q1Op1NXXHFFXX311fXhhx9e8IYo/xiCcYnb\nvHlz7dq1q7rdbu3bt6927dpVu3fvroGBgRocHKw33nhj6j2Ov7YSOHXqVA0NDdWSJUvqrrvuqmXL\nlk1FYceOHTU6OlorVqyooaGheuyxxy64zLkYM2fOrFdffbUOHDhQy5cvr+eff75eeOGFqRVG1Z9W\nGVddddXUpc7/hWLx4sW/6znJzWjNH+gAGSsMICYYQEwwgJhgALF/2e9h/PEP/z3dI8B/tKseee/P\ntllhADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwg\nJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICY\nYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKC\nAcQEA4gJBhATDCA2o7XWpnsI4N+DFQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE/gfh60wGjfc7LQAAAABJRU5ErkJg\ngg==\n",
+            "text/plain": [
+              "\u003cmatplotlib.figure.Figure at 0x7f4113124310\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9871-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_ba6a061307"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd890\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_83e3496927"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_f437bab20d"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a22d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9874-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_93aa63450e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2b90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9875-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_aca189bea5"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd4d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cdiv class=id_100313201 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f410f990a90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
+              "//# sourceURL=js_5df1fe383e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9877-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_c62c7174ad"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2390\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 input\");\n",
+              "//# sourceURL=js_2e2201ddc4"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2810\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76585-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
+              "//# sourceURL=js_288e5283d6"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a26d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
+              "//# sourceURL=js_2f31d19cde"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76587-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_2fbbcda050"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4112527e90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76588-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_f94d975cf3"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "def predict_input_fn(color_name):\n",
+        "  \"\"\"An input function for prediction.\"\"\"\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "  \n",
+        "  # We create a batch of a single element.\n",
+        "  features = {\n",
+        "      'chars': tf.expand_dims(chars, 0),\n",
+        "      'sequence_length': tf.expand_dims(sequence_length, 0)\n",
+        "  }\n",
+        "  return features, None\n",
+        "\n",
+        "\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def predict_with_estimator(color_name, regressor):\n",
+        "  predictions = regressor.predict(\n",
+        "      input_fn=lambda:predict_input_fn(color_name))\n",
+        "  pred = next(predictions)\n",
+        "  predictions.close()\n",
+        "  pred = np.minimum(pred, 1.0)\n",
+        "  pred = np.expand_dims(np.expand_dims(pred, 0), 0)\n",
+        "\n",
+        "  draw_prediction(color_name, pred)\n",
+        "\n",
+        "tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "while True:\n",
+        "  with tb.output_to(0):\n",
+        "    try:\n",
+        "      color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "    except (EOFError, KeyboardInterrupt):\n",
+        "      break\n",
+        "  if not color_name:\n",
+        "    break\n",
+        "  with tb.output_to(0):\n",
+        "    tb.clear_tab()\n",
+        "    predict_with_estimator(color_name, regressor)\n",
+        "  "
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "name": "RNN Colorbot using Estimators",
+      "provenance": [
+        {
+          "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl",
+          "timestamp": 1523579810961
+        },
+        {
+          "file_id": "1DcfimonWU11tmyivKBGVrbpAl3BIOaRG",
+          "timestamp": 1523016192637
+        },
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "version": "0.3.2",
+      "views": {}
+    },
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
-- 
GitLab


From 3eb4e4f82c3d91586b2510d3fb769d6683a4c5f3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 07:55:46 -0700
Subject: [PATCH 0757/1262] Split byte_order.h off cpu_info.h

PiperOrigin-RevId: 192768744
---
 tensorflow/compiler/aot/test.cc               |  1 +
 tensorflow/compiler/xla/service/backend.cc    |  1 +
 tensorflow/compiler/xla/shape_util.h          |  1 +
 .../xla/tests/local_client_test_base.cc       |  2 +-
 .../factorization/kernels/clustering_ops.cc   |  1 +
 .../contrib/ffmpeg/default/ffmpeg_lib.cc      |  2 +-
 tensorflow/core/BUILD                         |  6 ++-
 .../core/common_runtime/direct_session.cc     |  2 +-
 .../kernel_benchmark_testlib.cc               |  1 +
 .../core/common_runtime/local_device.cc       |  1 +
 .../core/common_runtime/process_util.cc       |  1 +
 tensorflow/core/framework/bfloat16.h          |  1 +
 tensorflow/core/grappler/clusters/utils.cc    |  1 +
 tensorflow/core/grappler/costs/utils.cc       |  2 +-
 tensorflow/core/grappler/devices.cc           |  1 +
 .../grappler/optimizers/constant_folding.cc   |  1 +
 .../adaptive_shared_batch_scheduler.h         |  1 +
 .../batching_util/shared_batch_scheduler.h    |  1 +
 tensorflow/core/kernels/cast_op.h             |  2 +-
 tensorflow/core/kernels/decode_raw_op.cc      |  2 +-
 .../core/kernels/mkl_input_conversion_op.cc   |  1 +
 tensorflow/core/kernels/mkl_tfconv_op.h       |  1 +
 tensorflow/core/kernels/sparse_matmul_op.h    |  1 +
 tensorflow/core/lib/bfloat16/bfloat16.h       |  3 +-
 tensorflow/core/lib/core/coding.cc            |  2 +-
 tensorflow/core/lib/core/raw_coding.h         |  2 +-
 tensorflow/core/lib/gtl/inlined_vector.h      |  2 +-
 tensorflow/core/lib/png/png_io.cc             |  2 +-
 tensorflow/core/lib/wav/wav_io.cc             |  2 +-
 tensorflow/core/platform/byte_order.h         | 37 +++++++++++++++++++
 tensorflow/core/platform/cpu_feature_guard.cc |  1 +
 tensorflow/core/platform/cpu_info.h           |  3 --
 tensorflow/core/platform/denormal.cc          |  3 +-
 tensorflow/core/platform/windows/cpu_info.h   |  9 -----
 34 files changed, 72 insertions(+), 28 deletions(-)
 create mode 100644 tensorflow/core/platform/byte_order.h

diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc
index 47ef5f82cb..6b098049cb 100644
--- a/tensorflow/compiler/aot/test.cc
+++ b/tensorflow/compiler/aot/test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 // clang-format on
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 05f2d06278..0b36b67251 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 6d228eff46..f2790ba293 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 96b976d25d..12979a0473 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
index 2a6c97e8b9..025534d540 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
@@ -32,6 +32,7 @@
 #include "tensorflow/core/lib/gtl/top_n.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 35341406a0..cca1a05419 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -28,7 +28,7 @@
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 
 using tensorflow::strings::StrCat;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c461f9ed2f..01fe61eeac 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -282,7 +282,7 @@ PLATFORM_BASE_HDRS = [
     "platform/logging.h",
     "platform/macros.h",
     "platform/types.h",
-    "platform/cpu_info.h",
+    "platform/byte_order.h",
 ]
 
 PLATFORM_OTHER_HDRS = [
@@ -290,6 +290,7 @@ PLATFORM_OTHER_HDRS = [
     "platform/stacktrace.h",
     "platform/stacktrace_handler.h",
     "platform/context.h",
+    "platform/cpu_info.h",
     "platform/cpu_feature_guard.h",
     "platform/dynamic_annotations.h",
     "platform/env.h",
@@ -318,7 +319,6 @@ cc_library(
     srcs = glob([
         "platform/*/integral_types.h",
         "platform/*/logging.h",
-        "platform/*/cpu_info.h",
     ]),
     hdrs = PLATFORM_BASE_HDRS,
     deps = [
@@ -666,6 +666,7 @@ cc_library(
         "framework/tensor_types.h",
         "framework/type_traits.h",
         "lib/bfloat16/bfloat16.h",
+        "platform/byte_order.h",
         "platform/default/dynamic_annotations.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -1906,6 +1907,7 @@ cc_library(
         "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
+        "platform/byte_order.h",
         "platform/cpu_info.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0479061daf..0afbd02e86 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -54,7 +54,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 64d8849475..7de1b80e2d 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index ca7f1614f1..873182371e 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index d5bd7f8b98..cf8e11c9c8 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index 968c18bdd2..2f79d0fa70 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_BFLOAT16_H_
 
 #include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 50d6e6468f..a7519725a5 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/mem.h"
 
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index f318e3911c..be54d98534 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index b318ac22d4..2be894a08b 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index e29aaa25fe..45bb188e8d 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/setround.h"
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 339d792302..f5ced95feb 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index b77289aded..edc88a0384 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index fd4e75d26f..16d2e0e0a5 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index bacacb94ae..eaef5a6097 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 68d3e1c9ab..3245625a32 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index ddea9e281b..4120f013ac 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index 14ef2ed704..e89280724e 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 126e5a17af..e7c24387a4 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include <cmath>
 #include <complex>
 
-// We need cpu_info.h here in order to pick up __BYTE_ORDER__.
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 #ifdef __CUDACC__
 // All functions callable from CUDA code must be qualified with __device__
diff --git a/tensorflow/core/lib/core/coding.cc b/tensorflow/core/lib/core/coding.cc
index bb95c27410..50872eef83 100644
--- a/tensorflow/core/lib/core/coding.cc
+++ b/tensorflow/core/lib/core/coding.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 
 namespace tensorflow {
 namespace core {
diff --git a/tensorflow/core/lib/core/raw_coding.h b/tensorflow/core/lib/core/raw_coding.h
index bbfd33d303..37201b755d 100644
--- a/tensorflow/core/lib/core/raw_coding.h
+++ b/tensorflow/core/lib/core/raw_coding.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LIB_CORE_RAW_CODING_H_
 
 #include <string.h>
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index 6e3cb2206d..2011f7d4a1 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/gtl/manual_constructor.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index cba473927d..62c803afb2 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/platform/cpu_info.h"  // endian
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/png.h"
 
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 51b9c6cd82..3f7dbcee85 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
-#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/platform/byte_order.h b/tensorflow/core/platform/byte_order.h
new file mode 100644
index 0000000000..aab6535e4b
--- /dev/null
+++ b/tensorflow/core/platform/byte_order.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
+#define TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
+
+// Byte order defines provided by gcc. MSVC doesn't define those so
+// we define them here.
+// We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
+#define __ORDER_LITTLE_ENDIAN__ 0x4d2
+#define __ORDER_BIG_ENDIAN__ 0x10e1
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
+
+namespace tensorflow {
+namespace port {
+
+// TODO(jeff,sanjay): Make portable
+constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
+
+}  // namespace port
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b570658158..9d00aa7b7f 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <mutex>
 #include <string>
 
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index bb77650e26..c42429a394 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -25,9 +25,6 @@ limitations under the License.
 namespace tensorflow {
 namespace port {
 
-// TODO(jeff,sanjay): Make portable
-constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
-
 // Returns an estimate of the number of schedulable CPUs for this
 // process.  Usually, it's constant throughout the lifetime of a
 // process, but it might change if the underlying cluster management
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index 82cbc43b4f..c510dc204f 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -15,8 +15,9 @@ limitations under the License.
 
 #include <tuple>
 
-#include "tensorflow/core/platform/denormal.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
 // If we're on gcc 4.8 or older, there's a known bug that prevents the use of
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index f20939d3c0..ba2126abcf 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -19,13 +19,4 @@ limitations under the License.
 // included so __cpuidex function is available for GETCPUID on Windows
 #include <intrin.h>
 
-// Byte order defines provided by gcc. MSVC doesn't define those so
-// we define them here.
-// We assume that all windows platform out there are little endian.
-#if defined(_MSC_VER) && !defined(__clang__)
-#define __ORDER_LITTLE_ENDIAN__ 0x4d2
-#define __ORDER_BIG_ENDIAN__ 0x10e1
-#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-#endif
-
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
-- 
GitLab


From f9de043501e401af73aa02ab950864534f07c1df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 08:10:57 -0700
Subject: [PATCH 0758/1262] Automated g4 rollback of changelist 192768744

PiperOrigin-RevId: 192770717
---
 tensorflow/compiler/aot/test.cc               |  1 -
 tensorflow/compiler/xla/service/backend.cc    |  1 -
 tensorflow/compiler/xla/shape_util.h          |  1 -
 .../xla/tests/local_client_test_base.cc       |  2 +-
 .../factorization/kernels/clustering_ops.cc   |  1 -
 .../contrib/ffmpeg/default/ffmpeg_lib.cc      |  2 +-
 tensorflow/core/BUILD                         |  6 +--
 .../core/common_runtime/direct_session.cc     |  2 +-
 .../kernel_benchmark_testlib.cc               |  1 -
 .../core/common_runtime/local_device.cc       |  1 -
 .../core/common_runtime/process_util.cc       |  1 -
 tensorflow/core/framework/bfloat16.h          |  1 -
 tensorflow/core/grappler/clusters/utils.cc    |  1 -
 tensorflow/core/grappler/costs/utils.cc       |  2 +-
 tensorflow/core/grappler/devices.cc           |  1 -
 .../grappler/optimizers/constant_folding.cc   |  1 -
 .../adaptive_shared_batch_scheduler.h         |  1 -
 .../batching_util/shared_batch_scheduler.h    |  1 -
 tensorflow/core/kernels/cast_op.h             |  2 +-
 tensorflow/core/kernels/decode_raw_op.cc      |  2 +-
 .../core/kernels/mkl_input_conversion_op.cc   |  1 -
 tensorflow/core/kernels/mkl_tfconv_op.h       |  1 -
 tensorflow/core/kernels/sparse_matmul_op.h    |  1 -
 tensorflow/core/lib/bfloat16/bfloat16.h       |  3 +-
 tensorflow/core/lib/core/coding.cc            |  2 +-
 tensorflow/core/lib/core/raw_coding.h         |  2 +-
 tensorflow/core/lib/gtl/inlined_vector.h      |  2 +-
 tensorflow/core/lib/png/png_io.cc             |  2 +-
 tensorflow/core/lib/wav/wav_io.cc             |  2 +-
 tensorflow/core/platform/byte_order.h         | 37 -------------------
 tensorflow/core/platform/cpu_feature_guard.cc |  1 -
 tensorflow/core/platform/cpu_info.h           |  3 ++
 tensorflow/core/platform/denormal.cc          |  3 +-
 tensorflow/core/platform/windows/cpu_info.h   |  9 +++++
 34 files changed, 28 insertions(+), 72 deletions(-)
 delete mode 100644 tensorflow/core/platform/byte_order.h

diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc
index 6b098049cb..47ef5f82cb 100644
--- a/tensorflow/compiler/aot/test.cc
+++ b/tensorflow/compiler/aot/test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 // clang-format on
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 0b36b67251..05f2d06278 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index f2790ba293..6d228eff46 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
-#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 12979a0473..96b976d25d 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
index 025534d540..2a6c97e8b9 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
@@ -32,7 +32,6 @@
 #include "tensorflow/core/lib/gtl/top_n.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index cca1a05419..35341406a0 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -28,7 +28,7 @@
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 
 using tensorflow::strings::StrCat;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 01fe61eeac..c461f9ed2f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -282,7 +282,7 @@ PLATFORM_BASE_HDRS = [
     "platform/logging.h",
     "platform/macros.h",
     "platform/types.h",
-    "platform/byte_order.h",
+    "platform/cpu_info.h",
 ]
 
 PLATFORM_OTHER_HDRS = [
@@ -290,7 +290,6 @@ PLATFORM_OTHER_HDRS = [
     "platform/stacktrace.h",
     "platform/stacktrace_handler.h",
     "platform/context.h",
-    "platform/cpu_info.h",
     "platform/cpu_feature_guard.h",
     "platform/dynamic_annotations.h",
     "platform/env.h",
@@ -319,6 +318,7 @@ cc_library(
     srcs = glob([
         "platform/*/integral_types.h",
         "platform/*/logging.h",
+        "platform/*/cpu_info.h",
     ]),
     hdrs = PLATFORM_BASE_HDRS,
     deps = [
@@ -666,7 +666,6 @@ cc_library(
         "framework/tensor_types.h",
         "framework/type_traits.h",
         "lib/bfloat16/bfloat16.h",
-        "platform/byte_order.h",
         "platform/default/dynamic_annotations.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -1907,7 +1906,6 @@ cc_library(
         "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
-        "platform/byte_order.h",
         "platform/cpu_info.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0afbd02e86..0479061daf 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -54,7 +54,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 7de1b80e2d..64d8849475 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test_benchmark.h"
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 873182371e..ca7f1614f1 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index cf8e11c9c8..d5bd7f8b98 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index 2f79d0fa70..968c18bdd2 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_BFLOAT16_H_
 
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index a7519725a5..50d6e6468f 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/mem.h"
 
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index be54d98534..f318e3911c 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index 2be894a08b..b318ac22d4 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/grappler/devices.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 45bb188e8d..e29aaa25fe 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/setround.h"
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index f5ced95feb..339d792302 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index edc88a0384..b77289aded 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/thread_annotations.h"
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 16d2e0e0a5..fd4e75d26f 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index eaef5a6097..bacacb94ae 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 3245625a32..68d3e1c9ab 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index 4120f013ac..ddea9e281b 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index e89280724e..14ef2ed704 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
 #if defined(PLATFORM_WINDOWS)
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index e7c24387a4..126e5a17af 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -19,7 +19,8 @@ limitations under the License.
 #include <cmath>
 #include <complex>
 
-#include "tensorflow/core/platform/byte_order.h"
+// We need cpu_info.h here in order to pick up __BYTE_ORDER__.
+#include "tensorflow/core/platform/cpu_info.h"
 
 #ifdef __CUDACC__
 // All functions callable from CUDA code must be qualified with __device__
diff --git a/tensorflow/core/lib/core/coding.cc b/tensorflow/core/lib/core/coding.cc
index 50872eef83..bb95c27410 100644
--- a/tensorflow/core/lib/core/coding.cc
+++ b/tensorflow/core/lib/core/coding.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 
 namespace tensorflow {
 namespace core {
diff --git a/tensorflow/core/lib/core/raw_coding.h b/tensorflow/core/lib/core/raw_coding.h
index 37201b755d..bbfd33d303 100644
--- a/tensorflow/core/lib/core/raw_coding.h
+++ b/tensorflow/core/lib/core/raw_coding.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LIB_CORE_RAW_CODING_H_
 
 #include <string.h>
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index 2011f7d4a1..6e3cb2206d 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/gtl/manual_constructor.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index 62c803afb2..cba473927d 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"  // endian
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/png.h"
 
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 3f7dbcee85..51b9c6cd82 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
-#include "tensorflow/core/platform/byte_order.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/platform/byte_order.h b/tensorflow/core/platform/byte_order.h
deleted file mode 100644
index aab6535e4b..0000000000
--- a/tensorflow/core/platform/byte_order.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
-#define TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
-
-// Byte order defines provided by gcc. MSVC doesn't define those so
-// we define them here.
-// We assume that all windows platform out there are little endian.
-#if defined(_MSC_VER) && !defined(__clang__)
-#define __ORDER_LITTLE_ENDIAN__ 0x4d2
-#define __ORDER_BIG_ENDIAN__ 0x10e1
-#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
-#endif
-
-namespace tensorflow {
-namespace port {
-
-// TODO(jeff,sanjay): Make portable
-constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
-
-}  // namespace port
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 9d00aa7b7f..b570658158 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <mutex>
 #include <string>
 
-#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index c42429a394..bb77650e26 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -25,6 +25,9 @@ limitations under the License.
 namespace tensorflow {
 namespace port {
 
+// TODO(jeff,sanjay): Make portable
+constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
+
 // Returns an estimate of the number of schedulable CPUs for this
 // process.  Usually, it's constant throughout the lifetime of a
 // process, but it might change if the underlying cluster management
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index c510dc204f..82cbc43b4f 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -15,9 +15,8 @@ limitations under the License.
 
 #include <tuple>
 
-#include "tensorflow/core/platform/byte_order.h"
-#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/denormal.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
 // If we're on gcc 4.8 or older, there's a known bug that prevents the use of
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index ba2126abcf..f20939d3c0 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -19,4 +19,13 @@ limitations under the License.
 // included so __cpuidex function is available for GETCPUID on Windows
 #include <intrin.h>
 
+// Byte order defines provided by gcc. MSVC doesn't define those so
+// we define them here.
+// We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
+#define __ORDER_LITTLE_ENDIAN__ 0x4d2
+#define __ORDER_BIG_ENDIAN__ 0x10e1
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
+
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
-- 
GitLab


From 91c31997e6854a3d07acc76381cff7436df1c1dd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 08:12:42 -0700
Subject: [PATCH 0759/1262] Add support to TFLite for dilated convolution.

PiperOrigin-RevId: 192770919
---
 tensorflow/contrib/lite/builtin_op_data.h     |  2 +
 tensorflow/contrib/lite/kernels/conv.cc       | 67 ++++++++++++-------
 tensorflow/contrib/lite/kernels/conv_test.cc  |  8 ++-
 .../contrib/lite/kernels/depthwise_conv.cc    |  6 +-
 tensorflow/contrib/lite/kernels/padding.h     |  7 +-
 tensorflow/contrib/lite/kernels/pooling.cc    |  4 +-
 tensorflow/contrib/lite/model.cc              |  2 +
 tensorflow/contrib/lite/schema/schema.fbs     |  2 +
 .../contrib/lite/schema/schema_generated.h    | 38 +++++++++--
 .../contrib/lite/testing/generate_examples.py |  3 +
 .../contrib/lite/toco/tflite/operator.cc      |  6 +-
 11 files changed, 104 insertions(+), 41 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index f5fb2f15e3..4910c89eae 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -53,6 +53,8 @@ typedef struct {
   TfLitePadding padding;
   int stride_width;
   int stride_height;
+  int dilation_width_factor;
+  int dilation_height_factor;
   TfLiteFusedActivation activation;
 } TfLiteConvParams;
 
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 18ff33bf9f..3b467b3aa2 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -225,22 +225,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Matching GetWindowedOutputSize in TensorFlow.
   auto padding = params->padding;
-  auto computeOutSize = [padding](int imageSize, int filterSize,
-                                  int stride) -> int {
+  auto computeOutSize = [padding](int imageSize, int filterSize, int stride,
+                                  int dilationRate) -> int {
+    int effectiveFilterSize = (filterSize - 1) * dilationRate + 1;
     return padding == kTfLitePaddingSame
                ? (imageSize + stride - 1) / stride
                : padding == kTfLitePaddingValid
-                     ? (imageSize - filterSize + stride) / stride
+                     ? (imageSize - effectiveFilterSize + stride) / stride
                      : 0;
   };
 
-  int outWidth = computeOutSize(width, filter_width, params->stride_width);
-  int outHeight = computeOutSize(height, filter_height, params->stride_height);
+  int outWidth = computeOutSize(width, filter_width, params->stride_width,
+                                params->dilation_width_factor);
+  int outHeight = computeOutSize(height, filter_height, params->stride_height,
+                                 params->dilation_height_factor);
 
   data->padding.height =
-      ComputePadding(params->stride_height, height, filter_height, outHeight);
+      ComputePadding(params->stride_height, params->dilation_height_factor,
+                     height, filter_height, outHeight);
   data->padding.width =
-      ComputePadding(params->stride_width, width, filter_width, outWidth);
+      ComputePadding(params->stride_width, params->dilation_width_factor, width,
+                     filter_width, outWidth);
 
   TF_LITE_ENSURE(context, hasBias);
 
@@ -375,28 +380,40 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-
-  switch (kernel_type) {
+  KernelType effective_kernel_type;
+  if (((kernel_type == kMultithreadOptimized) ||
+       (kernel_type == kCblasOptimized)) &&
+      ((params->dilation_width_factor != 1) ||
+       (params->dilation_height_factor != 1))) {
+    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // Therefore, fallback to optimized.
+    effective_kernel_type = kGenericOptimized;
+  } else {
+    effective_kernel_type = kernel_type;
+  }
+  switch (effective_kernel_type) {
     case kReference: {
-      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                          GetTensorData<float>(filter), GetTensorDims(filter),
-                          GetTensorData<float>(bias), GetTensorDims(bias),
-                          params->stride_width, params->stride_height, 1, 1,
-                          data->padding.width, data->padding.height,
-                          output_activation_min, output_activation_max,
-                          GetTensorData<float>(output), GetTensorDims(output),
-                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      reference_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(filter), GetTensorDims(filter),
+          GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+          params->stride_height, params->dilation_width_factor,
+          params->dilation_height_factor, data->padding.width,
+          data->padding.height, output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
     case kGenericOptimized: {
-      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                          GetTensorData<float>(filter), GetTensorDims(filter),
-                          GetTensorData<float>(bias), GetTensorDims(bias),
-                          params->stride_width, params->stride_height, 1, 1,
-                          data->padding.width, data->padding.height,
-                          output_activation_min, output_activation_max,
-                          GetTensorData<float>(output), GetTensorDims(output),
-                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      optimized_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(filter), GetTensorDims(filter),
+          GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+          params->stride_height, params->dilation_width_factor,
+          params->dilation_height_factor, data->padding.width,
+          data->padding.height, output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
     case kMultithreadOptimized: {
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index d2393c3c97..0dcfc826fd 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -46,7 +46,8 @@ class BaseConvolutionOpModel : public SingleOpModel {
       TfLiteRegistration* registration, const TensorData& input,
       const TensorData& filter, const TensorData& output, int stride_width = 2,
       int stride_height = 2, enum Padding padding = Padding_VALID,
-      enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE,
+      int dilation_width_factor = 1, int dilation_height_factor = 1) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -71,8 +72,9 @@ class BaseConvolutionOpModel : public SingleOpModel {
     }
 
     SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions,
-                 CreateConv2DOptions(builder_, padding, stride_width,
-                                     stride_height, activation)
+                 CreateConv2DOptions(
+                     builder_, padding, stride_width, stride_height, activation,
+                     dilation_width_factor, dilation_height_factor)
                      .Union());
 
     resolver_ = absl::make_unique<SingleOpResolver>(BuiltinOperator_CONV_2D,
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index cad9ce114c..eeda1bc3c5 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -140,10 +140,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int out_height =
       compute_out_size(height, filter_height, params->stride_height);
 
-  data->padding.height =
-      ComputePadding(params->stride_height, height, filter_height, out_height);
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
   data->padding.width =
-      ComputePadding(params->stride_width, width, filter_width, out_width);
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
 
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
index 40b8476b37..e81b970e0f 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -17,9 +17,10 @@ limitations under the License.
 
 namespace tflite {
 
-inline int ComputePadding(int stride, int in_size, int filter_size,
-                          int out_size) {
-  int padding = ((out_size - 1) * stride + filter_size - in_size) / 2;
+inline int ComputePadding(int stride, int dilation_rate, int in_size,
+                          int filter_size, int out_size) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  int padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2;
   return padding > 0 ? padding : 0;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
index b798801108..0bf27c34c1 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -94,9 +94,9 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   int outHeight =
       computeOutSize(height, params->filter_height, params->stride_height);
 
-  data->padding.height = ComputePadding(params->stride_height, height,
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
                                         params->filter_height, outHeight);
-  data->padding.width = ComputePadding(params->stride_width, width,
+  data->padding.width = ComputePadding(params->stride_width, 1, width,
                                        params->filter_width, outWidth);
 
   if (input->type == kTfLiteUInt8) {
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 54b1460173..2dd6d67e07 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -333,6 +333,8 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->stride_height = conv_params->stride_h();
         params->activation =
             parse_activation(conv_params->fused_activation_function());
+        params->dilation_width_factor = conv_params->dilation_w_factor();
+        params->dilation_height_factor = conv_params->dilation_h_factor();
       }
       *builtin_data = reinterpret_cast<void*>(params);
       break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 93980b15f0..2b62c257d8 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -199,6 +199,8 @@ table Conv2DOptions {
   stride_w:int;
   stride_h:int;
   fused_activation_function:ActivationFunctionType;
+  dilation_w_factor:int = 1;
+  dilation_h_factor:int = 1;
 }
 
 table Pool2DOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index b2a799d0ef..0b9961d606 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -1478,11 +1478,15 @@ struct Conv2DOptionsT : public flatbuffers::NativeTable {
   int32_t stride_w;
   int32_t stride_h;
   ActivationFunctionType fused_activation_function;
+  int32_t dilation_w_factor;
+  int32_t dilation_h_factor;
   Conv2DOptionsT()
       : padding(Padding_SAME),
         stride_w(0),
         stride_h(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
+        fused_activation_function(ActivationFunctionType_NONE),
+        dilation_w_factor(0),
+        dilation_h_factor(0) {
   }
 };
 
@@ -1492,7 +1496,9 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_PADDING = 4,
     VT_STRIDE_W = 6,
     VT_STRIDE_H = 8,
-    VT_FUSED_ACTIVATION_FUNCTION = 10
+    VT_FUSED_ACTIVATION_FUNCTION = 10,
+    VT_DILATION_W_FACTOR = 12,
+    VT_DILATION_H_FACTOR = 14
   };
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
@@ -1506,12 +1512,20 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   ActivationFunctionType fused_activation_function() const {
     return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
+  int32_t dilation_w_factor() const {
+    return GetField<int32_t>(VT_DILATION_W_FACTOR, 0);
+  }
+  int32_t dilation_h_factor() const {
+    return GetField<int32_t>(VT_DILATION_H_FACTOR, 0);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_PADDING) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<int32_t>(verifier, VT_DILATION_W_FACTOR) &&
+           VerifyField<int32_t>(verifier, VT_DILATION_H_FACTOR) &&
            verifier.EndTable();
   }
   Conv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -1534,6 +1548,12 @@ struct Conv2DOptionsBuilder {
   void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
     fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
   }
+  void add_dilation_w_factor(int32_t dilation_w_factor) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_W_FACTOR, dilation_w_factor, 0);
+  }
+  void add_dilation_h_factor(int32_t dilation_h_factor) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 0);
+  }
   explicit Conv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -1551,8 +1571,12 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
     Padding padding = Padding_SAME,
     int32_t stride_w = 0,
     int32_t stride_h = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    int32_t dilation_w_factor = 0,
+    int32_t dilation_h_factor = 0) {
   Conv2DOptionsBuilder builder_(_fbb);
+  builder_.add_dilation_h_factor(dilation_h_factor);
+  builder_.add_dilation_w_factor(dilation_w_factor);
   builder_.add_stride_h(stride_h);
   builder_.add_stride_w(stride_w);
   builder_.add_fused_activation_function(fused_activation_function);
@@ -4885,6 +4909,8 @@ inline void Conv2DOptions::UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resol
   { auto _e = stride_w(); _o->stride_w = _e; };
   { auto _e = stride_h(); _o->stride_h = _e; };
   { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = dilation_w_factor(); _o->dilation_w_factor = _e; };
+  { auto _e = dilation_h_factor(); _o->dilation_h_factor = _e; };
 }
 
 inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -4899,12 +4925,16 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatB
   auto _stride_w = _o->stride_w;
   auto _stride_h = _o->stride_h;
   auto _fused_activation_function = _o->fused_activation_function;
+  auto _dilation_w_factor = _o->dilation_w_factor;
+  auto _dilation_h_factor = _o->dilation_h_factor;
   return tflite::CreateConv2DOptions(
       _fbb,
       _padding,
       _stride_w,
       _stride_h,
-      _fused_activation_function);
+      _fused_activation_function,
+      _dilation_w_factor,
+      _dilation_h_factor);
 }
 
 inline Pool2DOptionsT *Pool2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 53b41d2358..e045c27427 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1039,6 +1039,7 @@ def make_conv_tests(zip_path):
           "input_shape": [[1, 3, 4, 3]],
           "filter_shape": [[1, 1, 3, 2]],
           "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "dilations": [[1, 1, 1, 1], [1, 3, 2, 1], [1, 2, 2, 1]],
           "padding": ["SAME", "VALID"],
           "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
           "constant_filter": [True, False],
@@ -1047,6 +1048,7 @@ def make_conv_tests(zip_path):
           "input_shape": [[2, 14, 14, 2]],
           "filter_shape": [[6, 6, 2, 2]],
           "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "dilations": [[1, 1, 1, 1], [1, 2, 2, 1]],
           "padding": ["SAME", "VALID"],
           "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
           "constant_filter": [True, False],
@@ -1072,6 +1074,7 @@ def make_conv_tests(zip_path):
         input_tensor,
         filter_input,
         strides=parameters["strides"],
+        dilations=parameters["dilations"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
     return input_tensors, [out]
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index f41a312b47..d2e14ac5e0 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -68,7 +68,9 @@ class Convolution
     auto activation_function =
         ActivationFunction::Serialize(op.fused_activation_function);
     return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
-                                         op.stride_height, activation_function);
+                                         op.stride_height, activation_function,
+                                         op.dilation_width_factor,
+                                         op.dilation_height_factor);
   }
 
   void ReadOptions(const TfLiteOptions& options,
@@ -76,6 +78,8 @@ class Convolution
     op->padding.type = Padding::Deserialize(options.padding());
     op->stride_width = options.stride_w();
     op->stride_height = options.stride_h();
+    op->dilation_width_factor = options.dilation_w_factor();
+    op->dilation_height_factor = options.dilation_h_factor();
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
-- 
GitLab


From 17aa70e87ad9818f8918534ac4a567c3a3ef4550 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 08:17:49 -0700
Subject: [PATCH 0760/1262] Refactor to remove the duplicate calls to obtain a
 function's namespace. This removes the need to explicitly import internal
 components (barring the tf module which cannot be imported directly).

PiperOrigin-RevId: 192771440
---
 tensorflow/contrib/autograph/impl/api.py      | 14 +++---
 tensorflow/contrib/autograph/impl/api_test.py |  2 -
 tensorflow/contrib/autograph/impl/config.py   |  6 ---
 .../contrib/autograph/impl/conversion.py      | 48 ++++++++++---------
 .../contrib/autograph/impl/conversion_test.py |  9 ++--
 5 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index a00d9c68dc..f97a33326e 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -235,7 +235,8 @@ def to_graph(e,
       nocompile_decorators=(convert, do_not_convert, converted_call),
       partial_types=partial_types,
       api_module=tf_inspect.getmodule(to_graph))
-  _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
+  _, name, namespace = conversion.entity_to_graph(e, conversion_map, arg_values,
+                                                  arg_types)
 
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
@@ -244,13 +245,12 @@ def to_graph(e,
     module.body.append(dep)
   compiled_node, compiled_src = compiler.ast_to_object(module)
 
-  # The compiled code should see everything the entry function saw.
+  # The compiled code should see everything the entry entity saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
-  if tf_inspect.isfunction(e):
-    for key, val in inspect_utils.getnamespace(e).items():
-      # Avoid overwriting entities that have been transformed.
-      if key not in compiled_node.__dict__:
-        compiled_node.__dict__[key] = val
+  for key, val in namespace.items():
+    # Avoid overwriting entities that have been transformed.
+    if key not in compiled_node.__dict__:
+      compiled_node.__dict__[key] = val
   compiled_fn = getattr(compiled_node, name)
 
   if verbose:
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index 2e09d19621..a7737b7f44 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -39,8 +39,6 @@ class ApiTest(test.TestCase):
         'from __future__ import print_function',
         'from tensorflow.contrib.autograph import utils'
         ' as autograph_utils',
-        'from tensorflow.contrib.autograph import operators'
-        ' as __ops',
         'tf = autograph_utils.fake_tf()',
     )
 
diff --git a/tensorflow/contrib/autograph/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
index 26326465e2..2600088595 100644
--- a/tensorflow/contrib/autograph/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -46,10 +46,4 @@ NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
     'import tensorflow as tf',
-    'from tensorflow.contrib.autograph.impl import api'
-    ' as autograph_api',
-    'from tensorflow.contrib.autograph import utils'
-    ' as autograph_utils',
-    'from tensorflow.contrib.autograph import operators'
-    ' as __ops',
 )
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 3bacc94300..373dc1602b 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import gast
 
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.converters import asserts
 from tensorflow.contrib.autograph.converters import break_statements
@@ -138,20 +139,22 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types):
         parameters.
 
   Returns:
-    A tuple (ast, new_name):
+    A tuple (ast, new_name, namespace):
         * ast: An AST representing an entity with interface equivalent to `o`,
             but which when executed it creates TF a graph.
         * new_name: The symbol name under which the new entity can be found.
+        * namespace: A dict mapping all symbols visible to the converted entity,
+            keyed by their symbol name.
 
   Raises:
     ValueError: if the entity type is not supported.
   """
   if tf_inspect.isclass(o):
-    node, new_name = class_to_graph(o, conversion_map)
+    node, name, ns = class_to_graph(o, conversion_map)
   elif tf_inspect.isfunction(o):
-    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
+    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
-    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
+    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
   else:
     raise ValueError(
         'Entity "%s" has unsupported type "%s". Only functions and classes are '
@@ -174,7 +177,7 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types):
         continue
       entity_to_graph(candidate, conversion_map, {}, {})
 
-  return node, new_name
+  return node, name, ns
 
 
 def class_to_graph(c, conversion_map):
@@ -185,17 +188,18 @@ def class_to_graph(c, conversion_map):
   if not members:
     raise ValueError('Cannot convert %s: it has no member methods.' % c)
 
-  class_namespace = None
+  class_namespace = {}
   for _, m in members:
-    node, _ = function_to_graph(
+    node, _, namespace = function_to_graph(
         m,
         conversion_map=conversion_map,
         arg_values={},
         arg_types={'self': (c.__name__, c)},
         owner_type=c)
-    # TODO(mdan): Do not assume all members have the same view of globals.
     if class_namespace is None:
-      class_namespace = inspect_utils.getnamespace(m)
+      class_namespace = namespace
+    else:
+      class_namespace.update(namespace)
     converted_members[m] = node
   namer = conversion_map.new_namer(class_namespace)
   class_name = namer.compiled_class_name(c.__name__, c)
@@ -206,25 +210,23 @@ def class_to_graph(c, conversion_map):
       body=list(converted_members.values()),
       decorator_list=[])
 
-  return node, class_name
+  return node, class_name, class_namespace
+
+
+def _add_reserved_symbol(namespace, name, entity):
+  if name not in namespace:
+    namespace[name] = entity
+  elif namespace[name] != entity:
+    raise ValueError('The name "%s" is reserved and may not be used.' % name)
 
 
 def _add_self_references(namespace, api_module):
-  """Self refs are only required for analysis and are not used directly."""
   # Manually add the utils namespace which may be used from generated code.
-  if 'autograph_util' not in namespace:
-    namespace['autograph_utils'] = utils
-  elif namespace['autograph_utils'] != utils:
-    raise ValueError(
-        'The module name "autograph_utils" is reserved and may not be used.')
-
+  _add_reserved_symbol(namespace, 'autograph_utils', utils)
+  _add_reserved_symbol(namespace, '__ops', operators)
   # We also make reference to the api module for dynamic conversion, but
   # to avoid circular references we don't import it here.
-  if 'autograph_api' not in namespace:
-    namespace['autograph_api'] = api_module
-  elif namespace['autograph_api'] != api_module:
-    raise ValueError(
-        'The module name "autograph_api" is reserved and may not be used.')
+  _add_reserved_symbol(namespace, 'autograph_api', api_module)
 
 
 def function_to_graph(f, conversion_map, arg_values, arg_types,
@@ -261,7 +263,7 @@ def function_to_graph(f, conversion_map, arg_values, arg_types,
   # TODO(mdan): Use this at compilation.
   conversion_map.additional_imports.update(deps)
 
-  return node, new_name
+  return node, new_name, namespace
 
 
 def _static_analysis_pass(node, ctx):
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index 7066739eb8..962009c71f 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -43,14 +43,15 @@ class ConversionTest(test.TestCase):
       conversion.entity_to_graph('dummy', conversion_map, None, None)
 
   def test_entity_to_graph_callable(self):
-
+    b = 2
     def f(a):
-      return a
+      return a + b
 
     conversion_map = conversion.ConversionMap(True, (), (), None)
-    ast, new_name = conversion.entity_to_graph(f, conversion_map, None, None)
+    ast, name, ns = conversion.entity_to_graph(f, conversion_map, None, None)
     self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
-    self.assertEqual('tf__f', new_name)
+    self.assertEqual('tf__f', name)
+    self.assertTrue(ns['b'] is b)
 
   def test_entity_to_graph_call_tree(self):
 
-- 
GitLab


From 554c587c54d0725d6da0ce39557d17b8393c35bc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 08:22:06 -0700
Subject: [PATCH 0761/1262] Experiment with pre-shuffled fully-connected
 weights

PiperOrigin-RevId: 192771889
---
 .../lite/kernels/internal/compatibility.h     |   1 +
 .../internal/optimized/optimized_ops.h        | 136 ++++++++++++++++++
 .../internal/reference/reference_ops.h        |  61 ++++++++
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 .../experimental_shuffle_fc_weights.cc        | 135 +++++++++++++++++
 .../graph_transformations.h                   |   1 +
 .../graph_transformations/identify_lstm.cc    |   6 +
 tensorflow/contrib/lite/toco/model.h          |   1 +
 8 files changed, 342 insertions(+)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc

diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 51426bb1c5..93fc6b6a76 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -77,6 +77,7 @@ limitations under the License.
 #endif
 
 // TODO(ahentz): Clean up.
+using int8 = std::int8_t;
 using uint8 = std::uint8_t;
 using int16 = std::int16_t;
 using uint16 = std::uint16_t;
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index fa91db7fe1..7fc6615965 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1203,6 +1203,142 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                  output_activation_max, output_data, output_dims, gemm_context);
 }
 
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label(
+      "ExperimentalShuffledFullyConnected/8bit");
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_EQ(output_activation_min, -32768);
+  TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  // The experimental shuffling is an optimization for matrix*vector product.
+  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
+  // batches>1.
+  TFLITE_DCHECK_EQ(batches, 1);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* shuffled_weights_ptr =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+#if defined USE_NEON
+  // We'll only need to xor signbit to the input activation values, as
+  // that xor-ing is pre-built into the shuffled weights values.
+  const uint8x16_t signbit = vdupq_n_u8(0x80);
+  const int right_shift = output_shift > 0 ? output_shift : 0;
+  const int left_shift = output_shift > 0 ? 0 : -output_shift;
+  for (int c = 0; c < output_depth; c += 4) {
+    // Accumulation loop.
+    int32x4_t row_accum0 = vdupq_n_s32(0);
+    int32x4_t row_accum1 = vdupq_n_s32(0);
+    int32x4_t row_accum2 = vdupq_n_s32(0);
+    int32x4_t row_accum3 = vdupq_n_s32(0);
+    for (int d = 0; d < accum_depth; d += 16) {
+      int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
+      int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
+      int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
+      int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
+      shuffled_weights_ptr += 64;
+      int8x16_t input =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + d)));
+      int16x8_t local_accum0 =
+          vmull_s8(vget_low_s8(weights0), vget_low_s8(input));
+      int16x8_t local_accum1 =
+          vmull_s8(vget_low_s8(weights1), vget_low_s8(input));
+      int16x8_t local_accum2 =
+          vmull_s8(vget_low_s8(weights2), vget_low_s8(input));
+      int16x8_t local_accum3 =
+          vmull_s8(vget_low_s8(weights3), vget_low_s8(input));
+      local_accum0 =
+          vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input));
+      local_accum1 =
+          vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input));
+      local_accum2 =
+          vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input));
+      local_accum3 =
+          vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input));
+      row_accum0 = vpadalq_s16(row_accum0, local_accum0);
+      row_accum1 = vpadalq_s16(row_accum1, local_accum1);
+      row_accum2 = vpadalq_s16(row_accum2, local_accum2);
+      row_accum3 = vpadalq_s16(row_accum3, local_accum3);
+    }
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_data + c);
+    reduced = vaddq_s32(reduced, bias_vec);
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, right_shift);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    vst1_s16(output_data + c, res16);
+  }
+#else
+  for (int c = 0; c < output_depth; c += 4) {
+    // Internal accumulation.
+    // Initialize accumulator with the bias-value.
+    int32 accum[4] = {0};
+    // Accumulation loop.
+    for (int d = 0; d < accum_depth; d += 16) {
+      for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 16; j++) {
+          int8 input_val = input_data[d + j] - 128;
+          int8 weights_val = *shuffled_weights_ptr++;
+          accum[i] += weights_val * input_val;
+        }
+      }
+    }
+    for (int i = 0; i < 4; i++) {
+      // Add bias value
+      int acc = accum[i] + bias_data[c + i];
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      acc =
+          MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
+      // Saturate, cast to int16, and store to output array.
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[c + i] = acc;
+    }
+  }
+#endif
+}
+
 template <typename T>
 inline void ExtractPatchIntoBufferColumn(
     const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 6a89dbc803..791fb52391 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -602,6 +602,67 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  // The experimental shuffling is an optimization for matrix*vector product.
+  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
+  // batches>1.
+  TFLITE_DCHECK_EQ(batches, 1);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* shuffled_weights_ptr =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+  for (int c = 0; c < output_depth; c += 4) {
+    // Internal accumulation.
+    // Initialize accumulator with the bias-value.
+    int32 accum[4] = {0};
+    // Accumulation loop.
+    for (int d = 0; d < accum_depth; d += 16) {
+      for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 16; j++) {
+          int8 input_val = input_data[d + j] - 128;
+          int8 weights_val = *shuffled_weights_ptr++;
+          accum[i] += weights_val * input_val;
+        }
+      }
+    }
+    for (int i = 0; i < 4; i++) {
+      // Add bias value
+      int acc = accum[i] + bias_data[c + i];
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      acc =
+          MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
+      // Saturate, cast to int16, and store to output array.
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[c + i] = acc;
+    }
+  }
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 4c8652d62e..5b86e4e5ae 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -219,6 +219,7 @@ cc_library(
         "graph_transformations/drop_fake_quant.cc",
         "graph_transformations/drop_im2col_arrays.cc",
         "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/experimental_shuffle_fc_weights.cc",
         "graph_transformations/fuse_activation_functions.cc",
         "graph_transformations/fuse_binary_into_following_affine.cc",
         "graph_transformations/fuse_binary_into_preceding_affine.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
new file mode 100644
index 0000000000..f098981a5c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
+  Operator* op = model->operators[op_index].get();
+  if (op->type != OperatorType::kFullyConnected) {
+    return false;
+  }
+  FullyConnectedOperator* fc_op = static_cast<FullyConnectedOperator*>(op);
+  // Exit if this FC op already has shuffled weights
+  if (fc_op->experimental_shuffled_weights) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(fc_op->inputs[0]);
+  const string& weights_name = fc_op->inputs[1];
+  Array& weights_array = model->GetArray(weights_name);
+  const Array& output_array = model->GetArray(fc_op->outputs[0]);
+  // Exit if this FC op isn't quantized with uint8 inputs and int16 outputs,
+  // the only case where we are currently interested in providing a fast path
+  // with shuffled weights.
+  if (input_array.data_type != ArrayDataType::kUint8 ||
+      weights_array.data_type != ArrayDataType::kUint8 ||
+      output_array.data_type != ArrayDataType::kInt16 ||
+      !input_array.quantization_params || !weights_array.quantization_params ||
+      !output_array.quantization_params) {
+    return false;
+  }
+  // Exit if the shapes aren't known
+  if (!input_array.has_shape() || !weights_array.has_shape()) {
+    return false;
+  }
+  // Exit if, based on the known shapes, this FC op is not a GEMV.
+  // The shuffling of FC weights is only useful to enable fast GEMV paths.
+  const Shape& input_shape = input_array.shape();
+  for (int i = 0; i < input_shape.dimensions_count() - 1; i++) {
+    if (input_shape.dims(i) != 1) {
+      // The input activations, shaped as a matrix, have multiple columns.
+      // This FC op isn't a matrix*vector multiplication.
+      AddMessageF(
+          "Not applying experimental shuffling to the weights of %s because "
+          "it's not a matrix*vector product",
+          LogName(*op));
+      return false;
+    }
+  }
+  // Exit if the weights shape isn't an integral multiple of the shuffled
+  // block shape, 4x16. We don't want to have to write code dealing with
+  // odd sizes, that would go un-exercised at the moment as the models
+  // for which we need this shuffling have shapes that are multiples of that
+  // 4x16 block size. In fact, much of the rationale for this shuffling is
+  // to avoid cache aliasin issue with large power-of-two depths, with our
+  // models motivating this shuffling having FC weights shapes like
+  // 4096x2048. Thus, if some model doesn't get the shuffling because of that
+  // size requirement, that might be just fine --- that model might just not
+  // suffer from that cache aliasing issue that we have with large powers of
+  // two.
+  const Shape& weights_shape = weights_array.shape();
+  if (weights_shape.dimensions_count() != 2) {
+    return false;
+  }
+  const int rows = weights_shape.dims(0);
+  const int cols = weights_shape.dims(1);
+  if (rows % 4 || cols % 16) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because its "
+        "shape isn't a multiple of the shuffling block shape, 4x16",
+        LogName(*op));
+    return false;
+  }
+  // Exit if the weights aren't already a constant array.
+  if (!weights_array.buffer) {
+    return false;
+  }
+  // Exit if the weights are used by more than one op.
+  if (CountOpsWithInput(*model, weights_name) != 1) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because that "
+        "array is consumed by other operators",
+        LogName(*op));
+    return false;
+  }
+  // Compute the shuffled weights
+  auto& weights_data =
+      weights_array.GetMutableBuffer<ArrayDataType::kUint8>().data;
+  CHECK_EQ(rows * cols, weights_data.size());
+  std::vector<uint8> shuffled_data(weights_data.size());
+  uint8* shuffled_data_ptr = shuffled_data.data();
+  for (int r = 0; r < rows; r += 4) {
+    for (int c = 0; c < cols; c += 16) {
+      for (int i = 0; i < 4; i++) {
+        const uint8* src_data_ptr = weights_data.data() + (r + i) * cols + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the runtime will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_data_ptr++ = dst_val;
+        }
+      }
+    }
+  }
+  CHECK_EQ(shuffled_data_ptr, shuffled_data.data() + rows * cols);
+  // Switch this FC op to using the shuffled weights.
+  weights_data = std::move(shuffled_data);
+  fc_op->experimental_shuffled_weights = true;
+  AddMessageF("Applied experimental shuffling to the weights of %s",
+              LogName(*op));
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 384bd85b81..dbf029a853 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -187,6 +187,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero)
 DECLARE_GRAPH_TRANSFORMATION(Dequantize)
 DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
+DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights)
 
 class ResolveReshapeAttributes : public GraphTransformation {
  public:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index c363b93394..e9842524c8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -306,6 +306,12 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
+  if (static_cast<FullyConnectedOperator*>(fully_connected)
+          ->experimental_shuffled_weights) {
+    // Not yet implemented: experimental shuffled weights in fused LSTM cell.
+    return false;
+  }
+
   // Emplace a new LSTM cell operator
   auto* lstm_cell_op = new LstmCellOperator;
   lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 716a579d22..1c4c96ae70 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -425,6 +425,7 @@ struct SpaceToDepthOperator : Operator {
 // input activations as a matrix, followed by a MatMul node.
 struct FullyConnectedOperator : Operator {
   FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
+  bool experimental_shuffled_weights = false;
 };
 
 // Dequantization operator, converting a quantized array of integers with
-- 
GitLab


From cd7ba4390360e1860cd57a6674a8423cf56b55bd Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Fri, 13 Apr 2018 10:02:25 -0700
Subject: [PATCH 0762/1262] Add debugging checks for setting cuda stream, so it
 will check fail if the stream is not set or set to a wrong one when running
 cudnn methods that conceptually require a stream.

Also add missing cudnnSetStream()s for DoRnnForwardImpl() and
DoRnnBackwardImpl().

Implementation details:
1. a current_cudnn_stream_ member is added which will be set in cudnnSetStream()
2. a different macro is used to wrap cudnn methods that require a stream in
   order to verify whether the provided stream is same as current_cudnn_stream_,
   and the program will check fail if not

PiperOrigin-RevId: 192783913
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 215 ++++++++++++--------
 tensorflow/stream_executor/cuda/cuda_dnn.h  |  24 ++-
 2 files changed, 151 insertions(+), 88 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 1dc7f991b3..4a6b2bf5d7 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -169,11 +169,34 @@ static port::ThreadPool* GetCudaThreadpool() {
     }                                                              \
   } __name;
 
+#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)        \
+  struct WrapperShim__##__name {                                         \
+    template <typename... Args>                                          \
+    cudnnStatus_t operator()(CudnnSupport* dnn, Stream* s, Args... args) \
+        SHARED_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {                  \
+      CHECK_NOTNULL(s);                                                  \
+      CHECK_EQ(s, dnn->GetCurrentDnnStream())                            \
+          << "Stream is not set correctly!";                             \
+      cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()}; \
+      cudnnStatus_t retval = ::__name(args...);                          \
+      return retval;                                                     \
+    }                                                                    \
+  } __name;
+
+// Handles cudnnSetStream differently in order to add debug information.
+struct WrapperShim__cudnnSetStream {
+  cudnnStatus_t operator()(CudnnSupport* dnn, Stream* stream,
+                           cudnnHandle_t handle)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {
+    dnn->SetCurrentDnnStream(stream);
+    cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()};
+    cudnnStatus_t retval = ::cudnnSetStream(handle, AsCUDAStreamValue(stream));
+    return retval;
+  }
+} cudnnSetStream;
+
 // clang-format off
 #define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnBatchNormalizationBackward)                \
-  __macro(cudnnBatchNormalizationForwardInference)        \
-  __macro(cudnnBatchNormalizationForwardTraining)         \
   __macro(cudnnGetConvolutionNdForwardOutputDim)          \
   __macro(cudnnGetConvolutionForwardAlgorithm)            \
   __macro(cudnnCreateTensorDescriptor)                    \
@@ -190,16 +213,25 @@ static port::ThreadPool* GetCudaThreadpool() {
   __macro(cudnnDestroyConvolutionDescriptor)              \
   __macro(cudnnCreate)                                    \
   __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
   __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
   __macro(cudnnSetConvolutionNdDescriptor)                \
   __macro(cudnnSetTensor4dDescriptor)                     \
   __macro(cudnnSetTensorNdDescriptor)                     \
-  __macro(cudnnSetFilterNdDescriptor)                     \
+  __macro(cudnnSetFilterNdDescriptor)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(__macro)       \
+  __macro(cudnnBatchNormalizationBackward)                \
+  __macro(cudnnBatchNormalizationForwardInference)        \
+  __macro(cudnnBatchNormalizationForwardTraining)         \
+  __macro(cudnnActivationForward)                         \
+  __macro(cudnnConvolutionForward)                        \
+  __macro(cudnnConvolutionBackwardBias)                   \
+  __macro(cudnnTransformTensor)                           \
   __macro(cudnnPoolingForward)                            \
   __macro(cudnnPoolingBackward)                           \
   __macro(cudnnLRNCrossChannelForward)                    \
@@ -207,9 +239,11 @@ static port::ThreadPool* GetCudaThreadpool() {
   __macro(cudnnAddTensor)                                 \
   __macro(cudnnConvolutionBackwardData)                   \
   __macro(cudnnConvolutionBackwardFilter)
-// clang-format on
 
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_WITH_STREAM
 
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
@@ -225,14 +259,15 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 // APIs in R3 but not in R5
 // clang-format off
 #if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 5000
-#define CUDNN_DNN_ROUTINE_EACH_R3(__macro)                    \
+#define CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(__macro)        \
   __macro(cudnnAddTensor_v3)                                  \
   __macro(cudnnConvolutionBackwardData_v3)                    \
   __macro(cudnnConvolutionBackwardFilter_v3)
 // clang-format on
 
-CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R3
+CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM
 #endif
 
 // APIs in R5
@@ -254,29 +289,44 @@ CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
   __macro(cudnnGetRNNTrainingReserveSize)                     \
   __macro(cudnnGetRNNLinLayerMatrixParams)                    \
   __macro(cudnnGetRNNLinLayerBiasParams)                      \
-  __macro(cudnnRNNForwardInference)                           \
-  __macro(cudnnRNNForwardTraining)                            \
-  __macro(cudnnRNNBackwardData)                               \
-  __macro(cudnnRNNBackwardWeights)                            \
   __macro(cudnnSetRNNDescriptor)                              \
   __macro(cudnnGetFilterNdDescriptor)
 
 // clang-format on
-
 CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R5
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(__macro)        \
+  __macro(cudnnRNNForwardInference)                           \
+  __macro(cudnnRNNForwardTraining)                            \
+  __macro(cudnnRNNBackwardData)                               \
+  __macro(cudnnRNNBackwardWeights)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM
 #endif
 
 // APIs in R6
 // clang-format off
 #if CUDNN_VERSION >= 6000
 #define CUDNN_DNN_ROUTINE_EACH_R6(__macro)                    \
-  __macro(cudnnConvolutionBiasActivationForward)              \
   __macro(cudnnSetRNNDescriptor_v6)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R6
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(__macro)        \
+  __macro(cudnnConvolutionBiasActivationForward)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM
 #endif
 
 // APIs in R7
@@ -291,8 +341,6 @@ CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R7
 #endif
 
-#undef CUDNN_DNN_ROUTINE_EACH
-
 }  // namespace wrap
 
 namespace {
@@ -419,7 +467,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
 }  // namespace
 
 CudnnSupport::CudnnSupport(CUDAExecutor* parent)
-    : parent_(parent), dnn_handle_(nullptr) {}
+    : parent_(parent), dnn_handle_(nullptr), current_dnn_stream_(nullptr) {}
 
 CudnnSupport::~CudnnSupport() {
   auto status = wrap::cudnnDestroy(parent_, ToHandle(dnn_handle_));
@@ -1660,6 +1708,12 @@ bool CudnnSupport::DoRnnForwardImpl(
 
   // check params size
   mutex_lock lock{dnn_handle_mutex_};
+  auto set_stream_status =
+      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: "
+               << ToString(set_stream_status);
+  }
 
   if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
                              input_desc)) {
@@ -1720,7 +1774,7 @@ bool CudnnSupport::DoRnnForwardImpl(
   cudnnStatus_t status;
   if (!is_training) {
     status = wrap::cudnnRNNForwardInference(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1733,7 +1787,7 @@ bool CudnnSupport::DoRnnForwardImpl(
         workspace.size() /*workSpaceSizeInBytes*/);
   } else {
     status = wrap::cudnnRNNForwardTraining(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1810,6 +1864,12 @@ bool CudnnSupport::DoRnnBackwardImpl(
 
   // check params size
   mutex_lock lock{dnn_handle_mutex_};
+  auto set_stream_status =
+      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: "
+               << ToString(set_stream_status);
+  }
 
   if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
                              input_desc)) {
@@ -1841,10 +1901,11 @@ bool CudnnSupport::DoRnnBackwardImpl(
   }
   // make the backward data call
   cudnnStatus_t status = wrap::cudnnRNNBackwardData(
-      parent_, ToHandle(dnn_handle_) /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-      model_dims.seq_length /*seqLength*/, output_desc.handles() /*yDesc*/,
-      output_data.opaque() /*y*/, output_desc.handles() /*dyDesc*/,
-      output_backprop_data.opaque() /*dy*/, output_h_desc.handle() /*dhyDesc*/,
+      this, stream, ToHandle(dnn_handle_) /*handle*/,
+      rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
+      output_desc.handles() /*yDesc*/, output_data.opaque() /*y*/,
+      output_desc.handles() /*dyDesc*/, output_backprop_data.opaque() /*dy*/,
+      output_h_desc.handle() /*dhyDesc*/,
       output_h_backprop_data.opaque() /*dhy*/,
       output_c_desc.handle() /*dcyDesc*/,
       output_c_backprop_data.opaque() /*dcy*/,
@@ -1873,7 +1934,7 @@ bool CudnnSupport::DoRnnBackwardImpl(
     stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
     // make the backward weight call
     status = wrap::cudnnRNNBackwardWeights(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -2517,8 +2578,7 @@ bool CudnnSupport::DoConvolveImpl(
                                    GetConvComputeType<T>()};
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -2668,7 +2728,7 @@ bool CudnnSupport::DoConvolveImpl(
     }
   }
   status = wrap::cudnnConvolutionForward(
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
@@ -2737,8 +2797,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
       static_cast<cudnnDataType_t>(cudnn_compute_type)};
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   CHECK(status == CUDNN_STATUS_SUCCESS)
       << "failed to set stream for cudnn handle: " << ToString(status);
 
@@ -2804,7 +2863,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
           << "\noutput_data->opaque() = " << output_data->opaque();
 
   status = wrap::cudnnConvolutionBiasActivationForward(
-      parent_, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
+      this, stream, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
       /*srcDesc=*/conv_input_nd.handle(), /*srcData=*/conv_input_data.opaque(),
       /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(),
       /*convDesc=*/conv.handle(), algo, /*workSpace=*/scratch.opaque(),
@@ -3009,8 +3068,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3046,7 +3104,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     }
 
     status = wrap::cudnnBatchNormalizationForwardTraining(
-        parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
         batch_mean_opaque, batch_var_opaque, epsilon, saved_mean->opaque(),
@@ -3063,7 +3121,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     const void* maybe_inv_var = estimated_variance.opaque();
 #endif
     status = wrap::cudnnBatchNormalizationForwardInference(
-        parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(),
         estimated_mean.opaque(), maybe_inv_var, epsilon);
@@ -3114,8 +3172,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3136,7 +3193,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
   float zero = 0.0;
 
   status = wrap::cudnnBatchNormalizationBackward(
-      parent_, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
+      this, stream, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
       x_descriptor.handle(), x.opaque(), x_descriptor.handle(),
       y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
@@ -3326,7 +3383,7 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   float alpha = 1.0f;
   float beta = 0.0f;
   auto status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
       backward_output_data.opaque(), &beta, transformed_out_back_nd.handle(),
       (*transform_scratch)->mutable_device_memory()->opaque());
 
@@ -3345,8 +3402,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  cudnnStatus_t status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                              AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3357,7 +3413,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   ScopedTensorDescriptor output_tensor_desc(
       parent_, output_desc, ToCudnnDataType(output_type, output_desc.layout()));
   status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
+      this, stream, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
       input_data.opaque(), &beta, output_tensor_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3384,8 +3440,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3554,7 +3609,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 #else
   status = wrap::cudnnConvolutionBackwardData_v3(
 #endif
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
       /*alpha=*/alpha,
       /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(),
@@ -3655,8 +3710,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3826,7 +3880,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
 #else
   status = wrap::cudnnConvolutionBackwardFilter_v3(
 #endif
-      parent_, ToHandle(dnn_handle_), /*alpha=*/alpha,
+      this, stream, ToHandle(dnn_handle_), /*alpha=*/alpha,
       /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(),
       /*diffDesc=*/out_back_nd.handle(),
@@ -3922,8 +3976,7 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3938,7 +3991,7 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
   float beta = 0.0;
 
   status = wrap::cudnnConvolutionBackwardBias(
-      parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
       input_data.opaque(), &beta, bias_nd.handle(),
       backward_bias_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4143,8 +4196,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
   }
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4158,7 +4210,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
 #else
   status = wrap::cudnnAddTensor_v3(
 #endif
-      parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
       biases.opaque(), &beta, input_descriptor.handle(), output_data->opaque());
 
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4176,8 +4228,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
                               DeviceMemory<float>* output_data,
                               uint64 options) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4221,7 +4272,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
   // Beta is the output scaling factor.
   float beta = 0.0;
   status = wrap::cudnnActivationForward(
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
 #if CUDNN_VERSION >= 5000
       activation_desc.handle(),
 #else
@@ -4245,8 +4296,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<double>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4262,7 +4312,7 @@ bool CudnnSupport::DoPoolForward(
                                    CUDNN_DATA_DOUBLE};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4280,8 +4330,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<float>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4297,7 +4346,7 @@ bool CudnnSupport::DoPoolForward(
                                    CUDNN_DATA_FLOAT};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4315,8 +4364,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<Eigen::half>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4331,7 +4379,7 @@ bool CudnnSupport::DoPoolForward(
   ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4351,8 +4399,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<double>& input_diff_data,
     DeviceMemory<double>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4368,7 +4415,7 @@ bool CudnnSupport::DoPoolBackward(
                                    CUDNN_DATA_DOUBLE};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4389,8 +4436,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<float>& input_diff_data,
     DeviceMemory<float>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4406,7 +4452,7 @@ bool CudnnSupport::DoPoolBackward(
                                    CUDNN_DATA_FLOAT};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4427,8 +4473,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<Eigen::half>& input_diff_data,
     DeviceMemory<Eigen::half>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4443,7 +4488,7 @@ bool CudnnSupport::DoPoolBackward(
   ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4478,8 +4523,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
 
   // Launch the normalization.
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4494,7 +4538,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
   float beta = 0.0f;
 
   status = wrap::cudnnLRNCrossChannelForward(
-      parent_, ToHandle(dnn_handle_), normalize.handle(),
+      this, stream, ToHandle(dnn_handle_), normalize.handle(),
       CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(), input_data.opaque(),
       &beta, dims.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4521,8 +4565,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   }
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4535,7 +4578,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   float beta = 0.0f;
 
   status = wrap::cudnnLRNCrossChannelBackward(
-      parent_, ToHandle(dnn_handle_), normalize.handle(),
+      this, stream, ToHandle(dnn_handle_), normalize.handle(),
       CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(),
       normalized_data.opaque(), dims.handle(),
       normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 0e5368aca8..7518b23757 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -625,10 +625,27 @@ class CudnnSupport : public dnn::DnnSupport {
                          dnn::DataType output_type, float scale,
                          DeviceMemoryBase* output_data) override;
 
- private:
-  // Guards the enqueueing of DNN operations via the dnn_handle_ below.
+  const Stream* GetCurrentDnnStream() const
+      SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    return current_dnn_stream_;
+  }
+
+  void SetCurrentDnnStream(Stream* stream)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    current_dnn_stream_ = stream;
+  }
+
+  CUDAExecutor* GetParentExecutor() { return parent_; }
+
+  // Guards the enqueueing of DNN operations via the dnn_handle_ below, and
+  // access to current_dnn_stream_.
+  //
+  // This is a public member because we need to add thread safty annotations in
+  // the cudnn wrapper functions in the cc file, which need to access this
+  // mutex (the annotations require C++ permission checks).
   mutex dnn_handle_mutex_;
 
+ private:
   CUDAExecutor* parent_;  // Parent executor object. Not owned.
 
   // cudnn library handle. cudnnHandle_t type is not present in this header to
@@ -636,6 +653,9 @@ class CudnnSupport : public dnn::DnnSupport {
   // single cuda_dnn translation unit.
   void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_);
 
+  // The current cudnn stream that is set by cudnnSetStream().
+  Stream* current_dnn_stream_ GUARDED_BY(dnn_handle_mutex_);
+
   // NOTE(keveman): Temporary data layout transformation until cuDNN supports
   // kBatchYXDepth for backward pass. This function allocates temporary memory,
   // lays out the source data into the temporary but in the kBatchDepthXY
-- 
GitLab


From 49f56ac87ee630cf4d15a161900e5a0bb631f563 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 10:07:10 -0700
Subject: [PATCH 0763/1262] Enable GCS remote cache in Windows Bazel Build

PiperOrigin-RevId: 192784701
---
 .../ci_build/windows/bazel/bazel_test_lib.sh    |  7 +++++++
 .../windows/cpu/pip/build_tf_windows.sh         | 17 ++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index d654b433e7..b2e16902d6 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -140,6 +140,13 @@ function run_configure_for_gpu_build {
   echo "" | ./configure
 }
 
+function set_gcs_remote_cache_options {
+  echo "build --experimental_remote_spawn_cache" >> .bazelrc
+  echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> .bazelrc
+  echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> .bazelrc
+  echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> .bazelrc
+}
+
 function create_python_test_dir() {
   rm -rf "$1"
   mkdir -p "$1"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 5e9ae497e1..4657ff196b 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,20 +42,27 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
+# Recreate an empty bazelrc file under source root
+rm -f .bazelrc
+touch .bazelrc
+
 skip_test=0
 
 for ARG in "$@"; do
   if [[ "$ARG" == --skip_test ]]; then
     skip_test=1
+  elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then
+    set_gcs_remote_cache_options
   fi
 done
 
-run_configure_for_cpu_build
-
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
 # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-BUILD_OPTS="--define=override_eigen_strong_inline=true"
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+echo "build --define=override_eigen_strong_inline=true" >> .bazelrc
+
+run_configure_for_cpu_build
+
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$skip_test" == 1 ]]; then
   exit 0
@@ -73,7 +80,7 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-- 
GitLab


From a6bc4afc97ce7a2a285e549822d06f4cbf51c4ef Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 13 Apr 2018 10:19:24 -0700
Subject: [PATCH 0764/1262] Cherry-picking PR #18444 into r1.8

---
 tensorflow/contrib/tensorrt/BUILD                           | 2 +-
 .../contrib/tensorrt/resources/trt_resource_manager.cc      | 6 ++++++
 .../contrib/tensorrt/resources/trt_resource_manager.h       | 6 +-----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2f316767b3..fd3582e175 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -52,7 +52,6 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -183,6 +182,7 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index e663eed4dd..9c3698e5d1 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,6 +19,12 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
+  return instance_;
+}
+
 std::shared_ptr<tensorflow::ResourceMgr>
 tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
   // mutex is held for lookup only. Most instantiations where mutex will be held
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index 5f8ad491d3..bc15b51e05 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -29,11 +29,7 @@ class TRTResourceManager {
   TRTResourceManager() = default;
 
  public:
-  static std::shared_ptr<TRTResourceManager> instance() {
-    static std::shared_ptr<TRTResourceManager> instance_(
-        new TRTResourceManager);
-    return instance_;
-  }
+  static std::shared_ptr<TRTResourceManager> instance();
   // returns a manager for given op, if it doesn't exists it creates one
   std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
 
-- 
GitLab


From defc185d57233d5185c4d77c973d8e25256b1e73 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 10:27:11 -0700
Subject: [PATCH 0765/1262] DepthwiseConv Optimization Fixes

PiperOrigin-RevId: 192787669
---
 .../depthwiseconv_uint8_3x3_filter.h          | 170 +++++++++---------
 1 file changed, 86 insertions(+), 84 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index cdcb166b2f..55e0d5c3aa 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -386,12 +386,13 @@ inline void DotProductAndStore2yStride1(
 }
 
 // A kernel that is optimized on the number of output cells in the x and y
-// direction, and the stride. Assumes 3x3 filters of 16 depth.
-template <int kFixedOutputY, int kFixedOutputX, int kFixedStride = 1>
+// direction, and the stride. Assumes 3x3 filters of 8 depth.
+template <int kFixedOutputY, int kFixedOutputX, int kFixedStrideWidth,
+          int kFixedStrideHeight>
 struct ConvKernel3x3FilterDepth8 {};
 
 template <>
-struct ConvKernel3x3FilterDepth8<8, 8, 1> {
+struct ConvKernel3x3FilterDepth8<8, 8, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -1642,7 +1643,7 @@ struct ConvKernel3x3FilterDepth8<8, 8, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<4, 4, 1> {
+struct ConvKernel3x3FilterDepth8<4, 4, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -1957,7 +1958,7 @@ struct ConvKernel3x3FilterDepth8<4, 4, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<4, 2, 1> {
+struct ConvKernel3x3FilterDepth8<4, 2, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -2123,7 +2124,7 @@ struct ConvKernel3x3FilterDepth8<4, 2, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<4, 1, 1> {
+struct ConvKernel3x3FilterDepth8<4, 1, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -2235,7 +2236,7 @@ struct ConvKernel3x3FilterDepth8<4, 1, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<2, 2, 1> {
+struct ConvKernel3x3FilterDepth8<2, 2, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -2373,7 +2374,7 @@ struct ConvKernel3x3FilterDepth8<2, 2, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<2, 4, 1> {
+struct ConvKernel3x3FilterDepth8<2, 4, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -2554,7 +2555,7 @@ struct ConvKernel3x3FilterDepth8<2, 4, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<1, 4, 1> {
+struct ConvKernel3x3FilterDepth8<1, 4, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -2669,7 +2670,7 @@ struct ConvKernel3x3FilterDepth8<1, 4, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<2, 1, 1> {
+struct ConvKernel3x3FilterDepth8<2, 1, 1, 1> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -2746,7 +2747,7 @@ struct ConvKernel3x3FilterDepth8<2, 1, 1> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<4, 2, 2> {
+struct ConvKernel3x3FilterDepth8<4, 2, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3063,7 +3064,7 @@ struct ConvKernel3x3FilterDepth8<4, 2, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<4, 4, 2> {
+struct ConvKernel3x3FilterDepth8<4, 4, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3073,13 +3074,13 @@ struct ConvKernel3x3FilterDepth8<4, 4, 2> {
                          int32 output_activation_max, uint8* output_ptr,
                          int output_depth, int output_width) {
     // Reuse 4x2 kernel twice.
-    ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+    ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
         input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
         filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
         output_activation_min, output_activation_max, output_ptr, output_depth,
         output_width);
 
-    ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+    ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
         input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
         filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
         output_shift, output_activation_min, output_activation_max,
@@ -3088,7 +3089,7 @@ struct ConvKernel3x3FilterDepth8<4, 4, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<4, 1, 2> {
+struct ConvKernel3x3FilterDepth8<4, 1, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3243,7 +3244,7 @@ struct ConvKernel3x3FilterDepth8<4, 1, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<2, 2, 2> {
+struct ConvKernel3x3FilterDepth8<2, 2, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3433,7 +3434,7 @@ struct ConvKernel3x3FilterDepth8<2, 2, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<2, 4, 2> {
+struct ConvKernel3x3FilterDepth8<2, 4, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3443,13 +3444,13 @@ struct ConvKernel3x3FilterDepth8<2, 4, 2> {
                          int32 output_activation_max, uint8* output_ptr,
                          int output_depth, int output_width) {
     // Reuse 2x2 kernel twice.
-    ConvKernel3x3FilterDepth8<2, 2, 2>::Run(
+    ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
         input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
         filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
         output_activation_min, output_activation_max, output_ptr, output_depth,
         output_width);
 
-    ConvKernel3x3FilterDepth8<2, 2, 2>::Run(
+    ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
         input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
         filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
         output_shift, output_activation_min, output_activation_max,
@@ -3458,7 +3459,7 @@ struct ConvKernel3x3FilterDepth8<2, 4, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<2, 1, 2> {
+struct ConvKernel3x3FilterDepth8<2, 1, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3551,7 +3552,7 @@ struct ConvKernel3x3FilterDepth8<2, 1, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<1, 2, 2> {
+struct ConvKernel3x3FilterDepth8<1, 2, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3643,7 +3644,7 @@ struct ConvKernel3x3FilterDepth8<1, 2, 2> {
 };
 
 template <>
-struct ConvKernel3x3FilterDepth8<1, 4, 2> {
+struct ConvKernel3x3FilterDepth8<1, 4, 2, 2> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3798,8 +3799,8 @@ struct ConvKernel3x3FilterDepth8<1, 4, 2> {
   }
 };
 
-template <>
-struct ConvKernel3x3FilterDepth8<1, 1> {
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight> {
   static inline void Run(const uint8* input_ptr, int input_depth,
                          int32 input_offset, int input_row_size,
                          const uint8* filter_ptr, int32 filter_offset,
@@ -3872,12 +3873,11 @@ inline void ShuffleInput(const uint8* input_ptr, int input_depth,
   }
 }
 
-template <int kFixedHeight, int kFixedStrideWidth,
-          int kFixedStrideHeight = kFixedStrideWidth>
+template <int kFixedHeight, int kFixedStrideWidth, int kFixedStrideHeight>
 struct ConvRow3x3FilterDepth8 {};
 
-template <int kFixedStrideWidth>
-struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth> {
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth, kFixedStrideHeight> {
   static inline void Run(const uint8* input_data, int start_x, int start_y,
                          int input_depth, int input_width, int input_height,
                          int input_row_size, int32 input_offset,
@@ -3899,11 +3899,11 @@ struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth>::Run(
-            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
-            filter_offset, bias_ptr, output_offset, output_multiplier,
-            output_shift, output_activation_min, output_activation_max,
-            output_ptr, output_depth, output_width);
+        ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
 
         input_ptr += 8;
         output_ptr += 8;
@@ -3924,11 +3924,11 @@ struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<1, 1>::Run(
-            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
-            filter_offset, bias_ptr, output_offset, output_multiplier,
-            output_shift, output_activation_min, output_activation_max,
-            output_ptr, output_depth, output_width);
+        ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
 
         input_ptr += 8;
         output_ptr += 8;
@@ -3942,8 +3942,8 @@ struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth> {
   }
 };
 
-template <int kFixedStrideWidth>
-struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth, kFixedStrideHeight> {
   static inline void Run(const uint8* input_data, int start_x, int start_y,
                          int input_depth, int input_width, int input_height,
                          int input_row_size, int32 input_offset,
@@ -3965,11 +3965,11 @@ struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth>::Run(
-            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
-            filter_offset, bias_ptr, output_offset, output_multiplier,
-            output_shift, output_activation_min, output_activation_max,
-            output_ptr, output_depth, output_width);
+        ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
 
         input_ptr += 8;
         output_ptr += 8;
@@ -3990,11 +3990,11 @@ struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth>::Run(
-            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
-            filter_offset, bias_ptr, output_offset, output_multiplier,
-            output_shift, output_activation_min, output_activation_max,
-            output_ptr, output_depth, output_width);
+        ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
 
         input_ptr += 8;
         output_ptr += 8;
@@ -4015,11 +4015,11 @@ struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth>::Run(
-            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
-            filter_offset, bias_ptr, output_offset, output_multiplier,
-            output_shift, output_activation_min, output_activation_max,
-            output_ptr, output_depth, output_width);
+        ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
 
         input_ptr += 8;
         output_ptr += 8;
@@ -4034,7 +4034,7 @@ struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
 };
 
 template <>
-struct ConvRow3x3FilterDepth8<4, 1> {
+struct ConvRow3x3FilterDepth8<4, 1, 1> {
   static inline void Run(const uint8* input_data, int start_x, int start_y,
                          int input_depth, int input_width, int input_height,
                          int input_row_size, int32 input_offset,
@@ -4056,7 +4056,7 @@ struct ConvRow3x3FilterDepth8<4, 1> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<4, 4, 1>::Run(
+        ConvKernel3x3FilterDepth8<4, 4, 1, 1>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4082,7 +4082,7 @@ struct ConvRow3x3FilterDepth8<4, 1> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<4, 2, 1>::Run(
+        ConvKernel3x3FilterDepth8<4, 2, 1, 1>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4107,7 +4107,7 @@ struct ConvRow3x3FilterDepth8<4, 1> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<4, 1, 1>::Run(
+        ConvKernel3x3FilterDepth8<4, 1, 1, 1>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4126,7 +4126,7 @@ struct ConvRow3x3FilterDepth8<4, 1> {
 };
 
 template <>
-struct ConvRow3x3FilterDepth8<4, 2> {
+struct ConvRow3x3FilterDepth8<4, 2, 2> {
   // The buffer size of the shuffled input.
   static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; }
 
@@ -4195,7 +4195,7 @@ struct ConvRow3x3FilterDepth8<4, 2> {
         const uint8* shuffled_ptr = &shuffle_workspace[0];
 
         for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
-          ConvKernel3x3FilterDepth8<4, 4, 2>::Run(
+          ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
               shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset,
               bias_ptr, output_offset, output_multiplier, output_shift,
               output_activation_min, output_activation_max, output_ptr,
@@ -4221,7 +4221,7 @@ struct ConvRow3x3FilterDepth8<4, 2> {
       DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
 
       for (; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<4, 4, 2>::Run(
+        ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4249,7 +4249,7 @@ struct ConvRow3x3FilterDepth8<4, 2> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+        ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4274,7 +4274,7 @@ struct ConvRow3x3FilterDepth8<4, 2> {
       uint8* output_ptr = output_data;
 
       for (int depth = 0; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<4, 1, 2>::Run(
+        ConvKernel3x3FilterDepth8<4, 1, 2, 2>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4293,7 +4293,7 @@ struct ConvRow3x3FilterDepth8<4, 2> {
 };
 
 template <>
-struct ConvRow3x3FilterDepth8<8, 2> {
+struct ConvRow3x3FilterDepth8<8, 2, 2> {
   static inline void Run(const uint8* input_data, int start_x, int start_y,
                          int input_depth, int input_width, int input_height,
                          int input_row_size, int32 input_offset,
@@ -4305,14 +4305,14 @@ struct ConvRow3x3FilterDepth8<8, 2> {
                          int output_depth, int output_width,
                          uint8* shuffle_workspace) {
     // Reuse 4 row kernels twice.
-    ConvRow3x3FilterDepth8<4, 2>::Run(
+    ConvRow3x3FilterDepth8<4, 2, 2>::Run(
         input_data, start_x, start_y, input_depth, input_width, input_height,
         input_row_size, input_offset, filter_data, filter_offset, bias_data,
         output_offset, output_multiplier, output_shift, output_activation_min,
         output_activation_max, output_data, output_depth, output_width,
         shuffle_workspace);
 
-    ConvRow3x3FilterDepth8<4, 2>::Run(
+    ConvRow3x3FilterDepth8<4, 2, 2>::Run(
         input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth,
         input_width, input_height, input_row_size, input_offset, filter_data,
         filter_offset, bias_data, output_offset, output_multiplier,
@@ -4323,7 +4323,7 @@ struct ConvRow3x3FilterDepth8<8, 2> {
 };
 
 template <>
-struct ConvRow3x3FilterDepth8<8, 1> {
+struct ConvRow3x3FilterDepth8<8, 1, 1> {
   // The buffer size of the shuffled input.
   static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; }
 
@@ -4359,7 +4359,7 @@ struct ConvRow3x3FilterDepth8<8, 1> {
         const uint8* shuffled_ptr = shuffle_workspace;
 
         for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
-          ConvKernel3x3FilterDepth8<8, 8, 1>::Run(
+          ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
               shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr,
               filter_offset, bias_ptr, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
@@ -4374,7 +4374,7 @@ struct ConvRow3x3FilterDepth8<8, 1> {
       }
 
       for (; depth <= output_depth - 8; depth += 8) {
-        ConvKernel3x3FilterDepth8<8, 8, 1>::Run(
+        ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
             input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
             filter_offset, bias_ptr, output_offset, output_multiplier,
             output_shift, output_activation_min, output_activation_max,
@@ -4391,14 +4391,14 @@ struct ConvRow3x3FilterDepth8<8, 1> {
     }
 
     // Handle the rest of the right side by re-using 4 row kernels twice.
-    ConvRow3x3FilterDepth8<4, 1>::Run(
+    ConvRow3x3FilterDepth8<4, 1, 1>::Run(
         input_data, out_x, start_y, input_depth, input_width, input_height,
         input_row_size, input_offset, filter_data, filter_offset, bias_data,
         output_offset, output_multiplier, output_shift, output_activation_min,
         output_activation_max, output_data, output_depth, output_width,
         shuffle_workspace);
 
-    ConvRow3x3FilterDepth8<4, 1>::Run(
+    ConvRow3x3FilterDepth8<4, 1, 1>::Run(
         input_data + 4 * input_row_size, out_x, start_y + 4, input_depth,
         input_width, input_height, input_row_size, input_offset, filter_data,
         filter_offset, bias_data, output_offset, output_multiplier,
@@ -4426,7 +4426,8 @@ inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
                    depth_multiplier == 1 &&
                    (stride_width == 1 || stride_width == 2) &&
                    (stride_height == 1 || stride_height == 2) &&
-                   pad_width == 0 && pad_height == 0 && (input_depth % 8) == 0;
+                   (stride_width == stride_height) && pad_width == 0 &&
+                   pad_height == 0 && (input_depth % 8) == 0;
 
   if (!supported) {
     return false;
@@ -4477,23 +4478,24 @@ inline void DepthwiseConv3x3Filter(
   TFLITE_DCHECK(pad_width == 0);
   TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
   TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
+  TFLITE_DCHECK(stride_width == stride_height);
 
   const int input_row_size = input_depth * (input_width + 2 * pad_width);
   const int output_row_size = output_depth * output_width;
   const int input_batch_size = input_row_size * (input_height + 2 * pad_height);
   const int output_batch_size = output_depth * output_width * output_height;
 
-  using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1>::Run);
-  conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1>::Run;
-  conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1>::Run;
-  conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1>::Run;
-  conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1>::Run;
+  using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1, 1>::Run);
+  conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1, 1>::Run;
+  conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1, 1>::Run;
+  conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1, 1>::Run;
+  conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1, 1>::Run;
 
   if (stride_width == 2) {
-    conv_1_output_row = ConvRow3x3FilterDepth8<1, 2>::Run;
-    conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2>::Run;
-    conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2>::Run;
-    conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2>::Run;
+    conv_1_output_row = ConvRow3x3FilterDepth8<1, 2, 2>::Run;
+    conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2, 2>::Run;
+    conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2, 2>::Run;
+    conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2, 2>::Run;
   }
 
   // Allocate maximum memory needed for shuffled input.
@@ -4505,10 +4507,10 @@ inline void DepthwiseConv3x3Filter(
   uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
 
   // Make sure the kernels using this buffer will not run out of bounds.
-  static_assert(ConvRow3x3FilterDepth8<8, 1>::ShuffleWorkspaceSize() <=
+  static_assert(ConvRow3x3FilterDepth8<8, 1, 1>::ShuffleWorkspaceSize() <=
                     DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
                 "Shuffle workspace size is too small.");
-  static_assert(ConvRow3x3FilterDepth8<4, 2>::ShuffleWorkspaceSize() <=
+  static_assert(ConvRow3x3FilterDepth8<4, 2, 2>::ShuffleWorkspaceSize() <=
                     DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
                 "Shuffle workspace size is too small.");
 
-- 
GitLab


From 6e8c908c8e299ddb46ac20b6a668e37ed37f24c0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 10:30:32 -0700
Subject: [PATCH 0766/1262] Disable x * x -> square(x) Grapler rewrite for
 complex types unless the op is on CPU. Square is not registered for complex
 types on GPU, and doing so produces a crash in with CUDA_ILLEGAL_INSTRUCTION
 when running it on open source ubuntu.

PiperOrigin-RevId: 192788160
---
 .../optimizers/arithmetic_optimizer.cc        | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 60b1af48ec..b80ae5fa40 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1782,13 +1782,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
 
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
       !OptimizedNodeExists(*node, "square")) {
-    NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
-    new_square_node->set_op("Square");
-    for (int i = 1; i < new_square_node->input_size(); ++i) {
-      new_square_node->set_input(i - 1, new_square_node->input(i));
+    const DataType type = GetDataTypeFromAttr(*node, "T");
+    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    string dontcare;
+    string device;
+    bool is_on_cpu =
+        DeviceNameUtils::SplitDeviceName(node->device(), &dontcare, &device) &&
+        str_util::StrContains(device, DEVICE_CPU);
+    if (!is_complex || is_on_cpu) {
+      NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
+      new_square_node->set_op("Square");
+      for (int i = 1; i < new_square_node->input_size(); ++i) {
+        new_square_node->set_input(i - 1, new_square_node->input(i));
+      }
+      new_square_node->mutable_input()->RemoveLast();
+      return new_square_node->name();
     }
-    new_square_node->mutable_input()->RemoveLast();
-    return new_square_node->name();
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-- 
GitLab


From 8303fa2a53071a7e4a346454f707d25abbd6e1b5 Mon Sep 17 00:00:00 2001
From: James Wexler <jwexler@google.com>
Date: Fri, 13 Apr 2018 13:33:37 -0400
Subject: [PATCH 0767/1262] closure proto library for example protos

---
 WORKSPACE             | 19 ++++++++++++-------
 tensorflow/core/BUILD | 16 ++++++++++++++++
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 11c5cdb207..d37e213922 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,13 +1,18 @@
 workspace(name = "org_tensorflow")
 
-http_archive(
+## DO NOT SUBMIT
+#http_archive(
+#    name = "io_bazel_rules_closure",
+#    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
+#    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
+#    urls = [
+#        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
+#        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
+#    ],
+#)
+local_repository(
     name = "io_bazel_rules_closure",
-    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
-    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
-    urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
-    ],
+    path = "/usr/local/google/home/jwexler/jameswex/rules_closure",
 )
 
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c5ca421ced..08884fa914 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -149,6 +149,7 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
+load("@io_bazel_rules_closure//closure:defs.bzl","closure_proto_library")
 
 exports_files(["ops/ops.pbtxt"])
 
@@ -244,6 +245,21 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
+proto_library(
+    name = "example_protos",
+    srcs = [
+        "example/example.proto",
+        "example/feature.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+closure_proto_library(
+    name = "example_protos_closure",
+    deps = [":example_protos"],
+    visibility = ["//visibility:public"],
+)
+
 exports_files([
     "framework/types.proto",
 ])
-- 
GitLab


From b004e233da511e2692277d5a98d72ec40917b4b2 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 13 Apr 2018 10:52:56 -0700
Subject: [PATCH 0768/1262] Internal change.

PiperOrigin-RevId: 192791493
---
 tensorflow/python/kernel_tests/BUILD | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index e504a9fd21..e82d738f14 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2669,10 +2669,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
-    tags = [
-        "manual",
-        "notap",  # b/30226163
-    ],
 )
 
 cuda_py_test(
-- 
GitLab


From ff97232dbf44c8c5515e10f7d3d72f215381bd65 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Fri, 13 Apr 2018 11:02:08 -0700
Subject: [PATCH 0769/1262] Fix comment in xla_data.proto related to padding
 value for Windows.

PiperOrigin-RevId: 192792971
---
 tensorflow/compiler/xla/xla_data.proto | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 1f16e6d251..f18d53c608 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -355,17 +355,19 @@ message WindowDimension {
   // positions of the window in this dimension.
   int64 stride = 2;
 
-  // If positive, means the amount of padding with zeroes to add to the base
-  // area at the low end of this dimension; if negative, its negative means the
-  // number of elements removed from the low end of this dimension. For example,
-  // in the horizontal dimension of a rectangle, this would be the number of
-  // zeroes to pad on the left, given that indices increase when going right.
+  // If positive, means the amount of padding to add to the base area at the low
+  // end of this dimension; if negative, its negative means the number of
+  // elements removed from the low end of this dimension. For example, in the
+  // horizontal dimension of a rectangle, this would be the number of padding
+  // values to pad on the left, given that indices increase when going right.
+  // The actual padding value depends upon the context. Convolution pads with
+  // zeros. ReduceWindow and SelectAndScatter pads with the reduce function's
+  // init value.
   int64 padding_low = 3;
 
-  // As padding_low, but on the high end of this dimension. For
-  // example, in the horizontal dimension of a rectangle, this would
-  // be the number of zeroes to pad on the right, given that indices
-  // increase when going right.
+  // As padding_low, but on the high end of this dimension. For example, in the
+  // horizontal dimension of a rectangle, this would be the number of values to
+  // pad on the right, given that indices increase when going right.
   int64 padding_high = 4;
 
   // Dilation factor of the sliding window in this dimension. A dilation factor
-- 
GitLab


From 4fa6ca2bb74aa27ffb71a23e4a8d72810c377b07 Mon Sep 17 00:00:00 2001
From: James Wexler <jwexler@google.com>
Date: Fri, 13 Apr 2018 14:09:42 -0400
Subject: [PATCH 0770/1262] review changes

---
 WORKSPACE             | 19 +++++++------------
 tensorflow/core/BUILD |  2 +-
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index d37e213922..4ddfb9a383 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,18 +1,13 @@
 workspace(name = "org_tensorflow")
 
-## DO NOT SUBMIT
-#http_archive(
-#    name = "io_bazel_rules_closure",
-#    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
-#    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
-#    urls = [
-#        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
-#        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
-#    ],
-#)
-local_repository(
+http_archive(
     name = "io_bazel_rules_closure",
-    path = "/usr/local/google/home/jwexler/jameswex/rules_closure",
+    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
+    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
+    ],
 )
 
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 08884fa914..ab25283cc4 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -149,7 +149,7 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
-load("@io_bazel_rules_closure//closure:defs.bzl","closure_proto_library")
+load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 
 exports_files(["ops/ops.pbtxt"])
 
-- 
GitLab


From 8e2fd4b30210ef633153b65d3d45cc51a3d4f0cf Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 13 Apr 2018 11:09:58 -0700
Subject: [PATCH 0771/1262] Use eager compatible wrappers in load_library for
 custom ops

---
 tensorflow/python/BUILD                       |  1 +
 tensorflow/python/framework/load_library.py   |  2 +-
 tensorflow/python/framework/python_op_gen.i   |  8 ++--
 .../tools/ci_build/builds/test_user_ops.sh    | 41 +++++++++++--------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index db17a3fe02..9209ca4b96 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3286,6 +3286,7 @@ tf_py_wrap_cc(
         "//tensorflow/core/profiler/internal:print_model_analysis",
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
         "//tensorflow/python/eager:pywrap_tfe_lib",
+        "//tensorflow/python/eager:python_eager_op_gen",
         "//util/python:python_headers",
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 1f2aa264c1..4f349304d3 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -60,7 +60,7 @@ def load_op_library(library_filename):
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
   op_list.ParseFromString(compat.as_bytes(op_list_str))
-  wrappers = py_tf.GetPythonWrappers(op_list_str)
+  wrappers = py_tf.GetEagerPythonWrappers(op_list_str)
 
   # Delete the library handle to release any memory held in C
   # that are no longer needed.
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index 26ec4e8e66..e39c425b05 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -16,10 +16,10 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-#include "tensorflow/python/framework/python_op_gen.h"
+#include "tensorflow/python/eager/python_eager_op_gen.h"
 %}
 
-// Input typemap for GetPythonWrappers.
+// Input typemap for GetEagerPythonWrappers.
 // Accepts a python object of 'bytes' type, and converts it to
 // a const char* pointer and size_t length. The default typemap
 // going from python bytes to const char* tries to decode the
@@ -37,5 +37,5 @@ limitations under the License.
 
 
 %ignoreall;
-%unignore tensorflow::GetPythonWrappers;
-%include "tensorflow/python/framework/python_op_gen.h"
+%unignore tensorflow::GetEagerPythonWrappers;
+%include "third_party/tensorflow/python/eager/python_eager_op_gen.h"
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index caa3a40817..c342367bac 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//')
 echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\
 "via pip installation"
 
-ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
-
-# Format OUTPUT for analysis
-if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
-  if [[ ${IS_MAC} == "1" ]]; then
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+function run_op() {
+  local ORIG_OUTPUT=$1
+  local ADDITIONAL_LOG=$2
+
+  # Format OUTPUT for analysis
+  if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
+    if [[ ${IS_MAC} == "1" ]]; then
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+    else
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    fi
   else
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    local OUTPUT="${ORIG_OUTPUT}"
   fi
-else
-  OUTPUT="${ORIG_OUTPUT}"
-fi
 
-EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
+  local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
 
-if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
-  die "FAILED: Output from user op (${OUTPUT}) does not match expected "\
-"output ${EXPECTED_OUTPUT}"
-else
-  echo "Output from user op (${OUTPUT}) matches expected output"
-fi
+  if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
+    local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected "\
+  "output ${EXPECTED_OUTPUT}"${ADDITIONAL_LOG}
+    die ${ERROR}
+  else
+    echo "Output from user op (${OUTPUT}) matches expected output"
+  fi
+}
+
+run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
+run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode"
 
 popd
 
-- 
GitLab


From e42ebc5b95856760332443987292e5d750050531 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Fri, 13 Apr 2018 11:06:49 -0700
Subject: [PATCH 0772/1262] Add more logging for failure cases in CUDATimer

PiperOrigin-RevId: 192793983
---
 tensorflow/stream_executor/cuda/cuda_timer.cc | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
index 7d78601fb9..8532f08725 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -73,16 +73,22 @@ float CUDATimer::GetElapsedMilliseconds() const {
   return elapsed_milliseconds;
 }
 
-bool CUDATimer::Start(CUDAStream *stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), start_event_,
-                                 stream->cuda_stream())
-      .ok();
+bool CUDATimer::Start(CUDAStream* stream) {
+  port::Status status = CUDADriver::RecordEvent(
+      parent_->cuda_context(), start_event_, stream->cuda_stream());
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
-bool CUDATimer::Stop(CUDAStream *stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), stop_event_,
-                                 stream->cuda_stream())
-      .ok();
+bool CUDATimer::Stop(CUDAStream* stream) {
+  port::Status status = CUDADriver::RecordEvent(
+      parent_->cuda_context(), stop_event_, stream->cuda_stream());
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
 }  // namespace cuda
-- 
GitLab


From 3dbd4518321088e2796e738fec2e253cdc6d3da1 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 13 Apr 2018 11:14:09 -0700
Subject: [PATCH 0773/1262] [TF:XLA] Start a TensorFlow library that contains
 direct wrappers for XLA operators.

Add new XlaReduceWindow and XlaDynamicUpdateSlice operators.
Add new tests for the existing XlaWhile operator.
Add wrappers for XlaSend and XlaRecv.

PiperOrigin-RevId: 192795174
---
 tensorflow/compiler/tests/BUILD               |  43 ++++++
 .../compiler/tests/dynamic_slice_ops_test.py  |  93 ++++++++++++
 .../compiler/tests/reduce_window_test.py      | 102 +++++++++++++
 tensorflow/compiler/tests/while_test.py       | 130 +++++++++++++++++
 tensorflow/compiler/tf2xla/BUILD              |   4 +-
 tensorflow/compiler/tf2xla/cc/BUILD           |  38 +----
 .../tf2xla/functionalize_control_flow_test.cc |   2 +-
 tensorflow/compiler/tf2xla/kernels/BUILD      |   8 +-
 .../tf2xla/kernels/dynamic_slice_ops.cc       |  69 +++++++++
 .../tf2xla/kernels/reduce_window_op.cc        | 135 ++++++++++++++++++
 .../compiler/tf2xla/kernels/sendrecv_ops.cc   |   6 +-
 tensorflow/compiler/tf2xla/ops/BUILD          |  30 ++--
 .../compiler/tf2xla/ops/dynamic_slice_ops.cc  |  49 +++++++
 .../compiler/tf2xla/ops/reduce_window_op.cc   |  45 ++++++
 .../compiler/tf2xla/ops/sendrecv_ops.cc       |  23 +--
 tensorflow/compiler/tf2xla/python/BUILD       |   8 ++
 tensorflow/compiler/tf2xla/python/xla.py      |  80 +++++++++++
 17 files changed, 795 insertions(+), 70 deletions(-)
 create mode 100644 tensorflow/compiler/tests/dynamic_slice_ops_test.py
 create mode 100644 tensorflow/compiler/tests/reduce_window_test.py
 create mode 100644 tensorflow/compiler/tests/while_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
 create mode 100644 tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
 create mode 100644 tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
 create mode 100644 tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
 create mode 100644 tensorflow/compiler/tf2xla/python/xla.py

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 47c6ab58c0..b9e42ca677 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -271,6 +271,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "dynamic_slice_ops_test",
+    size = "small",
+    srcs = ["dynamic_slice_ops_test.py"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
 tf_xla_py_test(
     name = "dynamic_stitch_test",
     size = "small",
@@ -497,6 +509,22 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "reduce_window_test",
+    size = "small",
+    srcs = ["reduce_window_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "reverse_ops_test",
     size = "medium",
@@ -689,6 +717,21 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "while_test",
+    size = "small",
+    srcs = ["while_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "gather_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/dynamic_slice_ops_test.py b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
new file mode 100644
index 0000000000..6a46d2ec3e
--- /dev/null
+++ b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
@@ -0,0 +1,93 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for XLA dynamic slicing ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class DynamicUpdateSliceOpsTest(XLATestCase):
+
+  def _assertOpOutputMatchesExpected(self, op, args, expected):
+    with self.test_session() as session:
+      with self.test_scope():
+        placeholders = [
+            array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
+            for arg in args
+        ]
+        feeds = {placeholders[i]: args[i] for i in range(0, len(args))}
+        output = op(*placeholders)
+      result = session.run(output, feeds)
+      self.assertAllClose(result, expected, rtol=1e-3)
+
+  def testUpdateSlice(self):
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array([], dtype=dtype),
+              np.array([], dtype=dtype),
+              np.array([0], dtype=np.int32)
+          ],
+          expected=np.array([], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
+              np.array([-1, -2, -3], dtype=dtype),
+              np.array([6], dtype=np.int32)
+          ],
+          expected=np.array([1, 2, 3, 4, 5, 6, -1, -2, -3, 10], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.array([[42, 43], [44, 45]], dtype=dtype),
+              np.array([1, 2], dtype=np.int32)
+          ],
+          expected=np.array(
+              [[1, 2, 3, 4], [5, 6, 42, 43], [9, 10, 44, 45]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.array([[], []], dtype=dtype),
+              np.array([1, 2], dtype=np.int32)
+          ],
+          expected=np.array(
+              [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.ones([3, 4], dtype=dtype),
+              np.array([0, 0], dtype=np.int32)
+          ],
+          expected=np.ones([3, 4], dtype=dtype))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/reduce_window_test.py b/tensorflow/compiler/tests/reduce_window_test.py
new file mode 100644
index 0000000000..e78a63465b
--- /dev/null
+++ b/tensorflow/compiler/tests/reduce_window_test.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for xla.reduce_window."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class ReduceWindowTest(XLATestCase):
+  """Test cases for xla.reduce_window."""
+
+  def _reduce_window(self, operand, init, reducer, **kwargs):
+    with self.test_session():
+      placeholder = array_ops.placeholder(operand.dtype)
+      with self.test_scope():
+        output = xla.reduce_window(placeholder, init, reducer, **kwargs)
+      return output.eval(feed_dict={placeholder: operand})
+
+  def testReduceWindow(self):
+
+    # TODO(b/77644762): float16 and float64 ReduceWindow are unimplemented.
+    for dtype in set(self.numeric_types).intersection(
+        set([dtypes.bfloat16.as_numpy_dtype, np.float32])):
+
+      @function.Defun(dtype, dtype)
+      def sum_reducer(x, y):
+        return x + y
+
+      @function.Defun(dtype, dtype)
+      def mul_reducer(x, y):
+        return x * y
+
+      self.assertAllClose(
+          np.array([3, 5, 7, 9, 11, 13], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2]))
+
+      self.assertAllClose(
+          np.array([3, 7, 11], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2],
+              window_strides=[2]))
+
+      self.assertAllClose(
+          np.array([1, 4, 7], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[1],
+              window_strides=[3]))
+
+      self.assertAllClose(
+          np.array([[24, 36, 24], [96, 0, 0]], dtype=dtype),
+          self._reduce_window(
+              np.array([[1, 2, 3, 4], [4, 3, 2, 1], [2, 4, 0, 1]], dtype=dtype),
+              1.0,
+              mul_reducer,
+              window_dimensions=[2, 2],
+              window_strides=[1, 1]))
+
+      self.assertAllClose(
+          np.array([[0, 0, 0], [5, 10, 5], [2, 4, 1], [0, 0, 0]], dtype=dtype),
+          self._reduce_window(
+              np.array([[1, 2, 3, 4], [4, 3, 2, 1], [2, 4, 0, 1]], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2, 2],
+              window_strides=[2, 2],
+              padding=[[2, 3], [1, 2]]))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py
new file mode 100644
index 0000000000..f79eb27435
--- /dev/null
+++ b/tensorflow/compiler/tests/while_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for while loops in XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class WhileTest(XLATestCase):
+
+  def testSingletonLoopHandrolled(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32)
+    def loop_body(step):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      return step_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32)
+    def loop_cond(step):
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index], loop_cond, loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0})
+      self.assertAllClose(result, [10], rtol=1e-3)
+
+  def testCountingLoopHandrolled(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def loop_body(step, rsum):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      sum_out = rsum + constant_op.constant(1.5, dtype=dtypes.float32)
+      return step_out, sum_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def loop_cond(step, rsum):
+      del rsum
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      init_sum = array_ops.placeholder(dtypes.float32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, init_sum], loop_cond,
+                                      loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0, init_sum: 0.0})
+      self.assertAllClose(result, [10, 15.0], rtol=1e-3)
+      no_iters_result = sess.run(loop_outputs, {init_index: 10, init_sum: 0.0})
+      self.assertAllClose(no_iters_result, [10, 0.0], rtol=1e-3)
+
+  def testCountingLoopHandrolledC64(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.complex64)
+    def loop_body(step, rsum):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      sum_out = rsum + constant_op.constant(1.5 + 2j, dtype=dtypes.complex64)
+      return step_out, sum_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.complex64)
+    def loop_cond(step, rsum):
+      del rsum
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      init_sum = array_ops.placeholder(dtypes.complex64, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, init_sum], loop_cond,
+                                      loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0, init_sum: 0.0})
+      self.assertAllClose(result[1], np.complex64(15 + 20j), rtol=1e-3)
+      no_iters_result = sess.run(loop_outputs, {init_index: 10, init_sum: 0.0})
+      self.assertAllClose(no_iters_result[1], np.complex64(0), rtol=1e-3)
+
+  def testLoopWithConstantOutput(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def loop_body(step, x):
+      del x
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      return (step_out, 7)
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def loop_cond(step, x):
+      del x
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, 42], loop_cond, loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0})
+      self.assertAllClose(result, [10, 7], rtol=1e-3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index e7daf4e01c..ba5c3a1484 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -415,7 +415,7 @@ cc_library(
         "//tensorflow/compiler/jit:graph_to_functiondef",
         "//tensorflow/compiler/jit:union_find",
         "//tensorflow/compiler/tf2xla:dump_graph",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:core_cpu",
@@ -437,7 +437,7 @@ tf_cc_test(
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
-        "//tensorflow/compiler/tf2xla/cc:functional_ops",
+        "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index c30bb9cacd..4f8bb8ad74 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -7,44 +7,20 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
 
 tf_gen_op_wrapper_cc(
-    name = "functional_ops_gen",
-    include_internal_ops = 1,
-    out_ops_file = "ops/functional_ops",
-    deps = ["//tensorflow/compiler/tf2xla/ops:functional_ops"],
+    name = "xla_ops_gen",
+    out_ops_file = "ops/xla_ops",
+    deps = ["//tensorflow/compiler/tf2xla/ops:xla_ops"],
 )
 
 cc_library(
-    name = "functional_ops",
-    srcs = ["ops/functional_ops.cc"],
-    hdrs = ["ops/functional_ops.h"],
+    name = "xla_ops",
+    srcs = ["ops/xla_ops.cc"],
+    hdrs = ["ops/xla_ops.h"],
     deps = [
         "//tensorflow/cc:const_op",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_gen_op_wrapper_cc(
-    name = "sendrecv_ops_gen",
-    include_internal_ops = 1,
-    out_ops_file = "ops/sendrecv_ops",
-    deps = ["//tensorflow/compiler/tf2xla/ops:sendrecv_ops"],
-)
-
-cc_library(
-    name = "sendrecv_ops",
-    srcs = ["ops/sendrecv_ops.cc"],
-    hdrs = ["ops/sendrecv_ops.h"],
-    deps = [
-        "//tensorflow/cc:const_op",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index bc7276c3af..e494f42e8e 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/tf2xla/cc/ops/functional_ops.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/test_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 3ba37b0383..579b669699 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -29,6 +29,7 @@ tf_kernel_library(
         "cwise_ops.h",
         "depthtospace_op.cc",
         "diag_op.cc",
+        "dynamic_slice_ops.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
         "extract_image_patches_op.cc",
@@ -56,6 +57,7 @@ tf_kernel_library(
         "pooling_ops.cc",
         "quantize_and_dequantize_op.cc",
         "random_ops.cc",
+        "reduce_window_op.cc",
         "reduction_ops.cc",
         "reduction_ops.h",
         "reduction_ops_common.cc",
@@ -103,7 +105,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/lib:while_loop",
-        "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -146,7 +148,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:framework",
@@ -162,7 +164,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
new file mode 100644
index 0000000000..800ef5ab98
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicUpdateSliceOp : public XlaOpKernel {
+ public:
+  explicit DynamicUpdateSliceOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(3) << "DynamicUpdateSliceOp::Compile";
+
+    DataType index_type = input_type(2);
+    OP_REQUIRES(ctx, index_type == DT_INT32 || index_type == DT_INT64,
+                errors::InvalidArgument("index must be int32 or int64"));
+
+    const TensorShape input_shape = ctx->InputShape(0);
+    const TensorShape update_shape = ctx->InputShape(1);
+    const TensorShape index_shape = ctx->InputShape(2);
+
+    OP_REQUIRES(
+        ctx,
+        TensorShapeUtils::IsVector(index_shape) &&
+            index_shape.num_elements() == input_shape.dims(),
+        errors::InvalidArgument("index must be a vector with length equal to "
+                                "the number of input dimensions"));
+    OP_REQUIRES(
+        ctx, input_shape.dims() == update_shape.dims(),
+        errors::InvalidArgument("input and update must have the same rank,"
+                                " input shape is ",
+                                input_shape.DebugString(), "; update shape is ",
+                                update_shape.DebugString()));
+
+    xla::ComputationDataHandle result = ctx->builder()->DynamicUpdateSlice(
+        ctx->Input(0), ctx->Input(1), ctx->Input(2));
+    ctx->SetOutput(0, result);
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaDynamicUpdateSlice"), DynamicUpdateSliceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
new file mode 100644
index 0000000000..cb144bea9e
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/while_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class ReduceWindowOp : public XlaOpKernel {
+ public:
+  explicit ReduceWindowOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("computation", &computation_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("window_dimensions", &window_dimensions_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("window_strides", &window_strides_));
+    OP_REQUIRES_OK(context, context->GetAttr("padding_low", &padding_low_));
+    OP_REQUIRES_OK(context, context->GetAttr("padding_high", &padding_high_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const DataType dtype = context->input_type(0);
+
+    const int rank = input_shape.dims();
+    OP_REQUIRES(context, rank == window_dimensions_.size(),
+                errors::InvalidArgument(
+                    "The size of window_dimensions must be equal to the input "
+                    "rank (",
+                    window_dimensions_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_strides_.size(),
+                errors::InvalidArgument(
+                    "The size of window_strides must be equal to the input "
+                    "rank (",
+                    window_strides_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_low_.size(),
+                errors::InvalidArgument(
+                    "The size of padding_low must be equal to the input "
+                    "rank (",
+                    padding_low_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_high_.size(),
+                errors::InvalidArgument(
+                    "The size of padding_high must be equal to the input "
+                    "rank (",
+                    padding_high_.size(), " vs. ", rank, ")"));
+
+    xla::ComputationBuilder* builder = context->builder();
+
+    // Build the reducer function.
+    XlaCompiler::Argument reducer_arg;
+    reducer_arg.kind = XlaCompiler::Argument::kParameter;
+    reducer_arg.type = dtype;
+    reducer_arg.shape = TensorShape();
+
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.use_tuple_arg = false;
+    compile_options.resolve_compile_time_constants = false;
+    compile_options.is_entry_computation = false;
+    XlaCompiler::CompilationResult reducer;
+    OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
+                                compile_options, *computation_,
+                                {reducer_arg, reducer_arg}, &reducer));
+
+    xla::Shape scalar_shape;
+    OP_REQUIRES_OK(context,
+                   TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape));
+    OP_REQUIRES(context,
+                xla::ShapeUtil::Compatible(
+                    reducer.xla_output_shape,
+                    xla::ShapeUtil::MakeTupleShape({scalar_shape})),
+                errors::InvalidArgument(
+                    "Invalid output shape of ReduceWindow reducer. Expected ",
+                    xla::ShapeUtil::HumanString(scalar_shape), " got ",
+                    xla::ShapeUtil::HumanString(reducer.xla_output_shape)));
+
+    // Wraps the reducer in a computation that unpacks the output tuple.
+    xla::Computation wrapper;
+    {
+      std::unique_ptr<xla::ComputationBuilder> cb =
+          builder->CreateSubBuilder("wrapper");
+      auto x = cb->Parameter(0, scalar_shape, "x");
+      auto y = cb->Parameter(1, scalar_shape, "y");
+      auto outputs = cb->Call(*reducer.computation, {x, y});
+      cb->GetTupleElement(outputs, 0);
+      xla::StatusOr<xla::Computation> result = cb->Build();
+      OP_REQUIRES_OK(context, result.status());
+      wrapper = std::move(result.ValueOrDie());
+    }
+
+    std::vector<std::pair<int64, int64>> padding(rank);
+    for (int i = 0; i < rank; ++i) {
+      padding[i] = {padding_low_[i], padding_high_[i]};
+    }
+
+    xla::ComputationDataHandle output = builder->ReduceWindowWithGeneralPadding(
+        context->Input(0), context->Input(1), wrapper, window_dimensions_,
+        window_strides_, padding);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  const NameAttrList* computation_;
+  std::vector<int64> window_dimensions_;
+  std::vector<int64> window_strides_;
+  std::vector<int64> padding_low_;
+  std::vector<int64> padding_high_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ReduceWindowOp);
+};
+
+REGISTER_XLA_OP(Name("XlaReduceWindow"), ReduceWindowOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index 5172781c0d..d079b89861 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -48,7 +48,7 @@ void SendOp::Compile(XlaOpKernelContext* ctx) {
   ctx->builder()->Send(ctx->Input(0), channel);
 }
 
-REGISTER_XLA_OP(Name("_XLASend"), SendOp);
+REGISTER_XLA_OP(Name("XlaSend"), SendOp);
 
 class RecvOp : public XlaOpKernel {
  public:
@@ -68,7 +68,7 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
   TensorShape tensor_shape;
   DataType dtype;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &tensor_shape));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype));
   OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, tensor_shape, &shape_));
 }
 
@@ -79,7 +79,7 @@ void RecvOp::Compile(XlaOpKernelContext* ctx) {
   ctx->SetOutput(0, ctx->builder()->Recv(shape_, channel));
 }
 
-REGISTER_XLA_OP(Name("_XLARecv"), RecvOp);
+REGISTER_XLA_OP(Name("XlaRecv"), RecvOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index aeb743a663..bb9168fa35 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -7,17 +7,13 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 cc_library(
-    name = "functional_ops",
-    srcs = ["functional_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework",
+    name = "xla_ops",
+    srcs = [
+        "dynamic_slice_ops.cc",
+        "functional_ops.cc",
+        "reduce_window_op.cc",
+        "sendrecv_ops.cc",
     ],
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "sendrecv_ops",
-    srcs = ["sendrecv_ops.cc"],
     deps = [
         "//tensorflow/core:framework",
     ],
@@ -25,17 +21,9 @@ cc_library(
 )
 
 tf_gen_op_wrapper_py(
-    name = "gen_functional_ops",
-    out = "gen_functional_ops.py",
-    deps = [
-        ":functional_ops",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "gen_sendrecv_ops",
-    out = "gen_sendrecv_ops.py",
+    name = "gen_xla_ops",
+    out = "gen_xla_ops.py",
     deps = [
-        ":sendrecv_ops",
+        ":xla_ops",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
new file mode 100644
index 0000000000..d6c0edbb88
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("XlaDynamicUpdateSlice")
+    .Input("input: T")
+    .Input("update: T")
+    .Input("indices: Tindices")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Wraps the XLA DynamicUpdateSlice operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#dynamicupdateslice
+.
+
+XlaDynamicUpdateSlice generates a result which is the value of the `input`
+operand, with a slice update overwritten at `indices`. The shape of `update`
+determines the shape of the sub-array of the result which is updated. The shape
+of indices must be rank == 1, with dimension size equal to the rank of `input`.
+
+Handling of out-of-bounds slice indices is implementation-defined.
+
+input: A `Tensor` of type T.
+indices: A vector of indices into `input`. Must have length equal to the rank of
+  `input`.
+update: A `Tensor` of type T. Same rank as `input`.
+output: A `Tensor` of type T.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
new file mode 100644
index 0000000000..d9af982adc
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("XlaReduceWindow")
+    .Input("input: T")
+    .Input("init_value: T")
+    .Attr("T: numbertype")
+    .Attr("computation: func")
+    .Attr("window_dimensions: list(int)")
+    .Attr("window_strides: list(int)")
+    .Attr("padding_low: list(int)")
+    .Attr("padding_high: list(int)")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Wraps the XLA ReduceWindow operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
+
+input: the input tensor
+init_value: a scalar representing the initial value for the reduction
+computation: a reducer function to apply
+window_dimensions: the shape of the window
+window_strides: the inter-window strides
+padding_low: the padding to apply at the start of each input dimensions
+padding_high: the padding to apply at the end of each input dimension.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
index 4b41c16a8b..7ec7b50e90 100644
--- a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
@@ -18,22 +18,24 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("_XLASend")
+REGISTER_OP("XlaSend")
     .Input("tensor: T")
     .Attr("T: type")
     .Attr("tensor_name: string")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Sends the named tensor to another XLA computation.
+Sends the named tensor to another XLA computation. Wraps the XLA Send operator
+documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#send .
 
 tensor: The tensor to send.
-tensor_name: The name of the tensor to send.
+tensor_name: A string key that identifies the channel.
 )doc");
 
-REGISTER_OP("_XLARecv")
-    .Output("tensor: T")
-    .Attr("T: type")
+REGISTER_OP("XlaRecv")
+    .Output("tensor: dtype")
+    .Attr("dtype: type")
     .Attr("tensor_name: string")
     .Attr("shape: shape")
     .SetIsStateful()
@@ -46,11 +48,14 @@ REGISTER_OP("_XLARecv")
       return Status::OK();
     })
     .Doc(R"doc(
-Receives the named tensor from another XLA computation.
+Receives the named tensor from another XLA computation. Wraps the XLA Recv
+operator documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#recv .
 
 tensor: The tensor to receive.
-tensor_name: The name of the tensor to receive.
-shape: The shape of the input tensor.
+dtype: The type of the tensor.
+tensor_name: A string key that identifies the channel.
+shape: The shape of the tensor.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index f0a2ef0651..42b6292f79 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -22,3 +22,11 @@ tf_py_clif_cc(
         "//tensorflow/compiler/tf2xla:xla_compiler",
     ],
 )
+
+py_library(
+    name = "xla",
+    srcs = ["xla.py"],
+    deps = [
+        "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
new file mode 100644
index 0000000000..e5ce65bec9
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental library that exposes XLA operations directly in TensorFlow.
+
+It is sometimes useful to be able to build HLO programs directly from
+TensorFlow. This file provides Tensorflow operators that map as closely as
+possible to HLO operators.
+
+There is no promise of backward or forward compatibility for operators defined
+in this module.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tf2xla.ops import gen_xla_ops
+
+# TODO(phawkins): provide wrappers for all XLA operators.
+
+dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
+
+
+def reduce_window(operand,
+                  init,
+                  reducer,
+                  window_dimensions,
+                  window_strides=None,
+                  padding=None,
+                  name=None):
+  """Wraps the XLA ReduceWindow operator.
+
+  ReduceWindow is documented at
+  https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
+
+  Args:
+    operand: the input tensor
+    init: a scalar tensor representing the initial value for the reduction
+    reducer: a reduction function that combines a pair of scalars.
+    window_dimensions: shape of the window, as a list of integers
+    window_strides: inter-window strides, as a list of integers. Optional;
+      if omitted, defaults to strides of 1.
+    padding: padding to apply to 'operand'. List of (low, high) pairs of
+      integers that specify the padding to apply before and after each
+      dimension. Optional; if omitted, defaults to no padding.
+    name: the operator name, or None.
+  Returns:
+    A tensor that represents the output of the reduce_window operator.
+  """
+  window_strides = window_strides or [1] * len(window_dimensions)
+  padding = padding or [(0, 0)] * len(window_dimensions)
+  padding_low = [x for (x, _) in padding]
+  padding_high = [y for (_, y) in padding]
+  return gen_xla_ops.xla_reduce_window(
+      operand,
+      init,
+      reducer,
+      window_dimensions,
+      window_strides,
+      padding_low,
+      padding_high,
+      name=name)
+
+
+recv = gen_xla_ops.xla_recv
+send = gen_xla_ops.xla_send
+
+while_loop = gen_xla_ops.xla_while
-- 
GitLab


From 2d07eb5109ff3987681f6bac07d1b322dab5950b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 11:16:36 -0700
Subject: [PATCH 0774/1262] Fixing output alternatives

PiperOrigin-RevId: 192795596
---
 .../boosted_trees/estimator_batch/estimator_utils.py     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
index c9cf4ae25a..48a7f85ead 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
@@ -58,9 +58,12 @@ def _export_outputs_to_output_alternatives(export_outputs):
   return None
 
 
-def estimator_spec_to_model_fn_ops(estimator_spec):
-  alternatives = _export_outputs_to_output_alternatives(
-      estimator_spec.export_outputs)
+def estimator_spec_to_model_fn_ops(estimator_spec, export_alternatives=False):
+  if export_alternatives:
+    alternatives = _export_outputs_to_output_alternatives(
+        estimator_spec.export_outputs)
+  else:
+    alternatives = []
 
   return model_fn.ModelFnOps(
       mode=_core_mode_to_contrib_mode(estimator_spec.mode),
-- 
GitLab


From 6942b87c255e9bce9289f87ff6894d198fcab6f4 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 13 Apr 2018 11:09:58 -0700
Subject: [PATCH 0775/1262] Use eager compatible wrappers in load_library for
 custom ops

---
 tensorflow/python/BUILD                       |  1 +
 tensorflow/python/framework/load_library.py   |  2 +-
 tensorflow/python/framework/python_op_gen.i   |  8 ++--
 .../tools/ci_build/builds/test_user_ops.sh    | 41 +++++++++++--------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a683c8cfa6..579a8faaad 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3482,6 +3482,7 @@ tf_py_wrap_cc(
         "//tensorflow/core/profiler/internal:print_model_analysis",
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
         "//tensorflow/python/eager:pywrap_tfe_lib",
+        "//tensorflow/python/eager:python_eager_op_gen",
         "//util/python:python_headers",
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 535c6017f5..9a8477debb 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -58,7 +58,7 @@ def load_op_library(library_filename):
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
   op_list.ParseFromString(compat.as_bytes(op_list_str))
-  wrappers = py_tf.GetPythonWrappers(op_list_str)
+  wrappers = py_tf.GetEagerPythonWrappers(op_list_str)
 
   # Delete the library handle to release any memory held in C
   # that are no longer needed.
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index 26ec4e8e66..e39c425b05 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -16,10 +16,10 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-#include "tensorflow/python/framework/python_op_gen.h"
+#include "tensorflow/python/eager/python_eager_op_gen.h"
 %}
 
-// Input typemap for GetPythonWrappers.
+// Input typemap for GetEagerPythonWrappers.
 // Accepts a python object of 'bytes' type, and converts it to
 // a const char* pointer and size_t length. The default typemap
 // going from python bytes to const char* tries to decode the
@@ -37,5 +37,5 @@ limitations under the License.
 
 
 %ignoreall;
-%unignore tensorflow::GetPythonWrappers;
-%include "tensorflow/python/framework/python_op_gen.h"
+%unignore tensorflow::GetEagerPythonWrappers;
+%include "third_party/tensorflow/python/eager/python_eager_op_gen.h"
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index caa3a40817..c342367bac 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//')
 echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\
 "via pip installation"
 
-ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
-
-# Format OUTPUT for analysis
-if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
-  if [[ ${IS_MAC} == "1" ]]; then
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+function run_op() {
+  local ORIG_OUTPUT=$1
+  local ADDITIONAL_LOG=$2
+
+  # Format OUTPUT for analysis
+  if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then
+    if [[ ${IS_MAC} == "1" ]]; then
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g')
+    else
+      local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    fi
   else
-    OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g')
+    local OUTPUT="${ORIG_OUTPUT}"
   fi
-else
-  OUTPUT="${ORIG_OUTPUT}"
-fi
 
-EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
+  local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})")
 
-if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
-  die "FAILED: Output from user op (${OUTPUT}) does not match expected "\
-"output ${EXPECTED_OUTPUT}"
-else
-  echo "Output from user op (${OUTPUT}) matches expected output"
-fi
+  if [[ "${EQUALS_EXPECTED}" != "True" ]]; then
+    local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected "\
+  "output ${EXPECTED_OUTPUT}"${ADDITIONAL_LOG}
+    die ${ERROR}
+  else
+    echo "Output from user op (${OUTPUT}) matches expected output"
+  fi
+}
+
+run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
+run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode"
 
 popd
 
-- 
GitLab


From 889a63b641f3b6204c8a772fb42c3e256166cac9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 13 Apr 2018 11:26:03 -0700
Subject: [PATCH 0776/1262] Add deprecation args decoration for tf.squeeze
 (#18495)

* Add deprecation args decoration with tf.squeeze

This fix adds deprecation args decoration with tf.squeeze,
with deprecates `squeeze_dims` with `axis`.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Enhancement with deprecated_argument_lookup

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_ops.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 9e136937f6..ceeabe090d 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -2578,6 +2578,8 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
 
 
 @tf_export("squeeze")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
   # pylint: disable=redefined-builtin
   """Removes dimensions of size 1 from the shape of a tensor.
@@ -2618,10 +2620,8 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   Raises:
     ValueError: When both `squeeze_dims` and `axis` are specified.
   """
-  if squeeze_dims is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'squeeze_dims' and 'axis'")
-    axis = squeeze_dims
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "squeeze_dims", squeeze_dims)
   if np.isscalar(axis):
     axis = [axis]
   return gen_array_ops.squeeze(input, axis, name)
-- 
GitLab


From 584d072537ff350f21ed973e64ed67a3d0d943e3 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 13 Apr 2018 11:26:39 -0700
Subject: [PATCH 0777/1262] Fix warnings in `nn.sampled_softmax_loss` (#18494)

* Fix warnings in `nn.sampled_softmax_loss`

The softmax_cross_entropy_with_logits has been deprecated
and replaced with softmax_cross_entropy_with_logits_v2.
This causes nn.sampled_softmax_loss to always generate
a WANRING whenever called. This fix replaces
`softmax_cross_entropy_with_logits` with `softmax_cross_entropy_with_logits_v2`
and maintains the existing behavior to fix the warning.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Pylint fix for line too long

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/nn_impl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 47cc4da7f2..1715e5b36a 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -1340,7 +1340,8 @@ def sampled_softmax_loss(weights,
       partition_strategy=partition_strategy,
       name=name,
       seed=seed)
-  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
+  labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
+  sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
   return sampled_losses
-- 
GitLab


From 988ad74476250eee70227349b5f1eabc86d22833 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 13 Apr 2018 11:29:31 -0700
Subject: [PATCH 0778/1262] Not in third_party

---
 tensorflow/python/framework/python_op_gen.i | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index e39c425b05..efcce2f209 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -38,4 +38,4 @@ limitations under the License.
 
 %ignoreall;
 %unignore tensorflow::GetEagerPythonWrappers;
-%include "third_party/tensorflow/python/eager/python_eager_op_gen.h"
+%include "tensorflow/python/eager/python_eager_op_gen.h"
-- 
GitLab


From 692a71da6aad55dcaa597633aaf88de8322ca8ab Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 13 Apr 2018 11:33:07 -0700
Subject: [PATCH 0779/1262] Fix the broken TFLite iOS example. (#18483)

The demo app is only relying on CocoaPod now, but it's incorrectly
configured to use the headers on Github. It crashes the app when
the header is different between Github and CocoaPod.
---
 .../tflite_camera_example.xcodeproj/project.pbxproj       | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index b0236e9c60..98d3b5bb8a 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -326,10 +326,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
@@ -373,10 +369,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
-- 
GitLab


From 0c2ca00e1082ab2692af68af183083e41393f6c4 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 13 Apr 2018 11:38:43 -0700
Subject: [PATCH 0780/1262] Fix crash when invalid dtype was passed (#18481)

* Fix crash when invalid dtype was passed

This fix tries to address the issue raised in 18474
where crash may happen if invalid dtype (e.g., `"[,]"`)
is passed to `tf.constant(tf.string, "[,]")`. The crash
happens during the comparision of `"[,]"` and numpy dtype
candidate (e.g., `np.dtype([("qint8", np.int8, 1)])`:
```
>>> import numpy as np
>>> np.dtype([("qint8", np.int8, 1)]) == "[,]"
Segmentation fault: 11
```

This fix adds a type check to make sure the type of the passed
dtype is either numpy.dtype or type.

This fix fixes 18474.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for invalid type to tf.constant

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/framework/dtypes.py              | 14 ++++++++------
 tensorflow/python/kernel_tests/constant_op_test.py |  5 +++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 5efda44a5f..eda713641d 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -699,11 +699,13 @@ def as_dtype(type_value):
     if type_value.type == np.string_ or type_value.type == np.unicode_:
       return string
 
-  for key, val in _NP_TO_TF:
-    try:
-      if key == type_value:
-        return val
-    except TypeError as e:
-      raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
+  if isinstance(type_value, (type, np.dtype)):
+    for key, val in _NP_TO_TF:
+      try:
+        if key == type_value:
+          return val
+      except TypeError as e:
+        raise TypeError("Cannot convert {} to a dtype. {}".format(
+            type_value, e))
 
   raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 749313b00d..107ee37fab 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -65,6 +65,11 @@ class ConstantTest(test.TestCase):
     self._testCpu(x)
     self._testGpu(x)
 
+  def testInvalidDType(self):
+    # Test case for GitHub issue 18474
+    with self.assertRaises(TypeError):
+      constant_op.constant(dtypes_lib.string, "[,]")
+
   def testBFloat16(self):
     bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
-- 
GitLab


From 6c22bbdda41d839cb9e1f7803533c571596ea4ee Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 13 Apr 2018 11:47:02 -0700
Subject: [PATCH 0781/1262] Fix warnings in tf.distributions.Categorical
 (#18468)

In tf.distributions.Categorical dimension was used with argmax.
As dimension has been deprecated this generates a warning.
This fix fixes the warning by changing to axis.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/distributions/categorical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 9161e3fa9f..995dd9ca2a 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -311,7 +311,7 @@ class Categorical(distribution.Distribution):
         nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
 
   def _mode(self):
-    ret = math_ops.argmax(self.logits, dimension=self._batch_rank)
+    ret = math_ops.argmax(self.logits, axis=self._batch_rank)
     ret = math_ops.cast(ret, self.dtype)
     ret.set_shape(self.batch_shape)
     return ret
-- 
GitLab


From 7e0db0fe4992c466f758338183dfa0636c61a36b Mon Sep 17 00:00:00 2001
From: James Wexler <jwexler@google.com>
Date: Fri, 13 Apr 2018 15:18:17 -0400
Subject: [PATCH 0782/1262] fix build file format

---
 tensorflow/core/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ab25283cc4..46da23f6f9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -256,8 +256,8 @@ proto_library(
 
 closure_proto_library(
     name = "example_protos_closure",
-    deps = [":example_protos"],
     visibility = ["//visibility:public"],
+    deps = [":example_protos"],
 )
 
 exports_files([
-- 
GitLab


From be328931086e212a87bac26ccff021b51863d875 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 12:18:53 -0700
Subject: [PATCH 0783/1262] Expose tf.decode_compressed to the public API.

PiperOrigin-RevId: 192805605
---
 .../core/api_def/python_api/api_def_DecodeCompressed.pbtxt    | 4 ----
 tensorflow/tools/api/golden/tensorflow.pbtxt                  | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)
 delete mode 100644 tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt

diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
deleted file mode 100644
index f0b7539918..0000000000
--- a/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DecodeCompressed"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index be64fd19d8..c66249999f 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -912,6 +912,10 @@ tf_module {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_compressed"
+    argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "decode_csv"
     argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
-- 
GitLab


From cb3cd61be2301202731e1157c3ee957d26f9695e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 12:35:32 -0700
Subject: [PATCH 0784/1262] [XLA] Redesign: add a method that creates fake data
 for XlaComputation.

PiperOrigin-RevId: 192807851
---
 tensorflow/compiler/xla/client/lib/BUILD      |  1 +
 tensorflow/compiler/xla/client/lib/testing.cc | 16 ++++++++++++++++
 tensorflow/compiler/xla/client/lib/testing.h  |  7 +++++++
 3 files changed, 24 insertions(+)

diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index f4673a8204..59c4a53c05 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -46,6 +46,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index b63a1465ea..311dc4bdd7 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -111,4 +111,20 @@ std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
   return fake_arguments;
 }
 
+std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
+    const XlaComputation& computation, Client* client) {
+  CHECK(computation.proto().has_program_shape())
+      << "Computation should have progran shape.";
+  auto program_shape = computation.proto().program_shape();
+
+  // For every (unbound) parameter that the computation wants, we manufacture
+  // some arbitrary data so that we can invoke the computation.
+  std::vector<std::unique_ptr<GlobalData>> fake_arguments;
+  for (const Shape& parameter : program_shape.parameters()) {
+    fake_arguments.push_back(MakeFakeDataOrDie(parameter, client));
+  }
+
+  return fake_arguments;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 7e640d1307..1dc2622972 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -38,6 +39,12 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
     const Computation& computation, Client* client);
 
+// Returns vector of GlobalData handles of fake data (created using
+// MakeFakeDataOrDie) that are correctly shaped arguments for the given
+// xla computation.
+std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
+    const XlaComputation& computation, Client* client);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TESTING_H_
-- 
GitLab


From ec6003aee63a8eabace3c211e15d9587a405c1f0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 12:37:04 -0700
Subject: [PATCH 0785/1262] [XLA] Redesign: add a constructor:
 XlaComputation(HloModuleProto).

PiperOrigin-RevId: 192808038
---
 tensorflow/compiler/xla/client/xla_client/xla_computation.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index 7182908666..085fabd56d 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -30,6 +30,8 @@ namespace xla {
 class XlaComputation {
  public:
   XlaComputation() : unique_id_(-1) {}
+  XlaComputation(const HloModuleProto& proto)
+      : unique_id_(proto.id()), proto_(proto) {}
 
   XlaComputation(const XlaComputation&) = delete;
   XlaComputation& operator=(const XlaComputation&) = delete;
-- 
GitLab


From 76a73f899cdc5e19ef2b99373524dcb4dba0bd2b Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 9 Apr 2018 17:45:13 -0700
Subject: [PATCH 0786/1262] boosted_trees: early stop hooks are fixed to stop
 at the right moment  by reading tensor values in a separate session after
 train_op run. PiperOrigin-RevId: 192217338

---
 .../python/estimator/boosted_trees_test.py    | 97 +++++++------------
 .../python/estimator/canned/boosted_trees.py  | 33 +++----
 .../estimator/canned/boosted_trees_test.py    | 63 +++++-------
 3 files changed, 71 insertions(+), 122 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index e99a87f3b3..eee5910687 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -69,10 +70,18 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateEstimator(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -88,9 +97,10 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferEstimator(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -108,31 +118,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
-
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
-
-
-class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
   def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=True)
@@ -145,36 +137,16 @@ class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
-
-    # Check predict that all labels are correct.
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -187,20 +159,17 @@ class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.2136638)
-
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 500ea03ea7..c5d5455b1a 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -209,8 +209,8 @@ class _CacheTrainingStatesUsingVariables(object):
         name='cache_insert')
 
 
-class StopAtAttemptsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
+class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of attempts."""
 
   def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
                max_trees, max_depth):
@@ -224,25 +224,17 @@ class StopAtAttemptsHook(session_run_hook.SessionRunHook):
         [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
 
   def after_run(self, run_context, run_values):
+    # num_* tensors should be retrieved by a separate session than the training
+    # one, in order to read the values after growing.
+    # So, if it's approaching to the limit, get the actual value by additional
+    # session.
     num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees - 1 or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
+      num_finalized_trees, num_attempted_layers = run_context.session.run(
+          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
     if (num_finalized_trees >= self._max_trees or
-        1.0 * num_attempted_layers / self._max_depth > 2 * self._max_trees):
-      run_context.request_stop()
-
-
-class StopAtNumTreesHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
-
-  def __init__(self, num_trees_tensor, max_trees):
-    self._num_trees_tensor = num_trees_tensor
-    self._max_trees = max_trees
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._num_trees_tensor)
-
-  def after_run(self, run_context, run_values):
-    num_trees = run_values.results
-    if num_trees > self._max_trees:
+        num_attempted_layers > 2 * self._max_trees * self._max_depth):
       run_context.request_stop()
 
 
@@ -468,7 +460,8 @@ def _bt_model_fn(
     # Add an early stop hook.
     estimator_spec = estimator_spec._replace(
         training_hooks=estimator_spec.training_hooks +
-        (StopAtNumTreesHook(num_trees, tree_hparams.n_trees),))
+        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                             tree_hparams.n_trees, tree_hparams.max_depth),))
   return estimator_spec
 
 
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 01e5cc7a5d..625745a3f9 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -69,7 +69,7 @@ def _make_train_input_fn(is_classification):
   return _input_fn
 
 
-class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self._feature_columns = {
@@ -79,10 +79,18 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateBinaryClassifier(self):
     input_fn = _make_train_input_fn(is_classification=True)
@@ -97,7 +105,8 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
 
@@ -118,29 +127,9 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
     # All labels are correct.
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testTrainAndEvaluateRegressor(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -155,9 +144,10 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferRegressor(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -174,16 +164,13 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 class ModelFnTests(test_util.TensorFlowTestCase):
-- 
GitLab


From 3e1739c0c3c6cd3b74879f3e1872dd1354401e56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 11 Apr 2018 15:37:49 -0700
Subject: [PATCH 0787/1262] Revealing the range of node ids in the latest layer
 via resource' state

PiperOrigin-RevId: 192520351
---
 ...tedTreesCalculateBestGainsPerFeature.pbtxt |  4 +-
 ...pi_def_BoostedTreesGetEnsembleStates.pbtxt | 12 +++++-
 .../kernels/boosted_trees/boosted_trees.proto |  4 ++
 .../kernels/boosted_trees/resource_ops.cc     | 12 ++++++
 .../core/kernels/boosted_trees/resources.h    | 20 ++++++++++
 .../core/kernels/boosted_trees/stats_ops.cc   |  6 +--
 .../kernels/boosted_trees/training_ops.cc     |  8 ++++
 tensorflow/core/ops/boosted_trees_ops.cc      |  2 +
 .../core/ops/compat/ops_history.v1.pbtxt      |  4 ++
 .../python/estimator/canned/boosted_trees.py  |  9 ++---
 .../estimator/canned/boosted_trees_test.py    | 12 ++++++
 .../boosted_trees/resource_ops_test.py        | 31 +++++++++-----
 .../boosted_trees/stats_ops_test.py           |  8 ++--
 .../boosted_trees/training_ops_test.py        | 40 +++++++++++++++++--
 tensorflow/python/ops/boosted_trees_ops.py    | 15 ++++---
 15 files changed, 150 insertions(+), 37 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index b1921e3507..62876a293c 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "node_id_range"
     description: <<END
-A Rank 1 tensor (shape=[2]) to specify the range [first, last] of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1]+1)` (Note that the last index node_id_range[1] is inclusive).
+A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
 END
   }
   in_arg {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
index ef45a92498..4377125224 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -31,5 +31,13 @@ END
 The number of layers we attempted to build (but not necessarily succeeded).
 END
   }
-  summary: "Retrieves the tree ensemble resource stamp token."
-}
+  out_arg {
+    name: "last_layer_nodes_range"
+    description: <<END
+Rank size 2 tensor that contains start and end ids of the nodes in the latest
+layer.
+END
+
+  }
+  summary: "Retrieves the tree ensemble resource stamp token, number of trees and growing statistics."
+}
\ No newline at end of file
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 106ceedc00..55599de731 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -100,6 +100,10 @@ message GrowingMetadata {
   // Number of layers that we have attempted to build. After pruning, these
   // layers might have been removed.
   int64 num_layers_attempted = 2;
+  // The start (inclusive) and end (exclusive) ids of the nodes in the latest
+  // layer of the latest tree.
+  int32 last_layer_node_start = 3;
+  int32 last_layer_node_end = 4;
 }
 
 // TreeEnsemble describes an ensemble of decision trees.
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
index f49242d856..563f7b8b08 100644
--- a/tensorflow/core/kernels/boosted_trees/resource_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -99,6 +99,7 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     Tensor* output_num_trees_t = nullptr;
     Tensor* output_num_finalized_trees_t = nullptr;
     Tensor* output_num_attempted_layers_t = nullptr;
+    Tensor* output_last_layer_nodes_range_t = nullptr;
 
     OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
                                                      &output_stamp_token_t));
@@ -110,11 +111,22 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(3, TensorShape(),
                                             &output_num_attempted_layers_t));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                4, {2}, &output_last_layer_nodes_range_t));
 
     output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
     output_num_trees_t->scalar<int32>()() = num_trees;
     output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
     output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+
+    int32 range_start;
+    int32 range_end;
+    tree_ensemble_resource->GetLastLayerNodesRange(&range_start, &range_end);
+
+    output_last_layer_nodes_range_t->vec<int32>()(0) = range_start;
+    // For a completely empty ensemble, this will be 0. To make it a valid range
+    // we add this max cond.
+    output_last_layer_nodes_range_t->vec<int32>()(1) = std::max(1, range_end);
   }
 };
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index c82588b950..561ca3a18a 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -93,6 +93,26 @@ class BoostedTreesEnsembleResource : public StampedResource {
         new_num_layers);
   }
 
+  void UpdateLastLayerNodesRange(const int32 node_range_start,
+                                 int32 node_range_end) const {
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+        node_range_start);
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+        node_range_end);
+  }
+
+  void GetLastLayerNodesRange(int32* node_range_start,
+                              int32* node_range_end) const {
+    *node_range_start =
+        tree_ensemble_->growing_metadata().last_layer_node_start();
+    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+  }
+
+  int64 GetNumNodes(const int32 tree_id) {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->trees(tree_id).nodes_size();
+  }
+
   void UpdateGrowingMetadata() const;
 
   int32 GetNumLayersAttempted() {
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 33fdab6a86..16e65cf284 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -42,8 +42,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     const Tensor* node_id_range_t;
     OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
     const auto node_id_range = node_id_range_t->vec<int32>();
-    int32 node_id_first = node_id_range(0);
-    int32 node_id_last = node_id_range(1);  // inclusive.
+    const int32 node_id_first = node_id_range(0);  // inclusive
+    const int32 node_id_last = node_id_range(1);   // exclusive
     // stats_summary_list
     OpInputList stats_summary_list;
     OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
@@ -86,7 +86,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       std::vector<int32> output_thresholds;
       std::vector<float> output_left_node_contribs;
       std::vector<float> output_right_node_contribs;
-      for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) {
+      for (int node_id = node_id_first; node_id < node_id_last; ++node_id) {
         // Calculate gains.
         cum_grad.clear();
         cum_hess.clear();
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index b9ded4054a..67cac14c52 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -101,6 +101,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
             << current_tree << " of ensemble of " << current_tree + 1
             << " trees.";
     bool split_happened = false;
+    int32 node_id_start = ensemble_resource->GetNumNodes(current_tree);
     // Add the splits to the tree.
     for (auto& split_entry : best_splits) {
       const int32 node_id = split_entry.first;
@@ -139,11 +140,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           right_contrib, &left_node_id, &right_node_id);
       split_happened = true;
     }
+    int32 node_id_end = ensemble_resource->GetNumNodes(current_tree);
     if (split_happened) {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
       if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+        // If the tree is finalized, next growing will start from node 0;
+        node_id_start = 0;
+        node_id_end = 1;
         ensemble_resource->SetIsFinalized(current_tree, true);
         if (pruning_mode_ == kPostPruning) {
           ensemble_resource->PostPruneTree(current_tree);
@@ -153,6 +158,9 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
         }
       }
+      // If we managed to split, update the node range. If we didn't, don't
+      // update as we will try to split the same nodes with new instances.
+      ensemble_resource->UpdateLastLayerNodesRange(node_id_start, node_id_end);
     }
   }
 
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 297e94655f..8af4903418 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -128,6 +128,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
     .Output("num_trees: int32")
     .Output("num_finalized_trees: int32")
     .Output("num_attempted_layers: int32")
+    .Output("last_layer_nodes_range: int32")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused_input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
@@ -135,6 +136,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       c->set_output(3, c->Scalar());
+      c->set_output(4, c->Vector(2));
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 026bfa89cf..2f6f588d2c 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10861,6 +10861,10 @@ op {
     name: "num_attempted_layers"
     type: DT_INT32
   }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
   is_stateful: true
 }
 op {
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index c5d5455b1a..58af59dbb1 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -349,8 +349,8 @@ def _bt_model_fn(
             array_ops.zeros(
                 [batch_size, head.logits_dimension], dtype=dtypes.float32))
       with ops.control_dependencies([ensemble_reload]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = local_tree_ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         last_layer_nodes_range) = local_tree_ensemble.get_states()
         summary.scalar('ensemble/num_trees', num_trees)
         summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
         summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
@@ -393,10 +393,7 @@ def _bt_model_fn(
         (node_ids_per_feature, gains_list, thresholds_list,
          left_node_contribs_list, right_node_contribs_list) = (
              boosted_trees_ops.calculate_best_gains_per_feature(
-                 node_id_range=array_ops.stack([
-                     math_ops.reduce_min(node_ids),
-                     math_ops.reduce_max(node_ids)
-                 ]),
+                 node_id_range=last_layer_nodes_range,
                  stats_summary_list=stats_summary_list,
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 625745a3f9..7823ef8410 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -223,6 +223,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -307,6 +309,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -407,6 +411,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
@@ -444,6 +450,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -528,6 +536,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -628,6 +638,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index a223241e89..d5f0c22d6e 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -36,16 +36,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
       self.assertEqual(0, stamp_token.eval())
-      (_, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (_, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
   def testCreateWithProto(self):
     with self.test_session():
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -141,6 +143,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 6
+          last_layer_node_start: 16
+          last_layer_node_end: 19
         }
       """, ensemble_proto)
       ensemble = boosted_trees_ops.TreeEnsemble(
@@ -148,28 +152,31 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
           stamp_token=7,
           serialized_proto=ensemble_proto.SerializeToString())
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(7, stamp_token.eval())
       self.assertEqual(2, num_trees.eval())
       self.assertEqual(1, num_finalized_trees.eval())
       self.assertEqual(6, num_attempted_layers.eval())
+      self.assertAllEqual([16, 19], nodes_range.eval())
 
   def testSerializeDeserialize(self):
     with self.test_session():
       # Initialize.
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(5, stamp_token.eval())
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -201,6 +208,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 5
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
       """, ensemble_proto)
       with ops.control_dependencies([
@@ -208,13 +217,15 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
               stamp_token=3,
               serialized_proto=ensemble_proto.SerializeToString())
       ]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         nodes_range) = ensemble.get_states()
       self.assertEqual(3, stamp_token.eval())
       self.assertEqual(1, num_trees.eval())
       # This reads from metadata, not really counting the layers.
       self.assertEqual(5, num_attempted_layers.eval())
       self.assertEqual(0, num_finalized_trees.eval())
+      self.assertAllEqual([3, 7], nodes_range.eval())
+
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index a54cc43517..4d09cf94d4 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -29,7 +29,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation without any regularization."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -76,7 +76,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -123,7 +123,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L1."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -173,7 +173,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index 4226ff75c2..d6c0047747 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -132,6 +132,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -314,6 +316,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -461,6 +465,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -615,6 +621,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 5
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -624,7 +632,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test that the metadata is updated even though we can't split."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -655,6 +664,9 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+
         }
       """, tree_ensemble_config)
 
@@ -685,7 +697,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
       # Expect no new splits created, but attempted (global) stats updated. Meta
       # data for this tree should not be updated (we didn't succeed building a
-      # layer.
+      # layer. Node ranges don't change.
       new_stamp, serialized = session.run(tree_ensemble.serialize())
       tree_ensemble = boosted_trees_pb2.TreeEnsemble()
       tree_ensemble.ParseFromString(serialized)
@@ -721,6 +733,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -730,7 +744,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test metadata is updated correctly when no split due to prepruning."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -761,6 +776,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """, tree_ensemble_config)
 
@@ -851,6 +868,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -941,6 +960,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1046,6 +1067,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
        """
       self.assertEqual(new_stamp, 2)
@@ -1179,6 +1202,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 3
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
        """
       self.assertEqual(new_stamp, 3)
@@ -1268,6 +1293,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1307,7 +1334,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       # Expect the ensemble to be empty as post-pruning will prune
       # the entire finalized tree.
       self.assertEqual(new_stamp, 2)
-      self.assertProtoEquals("""
+      self.assertProtoEquals(
+          """
       trees {
         nodes {
           leaf {
@@ -1359,6 +1387,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       growing_metadata {
         num_trees_attempted: 1
         num_layers_attempted: 2
+        last_layer_node_start: 0
+        last_layer_node_end: 1
       }
       """, res_ensemble)
 
@@ -1455,6 +1485,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 174d00987f..2a2bcdd9d6 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -115,7 +115,7 @@ class TreeEnsemble(object):
 
   def get_stamp_token(self):
     """Returns the current stamp token of the resource."""
-    stamp_token, _, _, _ = (
+    stamp_token, _, _, _, _ = (
         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
             self.resource_handle))
     return stamp_token
@@ -124,17 +124,20 @@ class TreeEnsemble(object):
     """Returns states of the tree ensemble.
 
     Returns:
-      stamp_token, num_trees, num_finalized_trees, num_attempted_layers.
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers and
+      range of the nodes in the latest layer.
     """
-    stamp_token, num_trees, num_finalized_trees, num_attempted_layers = (
-        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
-            self.resource_handle))
+    (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+     nodes_range) = (
+         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+             self.resource_handle))
     # Use identity to give names.
     return (array_ops.identity(stamp_token, name='stamp_token'),
             array_ops.identity(num_trees, name='num_trees'),
             array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
             array_ops.identity(
-                num_attempted_layers, name='num_attempted_layers'))
+                num_attempted_layers, name='num_attempted_layers'),
+            array_ops.identity(nodes_range, name='last_layer_nodes_range'))
 
   def serialize(self):
     """Serializes the ensemble into proto and returns the serialized proto.
-- 
GitLab


From 33c737b70d42e05cabc43b4c6e778e988b6d0a9e Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Wed, 11 Apr 2018 16:59:45 -0700
Subject: [PATCH 0788/1262] boosted_trees: make sure ensemble deserialization
 happens for the non-TRAIN modes too.

PiperOrigin-RevId: 192532297
---
 .../python/estimator/canned/boosted_trees.py  | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 58af59dbb1..0ecc8c7089 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -317,27 +317,28 @@ def _bt_model_fn(
                                                    head.logits_dimension)
 
     # Create Ensemble resources.
-    if is_single_machine:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      local_tree_ensemble = tree_ensemble
-      ensemble_reload = control_flow_ops.no_op()
-    else:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      with ops.device(worker_device):
-        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-            name=name + '_local', is_local=True)
-      # TODO(soroush): Do partial updates if this becomes a bottleneck.
-      ensemble_reload = local_tree_ensemble.deserialize(
-          *tree_ensemble.serialize())
-
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
     # Create logits.
     if mode != model_fn.ModeKeys.TRAIN:
       logits = boosted_trees_ops.predict(
-          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
           logits_dimension=head.logits_dimension,
           max_depth=tree_hparams.max_depth)
     else:
+      if is_single_machine:
+        local_tree_ensemble = tree_ensemble
+        ensemble_reload = control_flow_ops.no_op()
+      else:
+        # Have a local copy of ensemble for the distributed setting.
+        with ops.device(worker_device):
+          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+              name=name + '_local', is_local=True)
+        # TODO(soroush): Do partial updates if this becomes a bottleneck.
+        ensemble_reload = local_tree_ensemble.deserialize(
+            *tree_ensemble.serialize())
       if cache:
         cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
       else:
-- 
GitLab


From f6c5cd435df9c64e79ad0f6434b619d4517e740a Mon Sep 17 00:00:00 2001
From: James Wexler <jwexler@google.com>
Date: Fri, 13 Apr 2018 12:44:41 -0700
Subject: [PATCH 0789/1262] Remove closure_js_proto_library rule for tf.example
 protos.

PiperOrigin-RevId: 192809073
---
 tensorflow/core/BUILD | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c461f9ed2f..7ea8a38834 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -70,10 +70,6 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
-load(
-    "@io_bazel_rules_closure//closure:defs.bzl",
-    "closure_js_proto_library",
-)
 load(
     "//tensorflow:tensorflow.bzl",
     "full_path",
@@ -249,15 +245,6 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
-closure_js_proto_library(
-    name = "example_js_protos",
-    srcs = [
-        "example/example.proto",
-        "example/feature.proto",
-    ],
-    visibility = ["//visibility:public"],
-)
-
 exports_files([
     "framework/types.proto",
 ])
-- 
GitLab


From 544ae7128d5684644319d529de35a3f761ba5385 Mon Sep 17 00:00:00 2001
From: Peng Yu <peng.yu@shopify.com>
Date: Fri, 13 Apr 2018 16:07:36 -0400
Subject: [PATCH 0790/1262] Add myself into code ownder for tensor_forest

---
 CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 007a304c3e..b9f0313cc6 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,7 +45,7 @@
 # /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 # /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 # /tensorflow/contrib/stateless/ @girving
-# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
 # /tensorflow/contrib/testing/ @dandelionmane
 # /tensorflow/contrib/timeseries/ @allenlavoie
 # /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
-- 
GitLab


From 460a8b6a5df176412c0d261d91eccdc32e9d39f1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 13:40:28 -0700
Subject: [PATCH 0791/1262] Support scalar mean in resolve_batch_normalization

PiperOrigin-RevId: 192816848
---
 .../resolve_batch_normalization.cc              | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
index fb109eb91b..2b3ee36ad1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -33,7 +33,7 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   const auto* bn_op =
       static_cast<const BatchNormalizationOperator*>(bn_it->get());
 
-  const auto& mean_array = model->GetArray(bn_op->inputs[1]);
+  auto& mean_array = model->GetArray(bn_op->inputs[1]);
   const auto& multiplier_array = model->GetArray(bn_op->inputs[2]);
   const auto& offset_array = model->GetArray(bn_op->inputs[3]);
 
@@ -49,6 +49,13 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   CHECK(multiplier_array.data_type == ArrayDataType::kFloat);
   CHECK(offset_array.data_type == ArrayDataType::kFloat);
 
+  // This graph transformations will need to address constant buffers below,
+  // so we need to exit early if these buffers don't exist (i.e. if the params
+  // haven't yet been resolved as constants).
+  if (!mean_array.buffer || !multiplier_array.buffer || !offset_array.buffer) {
+    return false;
+  }
+
   // Create the new Mul, Add operators
   auto* mul_op = new MulOperator;
   auto* add_op = new AddOperator;
@@ -80,9 +87,15 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   DCHECK_EQ(bn_it->get(), bn_op);
 
   // Create the new param arrays
-  const auto& mean_shape = mean_array.shape();
+  auto& mean_shape = *mean_array.mutable_shape();
   const auto& multiplier_shape = multiplier_array.shape();
   const auto& offset_shape = offset_array.shape();
+  if (mean_shape.dims().empty()) {
+    *mean_shape.mutable_dims() = multiplier_shape.dims();
+    auto& data = mean_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+    CHECK_EQ(data.size(), 1);
+    data.resize(RequiredBufferSizeForShape(mean_shape), data[0]);
+  }
   CHECK(mean_shape.dims() == multiplier_shape.dims());
   CHECK(mean_shape.dims() == offset_shape.dims());
   const auto& param_shape = mean_shape;
-- 
GitLab


From 6a2d781e2c529511442e1818d23334d89b171cf2 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 13 Apr 2018 14:09:58 -0700
Subject: [PATCH 0792/1262] Internal change.

PiperOrigin-RevId: 192821482
---
 tensorflow/workspace.bzl    | 8 ++++----
 third_party/llvm/llvm.BUILD | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 72f446d359..85bd1ea28b 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -462,11 +462,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/15535accd9e1e9d7772202ce51c8428c1994a04b.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/15535accd9e1e9d7772202ce51c8428c1994a04b.tar.gz",
       ],
-      sha256 = "a6d94bd9de23515a1e3792a830421e3885977ea43d03427cdbe68f98cb7e0045",
-      strip_prefix = "llvm-7e78daafdd22f3f17720a103d29d89590534004e",
+      sha256 = "3470c2dde055dc974e859e707aa6cd1d22eadd4f3a1f282e74c3cf1f7dc9510a",
+      strip_prefix = "llvm-15535accd9e1e9d7772202ce51c8428c1994a04b",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 075b46896e..097bbf5d42 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -2053,6 +2053,7 @@ cc_library(
         "include/llvm/Target/*.def",
         "include/llvm/Target/*.inc",
         "include/llvm/CodeGen/*.def",
+        "include/llvm/CodeGen/*.inc",
     ]),
     deps = [
         ":analysis",
-- 
GitLab


From 92f870d1a95cb598c0fec9ff1f5c0cf95fa42eae Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Fri, 13 Apr 2018 14:12:16 -0700
Subject: [PATCH 0793/1262] Extend Keras symbol-feeding to dynamic-length
 tensors and tensors of different dtypes from the target placeholders.

PiperOrigin-RevId: 192821770
---
 .../python/keras/_impl/keras/backend.py       |  2 ++
 .../python/keras/_impl/keras/backend_test.py  |  5 ++++
 .../keras/_impl/keras/engine/training_test.py | 17 +++++++++++
 .../_impl/keras/engine/training_utils.py      | 29 +++++++++----------
 4 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 6647cc5b79..81a4d2f820 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -2833,6 +2833,8 @@ class Function(object):
     # Handle symbolic feed.
     for x, y in zip(feed_symbols, symbol_vals):
       connection = callable_opts.tensor_connection.add()
+      if x.dtype != y.dtype:
+        y = math_ops.cast(y, dtype=x.dtype)
       from_tensor = ops._as_graph_element(y)
       if from_tensor is None:
         from_tensor = y
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index 0193fc6976..de1ed467a2 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -217,6 +217,11 @@ class BackendUtilsTest(test.TestCase):
       outs = f([y4, y2, None])
       self.assertEqual(outs, [5., 2.])
 
+      # Test with a different dtype
+      y5 = keras.backend.constant(10., dtype='float64')
+      outs = f([y5, y2, None])
+      self.assertEqual(outs, [11., 2.])
+
   def test_function_tf_fetches(self):
     # Additional operations can be passed to tf.Session().run() via its
     # `fetches` arguments. In contrast to `updates` argument of
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 08fd26dd18..6699fd5212 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -23,10 +23,12 @@ import unittest
 
 import numpy as np
 
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 try:
@@ -1140,6 +1142,21 @@ class TestTrainingWithDataTensors(test.TestCase):
                 epochs=1, steps_per_epoch=2, verbose=0,
                 validation_data=(inputs, targets), validation_steps=2)
 
+      # Test with dynamic shape
+      inputs = array_ops.placeholder_with_default(
+          np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
+      targets = array_ops.placeholder_with_default(
+          np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
+      self.assertEqual(inputs.shape[0].value, None)
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
   def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
     with self.test_session():
       a = keras.layers.Input(shape=(3,), name='input_a')
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index a3fc8ef2a0..48afe48e6c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -61,22 +61,21 @@ def check_num_samples(ins,
   Raises:
       ValueError: In case of invalid arguments.
   """
-  if steps is not None:
-    num_samples = None
-    if batch_size is not None:
-      raise ValueError(
-          'If ' + steps_name + ' is set, the `batch_size` must be None.')
-  if has_symbolic_tensors(ins) and steps is None:
-    raise ValueError('If your data is in the form of symbolic tensors, '
-                     'you should specify the `' + steps_name + '` argument '
-                     '(instead of the `batch_size` argument).')
-  if ins and hasattr(ins[0], 'shape'):
-    num_samples = int(ins[0].shape[0])
-  elif steps is None:
+  if steps is not None and batch_size is not None:
     raise ValueError(
-        'Either the input data should have '
-        'a defined shape, or ' + steps_name + ' should be specified.')
-  return num_samples
+        'If ' + steps_name + ' is set, the `batch_size` must be None.')
+
+  if not ins or has_symbolic_tensors(ins):
+    if steps is None:
+      raise ValueError('If your data is in the form of symbolic tensors, '
+                       'you should specify the `' + steps_name + '` argument '
+                       '(instead of the `batch_size` argument, '
+                       'because symbolic tensors are expected to produce '
+                       'batches of input data).')
+    return None
+  if hasattr(ins[0], 'shape'):
+    return int(ins[0].shape[0])
+  return None  # Edge case where ins == [static_learning_phase]
 
 
 def standardize_single_array(x):
-- 
GitLab


From 638fd98e844a9ba8857b9b6fa194f555f53c033d Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Fri, 13 Apr 2018 14:13:12 -0700
Subject: [PATCH 0794/1262] Small tag change

PiperOrigin-RevId: 192821895
---
 tensorflow/contrib/lite/kernels/BUILD | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 800e2a9558..ac7c3f071f 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -265,8 +265,7 @@ tf_cc_test(
     size = "small",
     srcs = ["arg_max_test.cc"],
     tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
+        "tflite_not_portable_ios",
     ],
     deps = [
         ":builtin_ops",
-- 
GitLab


From bf724a8ced3710ed2234f25748ed7719e319d78c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 14:17:31 -0700
Subject: [PATCH 0795/1262] [XLA] Redesign: add ~XlaOp() and ~XlaComputation().

PiperOrigin-RevId: 192822559
---
 tensorflow/compiler/xla/client/xla_client/xla_builder.h     | 1 +
 tensorflow/compiler/xla/client/xla_client/xla_computation.h | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index e583b4fe48..1f7c731064 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -53,6 +53,7 @@ class XlaBuilder;
 class XlaOp {
  public:
   XlaOp() : handle_(0), builder_(nullptr) {}
+  ~XlaOp() {}
 
   StatusOr<Shape> GetShape() const;
 
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index 085fabd56d..7ad212aa24 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -33,6 +33,8 @@ class XlaComputation {
   XlaComputation(const HloModuleProto& proto)
       : unique_id_(proto.id()), proto_(proto) {}
 
+  ~XlaComputation() {}
+
   XlaComputation(const XlaComputation&) = delete;
   XlaComputation& operator=(const XlaComputation&) = delete;
 
-- 
GitLab


From 8600d918a63c658b9b79ba96ee821c903ba3ee94 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 13 Apr 2018 14:32:45 -0700
Subject: [PATCH 0796/1262] Allow tf.train.Saver to load object-based
 checkpoints (using names)

This is the second part of the compatibility story. Object-based checkpointing APIs can already read name-based checkpoints, and now the name-based APIs can read object-based checkpoints by looking up the modified keys in the object graph proto.

PiperOrigin-RevId: 192824907
---
 tensorflow/python/training/checkpointable.py  |   5 +
 .../python/training/checkpointable_utils.py   |  14 +-
 .../training/checkpointable_utils_test.py     |   3 -
 tensorflow/python/training/saver.py           |  70 +++++++-
 tensorflow/python/training/saver_test.py      | 150 ++++++++++++++++++
 5 files changed, 227 insertions(+), 15 deletions(-)

diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index 9bf48df22e..0b8473742c 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -26,6 +26,11 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.util import nest
 
+
+# Key where the object graph proto is saved in a TensorBundle
+OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
+
+
 # A key indicating a variable's value in an object's checkpointed Tensors
 # (Checkpointable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index da99d2ec31..2c4677a278 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -54,8 +54,6 @@ _OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
 # attribute in checkpoint names. Used like:
 #   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
 _OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-# Key where the object graph proto is saved in a TensorBundle
-_OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
 
 
 class _CheckpointRestoreCoordinator(object):
@@ -680,10 +678,11 @@ class CheckpointableSaver(object):
         object_graph_tensor = constant_op.constant(
             graph_proto.SerializeToString(), dtype=dtypes.string)
       feed_additions = None
-    assert _OBJECT_GRAPH_PROTO_KEY not in named_variables
-    named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
-        tensor=object_graph_tensor,
-        name=_OBJECT_GRAPH_PROTO_KEY)
+    assert checkpointable_lib.OBJECT_GRAPH_PROTO_KEY not in named_variables
+    named_variables[checkpointable_lib.OBJECT_GRAPH_PROTO_KEY] = (
+        _NoRestoreSaveable(
+            tensor=object_graph_tensor,
+            name=checkpointable_lib.OBJECT_GRAPH_PROTO_KEY))
     if (self._last_save_object_graph != graph_proto
         # When executing eagerly, we need to re-create SaveableObjects each time
         # save() is called so they pick up new Tensors passed to their
@@ -786,7 +785,8 @@ class CheckpointableSaver(object):
       file_prefix_feed_dict = None
     reader = pywrap_tensorflow.NewCheckpointReader(save_path)
     try:
-      object_graph_string = reader.get_tensor(_OBJECT_GRAPH_PROTO_KEY)
+      object_graph_string = reader.get_tensor(
+          checkpointable_lib.OBJECT_GRAPH_PROTO_KEY)
     except errors_impl.NotFoundError:
       # The object graph proto does not exist in this checkpoint. Try again with
       # name-based saving.
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable_utils_test.py
index ddf9820616..29fcdb70b4 100644
--- a/tensorflow/python/training/checkpointable_utils_test.py
+++ b/tensorflow/python/training/checkpointable_utils_test.py
@@ -1268,9 +1268,6 @@ class CheckpointCompatibilityTests(test.TestCase):
       status.initialize_or_restore()
       self._check_sentinels(root)
 
-  # TODO(allenl): Test for the core name-based saver loading object-based
-  # checkpoints once object-based checkpointing is in core.
-
   def testSaveGraphLoadEager(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index e40b8d22ed..79d278cf90 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import collections
 import os.path
 import re
+import sys
 import time
 import uuid
 
@@ -30,8 +31,10 @@ import six
 
 from google.protobuf import text_format
 
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -1340,6 +1343,9 @@ class Saver(object):
       self._check_saver_def()
       self._write_version = self.saver_def.version
     self._save_relative_paths = save_relative_paths
+    # For compatibility with object-based checkpoints, we may build a second
+    # Saver to read the renamed keys.
+    self._object_restore_saver = None
 
   def build(self):
     if context.executing_eagerly():
@@ -1795,11 +1801,65 @@ class Saver(object):
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
     logging.info("Restoring parameters from %s", save_path)
-    if context.executing_eagerly():
-      self._build_eager(save_path, build_save=False, build_restore=True)
-    else:
-      sess.run(self.saver_def.restore_op_name,
-               {self.saver_def.filename_tensor_name: save_path})
+    try:
+      if context.executing_eagerly():
+        self._build_eager(save_path, build_save=False, build_restore=True)
+      else:
+        sess.run(self.saver_def.restore_op_name,
+                 {self.saver_def.filename_tensor_name: save_path})
+    except errors.NotFoundError:
+      exception_type, exception_value, exception_traceback = sys.exc_info()
+      # The checkpoint would not be loaded successfully as is. Try to parse it
+      # as an object-based checkpoint.
+      try:
+        reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+        object_graph_string = reader.get_tensor(
+            checkpointable.OBJECT_GRAPH_PROTO_KEY)
+      except errors.NotFoundError:
+        # This is not an object-based checkpoint, or the checkpoint doesn't
+        # exist. Re-raise the original exception.
+        six.reraise(exception_type, exception_value, exception_traceback)
+      del exception_traceback  # avoid reference cycles
+
+      # This is an object-based checkpoint. We'll print a warning and then do
+      # the restore.
+      logging.warning(
+          # TODO(allenl): Modify instructions for using the object-based saver
+          # once that's in core.
+          "Restoring an object-based checkpoint using a name-based saver. This "
+          "may be somewhat fragile, and will re-build the Saver. Instead, "
+          "consider loading object-based checkpoints using "
+          "tf.contrib.eager.Checkpoint().")
+      self._restore_from_object_based_checkpoint(
+          sess=sess, save_path=save_path,
+          object_graph_string=object_graph_string)
+
+  def _restore_from_object_based_checkpoint(self, sess, save_path,
+                                            object_graph_string):
+    """A compatibility mode for reading object-based checkpoints."""
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+    object_graph_proto.ParseFromString(object_graph_string)
+    names_to_keys = {}
+    for node in object_graph_proto.nodes:
+      for attribute in node.attributes:
+        names_to_keys[attribute.full_name] = attribute.checkpoint_key
+    saveables = self._builder._ValidateAndSliceInputs(self._var_list)  # pylint: disable=protected-access
+    for saveable in saveables:
+      for spec in saveable.specs:
+        if spec.name not in names_to_keys:
+          raise errors.NotFoundError(
+              None, None,
+              message=("Attempting to load an object-based checkpoint using "
+                       "variable names, but could not find %s in the "
+                       "checkpoint.") % spec.name)
+        spec.name = names_to_keys[spec.name]
+    if self._object_restore_saver is None:
+      # Cache the Saver so multiple restore() calls don't pollute the graph when
+      # graph building. This assumes keys are consistent (i.e. this is the same
+      # type of object-based checkpoint we saw previously).
+      self._object_restore_saver = Saver(saveables)
+    self._object_restore_saver.restore(sess=sess, save_path=save_path)
 
   @staticmethod
   def _add_collection_def(meta_graph_def, key, export_scope=None):
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 14dda79979..3867c0d8da 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import functools
 import math
 import os
 import random
@@ -50,6 +51,8 @@ from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -69,10 +72,12 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 from tensorflow.python.training import adam
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
+from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
 
@@ -2948,6 +2953,29 @@ class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
     return self.non_dep_variable.name
 
 
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
 @test_util.with_c_api
 class CheckpointableCompatibilityTests(test.TestCase):
 
@@ -3011,6 +3039,128 @@ class CheckpointableCompatibilityTests(test.TestCase):
         saver.restore(sess, save_path)
         self.assertEqual(1, v.eval_count)
 
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def testVariableNotFoundErrorRaised(self):
+    # Restore does some tricky exception handling to figure out if it should
+    # load an object-based checkpoint. Tests that the exception handling isn't
+    # too broad.
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    a = resource_variable_ops.ResourceVariable(1., name="a")
+    b = resource_variable_ops.ResourceVariable(1., name="b")
+    a_saver = saver_module.Saver([a])
+    b_saver = saver_module.Saver([b])
+    with self.test_session() as sess:
+      sess.run(a.initializer)
+      save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
+      with self.assertRaisesRegexp(
+          errors.NotFoundError, "Key b not found in checkpoint"):
+        b_saver.restore(sess=sess, save_path=save_path)
+
+  def testCheckpointNotFoundErrorRaised(self):
+    # Restore does some tricky exception handling to figure out if it should
+    # load an object-based checkpoint. Tests that the exception handling isn't
+    # too broad.
+    a = resource_variable_ops.ResourceVariable(1., name="a")
+    saver = saver_module.Saver([a])
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(
+          errors.NotFoundError,
+          "Failed to find any matching files for path_which_does_not_exist"):
+        saver.restore(sess=sess, save_path="path_which_does_not_exist")
+
+  def testLoadFromObjectBasedGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+
+      # An incompatible object-based checkpoint to check error messages
+      var = resource_variable_ops.ResourceVariable(1., name="a")
+      self.evaluate(var.initializer)
+      second_saver = checkpointable_utils.CheckpointableSaver(var)
+      second_path = second_saver.save(file_prefix=os.path.join(
+          checkpoint_directory, "second"))
+
+    restore_graph = ops_lib.Graph()
+    with restore_graph.as_default(), self.test_session(
+        graph=restore_graph) as sess:
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      saver = saver_module.Saver()
+      saver.restore(sess=sess, save_path=save_path)
+      self._check_sentinels(root)
+      before_second_restore_ops = restore_graph.get_operations()
+      # Test that multiple restores do not pollute the graph
+      saver.restore(sess=sess, save_path=save_path)
+      self.assertEqual(before_second_restore_ops,
+                       restore_graph.get_operations())
+      with self.assertRaisesRegexp(errors.NotFoundError,
+                                   "could not find a_variable"):
+        saver.restore(sess=sess, save_path=second_path)
+
+  def testLoadFromObjectBasedEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      saver = saver_module.Saver(
+          root.model.variables + root.optimizer.variables())
+      saver.restore(sess=None, save_path=save_path)
+      self._check_sentinels(root)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From aedc409605be54f9c7cb67f7b49bdc123d65a8fb Mon Sep 17 00:00:00 2001
From: Sung Jin Hwang <sjhwang@google.com>
Date: Fri, 13 Apr 2018 14:51:16 -0700
Subject: [PATCH 0797/1262] Added PmfToQuantizedCdf op to contrib/coder in
 TensorFlow.

The added op transforms probability mass functions (PMF) to quantized
cumulative distribution function (CDF), which can be used by range coder ops in
contrib/coder.

The op takes greedy approach to ensure that the post-quantization probability
masses do not sum over the maximum quantized value. The op does not make any
adjustment when the post-quantization probability masses already sum less than
the maximum value.

PiperOrigin-RevId: 192827779
---
 tensorflow/contrib/coder/BUILD                |  34 +++-
 .../contrib/coder/kernels/pmf_to_cdf_op.cc    | 150 ++++++++++++++++++
 .../coder/kernels/pmf_to_cdf_op_test.cc       | 140 ++++++++++++++++
 tensorflow/contrib/coder/ops/coder_ops.cc     |  32 ++++
 4 files changed, 355 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
 create mode 100644 tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc

diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index ce12e38248..9ca4ce8a9c 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -92,6 +92,34 @@ tf_cc_test(
     ],
 )
 
+tf_kernel_library(
+    name = "pmf_to_cdf_op",
+    srcs = ["kernels/pmf_to_cdf_op.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":coder_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "pmf_to_cdf_op_test",
+    size = "small",
+    srcs = ["kernels/pmf_to_cdf_op_test.cc"],
+    deps = [
+        ":pmf_to_cdf_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
 cc_library(
     name = "all_ops",
     deps = [":coder_ops_op_lib"],
@@ -99,12 +127,16 @@ cc_library(
 
 cc_library(
     name = "all_kernels",
-    deps = [":range_coder_ops"],
+    deps = [
+        ":pmf_to_cdf_op",
+        ":range_coder_ops",
+    ],
 )
 
 tf_custom_op_library(
     name = "python/ops/_coder_ops.so",
     srcs = [
+        "kernels/pmf_to_cdf_op.cc",
         "kernels/range_coder.cc",
         "kernels/range_coder.h",
         "kernels/range_coder_ops.cc",
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
new file mode 100644
index 0000000000..c787e8eded
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+using errors::InvalidArgument;
+
+class PmfToCdfOp : public OpKernel {
+ public:
+  explicit PmfToCdfOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("precision", &precision_));
+    OP_REQUIRES(
+        context, 0 < precision_ && precision_ <= 16,
+        InvalidArgument("`precision` must be in [1, 16]: ", precision_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& pmf_tensor = context->input(0);
+
+    TensorShape shape = pmf_tensor.shape();
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(shape),
+                InvalidArgument("`pmf` should be at least 1-D."));
+    OP_REQUIRES(
+        context, shape.dim_size(shape.dims() - 1) > 1,
+        InvalidArgument("`pmf` size should be at least 2 in the last axis."));
+    shape.set_dim(shape.dims() - 1, shape.dim_size(shape.dims() - 1) + 1);
+
+    Tensor* cdf_tensor;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &cdf_tensor));
+
+    auto pmf = pmf_tensor.flat_inner_dims<float, 2>();
+    auto cdf = cdf_tensor->flat_inner_dims<int32, 2>();
+    CHECK_EQ(pmf.dimension(0), cdf.dimension(0));
+    CHECK_EQ(pmf.dimension(1) + 1, cdf.dimension(1));
+
+    const double n = pmf.dimension(1);
+    const int64 cost_per_unit = static_cast<int64>(50.0 * n * std::log2(n));
+    thread::ThreadPool* thread_pool =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    thread_pool->ParallelFor(
+        pmf.dimension(0), cost_per_unit,
+        [this, pmf, &cdf](int64 start, int64 limit) {
+          const gtl::ArraySlice<float>::size_type pmf_size = pmf.dimension(1);
+          for (int64 i = start; i < limit; ++i) {
+            cdf(i, 0) = 0;
+            PerShard({&pmf(i, 0), pmf_size}, {&cdf(i, 1), pmf_size});
+          }
+        });
+  }
+
+ private:
+  struct Item {
+    Item(int32* p, double mass) : pointer(p), mass(mass) {
+      penalty = ComputeNextPenalty();
+    }
+
+    void Decrease() {
+      CHECK_GT(*pointer, 1);
+      --*pointer;
+      penalty = ComputeNextPenalty();
+    }
+
+    friend bool operator<(const Item& lhs, const Item& rhs) {
+      return lhs.penalty < rhs.penalty;
+    }
+
+    double ComputeNextPenalty() {
+      if (*pointer <= 1) {
+        return std::numeric_limits<double>::infinity();
+      }
+      return mass * (std::log2(*pointer) - std::log2(*pointer - 1));
+    }
+
+    int32* pointer;
+    double mass;
+    double penalty;
+  };
+
+  void PerShard(gtl::ArraySlice<float> pmf,
+                gtl::MutableArraySlice<int32> cdf) const {
+    CHECK_EQ(pmf.size(), cdf.size());
+
+    const int32 normalizer = 1 << precision_;
+    std::transform(pmf.begin(), pmf.end(), cdf.begin(),
+                   [normalizer](float mass) {
+                     int32 value = std::rint(mass * normalizer);
+                     // NOTE: Consider checking if mass > 0.
+                     value = std::max(value, 1);
+                     return value;
+                   });
+
+    int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0);
+    if (sum > normalizer) {
+      std::vector<Item> queue;
+      queue.reserve(cdf.size());
+      for (int i = 0; i < cdf.size(); ++i) {
+        queue.emplace_back(&cdf[i], pmf[i]);
+      }
+
+      std::sort(queue.begin(), queue.end());
+      while (sum-- > normalizer) {
+        queue[0].Decrease();
+        // Performs a linear search because this find_if is likely to return
+        // iterator very close to the begin.
+        auto iter =
+            std::find_if(std::next(queue.begin()), queue.end(),
+                         [&queue](const Item& rhs) { return queue[0] < rhs; });
+        std::rotate(queue.begin(), std::next(queue.begin()), iter);
+      }
+    }
+    std::partial_sum(cdf.begin(), cdf.end(), cdf.begin());
+  }
+
+  int precision_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("PmfToQuantizedCdf").Device(DEVICE_CPU),
+                        PmfToCdfOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
new file mode 100644
index 0000000000..c70e38faab
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <limits>
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+class PmfToQuantizedCdfOpTest : public OpsTestBase {
+ protected:
+  void SetupOp(int precision, Tensor* input) {
+    TF_ASSERT_OK(NodeDefBuilder("pmf_to_cdf", "PmfToQuantizedCdf")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("precision", precision)
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+
+    inputs_.clear();
+    inputs_.emplace_back(input);
+  }
+
+  void GenerateData(random::SimplePhilox* rand,
+                    gtl::MutableArraySlice<float> slice) {
+    constexpr float minimum = std::numeric_limits<float>::epsilon();
+    float sum = 0;
+    for (float& value : slice) {
+      value = std::max(rand->RandFloat(), minimum);
+      sum += value;
+    }
+    for (float& value : slice) {
+      value /= sum;
+    }
+  }
+
+  void Verify(int precision, const Tensor& pmf_tensor,
+              const Tensor& cdf_tensor) {
+    ASSERT_EQ(pmf_tensor.dims(), cdf_tensor.dims());
+    const int n = pmf_tensor.dims();
+
+    for (int i = 0; i < n - 1; ++i) {
+      EXPECT_EQ(pmf_tensor.dim_size(i), cdf_tensor.dim_size(i));
+    }
+
+    auto pmf = pmf_tensor.flat_inner_dims<float, 2>();
+    auto cdf = cdf_tensor.flat_inner_dims<int32, 2>();
+    EXPECT_EQ(pmf.dimension(1) + 1, cdf.dimension(1));
+
+    const int normalizer = 1 << precision;
+    for (int i = 0; i < pmf.dimension(0); ++i) {
+      EXPECT_EQ(0, cdf(i, 0));
+
+      TTypes<int32>::UnalignedConstVec cdf_slice(&cdf(i, 0), cdf.dimension(1));
+
+      for (int j = 1; j < cdf_slice.size(); ++j) {
+        const int32 diff = cdf_slice(j) - cdf_slice(j - 1);
+        EXPECT_GT(diff, 0);
+      }
+
+      EXPECT_LE(cdf_slice(cdf_slice.size() - 1), normalizer);
+    }
+  }
+};
+
+TEST_F(PmfToQuantizedCdfOpTest, UnderSum) {
+  Tensor pmf(DT_FLOAT, {1, 10, 1, 32});
+  auto matrix = pmf.flat_inner_dims<float, 2>();
+  const std::size_t n = matrix.dimension(1);
+
+  random::PhiloxRandom gen(random::New64(), random::New64());
+  random::SimplePhilox rand(&gen);
+  for (int64 i = 0; i < matrix.dimension(0); ++i) {
+    GenerateData(&rand, {&matrix(i, 0), n});
+  }
+
+  constexpr int kPrecision = 10;
+  SetupOp(kPrecision, &pmf);
+  TF_ASSERT_OK(RunOpKernel());
+
+  Verify(kPrecision, pmf, *GetOutput(0));
+}
+
+TEST_F(PmfToQuantizedCdfOpTest, OverSum) {
+  Tensor pmf(DT_FLOAT, {10, 1, 1, 100});
+  auto matrix = pmf.flat_inner_dims<float, 2>();
+
+  // Half of each PMF is filled with zeros. The op will round up zeros to ones,
+  // post quantization. These round ups are likely to make the sum over
+  // normalizer value.
+  matrix.setZero();
+  const std::size_t n = matrix.dimension(1) / 2;
+
+  random::PhiloxRandom gen;
+  random::SimplePhilox rand(&gen);
+  for (int64 i = 0; i < matrix.dimension(0); ++i) {
+    GenerateData(&rand, {&matrix(i, 0), n});
+  }
+
+  constexpr int kPrecision = 7;
+  SetupOp(kPrecision, &pmf);
+  TF_ASSERT_OK(RunOpKernel());
+
+  Verify(kPrecision, pmf, *GetOutput(0));
+}
+
+TEST_F(PmfToQuantizedCdfOpTest, ShapeFn) {
+  ShapeInferenceTestOp op("PmfToQuantizedCdf");
+
+  INFER_OK(op, "?", "?");
+  INFER_OK(op, "[3]", "[4]");
+  INFER_OK(op, "[3,4]", "[d0_0,5]");
+  INFER_OK(op, "[3,4,5]", "[d0_0,d0_1,6]");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc
index 9056d1a696..9bb171298f 100644
--- a/tensorflow/contrib/coder/ops/coder_ops.cc
+++ b/tensorflow/contrib/coder/ops/coder_ops.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
@@ -115,5 +116,36 @@ decoded: An int32 tensor with shape equal to `shape`.
 precision: The number of bits for probability quantization. Must be <= 16, and
   must match the precision used by RangeEncode that produced `encoded`.
 )doc");
+
+REGISTER_OP("PmfToQuantizedCdf")
+    .Input("pmf: float")
+    .Output("cdf: int32")
+    .Attr("precision: int >= 1")
+    .SetShapeFn([] (InferenceContext* c) {
+      ShapeHandle in;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &in));
+      DimensionHandle last;
+      TF_RETURN_IF_ERROR(c->Add(c->Dim(in, -1), 1, &last));
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->ReplaceDim(in, -1, last, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Converts PMF to quantized CDF. This op uses floating-point operations
+internally. Therefore the quantized output may not be consistent across multiple
+platforms. For entropy encoders and decoders to have the same quantized CDF on
+different platforms, the quantized CDF should be produced once and saved, then
+the saved quantized CDF should be used everywhere.
+
+After quantization, if PMF sums to less than or equal to 2^precision, then this
+is equivalent to cumsum over the last dimension. This op makes no effort to make
+the sum close to 2^precision when the sum is already <= 2^precision.
+
+After quantization, if PMF sums to greater than 2^precision, then some values of
+PMF is decreased to keep the sum no more than 2^precision.
+
+Note that the input PMF is pre-quantization.
+)doc");
 // clang-format on
 }  // namespace tensorflow
-- 
GitLab


From fa6150d369ea40b795a17221e6f5a0bf054a8cc8 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 13 Apr 2018 15:01:07 -0700
Subject: [PATCH 0798/1262] Adding py_test for TF-TRT integration

---
 tensorflow/contrib/tensorrt/BUILD             |   9 +
 .../contrib/tensorrt/test/test_integration.py | 178 ++++++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 tensorflow/contrib/tensorrt/test/test_integration.py

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index fd3582e175..d116114db0 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -272,3 +272,12 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+py_test(
+  name = "tf_trt_integration_test",
+  srcs = ["test/test_integration.py"],
+  srcs_version = "PY2AND3",
+  deps = [
+    ":init_py"
+  ]
+)
\ No newline at end of file
diff --git a/tensorflow/contrib/tensorrt/test/test_integration.py b/tensorflow/contrib/tensorrt/test/test_integration.py
new file mode 100644
index 0000000000..8ad26c3f69
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/test_integration.py
@@ -0,0 +1,178 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import tensorrt as trt
+from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.python.client import session as csess
+from tensorflow.python.framework import test_util
+from tensorflow.python.framework import constant_op as cop
+from tensorflow.python.framework import dtypes as dtypes
+from tensorflow.python.framework import importer as importer
+from tensorflow.python.framework import ops as ops
+from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import nn as nn
+from tensorflow.python.ops import nn_ops as nn_ops
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+
+
+@test_util.with_c_api
+class IntegrationTest(test_util.TensofFlowTestCase):
+
+  def setUp(self):
+    """ Setup method """
+    super(IntegrationTest, self).setUp()
+    warnings.simplefilter('always')
+    inp_dims = (100, 24, 24, 2)
+    self._input = np.random.random_sample(inp_dims)
+    self._original_graph = get_simple_graph_def()
+    self._gpu_options = cpb2.GPUOptions(
+        per_process_gpu_memory_fraction=0.50)
+    self._config = cpb2.ConfigProto(gpu_options=gpu_options)
+    self._reference = self.run_graph(self._original_graph, self._input)
+
+  def get_simple_graph_def(self):
+    """Create a simple graph and return its graph_def."""
+    g = ops.Graph()
+    with g.as_default():
+      a = aops.placeholder(
+          dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+      e = cop.constant(
+          [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+          name="weights",
+          dtype=dtypes.float32)
+      conv = nn.conv2d(
+          input=a,
+          filter=e,
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          name="conv")
+      b = cop.constant(
+          [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+      t = nn.bias_add(conv, b, name="biasAdd")
+      relu = nn.relu(t, "relu")
+      idty = aops.identity(relu, "ID")
+      v = nn_ops.max_pool(
+          idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+      aops.squeeze(v, name="output")
+    return g.as_graph_def()
+
+  def run_graph(self, gdef, dumm_inp):
+    """Run given graphdef once."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+    with self.test_session(
+        grap=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  # Use real data that is representative of the inference dataset
+  # for calibration. For this test script it is random data.
+  def run_calibration(self, gdef, dumm_inp):
+    """Run given calibration graph multiple times."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+      # run over real calibration data here, we are mimicking a calibration set of
+      # 30 different batches. Use as much calibration data as you want
+    with self.test_session(
+        grap=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      for _ in range(30):
+        val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  def get_trt_graph(self, mode):
+    """  return trt converted graph """
+    if mode == "FP32":
+      return trt.create_inference_graph(
+          input_graph_def=self._orig_graph,
+          outputs=["output"],
+          max_batch_size=inp_dims[0],
+          max_workspace_size_bytes=1 << 25,
+          precision_mode=
+          "FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
+          minimum_segment_size=2  # minimum number of nodes in an engine
+          )
+    elif mode == "FP16":
+      return trt.create_inference_graph(
+          input_graph_def=self._orig_graph,
+          outputs=["output"],
+          max_batch_size=inp_dims[0],
+          max_workspace_size_bytes=1 << 25,
+          precision_mode=
+          "FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
+          minimum_segment_size=2  # minimum number of nodes in an engine
+          )
+    elif mode == "INT8":
+      return trt.create_inference_graph(
+          input_graph_def=self._orig_graph,
+          outputs=["output"],
+          max_batch_size=inp_dims[0],
+          max_workspace_size_bytes=1 << 25,
+          precision_mode=
+          "INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
+          minimum_segment_size=2  # minimum number of nodes in an engine
+          )
+
+    return None
+
+  def testFP32(self):
+    """ Test FP32 conversion. Results should be identical to native case """
+    trt_graph = self.get_trt_graph("FP32")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+
+  def testFP16(self):
+    """ Test FP16 conversion. Results may be different from native case """
+    trt_graph = self.get_trt_graph("FP16")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+
+  def testINT8(self):
+    """ Test INT8 conversion. Results may be different from native case """
+    calib_graph = self.get_trt_graph("INT8")
+    result = self.run_calibration(calib_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
+    result = self.run_graph(int8_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    result = self.run_graph(int8_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+
+
+if __name__ == '__main__':
+  googletest.main()
-- 
GitLab


From 1298c3240aa9f36b79ea7f0e772edfff87381771 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Fri, 13 Apr 2018 15:15:44 -0700
Subject: [PATCH 0799/1262] [TF] Enable half precision XLA compiler tests for
 the gpu backend.

Modify some tests to allow larger error for half precision.

Enable half precision SpaceToBatchNDTest for the cpu backend.

PiperOrigin-RevId: 192831909
---
 tensorflow/compiler/tests/build_defs.bzl      |  2 +-
 tensorflow/compiler/tests/ftrl_test.py        | 14 +++++++----
 tensorflow/compiler/tests/image_ops_test.py   |  3 ++-
 .../compiler/tests/spacetobatch_op_test.py    | 23 +++++++++++++------
 tensorflow/python/framework/test_util.py      |  4 +++-
 5 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 45b6a6eb86..7b114d4f85 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -56,7 +56,7 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     elif backend == "gpu":
       backend_args += [
           "--test_device=XLA_GPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
       ]
       backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index f9db4cf201..8e6407dffd 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -134,9 +134,15 @@ class FtrlOptimizerTest(XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.60260963, -4.29698515]), var0.eval(), float_rtol=1e-5)
+            np.array([-2.60260963, -4.29698515]),
+            var0.eval(),
+            float_rtol=1e-5,
+            half_rtol=1e-2)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28432083, -0.56694895]), var1.eval(), float_rtol=1e-5)
+            np.array([-0.28432083, -0.56694895]),
+            var1.eval(),
+            float_rtol=1e-5,
+            half_rtol=1e-2)
 
   def testFtrlwithoutRegularization2(self):
     for dtype in self.float_types:
@@ -272,8 +278,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4)
-    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4, half_rtol=1e-2)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4, half_rtol=1e-2)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 3bc41b7cfd..12791ef8ac 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -65,7 +65,8 @@ class RGBToHSVTest(XLATestCase):
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
       self.assertAllClose(batch2, join2)
-      self.assertAllCloseAccordingToType(batch2, inp, bfloat16_atol=0.03)
+      self.assertAllCloseAccordingToType(
+          batch2, inp, bfloat16_atol=0.03, half_rtol=0.02)
 
   def testRGBToHSVRoundTrip(self):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index ef47187477..f37c34156f 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -163,17 +163,26 @@ class SpaceToBatchNDTest(XLATestCase):
         # error.
         if dtype == dtypes.bfloat16.as_numpy_dtype:
           continue
-        # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
-        if dtype == np.float16 and self.device == "XLA_CPU":
-          continue
+        if dtype == np.float16:
+          actual_inputs = np.array(inputs).astype(dtype)
+          actual_paddings = np.array(paddings).astype(dtype)
+          expected_outputs = np.array(outputs).astype(dtype)
+        else:
+          actual_inputs = inputs
+          actual_paddings = paddings
+          expected_outputs = outputs
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
-        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
-        self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs)
+        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape,
+                                           actual_paddings)
+        self.assertAllEqual(
+            sess.run(x_tf, {placeholder: actual_inputs}), expected_outputs)
         # inputs = batch_to_space(outputs)
         placeholder = array_ops.placeholder(dtype)
-        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape, paddings)
-        self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs)
+        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape,
+                                           actual_paddings)
+        self.assertAllEqual(
+            sess.run(x_tf, {placeholder: expected_outputs}), actual_inputs)
 
   def _testDirect(self, input_shape, block_shape, paddings):
     inputs = np.arange(np.prod(input_shape), dtype=np.float32)
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index eea27d76c6..70e70abc06 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1380,7 +1380,9 @@ class TensorFlowTestCase(googletest.TestCase):
                      " %s" % (a.shape, b.shape, msg))
     same = (a == b)
 
-    if a.dtype == np.float32 or a.dtype == np.float64:
+    if (a.dtype in [
+        np.float16, np.float32, np.float64, dtypes.bfloat16.as_numpy_dtype
+    ]):
       same = np.logical_or(same, np.logical_and(np.isnan(a), np.isnan(b)))
     if not np.all(same):
       # Prints more details than np.testing.assert_array_equal.
-- 
GitLab


From 9fb54c30efdcf38ef83c2709a8619a5bf20f2434 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 13 Apr 2018 15:18:48 -0700
Subject: [PATCH 0800/1262] Fix testing

---
 .../contrib/tensorrt/test/test_integration.py | 41 ++++++++++---------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_integration.py b/tensorflow/contrib/tensorrt/test/test_integration.py
index 8ad26c3f69..97915c2659 100644
--- a/tensorflow/contrib/tensorrt/test/test_integration.py
+++ b/tensorflow/contrib/tensorrt/test/test_integration.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+import warnings
 
 from tensorflow.contrib import tensorrt as trt
 from tensorflow.core.protobuf import config_pb2 as cpb2
@@ -36,7 +37,7 @@ from tensorflow.python.platform import test
 
 
 @test_util.with_c_api
-class IntegrationTest(test_util.TensofFlowTestCase):
+class IntegrationTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     """ Setup method """
@@ -44,10 +45,10 @@ class IntegrationTest(test_util.TensofFlowTestCase):
     warnings.simplefilter('always')
     inp_dims = (100, 24, 24, 2)
     self._input = np.random.random_sample(inp_dims)
-    self._original_graph = get_simple_graph_def()
+    self._original_graph = self.get_simple_graph_def()
     self._gpu_options = cpb2.GPUOptions(
         per_process_gpu_memory_fraction=0.50)
-    self._config = cpb2.ConfigProto(gpu_options=gpu_options)
+    self._config = cpb2.ConfigProto(gpu_options=self._gpu_options)
     self._reference = self.run_graph(self._original_graph, self._input)
 
   def get_simple_graph_def(self):
@@ -86,7 +87,7 @@ class IntegrationTest(test_util.TensofFlowTestCase):
       inp = inp.outputs[0]
       out = out.outputs[0]
     with self.test_session(
-        grap=g, config=self._config, use_gpu=True,
+        graph=g, config=self._config, use_gpu=True,
         force_gpu=True) as sess:
       val = sess.run(out, {inp: dumm_inp})
     return val
@@ -105,7 +106,7 @@ class IntegrationTest(test_util.TensofFlowTestCase):
       # run over real calibration data here, we are mimicking a calibration set of
       # 30 different batches. Use as much calibration data as you want
     with self.test_session(
-        grap=g, config=self._config, use_gpu=True,
+        graph=g, config=self._config, use_gpu=True,
         force_gpu=True) as sess:
       for _ in range(30):
         val = sess.run(out, {inp: dumm_inp})
@@ -115,9 +116,9 @@ class IntegrationTest(test_util.TensofFlowTestCase):
     """  return trt converted graph """
     if mode == "FP32":
       return trt.create_inference_graph(
-          input_graph_def=self._orig_graph,
+          input_graph_def=self._original_graph,
           outputs=["output"],
-          max_batch_size=inp_dims[0],
+          max_batch_size=self._input.shape[0],
           max_workspace_size_bytes=1 << 25,
           precision_mode=
           "FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
@@ -125,9 +126,9 @@ class IntegrationTest(test_util.TensofFlowTestCase):
           )
     elif mode == "FP16":
       return trt.create_inference_graph(
-          input_graph_def=self._orig_graph,
+          input_graph_def=self._original_graph,
           outputs=["output"],
-          max_batch_size=inp_dims[0],
+          max_batch_size=self._input.shape[0],
           max_workspace_size_bytes=1 << 25,
           precision_mode=
           "FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
@@ -135,9 +136,9 @@ class IntegrationTest(test_util.TensofFlowTestCase):
           )
     elif mode == "INT8":
       return trt.create_inference_graph(
-          input_graph_def=self._orig_graph,
+          input_graph_def=self._original_graph,
           outputs=["output"],
-          max_batch_size=inp_dims[0],
+          max_batch_size=self._input.shape[0],
           max_workspace_size_bytes=1 << 25,
           precision_mode=
           "INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
@@ -151,27 +152,27 @@ class IntegrationTest(test_util.TensofFlowTestCase):
     trt_graph = self.get_trt_graph("FP32")
     result = self.run_graph(trt_graph, self._input)
     self.assertAllEqual(self._reference, result)
-    result = self.run_graph(trt_graph, self._input)
-    self.assertAllEqual(self._reference, result)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
 
   def testFP16(self):
     """ Test FP16 conversion. Results may be different from native case """
     trt_graph = self.get_trt_graph("FP16")
     result = self.run_graph(trt_graph, self._input)
-    self.assertAllEqual(self._reference, result)
-    result = self.run_graph(trt_graph, self._input)
-    self.assertAllEqual(self._reference, result)
+    self.assertAllClose(self._reference, result,rtol=1.e-03)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
 
   def testINT8(self):
     """ Test INT8 conversion. Results may be different from native case """
     calib_graph = self.get_trt_graph("INT8")
     result = self.run_calibration(calib_graph, self._input)
     self.assertAllEqual(self._reference, result)
-    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
-    result = self.run_graph(int8_graph, self._input)
-    self.assertAllEqual(self._reference, result)
+    int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
     result = self.run_graph(int8_graph, self._input)
-    self.assertAllEqual(self._reference, result)
+    self.assertAllClose(self._reference, result,rtol=1.e-03)
+    result1 = self.run_graph(int8_graph, self._input)
+    self.assertAllEqual(result1, result)
 
 
 if __name__ == '__main__':
-- 
GitLab


From a77dcb5e56dbbbcc3383cb0b39cd79dd88135635 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 15:23:08 -0700
Subject: [PATCH 0801/1262] Add broadcasting to all LinearOperators.

This will broadcast in cases where batch shapes are not equal (but tries to determine statically if this is the case). The broadcasting is not as efficient as doing the broadcast in C++, but makes for the API to at least be completely broadcastable.

PiperOrigin-RevId: 192832919
---
 tensorflow/contrib/linalg/BUILD               |  2 +-
 .../linear_operator_block_diag_test.py        | 67 +---------------
 .../python/ops/linalg/linear_operator.py      |  5 +-
 .../ops/linalg/linear_operator_full_matrix.py |  4 +-
 .../linalg/linear_operator_low_rank_update.py | 25 +++---
 .../linear_operator_lower_triangular.py       |  5 +-
 .../ops/linalg/linear_operator_test_util.py   | 76 ++++++++++++++-----
 7 files changed, 82 insertions(+), 102 deletions(-)

diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index a7812f74d1..8b7ff75ba5 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -58,6 +58,6 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 4,
+    shard_count = 5,
     tags = ["noasan"],
 )
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
index cc1a047d6a..e7407ede11 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
@@ -76,6 +76,8 @@ class SquareLinearOperatorBlockDiagTest(
         build_info((1, 1)),
         build_info((1, 3, 3)),
         build_info((5, 5), blocks=[(2, 2), (3, 3)]),
+        build_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
+        build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
     ]
 
   def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
@@ -184,70 +186,5 @@ class SquareLinearOperatorBlockDiagTest(
       block_diag.LinearOperatorBlockDiag([])
 
 
-# This test is for blocks with different batch dimensions.
-# LinearOperatorFullMatrix doesn't broadcast matmul/solve.
-class SquareDiagLinearOperatorBlockDiagTest(
-    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
-  """Most tests done in the base class LinearOperatorDerivedClassTest."""
-
-  def setUp(self):
-    # Increase from 1e-6 to 1e-4
-    self._atol[dtypes.float32] = 1e-4
-    self._atol[dtypes.complex64] = 1e-4
-    self._rtol[dtypes.float32] = 1e-4
-    self._rtol[dtypes.complex64] = 1e-4
-
-  @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
-    return [
-        build_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
-        build_info((2, 1, 6, 6), blocks=[(2, 1, 2, 2), (1, 1, 4, 4)]),
-    ]
-
-  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
-    shape = list(build_info.shape)
-    expected_blocks = (
-        build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
-        else [shape])
-    diag_matrices = [
-        linear_operator_test_util.random_uniform(
-            shape=block_shape[:-1], minval=1., maxval=20., dtype=dtype)
-        for block_shape in expected_blocks
-    ]
-
-    if use_placeholder:
-      diag_matrices_ph = [
-          array_ops.placeholder(dtype=dtype) for _ in expected_blocks
-      ]
-      diag_matrices = self.evaluate(diag_matrices)
-      # Evaluate here because (i) you cannot feed a tensor, and (ii)
-      # values are random and we want the same value used for both mat and
-      # feed_dict.
-      operator = block_diag.LinearOperatorBlockDiag(
-          [linalg.LinearOperatorDiag(m_ph) for m_ph in diag_matrices_ph])
-      feed_dict = {m_ph: m for (m_ph, m) in zip(
-          diag_matrices_ph, diag_matrices)}
-    else:
-      operator = block_diag.LinearOperatorBlockDiag(
-          [linalg.LinearOperatorDiag(m) for m in diag_matrices])
-      feed_dict = None
-      # Should be auto-set.
-      self.assertTrue(operator.is_square)
-
-    # Broadcast the shapes.
-    expected_shape = list(build_info.shape)
-
-    matrices = linear_operator_util.broadcast_matrix_batch_dims(
-        [array_ops.matrix_diag(diag_block) for diag_block in diag_matrices])
-
-    block_diag_dense = _block_diag_dense(expected_shape, matrices)
-    if not use_placeholder:
-      block_diag_dense.set_shape(
-          expected_shape[:-2] + [expected_shape[-1], expected_shape[-1]])
-
-    return operator, block_diag_dense, feed_dict
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 193c787baa..8cfe964b1c 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -699,9 +699,10 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
     if self._can_use_cholesky():
-      return linalg_ops.cholesky_solve(
+      return linear_operator_util.cholesky_solve_with_broadcast(
           linalg_ops.cholesky(self.to_dense()), rhs)
-    return linalg_ops.matrix_solve(self.to_dense(), rhs, adjoint=adjoint)
+    return linear_operator_util.matrix_solve_with_broadcast(
+        self.to_dense(), rhs, adjoint=adjoint)
 
   def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"):
     """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`.
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index 5ba3b090ae..746da8df1c 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorFullMatrix"]
@@ -176,7 +176,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
     return array_ops.shape(self._matrix)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return math_ops.matmul(
+    return linear_operator_util.matmul_with_broadcast(
         self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index be91102909..08e5896e10 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_diag
 from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -365,14 +366,17 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     leading_term = l.matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
     if adjoint:
-      uh_x = math_ops.matmul(u, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      uh_x = linear_operator_util.matmul_with_broadcast(
+          u, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_uh_x = d.matmul(uh_x, adjoint=adjoint)
-      v_d_uh_x = math_ops.matmul(v, d_uh_x)
+      v_d_uh_x = linear_operator_util.matmul_with_broadcast(
+          v, d_uh_x)
       return leading_term + v_d_uh_x
     else:
-      vh_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      vh_x = linear_operator_util.matmul_with_broadcast(
+          v, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_vh_x = d.matmul(vh_x, adjoint=adjoint)
-      u_d_vh_x = math_ops.matmul(u, d_vh_x)
+      u_d_vh_x = linear_operator_util.matmul_with_broadcast(u, d_vh_x)
       return leading_term + u_d_vh_x
 
   def _determinant(self):
@@ -431,16 +435,18 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     # L^{-1} rhs
     linv_rhs = l.solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
     # V^H L^{-1} rhs
-    vh_linv_rhs = math_ops.matmul(v, linv_rhs, adjoint_a=True)
+    vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
+        v, linv_rhs, adjoint_a=True)
     # C^{-1} V^H L^{-1} rhs
     if self._use_cholesky:
-      capinv_vh_linv_rhs = linalg_ops.cholesky_solve(
+      capinv_vh_linv_rhs = linear_operator_util.cholesky_solve_with_broadcast(
           self._chol_capacitance, vh_linv_rhs)
     else:
-      capinv_vh_linv_rhs = linalg_ops.matrix_solve(
+      capinv_vh_linv_rhs = linear_operator_util.matrix_solve_with_broadcast(
           self._capacitance, vh_linv_rhs, adjoint=adjoint)
     # U C^{-1} V^H M^{-1} rhs
-    u_capinv_vh_linv_rhs = math_ops.matmul(u, capinv_vh_linv_rhs)
+    u_capinv_vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
+        u, capinv_vh_linv_rhs)
     # L^{-1} U C^{-1} V^H L^{-1} rhs
     linv_u_capinv_vh_linv_rhs = l.solve(u_capinv_vh_linv_rhs, adjoint=adjoint)
 
@@ -454,7 +460,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     # L^{-1} U
     linv_u = self.base_operator.solve(self.u)
     # V^H L^{-1} U
-    vh_linv_u = math_ops.matmul(self.v, linv_u, adjoint_a=True)
+    vh_linv_u = linear_operator_util.matmul_with_broadcast(
+        self.v, linv_u, adjoint_a=True)
 
     # D^{-1} + V^H L^{-1} V
     capacitance = self._diag_inv_operator.add_to_tensor(vh_linv_u)
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index c4d386ccb4..fb1eb2fedb 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
@@ -194,7 +193,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         message="Singular operator:  Diagonal contained zero values.")
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return math_ops.matmul(
+    return linear_operator_util.matmul_with_broadcast(
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
@@ -206,7 +205,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
-    return linalg_ops.matrix_triangular_solve(
+    return linear_operator_util.matrix_triangular_solve_with_broadcast(
         self._tril, rhs, lower=True, adjoint=adjoint)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index ce1a112ad5..9c8abb9740 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
 
 
@@ -126,13 +127,16 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("Not implemented yet.")
 
   @abc.abstractmethod
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     """Make a rhs appropriate for calling operator.solve(rhs).
 
     Args:
       operator:  A `LinearOperator`
       adjoint:  Python `bool`.  If `True`, we are making a 'rhs' value for the
         adjoint operator.
+      with_batch: Python `bool`. If `True`, create `rhs` with the same batch
+        shape as operator, and otherwise create a matrix without any batch
+        shape.
 
     Returns:
       A `Tensor`
@@ -140,13 +144,15 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("_make_rhs is not defined.")
 
   @abc.abstractmethod
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     """Make an 'x' appropriate for calling operator.matmul(x).
 
     Args:
       operator:  A `LinearOperator`
       adjoint:  Python `bool`.  If `True`, we are making an 'x' value for the
         adjoint operator.
+      with_batch: Python `bool`. If `True`, create `x` with the same batch shape
+        as operator, and otherwise create a matrix without any batch shape.
 
     Returns:
       A `Tensor`
@@ -224,8 +230,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 [op_log_abs_det, mat_log_abs_det], feed_dict=feed_dict)
             self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
 
-  def test_matmul(self):
-    self._skip_if_tests_to_skip_contains("matmul")
+  def _test_matmul(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
@@ -235,7 +240,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
                     build_info, dtype, use_placeholder=use_placeholder)
-                x = self._make_x(operator, adjoint=adjoint)
+                x = self._make_x(
+                    operator, adjoint=adjoint, with_batch=with_batch)
                 # If adjoint_arg, compute A X^H^H = A X.
                 if adjoint_arg:
                   op_matmul = operator.matmul(
@@ -244,7 +250,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                       adjoint_arg=adjoint_arg)
                 else:
                   op_matmul = operator.matmul(x, adjoint=adjoint)
-                mat_matmul = math_ops.matmul(mat, x, adjoint_a=adjoint)
+                mat_matmul = linear_operator_util.matmul_with_broadcast(
+                    mat, x, adjoint_a=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(op_matmul.get_shape(),
                                       mat_matmul.get_shape())
@@ -252,8 +259,15 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_matmul, mat_matmul], feed_dict=feed_dict)
                 self.assertAC(op_matmul_v, mat_matmul_v)
 
-  def test_solve(self):
-    self._skip_if_tests_to_skip_contains("solve")
+  def test_matmul(self):
+    self._skip_if_tests_to_skip_contains("matmul")
+    self._test_matmul(with_batch=True)
+
+  def test_matmul_with_broadcast(self):
+    self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
+    self._test_matmul(with_batch=False)
+
+  def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
@@ -263,7 +277,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
                     build_info, dtype, use_placeholder=use_placeholder)
-                rhs = self._make_rhs(operator, adjoint=adjoint)
+                rhs = self._make_rhs(
+                    operator, adjoint=adjoint, with_batch=with_batch)
                 # If adjoint_arg, solve A X = (rhs^H)^H = rhs.
                 if adjoint_arg:
                   op_solve = operator.solve(
@@ -273,7 +288,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 else:
                   op_solve = operator.solve(
                       rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
-                mat_solve = linalg_ops.matrix_solve(mat, rhs, adjoint=adjoint)
+                mat_solve = linear_operator_util.matrix_solve_with_broadcast(
+                    mat, rhs, adjoint=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(op_solve.get_shape(),
                                       mat_solve.get_shape())
@@ -281,6 +297,14 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_solve, mat_solve], feed_dict=feed_dict)
                 self.assertAC(op_solve_v, mat_solve_v)
 
+  def test_solve(self):
+    self._skip_if_tests_to_skip_contains("solve")
+    self._test_solve(with_batch=True)
+
+  def test_solve_with_broadcast(self):
+    self._skip_if_tests_to_skip_contains("solve_with_broadcast")
+    self._test_solve(with_batch=False)
+
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
@@ -358,13 +382,13 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         build_info((3, 4, 4)),
         build_info((2, 1, 4, 4))]
 
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     # This operator is square, so rhs and x will have same shape.
     # adjoint value makes no difference because the operator shape doesn't
     # change since it is square, but be pedantic.
-    return self._make_x(operator, adjoint=not adjoint)
+    return self._make_x(operator, adjoint=not adjoint, with_batch=with_batch)
 
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     # Value of adjoint makes no difference because the operator is square.
     # Return the number of systems to solve, R, equal to 1 or 2.
     r = self._get_num_systems(operator)
@@ -373,11 +397,17 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
     if operator.shape.is_fully_defined():
       batch_shape = operator.batch_shape.as_list()
       n = operator.domain_dimension.value
-      x_shape = batch_shape + [n, r]
+      if with_batch:
+        x_shape = batch_shape + [n, r]
+      else:
+        x_shape = [n, r]
     else:
       batch_shape = operator.batch_shape_tensor()
       n = operator.domain_dimension_tensor()
-      x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      if with_batch:
+        x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      else:
+        x_shape = [n, r]
 
     return random_normal(x_shape, dtype=operator.dtype)
 
@@ -404,7 +434,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "det", "log_abs_det"]
+    return ["solve", "solve_with_broadcast", "det", "log_abs_det"]
 
   @property
   def _operator_build_infos(self):
@@ -417,12 +447,12 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         build_info((3, 3, 4)),
         build_info((2, 1, 2, 4))]
 
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     # TODO(langmore) Add once we're testing solve_ls.
     raise NotImplementedError(
         "_make_rhs not implemented because we don't test solve")
 
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     # Return the number of systems for the argument 'x' for .matmul(x)
     r = self._get_num_systems(operator)
     # If operator.shape = [B1,...,Bb, M, N] this returns a random matrix of
@@ -433,14 +463,20 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         n = operator.range_dimension.value
       else:
         n = operator.domain_dimension.value
-      x_shape = batch_shape + [n, r]
+      if with_batch:
+        x_shape = batch_shape + [n, r]
+      else:
+        x_shape = [n, r]
     else:
       batch_shape = operator.batch_shape_tensor()
       if adjoint:
         n = operator.range_dimension_tensor()
       else:
         n = operator.domain_dimension_tensor()
-      x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      if with_batch:
+        x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      else:
+        x_shape = [n, r]
 
     return random_normal(x_shape, dtype=operator.dtype)
 
-- 
GitLab


From a22344f82ddd1e877f0b9f82584b9bb1d6c8dc16 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 15:32:11 -0700
Subject: [PATCH 0802/1262] [XLA] Pattern matcher for HLO, Shapes, Layouts

PiperOrigin-RevId: 192834129
---
 tensorflow/compiler/xla/service/BUILD         |   23 +
 .../compiler/xla/service/pattern_matcher.h    | 1014 +++++++++++++++++
 .../xla/service/pattern_matcher_test.cc       |  144 +++
 tensorflow/compiler/xla/shape_util.cc         |   12 +
 tensorflow/compiler/xla/shape_util.h          |    3 +
 5 files changed, 1196 insertions(+)
 create mode 100644 tensorflow/compiler/xla/service/pattern_matcher.h
 create mode 100644 tensorflow/compiler/xla/service/pattern_matcher_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 65203fa2a0..ddc099807d 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -302,6 +302,29 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "pattern_matcher",
+    hdrs = ["pattern_matcher.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "pattern_matcher_test",
+    srcs = ["pattern_matcher_test.cc"],
+    deps = [
+        ":hlo",
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_reachability",
     srcs = ["hlo_reachability.cc"],
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
new file mode 100644
index 0000000000..5d49638077
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -0,0 +1,1014 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+
+// A pattern matcher for HloInstructions, Shapes, and Layouts.
+//
+// The Match function's first argument must be HloInstruction*, Shape*, or
+// Layout*. The second argument is a pattern that will be matched against the
+// first argument, as described below.
+//
+// Patterns are constructed using the match::Op, match::Shape, or match::Layout
+// functions. By default, the returned patterns will match any HloInstruction,
+// Shape, or Layout, respectively. However the match can be made more specific
+// by using the pattern's modifier methods, for example:
+//
+//   match::Op().WithOpcode(HloOpcode::kAdd).WithOperand(
+//     0, match::Op().WithOpcode(HloOpcode::kConstant))
+//
+// This pattern will match Add instructions whose first operand is a constant.
+//
+// Each pattern type has the following modifiers:
+//
+//   Op():
+//     - WithName: match operations with the given name
+//     - WithOpcode: match operations with the given opcode
+//     - WithShape: match operations whose shape matches the given pattern
+//     - WithOperand: match operations whose operand matches the given pattern
+//
+//   Shape():
+//     - EqualTo: matches shapes that are equal to the argument
+//     - CompatibleTo: matches shapes that are compatible to the argument
+//     - IsScalar/IsArray/IsTuple: matches scalar/array/tuple shapes
+//     - IsDenseArray/IsSparseArray: matches arrays with dense/sparse format
+//     - WithLayout: match shapes whose layout matches the given pattern
+//     - WithLayoutEqualTo: matches shapes whose layouts equal the argument
+//     - WithSubshape: matches tuple shapes whose subshape matches the given
+//       pattern
+//     - WithSubshapeEqualTo: matches shapes with a subshape equal the argument
+//     - WithElementType: matches array/scalar shapes with the given element
+//       type
+//     - WithRank: matches array/scalar types with the given rank
+//
+//  Layout():
+//     - EqualTo: matches layouts that are equal to the argument
+//     - WithDenseFormat/WithSparseFormat: matches layouts with dense/sparse
+//       format
+//
+// Op(), Shape(), and Layout() may be passed an argument of type
+// HloInstruction**, Shape**, or Layout**, respectively, or const versions of
+// these pointers. If the pattern is matched, the address of the matched value
+// will be "captured" and stored at this location.
+//
+// For example:
+//   HloInstruction* foo = ...;
+//   HloInstruction* matched_operand;
+//   CHECK(Match(foo,
+//               match::Op().WithOperand(0, match::Op(&matched_operand))));
+//
+// Helpers are provided for common nullary, unary, binary, and ternary
+// instructions. These helpers can be called with no arguments, in which case
+// they will match any instruction matching the opcode. They may also be called
+// with matches for the operands and with an optional capture. (The capture must
+// be the first argument.) Some examples of these helpers and their equivalents
+// are provided below.
+//
+// Example nullary instruction:
+//   Recv()                            == Op().WithOpcode(HloOpcode::kRecv)
+//   Recv(&a)                          == Op(&a).WithOpcode(HloOpcode::kRecv)
+//
+// Example unary instruction:
+//   Abs()                             == Op().WithOpcode(HloOpcode::kAbs)
+//   Abs(Op(&a))                       == Op().WithOpcode(HloOpcode::kAbs)
+//                                            .WithOperand(0, Op(&a)))
+//   Abs(&a, Op(&b))                   == Op(&a).WithOpcode(HloOpcode::kAbs)
+//                                              .WithOperand(0, Op(&b))
+//
+// Example binary instruction:
+//   Add()                             == Op().WithOpcode(HloOpcode::kAdd)
+//   Add(Op(&a), Op(&b))               == Op().WithOpcode(HloOpcode::kAdd)
+//                                            .WithOperand(0, Op(&a))
+//                                            .WithOperand(1, Op(&b))
+//   Add(&a, Op(&b), Op(&c))           == Op(&a).WithOpcode(HloOpcode::kAdd)
+//                                              .WithOperand(0, Op(&b))
+//                                              .WithOperand(1, Op(&c))
+//
+// Example ternary instruction:
+//   Clamp()                           == Op().WithOpcode(HloOpcode::kClamp)
+//   Clamp(Op(&a), Op(&b), Op(&c))     == Op().WithOpcode(HloOpcode::kClamp)
+//                                            .WithOperand(0, Op(&a))
+//                                            .WithOperand(1, Op(&b))
+//                                            .WithOperand(2, Op(&c))
+//   Clamp(&a, Op(&b), Op(&c), Op(&d)) == Op(&a).WithOpcode(HloOpcode::kClamp)
+//                                              .WithOperand(0, Op(&b))
+//                                              .WithOperand(1, Op(&c))
+//                                              .WithOperand(2, Op(&d))
+//
+template <typename Value, typename Pattern>
+bool Match(Value* value, const Pattern& pattern) {
+  return pattern.Match(value);
+}
+
+namespace match {
+
+namespace detail {
+
+template <typename LayoutType, typename Impl>
+class LayoutPattern;
+
+// The base LayoutPattern implementation. Matches only if the layout is not
+// nullptr.
+class LayoutPatternBaseImpl {
+ public:
+  bool Match(const ::xla::Layout* layout) const { return layout != nullptr; }
+};
+
+// A LayoutPattern implementation that matches only if the layout equals a
+// Layout proto.
+template <typename Previous>
+class LayoutPatternEqualImpl {
+ public:
+  explicit constexpr LayoutPatternEqualImpl(const Previous& previous,
+                                            const ::xla::Layout* layout)
+      : previous_(previous), layout_(layout) {}
+
+  bool Match(const ::xla::Layout* layout) const {
+    return previous_.Match(layout) && LayoutUtil::Equal(*layout_, *layout);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Layout* layout_;
+};
+
+// A LayoutPattern implementation that matches only if the layout has a given
+// format.
+template <typename Previous>
+class LayoutPatternFormatImpl {
+ public:
+  explicit constexpr LayoutPatternFormatImpl(const Previous& previous,
+                                             Format format)
+      : previous_(previous), format_(format) {}
+
+  bool Match(const ::xla::Layout* layout) const {
+    return previous_.Match(layout) && layout->format() == format_;
+  }
+
+ private:
+  Previous previous_;
+  Format format_;
+};
+
+// A pattern that matches Layouts.
+template <typename LayoutType, typename Impl>
+class LayoutPattern {
+ public:
+  explicit constexpr LayoutPattern(const Impl& impl,
+                                   LayoutType** matched_layout)
+      : impl_(impl), matched_layout_(matched_layout) {}
+
+  // Returns true and captures the layout iff it matches the pattern.
+  bool Match(const ::xla::Layout* layout) const {
+    if (impl_.Match(layout)) {
+      if (matched_layout_) {
+        *matched_layout_ = layout;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the layout iff it matches the pattern.
+  bool Match(::xla::Layout* layout) const {
+    if (impl_.Match(layout)) {
+      if (matched_layout_) {
+        *matched_layout_ = layout;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the layout equals the given proto.
+  // The layout must outlive the returned pattern.
+  constexpr LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>> EqualTo(
+      const Layout* layout) const {
+    return LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>>(
+        LayoutPatternEqualImpl<Impl>(impl_, layout), matched_layout_);
+  }
+
+  // Modifies the pattern to match only if the layout has a dense format.
+  constexpr LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>
+  WithDenseFormat() const {
+    return LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>(
+        LayoutPatternFormatImpl<Impl>(impl_, DENSE), matched_layout_);
+  }
+
+  // Modifies the pattern to match only if the layout has a sparse format.
+  constexpr LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>
+  WithSparseFormat() const {
+    return LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>(
+        LayoutPatternFormatImpl<Impl>(impl_, SPARSE), matched_layout_);
+  }
+
+ private:
+  Impl impl_;
+  LayoutType** matched_layout_;
+};
+
+}  // namespace detail
+
+// Creates a layout pattern that will capture the matched layout in the
+// argument.
+inline constexpr detail::LayoutPattern<const ::xla::Layout,
+                                       detail::LayoutPatternBaseImpl>
+Layout(const ::xla::Layout** matched_layout = nullptr) {
+  return detail::LayoutPattern<const ::xla::Layout,
+                               detail::LayoutPatternBaseImpl>(
+      detail::LayoutPatternBaseImpl(), matched_layout);
+}
+
+// Creates a layout pattern that will capture the matched layout in the
+// argument.
+inline constexpr detail::LayoutPattern<::xla::Layout,
+                                       detail::LayoutPatternBaseImpl>
+Layout(::xla::Layout** matched_layout) {
+  return detail::LayoutPattern<::xla::Layout, detail::LayoutPatternBaseImpl>(
+      detail::LayoutPatternBaseImpl(), matched_layout);
+}
+
+namespace detail {
+
+template <typename ShapeType, typename Impl>
+class ShapePattern;
+
+// The base ShapePattern implementation. Matches only if the shape is not
+// nullptr.
+class ShapePatternBaseImpl {
+ public:
+  bool Match(const ::xla::Shape* shape) const { return shape != nullptr; }
+};
+
+// A ShapePattern implementation that matches only if the shape equals a Shape
+// proto.
+template <typename Previous>
+class ShapePatternEqualImpl {
+ public:
+  explicit constexpr ShapePatternEqualImpl(const Previous& previous,
+                                           const ::xla::Shape* shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Equal(*shape_, *shape);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Shape* shape_;
+};
+
+// A ShapePattern implementation that matches only if the shape is compatible to
+// a Shape proto.
+template <typename Previous>
+class ShapePatternCompatibleImpl {
+ public:
+  explicit constexpr ShapePatternCompatibleImpl(const Previous& previous,
+                                                const ::xla::Shape* shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Compatible(*shape_, *shape);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Shape* shape_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a given
+// element type.
+template <typename Previous>
+class ShapePatternElementTypeImpl {
+ public:
+  explicit constexpr ShapePatternElementTypeImpl(const Previous& previous,
+                                                 PrimitiveType element_type)
+      : previous_(previous), element_type_(element_type) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && shape->element_type() == element_type_;
+  }
+
+ private:
+  Previous previous_;
+  PrimitiveType element_type_;
+};
+
+// A ShapePattern implementation that matches only if the shape is scalar.
+template <typename Previous>
+class ShapePatternIsScalarImpl {
+ public:
+  explicit constexpr ShapePatternIsScalarImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsScalar(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape is an array
+template <typename Previous>
+class ShapePatternIsArrayImpl {
+ public:
+  explicit constexpr ShapePatternIsArrayImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsArray(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape is a tuple.
+template <typename Previous>
+class ShapePatternIsTupleImpl {
+ public:
+  explicit constexpr ShapePatternIsTupleImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsTuple(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a given
+// rank.
+template <typename Previous>
+class ShapePatternRankImpl {
+ public:
+  explicit constexpr ShapePatternRankImpl(const Previous& previous, int64 rank)
+      : previous_(previous), rank_(rank) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Rank(*shape) == rank_;
+  }
+
+ private:
+  Previous previous_;
+  int64 rank_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a layout
+// that matches a given pattern.
+template <typename Previous, typename LayoutType, typename LayoutImpl>
+class ShapePatternLayoutImpl {
+ public:
+  explicit constexpr ShapePatternLayoutImpl(
+      const Previous& previous,
+      const LayoutPattern<LayoutType, LayoutImpl>& layout)
+      : previous_(previous), layout_(layout) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && LayoutUtil::HasLayout(*shape) &&
+           layout_.Match(&shape->layout());
+  }
+
+  bool Match(Shape* shape) const {
+    return previous_.Match(shape) && LayoutUtil::HasLayout(*shape) &&
+           layout_.Match(shape->mutable_layout());
+  }
+
+ private:
+  Previous previous_;
+  LayoutPattern<LayoutType, LayoutImpl> layout_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a subshape
+// that matches a given pattern.
+template <typename Previous, typename SubshapeType, typename SubshapeImpl>
+class ShapePatternSubshapeImpl {
+ public:
+  explicit ShapePatternSubshapeImpl(
+      const Previous& previous, ShapeIndexView index,
+      const ShapePattern<SubshapeType, SubshapeImpl>& subshape)
+      : previous_(previous), index_(index), subshape_(subshape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IndexIsValid(*shape, index_) &&
+           subshape_.Match(&ShapeUtil::GetSubshape(*shape, index_));
+  }
+
+  bool Match(::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IndexIsValid(*shape, index_) &&
+           subshape_.Match(ShapeUtil::GetMutableSubshape(shape, index_));
+  }
+
+ private:
+  Previous previous_;
+  ShapeIndexView index_;
+  ShapePattern<SubshapeType, SubshapeImpl> subshape_;
+};
+
+// A pattern that matches Shapes.
+template <typename ShapeType, typename Impl>
+class ShapePattern {
+ public:
+  explicit constexpr ShapePattern(const Impl& impl, ShapeType** matched_shape)
+      : impl_(impl), matched_shape_(matched_shape) {}
+
+  // Returns true and captures the shape iff it matches the pattern.
+  bool Match(const ::xla::Shape* shape) const {
+    if (impl_.Match(shape)) {
+      if (matched_shape_) {
+        *matched_shape_ = shape;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the shape iff it matches the pattern.
+  bool Match(::xla::Shape* shape) const {
+    if (impl_.Match(shape)) {
+      if (matched_shape_) {
+        *matched_shape_ = shape;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the shape equals the given proto.
+  // The layout must outlive the returned pattern.
+  constexpr ShapePattern<ShapeType, ShapePatternEqualImpl<Impl>> EqualTo(
+      const ::xla::Shape* shape) const {
+    return ShapePattern<ShapeType, ShapePatternEqualImpl<Impl>>(
+        ShapePatternEqualImpl<Impl>(impl_, shape), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is compatible to the given
+  // proto. The layout must outlive the returned pattern.
+  constexpr ShapePattern<ShapeType, ShapePatternCompatibleImpl<Impl>>
+  CompatibleTo(const ::xla::Shape* shape) const {
+    return ShapePattern<ShapeType, ShapePatternCompatibleImpl<Impl>>(
+        ShapePatternCompatibleImpl<Impl>(impl_, shape), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has the given element type.
+  constexpr ShapePattern<ShapeType, ShapePatternElementTypeImpl<Impl>>
+  WithElementType(PrimitiveType element_type) const {
+    return ShapePattern<ShapeType, ShapePatternElementTypeImpl<Impl>>(
+        ShapePatternElementTypeImpl<Impl>(impl_, element_type), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is scalar.
+  constexpr ShapePattern<ShapeType, ShapePatternIsScalarImpl<Impl>> IsScalar()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsScalarImpl<Impl>>(
+        ShapePatternIsScalarImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is an array.
+  constexpr ShapePattern<ShapeType, ShapePatternIsArrayImpl<Impl>> IsArray()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsArrayImpl<Impl>>(
+        ShapePatternIsArrayImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is a tuple.
+  constexpr ShapePattern<ShapeType, ShapePatternIsTupleImpl<Impl>> IsTuple()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsTupleImpl<Impl>>(
+        ShapePatternIsTupleImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has the given rank.
+  constexpr ShapePattern<ShapeType, ShapePatternRankImpl<Impl>> WithRank(
+      int64 rank) const {
+    return ShapePattern<ShapeType, ShapePatternRankImpl<Impl>>(
+        ShapePatternRankImpl<Impl>(impl_, rank), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has a layout that matches
+  // the given pattern.
+  template <typename LayoutType, typename LayoutImpl>
+  constexpr ShapePattern<ShapeType,
+                         ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>>
+  WithLayout(const LayoutPattern<LayoutType, LayoutImpl>& layout) const {
+    return ShapePattern<ShapeType,
+                        ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>>(
+        ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>(impl_, layout),
+        matched_shape_);
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternEqualImpl<LayoutPatternBaseImpl>>>
+  WithLayoutEqualTo(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().EqualTo(layout));
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
+  IsDenseArray(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().WithDenseFormat());
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
+  IsSparseArray(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().WithSparseFormat());
+  }
+
+  // Modifies the pattern to match only if the shape has a subshape that matches
+  // the given pattern.
+  template <typename SubshapeType, typename SubshapeImpl>
+  ShapePattern<ShapeType,
+               ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>>
+  WithSubshape(ShapeIndexView index,
+               const ShapePattern<SubshapeType, SubshapeImpl>& subshape) const {
+    return ShapePattern<
+        ShapeType, ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>>(
+        ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>(impl_, index,
+                                                                   subshape),
+        matched_shape_);
+  }
+
+  ShapePattern<ShapeType, ShapePatternSubshapeImpl<
+                              Impl, const ::xla::Shape,
+                              ShapePatternEqualImpl<ShapePatternBaseImpl>>>
+  WithSubshapeEqualTo(ShapeIndexView index, const ::xla::Shape* shape) const {
+    return WithSubshape(index,
+                        ShapePattern<const ::xla::Shape, ShapePatternBaseImpl>(
+                            ShapePatternBaseImpl(), nullptr)
+                            .EqualTo(shape));
+  }
+
+  ShapePattern<ShapeType, ShapePatternSubshapeImpl<
+                              Impl, const ::xla::Shape,
+                              ShapePatternCompatibleImpl<ShapePatternBaseImpl>>>
+  WithSubshapeCompatibleTo(ShapeIndexView index,
+                           const ::xla::Shape* shape) const {
+    return WithSubshape(index,
+                        ShapePattern<const ::xla::Shape, ShapePatternBaseImpl>(
+                            ShapePatternBaseImpl(), nullptr)
+                            .CompatibleTo(shape));
+  }
+
+ private:
+  Impl impl_;
+  ShapeType** matched_shape_;
+};
+
+}  // namespace detail
+
+// Creates a shape pattern that will capture the matched layout in the argument.
+inline constexpr detail::ShapePattern<const ::xla::Shape,
+                                      detail::ShapePatternBaseImpl>
+Shape(const ::xla::Shape** matched_shape = nullptr) {
+  return detail::ShapePattern<const ::xla::Shape, detail::ShapePatternBaseImpl>(
+      detail::ShapePatternBaseImpl(), matched_shape);
+}
+
+// Creates a shape pattern that will capture the matched layout in the argument.
+inline constexpr detail::ShapePattern<::xla::Shape,
+                                      detail::ShapePatternBaseImpl>
+Shape(::xla::Shape** matched_shape) {
+  return detail::ShapePattern<::xla::Shape, detail::ShapePatternBaseImpl>(
+      detail::ShapePatternBaseImpl(), matched_shape);
+}
+
+namespace detail {
+
+template <typename HloInstructionType, typename Impl>
+class HloInstructionPattern;
+
+// The base HloInstructionPattern implementation. Matches only if the
+// instruction is not nullptr.
+class HloInstructionPatternBaseImpl {
+ public:
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return inst != nullptr;
+  }
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a given name.
+template <typename Previous>
+class HloInstructionPatternNameImpl {
+ public:
+  explicit HloInstructionPatternNameImpl(const Previous& previous,
+                                         tensorflow::StringPiece name)
+      : previous_(previous), name_(name) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && inst->name() == name_;
+  }
+
+ private:
+  Previous previous_;
+  tensorflow::StringPiece name_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a given opcode.
+template <typename Previous>
+class HloInstructionPatternOpcodeImpl {
+ public:
+  explicit constexpr HloInstructionPatternOpcodeImpl(const Previous& previous,
+                                                     HloOpcode opcode,
+                                                     bool invert)
+      : previous_(previous), opcode_(opcode), invert_(invert) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && (invert_ ^ (inst->opcode() == opcode_));
+  }
+
+ private:
+  Previous previous_;
+  HloOpcode opcode_;
+  bool invert_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a shape that matches a given pattern.
+template <typename Previous, typename ShapeType, typename ShapeImpl>
+class HloInstructionPatternShapeImpl {
+ public:
+  explicit constexpr HloInstructionPatternShapeImpl(
+      const Previous& previous, const ShapePattern<ShapeType, ShapeImpl>& shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && shape_.Match(&inst->shape());
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && shape_.Match(inst->mutable_shape());
+  }
+
+ private:
+  Previous previous_;
+  ShapePattern<ShapeType, ShapeImpl> shape_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has an operand that matches a given pattern.
+template <typename Previous, typename OperandType, typename OperandImpl>
+class HloInstructionPatternOperandImpl {
+ public:
+  explicit constexpr HloInstructionPatternOperandImpl(
+      const Previous& previous, int64 operand_index,
+      const HloInstructionPattern<OperandType, OperandImpl>& operand)
+      : previous_(previous), operand_index_(operand_index), operand_(operand) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && operand_index_ < inst->operand_count() &&
+           operand_.Match(inst->operand(operand_index_));
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && operand_index_ < inst->operand_count() &&
+           operand_.Match(inst->mutable_operand(operand_index_));
+  }
+
+ private:
+  Previous previous_;
+  int64 operand_index_;
+  HloInstructionPattern<OperandType, OperandImpl> operand_;
+};
+
+// A pattern that matches HloInstructions.
+template <typename HloInstructionType, typename Impl>
+class HloInstructionPattern {
+ public:
+  explicit constexpr HloInstructionPattern(const Impl& impl,
+                                           HloInstructionType** matched_inst)
+      : impl_(impl), matched_inst_(matched_inst) {}
+
+  // Returns true and captures the instruction iff it matches the pattern.
+  bool Match(const ::xla::HloInstruction* inst) const {
+    if (impl_.Match(inst)) {
+      if (matched_inst_) {
+        *matched_inst_ = inst;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the instruction iff it matches the pattern.
+  bool Match(::xla::HloInstruction* inst) const {
+    if (impl_.Match(inst)) {
+      if (matched_inst_) {
+        *matched_inst_ = inst;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the instruction has the given name.
+  HloInstructionPattern<HloInstructionType, HloInstructionPatternNameImpl<Impl>>
+  WithName(tensorflow::StringPiece name) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternNameImpl<Impl>>(
+        HloInstructionPatternNameImpl<Impl>(impl_, name), matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction has the given opcode.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  WithOpcode(HloOpcode opcode) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternOpcodeImpl<Impl>>(
+        HloInstructionPatternOpcodeImpl<Impl>(impl_, opcode, false),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction does not have the
+  // given opcode.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  WithoutOpcode(HloOpcode opcode) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternOpcodeImpl<Impl>>(
+        HloInstructionPatternOpcodeImpl<Impl>(impl_, opcode, true),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction is a constant.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  IsConstant() const {
+    return WithOpcode(HloOpcode::kConstant);
+  }
+
+  // Modifies the pattern to match only if the instruction is not a constant.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  IsNonConstant() const {
+    return WithoutOpcode(HloOpcode::kConstant);
+  }
+
+  // Modifies the pattern to match only if the instruction has a shape that
+  // matches the given pattern.
+  template <typename ShapeType, typename ShapeImpl>
+  constexpr HloInstructionPattern<
+      HloInstructionType,
+      HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>>
+  WithShape(const ShapePattern<ShapeType, ShapeImpl>& shape) const {
+    return HloInstructionPattern<
+        HloInstructionType,
+        HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>>(
+        HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>(impl_,
+                                                                   shape),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction has an operand that
+  // matches the given pattern.
+  template <typename OperandType, typename OperandImpl>
+  constexpr HloInstructionPattern<
+      HloInstructionType,
+      HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>>
+  WithOperand(
+      int64 operand_index,
+      const HloInstructionPattern<OperandType, OperandImpl>& operand) const {
+    return HloInstructionPattern<
+        HloInstructionType,
+        HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>>(
+        HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>(
+            impl_, operand_index, operand),
+        matched_inst_);
+  }
+
+ private:
+  Impl impl_;
+  HloInstructionType** matched_inst_;
+};
+
+}  // namespace detail
+
+// Creates an instruction pattern that will capture the matched instruction in
+// the argument.
+inline constexpr detail::HloInstructionPattern<
+    const ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>
+Op(const ::xla::HloInstruction** matched_inst = nullptr) {
+  return detail::HloInstructionPattern<const ::xla::HloInstruction,
+                                       detail::HloInstructionPatternBaseImpl>(
+      detail::HloInstructionPatternBaseImpl(), matched_inst);
+}
+
+// Creates an instruction pattern that will capture the matched instruction in
+// the argument.
+inline constexpr detail::HloInstructionPattern<
+    ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>
+Op(::xla::HloInstruction** matched_inst) {
+  return detail::HloInstructionPattern<::xla::HloInstruction,
+                                       detail::HloInstructionPatternBaseImpl>(
+      detail::HloInstructionPatternBaseImpl(), matched_inst);
+}
+
+// Helpers for nullary instructions.
+#define XLA_NULLOP_PATTERN(NAME)                                      \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \
+    return Op().WithOpcode(HloOpcode::k##NAME);                       \
+  }                                                                   \
+                                                                      \
+  template <typename HloInstructionType>                              \
+  inline auto NAME(HloInstructionType** matched_inst)                 \
+      ->decltype(Op(matched_inst).WithOpcode(HloOpcode::k##NAME)) {   \
+    return Op(matched_inst).WithOpcode(HloOpcode::k##NAME);           \
+  }
+XLA_NULLOP_PATTERN(Constant)
+XLA_NULLOP_PATTERN(Infeed)
+XLA_NULLOP_PATTERN(Parameter)
+XLA_NULLOP_PATTERN(Recv)
+#undef XLA_NULLOP_PATTERN
+
+// Helpers for unary instructions.
+#define XLA_UNOP_PATTERN(NAME)                                        \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \
+    return Op().WithOpcode(HloOpcode::k##NAME);                       \
+  }                                                                   \
+                                                                      \
+  template <typename Arg>                                             \
+  inline auto NAME(Arg&& arg)->decltype(                              \
+      Op().WithOpcode(HloOpcode::k##NAME)                             \
+          .WithOperand(0, std::forward<Arg>(arg))) {                  \
+    return Op()                                                       \
+        .WithOpcode(HloOpcode::k##NAME)                               \
+        .WithOperand(0, std::forward<Arg>(arg));                      \
+  }                                                                   \
+                                                                      \
+  template <typename HloInstructionType, typename Arg>                \
+  inline auto NAME(HloInstructionType** matched_inst, Arg&& arg)      \
+      ->decltype(Op(matched_inst)                                     \
+                     .WithOpcode(HloOpcode::k##NAME)                  \
+                     .WithOperand(0, std::forward<Arg>(arg))) {       \
+    return Op(matched_inst)                                           \
+        .WithOpcode(HloOpcode::k##NAME)                               \
+        .WithOperand(0, std::forward<Arg>(arg));                      \
+  }
+XLA_UNOP_PATTERN(Abs)
+XLA_UNOP_PATTERN(RoundNearestAfz)
+XLA_UNOP_PATTERN(Bitcast)
+XLA_UNOP_PATTERN(Broadcast)
+XLA_UNOP_PATTERN(BroadcastDimOne)
+XLA_UNOP_PATTERN(Ceil)
+XLA_UNOP_PATTERN(Copy)
+XLA_UNOP_PATTERN(Cos)
+XLA_UNOP_PATTERN(Exp)
+XLA_UNOP_PATTERN(Fft)
+XLA_UNOP_PATTERN(Floor)
+XLA_UNOP_PATTERN(Imag)
+XLA_UNOP_PATTERN(IsFinite)
+XLA_UNOP_PATTERN(Log)
+XLA_UNOP_PATTERN(Not)
+XLA_UNOP_PATTERN(Negate)
+XLA_UNOP_PATTERN(Outfeed)
+XLA_UNOP_PATTERN(Real)
+XLA_UNOP_PATTERN(Reduce)
+XLA_UNOP_PATTERN(ReducePrecision)
+XLA_UNOP_PATTERN(Reshape)
+XLA_UNOP_PATTERN(Reverse)
+XLA_UNOP_PATTERN(Send)
+XLA_UNOP_PATTERN(Sign)
+XLA_UNOP_PATTERN(Sin)
+XLA_UNOP_PATTERN(Sort)
+XLA_UNOP_PATTERN(Tanh)
+XLA_UNOP_PATTERN(Transpose)
+#undef XLA_UNOP_PATTERN
+
+// Helpers for binary instructions.
+#define XLA_BINOP_PATTERN(NAME)                                             \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {       \
+    return Op().WithOpcode(HloOpcode::k##NAME);                             \
+  }                                                                         \
+                                                                            \
+  template <typename Lhs, typename Rhs>                                     \
+  inline auto NAME(Lhs&& lhs, Rhs&& rhs)                                    \
+      ->decltype(Op().WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                \
+                     .WithOperand(1, std::forward<Rhs>(rhs))) {             \
+    return Op()                                                             \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithOperand(0, std::forward<Lhs>(lhs))                             \
+        .WithOperand(1, std::forward<Rhs>(rhs));                            \
+  }                                                                         \
+                                                                            \
+  template <typename HloInstructionType, typename Lhs, typename Rhs>        \
+  inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \
+      ->decltype(Op(matched_inst)                                           \
+                     .WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                \
+                     .WithOperand(1, std::forward<Rhs>(rhs))) {             \
+    return Op(matched_inst)                                                 \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithOperand(0, std::forward<Lhs>(lhs))                             \
+        .WithOperand(1, std::forward<Rhs>(rhs));                            \
+  }
+XLA_BINOP_PATTERN(Add)
+XLA_BINOP_PATTERN(Atan2)
+XLA_BINOP_PATTERN(Divide)
+XLA_BINOP_PATTERN(Complex)
+XLA_BINOP_PATTERN(Dot)
+XLA_BINOP_PATTERN(Eq)
+XLA_BINOP_PATTERN(Gather)
+XLA_BINOP_PATTERN(Ge)
+XLA_BINOP_PATTERN(Gt)
+XLA_BINOP_PATTERN(Le)
+XLA_BINOP_PATTERN(Lt)
+XLA_BINOP_PATTERN(Maximum)
+XLA_BINOP_PATTERN(Minimum)
+XLA_BINOP_PATTERN(Multiply)
+XLA_BINOP_PATTERN(Ne)
+XLA_BINOP_PATTERN(Power)
+XLA_BINOP_PATTERN(Remainder)
+XLA_BINOP_PATTERN(Subtract)
+XLA_BINOP_PATTERN(And)
+XLA_BINOP_PATTERN(Or)
+XLA_BINOP_PATTERN(ShiftLeft)
+XLA_BINOP_PATTERN(ShiftRightArithmetic)
+XLA_BINOP_PATTERN(ShiftRightLogical)
+#undef XLA_BINOP_PATTERN
+
+// Helpers for ternary instructions.
+#define XLA_TERNOP_PATTERN(NAME)                                       \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {  \
+    return Op().WithOpcode(HloOpcode::k##NAME);                        \
+  }                                                                    \
+                                                                       \
+  template <typename Arg0, typename Arg1, typename Arg2>               \
+  inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2)              \
+      ->decltype(Op().WithOpcode(HloOpcode::k##NAME)                   \
+                     .WithOperand(0, std::forward<Arg0>(arg0))         \
+                     .WithOperand(1, std::forward<Arg1>(arg1))         \
+                     .WithOperand(2, std::forward<Arg2>(arg2))) {      \
+    return Op()                                                        \
+        .WithOpcode(HloOpcode::k##NAME)                                \
+        .WithOperand(0, std::forward<Arg0>(arg0))                      \
+        .WithOperand(1, std::forward<Arg1>(arg1))                      \
+        .WithOperand(2, std::forward<Arg2>(arg2));                     \
+  }                                                                    \
+                                                                       \
+  template <typename HloInstructionType, typename Arg0, typename Arg1, \
+            typename Arg2>                                             \
+  inline auto NAME(HloInstructionType** matched_inst, Arg0&& arg0,     \
+                   Arg1&& arg1, Arg2&& arg2)                           \
+      ->decltype(Op(matched_inst)                                      \
+                     .WithOpcode(HloOpcode::k##NAME)                   \
+                     .WithOperand(0, std::forward<Arg0>(arg0))         \
+                     .WithOperand(1, std::forward<Arg1>(arg1))         \
+                     .WithOperand(2, std::forward<Arg2>(arg2))) {      \
+    return Op(matched_inst)                                            \
+        .WithOpcode(HloOpcode::k##NAME)                                \
+        .WithOperand(0, std::forward<Arg0>(arg0))                      \
+        .WithOperand(1, std::forward<Arg1>(arg1))                      \
+        .WithOperand(2, std::forward<Arg2>(arg2));                     \
+  }
+XLA_TERNOP_PATTERN(Clamp);
+XLA_TERNOP_PATTERN(Select);
+#undef XLA_TERNOP_PATTERN
+
+// Helpers for matching non-constant instructions.
+inline auto NonConstant() -> decltype(Op().IsNonConstant()) {
+  return Op().IsNonConstant();
+}
+
+template <typename HloInstructionType>
+inline auto NonConstant(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsNonConstant()) {
+  return Op(matched_inst).IsNonConstant();
+}
+
+}  // namespace match
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
new file mode 100644
index 0000000000..5291b1437a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+TEST(PatternMatcherTest, AddOp) {
+  constexpr char kModuleStr[] = R"(HloModule two_plus_two_module
+    ENTRY %two_plus_two_computation () -> f32[] {
+      %two = f32[] constant(2)
+      ROOT %two_plus_two = f32[] add(f32[] %two, f32[] %two)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr));
+
+  const HloInstruction* matched_inst;
+  HloInstruction* matched_operand;
+  Shape* matched_shape;
+  Layout* matched_layout;
+
+  ASSERT_TRUE(Match(
+      hlo_module->entry_computation()->root_instruction(),
+      match::Op(&matched_inst)
+          .WithName("two_plus_two")
+          .WithOpcode(HloOpcode::kAdd)
+          .WithShape(
+              match::Shape(&matched_shape)
+                  .WithLayout(match::Layout(&matched_layout).WithDenseFormat()))
+          .WithOperand(
+              0,
+              match::Op(&matched_operand).WithOpcode(HloOpcode::kConstant))));
+  ASSERT_NE(matched_inst, nullptr);
+  EXPECT_EQ(matched_inst->name(), "two_plus_two");
+  EXPECT_EQ(matched_inst->opcode(), HloOpcode::kAdd);
+
+  EXPECT_TRUE(Match(hlo_module->entry_computation()->root_instruction(),
+                    match::Add(match::Constant(), match::Constant())));
+
+  EXPECT_FALSE(Match(hlo_module->entry_computation()->root_instruction(),
+                     match::Op().WithName("bad_name")));
+  matched_inst = nullptr;
+  EXPECT_FALSE(Match(hlo_module->entry_computation()->root_instruction(),
+                     match::Multiply(&matched_inst, match::Op(), match::Op())));
+}
+
+TEST(PatternMatcherTest, ScalarShape) {
+  auto scalar_shape = ShapeUtil::MakeShape(F32, {});
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape(&matched_shape).IsScalar()));
+  EXPECT_EQ(matched_shape, &scalar_shape);
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsArray()));
+  EXPECT_FALSE(Match(&scalar_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithRank(0)));
+  EXPECT_FALSE(Match(
+      &scalar_shape,
+      match::Shape().WithSubshape({0}, match::Shape()).WithElementType(F32)));
+}
+
+TEST(PatternMatcherTest, ArrayShape) {
+  auto array_shape = ShapeUtil::MakeShape(F32, {2, 3, 4});
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
+  EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3)));
+  EXPECT_FALSE(
+      Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape())));
+  Layout* matched_layout;
+  EXPECT_FALSE(Match(&array_shape,
+                     match::Shape().WithLayout(
+                         match::Layout(&matched_layout).WithSparseFormat())));
+}
+
+TEST(PatternMatcherTest, TupleShape) {
+  auto tuple_shape = ShapeUtil::MakeTupleShape({
+      ShapeUtil::MakeShape(F32, {1, 2, 3}),
+      ShapeUtil::MakeShape(S32, {4, 5}),
+  });
+  EXPECT_TRUE(Match(&tuple_shape, match::Shape().IsTuple()));
+  EXPECT_FALSE(Match(&tuple_shape, match::Shape().IsArray()));
+  EXPECT_FALSE(Match(&tuple_shape, match::Shape().IsScalar()));
+
+  Shape* subshape;
+  ASSERT_TRUE(Match(
+      &tuple_shape,
+      match::Shape().WithSubshape(
+          {0}, match::Shape(&subshape).WithElementType(F32).WithRank(3))));
+  ASSERT_NE(subshape, nullptr);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(*subshape, ShapeUtil::GetSubshape(tuple_shape, {0})));
+  EXPECT_TRUE(Match(&tuple_shape,
+                    match::Shape().WithSubshape(
+                        {0}, match::Shape().EqualTo(
+                                 &ShapeUtil::GetSubshape(tuple_shape, {0})))));
+  EXPECT_FALSE(Match(&tuple_shape,
+                     match::Shape().WithSubshape(
+                         {0}, match::Shape().EqualTo(
+                                  &ShapeUtil::GetSubshape(tuple_shape, {1})))));
+
+  ASSERT_TRUE(Match(
+      &tuple_shape,
+      match::Shape().WithSubshape(
+          {1}, match::Shape(&subshape).WithElementType(S32).WithRank(2))));
+  ASSERT_NE(subshape, nullptr);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(*subshape, ShapeUtil::GetSubshape(tuple_shape, {1})));
+  EXPECT_TRUE(Match(&tuple_shape,
+                    match::Shape().WithSubshape(
+                        {1}, match::Shape().EqualTo(
+                                 &ShapeUtil::GetSubshape(tuple_shape, {1})))));
+  EXPECT_FALSE(Match(&tuple_shape,
+                     match::Shape().WithSubshape(
+                         {1}, match::Shape().EqualTo(
+                                  &ShapeUtil::GetSubshape(tuple_shape, {0})))));
+
+  EXPECT_FALSE(
+      Match(&tuple_shape, match::Shape().WithSubshape({2}, match::Shape())));
+  EXPECT_FALSE(
+      Match(&tuple_shape, match::Shape().WithSubshape({0, 0}, match::Shape())));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 6825d24765..ac7e201bfd 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -824,6 +824,18 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return new_shape;
 }
 
+/* static */ bool ShapeUtil::IndexIsValid(const Shape& shape,
+                                          ShapeIndexView index) {
+  const Shape* subshape = &shape;
+  for (auto i : index) {
+    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size()) {
+      return false;
+    }
+    subshape = &subshape->tuple_shapes(i);
+  }
+  return true;
+}
+
 /* static */ const Shape& ShapeUtil::GetSubshape(const Shape& shape,
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 6d228eff46..63da9154cf 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -448,6 +448,9 @@ class ShapeUtil {
   static bool ShapeIs(const Shape& shape, PrimitiveType element_type,
                       std::initializer_list<int64> dimensions);
 
+  // Returns true if the given shape has a subshape at the given index.
+  static bool IndexIsValid(const Shape& shape, ShapeIndexView index);
+
   // GetSubshape and GetMutableSubshape return a particular nested Shape within
   // the given Shape argument.
   static const Shape& GetSubshape(const Shape& shape, ShapeIndexView index);
-- 
GitLab


From 026f052710475d1a5d08007e5ff7e105c653a965 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Fri, 13 Apr 2018 15:33:07 -0700
Subject: [PATCH 0803/1262] Avoid mixing `Dimension` type and `int` when
 defining kernel shapes in conv layers.

PiperOrigin-RevId: 192834255
---
 .../keras/_impl/keras/layers/convolutional.py      | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index d202b6551d..12b965587f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -148,7 +148,7 @@ class Conv(Layer):
     if input_shape[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
+    input_dim = int(input_shape[channel_axis])
     kernel_shape = self.kernel_size + (input_dim, self.filters)
 
     self.kernel = self.add_variable(name='kernel',
@@ -705,6 +705,7 @@ class Conv2DTranspose(Conv2D):
         **kwargs)
 
   def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
     if len(input_shape) != 4:
       raise ValueError('Inputs should have rank 4. Received input shape: ' +
                        str(input_shape))
@@ -712,10 +713,10 @@ class Conv2DTranspose(Conv2D):
       channel_axis = 1
     else:
       channel_axis = -1
-    if input_shape[channel_axis] is None:
+    if input_shape[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
+    input_dim = int(input_shape[channel_axis])
     self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
     kernel_shape = self.kernel_size + (self.filters, input_dim)
 
@@ -945,6 +946,7 @@ class Conv3DTranspose(Conv3D):
         **kwargs)
 
   def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
     if len(input_shape) != 5:
       raise ValueError('Inputs should have rank 5, received input shape:',
                        str(input_shape))
@@ -952,10 +954,10 @@ class Conv3DTranspose(Conv3D):
       channel_axis = 1
     else:
       channel_axis = -1
-    if input_shape[channel_axis] is None:
+    if input_shape[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined, found None: ' + str(input_shape))
-    input_dim = input_shape[channel_axis]
+    input_dim = int(input_shape[channel_axis])
     kernel_shape = self.kernel_size + (self.filters, input_dim)
     self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim})
 
@@ -1212,7 +1214,7 @@ class SeparableConv(Conv):
     if input_shape[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
+    input_dim = int(input_shape[channel_axis])
     self.input_spec = InputSpec(ndim=self.rank + 2,
                                 axes={channel_axis: input_dim})
     depthwise_kernel_shape = self.kernel_size + (input_dim,
-- 
GitLab


From cfc59cb0e89077c5aa80f386602b0be6a357c7c1 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Fri, 13 Apr 2018 15:47:37 -0700
Subject: [PATCH 0804/1262] Enable remote functions for TPU_SYSTEM.

PiperOrigin-RevId: 192836098
---
 .../core/common_runtime/process_function_library_runtime.cc  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 92fdcb404e..d05f146f21 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -144,7 +144,10 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
   }
   Device* device = flr->device();
   string device_type = device->parsed_name().type;
-  if (device_type == "CPU") return Status::OK();
+  if (device_type == "CPU" || device_type == "TPU_SYSTEM") {
+    // "TPU_SYSTEM" indicates that `device` is a CPU.
+    return Status::OK();
+  }
   if (device_type == "GPU") {
     auto* dev_info = flr->device()->tensorflow_gpu_device_info();
     if (dev_info) {
-- 
GitLab


From aa65cee4bb9644ef4d3f8704161c70d61113cce3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 15:53:05 -0700
Subject: [PATCH 0805/1262] Restore definitions of static members in
 MklCpuAllocator.

These were removed in #17396 which made the static member variables of
MklCpuAllocator into inline variables, which are a C++17 feature, and not
properly restored in #18006 which reverted the inline declarations, leading to
an ODR violation that is apparently ignored with some compilers.
END_PUBLIC

RELNOTES: n/a

BEGIN_PUBLIC
Automated g4 rollback of changelist 191305220

PiperOrigin-RevId: 192836808
---
 tensorflow/core/common_runtime/mkl_cpu_allocator.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
index 829c19204a..43a909466e 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr const char* MklCPUAllocator::kMaxLimitStr;
+constexpr const size_t MklCPUAllocator::kDefaultMaxLimit;
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
-- 
GitLab


From 3bf8fe926b833aa5258d6a5ac58ed3aac2b4cda3 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jart@google.com>
Date: Fri, 13 Apr 2018 15:57:45 -0700
Subject: [PATCH 0806/1262] Upgrade SQLite

PiperOrigin-RevId: 192837358
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 85bd1ea28b..aab0fb41fb 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -232,11 +232,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "org_sqlite",
       urls = [
-          "https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
-          "http://www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
+          "https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip",
+          "https://www.sqlite.org/2018/sqlite-amalgamation-3230100.zip",
       ],
-      sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
-      strip_prefix = "sqlite-amalgamation-3200000",
+      sha256 = "4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc",
+      strip_prefix = "sqlite-amalgamation-3230100",
       build_file = clean_dep("//third_party:sqlite.BUILD"),
   )
 
-- 
GitLab


From 0d3fda7691f21ff2cb84d391494697f37804bec6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 16:38:12 -0700
Subject: [PATCH 0807/1262] Improve layout optimizer tests -- Evaluate nodes
 before and after optimization, to confirm the graph's behavior is maintained
 after optimization.

PiperOrigin-RevId: 192842623
---
 tensorflow/core/grappler/optimizers/BUILD     |  3 ++
 .../optimizers/layout_optimizer_test.cc       | 36 +++++++++++++++----
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index a4545bb8f8..aa5102017c 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -479,10 +479,13 @@ tf_cuda_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:single_machine",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:virtual_placer",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index b913f2b004..e405c4c58c 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -17,11 +17,15 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
@@ -30,15 +34,25 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class LayoutOptimizerTest : public ::testing::Test {
+class LayoutOptimizerTest : public GrapplerTest {
  protected:
   void SetUp() override {
-    DeviceProperties device_properties;
-    device_properties.set_type("GPU");
-    device_properties.mutable_environment()->insert({"architecture", "6"});
-    virtual_cluster_.reset(new VirtualCluster({{"/GPU:1", device_properties}}));
+    gpu_available_ = GetNumAvailableGPUs() > 0;
+
+    if (gpu_available_) {
+      virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
+    } else {
+      DeviceProperties device_properties;
+      device_properties.set_type("GPU");
+      device_properties.mutable_environment()->insert({"architecture", "6"});
+      virtual_cluster_.reset(
+          new VirtualCluster({{"/GPU:1", device_properties}}));
+    }
+    TF_CHECK_OK(virtual_cluster_->Provision());
   }
 
+  void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }
+
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
                       const string& padding) {
     return SimpleConv2D(s, input_size, filter_size, padding, "");
@@ -160,6 +174,7 @@ class LayoutOptimizerTest : public ::testing::Test {
   }
 
   std::unique_ptr<Cluster> virtual_cluster_;
+  bool gpu_available_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -183,6 +198,15 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   Tensor input_sizes_expected(DT_INT32, {4});
   test::FillValues<int>(&input_sizes_expected, {128, 3, 7, 7});
   test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
+
+  if (gpu_available_) {
+    std::vector<string> fetch = {"Fetch"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  }
 }
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
@@ -1150,7 +1174,7 @@ TEST_F(LayoutOptimizerTest, DevicePlacement) {
   NodeMap node_map(&output);
   auto vec_permute =
       node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
-  EXPECT_EQ(vec_permute->device(), "/device:CPU:0");
+  EXPECT_TRUE(str_util::EndsWith(vec_permute->device(), "CPU:0"));
 }
 }  // namespace
 }  // namespace grappler
-- 
GitLab


From 3d66977d99c1d37cf318557ea613cd0dd6b001fd Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Fri, 13 Apr 2018 16:38:29 -0700
Subject: [PATCH 0808/1262] Automated g4 rollback of changelist 192784701

PiperOrigin-RevId: 192842670
---
 .../ci_build/windows/bazel/bazel_test_lib.sh    |  7 -------
 .../windows/cpu/pip/build_tf_windows.sh         | 17 +++++------------
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index b2e16902d6..d654b433e7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -140,13 +140,6 @@ function run_configure_for_gpu_build {
   echo "" | ./configure
 }
 
-function set_gcs_remote_cache_options {
-  echo "build --experimental_remote_spawn_cache" >> .bazelrc
-  echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> .bazelrc
-  echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> .bazelrc
-  echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> .bazelrc
-}
-
 function create_python_test_dir() {
   rm -rf "$1"
   mkdir -p "$1"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 4657ff196b..5e9ae497e1 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,27 +42,20 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
-# Recreate an empty bazelrc file under source root
-rm -f .bazelrc
-touch .bazelrc
-
 skip_test=0
 
 for ARG in "$@"; do
   if [[ "$ARG" == --skip_test ]]; then
     skip_test=1
-  elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then
-    set_gcs_remote_cache_options
   fi
 done
 
-# --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
-# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-echo "build --define=override_eigen_strong_inline=true" >> .bazelrc
-
 run_configure_for_cpu_build
 
-bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
+# --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
+# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
+BUILD_OPTS="--define=override_eigen_strong_inline=true"
+bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$skip_test" == 1 ]]; then
   exit 0
@@ -80,7 +73,7 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt -k --test_output=errors \
+bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-- 
GitLab


From 6048b07adb364fcef086fb30ecdfb8a2881ba6ac Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 13 Apr 2018 17:13:45 -0700
Subject: [PATCH 0809/1262] TFLite: Copy output data from BufferHandle to CPU
 memory by default. PiperOrigin-RevId: 192846824

---
 tensorflow/contrib/lite/interpreter.cc |  6 ++++++
 tensorflow/contrib/lite/interpreter.h  | 16 ++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index f258654608..31b874a6a6 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -570,6 +570,12 @@ TfLiteStatus Interpreter::Invoke() {
     }
   }
 
+  if (!allow_buffer_handle_output_) {
+    for (int tensor_index : outputs_) {
+      EnsureTensorDataIsReadable(tensor_index);
+    }
+  }
+
   return status;
 }
 
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index df67cce9de..3c776aacb6 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -282,6 +282,7 @@ class Interpreter {
 
   // Ensure the data in `tensor.data` is readable. In case delegate is used,
   // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
   TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
     TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
     TfLiteTensor* tensor = &tensors_[tensor_index];
@@ -328,6 +329,18 @@ class Interpreter {
   // pointers to existing tensors.
   static constexpr int kTensorsCapacityHeadroom = 16;
 
+  // Set if buffer handle output is allowed.
+  //
+  // When using hardware delegation, Interpreter will make the data of output
+  // tensors available in `tensor->data` by default. If the application can
+  // consume the buffer handle directly (e.g. reading output from OpenGL
+  // texture), it can set this flag to false, so Interpreter won't copy the data
+  // from buffer handle to CPU memory.
+  // WARNING: This is an experimental API and subject to change.
+  void SetAllowBufferHandleOutput(bool allow_buffer_handle_output) {
+    allow_buffer_handle_output_ = allow_buffer_handle_output;
+  }
+
  private:
   // Give 'op_reg' a chance to initialize itself using the contents of
   // 'buffer'.
@@ -518,6 +531,9 @@ class Interpreter {
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
 
   std::unique_ptr<MemoryPlanner> memory_planner_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  bool allow_buffer_handle_output_ = false;
 };
 
 }  // namespace tflite
-- 
GitLab


From 360c5a37957311657d45c351248aaa8e8fcac3be Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Fri, 13 Apr 2018 17:26:46 -0700
Subject: [PATCH 0810/1262] Revamp Cudnn RNN kernels for incoming autotune
 changes.

* Create DoForward() and DoBackward() to be used by fwd/bak kernels and later autotune.
* Simplify CudnnRnnForward Comupute() function. Offload the majority of its logic to other member functions.

PiperOrigin-RevId: 192848100
---
 tensorflow/core/kernels/cudnn_rnn_ops.cc | 689 ++++++++++++++---------
 1 file changed, 410 insertions(+), 279 deletions(-)

diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index e4036ddaa9..a21f13a4dd 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -78,6 +78,7 @@ using CPUDevice = Eigen::ThreadPoolDevice;
 #if GOOGLE_CUDA
 
 using GPUDevice = Eigen::GpuDevice;
+using ::perftools::gputools::StreamExecutor;
 
 template <typename Device, typename T, typename Index>
 class CudnnRNNParamsSizeOp;
@@ -101,15 +102,21 @@ enum class TFRNNInputMode {
 };
 
 namespace {
-using perftools::gputools::DeviceMemory;
-using perftools::gputools::DeviceMemoryBase;
-using perftools::gputools::ScratchAllocator;
-using perftools::gputools::dnn::AlgorithmConfig;
-using perftools::gputools::dnn::RnnDirectionMode;
-using perftools::gputools::dnn::RnnInputMode;
-using perftools::gputools::dnn::RnnMode;
-using perftools::gputools::dnn::ToDataType;
-using perftools::gputools::port::StatusOr;
+using ::perftools::gputools::DeviceMemory;
+using ::perftools::gputools::DeviceMemoryBase;
+using ::perftools::gputools::ScratchAllocator;
+using ::perftools::gputools::Stream;
+using ::perftools::gputools::dnn::AlgorithmConfig;
+using ::perftools::gputools::dnn::AlgorithmDesc;
+using ::perftools::gputools::dnn::ProfileResult;
+using ::perftools::gputools::dnn::RnnDescriptor;
+using ::perftools::gputools::dnn::RnnDirectionMode;
+using ::perftools::gputools::dnn::RnnInputMode;
+using ::perftools::gputools::dnn::RnnMode;
+using ::perftools::gputools::dnn::RnnSequenceTensorDescriptor;
+using ::perftools::gputools::dnn::RnnStateTensorDescriptor;
+using ::perftools::gputools::dnn::ToDataType;
+using ::perftools::gputools::port::StatusOr;
 
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
   if (str == "rnn_relu") {
@@ -252,12 +259,12 @@ class CudnnRnnAllocatorInTemp : public ScratchAllocator {
 
   explicit CudnnRnnAllocatorInTemp(OpKernelContext* context)
       : context_(context) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(Stream* stream) override {
     return std::numeric_limits<int64>::max();
   }
 
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      perftools::gputools::Stream* stream, int64 byte_size) override {
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(Stream* stream,
+                                              int64 byte_size) override {
     Tensor temporary_memory;
     const DataType tf_data_type = ToTFDataType<T>::value;
     int64 allocate_count =
@@ -298,11 +305,11 @@ class CudnnRnnAllocatorInOutput : public ScratchAllocator {
   ~CudnnRnnAllocatorInOutput() override {}
   CudnnRnnAllocatorInOutput(OpKernelContext* context, int output_index)
       : context_(context), output_index_(output_index) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(Stream* stream) override {
     return std::numeric_limits<int64>::max();
   }
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      perftools::gputools::Stream* stream, int64 byte_size) override {
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(Stream* stream,
+                                              int64 byte_size) override {
     CHECK(total_byte_size_ == 0)
         << "Reserve space allocator can only be called once";
     int64 allocate_count =
@@ -338,12 +345,12 @@ class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
 
   ~CudnnRNNPersistentSpaceAllocator() override {}
 
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(Stream* stream) override {
     return std::numeric_limits<int64>::max();
   }
 
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      perftools::gputools::Stream* stream, int64 byte_size) override {
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(Stream* stream,
+                                              int64 byte_size) override {
     if (total_byte_size_ != 0) {
       return Status(error::FAILED_PRECONDITION,
                     "Persistent space allocator can only be called once");
@@ -374,6 +381,13 @@ struct CudnnModelTypes {
     // input-h.
     return rnn_mode == RnnMode::kRnnLstm;
   }
+
+  string DebugString() const {
+    return strings::Printf(
+        "[rnn_mode, rnn_input_mode, rnn_direction_mode]: %d, %d, %d ",
+        static_cast<int>(rnn_mode), static_cast<int>(rnn_input_mode),
+        static_cast<int>(rnn_direction_mode));
+  }
 };
 
 // A helper class that collects the shapes to describe a RNN model.
@@ -381,9 +395,9 @@ struct CudnnRnnModelShapes {
   int num_layers;
   int input_size;
   int num_units;
+  int dir_count;
   int seq_length;
   int batch_size;
-  int dir_count;
   TensorShape input_shape;
   TensorShape output_shape;
   TensorShape hidden_state_shape;
@@ -392,10 +406,11 @@ struct CudnnRnnModelShapes {
     return num_layers == rhs.num_layers && input_size == rhs.input_size &&
            num_units == rhs.num_units && dir_count == rhs.dir_count;
   }
-  string RnnDescDebugString() {
+  string DebugString() const {
     return strings::Printf(
-        "[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
-        num_layers, input_size, num_units, dir_count);
+        "[num_layers, input_size, num_units, dir_count, seq_length, "
+        "batch_size]: [%d, %d, %d, %d, %d, %d] ",
+        num_layers, input_size, num_units, dir_count, seq_length, batch_size);
   }
 };
 
@@ -420,8 +435,15 @@ struct CudnnRnnModelShapesComparator {
   }
 };
 
-// Extract and checks the forward input tensors, parameters, and shapes from
-// the OpKernelContext.
+// Pointers to RNN scratch space for a specific set of shape parameters (used as
+// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp).
+struct RnnScratchSpace {
+  std::unique_ptr<RnnDescriptor> rnn_desc;
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator;
+};
+
+// Extract and checks the forward input tensors, parameters, and shapes from the
+// OpKernelContext.
 Status ExtractForwardInput(OpKernelContext* context,
                            const CudnnModelTypes& model_types,
                            const Tensor** input, const Tensor** input_h,
@@ -474,13 +496,171 @@ Status ExtractForwardInput(OpKernelContext* context,
   return Status::OK();
 }
 
-using perftools::gputools::dnn::RnnDescriptor;
+template <typename T>
+Status CreateForwardAndBackwardIODescriptors(
+    OpKernelContext* context, const CudnnRnnModelShapes& model_shapes,
+    std::unique_ptr<RnnSequenceTensorDescriptor>* input_desc,
+    std::unique_ptr<RnnStateTensorDescriptor>* state_desc,
+    std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc) {
+  StreamExecutor* executor = context->op_device_context()->stream()->parent();
+  ::perftools::gputools::dnn::DataType data_type = ToDataType<T>::value;
+
+  const TensorShape& input_shape = model_shapes.input_shape;
+  const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+  const TensorShape& output_shape = model_shapes.output_shape;
+
+  DCHECK_EQ(input_shape.dims(), 3);
+  auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
+      input_shape.dim_size(0), input_shape.dim_size(1), input_shape.dim_size(2),
+      data_type);
+  TF_RETURN_IF_ERROR(input_desc_s.status());
+  *input_desc = input_desc_s.ConsumeValueOrDie();
+
+  DCHECK_EQ(hidden_state_shape.dims(), 3);
+  auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
+      hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
+      hidden_state_shape.dim_size(2), data_type);
+  TF_RETURN_IF_ERROR(hidden_state_desc_s.status());
+  *state_desc = hidden_state_desc_s.ConsumeValueOrDie();
+
+  DCHECK_EQ(output_shape.dims(), 3);
+  auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
+      output_shape.dim_size(0), output_shape.dim_size(1),
+      output_shape.dim_size(2), data_type);
+  TF_RETURN_IF_ERROR(output_desc_s.status());
+  *output_desc = output_desc_s.ConsumeValueOrDie();
+  return Status::OK();
+}
+
+template <typename T>
+Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
+                 const CudnnModelTypes& model_types,
+                 const CudnnRnnModelShapes& model_shapes,
+                 /* forward inputs */
+                 const Tensor* input, const Tensor* input_h,
+                 const Tensor* input_c, const Tensor* params,
+                 const bool is_training,
+                 /* forward outputs, outputs of the function */
+                 Tensor* output, Tensor* output_h, Tensor* output_c,
+                 ScratchAllocator* reserve_space_allocator,
+                 ScratchAllocator* workspace_allocator,
+                 ProfileResult* output_profile_result) {
+  std::unique_ptr<RnnSequenceTensorDescriptor> input_desc;
+  std::unique_ptr<RnnStateTensorDescriptor> state_desc;
+  std::unique_ptr<RnnSequenceTensorDescriptor> output_desc;
+
+  TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
+      context, model_shapes, &input_desc, &state_desc, &output_desc));
+
+  auto input_data = AsDeviceMemory<T>(input);
+  auto input_h_data = AsDeviceMemory<T>(input_h);
+  DeviceMemory<T> input_c_data;
+  if (model_types.HasInputC()) {
+    input_c_data = AsDeviceMemory<T>(input_c);
+  }
+  auto params_data = AsDeviceMemory<T>(params);
+  auto output_data = AsDeviceMemory<T>(output);
+  auto output_h_data = AsDeviceMemory<T>(output_h);
+  DeviceMemory<T> output_c_data;
+  if (model_types.HasInputC()) {
+    output_c_data = AsDeviceMemory<T>(output_c);
+  }
+
+  Stream* stream = context->op_device_context()->stream();
+  bool launch_success =
+      stream
+          ->ThenRnnForward(rnn_desc, *input_desc, input_data, *state_desc,
+                           input_h_data, *state_desc, input_c_data, params_data,
+                           *output_desc, &output_data, *state_desc,
+                           &output_h_data, *state_desc, &output_c_data,
+                           is_training, reserve_space_allocator,
+                           workspace_allocator, output_profile_result)
+          .ok();
+  return launch_success
+             ? Status::OK()
+             : errors::Internal(
+                   "Failed to call ThenRnnForward with model config: ",
+                   model_types.DebugString(), ", ", model_shapes.DebugString());
+}
+
+template <typename T>
+Status DoBackward(
+    OpKernelContext* context, const RnnDescriptor& rnn_desc,
+    const CudnnModelTypes& model_types, const CudnnRnnModelShapes& model_shapes,
+    /* forward inputs */
+    const Tensor* input, const Tensor* input_h, const Tensor* input_c,
+    const Tensor* params,
+    /* forward outptus */
+    const Tensor* output, const Tensor* output_h, const Tensor* output_c,
+    /* backprop inputs */
+    const Tensor* output_backprop, const Tensor* output_h_backprop,
+    const Tensor* output_c_backprop, const Tensor* reserve_space,
+    /* backprop outputs, output of the function */
+    Tensor* input_backprop, Tensor* input_h_backprop, Tensor* input_c_backprop,
+    Tensor* params_backprop, ScratchAllocator* workspace_allocator,
+    ProfileResult* output_profile_result) {
+  std::unique_ptr<RnnSequenceTensorDescriptor> input_desc;
+  std::unique_ptr<RnnStateTensorDescriptor> state_desc;
+  std::unique_ptr<RnnSequenceTensorDescriptor> output_desc;
+
+  TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
+      context, model_shapes, &input_desc, &state_desc, &output_desc));
+
+  auto input_data = AsDeviceMemory<T>(input);
+  auto input_h_data = AsDeviceMemory<T>(input_h);
+  DeviceMemory<T> input_c_data;
+  if (model_types.HasInputC()) {
+    input_c_data = AsDeviceMemory<T>(input_c);
+  }
+  auto params_data = AsDeviceMemory<T>(params);
+  auto output_data = AsDeviceMemory<T>(output);
+  auto output_h_data = AsDeviceMemory<T>(output_h);
+  DeviceMemory<T> output_c_data;
+  if (model_types.HasInputC()) {
+    output_c_data = AsDeviceMemory<T>(output_c);
+  }
+  auto output_backprop_data = AsDeviceMemory<T>(output_backprop);
+  auto output_h_backprop_data = AsDeviceMemory<T>(output_h_backprop);
+  DeviceMemory<T> output_c_backprop_data;
+  if (model_types.HasInputC()) {
+    output_c_backprop_data = AsDeviceMemory<T>(output_c_backprop);
+  }
+  auto input_backprop_data = AsDeviceMemory<T>(input_backprop);
+  auto input_h_backprop_data = AsDeviceMemory<T>(input_h_backprop);
+  DeviceMemory<T> input_c_backprop_data;
+  if (model_types.HasInputC()) {
+    input_c_backprop_data = AsDeviceMemory<T>(input_c_backprop);
+  }
+  auto params_backprop_data = AsDeviceMemory<T>(params_backprop);
+  auto reserve_space_uint8 =
+      CastDeviceMemory<uint8, T>(const_cast<Tensor*>(reserve_space));
+
+  // Creates a memory callback for the workspace. The memory lives to the end
+  // of this kernel calls.
+  Stream* stream = context->op_device_context()->stream();
+  bool launch_success =
+      stream
+          ->ThenRnnBackward(rnn_desc, *input_desc, input_data, *state_desc,
+                            input_h_data, *state_desc, input_c_data,
+                            params_data, *output_desc, output_data, *state_desc,
+                            output_h_data, *state_desc, output_c_data,
+                            output_backprop_data, output_h_backprop_data,
+                            output_c_backprop_data, &input_backprop_data,
+                            &input_h_backprop_data, &input_c_backprop_data,
+                            &params_backprop_data, &reserve_space_uint8,
+                            workspace_allocator, output_profile_result)
+          .ok();
+  return launch_success
+             ? Status::OK()
+             : errors::Internal(
+                   "Failed to call ThenRnnBackward with model config: ",
+                   model_types.DebugString(), ", ", model_shapes.DebugString());
+}
 
 template <typename T>
 void RestoreParams(const OpInputList params_input,
                    const std::vector<RnnDescriptor::ParamsRegion>& params,
-                   DeviceMemoryBase* data_dst,
-                   perftools::gputools::Stream* stream) {
+                   DeviceMemoryBase* data_dst, Stream* stream) {
   int num_params = params.size();
   CHECK(params_input.size() == num_params)
       << "Number of params mismatch. Expected " << params_input.size()
@@ -570,7 +750,7 @@ class CudnnRNNKernelCommon : public OpKernel {
     TF_RETURN_IF_ERROR(
         ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
 
-    auto* stream = context->op_device_context()->stream();
+    Stream* stream = context->op_device_context()->stream();
     // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
     // random number generator, therefore set state_allocator to nullptr.
     const AlgorithmConfig algo_config;
@@ -585,6 +765,51 @@ class CudnnRNNKernelCommon : public OpKernel {
     return Status::OK();
   }
 
+  template <typename T>
+  Status CreateRnnDescriptor(OpKernelContext* context,
+                             const CudnnRnnModelShapes& model_shapes,
+                             const RnnInputMode& input_mode,
+                             const AlgorithmConfig& algo_config,
+                             ScratchAllocator* dropout_state_allocator,
+                             std::unique_ptr<RnnDescriptor>* rnn_desc) {
+    StreamExecutor* executor = context->op_device_context()->stream()->parent();
+    ::perftools::gputools::dnn::DataType data_type = ToDataType<T>::value;
+    auto rnn_desc_s = executor->createRnnDescriptor(
+        model_shapes.num_layers, model_shapes.num_units,
+        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
+        data_type, algo_config, dropout(), seed(), dropout_state_allocator);
+    TF_RETURN_IF_ERROR(rnn_desc_s.status());
+
+    *rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    return Status::OK();
+  }
+
+  using RnnStateCache =
+      gtl::FlatMap<CudnnRnnModelShapes, RnnScratchSpace,
+                   CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>;
+  // Returns a raw rnn descriptor pointer. The cache owns the rnn descriptor and
+  // should outlive the returned pointer.
+  template <typename T>
+  Status GetCachedRnnDescriptor(OpKernelContext* context,
+                                const CudnnRnnModelShapes& model_shapes,
+                                const RnnInputMode& input_mode,
+                                const AlgorithmConfig& algo_config,
+                                RnnStateCache* cache,
+                                RnnDescriptor** rnn_desc) {
+    RnnScratchSpace& rnn_state = (*cache)[model_shapes];
+    if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
+      CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
+          new CudnnRNNPersistentSpaceAllocator(context);
+      rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+      Status status =
+          CreateRnnDescriptor<T>(context, model_shapes, input_mode, algo_config,
+                                 dropout_state_allocator, &rnn_state.rnn_desc);
+      TF_RETURN_IF_ERROR(status);
+    }
+    *rnn_desc = rnn_state.rnn_desc.get();
+    return Status::OK();
+  }
+
  private:
   int seed_;
   int seed2_;
@@ -648,7 +873,7 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(3);
     auto input_ptr = StreamExecutorUtil::AsDeviceMemory<T>(input);
-    auto* stream = context->op_device_context()->stream();
+    Stream* stream = context->op_device_context()->stream();
 
     std::unique_ptr<RnnDescriptor> rnn_desc;
     OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo<T>(context, &rnn_desc));
@@ -789,7 +1014,7 @@ class CudnnRNNCanonicalToParams<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, {params_size}, &output));
     auto output_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*output);
-    auto* stream = context->op_device_context()->stream();
+    Stream* stream = context->op_device_context()->stream();
 
     OpInputList weights;
     OP_REQUIRES_OK(context, context->input_list("weights", &weights));
@@ -816,13 +1041,6 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
-// Pointers to RNN scratch space for a specific set of shape parameters (used as
-// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp).
-struct RnnScratchSpace {
-  std::unique_ptr<RnnDescriptor> rnn_desc;
-  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator;
-};
-
 // Run the forward operation of the RNN model.
 template <typename T>
 class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
@@ -842,115 +1060,71 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
-    const auto& input_shape = model_shapes.input_shape;
-    const auto& hidden_state_shape = model_shapes.hidden_state_shape;
-    const auto& output_shape = model_shapes.output_shape;
-
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-    Tensor* output_h = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, hidden_state_shape, &output_h));
-    Tensor* output_c = nullptr;
-    if (HasInputC()) {
-      // Only LSTM uses input_c and output_c. So for all other models, we only
-      // need to create dummy outputs.
-      OP_REQUIRES_OK(
-          context, context->allocate_output(2, hidden_state_shape, &output_c));
-    } else {
-      OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_c));
-    }
-
-    auto* stream = context->op_device_context()->stream();
-    auto* executor = stream->parent();
     RnnInputMode input_mode;
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    auto data_type = ToDataType<T>::value;
-
-    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
-        input_shape.dim_size(0), input_shape.dim_size(1),
-        input_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(input_desc_s));
-    auto input_desc = input_desc_s.ConsumeValueOrDie();
-
-    auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
-        hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
-        hidden_state_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(hidden_state_desc_s));
-    auto hidden_state_desc = hidden_state_desc_s.ConsumeValueOrDie();
-
-    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
-        output_shape.dim_size(0), output_shape.dim_size(1),
-        output_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(output_desc_s));
-    auto output_desc = output_desc_s.ConsumeValueOrDie();
-
-    auto input_data = AsDeviceMemory<T>(input);
-    auto input_h_data = AsDeviceMemory<T>(input_h);
-    DeviceMemory<T> input_c_data;
-    if (HasInputC()) {
-      input_c_data = AsDeviceMemory<T>(input_c);
-    }
-    auto params_data = AsDeviceMemory<T>(params);
-    auto output_data = AsDeviceMemory<T>(output);
-    auto output_h_data = AsDeviceMemory<T>(output_h);
-    DeviceMemory<T> output_c_data;
-    if (HasInputC()) {
-      output_c_data = AsDeviceMemory<T>(output_c);
-    }
 
+    Tensor* output = nullptr;
+    Tensor* output_h = nullptr;
+    Tensor* output_c = nullptr;
+    OP_REQUIRES_OK(context, AllocateOutputs(context, model_shapes, &output,
+                                            &output_h, &output_c));
+
+    AlgorithmConfig algo_config;
     // Creates a memory callback for the reserve_space. The memory lives in the
     // output of this kernel. And it will be fed into the backward pass when
     // needed.
     CudnnRnnAllocatorInOutput<T> reserve_space_allocator(context, 3);
-    if (!is_training_) {
-      Tensor* dummy_reserve_space = nullptr;
-      OP_REQUIRES_OK(context,
-                     context->allocate_output(3, {}, &dummy_reserve_space));
-    }
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
-    bool launch_status = false;
+    Status launch_status;
     {
       mutex_lock l(mu_);
-      RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes];
-      if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
-        CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
-            new CudnnRNNPersistentSpaceAllocator(context);
-        rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
-        const AlgorithmConfig algo_config;
-        auto rnn_desc_s = executor->createRnnDescriptor(
-            model_shapes.num_layers, model_shapes.num_units,
-            model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, algo_config, dropout(), seed(),
-            dropout_state_allocator);
-        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-        rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
-      }
-      launch_status =
-          stream
-              ->ThenRnnForward(
-                  *rnn_state.rnn_desc, *input_desc, input_data,
-                  *hidden_state_desc, input_h_data, *hidden_state_desc,
-                  input_c_data, params_data, *output_desc, &output_data,
-                  *hidden_state_desc, &output_h_data, *hidden_state_desc,
-                  &output_c_data, is_training_, &reserve_space_allocator,
-                  &workspace_allocator, /*output_result_profile=*/nullptr)
-              .ok();
+      RnnDescriptor* rnn_desc_ptr = nullptr;
+      OP_REQUIRES_OK(
+          context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
+                                             algo_config, &rnn_state_cache_,
+                                             &rnn_desc_ptr));
+      launch_status = DoForward<T>(
+          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+          input_c, params, is_training_, output, output_h, output_c,
+          &reserve_space_allocator, &workspace_allocator,
+          /*output_profile_result=*/nullptr);
     }
-    OP_REQUIRES(context, launch_status,
-                errors::Internal("Failed to call ThenRnnForward"));
+    OP_REQUIRES_OK(context, launch_status);
   }
 
  private:
+  Status AllocateOutputs(OpKernelContext* context,
+                         const CudnnRnnModelShapes& model_shapes,
+                         Tensor** output, Tensor** output_h,
+                         Tensor** output_c) {
+    const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+    const TensorShape& output_shape = model_shapes.output_shape;
+
+    TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, output));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(1, hidden_state_shape, output_h));
+    if (HasInputC()) {
+      TF_RETURN_IF_ERROR(
+          context->allocate_output(2, hidden_state_shape, output_c));
+    } else {
+      // Only LSTM uses input_c and output_c. So for all other models, we only
+      // need to create dummy outputs.
+      TF_RETURN_IF_ERROR(context->allocate_output(2, {}, output_c));
+    }
+    if (!is_training_) {
+      Tensor* dummy_reserve_space = nullptr;
+      TF_RETURN_IF_ERROR(context->allocate_output(3, {}, &dummy_reserve_space));
+    }
+    return Status::OK();
+  }
+
   mutex mu_;
   bool is_training_;
-  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
-                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
-      rnn_state_cache_ GUARDED_BY(mu_);
+  RnnStateCache rnn_state_cache_ GUARDED_BY(mu_);
 };
 
 #define REGISTER_GPU(T)                                           \
@@ -981,184 +1155,141 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
+    RnnInputMode input_mode;
+    OP_REQUIRES_OK(context,
+                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
+                                  model_shapes.input_size, &input_mode));
 
-    const auto& input_shape = model_shapes.input_shape;
-    const auto& hidden_state_shape = model_shapes.hidden_state_shape;
-    const auto& output_shape = model_shapes.output_shape;
-
-    auto data_type = ToDataType<T>::value;
     const Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->input("output", &output));
-    OP_REQUIRES(context, output_shape == output->shape(),
-                errors::InvalidArgument(
-                    "input_h and input_c must have the same shape: ",
-                    input_h->shape().DebugString(), " ",
-                    input_c->shape().DebugString()));
     const Tensor* output_h = nullptr;
-    OP_REQUIRES_OK(context, context->input("output_h", &output_h));
-    OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
-                errors::InvalidArgument(
-                    "Invalid output_h shape: ", output_h->shape().DebugString(),
-                    " ", hidden_state_shape.DebugString()));
     const Tensor* output_c = nullptr;
-    if (HasInputC()) {
-      // Only LSTM uses input_c and output_c. So for all other models, we only
-      // need to create dummy outputs.
-      OP_REQUIRES_OK(context, context->input("output_c", &output_c));
-      OP_REQUIRES(context, output_c->shape() == hidden_state_shape,
-                  errors::InvalidArgument("Invalid output_c shape: ",
-                                          output_c->shape().DebugString(), " ",
-                                          hidden_state_shape.DebugString()));
-    }
-
     const Tensor* output_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->input("output_backprop", &output_backprop));
-    OP_REQUIRES(context, output_backprop->shape() == output_shape,
-                errors::InvalidArgument("Invalid output_backprop shapes: ",
-                                        output_backprop->shape().DebugString(),
-                                        " ", output_shape.DebugString()));
-
     const Tensor* output_h_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->input("output_h_backprop", &output_h_backprop));
-    OP_REQUIRES(
-        context, output_h_backprop->shape() == hidden_state_shape,
-        errors::InvalidArgument("Invalid output_h_backprop shapes: ",
-                                output_h_backprop->shape().DebugString(), " ",
-                                hidden_state_shape.DebugString()));
     const Tensor* output_c_backprop = nullptr;
-    if (HasInputC()) {
-      OP_REQUIRES_OK(context,
-                     context->input("output_c_backprop", &output_c_backprop));
-      OP_REQUIRES(
-          context, output_c_backprop->shape() == hidden_state_shape,
-          errors::InvalidArgument("Invalid output_c_backprop shapes: ",
-                                  output_c_backprop->shape().DebugString(), " ",
-                                  hidden_state_shape.DebugString()));
-    }
-    const Tensor* reserve_space_const = nullptr;
-    // This is the same "reserve_space" created by the forward op.
-    // It can also be modified by this backward operation.
+    const Tensor* reserve_space = nullptr;
     OP_REQUIRES_OK(context,
-                   context->input("reserve_space", &reserve_space_const));
-    // Cudnn needs the reserve space to be writeable. This is fine because they
-    // are opaque.
-    Tensor* reserve_space = const_cast<Tensor*>(reserve_space_const);
+                   ExtractBackwardInputs(context, model_shapes, model_types(),
+                                         &output, &output_h, &output_c,
+                                         &output_backprop, &output_h_backprop,
+                                         &output_c_backprop, &reserve_space));
 
     Tensor* input_backprop = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output(0, input->shape(), &input_backprop));
     Tensor* input_h_backprop = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(1, input_h->shape(),
-                                                     &input_h_backprop));
     Tensor* input_c_backprop = nullptr;
-    if (HasInputC()) {
-      OP_REQUIRES_OK(context, context->allocate_output(2, input_c->shape(),
-                                                       &input_c_backprop));
-    } else {
-      OP_REQUIRES_OK(context,
-                     context->allocate_output(2, {}, &input_c_backprop));
-    }
     Tensor* params_backprop = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(3, params->shape(),
-                                                     &params_backprop));
-
-    auto* stream = context->op_device_context()->stream();
-    auto* executor = stream->parent();
-    RnnInputMode input_mode;
     OP_REQUIRES_OK(context,
-                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
-                                  model_shapes.input_size, &input_mode));
+                   AllocateOutputs(context, model_shapes, params->shape(),
+                                   &input_backprop, &input_h_backprop,
+                                   &input_c_backprop, &params_backprop));
 
-    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
-        input_shape.dim_size(0), input_shape.dim_size(1),
-        input_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(input_desc_s));
-    auto input_desc = input_desc_s.ConsumeValueOrDie();
-
-    auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
-        hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
-        hidden_state_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(hidden_state_desc_s));
-    auto hidden_state_desc = hidden_state_desc_s.ConsumeValueOrDie();
-
-    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
-        output_shape.dim_size(0), output_shape.dim_size(1),
-        output_shape.dim_size(2), data_type);
-    OP_REQUIRES_OK(context, FromExecutorStatus(output_desc_s));
-    auto output_desc = output_desc_s.ConsumeValueOrDie();
-
-    auto input_data = AsDeviceMemory<T>(input);
-    auto input_h_data = AsDeviceMemory<T>(input_h);
-    DeviceMemory<T> input_c_data;
-    if (HasInputC()) {
-      input_c_data = AsDeviceMemory<T>(input_c);
-    }
-    auto params_data = AsDeviceMemory<T>(params);
-    auto output_data = AsDeviceMemory<T>(output);
-    auto output_h_data = AsDeviceMemory<T>(output_h);
-    DeviceMemory<T> output_c_data;
-    if (HasInputC()) {
-      output_c_data = AsDeviceMemory<T>(output_c);
-    }
-    auto output_backprop_data = AsDeviceMemory<T>(output_backprop);
-    auto output_h_backprop_data = AsDeviceMemory<T>(output_h_backprop);
-    DeviceMemory<T> output_c_backprop_data;
-    if (HasInputC()) {
-      output_c_backprop_data = AsDeviceMemory<T>(output_c_backprop);
-    }
-    auto input_backprop_data = AsDeviceMemory<T>(input_backprop);
-    auto input_h_backprop_data = AsDeviceMemory<T>(input_h_backprop);
-    DeviceMemory<T> input_c_backprop_data;
-    if (HasInputC()) {
-      input_c_backprop_data = AsDeviceMemory<T>(input_c_backprop);
-    }
-    auto params_backprop_data = AsDeviceMemory<T>(params_backprop);
-    auto reserve_space_uint8 = CastDeviceMemory<uint8, T>(reserve_space);
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
-    bool launch_status = false;
+    const AlgorithmConfig default_algo_config;
+    Status launch_status;
     {
       mutex_lock l(mu_);
-      RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes];
-      if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
-        CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
-            new CudnnRNNPersistentSpaceAllocator(context);
-        rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
-        const AlgorithmConfig algo_config;
-        auto rnn_desc_s = executor->createRnnDescriptor(
-            model_shapes.num_layers, model_shapes.num_units,
-            model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, algo_config, dropout(), seed(),
-            dropout_state_allocator);
-        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-        rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
-      }
-      launch_status =
-          stream
-              ->ThenRnnBackward(
-                  *rnn_state.rnn_desc, *input_desc, input_data,
-                  *hidden_state_desc, input_h_data, *hidden_state_desc,
-                  input_c_data, params_data, *output_desc, output_data,
-                  *hidden_state_desc, output_h_data, *hidden_state_desc,
-                  output_c_data, output_backprop_data, output_h_backprop_data,
-                  output_c_backprop_data, &input_backprop_data,
-                  &input_h_backprop_data, &input_c_backprop_data,
-                  &params_backprop_data, &reserve_space_uint8,
-                  &workspace_allocator, /*output_result_profile=*/nullptr)
-              .ok();
+      RnnDescriptor* rnn_desc_ptr = nullptr;
+      OP_REQUIRES_OK(
+          context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
+                                             default_algo_config,
+                                             &rnn_state_cache_, &rnn_desc_ptr));
+      launch_status = DoBackward<T>(
+          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+          input_c, params, output, output_h, output_c, output_backprop,
+          output_h_backprop, output_c_backprop, reserve_space, input_backprop,
+          input_h_backprop, input_c_backprop, params_backprop,
+          &workspace_allocator, /*output_profile_result=*/nullptr);
     }
-    OP_REQUIRES(context, launch_status,
-                errors::Internal("Failed to call ThenRnnBackward"));
+    OP_REQUIRES_OK(context, launch_status);
   }
 
  private:
   mutex mu_;
-  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
-                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
-      rnn_state_cache_ GUARDED_BY(mu_);
+  RnnStateCache rnn_state_cache_ GUARDED_BY(mu_);
+
+  Status ExtractBackwardInputs(
+      OpKernelContext* context, const CudnnRnnModelShapes& model_shapes,
+      const CudnnModelTypes& model_types, const Tensor** output,
+      const Tensor** output_h, const Tensor** output_c,
+      const Tensor** output_backprop, const Tensor** output_h_backprop,
+      const Tensor** output_c_backprop, const Tensor** reserve_space) {
+    TF_RETURN_IF_ERROR(context->input("output", output));
+    TF_RETURN_IF_ERROR(context->input("output_backprop", output_backprop));
+    TF_RETURN_IF_ERROR(context->input("output_h", output_h));
+    TF_RETURN_IF_ERROR(context->input("output_h_backprop", output_h_backprop));
+    if (model_types.HasInputC()) {
+      TF_RETURN_IF_ERROR(context->input("output_c", output_c));
+      TF_RETURN_IF_ERROR(
+          context->input("output_c_backprop", output_c_backprop));
+    }
+    TF_RETURN_IF_ERROR(context->input("reserve_space", reserve_space));
+    const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+    const TensorShape& output_shape = model_shapes.output_shape;
+
+    if (output_shape != (*output)->shape()) {
+      return errors::InvalidArgument(
+          "Invalid output shape: ", (*output)->shape().DebugString(), " ",
+          output_shape.DebugString());
+    }
+    if (hidden_state_shape != (*output_h)->shape()) {
+      return errors::InvalidArgument(
+          "Invalid output_h shape: ", (*output_h)->shape().DebugString(), " ",
+          hidden_state_shape.DebugString());
+    }
+
+    if (output_shape != (*output_backprop)->shape()) {
+      return errors::InvalidArgument("Invalid output_backprop shape: ",
+                                     (*output_backprop)->shape().DebugString(),
+                                     " ", output_shape.DebugString());
+    }
+    if (hidden_state_shape != (*output_h_backprop)->shape()) {
+      return errors::InvalidArgument(
+          "Invalid output_h_backprop shape: ",
+          (*output_h_backprop)->shape().DebugString(), " ",
+          hidden_state_shape.DebugString());
+    }
+
+    if (model_types.HasInputC()) {
+      if (hidden_state_shape != (*output_c)->shape()) {
+        return errors::InvalidArgument(
+            "Invalid output_c shape: ", (*output_c)->shape().DebugString(), " ",
+            hidden_state_shape.DebugString());
+      }
+      if (hidden_state_shape != (*output_c_backprop)->shape()) {
+        return errors::InvalidArgument(
+            "Invalid output_c_backprop shape: ",
+            (*output_c_backprop)->shape().DebugString(), " ",
+            hidden_state_shape.DebugString());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status AllocateOutputs(OpKernelContext* context,
+                         const CudnnRnnModelShapes& model_shapes,
+                         const TensorShape& params_shape,
+                         Tensor** input_backprop, Tensor** input_h_backprop,
+                         Tensor** input_c_backprop, Tensor** params_backprop) {
+    const TensorShape& input_shape = model_shapes.input_shape;
+    const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
+
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(0, input_shape, input_backprop));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(1, hidden_state_shape, input_h_backprop));
+    if (HasInputC()) {
+      TF_RETURN_IF_ERROR(
+          context->allocate_output(2, hidden_state_shape, input_c_backprop));
+    } else {
+      // Only LSTM uses input_c and output_c. So for all other models, we only
+      // need to create dummy outputs.
+      TF_RETURN_IF_ERROR(context->allocate_output(2, {}, input_c_backprop));
+    }
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(3, params_shape, params_backprop));
+    return Status::OK();
+  }
 };
 
 #define REGISTER_GPU(T)                                                   \
-- 
GitLab


From a4b408543dd3b882131f522359bcb547c7972e4f Mon Sep 17 00:00:00 2001
From: Jeremy Lau <lauj@google.com>
Date: Fri, 13 Apr 2018 17:36:00 -0700
Subject: [PATCH 0811/1262] VLOG(1) all OutOfRange CtxFailures, and
 LOG(WARNING) all other CtxFailures. This unifies the logging behavior of the
 OP_REQUIRES and OP_REQUIRES_OK macros.

PiperOrigin-RevId: 192848921
---
 tensorflow/core/framework/op_kernel.cc | 48 +++++++++++++++-----------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 05171006b0..ca91d68f79 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -1273,51 +1273,59 @@ const Eigen::SyclDevice& OpKernelContext::eigen_device() const {
 }
 #endif
 
+namespace {
+template <class OpKernelT>
+void CtxFailureInternal(OpKernelT* op_kernel, const char* file, int line,
+                        const Status& s) {
+  const string logging_prefix =
+      file == nullptr ? "CtxFailure: "
+                      : strings::StrCat("CtxFailure at ", io::Basename(file),
+                                        ":", line, ": ");
+
+  if (errors::IsOutOfRange(s)) {
+    // VLOG OutOfRange errors. Dataset ops create OutOfRange errors when they
+    // reach end-of-sequence.
+    VLOG(1) << logging_prefix << s;
+  } else {
+    LOG(WARNING) << logging_prefix << s;
+  }
+  op_kernel->SetStatus(s);
+}
+}  // anonymous namespace
+
 void OpKernelConstruction::CtxFailure(const Status& s) {
-  VLOG(1) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelConstruction::CtxFailureWithWarning(const Status& s) {
-  LOG(WARNING) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelConstruction::CtxFailure(const char* file, int line,
                                       const Status& s) {
-  VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-          << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 void OpKernelConstruction::CtxFailureWithWarning(const char* file, int line,
                                                  const Status& s) {
-  LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-               << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 void OpKernelContext::CtxFailure(const Status& s) {
-  VLOG(1) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelContext::CtxFailureWithWarning(const Status& s) {
-  LOG(WARNING) << s;
-  SetStatus(s);
+  CtxFailureInternal(this, nullptr, 0, s);
 }
 
 void OpKernelContext::CtxFailure(const char* file, int line, const Status& s) {
-  VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-          << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 void OpKernelContext::CtxFailureWithWarning(const char* file, int line,
                                             const Status& s) {
-  LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
-               << " : " << s;
-  SetStatus(s);
+  CtxFailureInternal(this, file, line, s);
 }
 
 }  // namespace tensorflow
-- 
GitLab


From 6e533eb718b33f23ab3f06025cbf680258534d76 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Fri, 13 Apr 2018 17:47:58 -0700
Subject: [PATCH 0812/1262] Add a caveat about make_initiliazable_iterator to
 the README.

PiperOrigin-RevId: 192850014
---
 tensorflow/contrib/distribute/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 14de1e8f49..2482731198 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -130,6 +130,8 @@ adjusting your learning rate or batch size according to the number of GPUs.
 We are working on addressing this limitation by splitting each batch across GPUs
 instead.
 * PartitionedVariables are not supported yet.
+* Input pipelines with Datasets that capture stateful objects and rely on
+`make_initializable_iterator` are not supported yet.
 
 ## What's next?
 
-- 
GitLab


From ef24ad14502e992716c49fdd5c63e6b2c2fb6b5a Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Fri, 13 Apr 2018 17:51:37 -0700
Subject: [PATCH 0813/1262] Java: Bump release to 1.8.0-rc0

PiperOrigin-RevId: 192850310
---
 tensorflow/java/maven/libtensorflow/pom.xml         | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml     | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +-
 tensorflow/java/maven/pom.xml                       | 2 +-
 tensorflow/java/maven/proto/pom.xml                 | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml            | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index c99d04869a..9c1601753b 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0</version>
+    <version>1.8.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 4561c2c8ad..3d013e12b0 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0</version>
+    <version>1.8.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 82a2b8e769..40e44af1f5 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0</version>
+    <version>1.8.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 4c1ec0cc80..82bfd0c73a 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index fcd8236bad..0a2775a500 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0</version>
+    <version>1.8.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 241581713a..61961432a7 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0</version>
+    <version>1.8.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From 3652556dab3ebfe0152232facc7304fe5754aecb Mon Sep 17 00:00:00 2001
From: Scott Zhu <scottzhu@google.com>
Date: Fri, 13 Apr 2018 17:52:20 -0700
Subject: [PATCH 0814/1262] Merge changes from github.

PiperOrigin-RevId: 192850372
---
 tensorflow/BUILD                              |   5 +-
 tensorflow/compiler/jit/BUILD                 |   1 +
 .../compiler/jit/mark_for_compilation_pass.cc |   4 +
 tensorflow/contrib/cmake/external/grpc.cmake  |   1 +
 .../copy_graph/python/util/copy_elements.py   |   4 +-
 tensorflow/contrib/data/__init__.py           |   2 +
 .../contrib/data/python/kernel_tests/BUILD    |   1 +
 .../kernel_tests/batch_dataset_op_test.py     |  70 ++++
 .../kernel_tests/sequence_dataset_op_test.py  |  10 +
 tensorflow/contrib/data/python/ops/BUILD      |   1 +
 .../contrib/data/python/ops/batching.py       |  41 ++
 .../contrib/distribute/python/values.py       |   2 +-
 .../contrib/kernel_methods/python/losses.py   |   6 +-
 .../python/mappers/random_fourier_features.py |  44 +-
 .../mappers/random_fourier_features_test.py   |   2 +-
 .../contrib/kfac/python/ops/fisher_blocks.py  |  82 ++--
 .../contrib/lite/build_ios_universal_lib.sh   |  15 +-
 .../contrib/metrics/python/ops/metric_ops.py  |  29 +-
 tensorflow/contrib/rnn/python/ops/rnn_cell.py |   2 +-
 .../seq2seq/python/ops/attention_wrapper.py   |   4 +-
 tensorflow/contrib/sparsemax/__init__.py      |   2 +-
 .../contrib/sparsemax/python/ops/sparsemax.py |   2 +-
 .../contrib/tensorrt/convert/convert_graph.cc |  10 +-
 .../contrib/tensorrt/convert/convert_nodes.cc |  68 ++-
 .../base_api/api_def_ClipByValue.pbtxt        |  36 ++
 .../python_api/api_def_ClipByValue.pbtxt      |   4 +
 .../core/common_runtime/process_util.cc       |  21 +-
 tensorflow/core/grappler/optimizers/BUILD     |  23 +-
 tensorflow/core/kernels/BUILD                 |   2 +
 tensorflow/core/kernels/cwise_op_abs.cc       |   2 -
 tensorflow/core/kernels/cwise_op_clip.cc      | 225 ++++++++++
 tensorflow/core/kernels/cwise_op_clip.h       |  61 +++
 .../core/kernels/cwise_op_clip_gpu.cu.cc      | 134 ++++++
 tensorflow/core/kernels/maxpooling_op.cc      |  93 ++++-
 .../core/kernels/segment_reduction_ops.h      |   6 +
 tensorflow/core/ops/dataset_ops.cc            |  12 +-
 tensorflow/core/ops/math_ops.cc               |   8 +
 tensorflow/core/platform/macros.h             |   9 +-
 .../docs_src/community/documentation.md       |  18 +-
 tensorflow/docs_src/extend/adding_an_op.md    | 159 +++----
 .../docs_src/get_started/custom_estimators.md |   2 +-
 tensorflow/docs_src/install/install_c.md      |   2 +-
 .../docs_src/performance/performance_guide.md |   8 +-
 .../docs_src/programmers_guide/debugger.md    |  61 ++-
 tensorflow/python/BUILD                       |   1 +
 tensorflow/python/framework/dtypes.py         |  10 +
 tensorflow/python/framework/dtypes_test.py    |   5 +
 tensorflow/python/framework/function_test.py  |   3 +-
 tensorflow/python/framework/tensor_shape.py   |   3 +
 .../python/framework/tensor_shape_test.py     |   5 +
 .../keras/_impl/keras/utils/io_utils.py       |  14 +-
 .../python/kernel_tests/clip_ops_test.py      | 124 +++++-
 .../python/kernel_tests/pooling_ops_test.py   |   6 -
 tensorflow/python/ops/clip_ops.py             |  30 ++
 tensorflow/python/ops/hidden_ops.txt          | 395 ++++++++++++++++++
 tensorflow/python/util/tf_inspect.py          |  43 +-
 tensorflow/tensorflow.bzl                     |  53 ++-
 .../tools/api/generator/create_python_api.py  |   3 +-
 tensorflow/tools/docker/Dockerfile            |   2 +-
 tensorflow/tools/docker/Dockerfile.devel      |   2 +
 tensorflow/tools/docker/Dockerfile.devel-gpu  |   2 +
 tensorflow/tools/docker/Dockerfile.gpu        |   2 +-
 .../notebooks/3_mnist_from_scratch.ipynb      |   6 +-
 .../docker/parameterized_docker_build.sh      |   4 +-
 tensorflow/tools/docs/BUILD                   |   2 +-
 tensorflow/tools/docs/build_docs_test.py      |   5 -
 tensorflow/tools/docs/generate_lib.py         |  19 +-
 tensorflow/tools/docs/generate_lib_test.py    |   3 -
 tensorflow/tools/docs/parser.py               |  56 ++-
 tensorflow/tools/docs/parser_test.py          |  80 +++-
 tensorflow/tools/docs/pretty_docs.py          |  12 +-
 tensorflow/tools/docs/py_guide_parser.py      |   2 +-
 tensorflow/workspace.bzl                      |  13 +-
 73 files changed, 1797 insertions(+), 402 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
 create mode 100644 tensorflow/core/kernels/cwise_op_clip.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_clip.h
 create mode 100644 tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
 create mode 100644 tensorflow/python/ops/hidden_ops.txt

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index cfafffdd13..f2ad16fa04 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -450,11 +450,12 @@ tf_cc_shared_object(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:framework_internal_impl",
+        "//tensorflow/core:gpu_runtime_impl",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
         "//tensorflow/core:lib_internal_impl",
-        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/stream_executor:stream_executor_impl",
-        "//tensorflow/core:gpu_runtime_impl",
     ] + tf_additional_binary_deps(),
 )
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 6edeb7047f..50fa95c4f3 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -318,6 +318,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:bounds_check",
     ],
 )
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 0c9fbf3d54..8e2ee0f1d7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/version.h"
 
@@ -441,6 +442,9 @@ string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src,
   }
 
   auto node_name = [&cycles, &graph](int node_id) {
+    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
+      return string("(null)");
+    }
     auto* node = graph.FindNodeId(node_id);
     if (node == nullptr) {
       return string("(null)");
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index bec8177a3f..35c2a294ec 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -35,6 +35,7 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index b806799202..102bc460fd 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
     #stores String-based info such as name, device and type of the op.
     #Unique to every Operation instance.
-    new_node_def = deepcopy(op._node_def)
+    new_node_def = deepcopy(op.node_def)
     #Change the name
     new_node_def.name = new_name
 
@@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
 
     #Make a copy of the op_def too.
     #Its unique to every _type_ of Operation.
-    op_def = deepcopy(op._op_def)
+    op_def = deepcopy(op.op_def)
 
     #Initialize a new Operation instance
     new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index f58e5ec1f0..637b1dc46c 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -25,6 +25,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@Counter
 @@SqlDataset
 
+@@assert_element_shape
 @@batch_and_drop_remainder
 @@bucket_by_sequence_length
 @@dense_to_sparse_batch
@@ -55,6 +56,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 
+from tensorflow.contrib.data.python.ops.batching import assert_element_shape
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import map_and_batch
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index a8481dc90a..b475c9fa6b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -21,6 +21,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 75482f67da..413d873797 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -28,8 +28,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -579,5 +581,73 @@ class PaddedBatchDatasetSerializationTest(
                         lambda: build_dataset(seq_lens2), 8)
 
 
+class RestructuredDatasetTest(test.TestCase):
+
+  def test_assert_element_shape(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index b044ff1775..d0cb203a3a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest(
     # Skip nothing
     self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
 
+  def testInvalidSkip(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0)
+
   def _build_take_dataset(self, count):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(count)
@@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest(
     # Take nothing
     self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
 
+  def testInvalidTake(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0)
+
   def _build_repeat_dataset(self, count, take_count=3):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 7c28d1f005..0e4590829b 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -112,6 +112,7 @@ py_library(
     srcs = ["batching.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index a212adf6cf..28db949da9 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@@ -345,6 +346,46 @@ class _RestructuredDataset(dataset_ops.Dataset):
     return self._output_shapes
 
 
+def assert_element_shape(expected_shapes):
+  """Assert the shape of this `Dataset`.
+
+  ```python
+  shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
+  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+  print(result.output_shapes)  # ==> "((16, 256), <unknown>)"
+  ```
+
+  If dataset shapes and expected_shape, are fully defined, assert they match.
+  Otherwise, add assert op that will validate the shapes when tensors are
+  evaluated, and set shapes on tensors, respectively.
+
+  Args:
+    expected_shapes: A nested structure of `tf.TensorShape` objects.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}
+  """
+
+  def _check_shape(*elements):
+    flatten_tensors = nest.flatten(elements)
+    flatten_shapes = nest.flatten(expected_shapes)
+    checked_tensors = [
+        with_shape(shape, tensor)
+        for shape, tensor in zip(flatten_shapes, flatten_tensors)
+    ]
+    return nest.pack_sequence_as(elements, checked_tensors)
+
+  def _apply_fn(dataset):
+    return _RestructuredDataset(
+        dataset.map(_check_shape),
+        dataset.output_types,
+        output_shapes=expected_shapes,
+        output_classes=dataset.output_classes)
+
+  return _apply_fn
+
+
 class _MapAndBatchDataset(dataset_ops.MapDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 9acb6a9db9..87bf059038 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -73,7 +73,7 @@ class DistributedValues(object):
 
   @property
   def devices(self):
-    return self._index.keys()
+    return list(self._index.keys())
 
   def __str__(self):
     return "%s:%s" % (self.__class__.__name__, self._index)
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index f182fef067..4ef0a66a52 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
 
   This is a generalization of standard (binary) hinge loss. For a given instance
   with correct label c*, the loss is given by:
-    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+    $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
   or equivalently
-    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
-  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+    $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
+  where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
 
   Args:
     labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 9dc01124ab..9a721a9d44 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
   r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-  ```
-  exp(-||x-y||_2^2 / (2 * sigma^2))
-  ```
+  $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
-  where `d` is the input dimension (number of dense input features) and `D` is
-  the output dimension (i.e., dimension of the feature space the input is mapped
-  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
-  distribution and each entry of `b` is sampled independently and uniformly from
-  [0, 2 * pi].
-
-  For a single input feature vector x in R^d, its RFFM is defined as:
-  ```
-      sqrt(2/D) * cos(x * Omega + b)
-  ```
-  where `cos` is the element-wise cosine function and `x, b` are represented as
-  row vectors. The aforementioned paper shows that the linear kernel of
-  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
+  The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector
+  \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input
+  features) and \\(D\\) is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d.
+  from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled
+  independently and uniformly from [0, \\(2 * \pi\\)].
+
+  For a single input feature vector \\(x \in R^d\\), its RFFM is defined as:
+  $$\sqrt(2/D) * cos(x * \Omega + b)$$
+
+  where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are
+  represented as row vectors. The aforementioned paper shows that the linear
+  kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial
+  vectors.
 
   """
 
   def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None):
-    """Constructs a RandomFourierFeatureMapper instance.
+    r"""Constructs a RandomFourierFeatureMapper instance.
 
     Args:
       input_dim: The dimension (number of features) of the tensors to be mapped.
@@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
       stddev: The standard deviation of the Gaussian kernel to be approximated.
         The error of the classifier trained using this approximation is very
         sensitive to this parameter.
-      seed: An integer used to initialize the parameters (`Omega` and `b`) of
-        the mapper. For repeatable sequences across different invocations of the
-        mapper object (for instance, to ensure consistent mapping both at
-        training and eval/inference if these happen in different invocations),
-        set this to the same integer.
+      seed: An integer used to initialize the parameters (\\(\Omega\\) and
+        \\(b\\)) of the mapper. For repeatable sequences across different
+        invocations of the mapper object (for instance, to ensure consistent
+        mapping both at training and eval/inference if these happen in
+        different invocations), set this to the same integer.
       name: name for the mapper object.
     """
     # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
index 6f4a264485..91929184a2 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@@ -34,7 +34,7 @@ def _inner_product(x, y):
   """Inner product between tensors x and y.
 
   The input tensors are assumed to be in ROW representation, that is, the method
-  returns x * y^T.
+  returns \\(x * y^T\\).
 
   Args:
     x: input tensor in row format
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index e0d9cb5ea9..00b3673a74 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior
 distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
 Fisher Information matrix is given by,
 
-  F(params) = E[ v(x, y, params) v(x, y, params)^T ]
+  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
 
 where,
 
-  v(x, y, params) = (d / d params) log p(y | x, params)
+  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
 
 and the expectation is taken with respect to the data's distribution for 'x' and
 the model's posterior distribution for 'y',
@@ -85,7 +85,7 @@ def normalize_damping(damping, num_replications):
 def compute_pi_tracenorm(left_cov, right_cov):
   """Computes the scalar constant pi for Tikhonov regularization/damping.
 
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
   See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
 
   Args:
@@ -462,14 +462,14 @@ class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider fully connected layer in this model with (unshared) weight matrix
   'w'. For an example 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( a (d loss / d s)^T )
+    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
 
   This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
   to the layer's parameters 'w'.
@@ -532,14 +532,14 @@ class ConvDiagonalFB(InputOutputMultiTower, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider a convoluational layer in this model with (unshared) filter matrix
   'w'. For an example image 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )
+    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
 
   where 'loc' is a single (x, y) location in an image.
 
@@ -805,12 +805,12 @@ class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
   'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
   this FisherBlock estimates,
 
-    F(w) = #locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])
+    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
+                                  E[flat(ds) flat(ds)^T])$$
 
   where
 
-    ds = (d / ds) log p(y | x, w)
+    $$ds = (d / ds) log p(y | x, w)$$
     #locations = number of (x, y) locations where 'w' is applied.
 
   where the expectation is taken over all examples and locations and flat()
@@ -1567,7 +1567,7 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
 
     if self._option == SeriesFBApproximation.option1:
 
-      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
       L_A, psi_A = self._input_factor.get_option1quants(
           self._input_damping_func)
       L_G, psi_G = self._output_factor.get_option1quants(
@@ -1581,33 +1581,33 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
         T = self._num_timesteps
         return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
 
-      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
       # Even though Y is Z-independent we are recomputing it from the psi's
       # each since Y depends on both A and G quantities, and it is relatively
       # cheap to compute.
       Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
 
-      # Z = L_G^T * Z * L_A
+      # \\(Z = L_G^T * Z * L_A\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = U_G^T * Z * U_A
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = U_G^T * Z * U_A\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
 
-      # Z = Z .* Y
+      # \\(Z = Z .* Y\\)
       Z *= Y
 
-      # Z = L_G * Z * L_A^T
+      # \\(Z = L_G * Z * L_A^T\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = U_G * Z * U_A^T
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = U_G * Z * U_A^T\\)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
 
     elif self._option == SeriesFBApproximation.option2:
 
-      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
-      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
+      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
       P_A, K_A, mu_A = self._input_factor.get_option2quants(
           self._input_damping_func)
       P_G, K_G, mu_G = self._output_factor.get_option2quants(
@@ -1616,26 +1616,26 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
       # Our approach differs superficially from the pseudo-code in the paper
       # in order to reduce the total number of matrix-matrix multiplies.
       # In particular, the first three computations in the pseudo code are
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = Z - hPsi_G^T * Z * hPsi_A
-      # Z = E_G^T * Z * E_A
-      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
-      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
+      # \\(Z = E_G^T * Z * E_A\\)
+      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
+      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
       # the entire computation can be written as
-      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
-      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
-      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
-      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
-      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
+      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
+      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
+      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
+      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
       # This final expression is computed by the following two lines:
-      # Z = Z - P_G * Z * P_A^T
+      # \\(Z = Z - P_G * Z * P_A^T\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # Z = K_G^T * Z * K_A
+      # \\(Z = K_G^T * Z * K_A\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
 
-      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
       # Be careful with the outer product.  We don't want to accidentally
       # make it an inner-product instead.
       tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@@ -1646,13 +1646,13 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
       # We now perform the transpose/reverse version of the operations
       # derived above, whose derivation from the original pseudo-code is
       # analgous.
-      # Z = K_G * Z * K_A^T
+      # \\(Z = K_G * Z * K_A^T\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
 
-      # Z = Z - P_G^T * Z * P_A
+      # \\(Z = Z - P_G^T * Z * P_A\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
 
-      # Z = normalize (1/E[T]) * Z
+      # \\(Z = normalize (1/E[T]) * Z\\)
       # Note that this normalization is done because we compute the statistics
       # by averaging, not summing, over time. (And the gradient is presumably
       # summed over time, not averaged, and thus their scales are different.)
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index 4a9023ff33..9f398f4a9f 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -19,11 +19,16 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../.."
 
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a
 
 lipo \
 tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 81f05e7ce5..9c8ae48094 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -63,6 +63,8 @@ def _safe_div(numerator, denominator, name):
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
                              labels,
                              weights=None,
@@ -107,6 +109,8 @@ def streaming_true_positives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_negatives(predictions,
                              labels,
                              weights=None,
@@ -151,6 +155,8 @@ def streaming_true_negatives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_positives(predictions,
                               labels,
                               weights=None,
@@ -195,6 +201,8 @@ def streaming_false_positives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_negatives(predictions,
                               labels,
                               weights=None,
@@ -238,6 +246,7 @@ def streaming_false_negatives(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.mean')
 def streaming_mean(values,
                    weights=None,
                    metrics_collections=None,
@@ -287,6 +296,7 @@ def streaming_mean(values,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.mean_tensor')
 def streaming_mean_tensor(values,
                           weights=None,
                           metrics_collections=None,
@@ -340,9 +350,8 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(None,
-            'Please switch to tf.metrics.accuracy. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order '
+            'of the labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -400,6 +409,8 @@ def streaming_accuracy(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.precision. Note that the order '
+            'of the labels and predictions arguments has been switched.')
 def streaming_precision(predictions,
                         labels,
                         weights=None,
@@ -456,6 +467,8 @@ def streaming_precision(predictions,
       name=name)
 
 
+@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
+            'of the labels and predictions arguments has been switched.')
 def streaming_recall(predictions,
                      labels,
                      weights=None,
@@ -975,8 +988,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
+            'the labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1797,9 +1810,9 @@ def streaming_sensitivity_at_specificity(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.precision_at_thresholds. Note that '
+            'the order of the labels and predictions arguments are switched.')
 def streaming_precision_at_thresholds(predictions,
                                       labels,
                                       thresholds,
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 2f6ae9f367..b12e2cd5ed 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2891,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
     output_size = weight.get_shape().as_list()[1]
     g = vs.get_variable(name, [output_size], dtype=weight.dtype)
-    return nn_impl.l2_normalize(weight, dim=0) * g
+    return nn_impl.l2_normalize(weight, axis=0) * g
 
   def _linear(self,
               args,
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 9e0d69593f..f0f143ddfc 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -610,8 +610,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
   addition, once an input sequence element is attended to at a given output
   timestep, elements occurring before it cannot be attended to at subsequent
   output timesteps.  This function generates attention distributions according
-  to these assumptions.  For more information, see ``Online and Linear-Time
-  Attention by Enforcing Monotonic Alignments''.
+  to these assumptions.  For more information, see `Online and Linear-Time
+  Attention by Enforcing Monotonic Alignments`.
 
   Args:
     p_choose_i: Probability of choosing input sequence/memory element i.  Should
diff --git a/tensorflow/contrib/sparsemax/__init__.py b/tensorflow/contrib/sparsemax/__init__.py
index 19d213fb3e..7bc726f4a8 100644
--- a/tensorflow/contrib/sparsemax/__init__.py
+++ b/tensorflow/contrib/sparsemax/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Module that implements sparsemax and sparsemax loss, see [1].
 
-[1] https://arxiv.org/abs/1602.02068
+[1]: https://arxiv.org/abs/1602.02068
 
 ## Sparsemax
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 890ca20f4c..e617af2ff1 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -31,7 +31,7 @@ def sparsemax(logits, name=None):
   """Computes sparsemax activations [1].
 
   For each batch `i` and class `j` we have
-    sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)
+    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
 
   [1]: https://arxiv.org/abs/1602.02068
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index ff8cc6374d..b412b296e0 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -405,7 +405,13 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                          max_mem_per_engine, static_graph_properties,
                          &output_edge_map, precision_mode);
     if (precision_mode == INT8MODE) {
-      TF_RETURN_IF_ERROR(GetCalibNode(&p));
+      tensorflow::Status status = GetCalibNode(&p);
+      if (status != tensorflow::Status::OK()) {
+        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
+      }
     } else {
       tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
       if (status != tensorflow::Status::OK()) {
@@ -414,8 +420,8 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                      << "\" SKIPPING......( " << subgraph_node_names.size()
                      << " nodes)";
       }
-      count++;
     }
+    count++;
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index e920a797fe..b81ae9dc3e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -443,7 +443,9 @@ class Converter {
        * 2) Control dependency inputs contain caret at the beginning and we
        *    remove this and annotate the edge as a control dependency.
        ************************************************************************/
-      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
+      // skip control nodes
+      if (input_name[0] == '^') continue;
+      string name = input_name;
       auto first = name.find_first_of(':');
       if (first != string::npos && first + 2 == name.size() &&
           name[first + 1] == '0')
@@ -2262,6 +2264,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
   auto ws = new tensorflow::tensorrt::TRTWeightStore();
   TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
   Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (const std::pair<int, int>& input : s.input_inds) {
@@ -2270,20 +2273,41 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     int output_idx = input.second;
     tensorflow::Node* node = s.graph.FindNodeId(node_id);
     auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
-    // TODO(jie): alternative :)
-    if (!s.graph_properties.HasOutputProperties(node_name))
+    // input_names should use the node name in the graph
+    // here it should be the input tensor name -> matching the binding
+    // insert original node name without port
+    auto tensor_name = node_name;
+    if (output_idx != 0) {
+      tensor_name = StrCat(tensor_name, ":", output_idx);
+    }
+
+    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
+            << " idx: " << output_idx;
+
+    auto shape_inference_node_name = node_name;
+    auto shape_inference_output_idx = output_idx;
+    // rewire the shape inference to original node in the graph
+    if (s.output_edge_map->count(tensor_name)) {
+      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
+      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
+    }
+    if (shape_inference_output_idx < 0) continue;
+    VLOG(2) << "shapeinference name: " << shape_inference_node_name
+            << " idx: " << shape_inference_output_idx;
+
+    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
       return tensorflow::errors::Internal("failed to find input node: " +
-                                          node_name);
+                                          shape_inference_node_name);
 
-    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
+    auto op_info_vec =
+        s.graph_properties.GetOutputProperties(shape_inference_node_name);
+    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
       return tensorflow::errors::Internal(
-          "accessing output index of: ", output_idx, ", at node: ", node_name,
-          "with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(output_idx);
+          "accessing output index of: ", shape_inference_output_idx,
+          ", at node: ", shape_inference_node_name,
+          " with output entry from shape_map: ", op_info_vec.size());
 
+    auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
@@ -2294,16 +2318,23 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
                    << "' failed";
       return type_status;
     }
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
     VLOG(2) << "accessing output index of: " << output_idx
             << ", at node: " << node_name
             << "with output entry from shape_map: " << op_info_vec.size();
-
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
+    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
+    //            update the code once TRT 4.0 comes out.
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
+
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
@@ -2312,8 +2343,11 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 
     // TODO(ben,jie): proper way to restore input tensor name?
     auto input_tensor_name = node_name;
-    if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx);
+    if (output_idx != 0) {
+      input_tensor_name = StrCat(node_name, ":", output_idx);
+    }
 
+    input_names.push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
 
@@ -2377,11 +2411,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     tensor->setType(trt_dtype);
   }
 
-  VLOG(2) << "finished output";
+  VLOG(2) << "Finished processing outputs";
 
   // Build the engine
   op_res->builder_->setMaxBatchSize(s.max_batch_size);
   op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
+  VLOG(0) << "Max batch size= " << s.max_batch_size
+          << " max workspace size= " << s.max_workspace_size_bytes;
 
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
@@ -2475,7 +2511,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input!!!!!";
+    VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
     int output_idx = input.second;
     tensorflow::Node* node = s.graph.FindNodeId(node_id);
diff --git a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000..803d8970ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ClipByValue"
+  in_arg {
+    name: "t"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "clip_value_min"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The minimum value to clip by.
+END
+  }
+  in_arg {
+    name: "clip_value_max"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The maximum value to clip by.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A clipped `Tensor` with the same shape as input 't'.
+END
+  }
+  summary: "Clips tensor values to a specified min and max."
+  description: <<END
+Given a tensor `t`, this operation returns a tensor of the same type and
+shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+greater than `clip_value_max` are set to `clip_value_max`.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000..cacdd5c2ca
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ClipByValue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index d5bd7f8b98..22fd940d82 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 
+#ifdef INTEL_MKL
+#include <omp.h>
+#endif
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -47,10 +50,24 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
 }
 
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
-  const int32 t = options.config.inter_op_parallelism_threads();
-  if (t != 0) return t;
+  const int32 inter_op = options.config.inter_op_parallelism_threads();
+  if (inter_op != 0) return inter_op;
+#ifdef INTEL_MKL
+  // MKL library executes ops in parallel using OMP threads
+  // Set inter_op conservatively to avoid thread oversubscription that could
+  // lead to severe perf degradations and OMP resource exhaustion
+  const int mkl_intra_op = omp_get_max_threads();
+  CHECK_GE(mkl_intra_op, 1);
+  const int32 mkl_inter_op = std::max(
+      (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+  VLOG(0) << "Creating new thread pool with default inter op setting: "
+          << mkl_inter_op
+          << ". Tune using inter_op_parallelism_threads for best performance.";
+  return mkl_inter_op;
+#else
   // Default to using the number of cores available in the process.
   return port::NumSchedulableCPUs();
+#endif
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index aa5102017c..96342fedc1 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -11,6 +11,10 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_protos_grappler",
 )
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
+)
 
 cc_library(
     name = "static_schedule",
@@ -537,11 +541,28 @@ tf_cuda_cc_test(
     ],
 )
 
+# This rule is header-only unless the build is static (--config=monolithic). Its
+# implementation is included directly in the framework shared object.
 cc_library(
     name = "custom_graph_optimizer_registry",
-    srcs = ["custom_graph_optimizer_registry.cc"],
     hdrs = ["custom_graph_optimizer_registry.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":custom_graph_optimizer",
+        "//tensorflow/core:lib",
+    ] + if_static(
+        [":custom_graph_optimizer_registry_impl"],
+    ),
+)
+
+# This rule contains static variables for the optimizer registry. Do not depend
+# on it directly; use :custom_graph_optimizer_registry, and link against
+# libtensorflow_framework.so for the registry symbols.
+cc_library(
+    name = "custom_graph_optimizer_registry_impl",
+    srcs = ["custom_graph_optimizer_registry.cc"],
+    hdrs = ["custom_graph_optimizer_registry.h"],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":custom_graph_optimizer",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index e2af540dac..7ef15da143 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -3549,6 +3549,7 @@ tf_kernel_library(
         "pooling_ops_3d_gpu.cu.cc",
     ],
     deps = [
+        ":bounds_check",
         ":conv_2d",
         ":conv_3d",
         ":conv_ops",
@@ -3559,6 +3560,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:stream_executor",
         "//third_party/eigen3",
     ],
 )
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 1466f24202..1920c54e80 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -18,9 +18,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
           int64);
-#if !defined(IS_MOBILE_PLATFORM)
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
-#endif
 
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
new file mode 100644
index 0000000000..14d889e8e3
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -0,0 +1,225 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Basic coefficient-wise tenary operations.
+// This is the case for example of the clip_by_value.
+//   Device: E.g., CPUDevice, GPUDevice.
+//   Functor: defined above. E.g., functor::clip.
+template <typename Device, typename T>
+class ClipOp : public OpKernel {
+ public:
+  explicit ClipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& in0 = ctx->input(0);
+    const Tensor& in1 = ctx->input(1);
+    const Tensor& in2 = ctx->input(2);
+
+    auto in0_flat = in0.flat<T>();
+    auto in1_flat = in1.flat<T>();
+    auto in2_flat = in2.flat<T>();
+    const Device& d = ctx->eigen_device<Device>();
+
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    auto out_flat = out->flat<T>();
+    if (in1.shape() == in2.shape()) {
+      if (in0.shape() == in1.shape()) {
+        functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                            out_flat);
+      } else {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                          out_flat);
+      }
+    } else {
+      if (in0.shape() == in1.shape()) {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                               out_flat);
+      } else {
+        OP_REQUIRES(ctx,
+                    (in0.shape() == in2.shape() &&
+                     TensorShapeUtils::IsScalar(in1.shape())),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                                out_flat);
+      }
+    }
+  }
+};
+
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipFunc {
+  UnaryClipFunc(const T& value_min, const T& value_max)
+      : value_min(value_min), value_max(value_max) {}
+  const T operator()(const T& value) const {
+    return std::max(std::min(value, value_max), value_min);
+  }
+  T value_min;
+  T value_max;
+};
+template <typename T>
+struct UnaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat = in0_flat.unaryExpr(UnaryClipFunc<T>(in1_flat(0), in2_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipFunc {
+  explicit BinaryRightClipFunc(const T& value_min) : value_min(value_min) {}
+  const T operator()(const T& value, const T& value_max) const {
+    return std::max(std::min(value, value_max), value_min);
+  }
+  T value_min;
+};
+template <typename T>
+struct BinaryRightClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in2_flat, BinaryRightClipFunc<T>(in1_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipFunc {
+  explicit BinaryLeftClipFunc(const T& value_max) : value_max(value_max) {}
+  const T operator()(const T& value, const T& value_min) const {
+    return std::max(std::min(value, value_max), value_min);
+  }
+  T value_max;
+};
+template <typename T>
+struct BinaryLeftClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in1_flat, BinaryLeftClipFunc<T>(in2_flat(0)));
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_CPU(T)                         \
+  template struct UnaryClipOp<CPUDevice, T>;       \
+  template struct BinaryRightClipOp<CPUDevice, T>; \
+  template struct BinaryLeftClipOp<CPUDevice, T>;  \
+  template struct TernaryClipOp<CPUDevice, T>;
+INSTANTIATE_CPU(Eigen::half);
+INSTANTIATE_CPU(float);
+INSTANTIATE_CPU(double);
+INSTANTIATE_CPU(int8);
+INSTANTIATE_CPU(int16);
+INSTANTIATE_CPU(int32);
+INSTANTIATE_CPU(int64);
+INSTANTIATE_CPU(uint8);
+INSTANTIATE_CPU(uint16);
+#undef INSTANTIATE_CPU
+}  // namespace functor
+
+#define REGISTER_CPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ClipOp<CPUDevice, type>);
+
+REGISTER_CPU_KERNEL(Eigen::half);
+REGISTER_CPU_KERNEL(float);
+REGISTER_CPU_KERNEL(double);
+REGISTER_CPU_KERNEL(int8);
+REGISTER_CPU_KERNEL(int16);
+REGISTER_CPU_KERNEL(int32);
+REGISTER_CPU_KERNEL(int64);
+REGISTER_CPU_KERNEL(uint8);
+REGISTER_CPU_KERNEL(uint16);
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      ClipOp<GPUDevice, type>);
+REGISTER_GPU_KERNEL(Eigen::half);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int64);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(uint16);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("ClipByValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("t")
+                            .HostMemory("clip_value_min")
+                            .HostMemory("clip_value_max")
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("T"),
+                        ClipOp<CPUDevice, int32>);
+
+#undef REGISTER_GPU_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_clip.h b/tensorflow/core/kernels/cwise_op_clip.h
new file mode 100644
index 0000000000..171b6932c2
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.h
@@ -0,0 +1,61 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename Device, typename T>
+struct UnaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename Device, typename T>
+struct BinaryRightClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename Device, typename T>
+struct BinaryLeftClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename Device, typename T>
+struct TernaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
new file mode 100644
index 0000000000..44dea7dee9
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+template <typename T>
+__global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
+                                      const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
+                                            const T *in1, const T *in2,
+                                            T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[i] < in0[i] ? in2[i] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
+                                           const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[i] ? in1[i] : value;
+  }
+}
+
+namespace functor {
+
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    UnaryClipCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+            out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryRightClipCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+            out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryLeftClipCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+            out_flat.data());
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_GPU(T)                         \
+  template struct UnaryClipOp<GPUDevice, T>;       \
+  template struct BinaryRightClipOp<GPUDevice, T>; \
+  template struct BinaryLeftClipOp<GPUDevice, T>;  \
+  template struct TernaryClipOp<GPUDevice, T>;
+INSTANTIATE_GPU(Eigen::half);
+INSTANTIATE_GPU(float);
+INSTANTIATE_GPU(double);
+INSTANTIATE_GPU(int8);
+INSTANTIATE_GPU(int16);
+INSTANTIATE_GPU(int32);
+INSTANTIATE_GPU(int64);
+INSTANTIATE_GPU(uint8);
+INSTANTIATE_GPU(uint16);
+#undef INSTANTIATE_GPU
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 9be7408012..aaaf45d3e7 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/eigen_pooling.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -56,7 +57,7 @@ template <typename Device, typename T>
 static void SpatialMaxPoolWithArgMaxHelper(
     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
-    const PoolParameters& params, const Padding& padding) {
+    const PoolParameters& params) {
   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
       ConstEigenMatrixMap;
   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -151,7 +152,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
       }
     }
 
-    {
+    if (input_backprop != nullptr) {
       auto input_backprop_flat = input_backprop->flat<T>();
       auto out_arg_max_flat = output_arg_max->flat<int64>();
       auto out_backprop_flat = out_backprop.flat<T>();
@@ -173,9 +174,9 @@ static void SpatialMaxPoolWithArgMaxHelper(
         // Although this check is in the inner loop, it is worth its value
         // so we don't end up with memory corruptions. Our benchmark shows that
         // the performance impact is quite small
-        CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
-            << "Invalid input backprop index: " << input_backprop_index << ", "
-            << in_start << ", " << in_end;
+        // CHECK(input_backprop_index >= in_start && input_backprop_index <
+        // in_end)
+        FastBoundsCheck(input_backprop_index - in_start, in_end - in_start);
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
@@ -293,7 +294,7 @@ class MaxPoolingGradOp : public OpKernel {
 
     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
-        out_backprop, params, padding_);
+        out_backprop, params);
   }
 
  private:
@@ -869,6 +870,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& input, Tensor* output, Tensor* argmax,
+                     bool propagate_nans) {
+    Tensor unused;
+    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
+        context, output, argmax, nullptr, input, unused, params);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingWithArgmaxOp : public OpKernel {
  public:
@@ -921,6 +933,53 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingGradWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
+  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+      EigenMatrixMap;
+
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& grad_in, const Tensor& argmax,
+                     Tensor* grad_out) {
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *(context->device()->tensorflow_cpu_worker_threads());
+
+    auto shard = [&grad_in, &argmax, &grad_out](int64 start, int64 limit) {
+      const int64 batch_size =
+          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+      const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
+      const int64 input_size_per_batch = grad_in.NumElements() / batch_size;
+
+      {
+        auto grad_out_flat = grad_out->flat<T>();
+        auto argmax_flat = argmax.flat<int64>();
+        auto grad_in_flat = grad_in.flat<T>();
+
+        const int64 output_start = start * output_size_per_batch;
+        const int64 output_end = limit * output_size_per_batch;
+        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
+                                  output_end - output_start);
+        inputShard.setConstant(T(0));
+
+        const int input_start = start * input_size_per_batch;
+        const int input_end = limit * input_size_per_batch;
+        for (int64 index = input_start; index < input_end; index++) {
+          const int64 grad_out_index = argmax_flat(index);
+          CHECK(grad_out_index >= output_start && grad_out_index < output_end)
+              << "Invalid output gradient index: " << grad_out_index << ", "
+              << output_start << ", " << output_end;
+          grad_out_flat(grad_out_index) += grad_in_flat(index);
+        }
+      }
+    };
+
+    const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+    const int64 shard_cost = grad_out->NumElements() / batch_size;
+    Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
+          shard_cost, shard);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingGradWithArgmaxOp : public OpKernel {
  public:
@@ -1309,7 +1368,17 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
                               .HostMemory("ksize")                       \
                               .HostMemory("strides")                     \
                               .TypeConstraint<T>("T"),                   \
-                          MaxPoolingGradGradOp<D##Device, T>);
+                          MaxPoolingGradGradOp<D##Device, T>)            \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<int64>("Targmax")          \
+                              .TypeConstraint<T>("T"),                   \
+                          MaxPoolingWithArgmaxOp<D##Device, T>);         \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .TypeConstraint<int64>("Targmax"),         \
+                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
 
 // Below kernels implemented only for CPU device.
 #define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
@@ -1374,16 +1443,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
                               .HostMemory("strides")                 \
                               .TypeConstraint<T>("T"),               \
                           MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<int64>("Targmax")      \
-                              .TypeConstraint<T>("T"),               \
-                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .TypeConstraint<int64>("Targmax"),     \
-                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
                               .Device(DEVICE_GPU)                    \
                               .TypeConstraint<T>("T")                \
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index a5186bdacb..183e5a1d58 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,6 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
 
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 2852c49e19..b25abbcc67 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -117,7 +117,11 @@ REGISTER_OP("TakeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SkipDataset")
     .Input("input_dataset: variant")
@@ -125,7 +129,11 @@ REGISTER_OP("SkipDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 1180973e41..8f8443a46c 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1558,6 +1558,14 @@ REGISTER_OP("Bucketize")
     .Attr("boundaries: list(float)")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("ClipByValue")
+    .Input("t: T")
+    .Input("clip_value_min: T")
+    .Input("clip_value_max: T")
+    .Output("output: T")
+    .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
     .Input("inputs: N * T")
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 1b1faed703..3723968175 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -31,13 +31,14 @@ limitations under the License.
   __attribute__((__format__(__printf__, string_index, first_to_check)))
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) \
   __attribute__((__format__(__scanf__, string_index, first_to_check)))
-#elif defined(COMPILER_MSVC)
+#elif defined(_MSC_VER)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
-#define TF_ATTRIBUTE_ALWAYS_INLINE
+#define TF_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
+#define TF_ATTRIBUTE_WEAK
 #define TF_MUST_USE_RESULT
 #define TF_PACKED
 #define TF_PRINTF_ATTRIBUTE(string_index, first_to_check)
@@ -57,7 +58,7 @@ limitations under the License.
 #endif
 
 // Control visiblity outside .so
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_EXPORT __declspec(dllexport)
 #else
@@ -65,7 +66,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 
 #ifdef __has_builtin
 #define TF_HAS_BUILTIN(x) __has_builtin(x)
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 6f2107ef40..d5bc7a5a7a 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -148,19 +148,7 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
 
-1. If pip isn't installed on your machine, install it now by issuing the
-following command:
-
-        $ sudo easy_install pip
-
-2. Use pip to install codegen, mock, and pandas by issuing the following
-   command (Note: If you are using
-   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
-   dependencies, you may not want to use sudo for these installations):
-
-        $ sudo pip install codegen mock pandas
-
-3. If bazel is not installed on your machine, install it now. If you are on
+1. If bazel is not installed on your machine, install it now. If you are on
    Linux, install bazel by issuing the following command:
 
         $ sudo apt-get install bazel  # Linux
@@ -168,10 +156,10 @@ following command:
     If you are on Mac OS, find bazel installation instructions on
     [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
 
-4. Change directory to the top-level `tensorflow` directory of the TensorFlow
+2. Change directory to the top-level `tensorflow` directory of the TensorFlow
    source code.
 
-5. Run the `configure` script and answer its prompts appropriately for your
+3. Run the `configure` script and answer its prompts appropriately for your
    system.
 
         $ ./configure
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 15075e1df8..84da2165b5 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -530,56 +530,58 @@ form [described below](#attr_types).
 
 For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
 instead of only the 0th element, you can register the op like so:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("preserve\_index: int")</b>
-    .Input("to\_zero: int32")
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("preserve_index: int")
+    .Input("to_zero: int32")
     .Output("zeroed: int32");
-</code></pre>
+```
 
 (Note that the set of [attribute types](#attr_types) is different from the
 @{tf.DType$tensor types} used for inputs and outputs.)
 
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {<b>
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {
     // Get the index of the value to preserve
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;GetAttr("preserve\_index", &preserve\_index\_));
-    // Check that preserve\_index is positive
-    OP\_REQUIRES(context, preserve\_index_ &gt;= 0,
-                errors::InvalidArgument("Need preserve\_index &gt;= 0, got ",
-                                        preserve\_index_));
-  </b>}
-  void Compute(OpKernelContext\* context) override {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("preserve_index", &preserve_index_));
+    // Check that preserve_index is positive
+    OP_REQUIRES(context, preserve_index_ >= 0,
+                errors::InvalidArgument("Need preserve_index >= 0, got ",
+                                        preserve_index_));
+  }
+  void Compute(OpKernelContext* context) override {
     // ...
   }
- <b>private:
-  int preserve\_index\_;</b>
+ private:
+  int preserve_index_;
 };
-</code></pre>
+```
 
 which can then be used in the `Compute` method:
-<pre class="prettyprint"><code class="lang-cpp">
-  void Compute(OpKernelContext\* context) override {
+```c++
+  void Compute(OpKernelContext* context) override {
     // ...
-<br/>
-    <b>// We're using saved attr to validate potentially dynamic input
-    // So we check that preserve\_index is in range
-    OP\_REQUIRES(context, preserve\_index_ &lt; input.dimension(0),
-                errors::InvalidArgument("preserve\_index out of range"));<br/>
-    </b>// Set all the elements of the output tensor to 0
+
+    // We're using saved attr to validate potentially dynamic input
+    // So we check that preserve_index is in range
+    OP_REQUIRES(context, preserve_index_ < input.dimension(0),
+                errors::InvalidArgument("preserve_index out of range"));
+
+    // Set all the elements of the output tensor to 0
     const int N = input.size();
     for (int i = 0; i < N; i++) {
       output\_flat(i) = 0;
-    }<br/>
-    <b>// Preserve the requested input value
-    output\_flat(preserve\_index\_) = input(preserve\_index\_);</b>
+    }
+
+    // Preserve the requested input value
+    output_flat(preserve_index_) = input(preserve_index_);
   }
-</code></pre>
+```
 
 #### Attr types
 
@@ -725,12 +727,12 @@ you would then register an `OpKernel` for each supported type.
 
 For instance, if you'd like the `ZeroOut` op to work on `float`s
 in addition to `int32`s, your op registration might look like:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Your op registration now specifies that the input's type must be `float`, or
 `int32`, and that its output will be the same type, since both have type `T`.
@@ -790,66 +792,73 @@ Your op registration now specifies that the input's type must be `float`, or
 >   """
 > ```
 
-<pre class="prettyprint"><code class="lang-cpp">
-\#include "tensorflow/core/framework/op_kernel.h"<br/>
-class ZeroOut<b>Int32</b>Op : public OpKernel {
+```c++
+#include "tensorflow/core/framework/op_kernel.h"
+
+class ZeroOutInt32Op : public OpKernel {
   // as before
-};<br/>
-class ZeroOut<b>Float</b>Op : public OpKernel {
+};
+
+class ZeroOutFloatOp : public OpKernel {
  public:
-  explicit ZeroOut<b>Float</b>Op(OpKernelConstruction\* context)
-      : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutFloatOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat&lt;<b>float</b>&gt;();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<float>();
+
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat&lt;<b>float</b>&gt;();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<float>();
+
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/><b>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    <b>.TypeConstraint&lt;int32&gt;("T"),</b>
-    ZeroOutOp<b>Int32</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOpInt32);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
     ZeroOutFloatOp);
-</b></code></pre>
+```
 
 > To preserve [backwards compatibility](#backwards-compatibility), you should
 > specify a [default value](#default-values-constraints) when adding an attr to
 > an existing op:
 >
-> <pre class="prettyprint"><code class="lang-cpp">
-> REGISTER\_OP("ZeroOut")
->   <b>.Attr("T: {float, int32} = DT_INT32")</b>
->   .Input("to\_zero: T")
+> ```c++
+> REGISTER_OP("ZeroOut")
+>   .Attr("T: {float, int32} = DT_INT32")
+>   .Input("to_zero: T")
 >   .Output("zeroed: T")
-> </code></pre>
+> ```
 
 Let's say you wanted to add more types, say `double`:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, <b>double,</b> int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, double, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 941c3e1690..275cda12bc 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -546,7 +546,7 @@ In brief, here's what the three graphs tell you:
 
 * accuracy: The accuracy is recorded by the following two lines:
 
-    * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+    * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation.
     * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
 
 These tensorboard graphs are one of the main reasons it's important to pass a
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index a3eca4bf37..274413e294 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -113,6 +113,6 @@ If executing `a.out` fails, ask yourself the following questions:
   * Did you export those environment variables?
 
 If you are still seeing build or execution error messages, search (or post to)
-[StackOverflow](www.stackoverflow.com/questions/tagged/tensorflow) for
+[StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow) for
 possible solutions.
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 580a899ac4..b1796cf9b2 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -475,7 +475,7 @@ optimizations.
 ### TensorFlow with Intel® MKL DNN
 
 Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon
-Phi™ though the use of Intel® Math Kernel Library for Deep Neural Networks
+Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks
 (Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups
 for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel
 published paper
@@ -581,9 +581,9 @@ Each variable that impacts performance is discussed below.
     for optimal settings.
 
 *   **intra_op_parallelism_threads**: Setting this equal to the number of
-    physical cores is recommended. Setting the value to 0, which is the default
-    and will result in the value being set to the number of logical cores, is an
-    option to try for some architectures.  This value and `OMP_NUM_THREADS`
+    physical cores is recommended. Setting the value to 0, which is the default,
+    results in the value being set to the number of logical cores - this is an
+    alternate option to try for some architectures.  This value and `OMP_NUM_THREADS`
     should be equal.
 
 *   **inter_op_parallelism_threads**: Setting this equal to the number of
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index d1cd7e7c06..f5a0eb0a20 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -4,29 +4,28 @@
 
 [TOC]
 
-TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It
-lets you view the internal structure and states of running TensorFlow graphs
-during training and inference, which is difficult to debug with general-purpose
-debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
-
-> NOTE: TensorFlow debugger uses a
-> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
-> text user interface. On Mac OS X, the `ncurses` library is required and can
-> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
-> isn't as well supported, so a
-> [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
-> be used with tfdbg by installing `pyreadline` with pip.
-> If you use Anaconda3, you can install it with a command
-> such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
-> Unofficial Windows curses packages can be downloaded
-> [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
-> installed using `pip install <your_version>.whl`, however curses on Windows
-> may not work as reliably as curses on Linux or Mac.
-
-> NOTE: This guide focuses on the command-line interface (CLI) of tfdbg. For
-> guide on how to use the graphical user interface (GUI) of tfdbg, i.e., the
-> **TensorBoard Debugger Plugin**, please visit
-> [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal
+structure and states of running TensorFlow graphs during training and inference,
+which is difficult to debug with general-purpose debuggers such as Python's `pdb`
+due to TensorFlow's computation-graph paradigm.
+
+This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on
+how to use the graphical user interface (GUI) of tfdbg, i.e., the
+**TensorBoard Debugger Plugin**, please visit
+[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+
+Note: The TensorFlow debugger uses a
+[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
+user interface. On Mac OS X, the `ncurses` library is required and can be
+installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
+well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
+interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
+use Anaconda3, you can install it with a command such as
+`"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial
+Windows curses packages can be downloaded
+[here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
+installed using `pip install <your_version>.whl`, however curses on Windows may
+not work as reliably as curses on Linux or Mac.
 
 This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance
 of [`nan`s](https://en.wikipedia.org/wiki/NaN)
@@ -748,16 +747,16 @@ There are three possible workarounds or solutions:
    to which tfdbg dumps the debug data. You can use it to let tfdbg dump the
    debug data on a disk with larger free space. For example:
 
-   ``` python
-   # For LocalCLIDebugWrapperSession
-   sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
-
-   # For LocalCLIDebugHook
-   hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
-   ```
+```python
+# For LocalCLIDebugWrapperSession
+sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
 
+# For LocalCLIDebugHook
+hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
+```
    Make sure that the directory pointed to by dump_root is empty or nonexistent.
-   tfdbg cleans up the dump directories before exiting.
+   `tfdbg` cleans up the dump directories before exiting.
+
 *  Reduce the batch size used during the runs.
 *  Use the filtering options of tfdbg's `run` command to watch only specific
    nodes in the graph. For example:
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 72284fd50b..a683c8cfa6 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -835,6 +835,7 @@ py_library(
     srcs = ["framework/tensor_shape.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dtypes",
         ":util",
         "//tensorflow/core:protos_all_py",
     ],
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 51ff5171a3..807582bd7e 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -651,6 +651,11 @@ QUANTIZED_DTYPES = frozenset([
 ])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
+_PYTHON_TO_TF = {
+    float: float32,
+    bool: bool,
+}
+
 
 @tf_export("as_dtype")
 def as_dtype(type_value):
@@ -682,6 +687,11 @@ def as_dtype(type_value):
   except KeyError:
     pass
 
+  try:
+    return _PYTHON_TO_TF[type_value]
+  except KeyError:
+    pass
+
   if isinstance(type_value, np.dtype):
     # The numpy dtype for strings is variable length. We can not compare
     # dtype with a single constant (np.string does not exist) to decide
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index e55783bb79..a873670e04 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,10 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonTypesConversion(self):
+    self.assertIs(dtypes.float32, dtypes.as_dtype(float))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
+
   def testReduce(self):
     for enum in dtypes._TYPE_TO_STRING:
       dtype = dtypes.DType(enum)
@@ -307,3 +311,4 @@ class TypesTest(test_util.TensorFlowTestCase):
 
 if __name__ == "__main__":
   googletest.main()
+
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index c05396b06e..d6bc14fbc7 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_logging_ops
@@ -1362,7 +1361,7 @@ class UnrollLSTMTest(test.TestCase):
         value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
     new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
         i_g) * math_ops.tanh(i_i)
-    new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
+    new_c = math_ops.maximum(math_ops.minimum(new_c, 50.0), -50.0)
     new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
     return new_m, new_c
 
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 00f256cd45..0dd29460ed 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -30,6 +31,8 @@ class Dimension(object):
     """Creates a new Dimension with the given value."""
     if value is None:
       self._value = None
+    elif isinstance(value, dtypes.DType):
+      raise TypeError("Cannot convert %s to Dimension" % value)
     else:
       self._value = int(value)
       if (not isinstance(value, compat.bytes_or_text_types) and
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 498574eded..9232d99a1f 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -184,6 +185,10 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(str(tensor_shape.Dimension(7)), "7")
     self.assertEqual(str(tensor_shape.Dimension(None)), "?")
 
+  def testUnsupportedType(self):
+    with self.assertRaises(TypeError):
+      tensor_shape.Dimension(dtypes.string)
+
   def testMod(self):
     four = tensor_shape.Dimension(4)
     nine = tensor_shape.Dimension(9)
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index bbf1d2a3d9..f82e3277de 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import defaultdict
-import sys
 
 import numpy as np
+import six
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -160,13 +160,11 @@ def ask_to_proceed_with_overwrite(filepath):
   Returns:
       True if we can proceed with overwrite, False otherwise.
   """
-  get_input = input
-  if sys.version_info[:2] <= (2, 7):
-    get_input = raw_input
-  overwrite = get_input('[WARNING] %s already exists - overwrite? '
-                        '[y/n]' % (filepath))
-  while overwrite not in ['y', 'n']:
-    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
+  overwrite = six.moves.input('[WARNING] %s already exists - overwrite? '
+                              '[y/n]' % (filepath)).strip().lower()
+  while overwrite not in ('y', 'n'):
+    overwrite = six.moves.input('Enter "y" (overwrite) or "n" '
+                                '(cancel).').strip().lower()
   if overwrite == 'n':
     return False
   print('[TIP] Next time specify overwrite=True!')
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 5c8b71da17..e08123b041 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -19,16 +19,33 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
 
 
 class ClipTest(test.TestCase):
 
+  def DISABLED_testClipByValueGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs_1 = clip_ops.clip_by_value(inputs, 0.5, 3.5)
+    min_val = constant_op.constant([0.5, 0.5, 0.5, 0.5], dtype=dtypes.float32)
+    max_val = constant_op.constant([3.5, 3.5, 3.5, 3.5], dtype=dtypes.float32)
+    outputs_2 = clip_ops.clip_by_value(inputs, min_val, max_val)
+    with self.test_session():
+      error_1 = gradient_checker.compute_gradient_error(inputs, [4], outputs_1,
+                                                        [4])
+      self.assertLess(error_1, 1e-4)
+
+      error_2 = gradient_checker.compute_gradient_error(inputs, [4], outputs_2,
+                                                        [4])
+      self.assertLess(error_2, 1e-4)
+
   # ClipByValue test
   def testClipByValue(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
@@ -37,8 +54,76 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
+  # [Tensor, Scalar, Scalar]
+  def DISABLED_testClipByValue0Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = 2
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Scalar]
+  def DISABLED_testClipByValue1Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = constant_op.constant(
+            [2, 2, 2, 3, 3, 3], shape=[2, 3], dtype=dtype)
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Scalar, Tensor]
+  def DISABLED_testClipByValue2Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[4, 4, 4], [4, 5, 6]]
+        clip_value_min = 4
+        clip_value_max = constant_op.constant(
+            [6, 6, 6, 6, 6, 6], shape=[2, 3], dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Tensor]
+  def DISABLED_testClipByValue3Type(self):
+    for dtype in [
+        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
+        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
+    ]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [5, 5, 6]]
+        clip_value_min = constant_op.constant(
+            [2, 2, 2, 5, 5, 5], shape=[2, 3], dtype=dtype)
+        clip_value_max = constant_op.constant(
+            [5, 5, 5, 7, 7, 7], shape=[2, 3], dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
   def testClipByValueBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -48,6 +133,7 @@ class ClipTest(test.TestCase):
         _ = clip_ops.clip_by_value(x, 1.0, clip)
 
   def testClipByValueNonFinite(self):
+    # TODO(b/78016351): Enable test on GPU once the bug is fixed.
     with self.test_session():
       x = constant_op.constant([float('NaN'), float('Inf'), -float('Inf')])
       np_ans = [float('NaN'), 4.0, -4.0]
@@ -60,7 +146,7 @@ class ClipTest(test.TestCase):
   # ClipByNorm tests
   def testClipByNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
@@ -76,7 +162,7 @@ class ClipTest(test.TestCase):
     self.assertAllClose(np_ans, tf_ans_tensor)
 
   def testClipByNormBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -85,7 +171,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -97,7 +183,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
@@ -109,7 +195,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim0(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
@@ -121,7 +207,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim1(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
@@ -133,7 +219,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClippedWithAxes(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
@@ -146,7 +232,7 @@ class ClipTest(test.TestCase):
   # ClipByGlobalNorm tests
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -167,7 +253,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -188,7 +274,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -211,7 +297,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = ops.IndexedSlices(
           constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@@ -244,7 +330,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -263,7 +349,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([0.0, 0.0])
       # Norm = 0, no changes
@@ -282,7 +368,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -294,7 +380,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClippedTensor(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -306,7 +392,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -318,7 +404,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormZero(self):
     # No norm clipping when average clip_norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Average norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index ed44a1a4d1..a0c372db7d 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -817,9 +817,6 @@ class PoolingTest(test.TestCase):
           cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def testMaxPoolingWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     with self.test_session(use_gpu=True) as sess:
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
@@ -836,9 +833,6 @@ class PoolingTest(test.TestCase):
       self.assertAllEqual(argmax.ravel(), [0, 1, 3, 5])
 
   def testMaxPoolingGradWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 49f8c66531..75c459a9cf 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -70,6 +71,35 @@ def clip_by_value(t, clip_value_min, clip_value_max,
     _ = t.shape.merge_with(t_max.shape)
 
   return t_max
+  # TODO(scottzhu): switch to use new implmentation in 2 weeks.
+    # return gen_math_ops.clip_by_value(
+    #     t, clip_value_min, clip_value_max, name=name)
+
+
+# TODO(scottzhu): switch to use new implmentation in 2 weeks.
+# @ops.RegisterGradient("ClipByValue")
+def _clip_by_value_grad(op, grad):
+  """Returns grad of clip_by_value."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  z = op.inputs[2]
+  gdtype = grad.dtype
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  sz = array_ops.shape(z)
+  gradshape = array_ops.shape(grad)
+  zeros = array_ops.zeros(gradshape, gdtype)
+  xymask = math_ops.less(x, y)
+  xzmask = math_ops.greater(x, z)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
+  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
+  ygrad = array_ops.where(xymask, grad, zeros)
+  zgrad = array_ops.where(xzmask, grad, zeros)
+  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
+  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
+  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
+  return (gx, gy, gz)
 
 
 @tf_export("clip_by_norm")
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
new file mode 100644
index 0000000000..e1217e984c
--- /dev/null
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -0,0 +1,395 @@
+# array_ops
+BatchToSpace
+BroadcastArgs
+BroadcastGradientArgs
+ConcatOffset
+Concat
+ConcatV2
+ConjugateTranspose
+Const
+DebugGradientIdentity
+DebugGradientRefIdentity
+EditDistance
+ExpandDims
+ListDiff
+MirrorPad
+MirrorPadGrad
+OneHot
+Pack
+Pad
+PadV2
+ParallelConcat
+Placeholder
+RefIdentity
+Reverse
+Snapshot
+SpaceToBatch
+Split
+SplitV
+Squeeze
+Slice
+TileGrad  # Exported through array_grad instead of array_ops.
+ZerosLike  # TODO(josh11b): Use this instead of the Python version.
+Unique
+UniqueV2
+UniqueWithCounts
+UniqueWithCountsV2
+Unpack
+
+# candidate_sampling_ops
+AllCandidateSampler
+ComputeAccidentalHits
+FixedUnigramCandidateSampler
+LearnedUnigramCandidateSampler
+LogUniformCandidateSampler
+ThreadUnsafeUnigramCandidateSampler
+UniformCandidateSampler
+
+# checkpoint_ops
+GenerateVocabRemapping
+LoadAndRemapMatrix
+
+
+# control_flow_ops
+Switch
+Merge
+RefMerge
+Exit
+RefExit
+
+# ctc_ops
+CTCLoss
+CTCGreedyDecoder
+CTCBeamSearchDecoder
+
+# data_flow_ops
+Barrier
+BarrierClose
+BarrierIncompleteSize
+BarrierInsertMany
+BarrierReadySize
+BarrierTakeMany
+DeleteSessionTensor
+FakeQueue
+FIFOQueue
+FIFOQueueV2
+GetSessionHandle
+GetSessionHandleV2
+GetSessionTensor
+HashTable
+HashTableV2
+InitializeTable
+InitializeTableV2
+InitializeTableFromTextFile
+InitializeTableFromTextFileV2
+LookupTableExport
+LookupTableExportV2
+LookupTableFind
+LookupTableFindV2
+LookupTableImport
+LookupTableImportV2
+LookupTableInsert
+LookupTableInsertV2
+LookupTableSize
+LookupTableSizeV2
+MutableDenseHashTable
+MutableDenseHashTableV2
+MutableHashTable
+MutableHashTableV2
+MutableHashTableOfTensors
+MutableHashTableOfTensorsV2
+Mutex
+MutexAcquire
+MutexRelease
+PaddingFIFOQueue
+PaddingFIFOQueueV2
+PriorityQueue
+PriorityQueueV2
+QueueClose
+QueueCloseV2
+QueueDequeue
+QueueDequeueV2
+QueueDequeueMany
+QueueDequeueManyV2
+QueueDequeueUpTo
+QueueDequeueUpToV2
+QueueEnqueue
+QueueEnqueueV2
+QueueEnqueueMany
+QueueEnqueueManyV2
+QueueSize
+QueueSizeV2
+RandomShuffleQueue
+RandomShuffleQueueV2
+Stack
+StackClose
+StackPop
+StackPush
+StackV2
+StackCloseV2
+StackPopV2
+StackPushV2
+TensorArray
+TensorArrayClose
+TensorArrayCloseV2
+TensorArrayConcat
+TensorArrayConcatV2
+TensorArrayGather
+TensorArrayGatherV2
+TensorArrayGrad
+TensorArrayGradV2
+TensorArrayPack
+TensorArrayPackV2
+TensorArrayRead
+TensorArrayReadV2
+TensorArrayScatter
+TensorArrayScatterV2
+TensorArraySize
+TensorArraySizeV2
+TensorArraySplit
+TensorArraySplitV2
+TensorArrayUnpack
+TensorArrayUnpackV2
+TensorArrayV2
+TensorArrayWrite
+TensorArrayWriteV2
+TensorArrayV3
+TensorArrayCloseV3
+TensorArrayConcatV3
+TensorArrayGatherV3
+TensorArrayGradV3
+TensorArrayReadV3
+TensorArrayPackV3
+TensorArrayScatterV3
+TensorArraySizeV3
+TensorArraySplitV3
+TensorArrayUnpackV3
+TensorArrayWriteV3
+
+# functional_ops
+SymbolicGradient
+
+# image_ops
+AdjustContrastv2
+NonMaxSuppression
+NonMaxSuppressionV2
+RandomCrop
+ResizeBilinearGrad
+ResizeBicubicGrad
+ResizeNearestNeighborGrad
+SampleDistortedBoundingBox
+SampleDistortedBoundingBoxV2
+ScaleImageGrad
+
+# io_ops
+FixedLengthRecordReader
+IdentityReader
+ReaderNumRecordsProduced
+ReaderNumWorkUnitsCompleted
+ReaderRead
+ReaderReadUpTo
+ReaderReset
+ReaderRestoreState
+ReaderSerializeState
+ReaderWorkQueueLength
+FixedLengthRecordReaderV2
+IdentityReaderV2
+ReaderNumRecordsProducedV2
+ReaderNumWorkUnitsCompletedV2
+ReaderReadV2
+ReaderReadUpToV2
+ReaderResetV2
+ReaderRestoreStateV2
+ReaderSerializeStateV2
+ReaderWorkQueueLengthV2
+Restore
+RestoreSlice
+Save
+SaveSlices
+ShardedFilename
+ShardedFilespec
+TextLineReader
+TFRecordReader
+WholeFileReader
+TextLineReaderV2
+TFRecordReaderV2
+WholeFileReaderV2
+LMDBReader
+DecodeCSV
+
+# linalg_ops
+BatchCholesky
+BatchCholeskyGrad
+BatchMatrixDeterminant
+BatchMatrixInverse
+BatchMatrixSolve
+BatchMatrixSolveLs
+BatchMatrixTriangularSolve
+BatchSelfAdjointEig
+BatchSelfAdjointEigV2
+BatchSvd
+LogMatrixDeterminant
+MatrixExponential
+MatrixLogarithm
+MatrixSolveLs
+SelfAdjointEig
+SelfAdjointEigV2
+Svd
+
+# logging_ops
+Assert
+AudioSummary
+AudioSummaryV2
+HistogramSummary
+ImageSummary
+MergeSummary
+Print
+ScalarSummary
+TensorSummary
+TensorSummaryV2
+
+# math_ops
+Abs
+AccumulateNV2
+AddN
+AddV2
+All
+Any
+BatchMatMul
+BatchFFT
+BatchFFT2D
+BatchFFT3D
+BatchIFFT
+BatchIFFT2D
+BatchIFFT3D
+Bucketize
+ClipByValue
+Complex
+ComplexAbs
+Conj
+FloorDiv
+FloorMod
+HistogramFixedWidth
+Max
+Mean
+Min
+Mul
+Neg
+Pow
+Prod
+Range
+RealDiv
+Select
+SparseMatMul
+Sub
+Sum
+MatMul
+Sigmoid
+Tanh
+SigmoidGrad
+TanhGrad
+InvGrad
+ReciprocalGrad
+SqrtGrad
+RsqrtGrad
+TruncateDiv
+TruncateMod
+
+# nn_ops
+AvgPoolGrad  # "*Grad" accessible through nn_grad instead of nn_ops.
+AvgPool3DGrad
+BatchNormWithGlobalNormalization
+BatchNormWithGlobalNormalizationGrad
+FusedBatchNorm
+FusedBatchNormV2
+SoftmaxCrossEntropyWithLogits
+SparseSoftmaxCrossEntropyWithLogits
+LRNGrad
+MaxPoolGrad
+MaxPoolGradWithArgmax
+MaxPoolGradGrad
+MaxPoolGradGradWithArgmax
+MaxPool3DGrad
+MaxPool3DGradGrad
+ReluGrad
+Relu6Grad
+EluGrad
+SeluGrad
+SoftplusGrad
+SoftsignGrad
+TopK
+TopKV2
+BiasAdd
+BiasAddV1
+Relu6
+AvgPool
+MaxPool
+MaxPoolV2
+Softmax
+LogSoftmax
+FractionalAvgPoolGrad
+FractionalMaxPoolGrad
+InTopK
+InTopKV2
+
+# parsing_ops
+ParseExample
+ParseSingleSequenceExample
+
+# random_ops
+RandomGamma
+RandomPoisson
+RandomUniform
+RandomUniformInt
+RandomShuffle
+RandomStandardNormal
+ParameterizedTruncatedNormal
+TruncatedNormal
+
+# script_ops
+PyFunc
+PyFuncStateless
+EagerPyFunc
+
+# sdca_ops
+
+# state_ops
+Variable
+VariableV2
+TemporaryVariable
+DestroyTemporaryVariable
+
+# sparse_ops
+AddSparseToTensorsMap
+AddManySparseToTensorsMap
+TakeManySparseFromTensorsMap
+DeserializeManySparse
+DeserializeSparse
+SerializeManySparse
+SerializeSparse
+SparseAdd
+SparseAddGrad
+SparseConcat
+SparseCross
+SparseFillEmptyRows
+SparseFillEmptyRowsGrad
+SparseSplit
+SparseSelectLastK
+SparseReorder
+SparseReshape
+SparseToDense
+SparseTensorDenseAdd
+SparseTensorDenseMatMul
+
+# string_ops
+StringSplit
+
+# user_ops
+Fact
+
+# training_ops
+# (None)
+
+# word2vec deprecated ops
+NegTrain
+Skipgram
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 4ab8a72a83..663036de8a 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from collections import namedtuple
 import inspect as _inspect
 
 from tensorflow.python.util import tf_decorator
@@ -24,6 +25,15 @@ from tensorflow.python.util import tf_decorator
 ArgSpec = _inspect.ArgSpec
 
 
+if hasattr(_inspect, 'FullArgSpec'):
+  FullArgSpec = _inspect.FullArgSpec  # pylint: disable=invalid-name
+else:
+  FullArgSpec = namedtuple('FullArgSpec', [
+      'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults',
+      'annotations'
+  ])
+
+
 def currentframe():
   """TFDecorator-aware replacement for inspect.currentframe."""
   return _inspect.stack()[1][0]
@@ -55,13 +65,36 @@ def getfullargspec(obj):  # pylint: disable=redefined-builtin
     obj: A callable, possibly decorated.
 
   Returns:
-    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    The `FullArgSpec` that describes the signature of
     the outermost decorator that changes the callable's signature. If the
-    callable is not decorated, `inspect.getfullargspec()`
-    (`inspect.getargspec()` in Python 2) will be called directly on the
-    callable.
+    callable is not decorated, `inspect.getfullargspec()` will be called
+    directly on the callable.
   """
-  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  if hasattr(_inspect, 'getfullargspec'):
+    spec_fn = _inspect.getfullargspec
+  else:
+    def spec_fn(target):
+      """Spec function that adding default value from FullArgSpec.
+
+      It is used when getfullargspec is not available (eg in PY2).
+
+      Args:
+        target: the target object to inspect.
+      Returns:
+        The full argument specs with empty kwonlyargs, kwonlydefaults and
+        annotations.
+      """
+      argspecs = _inspect.getargspec(target)
+      fullargspecs = FullArgSpec(
+          args=argspecs.args,
+          varargs=argspecs.varargs,
+          varkw=argspecs.keywords,
+          defaults=argspecs.defaults,
+          kwonlyargs=[],
+          kwonlydefaults=None,
+          annotations={})
+      return fullargspecs
+
   decorators, target = tf_decorator.unwrap(obj)
   return next((d.decorator_argspec for d in decorators
                if d.decorator_argspec is not None), spec_fn(target))
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index fd44b0eb3b..528f811b40 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -810,7 +810,33 @@ def tf_cc_test_mkl(srcs,
                    tags=[],
                    size="medium",
                    args=None):
-  if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
+  for src in srcs:
+    native.cc_test(
+      name=src_to_test_name(src),
+      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
+      copts=tf_copts(),
+      linkopts=select({
+        clean_dep("//tensorflow:android"): [
+            "-pie",
+          ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
+        "//conditions:default": [
+            "-lpthread",
+            "-lm"
+        ],
+      }) + _rpath_linkopts(src_to_test_name(src)),
+      deps=deps + if_mkl(
+          [
+              "//third_party/mkl:intel_binary_blob",
+          ],
+      ),
+      linkstatic=linkstatic,
+      tags=tags,
+      size=size,
+      args=args,
+      nocopts="-fno-exceptions")
+
 
 def tf_cc_tests_gpu(srcs,
                     deps,
@@ -1029,16 +1055,12 @@ register_extension_info(
 def tf_mkl_kernel_library(name,
                           prefix=None,
                           srcs=None,
-                          gpu_srcs=None,
                           hdrs=None,
                           deps=None,
                           alwayslink=1,
                           copts=tf_copts(),
-                          nocopts="-fno-exceptions",
-                          **kwargs):
+                          nocopts="-fno-exceptions"):
   """A rule to build MKL-based TensorFlow kernel libraries."""
-  gpu_srcs = gpu_srcs  # unused argument
-  kwargs = kwargs  # unused argument
 
   if not bool(srcs):
     srcs = []
@@ -1051,16 +1073,15 @@ def tf_mkl_kernel_library(name,
     hdrs = hdrs + native.glob(
         [prefix + "*.h"])
 
-  if_mkl(
-      native.cc_library(
-          name=name,
-          srcs=srcs,
-          hdrs=hdrs,
-          deps=deps,
-          alwayslink=alwayslink,
-          copts=copts,
-          nocopts=nocopts
-      ))
+  native.cc_library(
+      name=name,
+      srcs=if_mkl(srcs),
+      hdrs=hdrs,
+      deps=deps,
+      alwayslink=alwayslink,
+      copts=copts,
+      nocopts=nocopts
+  )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 6fa48cd70c..c06a39bfbd 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -160,7 +160,8 @@ def get_api_init_text():
   # we want to traverse over TensorFlow Python modules.
   for module in sys.modules.values():
     # Only look at tensorflow modules.
-    if not module or 'tensorflow.' not in module.__name__:
+    if (not module or not hasattr(module, '__name__') or
+        'tensorflow.' not in module.__name__):
       continue
     # Do not generate __init__.py files for contrib modules for now.
     if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 024cb40eb4..78cb4d250e 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 11f476d12c..b3dbe475d2 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1fcb6428b2..bfb96da58d 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 625321e123..9e1708662e 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 5585ebdcd3..824fe14560 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -1207,7 +1207,7 @@
    "source": [
     "# Training computation: logits + cross-entropy loss.\n",
     "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n",
+    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
     "  labels=train_labels_node, logits=logits))\n",
     "\n",
     "# L2 regularization for the fully connected parameters.\n",
@@ -2031,7 +2031,7 @@
    "views": {}
   },
   "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2049,5 +2049,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b4fba5b8f5..05de25f2cb 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@@ -306,7 +306,7 @@ else
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
     else
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index d370fbd246..0c1fd0cf9d 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -37,7 +37,7 @@ py_library(
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["@com_github_andreif_codegen"],
+    deps = ["@astor_archive//:astor"],
 )
 
 py_test(
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index ae293f6576..0cbf8b478f 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
 import textwrap
 
 import tensorflow as tf
@@ -39,10 +38,6 @@ class Flags(object):
 class BuildDocsTest(googletest.TestCase):
 
   def testBuildDocs(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     doc_generator = generate_lib.DocGenerator()
 
     doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 9cc261d7dd..111d54d820 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import argparse
 import fnmatch
 import os
-import sys
 
 import six
 
@@ -134,8 +133,12 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
     try:
       if not os.path.exists(directory):
         os.makedirs(directory)
-      with open(path, 'w') as f:
-        f.write(pretty_docs.build_md_page(page_info))
+      # This function returns raw bytes in PY2 or unicode in PY3.
+      text = pretty_docs.build_md_page(page_info)
+      if six.PY3:
+        text = text.encode('utf-8')
+      with open(path, 'wb') as f:
+        f.write(text)
     except OSError as e:
       print('Cannot write documentation for %s to %s: %s' % (full_name,
                                                              directory, e))
@@ -437,19 +440,19 @@ def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
       full_out_path = os.path.join(output_dir, suffix)
       if not fnmatch.fnmatch(base_name, file_pattern):
         print('Copying un-matched file %s...' % suffix)
-        open(full_out_path, 'w').write(open(full_in_path).read())
+        open(full_out_path, 'wb').write(open(full_in_path, 'rb').read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
         content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        content = open(full_in_path).read()
+        content = open(full_in_path, 'rb').read().decode('utf-8')
 
       content = reference_resolver.replace_references(content,
                                                       relative_path_to_root)
-      with open(full_out_path, 'w') as f:
-        f.write(content)
+      with open(full_out_path, 'wb') as f:
+        f.write(content.encode('utf-8'))
 
   print('Done.')
 
@@ -458,8 +461,6 @@ class DocGenerator(object):
   """Main entry point for generating docs."""
 
   def __init__(self):
-    if sys.version_info >= (3, 0):
-      sys.exit('Doc generation is not supported from python3.')
     self.argument_parser = argparse.ArgumentParser()
     self._py_modules = None
     self._private_map = _get_default_private_map()
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index 1ceaf31f1c..ea6d28a02b 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -52,9 +52,6 @@ class DummyVisitor(object):
 class GenerateTest(googletest.TestCase):
 
   def test_write(self):
-    if sys.version_info >= (3, 0):
-      self.skipTest('Warning: Doc generation is not supported from python3.')
-
     module = sys.modules[__name__]
 
     index = {
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index d2a63ecc49..fb0bd2c2ff 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -26,7 +26,7 @@ import os
 import re
 import sys
 
-import codegen
+import astor
 import six
 
 from google.protobuf.message import Message as ProtoMessage
@@ -621,20 +621,20 @@ def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
 def _get_arg_spec(func):
   """Extracts signature information from a function or functools.partial object.
 
-  For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects,
-  corrects the signature of the underlying function to take into account the
-  removed arguments.
+  For functions, uses `tf_inspect.getfullargspec`. For `functools.partial`
+  objects, corrects the signature of the underlying function to take into
+  account the removed arguments.
 
   Args:
     func: A function whose signature to extract.
 
   Returns:
-    An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned
-    by `tf_inspect.getargspec`.
+    An `FullArgSpec` namedtuple `(args, varargs, varkw, defaults, etc.)`,
+    as returned by `tf_inspect.getfullargspec`.
   """
-  # getargspec does not work for functools.partial objects directly.
+  # getfullargspec does not work for functools.partial objects directly.
   if isinstance(func, functools.partial):
-    argspec = tf_inspect.getargspec(func.func)
+    argspec = tf_inspect.getfullargspec(func.func)
     # Remove the args from the original function that have been used up.
     first_default_arg = (
         len(argspec.args or []) - len(argspec.defaults or []))
@@ -657,12 +657,16 @@ def _get_arg_spec(func):
           argspec_defaults.pop(i-first_default_arg)
         else:
           first_default_arg -= 1
-    return tf_inspect.ArgSpec(args=argspec_args,
-                              varargs=argspec.varargs,
-                              keywords=argspec.keywords,
-                              defaults=tuple(argspec_defaults))
+    return tf_inspect.FullArgSpec(
+        args=argspec_args,
+        varargs=argspec.varargs,
+        varkw=argspec.varkw,
+        defaults=tuple(argspec_defaults),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
   else:  # Regular function or method, getargspec will work fine.
-    return tf_inspect.getargspec(func)
+    return tf_inspect.getfullargspec(func)
 
 
 def _remove_first_line_indent(string):
@@ -670,11 +674,14 @@ def _remove_first_line_indent(string):
   return '\n'.join([line[indent:] for line in string.split('\n')])
 
 
+PAREN_NUMBER_RE = re.compile(r'^\(([0-9.e-]+)\)')
+
+
 def _generate_signature(func, reverse_index):
   """Given a function, returns a list of strings representing its args.
 
   This function produces a list of strings representing the arguments to a
-  python function. It uses tf_inspect.getargspec, which
+  python function. It uses tf_inspect.getfullargspec, which
   does not generalize well to Python 3.x, which is more flexible in how *args
   and **kwargs are handled. This is not a problem in TF, since we have to remain
   compatible to Python 2.7 anyway.
@@ -725,7 +732,11 @@ def _generate_signature(func, reverse_index):
       if id(default) in reverse_index:
         default_text = reverse_index[id(default)]
       elif ast_default is not None:
-        default_text = codegen.to_source(ast_default)
+        default_text = (
+            astor.to_source(ast_default).rstrip('\n').replace('\t', '\\t')
+            .replace('\n', '\\n').replace('"""', "'"))
+        default_text = PAREN_NUMBER_RE.sub('\\1', default_text)
+
         if default_text != repr(default):
           # This may be an internal name. If so, handle the ones we know about.
           # TODO(wicke): This should be replaced with a lookup in the index.
@@ -758,8 +769,8 @@ def _generate_signature(func, reverse_index):
   # Add *args and *kwargs.
   if argspec.varargs:
     args_list.append('*' + argspec.varargs)
-  if argspec.keywords:
-    args_list.append('**' + argspec.keywords)
+  if argspec.varkw:
+    args_list.append('**' + argspec.varkw)
 
   return args_list
 
@@ -1136,9 +1147,11 @@ class _ClassPageInfo(object):
 
     for short_name in parser_config.tree[self.full_name]:
       # Remove builtin members that we never want to document.
-      if short_name in ['__class__', '__base__', '__weakref__', '__doc__',
-                        '__module__', '__dict__', '__abstractmethods__',
-                        '__slots__', '__getnewargs__']:
+      if short_name in [
+          '__class__', '__base__', '__weakref__', '__doc__', '__module__',
+          '__dict__', '__abstractmethods__', '__slots__', '__getnewargs__',
+          '__str__', '__repr__', '__hash__'
+      ]:
         continue
 
       child_name = '.'.join([self.full_name, short_name])
@@ -1183,7 +1196,8 @@ class _ClassPageInfo(object):
         # obvious what they do, don't include them in the docs if there's no
         # docstring.
         if not child_doc.brief.strip() and short_name in [
-            '__str__', '__repr__', '__hash__', '__del__', '__copy__']:
+            '__del__', '__copy__'
+        ]:
           print('Skipping %s, defined in %s, no docstring.' % (child_name,
                                                                defining_class))
           continue
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index fca5436ca5..274d48ef66 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -398,7 +398,6 @@ class ParserTest(googletest.TestCase):
     self.assertIn('<code>test_function', docs)
 
   def test_argspec_for_functools_partial(self):
-
     # pylint: disable=unused-argument
     def test_function_for_partial1(arg1, arg2, kwarg1=1, kwarg2=2):
       pass
@@ -409,42 +408,95 @@ class ParserTest(googletest.TestCase):
 
     # pylint: disable=protected-access
     # Make sure everything works for regular functions.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 2),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
 
     # Make sure doing nothing works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 2),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting args from the front works.
-    expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None,
-                                  (1, 2))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg2', 'kwarg1', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(1, 2),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(
+        args=['kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(2,),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, 1, 2, 3)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting kwargs works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg2'],
+        varargs=None,
+        varkw=None,
+        defaults=(2,),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg1=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1', 'arg2', 'kwarg1'],
+        varargs=None,
+        varkw=None,
+        defaults=(1,),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1'], None, None, ())
+    expected = tf_inspect.FullArgSpec(
+        args=['arg1'],
+        varargs=None,
+        varkw=None,
+        defaults=(),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial1,
                                 arg2=0, kwarg1=0, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure *args, *kwargs is accounted for.
-    expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
+    expected = tf_inspect.FullArgSpec(
+        args=[],
+        varargs='my_args',
+        varkw='my_kwargs',
+        defaults=(),
+        kwonlyargs=[],
+        kwonlydefaults=None,
+        annotations={})
     partial = functools.partial(test_function_for_partial2, 0, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
@@ -524,10 +576,6 @@ class TestParseFunctionDetails(googletest.TestCase):
 class TestGenerateSignature(googletest.TestCase):
 
   def test_known_object(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     known_object = object()
     reverse_index = {id(known_object): 'location.of.object.in.api'}
 
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index 543b5fa6fe..55ab5bdd49 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -101,7 +101,7 @@ def _build_class_page(page_info):
 
     link_template = '[`{short_name}`]({url})'
     parts.append(', '.join(
-        link_template.format(**base.__dict__) for base in page_info.bases))
+        link_template.format(**base._asdict()) for base in page_info.bases))
 
   parts.append('\n\n')
 
@@ -159,7 +159,7 @@ def _build_class_page(page_info):
       h3 = ('<h3 id="{short_name}">'
             '<code>{short_name}</code>'
             '</h3>\n\n')
-      parts.append(h3.format(**method_info.__dict__))
+      parts.append(h3.format(**method_info._asdict()))
 
       if method_info.signature is not None:
         parts.append(_build_signature(method_info, use_full_name=False))
@@ -217,7 +217,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}`]({url}) module'
 
     for item in page_info.modules:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -229,7 +229,7 @@ def _build_module_page(page_info):
     template = '[`class {short_name}`]({url})'
 
     for item in page_info.classes:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -241,7 +241,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}(...)`]({url})'
 
     for item in page_info.functions:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -254,7 +254,7 @@ def _build_module_page(page_info):
     parts.append('## Other Members\n\n')
 
     for item in page_info.other_members:
-      parts.append('`{short_name}`\n\n'.format(**item.__dict__))
+      parts.append('`{short_name}`\n\n'.format(**item._asdict()))
 
   return ''.join(parts)
 
diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py
index 216353ecee..328f42d18f 100644
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@@ -44,7 +44,7 @@ class PyGuideParser(object):
 
   def process(self, full_path):
     """Read and process the file at `full_path`."""
-    md_string = open(full_path).read()
+    md_string = open(full_path, 'rb').read().decode('utf-8')
     self._lines = md_string.split('\n')
     seen = set()
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index aab0fb41fb..f775491e4a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -315,18 +315,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "backports.weakref-1.0rc1/src",
       build_file = clean_dep("//third_party:backports_weakref.BUILD"),
   )
-
-  tf_http_archive(
-      name = "com_github_andreif_codegen",
-      urls = [
-          "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
-          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
-      ],
-      sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
-      strip_prefix = "codegen-1.0",
-      build_file = clean_dep("//third_party:codegen.BUILD"),
-  )
-
+  
   filegroup_external(
       name = "org_python_license",
       licenses = ["notice"],  # Python 2.0
-- 
GitLab


From 518119dd13fdc89a99e73ec19e12fd35af577068 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Fri, 13 Apr 2018 19:09:07 -0700
Subject: [PATCH 0815/1262] Clarify a caveat about metrics.

PiperOrigin-RevId: 192855733
---
 tensorflow/contrib/distribute/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 2482731198..5d22d9aa2b 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -116,7 +116,8 @@ in the input function gives a solid boost in performance. When using
 ## Caveats
 This feature is in early stages and there are a lot of improvements forthcoming:
 
-* Metrics are not yet supported during distributed training.
+* Metrics are not yet supported during distributed training. They are still
+supported during the evaluation.
 * Summaries are only computed in the first tower in `MirroredStrategy`.
 * Evaluation is not yet distributed.
 * Eager support is in the works; performance can be more challenging with eager
-- 
GitLab


From 7cebffb82c31d29dc6ef3ef40225186220f6ff7c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 19:18:28 -0700
Subject: [PATCH 0816/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 192856167
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 44 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 44 +++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index a45a95ae09..083119662b 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -11761,6 +11761,50 @@ op {
     }
   }
 }
+op {
+  name: "ClipByValue"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CloseSummaryWriter"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index afb3dab3fe..4c483125cc 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4727,6 +4727,50 @@ op {
     }
   }
 }
+op {
+  name: "ClipByValue"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CloseSummaryWriter"
   input_arg {
-- 
GitLab


From 0a9dbc8c354b3abf6bd5e0acdba59013c579687f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 19:22:02 -0700
Subject: [PATCH 0817/1262] Internal Change

PiperOrigin-RevId: 192856330
---
 tensorflow/python/kernel_tests/cwise_ops_test.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 34e7751243..87da89831c 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -398,14 +398,17 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.abs, _ABS)
     self._compareCpu(x, np.negative, math_ops.negative)
     self._compareCpu(x, np.negative, _NEG)
-    self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(x, np.sign, math_ops.sign)
 
     self._compareBothSparse(x, np.abs, math_ops.abs)
     self._compareBothSparse(x, np.negative, math_ops.negative)
-    self._compareBothSparse(x, np.square, math_ops.square)
     self._compareBothSparse(x, np.sign, math_ops.sign)
 
+  def testInt64Square(self):
+    x = np.arange(-6 << 20, 6 << 20, 2 << 20).reshape(1, 3, 2).astype(np.int64)
+    self._compareCpu(x, np.square, math_ops.square)
+    self._compareBothSparse(x, np.square, math_ops.square)
+
   def testComplex64Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
         np.complex64)
-- 
GitLab


From 27210f41427645bda64699aad4273f697b8a408c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 20:59:14 -0700
Subject: [PATCH 0818/1262] Adding 1d and 3d orthogonal kernels convolutions.

PiperOrigin-RevId: 192861809
---
 tensorflow/contrib/framework/__init__.py      |   4 +
 .../python/kernel_tests/init_ops_test.py      | 272 ++++++++++++-
 tensorflow/python/ops/init_ops.py             | 374 +++++++++++++++---
 3 files changed, 588 insertions(+), 62 deletions(-)

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index a52907f163..bb4f1eb384 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -72,7 +72,9 @@ See the @{$python/contrib.framework} guide.
 @@variable
 @@VariableDeviceChooser
 @@convolutional_delta_orthogonal
+@@convolutional_orthogonal_1d
 @@convolutional_orthogonal_2d
+@@convolutional_orthogonal_3d
 @@zero_initializer
 
 @@load_checkpoint
@@ -117,7 +119,9 @@ from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['nest']
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index f7a7119b34..a9b55854f1 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -613,10 +613,12 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
 
   def testShapesValues(self):
+    gain = 3.14
     for dtype in [dtypes.float32]:
       for kernel_size in [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]:
         tol = 1e-2
-        # Check orthogonality by computing the 2-norms of the inputs and outputs.
+        # Check orthogonality by computing ratio between
+        # the 2-norms of the inputs and outputs.
         if len(kernel_size) == 1:
           shape = [4, 32, 64]
           convolution = convolutional.conv1d
@@ -632,9 +634,10 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
             inputs, padding="same", filters=128,
             kernel_size=kernel_size, use_bias=False,
             kernel_initializer=init_ops.convolutional_delta_orthogonal(
-                gain=3.14))
+                gain=gain))
         outputs_shape = shape[0:-1] + [128]
         outputs_2norm = linalg_ops.norm(outputs)
+        ratio = outputs_2norm / inputs_2norm
         my_ops = variables.global_variables_initializer()
         with self.test_session(use_gpu=True) as sess:
           sess.run(my_ops)
@@ -642,10 +645,8 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
           t = outputs.eval()
           self.assertAllEqual(t.shape, outputs_shape)
           # Check isometry of the delta-orthogonal kernel.
-          self.assertAllClose(
-              sess.run(inputs_2norm)/np.sqrt(np.prod(shape)),
-              sess.run(outputs_2norm)/(np.sqrt(np.prod(shape))*np.sqrt(3.14)),
-              rtol=tol, atol=tol)
+          self.assertAllClose(sess.run(ratio), np.sqrt(gain),
+                              rtol=tol, atol=tol)
 
   def testNonuniformity(self):
     value = 0
@@ -653,7 +654,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
     shape = [3, 3, 10, 10]
     count = 70
     tol = 1e-5
-    with self.test_session(use_gpu=True):  # as sess:
+    with self.test_session(use_gpu=True):
       for i in range(count):
         x = variable_scope.get_variable("{}".format(i), shape=shape,
                                         initializer=
@@ -672,6 +673,120 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
 
+class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_1d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_1d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_1d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_1d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_1d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 10, 10]
+    count = 70
+    tol = 1e-5
+    with self.test_session(use_gpu=True):
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_orthogonal_1d)
+        x.initializer.run()
+        y = np.sum(x.eval(), axis=0)
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants.
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Pad input_ for computing (circular) convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+
+      beginning = kernel_size // 2
+      end = kernel_size - 1 - beginning
+
+      tmp_up = array_ops.slice(input_, [0, width - beginning, 0],
+                               [-1, beginning, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0], [-1, end, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      return tmp
+
+    cout = 64
+    shape = [10, 20, 32]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1], [2], [3], [4], [5], [6]]:
+      convolution = convolutional.conv1d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size[0], use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_1d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      ratio = outputs_2norm / inputs_2norm
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+
+
 class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
 
   def testInitializerIdentical(self):
@@ -722,17 +837,17 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       Returns:
         a tensor whose width is (width + kernel_size - 1).
       """
-      beg = kernel_size // 2
-      end = kernel_size - 1 - beg
+      beginning = kernel_size // 2
+      end = kernel_size - 1 - beginning
 
-      tmp_up = array_ops.slice(input_, [0, width - beg, 0, 0],
-                               [-1, beg, width, -1])
+      tmp_up = array_ops.slice(input_, [0, width - beginning, 0, 0],
+                               [-1, beginning, width, -1])
       tmp_down = array_ops.slice(input_, [0, 0, 0, 0], [-1, end, width, -1])
       tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
 
       new_width = width + kernel_size - 1
-      tmp_left = array_ops.slice(tmp, [0, 0, width - beg, 0],
-                                 [-1, new_width, beg, -1])
+      tmp_left = array_ops.slice(tmp, [0, 0, width - beginning, 0],
+                                 [-1, new_width, beginning, -1])
       tmp_right = array_ops.slice(tmp, [0, 0, 0, 0], [-1, new_width, end, -1])
 
       final = array_ops.concat([tmp_left, tmp, tmp_right], 2)
@@ -756,6 +871,132 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
           kernel_size=kernel_size, use_bias=False,
           kernel_initializer=init_ops.convolutional_orthogonal_2d(gain=gain))
       outputs_2norm = linalg_ops.norm(outputs)
+      ratio = outputs_2norm / inputs_2norm
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+
+
+class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_3d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_3d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_3d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_3d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 3, 3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 3, 3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_3d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 3, 3, 5, 5]
+    count = 20
+    tol = 1e-5
+    with self.test_session(use_gpu=True):
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_orthogonal_3d)
+        x.initializer.run()
+        y = np.sum(x.eval(), axis=(0, 1, 2))
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Padding input_ for computing circular convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+
+      beginning = kernel_size // 2
+      end = kernel_size - 1 - beginning
+
+      tmp_up = array_ops.slice(input_, [0, width - beginning, 0, 0, 0],
+                               [-1, beginning, -1, -1, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0, 0, 0],
+                                 [-1, end, -1, -1, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      tmp_left = array_ops.slice(tmp, [0, 0, width - beginning, 0, 0],
+                                 [-1, -1, beginning, -1, -1])
+      tmp_right = array_ops.slice(tmp, [0, 0, 0, 0, 0],
+                                  [-1, -1, end, -1, -1])
+      tmp = array_ops.concat([tmp_left, tmp, tmp_right], 2)
+
+      tmp_front = array_ops.slice(tmp, [0, 0, 0, width - beginning, 0],
+                                  [-1, -1, -1, beginning, -1])
+      tmp_back = array_ops.slice(tmp, [0, 0, 0, 0, 0], [-1, -1, -1, end, -1])
+      return array_ops.concat([tmp_front, tmp, tmp_back], 3)
+
+    cout = 32
+    shape = [1, 7, 7, 7, 16]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1, 1, 1], [2, 2, 2], [3, 3, 3]]:
+      convolution = convolutional.conv3d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size[0], use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_3d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
       with self.test_session(use_gpu=True) as sess:
         sess.run(my_ops)
@@ -763,10 +1004,7 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
         t = outputs.eval()
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(
-            sess.run(inputs_2norm)/np.sqrt(np.prod(shape)),
-            sess.run(outputs_2norm)/(np.sqrt(np.prod(shape))*np.sqrt(gain)),
-            rtol=tol, atol=tol)
+        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
 
 
 class IdentityInitializerTest(test.TestCase):
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 5ded3f7cc2..39b7295124 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -549,12 +549,11 @@ class ConvolutionDeltaOrthogonal(Initializer):
   tensor form an orthogonal matrix. Other pixels are set to be zero.
 
   Args:
-    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
-      for behavior.
+      @{tf.set_random_seed} for behavior.
     dtype: The data type.
   """
 
@@ -600,21 +599,17 @@ class ConvolutionDeltaOrthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
-class ConvolutionOrthogonal2D(Initializer):
-  """Initializer that generates a 2D orthogonal kernel for ConvNets.
+class ConvolutionOrthogonal(Initializer):
+  """Initializer that generates orthogonal kernel for ConvNets.
 
-  The shape of the tensor must have length 2. The number of input
-  filters must not exceed the number of output filters.
-  The orthogonality(==isometry) is exact when the inputs are circular padded.
-  There are finite-width effects with non-circular padding (e.g. zero padding).
+  Base class used to construct 1D, 2D and 3D orthogonal kernels for convolution.
 
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
-      for behavior.
+      @{tf.set_random_seed} for behavior.
     dtype: The data type.
   """
 
@@ -624,21 +619,7 @@ class ConvolutionOrthogonal2D(Initializer):
     self.seed = seed
 
   def __call__(self, shape, dtype=None, partition_info=None):
-    if dtype is None:
-      dtype = self.dtype
-    # Check the shape
-    if len(shape) != 4:
-      raise ValueError("The tensor to initialize must be four-dimensional")
-
-    if shape[-2] > shape[-1]:
-      raise ValueError("In_filters cannot be greater than out_filters.")
-
-    if shape[0] != shape[1]:
-      raise ValueError("Kernel sizes must be equal.")
-
-    kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
-    return kernel
+    raise NotImplementedError
 
   def get_config(self):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
@@ -648,9 +629,9 @@ class ConvolutionOrthogonal2D(Initializer):
     """Construct an n x n orthogonal matrix.
 
     Args:
-      n: dimension.
+      n: Dimension.
     Returns:
-      a n x n orthogonal matrix.
+      A n x n orthogonal matrix.
     """
     a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
     if self.seed:
@@ -665,9 +646,9 @@ class ConvolutionOrthogonal2D(Initializer):
     """Compute a n x n symmetric projection matrix.
 
     Args:
-      n: dimension.
+      n: Dimension.
     Returns:
-      a n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
+      A n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
     """
     q = self._orthogonal_matrix(n)
     # randomly zeroing out some columns
@@ -678,15 +659,49 @@ class ConvolutionOrthogonal2D(Initializer):
     c = math_ops.multiply(q, mask)
     return math_ops.matmul(c, array_ops.matrix_transpose(c))
 
+
+class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
+  """Initializer that generates a 2D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 4. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      This has the effect of scaling the output 2-norm by a factor of
+      `sqrt(gain)`.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    dtype: The data type.
+  """
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    if len(shape) != 4:
+      raise ValueError("The tensor to initialize must be four-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    if shape[0] != shape[1]:
+      raise ValueError("Kernel sizes must be equal.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
   def _dict_to_tensor(self, x, k1, k2):
     """Convert a dictionary to a tensor.
 
     Args:
-      x: a k1 * k2 dictionary.
-      k1: first dimension of x.
-      k2: second dimension of x.
+      x: A k1 * k2 dictionary.
+      k1: First dimension of x.
+      k2: Second dimension of x.
     Returns:
-      a k1 * k2 tensor.
+      A k1 * k2 tensor.
     """
 
     return array_ops.stack([array_ops.stack([x[i, j] for j in range(k2)])
@@ -696,13 +711,13 @@ class ConvolutionOrthogonal2D(Initializer):
     """Construct a 2 x 2 kernel. Used to construct orthgonal kernel.
 
     Args:
-      p1: a symmetric projection matrix
-      p2: a symmetric projection matrix
+      p1: A symmetric projection matrix.
+      p2: A symmetric projection matrix.
     Returns:
-      a 2 x 2 kernel [[p1p2,         p1(1-p2)],
+      A 2 x 2 kernel [[p1p2,         p1(1-p2)],
                       [(1-p1)p2, (1-p1)(1-p2)]].
     Raises:
-      ValueError: if the dimensions of p1 and p2 are different.
+      ValueError: If the dimensions of p1 and p2 are different.
     """
     if p1.shape.as_list() != p2.shape.as_list():
       raise ValueError("The dimension of the matrices must be the same.")
@@ -720,8 +735,8 @@ class ConvolutionOrthogonal2D(Initializer):
     """Matrix convolution.
 
     Args:
-      m1: is a k x k dictionary, each element is a n x n matrix.
-      m2: is a l x l dictionary, each element is a n x n matrix.
+      m1: A k x k dictionary, each element is a n x n matrix.
+      m2: A l x l dictionary, each element is a n x n matrix.
 
     Returns:
       (k + l - 1) * (k + l - 1) dictionary each element is a n x n matrix.
@@ -752,13 +767,13 @@ class ConvolutionOrthogonal2D(Initializer):
     """Construct orthogonal kernel for convolution.
 
     Args:
-      ksize: kernel size
-      cin: number of input channels
-      cout: number of output channels
+      ksize: Kernel size.
+      cin: Number of input channels.
+      cout: Number of output channels.
     Returns:
-      an [ksize, ksize, cin, cout] orthogonal kernel.
+      An [ksize, ksize, cin, cout] orthogonal kernel.
     Raises:
-      ValueError: if cin > cout.
+      ValueError: If cin > cout.
     """
     if cin > cout:
       raise ValueError("The number of input channels cannot exceed "
@@ -780,6 +795,273 @@ class ConvolutionOrthogonal2D(Initializer):
     return self._dict_to_tensor(p, ksize, ksize)
 
 
+class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
+  """Initializer that generates a 1D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 3. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed}
+      for behavior.
+    dtype: The data type.
+  """
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    if len(shape) != 3:
+      raise ValueError("The tensor to initialize must be three-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def _dict_to_tensor(self, x, k):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: A dictionary of length k.
+      k: Dimension of x.
+    Returns:
+      A tensor with the same dimension.
+    """
+
+    return array_ops.stack([x[i] for i in range(k)])
+
+  def _block_orth(self, projection_matrix):
+    """Construct a kernel. Used to construct orthgonal kernel.
+
+    Args:
+      projection_matrix: A symmetric projection matrix of size n x n.
+    Returns:
+      [projection_matrix, (1 - projection_matrix)].
+    """
+    n = projection_matrix.shape.as_list()[0]
+    kernel = {}
+    eye = linalg_ops.eye(n, dtype=self.dtype)
+    kernel[0] = projection_matrix
+    kernel[1] = eye - projection_matrix
+    return kernel
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: A dictionary of length k, each element is a n x n matrix.
+      m2: A dictionary of length l, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1)  dictionary each element is a n x n matrix.
+    Raises:
+      ValueError: Ff the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0]).shape.as_list()[0]
+    if n != (m2[0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = len(m1)
+    l = len(m2)
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      result[i] = array_ops.zeros([n, n], self.dtype)
+      for index in range(min(k, i + 1)):
+        if (i - index) < l:
+          result[i] += math_ops.matmul(m1[index], m2[i - index])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: Kernel size.
+      cin: Number of input channels.
+      cout: Number of output channels.
+    Returns:
+      An [ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: If cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(orth, 0)
+
+    p = self._block_orth(self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      p[i] = math_ops.matmul(orth, p[i])
+
+    return self._dict_to_tensor(p, ksize)
+
+
+class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
+  """Initializer that generates a 3D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 5. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    dtype: The data type.
+  """
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    if len(shape) != 5:
+      raise ValueError("The tensor to initialize must be five-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    if shape[0] != shape[1] or shape[0] != shape[2]:
+      raise ValueError("Kernel sizes must be equal.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def _dict_to_tensor(self, x, k1, k2, k3):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: A k1 * k2 dictionary.
+      k1: First dimension of x.
+      k2: Second dimension of x.
+      k3: Third dimension of x.
+    Returns:
+      A k1 * k2 * k3 tensor.
+    """
+
+    return array_ops.stack([array_ops.stack(
+        [array_ops.stack([x[i, j, k] for k in range(k3)])
+         for j in range(k2)]) for i in range(k1)])
+
+  def _block_orth(self, p1, p2, p3):
+    """Construct a 3 x 3 kernel. Used to construct orthgonal kernel.
+
+    Args:
+      p1: A symmetric projection matrix.
+      p2: A symmetric projection matrix.
+      p3: A symmetric projection matrix.
+    Returns:
+      A 2 x 2 x 2 kernel.
+    Raises:
+      ValueError: If the dimensions of p1, p2 and p3 are different.
+    """
+    p1_shape = p1.shape.as_list()
+    if p1_shape != p2.shape.as_list() or p1_shape != p3.shape.as_list():
+      raise ValueError("The dimension of the matrices must be the same.")
+    n = p1_shape[0]
+    eye = linalg_ops.eye(n, dtype=self.dtype)
+    kernel2x2x2 = {}
+    def matmul(p1, p2, p3):
+      return math_ops.matmul(math_ops.matmul(p1, p2), p3)
+    def cast(i, p):
+      """Return p or (1-p)."""
+      return i * p + (1-i) * (eye - p)
+    for i in [0, 1]:
+      for j in [0, 1]:
+        for k in [0, 1]:
+          kernel2x2x2[i, j, k] = matmul(cast(i, p1), cast(j, p2), cast(k, p3))
+    return kernel2x2x2
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: is a k x k x k  dictionary, each element is a n x n matrix.
+      m2: is a l x l x l dictionary, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1) x (k + l - 1) x (k + l - 1) dictionary each
+      element is a n x n matrix.
+    Raises:
+      ValueError: if the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0, 0, 0]).shape.as_list()[0]
+    if n != (m2[0, 0, 0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = int(np.cbrt(len(m1)))
+    l = int(np.cbrt(len(m2)))
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      for j in range(size):
+        for r in range(size):
+          result[i, j, r] = array_ops.zeros([n, n], self.dtype)
+          for index1 in range(min(k, i + 1)):
+            for index2 in range(min(k, j + 1)):
+              for index3 in range(min(k, r + 1)):
+                if (i - index1) < l and (j - index2) < l and (r - index3) < l:
+                  result[i, j, r] += math_ops.matmul(m1[index1, index2, index3],
+                                                     m2[i - index1, j - index2,
+                                                        r - index3])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: Kernel size.
+      cin: Number of input channels.
+      cout: Number of output channels.
+    Returns:
+      An [ksize, ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: If cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(
+          array_ops.expand_dims(
+              array_ops.expand_dims(orth, 0), 0), 0)
+
+    p = self._block_orth(self._symmetric_projection(cout),
+                         self._symmetric_projection(cout),
+                         self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout),
+                              self._symmetric_projection(cout),
+                              self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      for j in range(ksize):
+        for k in range(ksize):
+          p[i, j, k] = math_ops.matmul(orth, p[i, j, k])
+
+    return self._dict_to_tensor(p, ksize, ksize, ksize)
+
+
 @tf_export("keras.initializers.Identity", "initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
@@ -825,7 +1107,9 @@ variance_scaling_initializer = VarianceScaling
 orthogonal_initializer = Orthogonal
 identity_initializer = Identity
 convolutional_delta_orthogonal = ConvolutionDeltaOrthogonal
+convolutional_orthogonal_1d = ConvolutionOrthogonal1D
 convolutional_orthogonal_2d = ConvolutionOrthogonal2D
+convolutional_orthogonal_3d = ConvolutionOrthogonal3D
 # pylint: enable=invalid-name
 
 
-- 
GitLab


From 1093fe4075b77774af7e9a913d61cefda7abba96 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 21:00:02 -0700
Subject: [PATCH 0819/1262] Check there are no duplicate entries in sparse
 features as this would invalidate the example norm computation in SDCA.

PiperOrigin-RevId: 192861834
---
 .../python/kernel_tests/sdca_ops_test.py      | 36 +++++++++++++++++--
 tensorflow/core/kernels/sdca_internal.cc      | 36 ++++++++++++-------
 tensorflow/core/kernels/sdca_internal.h       |  2 +-
 3 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index ac50699f59..6e6c812adc 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -105,11 +105,13 @@ def make_example_dict(example_protos, example_weights):
 
 def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero):
   random.seed(1)
+
   sparse_features = [
       SparseFeatureColumn(
-          [int(i / num_non_zero) for i in range(num_examples * num_non_zero)],
-          [int(random.random() * dim) for _ in range(
-              num_examples * num_non_zero)],
+          [i for i in range(num_examples) for _ in range(num_non_zero)], [
+              i for _ in range(num_examples)
+              for i in random.sample(range(dim), num_non_zero)
+          ],
           [num_non_zero**(-0.5) for _ in range(num_examples * num_non_zero)])
   ]
   examples_dict = dict(
@@ -289,6 +291,34 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
       # It would be 0.01 without shuffling and 0.02 with adaptive sampling.
       self.assertNear(0.0, lr.approximate_duality_gap().eval(), err=1e-3)
 
+  def testSparseDuplicate(self):
+    # Setup test data
+    example_protos = [
+        make_example_proto({
+            'age': [0] * 5,
+            'gender': [0] * 5
+        }, 0),
+        make_example_proto({
+            'age': [1] * 5,
+            'gender': [1] * 5
+        }, 1),
+    ]
+    example_weights = [1.0, 1.0]
+    with self._single_threaded_test_session():
+      examples = make_example_dict(example_protos, example_weights)
+      variables = make_variable_dict(1, 1)
+      options = dict(
+          symmetric_l2_regularization=1,
+          symmetric_l1_regularization=0,
+          loss_type='logistic_loss')
+
+      lr = SdcaModel(examples, variables, options)
+      variables_lib.global_variables_initializer().run()
+      train_op = lr.minimize()
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   'Duplicate'):
+        train_op.run()
+
   def testDistributedSimple(self):
     # Setup test data
     example_protos = [
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 623de2a482..3e16ba8d04 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <random>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
@@ -368,9 +369,9 @@ Status Examples::Initialize(OpKernelContext* const context,
   TF_RETURN_IF_ERROR(CreateDenseFeatureRepresentation(
       worker_threads, num_examples, num_dense_features, weights,
       dense_features_inputs, &examples_));
-  ComputeSquaredNormPerExample(worker_threads, num_examples,
-                               num_sparse_features, num_dense_features,
-                               &examples_);
+  TF_RETURN_IF_ERROR(ComputeSquaredNormPerExample(
+      worker_threads, num_examples, num_sparse_features, num_dense_features,
+      &examples_));
   return Status::OK();
 }
 
@@ -382,7 +383,7 @@ Status Examples::CreateSparseFeatureRepresentation(
     const OpInputList& sparse_feature_values_inputs,
     std::vector<Example>* const examples) {
   mutex mu;
-  Status result GUARDED_BY(mu);
+  Status result;  // Guarded by mu
   auto parse_partition = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
@@ -460,7 +461,7 @@ Status Examples::CreateDenseFeatureRepresentation(
     const OpInputList& dense_features_inputs,
     std::vector<Example>* const examples) {
   mutex mu;
-  Status result GUARDED_BY(mu);
+  Status result;  // Guarded by mu
   auto parse_partition = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
@@ -486,14 +487,17 @@ Status Examples::CreateDenseFeatureRepresentation(
   return result;
 }
 
-void Examples::ComputeSquaredNormPerExample(
+Status Examples::ComputeSquaredNormPerExample(
     const DeviceBase::CpuWorkerThreads& worker_threads, const int num_examples,
     const int num_sparse_features, const int num_dense_features,
     std::vector<Example>* const examples) {
+  mutex mu;
+  Status result;  // Guarded by mu
   // Compute norm of examples.
   auto compute_example_norm = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
+    gtl::FlatSet<int64> previous_indices;
     for (int example_id = static_cast<int>(begin); example_id < end;
          ++example_id) {
       double squared_norm = 0;
@@ -501,12 +505,19 @@ void Examples::ComputeSquaredNormPerExample(
       for (int j = 0; j < num_sparse_features; ++j) {
         const Example::SparseFeatures& sparse_features =
             example->sparse_features_[j];
-        if (sparse_features.values) {
-          const Eigen::Tensor<float, 0, Eigen::RowMajor> sn =
-              sparse_features.values->square().sum();
-          squared_norm += sn();
-        } else {
-          squared_norm += sparse_features.indices->size();
+        previous_indices.clear();
+        for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
+          const int64 feature_index = (*sparse_features.indices)(k);
+          if (previous_indices.insert(feature_index).second == false) {
+            mutex_lock l(mu);
+            result =
+                errors::InvalidArgument("Duplicate index in sparse vector.");
+            return;
+          }
+          const double feature_value = sparse_features.values == nullptr
+                                           ? 1.0
+                                           : (*sparse_features.values)(k);
+          squared_norm += feature_value * feature_value;
         }
       }
       for (int j = 0; j < num_dense_features; ++j) {
@@ -521,6 +532,7 @@ void Examples::ComputeSquaredNormPerExample(
   const int64 kCostPerUnit = num_dense_features + num_sparse_features;
   Shard(worker_threads.num_threads, worker_threads.workers, num_examples,
         kCostPerUnit, compute_example_norm);
+  return result;
 }
 
 }  // namespace sdca
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index bfdb3febdc..897c488702 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -369,7 +369,7 @@ class Examples {
 
   // Computes squared example norm per example i.e |x|^2. This function modifies
   // the |examples| passed in and adds the squared norm per example.
-  static void ComputeSquaredNormPerExample(
+  static Status ComputeSquaredNormPerExample(
       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
       int num_sparse_features, int num_dense_features,
       std::vector<Example>* const examples);
-- 
GitLab


From cc9a8f789a4d224a3e73737fa6c921676441a6c8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 21:09:37 -0700
Subject: [PATCH 0820/1262] Upgrade gRPC version used in OSS Tensorflow

PiperOrigin-RevId: 192862541
---
 tensorflow/contrib/cmake/external/grpc.cmake              | 2 +-
 .../distributed_runtime/rpc/grpc_worker_service_impl.h    | 2 +-
 tensorflow/workspace.bzl                                  | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 35c2a294ec..693dc7cd67 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 09386db3939cae1ac12e5f09b735adfa8958c68e)
+set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 62b299d5c2..0abac4f3c7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -35,7 +35,7 @@ class GrpcByteSource : public TensorResponse::Source {
   explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
   ~GrpcByteSource() override { DeleteStream(); }
 
-  typedef ::grpc::GrpcProtoBufferReader Reader;
+  typedef ::grpc::ProtoBufferReader Reader;
 
   protobuf::io::ZeroCopyInputStream* contents() override {
     DeleteStream();
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f775491e4a..79730f591f 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -427,11 +427,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
-          "https://github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/d184fa229d75d336aedea0041bd59cb93e7e267f.tar.gz",
+          "https://github.com/grpc/grpc/archive/d184fa229d75d336aedea0041bd59cb93e7e267f.tar.gz",
       ],
-      sha256 = "b857969c667c14f37faa507afc07a3f39a47fbf73203be889d55925622e7b317",
-      strip_prefix = "grpc-09386db3939cae1ac12e5f09b735adfa8958c68e",
+      sha256 = "895b31310e718a61f7335759a778c068a6edde1c089883598a0830cbb7075673",
+      strip_prefix = "grpc-d184fa229d75d336aedea0041bd59cb93e7e267f",
   )
 
 
-- 
GitLab


From 6a581e1d7c28f5b8f487f2a91649d7e2866974f4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 21:15:59 -0700
Subject: [PATCH 0821/1262] [XLA] Use pattern matcher in algebraic simplifier

PiperOrigin-RevId: 192862841
---
 tensorflow/compiler/xla/service/BUILD         |   1 +
 .../xla/service/algebraic_simplifier.cc       | 226 +++++++++---------
 2 files changed, 108 insertions(+), 119 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index ddc099807d..9831a09c1f 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1283,6 +1283,7 @@ cc_library(
         ":hlo_creation_utils",
         ":hlo_pass",
         ":hlo_query",
+        ":pattern_matcher",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 6cb1bd5669..cd5737e4f9 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -44,8 +45,11 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
+
 namespace {
 
+namespace m = match;
+
 // Returns whether operand is a literal with the given value.
 bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
   return operand->opcode() == HloOpcode::kConstant &&
@@ -105,6 +109,7 @@ HloComputation* CreateScalarBinaryComputation(HloModule* module,
       module->AddEmbeddedComputation(b.Build(scalar_op));
   return scalar_computation;
 }
+
 }  // namespace
 
 // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain
@@ -350,8 +355,9 @@ bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape(
 }
 
 Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
-  auto lhs = add->mutable_operand(0);
-  auto rhs = add->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(add, m::Add(m::Op(&lhs), m::Op(&rhs))));
+
   // A + 0 => A
   VLOG(10) << "trying transform [A + 0 => A]: " << add->ToString();
   if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) {
@@ -366,7 +372,7 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
   // Canonicalization: Put constants on the right.  This makes the reassociation
   // rules below simpler.
   VLOG(10) << "trying transform [Const + A => A + Const]";
-  if (lhs->IsConstant() && !rhs->IsConstant()) {
+  if (Match(add, m::Add(m::Constant(), m::NonConstant()))) {
     return ReplaceWithNewInstruction(
         add,
         HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd, rhs, lhs));
@@ -379,16 +385,13 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
   //   (A + C1) + (B + C2) =>  A + B + (C1 + C2).
   //
   VLOG(10) << "trying transform [(A + C1) + C2 => A + (C1 + C2)]";
-  if (rhs->IsConstant() && lhs->opcode() == HloOpcode::kAdd &&
-      !lhs->operand(0)->IsConstant() && lhs->operand(1)->IsConstant()) {
-    auto* c1 = lhs->mutable_operand(1);
-    auto* c2 = rhs;
-
+  HloInstruction *a, *c1, *c2;
+  if (Match(add, m::Add(m::Add(m::NonConstant(&a), m::Constant(&c1)),
+                        m::Constant(&c2)))) {
     TF_ASSIGN_OR_RETURN(auto* sum_of_constants,
                         MakeBinaryHlo(HloOpcode::kAdd, c1, c2));
     return ReplaceWithNewInstruction(
-        add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd,
-                                          lhs->mutable_operand(0),
+        add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd, a,
                                           sum_of_constants));
   }
 
@@ -397,11 +400,11 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
 
 Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) {
   // If a bitcast feeds a bitcast, make it a single bitcast.
-  if (bitcast->operand(0)->opcode() == HloOpcode::kBitcast) {
+  HloInstruction* op;
+  if (Match(bitcast, m::Bitcast(m::Bitcast(m::Op(&op))))) {
     return ReplaceWithNewInstruction(
-        bitcast, HloInstruction::CreateUnary(
-                     bitcast->shape(), HloOpcode::kBitcast,
-                     bitcast->mutable_operand(0)->mutable_operand(0)));
+        bitcast,
+        HloInstruction::CreateUnary(bitcast->shape(), HloOpcode::kBitcast, op));
   }
   // All bitcasts can be eliminated (assuming layout constraints are
   // satisified).
@@ -418,11 +421,10 @@ Status AlgebraicSimplifierVisitor::HandleBitcastConvert(
 
 Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
   // If a copy feeds a copy, make it a single copy.
-  if (copy->operand(0)->opcode() == HloOpcode::kCopy) {
+  HloInstruction* op;
+  if (Match(copy, m::Copy(m::Copy(m::Op(&op))))) {
     return ReplaceWithNewInstruction(
-        copy, HloInstruction::CreateUnary(
-                  copy->shape(), HloOpcode::kCopy,
-                  copy->mutable_operand(0)->mutable_operand(0)));
+        copy, HloInstruction::CreateUnary(copy->shape(), HloOpcode::kCopy, op));
   }
   // All copies can be eliminated (assuming layout constraints are satisified).
   ReplaceInstructionIfSameShape(copy, copy->mutable_operand(0));
@@ -462,12 +464,10 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
   } else if (operands.size() == 2) {
     // A binary concat with a broadcasted scalar as an operand can be converted
     // into a pad which is simpler to fold into other operations.
-    bool is_effective_low_pad =
-        operands[0]->opcode() == HloOpcode::kBroadcast &&
-        ShapeUtil::IsScalar(operands[0]->operand(0)->shape());
-    bool is_effective_high_pad =
-        operands[1]->opcode() == HloOpcode::kBroadcast &&
-        ShapeUtil::IsScalar(operands[1]->operand(0)->shape());
+    bool is_effective_low_pad = Match(
+        operands[0], m::Broadcast(m::Op().WithShape(m::Shape().IsScalar())));
+    bool is_effective_high_pad = Match(
+        operands[1], m::Broadcast(m::Op().WithShape(m::Shape().IsScalar())));
     if (!is_effective_low_pad && !is_effective_high_pad) {
       return Status::OK();
     }
@@ -537,8 +537,8 @@ Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
 }
 
 Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
-  auto lhs = sub->mutable_operand(0);
-  auto rhs = sub->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(sub, m::Subtract(m::Op(&lhs), m::Op(&rhs))));
   // A - 0 => A
   VLOG(10) << "trying transform [A - 0 => A]: " << sub->ToString();
   if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
@@ -547,7 +547,7 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
 
   // Canonicalize subtraction of a constant to addition.
   VLOG(10) << "trying transform [A - Const => A + (-Const)]";
-  if (rhs->IsConstant() && !lhs->IsConstant()) {
+  if (Match(sub, m::Subtract(m::NonConstant(&lhs), m::Constant(&rhs)))) {
     HloInstruction* negative_const = computation_->AddInstruction(
         HloInstruction::CreateUnary(rhs->shape(), HloOpcode::kNegate, rhs));
     return ReplaceWithNewInstruction(
@@ -559,56 +559,53 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
 }
 
 Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
-  auto lhs = divide->mutable_operand(0);
-  auto rhs = divide->mutable_operand(1);
+  Shape* shape;
+  HloInstruction *a, *b, *c, *d;
+  CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
   // A/1 => A
   VLOG(10) << "trying transform [A/1 => A]: " << divide->ToString();
-  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(divide, lhs)) {
+  if (IsAll(b, 1) && ReplaceInstructionIfSameShape(divide, a)) {
     return Status::OK();
   }
 
   // exp(A)/exp(B) => exp(A-B)
-  if (lhs->opcode() == HloOpcode::kExp && rhs->opcode() == HloOpcode::kExp) {
+  if (Match(divide, m::Divide(m::Exp(m::Op(&a)), m::Exp(m::Op(&b)))
+                        .WithShape(m::Shape(&shape)))) {
     VLOG(10) << "transform [exp(A)/exp(B) => exp(A-B)]: " << divide->ToString();
-    HloInstruction* subtract =
-        computation_->AddInstruction(HloInstruction::CreateBinary(
-            divide->shape(), HloOpcode::kSubtract, lhs->mutable_operand(0),
-            rhs->mutable_operand(0)));
+    HloInstruction* subtract = computation_->AddInstruction(
+        HloInstruction::CreateBinary(*shape, HloOpcode::kSubtract, a, b));
     return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp,
-                                            subtract));
+        divide, HloInstruction::CreateUnary(*shape, HloOpcode::kExp, subtract));
   }
 
   // A/exp(B) => A*exp(-B)
-  if (rhs->opcode() == HloOpcode::kExp) {
+  if (Match(divide, m::Divide(m::Op(&a), m::Exp(m::Op(&b))))) {
     VLOG(10) << "transform [A/exp(B) => A*exp(-B)]: " << divide->ToString();
-    HloInstruction* negate =
-        computation_->AddInstruction(HloInstruction::CreateUnary(
-            divide->shape(), HloOpcode::kNegate, rhs->mutable_operand(0)));
+    HloInstruction* negate = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kNegate, b));
     HloInstruction* new_exp = computation_->AddInstruction(
         HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp, negate));
     return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kMultiply, lhs, new_exp));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kMultiply, a, new_exp));
   }
 
   // A/pow(B,C) => A*pow(B,-C)
-  if (rhs->opcode() == HloOpcode::kPower) {
+  if (Match(divide, m::Divide(m::Op(&a), m::Power(m::Op(&b), m::Op(&c))))) {
     VLOG(10) << "transform [A/pow(B,C) => A*pow(B,-C)]: " << divide->ToString();
     // The output shape of the created negate operator should be the same as the
     // input.
-    const Shape& negate_shape = rhs->operand(1)->shape();
-    HloInstruction* negate =
-        computation_->AddInstruction(HloInstruction::CreateUnary(
-            negate_shape, HloOpcode::kNegate, rhs->mutable_operand(1)));
+    const Shape& negate_shape = c->shape();
+    HloInstruction* negate = computation_->AddInstruction(
+        HloInstruction::CreateUnary(negate_shape, HloOpcode::kNegate, c));
     // And the power operator should retain the output shape of the old one.
-    const Shape& new_power_shape = rhs->shape();
-    HloInstruction* new_power = computation_->AddInstruction(
-        HloInstruction::CreateBinary(new_power_shape, HloOpcode::kPower,
-                                     rhs->mutable_operand(0), negate));
+    const Shape& new_power_shape = b->shape();
+    HloInstruction* new_power =
+        computation_->AddInstruction(HloInstruction::CreateBinary(
+            new_power_shape, HloOpcode::kPower, b, negate));
     return ReplaceWithNewInstruction(
         divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kMultiply, lhs, new_power));
+                    divide->shape(), HloOpcode::kMultiply, a, new_power));
   }
 
   // Simplifying integral division would produce unexpected results.
@@ -620,28 +617,24 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   //
   // (Backends can do this transformation, but generally only if the constant is
   // a scalar.)
-  if (lhs->opcode() != HloOpcode::kConstant &&
-      rhs->opcode() == HloOpcode::kConstant) {
+  if (Match(divide, m::Divide(m::NonConstant(&a), m::Constant(&b)))) {
     HloInstruction* one =
         computation_->AddInstruction(HloInstruction::CreateConstant(
-            Literal::One(lhs->shape().element_type()).CloneToUnique()));
-    HloInstruction* inverse =
-        computation_->AddInstruction(HloInstruction::CreateBinary(
-            rhs->shape(), HloOpcode::kDivide, one, rhs));
+            Literal::One(a->shape().element_type()).CloneToUnique()));
+    HloInstruction* inverse = computation_->AddInstruction(
+        HloInstruction::CreateBinary(b->shape(), HloOpcode::kDivide, one, b));
     return ReplaceWithNewInstruction(
-        divide, HloInstruction::CreateBinary(
-                    divide->shape(), HloOpcode::kMultiply, lhs, inverse));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kMultiply, a, inverse));
   }
 
   // (A / B) / (C / D)  =>  (A / B)*(D / C) => (A * D) / (B * C)
-  if (lhs->opcode() == HloOpcode::kDivide &&
-      rhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(auto a_times_d, MakeBinaryHlo(HloOpcode::kMultiply,
-                                                      lhs->mutable_operand(0),
-                                                      rhs->mutable_operand(1)));
-    TF_ASSIGN_OR_RETURN(auto b_times_c, MakeBinaryHlo(HloOpcode::kMultiply,
-                                                      lhs->mutable_operand(1),
-                                                      rhs->mutable_operand(0)));
+  if (Match(divide, m::Divide(m::Divide(m::Op(&a), m::Op(&b)),
+                              m::Divide(m::Op(&c), m::Op(&d))))) {
+    TF_ASSIGN_OR_RETURN(auto a_times_d,
+                        MakeBinaryHlo(HloOpcode::kMultiply, a, d));
+    TF_ASSIGN_OR_RETURN(auto b_times_c,
+                        MakeBinaryHlo(HloOpcode::kMultiply, b, c));
     TF_ASSIGN_OR_RETURN(auto new_divide, MakeBinaryHlo(HloOpcode::kDivide,
                                                        a_times_d, b_times_c));
 
@@ -649,24 +642,21 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   }
 
   // (A / B) / C => A / (B * C)
-  if (lhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(
-        auto b_times_c,
-        MakeBinaryHlo(HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
+  if (Match(divide, m::Divide(m::Divide(m::Op(&a), m::Op(&b)), m::Op(&c)))) {
+    TF_ASSIGN_OR_RETURN(auto b_times_c,
+                        MakeBinaryHlo(HloOpcode::kMultiply, b, c));
     return ReplaceWithNewInstruction(
-        divide,
-        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
-                                     lhs->mutable_operand(0), b_times_c));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kDivide, a, b_times_c));
   }
 
   // A / (B / C) => (A*C) / B
-  if (rhs->opcode() == HloOpcode::kDivide) {
-    TF_ASSIGN_OR_RETURN(auto a_times_c, MakeBinaryHlo(HloOpcode::kMultiply, lhs,
-                                                      rhs->mutable_operand(1)));
+  if (Match(divide, m::Divide(m::Op(&a), m::Divide(m::Op(&b), m::Op(&c))))) {
+    TF_ASSIGN_OR_RETURN(auto a_times_c,
+                        MakeBinaryHlo(HloOpcode::kMultiply, a, c));
     return ReplaceWithNewInstruction(
-        divide,
-        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
-                                     a_times_c, rhs->mutable_operand(0)));
+        divide, HloInstruction::CreateBinary(divide->shape(),
+                                             HloOpcode::kDivide, a_times_c, b));
   }
 
   return Status::OK();
@@ -674,8 +664,8 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
 
 StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     HloInstruction* dot) {
-  HloInstruction* lhs = dot->mutable_operand(0);
-  HloInstruction* rhs = dot->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
   int64 lhs_collapsing_dim =
       dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
   if (lhs->IsRank2Transpose()) {
@@ -792,8 +782,8 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcat(
 
   const int64 lhs_contracting_dim = dnums.lhs_contracting_dimensions(0);
   const int64 rhs_contracting_dim = dnums.rhs_contracting_dimensions(0);
-  HloInstruction* lhs = dot->mutable_operand(0);
-  HloInstruction* rhs = dot->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * optimized_lhs_concat,
@@ -923,8 +913,8 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
 }
 
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
-  auto lhs = dot->mutable_operand(0);
-  auto rhs = dot->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
 
   // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
   // below.
@@ -976,8 +966,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
 }
 
 Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
-  auto lhs = multiply->mutable_operand(0);
-  auto rhs = multiply->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(multiply, m::Multiply(m::Op(&lhs), m::Op(&rhs))));
   // A*1 => A
   VLOG(10) << "trying transform [A*1 => A]: " << multiply->ToString();
   if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(multiply, lhs)) {
@@ -990,10 +980,9 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
   }
 
   // exp(A) * exp(B) => exp(A+B)
-  if (lhs->opcode() == HloOpcode::kExp && rhs->opcode() == HloOpcode::kExp) {
+  if (Match(multiply, m::Multiply(m::Exp(m::Op(&lhs)), m::Exp(m::Op(&rhs))))) {
     auto add = computation_->AddInstruction(HloInstruction::CreateBinary(
-        multiply->shape(), HloOpcode::kAdd, lhs->mutable_operand(0),
-        rhs->mutable_operand(0)));
+        multiply->shape(), HloOpcode::kAdd, lhs, rhs));
     return ReplaceWithNewInstruction(
         multiply,
         HloInstruction::CreateUnary(multiply->shape(), HloOpcode::kExp, add));
@@ -1004,20 +993,19 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
 Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) {
   // ln(exp(A)) => A
   VLOG(10) << "trying transform [ln(exp(A)) => A]: " << log->ToString();
-  auto operand = log->mutable_operand(0);
-  if (operand->opcode() == HloOpcode::kExp &&
-      ReplaceInstructionIfSameShape(log, operand->mutable_operand(0))) {
+  HloInstruction *a, *b;
+  if (Match(log, m::Log(m::Exp(m::Op(&a)))) &&
+      ReplaceInstructionIfSameShape(log, a)) {
     return Status::OK();
   }
 
   // ln(pow(A,B)) => B*ln(A)
-  if (operand->opcode() == HloOpcode::kPower) {
-    auto new_log = computation_->AddInstruction(HloInstruction::CreateUnary(
-        log->shape(), HloOpcode::kLog, operand->mutable_operand(0)));
+  if (Match(log, m::Log(m::Power(m::Op(&a), m::Op(&b))))) {
+    auto new_log = computation_->AddInstruction(
+        HloInstruction::CreateUnary(log->shape(), HloOpcode::kLog, a));
     return ReplaceWithNewInstruction(
-        log,
-        HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply,
-                                     new_log, operand->mutable_operand(1)));
+        log, HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply,
+                                          new_log, b));
   }
 
   return Status::OK();
@@ -1120,7 +1108,8 @@ bool OutputIsSubsetOfOperandElements(HloInstruction* instruction,
 }  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
-  auto operand = broadcast->mutable_operand(0);
+  HloInstruction* operand;
+  CHECK(Match(broadcast, m::Broadcast(m::Op(&operand))));
   auto dims = broadcast->dimensions();
   // A degenerate broadcast of a reshape that does not change the number of
   // elements can be replaced by a reshape.
@@ -1231,30 +1220,28 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert) {
 
 // Complex(Real(c), Imag(c)) -> c
 Status AlgebraicSimplifierVisitor::HandleComplex(HloInstruction* complex) {
-  auto real = complex->mutable_operand(0);
-  auto imag = complex->mutable_operand(1);
-  if (real->opcode() == HloOpcode::kReal &&
-      imag->opcode() == HloOpcode::kImag &&
-      real->operand(0) == imag->operand(0)) {
-    return ReplaceInstruction(complex, real->mutable_operand(0));
+  HloInstruction *c0, *c1;
+  if (Match(complex, m::Complex(m::Real(m::Op(&c0)), m::Imag(m::Op(&c1)))) &&
+      c0 == c1) {
+    return ReplaceInstruction(complex, c0);
   }
   return Status::OK();
 }
 
 // Real(Complex(r, i)) -> r
 Status AlgebraicSimplifierVisitor::HandleReal(HloInstruction* real) {
-  auto operand = real->mutable_operand(0);
-  if (operand->opcode() == HloOpcode::kComplex) {
-    return ReplaceInstruction(real, operand->mutable_operand(0));
+  HloInstruction* op;
+  if (Match(real, m::Real(m::Complex(m::Op(&op), m::Op())))) {
+    return ReplaceInstruction(real, op);
   }
   return Status::OK();
 }
 
 // Imag(Complex(r, i)) -> i
 Status AlgebraicSimplifierVisitor::HandleImag(HloInstruction* imag) {
-  auto operand = imag->mutable_operand(0);
-  if (operand->opcode() == HloOpcode::kComplex) {
-    return ReplaceInstruction(imag, operand->mutable_operand(1));
+  HloInstruction* op;
+  if (Match(imag, m::Imag(m::Complex(m::Op(), m::Op(&op))))) {
+    return ReplaceInstruction(imag, op);
   }
   return Status::OK();
 }
@@ -1351,8 +1338,8 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
 
 Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   VLOG(10) << "trying transform [pow(A, 0) => 1]: " << power->ToString();
-  auto lhs = power->mutable_operand(0);
-  auto rhs = power->mutable_operand(1);
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(power, m::Power(m::Op(&lhs), m::Op(&rhs))));
   if (IsAll(rhs, 0)) {
     auto one = HloInstruction::CreateConstant(
         Literal::One(power->shape().element_type()).CloneToUnique());
@@ -1372,9 +1359,10 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   }
 
   // pow(exp(A),B) => exp(A*B)
-  if (lhs->opcode() == HloOpcode::kExp) {
+  HloInstruction *a, *b;
+  if (Match(power, m::Power(m::Exp(m::Op(&a)), m::Op(&b)))) {
     auto a_times_b = computation_->AddInstruction(HloInstruction::CreateBinary(
-        power->shape(), HloOpcode::kMultiply, lhs->operands()[0], rhs));
+        power->shape(), HloOpcode::kMultiply, a, b));
     return ReplaceWithNewInstruction(
         power, HloInstruction::CreateUnary(power->shape(), HloOpcode::kExp,
                                            a_times_b));
-- 
GitLab


From c364e0cbbbc8bc931011396da1e22315a10e5e46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 14 Apr 2018 13:23:59 +0800
Subject: [PATCH 0822/1262] BLD: upgrade with eager.contex API

---
 tensorflow/contrib/opt/python/training/adamax.py   |  6 +++---
 .../contrib/opt/python/training/adamax_test.py     | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 4692f88349..686bac0d84 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -87,10 +87,10 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
                                           epsilon, use_locking, name)
 
   def _get_beta_accumulators(self):
-    if context.in_graph_mode():
-      graph = ops.get_default_graph()
-    else:
+    if context.executing_eagerly():
       graph = None
+    else:
+      graph = ops.get_default_graph()
     return self._get_non_slot_variable("beta1_power", graph=graph)
 
   def _create_slots(self, var_list):
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index ccd08c0934..bc92a7006f 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -202,7 +202,7 @@ class AdaMaxOptimizerTest(test.TestCase):
           # Shouldn't return non-slot variables from other graphs.
           self.assertEqual(0, len(opt.variables()))
 
-        if context.in_graph_mode():
+        if not context.executing_eagerly():
           self.evaluate(variables.global_variables_initializer())
           # Fetch params to validate initial values
           self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -212,7 +212,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
-          if context.in_graph_mode():
+          if not context.executing_eagerly():
             self.evaluate(update)
           elif t > 1:
             opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
@@ -333,6 +333,16 @@ class AdaMaxOptimizerTest(test.TestCase):
         # fails.
         optimizer.apply_gradients([(grads0, var0)])
 
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.AdaMaxOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(5, len(set(opt.variables())))
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 945efa4222a66977c03638086773c369c16d5c61 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 14 Apr 2018 01:22:59 -0700
Subject: [PATCH 0823/1262] Make sure that same nodes are not optimized as part
 of multiple groups.

Replace recusrsion with iteration in AbsorbInputByOptimizedNodesGroup.

PiperOrigin-RevId: 192874364
---
 .../optimizers/arithmetic_optimizer.cc        | 145 +++++++++++-------
 .../optimizers/arithmetic_optimizer_test.cc   |   9 +-
 .../optimizers/graph_optimizer_stage.h        |   8 +
 3 files changed, 99 insertions(+), 63 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index b80ae5fa40..232132e1e8 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -260,7 +260,7 @@ NodeDef* GetTailOfValuePreservingChain(
                         is_value_preserving_non_branching);
 }
 
-// Graph optimizer context extension specific to ArithmeticOptimizer
+// Graph optimizer context extension specific to ArithmeticOptimizer.
 struct ArithmeticOptimizerContext {
   explicit ArithmeticOptimizerContext(SetVector<NodeDef*>* nodes_to_simplify)
       : nodes_to_simplify(nodes_to_simplify) {}
@@ -365,27 +365,37 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
 
   // Check if input can become a part of current optimized nodes group.
   virtual bool IsAbsorbableByOptimizedNodesGroup(
-      const OptimizedNodesGroup& group, const string& input) const = 0;
+      const OptimizedNodesGroup& group, const NodeDef& node) const = 0;
 
   Status AbsorbInputByOptimizedNodesGroup(const string& input,
                                           OptimizedNodesGroup* group) const {
-    NodeDef* node;
-    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
-
-    if (IsAbsorbableByOptimizedNodesGroup(*group, input)) {
-      for (int i = 0; i < node->input_size(); ++i) {
-        const string& input_i = node->input(i);
-        if (!IsControlInput(input)) {
-          TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+    std::deque<const string*> input_tensors;
+    input_tensors.push_front(&input);
+
+    while (!input_tensors.empty()) {
+      const string* input_tensor = input_tensors.front();
+      input_tensors.pop_front();
+
+      // Get a node for the input tensor.
+      NodeDef* input_node;
+      TF_RETURN_IF_ERROR(GetInputNode(*input_tensor, &input_node));
+
+      if (IsAbsorbableByOptimizedNodesGroup(*group, *input_node)) {
+        group->optimized_nodes.push_back(input_node);
+        for (int i = input_node->input_size() - 1; i >= 0; --i) {
+          const string& absorbed_node_input = input_node->input(i);
+          // TODO(ezhulenev): support control inputs
+          if (IsControlInput(absorbed_node_input)) continue;
+          input_tensors.push_front(&absorbed_node_input);
         }
+      } else {
+        // If input node can't be absorbed, add it to OptimizedNodesGroup input.
+        OpInfo::TensorProperties properties;
+        TF_RETURN_IF_ERROR(GetTensorProperties(*input_tensor, &properties));
+        group->inputs.emplace_back(*input_tensor, properties.shape());
       }
-      group->optimized_nodes.push_back(node);
-    } else {
-      // If node can't be absorbed, add it to OptimizedNodesGroup input
-      OpInfo::TensorProperties properties;
-      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
-      group->inputs.emplace_back(input, properties.shape());
     }
+
     return Status::OK();
   }
 
@@ -401,9 +411,9 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
     group->optimized_nodes.reserve(root_node->input_size());
     for (int i = 0; i < root_node->input_size(); ++i) {
       const string& input_i = root_node->input(i);
-      if (!IsControlInput(input_i)) {
-        TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
-      }
+      // TODO(ezhulenev): add support for control inputs
+      if (IsControlInput(input_i)) continue;
+      TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
     }
 
     return Status::OK();
@@ -455,6 +465,11 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
     optimized_nodes_.insert(node->name());
   }
 
+  void AddAllMembersToOptimizedNodes(const OptimizedNodesGroup& group) {
+    AddToOptimizedNodes(group.root_node);
+    for (const NodeDef* opt : group.optimized_nodes) AddToOptimizedNodes(opt);
+  }
+
   bool IsOnTheSameDevice(const OptimizedNodesGroup& group,
                          const NodeDef& node) const {
     return group.root_node->device() == node.device();
@@ -510,7 +525,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
 
   // Check if a node can become a root of AddOpsGroup
   bool IsSupported(const NodeDef* node) const override {
-    if (!CanOptimize(node)) return false;
+    if (!CanOptimize(*node)) return false;
 
     // shape must be symbolically defined and all inputs compatible with it
     OpInfo::TensorProperties properties;
@@ -522,59 +537,69 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
  protected:
   // Check if a node can be absorbed by current OptimizedNodesGroup
   bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
-                                         const string& input) const override {
-    NodeDef* node;
-    Status node_status = GetInputNode(input, &node);
-    if (!node_status.ok() || !CanOptimize(node)) return false;
+                                         const NodeDef& node) const override {
+    if (!CanOptimize(node)) return false;
 
-    if (!IsOnTheSameDevice(group, *node)) {
+    if (!IsOnTheSameDevice(group, node)) {
       return false;
     }
     // with a single output data consumer (presumably if we reach this node from
     // previously absorbed or a root node, it means that this node is not used
     // as an input to any other op, outside of the group)
-    if (NumNonControlDataOutputs(*node, *ctx_.node_map) != 1) {
+    if (NumNonControlDataOutputs(node, *ctx_.node_map) != 1) {
       return false;
     }
     // All input shapes must be broadcastable to the node shape
     OpInfo::TensorProperties properties;
-    Status has_properties = GetTensorProperties(input, &properties);
+    Status has_properties = GetTensorProperties(node.name(), &properties);
     return has_properties.ok() &&
-           HasAllInputsBroadcastableToShape(*node, properties);
+           HasAllInputsBroadcastableToShape(node, properties);
   }
 
   // Node requirements both for a root node and an absorbed node
-  bool CanOptimize(const NodeDef* node) const {
+  bool CanOptimize(const NodeDef& node) const {
     // TODO(ezhulenev): check if AccumulateNV2 can be supported too
-    if (!IsAdd(*node) && !IsAddN(*node)) {
-      return false;
-    }
-    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
+    if (!IsAdd(node) && !IsAddN(node)) {
       return false;
     }
-    // it must not be created by this stage at any of previous optimization runs
-    if (str_util::StrContains(node->name(), stage_name_)) {
+    if (IsInPreserveSet(node) || IsAlreadyOptimized(node)) {
       return false;
     }
     // TODO(ezhulenev): relax this condition for root node
-    return !(IsDrivenByControlDependency(*node) ||
-             DrivesControlDependency(*node));
+    return !(IsDrivenByControlDependency(node) ||
+             DrivesControlDependency(node));
   }
 
   // Rewrite a group of add ops into a single AddN if all input shapes are
   // symbolically equal. If not, create AddN for equal shapes first, and then
   // build an Add tree, minimizing the cost of broadcasts.
   string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
-    // all new nodes will be placed under the scope of a root node
+    VLOG(2) << "Collapse Add/AddN: root=" << group.root_node->name()
+            << " op=" << group.root_node->op()
+            << " num_optimized_nodes=" << group.optimized_nodes.size()
+            << " num_inputs=" << group.inputs.size();
+
+    // Do not optimize any of the nodes that are part of this group.
+    AddAllMembersToOptimizedNodes(group);
+
+    // All new nodes will be placed under the scope of a root node.
     auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
 
-    // Find what shapes are present in the inputs of absorbed nodes
+    // Find what shapes are present in the inputs of absorbed nodes.
     std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
     for (const auto& input : group.inputs) {
       shape_sig_to_inputs[ShapeSignature(input.shape)].push_back(input);
     }
 
-    // Collect all the shapes from representative elements
+    using SigKV = decltype(shape_sig_to_inputs)::value_type;
+    VLOG(3) << "Add/AddN group has " << shape_sig_to_inputs.size()
+            << " unique shapes: "
+            << str_util::Join(shape_sig_to_inputs, ", ",
+                              [](string* out, SigKV p) {
+                                strings::StrAppend(out, p.first);
+                              });
+
+    // Collect all the shapes from representative elements.
     std::vector<TensorShapeProto> shapes;
     shapes.reserve(shape_sig_to_inputs.size());
     for (const auto& el : shape_sig_to_inputs)
@@ -936,6 +961,7 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
 
   bool IsSupported(const NodeDef* node) const override {
     if (!IsBinaryAssociative(*node)) return false;
+    if (IsAlreadyOptimized(*node)) return false;
 
     // has a symbolically defined shape with broadcastable inputs
     OpInfo::TensorProperties properties;
@@ -955,33 +981,29 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
 
   // Check if a node can be absorbed by current OptimizedNodesGroup
   bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
-                                         const string& input) const override {
-    NodeDef* node;
-    Status node_status = GetInputNode(input, &node);
-    if (!node_status.ok()) return false;
-
-    if (!IsSameOp(group, *node)) {
+                                         const NodeDef& node) const override {
+    if (!IsSameOp(group, node)) {
       return false;
     }
-    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
+    if (IsInPreserveSet(node) || IsAlreadyOptimized(node)) {
       return false;
     }
-    if (IsDrivenByControlDependency(*node) || DrivesControlDependency(*node)) {
+    if (IsDrivenByControlDependency(node) || DrivesControlDependency(node)) {
       return false;
     }
-    if (!IsOnTheSameDevice(group, *node)) {
+    if (!IsOnTheSameDevice(group, node)) {
       return false;
     }
     // Optimized nodes updated in place, and that would break the graph, if the
     // node has multiple output consumers
-    if (NumNonControlOutputs(*node, *ctx_.node_map) != 1) {
+    if (NumNonControlOutputs(node, *ctx_.node_map) != 1) {
       return false;
     }
     // All input shapes must be broadcastable to the node shape
     OpInfo::TensorProperties properties;
-    Status has_properties = GetTensorProperties(input, &properties);
+    Status has_properties = GetTensorProperties(node.name(), &properties);
     return has_properties.ok() &&
-           HasAllInputsBroadcastableToShape(*node, properties);
+           HasAllInputsBroadcastableToShape(node, properties);
   }
 
   std::size_t CountUniqueShapes(const std::vector<InputAndShape>& inputs) {
@@ -993,7 +1015,15 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
   }
 
   string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    VLOG(2) << "Minimize broadcast: root=" << group.root_node->name()
+            << " op=" << group.root_node->op()
+            << " num_optimized_nodes=" << group.optimized_nodes.size();
+
+    // Do not optimize any of the nodes that are part of this group.
+    AddAllMembersToOptimizedNodes(group);
+
     if (CountUniqueShapes(group.inputs) <= 1) {
+      VLOG(3) << "Skip min-bcast group with single unique shape";
       // nothing to optimize when all shapes are the same
       return group.root_node->name();
     }
@@ -1033,8 +1063,8 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
       NodeDef* node;
       if (!optimized_nodes.empty()) {
         // re-purpose optimized nodes to build a new tree
-        node = optimized_nodes.front();
-        optimized_nodes.pop_front();
+        node = optimized_nodes.back();
+        optimized_nodes.pop_back();
       } else {
         // or use root node if none optimized nodes left
         node = group.root_node;
@@ -1101,9 +1131,6 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
       AddToOptimizationQueue(node);
     }
 
-    // Do not add updated node to any other group
-    AddToOptimizedNodes(node);
-
     TensorShapeProto shape;  // shape is not important at this point
     return InputAndShape(node->name(), shape);
   }
@@ -1969,8 +1996,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   if (options_.remove_negation)
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
 
-  VLOG(1) << "Simplify arithmetic ops using " << pipeline.NumStages()
-          << " arithmetic optimization stages";
+  VLOG(1) << "Run " << pipeline.NumStages() << " arithmetic optimizer stages: "
+          << str_util::Join(pipeline.StageNames(), ", ");
 
   while (!nodes_to_simplify.Empty()) {
     NodeDef* node = nodes_to_simplify.PopBack();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index e639812858..cb1f2ea732 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -105,6 +105,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.remove_identity_transpose = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
+    options.remove_negation = false;
     optimizer->options_ = options;
   }
 
@@ -2069,20 +2070,20 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
   //  a   b c   D          a   b
   NodeMap node_map(&output);
 
-  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  const NodeDef* mul1_node = node_map.GetNode("mul2");
   ASSERT_NE(mul1_node, nullptr);
   EXPECT_EQ("a", mul1_node->input(0));
   EXPECT_EQ("b", mul1_node->input(1));
 
-  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  const NodeDef* mul2_node = node_map.GetNode("mul1");
   ASSERT_NE(mul2_node, nullptr);
-  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("mul2", mul2_node->input(0));
   EXPECT_EQ("c", mul2_node->input(1));
 
   const NodeDef* mul3_node = node_map.GetNode("mul3");
   ASSERT_NE(mul3_node, nullptr);
   EXPECT_EQ("D", mul3_node->input(0));
-  EXPECT_EQ("mul2", mul3_node->input(1));
+  EXPECT_EQ("mul1", mul3_node->input(1));
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 072f772946..ed398525f3 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -239,6 +239,14 @@ class GraphOptimizerStagePipeline {
 
   std::size_t NumStages() { return stages_.size(); }
 
+  std::vector<string> StageNames() {
+    std::vector<string> names;
+    for (const auto& stage : stages_) {
+      names.push_back(stage->stage_name());
+    }
+    return names;
+  }
+
  private:
   std::vector<std::unique_ptr<GraphOptimizerStage<Result>>> stages_;
   std::function<bool(const Result&)> break_predicate_;
-- 
GitLab


From 7e0a12d669319f55fbf0351f5800787f32e3cb1a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 14 Apr 2018 02:15:58 -0700
Subject: [PATCH 0824/1262] Style nit: avoid creating local variables when not
 required.

PiperOrigin-RevId: 192876802
---
 tensorflow/python/ops/template.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 0294ecee54..9b6b8c508f 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -452,8 +452,7 @@ class Template(checkpointable.CheckpointableBase):
       # Only reuse variables if they were already created.
       with variable_scope.variable_scope(
           self._variable_scope, reuse=self._variables_created):
-        result = self._call_func(args, kwargs)
-      return result
+        return self._call_func(args, kwargs)
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
@@ -461,8 +460,7 @@ class Template(checkpointable.CheckpointableBase):
           self._unique_name, self._name,
           custom_getter=self._custom_getter) as vs:
         self._variable_scope = vs
-        result = self._call_func(args, kwargs)
-        return result
+        return self._call_func(args, kwargs)
 
   @property
   def name(self):
@@ -730,8 +728,7 @@ class EagerTemplate(Template):
             self._variable_scope, reuse=variable_scope.AUTO_REUSE)
       with self._variable_scope_context_manager:
         with self._template_store.as_default():
-          result = self._call_func(args, kwargs)
-      return result
+          return self._call_func(args, kwargs)
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
@@ -743,8 +740,7 @@ class EagerTemplate(Template):
         # store's variable scope name is unset; set it here.
         self._template_store.set_variable_scope_name(vs.name)
         with self._template_store.as_default():
-          result = self._call_func(args, kwargs)
-        return result
+          return self._call_func(args, kwargs)
 
   @property
   def name(self):
-- 
GitLab


From 708e640f67b3f8298aad27e4e106eb8fa9f9dc60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Sat, 14 Apr 2018 13:39:09 +0800
Subject: [PATCH 0825/1262] CLN: hide ApplyAdaMax op

---
 tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt    | 1 +
 .../core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt   | 1 +
 tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt  | 4 ++++
 .../core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt | 4 ++++
 4 files changed, 10 insertions(+)
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
index 5e705c009c..145d05de59 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
   in_arg {
     name: "var"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
index ad99b78af1..a3f2188ba5 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
   in_arg {
     name: "var"
     description: <<END
diff --git a/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..e49a355b81
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000..ca679e6889
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
+}
-- 
GitLab


From d17de3d27f02d76d5771e0216c489c6718c78323 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 14 Apr 2018 09:58:38 -0700
Subject: [PATCH 0826/1262] Add shape validation for tag input of StatsDataset
 (#18462)

* Add shape validation for tag input of StatsDataset

The tag field of the StatsDataset needs to be a scalar.
However, there was no check in the shape function. This
fix adds the check of the tag shape.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for tag shape check with BytesProducedStatsDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add tag shape check for LatencyStatsDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for tag shape check with LatencyStatsDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../kernel_tests/stats_dataset_ops_test.py       | 16 ++++++++++++++++
 tensorflow/core/ops/dataset_ops.cc               | 12 ++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 07bdf92044..c3a7f291c5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -218,6 +218,14 @@ class StatsDatasetSerializationTest(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
 
+  def test_bytes_produced_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.bytes_produced_stats(["bytes_produced"])),
+          None, 100)
+
   def testBytesStatsDatasetSaveableCore(self):
     num_outputs = 100
     self.run_core_tests(
@@ -235,6 +243,14 @@ class StatsDatasetSerializationTest(
     return dataset_ops.Dataset.range(num_elements).apply(
         stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
 
+  def test_latency_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.latency_stats(["record_latency", "record_latency_2"])),
+          None, 100)
+
   def testLatencyStatsDatasetSaveableCore(self):
     num_outputs = 100
 
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index b25abbcc67..8a7185e005 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -141,7 +141,11 @@ REGISTER_OP("BytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("LatencyStatsDataset")
     .Input("input_dataset: variant")
@@ -149,7 +153,11 @@ REGISTER_OP("LatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
-- 
GitLab


From ea9434dbd2668b3089d64e26ba6586aea1d78b33 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sun, 15 Apr 2018 10:26:01 +0900
Subject: [PATCH 0827/1262] move eye to linalg_ops_impl

---
 tensorflow/python/BUILD                     | 19 +++++-
 tensorflow/python/ops/init_ops.py           |  9 +--
 tensorflow/python/ops/linalg/linalg.py      |  1 +
 tensorflow/python/ops/linalg/linalg_impl.py |  3 +-
 tensorflow/python/ops/linalg_ops.py         | 36 ++--------
 tensorflow/python/ops/linalg_ops_impl.py    | 73 +++++++++++++++++++++
 6 files changed, 104 insertions(+), 37 deletions(-)
 create mode 100644 tensorflow/python/ops/linalg_ops_impl.py

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 0cd3f27140..1225786812 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1934,7 +1934,8 @@ py_library(
         ":array_ops",
         ":constant_op",
         ":dtypes",
-        ":linalg_ops",
+        ":linalg_ops_gen",
+        ":linalg_ops_impl",
         ":math_ops",
         ":nn_ops",
         ":random_ops",
@@ -1971,7 +1972,6 @@ py_library(
         ":array_ops",
         ":control_flow_ops",
         ":framework_for_generated_wrappers",
-        ":functional_ops",
         ":linalg_ops",
         ":math_ops",
         "//tensorflow/python/ops/linalg:linalg_impl",
@@ -1986,7 +1986,22 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
+        ":functional_ops",
         ":linalg_ops_gen",
+        ":linalg_ops_impl",
+        ":math_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "linalg_ops_impl",
+    srcs = ["ops/linalg_ops_impl.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":dtypes",
+        ":framework_ops",
         ":math_ops",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 9dfe5ffbf4..366a72c972 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -39,7 +39,8 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import random_ops
@@ -529,7 +530,7 @@ class Orthogonal(Initializer):
     # Generate a random matrix
     a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, r = linalg_ops.qr(a, full_matrices=False)
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
     q *= math_ops.sign(d)
@@ -578,7 +579,7 @@ class ConvolutionDeltaOrthogonal(Initializer):
     a = random_ops.random_normal([shape[-1], shape[-1]],
                                  dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, r = linalg_ops.qr(a, full_matrices=False)
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
     # ph = d / math_ops.abs(d)
@@ -623,7 +624,7 @@ class Identity(Initializer):
           "Identity matrix initializer can only be used for 2D matrices.")
     if dtype is None:
       dtype = self.dtype
-    initializer = linalg_ops.eye(*full_shape, dtype=dtype)
+    initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
     if partition_info is not None:
       initializer = array_ops.slice(initializer, partition_info.var_offset,
                                     shape)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index 14319025ff..7e9c3cde18 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -39,6 +39,7 @@ del ops
 del array_ops
 del gen_linalg_ops
 del linalg_ops
+del linalg_ops_impl
 del math_ops
 del special_math_ops
 del tf_export
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 8343c62816..6b1a046c06 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -22,6 +22,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -40,7 +41,7 @@ eigvalsh = linalg_ops.self_adjoint_eigvals
 einsum = special_math_ops.einsum
 expm = gen_linalg_ops.matrix_exponential
 tf_export('linalg.expm')(expm)
-eye = linalg_ops.eye
+eye = linalg_ops_impl.eye
 inv = linalg_ops.matrix_inverse
 logm = gen_linalg_ops.matrix_logarithm
 tf_export('linalg.logm')(logm)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 50706e5781..805fbd99ef 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_linalg_ops import *
@@ -160,36 +161,11 @@ def eye(num_rows,
   Returns:
     A `Tensor` of shape `batch_shape + [num_rows, num_columns]`
   """
-  with ops.name_scope(
-      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
-    is_square = num_columns is None
-    batch_shape = [] if batch_shape is None else batch_shape
-    num_columns = num_rows if num_columns is None else num_columns
-    if isinstance(num_rows, ops.Tensor) or isinstance(
-        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
-      batch_shape = ops.convert_to_tensor(
-          batch_shape, name='shape', dtype=dtypes.int32)
-      diag_size = math_ops.minimum(num_rows, num_columns)
-      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
-      if not is_square:
-        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
-    else:
-      if not isinstance(num_rows, compat.integral_types) or not isinstance(
-          num_columns, compat.integral_types):
-        raise TypeError(
-            'num_rows and num_columns must be positive integer values.')
-      batch_shape = [dim for dim in batch_shape]
-      is_square = num_rows == num_columns
-      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
-      if not is_square:
-        shape = batch_shape + [num_rows, num_columns]
-
-    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
-    if is_square:
-      return array_ops.matrix_diag(diag_ones)
-    else:
-      zero_matrix = array_ops.zeros(shape, dtype=dtype)
-      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
+  return linalg_ops_impl.eye(num_rows,
+                             num_columns=num_columns,
+                             batch_shape=batch_shape,
+                             dtype=dtype,
+                             name=name)
 
 
 @tf_export('matrix_solve_ls', 'linalg.lstsq')
diff --git a/tensorflow/python/ops/linalg_ops_impl.py b/tensorflow/python/ops/linalg_ops_impl.py
new file mode 100644
index 0000000000..9263b95336
--- /dev/null
+++ b/tensorflow/python/ops/linalg_ops_impl.py
@@ -0,0 +1,73 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for linear algebra."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import compat
+
+# Names below are lower_case.
+# pylint: disable=invalid-name
+
+
+def eye(num_rows,
+        num_columns=None,
+        batch_shape=None,
+        dtype=dtypes.float32,
+        name=None):
+  """Construct an identity matrix, or a batch of matrices.
+
+  See `linalg_ops.eye`.
+  """
+  with ops.name_scope(
+      name, default_name='eye', values=[num_rows, num_columns, batch_shape]):
+    is_square = num_columns is None
+    batch_shape = [] if batch_shape is None else batch_shape
+    num_columns = num_rows if num_columns is None else num_columns
+    if isinstance(num_rows, ops.Tensor) or isinstance(
+        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
+      batch_shape = ops.convert_to_tensor(
+          batch_shape, name='shape', dtype=dtypes.int32)
+      diag_size = math_ops.minimum(num_rows, num_columns)
+      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
+      if not is_square:
+        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
+    else:
+      if not isinstance(num_rows, compat.integral_types) or not isinstance(
+          num_columns, compat.integral_types):
+        raise TypeError(
+            'num_rows and num_columns must be positive integer values.')
+      batch_shape = [dim for dim in batch_shape]
+      is_square = num_rows == num_columns
+      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
+      if not is_square:
+        shape = batch_shape + [num_rows, num_columns]
+
+    diag_ones = array_ops.ones(diag_shape, dtype=dtype)
+    if is_square:
+      return array_ops.matrix_diag(diag_ones)
+    else:
+      zero_matrix = array_ops.zeros(shape, dtype=dtype)
+      return array_ops.matrix_set_diag(zero_matrix, diag_ones)
+
+# pylint: enable=invalid-name,redefined-builtin
-- 
GitLab


From 73bfc3234e0864cc7074d7fc7e680a4e7deeade0 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Sun, 15 Apr 2018 11:38:00 +0900
Subject: [PATCH 0828/1262] revert unneeded change

---
 tensorflow/python/ops/linalg/linalg_impl.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 6b1a046c06..8343c62816 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -22,7 +22,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -41,7 +40,7 @@ eigvalsh = linalg_ops.self_adjoint_eigvals
 einsum = special_math_ops.einsum
 expm = gen_linalg_ops.matrix_exponential
 tf_export('linalg.expm')(expm)
-eye = linalg_ops_impl.eye
+eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
 logm = gen_linalg_ops.matrix_logarithm
 tf_export('linalg.logm')(logm)
-- 
GitLab


From e49733b99ed9bedda46b32910cbd2183f12a4fe3 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 15 Apr 2018 13:23:47 -0700
Subject: [PATCH 0829/1262] Update sqlite version for cmake build (#18524)

The sqlite has been updated in bazel, though
cmake version was not updated. This fix updates
sqlite in cmake so that cmake and bazel versions
are synced.

The fix has been tested on Linux:
```
tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cmake/external/sqlite.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 57c4ae7651..7f835d2d51 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(sqlite_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/sqlite)
-set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
-set(sqlite_HASH SHA256=208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4)
+set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip)
+set(sqlite_HASH SHA256=4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc)
 set(sqlite_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sqlite/src/sqlite)
 set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
 
-- 
GitLab


From e1ea51146d6c2c3b579b84941ca5b05ce1a4fa8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 15 Apr 2018 17:35:39 -0700
Subject: [PATCH 0830/1262] Minor cleanup.

PiperOrigin-RevId: 192971080
---
 tensorflow/contrib/autograph/converters/asserts.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/asserts.py b/tensorflow/contrib/autograph/converters/asserts.py
index f011a97ade..2d9e2c58e3 100644
--- a/tensorflow/contrib/autograph/converters/asserts.py
+++ b/tensorflow/contrib/autograph/converters/asserts.py
@@ -27,8 +27,6 @@ from tensorflow.contrib.autograph.pyct import transformer
 class AssertsTransformer(transformer.Base):
   """Transforms Print nodes to Call so they can be handled as functions."""
 
-  # pylint:disable=invalid-name
-
   def visit_Assert(self, node):
     self.generic_visit(node)
 
@@ -44,9 +42,7 @@ class AssertsTransformer(transformer.Base):
     elif isinstance(node.msg, gast.Str):
       return templates.replace(template, test=node.test, msg=node.msg)
     else:
-      raise NotImplementedError('Can only convert string messages for now.')
-
-  # pylint:enable=invalid-name
+      raise NotImplementedError('can only convert string messages for now.')
 
 
 def transform(node, context):
-- 
GitLab


From 6764c6bb2a4b9efd75204e5aeb857c8d0ad00130 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Mon, 16 Apr 2018 11:10:17 +0900
Subject: [PATCH 0831/1262] Fix typo (#18416)

* fix typo

* fix typo

* fix typo

* fix typo

* fix typo

* fix typo

* fix typo

* Improve comment
---
 tensorflow/contrib/lite/toco/model.h   | 6 +++---
 tensorflow/core/framework/collective.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 1c4c96ae70..787c20e574 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -151,9 +151,9 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that that does not by itself tell whether the values in the array are
-// real (are literally interpreted as real numbers) or quantized (only acquire
-// a meaning as real numbers in conjunction with QuantizationParams).
+// Note that the type does not by itself tell whether the values in the array
+// are real (are literally interpreted as real numbers) or quantized (only
+// acquire a meaning as real numbers in conjunction with QuantizationParams).
 //
 // In practice though:
 //   float values are always real
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 5810c7fa54..a82fb50d88 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -178,7 +178,7 @@ class StepSequenceInterface {
   virtual void RefreshStepIdSequenceAsync(int64 graph_key,
                                           const StatusCallback& done) = 0;
 
-  // Returns the the step_id that should be used for initiating a new execution
+  // Returns the step_id that should be used for initiating a new execution
   // on the specified graph. May return the same step_id multiple times if
   // RetireStepId or RefreshStepIdReservation is not called.
   virtual int64 NextStepId(int64 graph_key) = 0;
-- 
GitLab


From 64b8af0a0859f1729e66649b5f84da508566d09a Mon Sep 17 00:00:00 2001
From: Elson Rodriguez <elson.rodriguez@gmail.com>
Date: Sun, 15 Apr 2018 19:10:50 -0700
Subject: [PATCH 0832/1262] Improving S3 documentation. (#18406)

* Improving S3 documentation.

Added a copy-pastable guide on the variables, and also provided usable examples that give immediate feedback.

* Updating docs based on feedback.

Added back old configuration section, moved s3 implementations to bottom of document.

* Rearranged documentation before example, renamed sections to be more clear.
---
 tensorflow/docs_src/deploy/s3.md | 81 ++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 14 deletions(-)

diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index 38f8428634..ef3b030e32 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -1,22 +1,13 @@
 # How to run TensorFlow on S3
 
-This document describes how to run TensorFlow on S3 file system.
+Tensorflow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitious, and can help in situations where data must accessed by multiple actors, such as in distributed training.
 
-## S3
+This document guides you through the required setup, and provides examples on usage.
 
-We assume that you are familiar with @{$reading_data$reading data}.
-
-To use S3 with TensorFlow, change the file paths you use to read and write
-data to an S3 path. For example:
-
-```python
-filenames = ["s3://bucketname/path/to/file1.tfrecord",
-             "s3://bucketname/path/to/file2.tfrecord"]
-dataset = tf.data.TFRecordDataset(filenames)
-```
+## Configuration
 
 When reading or writing data on S3 with your TensorFlow program, the behavior
-could be controlled by various environmental variables:
+can be controlled by various environmental variables:
 
 *   **AWS_REGION**: By default, regional endpoint is used for S3, with region
     controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
@@ -28,7 +19,7 @@ could be controlled by various environmental variables:
 *   **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled
     with `S3_VERIFY_SSL=0`.
 
-To read or write objects in a bucket that is no publicly accessible,
+To read or write objects in a bucket that is not publicly accessible,
 AWS credentials must be provided through one of the following methods:
 
 *   Set credentials in the AWS credentials profile file on the local system,
@@ -38,3 +29,65 @@ AWS credentials must be provided through one of the following methods:
     variables.
 *   If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
     give the EC2 instance access to that role.
+
+## Example Setup
+
+Using the above information, we can configure Tensorflow to communicate to an S3 endpoint by setting the following environment variables:
+
+```bash
+AWS_ACCESS_KEY_ID=XXXXX                 # Credentials only needed if connecting to a private endpoint
+AWS_SECRET_ACCESS_KEY=XXXXX
+AWS_REGION=us-east-1                    # Region for the S3 bucket, this is not always needed. Default is us-east-1.
+S3_ENDPOINT=s3.us-east-1.amazonaws.com  # The S3 API Endpoint to connect to. This is specified in a HOST:PORT format.
+S3_USE_HTTPS=1                          # Whether or not to use HTTPS. Disable with 0.
+S3_VERIFY_SSL=1                         # If HTTPS is used, conterols if SSL should be enabled. Disable with 0.
+```
+
+## Usage
+
+Once setup is completed, Tensorflow can interact with S3 in a variety of ways. Anywhere there is a Tensorflow IO function, an S3 URL can be used.
+
+### Smoke Test
+
+To test your setup, stat a file:
+
+```python
+from tensorflow.python.lib.io import file_io
+print file_io.stat('s3://bucketname/path/')
+```
+
+You should see output similar to this:
+
+```console
+<tensorflow.python.pywrap_tensorflow_internal.FileStatistics; proxy of <Swig Object of type 'tensorflow::FileStatistics *' at 0x10c2171b0> >
+```
+
+### Reading Data
+
+When @{$reading_data$reading data}, change the file paths you use to read and write
+data to an S3 path. For example:
+
+```python
+filenames = ["s3://bucketname/path/to/file1.tfrecord",
+             "s3://bucketname/path/to/file2.tfrecord"]
+dataset = tf.data.TFRecordDataset(filenames)
+```
+
+### Tensorflow Tools
+
+Many Tensorflow tools, such as Tensorboard or model serving, can also take S3 URLS as arguments:
+
+```bash
+tensorboard --logdir s3://bucketname/path/to/model/
+tensorflow_model_server --port=9000 --model_name=model --model_base_path=s3://bucketname/path/to/model/export/
+```
+
+This enables an end to end workflow using S3 for all data needs.
+
+## S3 Endpoint Implementations
+
+S3 was invented by Amazon, but the S3 API has spread in popularity and has several implementations. The following implementations have passed basic compatibility tests:
+
+* [Amazon S3](https://aws.amazon.com/s3/)
+* [Google Storage](https://cloud.google.com/storage/docs/interoperability)
+* [Minio](https://www.minio.io/kubernetes.html)(Standalone mode only)
-- 
GitLab


From 54772bb9a4a44badf4a70d75f41426c51f47cf3e Mon Sep 17 00:00:00 2001
From: "David T.H. Kao" <dthkao@gmail.com>
Date: Sun, 15 Apr 2018 19:11:25 -0700
Subject: [PATCH 0833/1262] Expose Scaffold.default_local_init_op as a public
 static method. (#18398)

* Expose Scaffold.default_local_init_op as a public static method.

* update api

* Add a docstring.

* Add a returns section.
---
 tensorflow/python/estimator/estimator.py           |  2 +-
 tensorflow/python/training/monitored_session.py    | 14 ++++++++++++--
 .../api/golden/tensorflow.train.-scaffold.pbtxt    |  4 ++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 301a360636..8890f74243 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -637,7 +637,7 @@ class Estimator(object):
         # pylint: disable=protected-access
         local_init_op = (
             estimator_spec.scaffold.local_init_op or
-            monitored_session.Scaffold._default_local_init_op())
+            monitored_session.Scaffold.default_local_init_op())
         # pylint: enable=protected-access
 
         # Perform the export
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 4ce6f6d002..f584a009d9 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -202,7 +202,7 @@ class Scaffold(object):
     if self._local_init_op is None:
       self._local_init_op = Scaffold.get_or_default(
           'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
-          Scaffold._default_local_init_op)
+          Scaffold.default_local_init_op)
     if self._summary_op is None:
       self._summary_op = Scaffold.get_or_default('summary_op',
                                                  ops.GraphKeys.SUMMARY_OP,
@@ -267,7 +267,17 @@ class Scaffold(object):
     return op
 
   @staticmethod
-  def _default_local_init_op():
+  def default_local_init_op():
+    """Returns an op that groups the default local init ops.
+
+    This op is used during session initialization when a Scaffold is
+    initialized without specifying the local_init_op arg. It includes
+    `tf.local_variables_initializer`, `tf.tables_initializer`, and also
+    initializes local session resources.
+
+    Returns:
+      The default Scaffold local init op.
+    """
     return control_flow_ops.group(
         variables.local_variables_initializer(),
         lookup_ops.tables_initializer(),
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
index 62b956c5ef..38cc98b48e 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
@@ -38,6 +38,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "default_local_init_op"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "finalize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-- 
GitLab


From 0586c57292a7bd1a79b4a03270c0f1c32d02a4af Mon Sep 17 00:00:00 2001
From: Guillaume Klein <guillaumekln@users.noreply.github.com>
Date: Mon, 16 Apr 2018 04:21:29 +0200
Subject: [PATCH 0834/1262] Support passing layer instances to produce
 attentional hidden states (#14974)

* Support passing Layer instances to the AttentionWrapper.

* Use _compute_output_shape to get the attention layer depth

* compute_output_shape is now a public method

* Move new argument at the end
---
 .../kernel_tests/attention_wrapper_test.py    | 77 +++++++++++++++++--
 .../seq2seq/python/ops/attention_wrapper.py   | 35 ++++++++-
 2 files changed, 102 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index d508cf3f9d..84a7b45b5a 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -110,7 +111,12 @@ class AttentionWrapperTest(test.TestCase):
                          alignment_history=False,
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
+                         attention_layer=None,
                          name=''):
+    attention_layer_sizes = (
+        [attention_layer_size] if attention_layer_size is not None else None)
+    attention_layers = (
+        [attention_layer] if attention_layer is not None else None)
     self._testWithMaybeMultiAttention(
         is_multi=False,
         create_attention_mechanisms=[create_attention_mechanism],
@@ -119,7 +125,8 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depths=[attention_mechanism_depth],
         alignment_history=alignment_history,
         expected_final_alignment_history=expected_final_alignment_history,
-        attention_layer_sizes=[attention_layer_size],
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
         name=name)
 
   def _testWithMaybeMultiAttention(self,
@@ -131,6 +138,7 @@ class AttentionWrapperTest(test.TestCase):
                                    alignment_history=False,
                                    expected_final_alignment_history=None,
                                    attention_layer_sizes=None,
+                                   attention_layers=None,
                                    name=''):
     # Allow is_multi to be True with a single mechanism to enable test for
     # passing in a single mechanism in a list.
@@ -144,12 +152,18 @@ class AttentionWrapperTest(test.TestCase):
     encoder_output_depth = 10
     cell_depth = 9
 
-    if attention_layer_sizes is None:
-      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
-    else:
+    if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
       attention_depth = sum([attention_layer_size or encoder_output_depth
                              for attention_layer_size in attention_layer_sizes])
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth])[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
 
     decoder_inputs = array_ops.placeholder_with_default(
         np.random.randn(batch_size, decoder_max_time,
@@ -171,13 +185,20 @@ class AttentionWrapperTest(test.TestCase):
       with vs.variable_scope(
           'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
+        attention_layer_size = attention_layer_sizes
+        attention_layer = attention_layers
+        if not is_multi:
+          if attention_layer_size is not None:
+            attention_layer_size = attention_layer_size[0]
+          if attention_layer is not None:
+            attention_layer = attention_layer[0]
         cell = rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
             cell,
             attention_mechanisms if is_multi else attention_mechanisms[0],
-            attention_layer_size=(attention_layer_sizes if is_multi
-                                  else attention_layer_sizes[0]),
-            alignment_history=alignment_history)
+            attention_layer_size=attention_layer_size,
+            alignment_history=alignment_history,
+            attention_layer=attention_layer)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
         my_decoder = basic_decoder.BasicDecoder(
@@ -797,6 +818,48 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testMultiAttentionWithLayerInstances(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 7), dtype=dtype('float32'), mean=0.0011709079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=3.2000000000000002))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0038725811),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019329828)),
+        attention=ResultSummary(
+            shape=(5, 7), dtype=dtype('float32'), mean=0.001174294),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        True,
+        create_attention_mechanisms,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        attention_layers=[layers_core.Dense(3, use_bias=False),
+                          layers_core.Dense(4, use_bias=False)],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
   def testLuongMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index f0f143ddfc..9ba541ce23 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -1082,7 +1082,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                cell_input_fn=None,
                output_attention=True,
                initial_cell_state=None,
-               name=None):
+               name=None,
+               attention_layer=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1125,7 +1126,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         (default), use the context as attention at each time step. Otherwise,
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
-        attention_layer_size must be a list of the same length.
+        attention_layer_size must be a list of the same length. If
+        attention_layer is set, this must be None.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1145,12 +1147,19 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         does not match the batch size of `initial_cell_state`, proper
         behavior is not guaranteed.
       name: Name to use when creating ops.
+      attention_layer: A list of `tf.layers.Layer` instances or a
+        single `tf.layers.Layer` instance taking the context and cell output as
+        inputs to generate attention at each time step. If None (default), use
+        the context as attention at each time step. If attention_mechanism is a
+        list, attention_layer must be a list of the same length. If
+        attention_layers_size is set, this must be None.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
         is a list but `attention_layer_size` is not; or vice versa).
       ValueError: if `attention_layer_size` is not None, `attention_mechanism`
-        is a list, and its length does not match that of `attention_layer_size`.
+        is a list, and its length does not match that of `attention_layer_size`;
+        if `attention_layer_size` and `attention_layer` are set simultaneously.
     """
     super(AttentionWrapper, self).__init__(name=name)
     rnn_cell_impl.assert_like_rnncell("cell", cell)
@@ -1181,6 +1190,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
 
+    if attention_layer_size is not None and attention_layer is not None:
+      raise ValueError("Only one of attention_layer_size and attention_layer "
+                       "should be set")
+
     if attention_layer_size is not None:
       attention_layer_sizes = tuple(
           attention_layer_size
@@ -1199,6 +1212,22 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
               dtype=attention_mechanisms[i].dtype)
           for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
+    elif attention_layer is not None:
+      self._attention_layers = tuple(
+          attention_layer
+          if isinstance(attention_layer, (list, tuple))
+          else (attention_layer,))
+      if len(self._attention_layers) != len(attention_mechanisms):
+        raise ValueError(
+            "If provided, attention_layer must contain exactly one "
+            "layer per attention_mechanism, saw: %d vs %d"
+            % (len(self._attention_layers), len(attention_mechanisms)))
+      self._attention_layer_size = sum(
+          layer.compute_output_shape(
+              [None,
+               cell.output_size + mechanism.values.shape[-1].value])[-1].value
+          for layer, mechanism in zip(
+              self._attention_layers, attention_mechanisms))
     else:
       self._attention_layers = None
       self._attention_layer_size = sum(
-- 
GitLab


From ba1c53a5f2bb106e16ec7503dbd4d0db9ecc9799 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 15 Apr 2018 19:22:39 -0700
Subject: [PATCH 0835/1262] Add support for explicit broadcasting in TensorFlow
 (#15243)

* Add support for explicit broadcasting in TensorFlow

This fix tries to adds support for explicit broadcasting in TensorFlow,
as was suggested in 14509. This fix adds the op of tf.broadcast_to,
which is equivalent to the numpy.broadcast_to in numpy.

This fix fixes 14509.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Register BroadcastTo op in array_ops.cc

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Sanitize with clang-format -i

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test cases for tf.broadcast_to

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Sanitize bazel BUILD and python.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Split broadcast_to_ops_test from array_ops_test

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Support int64 shape

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Improve shape inference for broadcast_to

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add scalar input support for broadcast_to

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update API defs tensorflow/core/api_def/update_api_def.sh

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update API golden

```
bazel-bin/tensorflow/tools/api/tests/api_compatibility_test
           --update_goldens True
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update docstring for broadcast_to

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Enable GPU kernel for BroadcastTo

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Enable use_gpu=True for test cases

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Hiden the ops and export to tf.contrib.framework for now.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add the op to the _allowed_symbol in tf.contrib.framework

Otherwise the symbole will be hidden

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint sanity issue.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/framework/__init__.py      |   3 +-
 .../base_api/api_def_BroadcastTo.pbtxt        |  41 ++++
 .../python_api/api_def_BroadcastTo.pbtxt      |   4 +
 tensorflow/core/kernels/BUILD                 |   7 +
 tensorflow/core/kernels/broadcast_to_op.cc    |  91 ++++++++
 tensorflow/core/kernels/broadcast_to_op.h     | 220 ++++++++++++++++++
 .../core/kernels/broadcast_to_op_gpu.cu.cc    |  34 +++
 tensorflow/core/ops/array_ops.cc              |  52 +++++
 tensorflow/python/kernel_tests/BUILD          |  12 +
 .../kernel_tests/broadcast_to_ops_test.py     |  85 +++++++
 10 files changed, 548 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
 create mode 100644 tensorflow/core/kernels/broadcast_to_op.cc
 create mode 100644 tensorflow/core/kernels/broadcast_to_op.h
 create mode 100644 tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
 create mode 100644 tensorflow/python/kernel_tests/broadcast_to_ops_test.py

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index a52907f163..4a5ed0ab0f 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -116,10 +116,11 @@ from tensorflow.python.framework.smart_cond import smart_cond
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
+from tensorflow.python.ops.array_ops import broadcast_to
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest']
+_allowed_symbols = ['nest', 'broadcast_to']
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000..763760176a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "BroadcastTo"
+  in_arg {
+    name: "input"
+    description: <<END
+A Tensor to broadcast.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+An 1-D `int` Tensor. The shape of the desired output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor.
+END
+  }
+  summary: "Broadcast an array for a compatible shape."
+  description: <<END
+Broadcasting is the process of making arrays to have compatible shapes
+for arithmetic operations. Two shapes are compatible if for each
+dimension pair they are either equal or one of them is one. When trying
+to broadcast a Tensor to a shape, it starts with the trailing dimensions,
+and works its way forward.
+
+For example,
+```
+>>> x = tf.constant([1, 2, 3])
+>>> y = tf.broadcast_to(x, [3, 3])
+>>> sess.run(y)
+array([[1, 2, 3],
+       [1, 2, 3],
+       [1, 2, 3]], dtype=int32)
+```
+In the above example, the input Tensor with the shape of `[1, 3]`
+is broadcasted to output Tensor with shape of `[3, 3]`.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000..083eeced81
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BroadcastTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index d7b8178ce7..24131cb51e 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -617,6 +617,7 @@ cc_library(
         ":batch_space_ops",
         ":bcast_ops",
         ":bitcast_op",
+        ":broadcast_to_op",
         ":concat_op",
         ":constant_op",
         ":depth_space_ops",
@@ -668,6 +669,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "broadcast_to_op",
+    prefix = "broadcast_to_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "concat_op",
     prefix = "concat_op",
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
new file mode 100644
index 0000000000..2810925bbc
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BroadcastToOp : public OpKernel {
+ public:
+  explicit BroadcastToOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input_tensor = ctx->input(0);
+    const TensorShape& input_shape = input_tensor.shape();
+
+    const Tensor& shape_tensor = ctx->input(1);
+
+    TensorShape output_shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
+
+    const Device& d = ctx->eigen_device<Device>();
+    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape);
+  }
+};
+
+// As MakeShape is able to handle both DT_INT32 and DT_INT64,
+// no need to have TypeConstraint for `Tidx`
+#define REGISTER_KERNEL(type)                                           \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BroadcastTo").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      BroadcastToOp<CPUDevice, type>);
+
+TF_CALL_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+
+namespace functor {
+#define DECLARE_GPU_TEMPLATE(Type)                              \
+  template <>                                                   \
+  void BroadcastTo<GPUDevice, Type>::operator()(                \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
+      const TensorShape& output_shape, const Tensor& input,     \
+      const TensorShape& input_shape);                          \
+  extern template struct BroadcastTo<GPUDevice, Type>;
+
+TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
+#undef DECLARE_GPU_KERNEL
+}  // namespace functor
+
+#define REGISTER_KERNEL(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("BroadcastTo")            \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("shape"),      \
+                          BroadcastToOp<GPUDevice, type>);
+
+TF_CALL_GPU_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
new file mode 100644
index 0000000000..608e9b6ac9
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -0,0 +1,220 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+#define TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct BroadcastTo {
+  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
+                  const TensorShape &output_shape, const Tensor &input_tensor,
+                  const TensorShape &input_shape) {
+#define BROADCAST_SHAPE(broadcast, reshape, NDIMS, input_shape, output_shape) \
+  for (int i = 0; i < NDIMS; i++) {                                           \
+    OP_REQUIRES(ctx, (broadcast[i] % reshape[i] == 0),                        \
+                errors::InvalidArgument("invalid shape to broadcast from ",   \
+                                        input_shape.DebugString(), " to ",    \
+                                        output_shape.DebugString()));         \
+    broadcast[i] = broadcast[i] / reshape[i];                                 \
+  }
+
+    switch (output_shape.dims()) {
+      case 1: {
+        auto reshape = AsEigenDSizesWithPrefix<1>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<1>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 1, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 1>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 2: {
+        auto reshape = AsEigenDSizesWithPrefix<2>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<2>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 2, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 2>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 3: {
+        auto reshape = AsEigenDSizesWithPrefix<3>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<3>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 3, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 3>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 4: {
+        auto reshape = AsEigenDSizesWithPrefix<4>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<4>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 4, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 4>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 5: {
+        auto reshape = AsEigenDSizesWithPrefix<5>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<5>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 5, input_shape, output_shape);
+        auto output = output_tensor.tensor<T, 5>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 5: {
+            auto input = input_tensor.tensor<T, 5>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      default:
+        ctx->CtxFailure(errors::InvalidArgument(
+            "invalid shape to broadcast from ", input_shape.DebugString(),
+            " to ", output_shape.DebugString()));
+        break;
+    }
+  }
+
+ private:
+  template <int NDIMS>
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizesWithPrefix(
+      const TensorShape &shape) const {
+    Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+    for (int d = 0; d < NDIMS - shape.dims(); d++) {
+      dsizes[d] = 1;
+    }
+    for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
+      dsizes[d] = shape.dim_size(d - (NDIMS - shape.dims()));
+    }
+    return dsizes;
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
diff --git a/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
new file mode 100644
index 0000000000..6459571085
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define INSTANTIATE_GPU_KERNEL(Type) \
+  template class functor::BroadcastTo<GPUDevice, Type>;
+TF_CALL_GPU_ALL_TYPES(INSTANTIATE_GPU_KERNEL);
+#undef INSTANTIATE_GPU_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 2a8b9f9bee..88fc03826a 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -429,6 +429,58 @@ REGISTER_OP("UnravelIndex")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
 
+REGISTER_OP("BroadcastTo")
+    .Input("input: T")
+    .Input("shape: Tidx")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle in = c->input(0);
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+
+      if (!c->RankKnown(out)) {
+        // We have no information about the shape of the output.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+
+      if (!c->RankKnown(in)) {
+        // We have no information about the shape of the input,
+        // nothing to do here.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+      if (c->Rank(out) < c->Rank(in)) {
+        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
+                                       c->DebugString(in), " shape ",
+                                       c->DebugString(out));
+      }
+
+      int32 in_offset = c->Rank(out) - c->Rank(in);
+      for (int32 i = 0; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        if (c->ValueKnown(dim)) {
+          // The first in_offset dimensions for input will be expanded with 1,
+          // so no check needed.
+          if (i >= in_offset) {
+            DimensionHandle in_dim = c->Dim(in, i - in_offset);
+            if (c->ValueKnown(in_dim)) {
+              if (c->Value(dim) % c->Value(in_dim) != 0) {
+                return errors::InvalidArgument(
+                    "Cannot broadcast a tensor with shape ", c->DebugString(in),
+                    " shape ", c->DebugString(out));
+              }
+            }
+          }
+        }
+      }
+
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index e82d738f14..c277c56b8d 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1192,6 +1192,18 @@ cuda_py_test(
     shard_count = 10,
 )
 
+cuda_py_test(
+    name = "broadcast_to_ops_test",
+    size = "small",
+    srcs = ["broadcast_to_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 cuda_py_test(
     name = "inplace_ops_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
new file mode 100644
index 0000000000..6a1bd958ba
--- /dev/null
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -0,0 +1,85 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for broadcast_to ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class BroadcastToTest(test_util.TensorFlowTestCase):
+
+  def testBroadcastToBasic(self):
+    for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3], dtype=dtype)
+        v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToString(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([b"1", b"2", b"3"])
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToBool(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([True, False, True], dtype=np.bool)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShape(self):
+    for input_dim in range(1, 6):
+      for output_dim in range(input_dim, 6):
+        with self.test_session(use_gpu=True):
+          input_shape = [2] * input_dim
+          output_shape = [2] * output_dim
+          x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+          v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+          v_np = np.broadcast_to(x, output_shape)
+          self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToScalar(self):
+    with self.test_session(use_gpu=True):
+      x = np.array(1, dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShapeTypeAndInference(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3])
+        v_tf = array_ops.broadcast_to(
+            constant_op.constant(x),
+            constant_op.constant([3, 3], dtype=dtype))
+        shape = v_tf.get_shape().as_list()
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+        # check shape inference when shape input is constant
+        self.assertAllEqual(shape, v_np.shape)
+
+if __name__ == "__main__":
+  test_lib.main()
-- 
GitLab


From c6fdeaca7dd32c6bec3ff2df14889c3f2c129f14 Mon Sep 17 00:00:00 2001
From: "Siu Kei, Muk" <muksiukei@gmail.com>
Date: Mon, 16 Apr 2018 10:23:20 +0800
Subject: [PATCH 0836/1262] =?UTF-8?q?adding=20ps=5Fstrategy=20to=20run=5Fc?=
 =?UTF-8?q?onfig=20to=20enable=20different=20placement=20strate=E2=80=A6?=
 =?UTF-8?q?=20(#15640)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* adding ps_strategy to run_config to enable different placement strategy in estimator

* 1. Moved estimator._device_fn to RunConfig as @property
2. Made RunConfig.device_fn to return custom device function if one is specified, otherwise the result from `tf.train.replica_device_setter` call is used
3. Added some basic unit tests, may need further tests.

* 1. Removing ps_strategy.
2. Modified estimator to take overriden device_fn from  if set.
3. Removed ps_strategy related unit tests.

* Adding manual initialization of _device_fn in legacy RunConfig class

* Updated estimator golden API through
1. bazel build //tensorflow/tools/api/tests:api_compatibility_test
2. bazel-bin/tensorflow/tools/api/tests/api_compatibility_test --update_goldens True

* fixing code styles
---
 .../python/learn/estimators/run_config.py     |  1 +
 tensorflow/python/estimator/estimator.py      |  3 +-
 tensorflow/python/estimator/run_config.py     | 33 ++++++++++++++++---
 .../python/estimator/run_config_test.py       | 24 +++++++++++---
 .../tensorflow.estimator.-run-config.pbtxt    |  6 +++-
 5 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 8c85c431be..14ee2ba609 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -299,6 +299,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     # so instead of breaking compatibility with that assumption, we
     # just manually initialize this field:
     self._train_distribute = None
+    self._device_fn = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 8890f74243..901f04719f 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -216,7 +216,8 @@ class Estimator(object):
     else:
       self._session_config = self._config.session_config
 
-    self._device_fn = _get_replica_device_setter(self._config)
+    self._device_fn = self._config.device_fn or \
+                      _get_replica_device_setter(self._config)
 
     if model_fn is None:
       raise ValueError('model_fn must be provided to Estimator.')
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index dab442aeda..8162b249f1 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,11 +27,13 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.estimator import util
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util.tf_export import tf_export
 
 
 _USE_DEFAULT = object()
+_VALID_DEVICE_FN_ARGS = set(['op'])
 
 # A list of the property names in RunConfig that the user is allowed to change.
 _DEFAULT_REPLACEABLE_LIST = [
@@ -44,7 +46,8 @@ _DEFAULT_REPLACEABLE_LIST = [
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
     'log_step_count_steps',
-    'train_distribute'
+    'train_distribute',
+    'device_fn'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -279,6 +282,11 @@ def _validate_properties(run_config):
   _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
             message='tf_random_seed must be integer.')
 
+  _validate('device_fn', lambda device_fn: six.callable(device_fn) and
+            set(util.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
+            message='device_fn must be callable with exactly'
+                    ' one argument "op".')
+
 
 class TaskType(object):
   MASTER = 'master'
@@ -302,7 +310,8 @@ class RunConfig(object):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                log_step_count_steps=100,
-               train_distribute=None):
+               train_distribute=None,
+               device_fn=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -430,6 +439,10 @@ class RunConfig(object):
         `tf.contrib.distribute.DistributionStrategy`. If specified,
         then Estimator will distribute the user's model during training,
         according to the policy specified by that strategy.
+      device_fn: A callable invoked for every `Operation` that takes the
+        `Operation` and returns the device string. If `None`, defaults to
+        the device function returned by `tf.train.replica_device_setter`
+        with round-robin strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -466,7 +479,8 @@ class RunConfig(object):
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
         log_step_count_steps=log_step_count_steps,
-        train_distribute=train_distribute)
+        train_distribute=train_distribute,
+        device_fn=device_fn)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -568,6 +582,16 @@ class RunConfig(object):
   def cluster_spec(self):
     return self._cluster_spec
 
+  @property
+  def device_fn(self):
+    """Returns the device_fn.
+
+    If device_fn is not `None`, it overrides the default
+    device function used in `Estimator`.
+    Otherwise the default one is used.
+    """
+    return self._device_fn
+
   @property
   def evaluation_master(self):
     return self._evaluation_master
@@ -697,7 +721,8 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
-      - `train_distribute`.
+      - `train_distribute`,
+      - `device_fn`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index a3eef4c53f..c8b12605e1 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -42,6 +42,7 @@ _SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
 _KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
 _KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
 _TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
+_DEVICE_FN_ERR = 'device_fn must be callable with exactly one argument "op".'
 _ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.'
 _ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.'
 _INVALID_TASK_TYPE_FOR_EVAL_MASTER = (
@@ -83,6 +84,7 @@ class RunConfigTest(test.TestCase):
     self.assertEqual(5, config.keep_checkpoint_max)
     self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
     self.assertIsNone(config.service)
+    self.assertIsNone(config.device_fn)
 
   def test_model_dir(self):
     empty_config = run_config_lib.RunConfig()
@@ -93,6 +95,7 @@ class RunConfigTest(test.TestCase):
 
   def test_replace_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig().replace(
         tf_random_seed=11,
@@ -100,13 +103,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_replace_none_value(self):
     config = run_config_lib.RunConfig().replace(
@@ -117,7 +122,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -126,6 +132,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_replace_with_disallowallowed_properties(self):
     config = run_config_lib.RunConfig()
@@ -166,9 +173,12 @@ class RunConfigTest(test.TestCase):
       config.replace(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       config.replace(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      config.replace(device_fn=lambda x, y: 0)
 
   def test_init_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig(
         tf_random_seed=11,
@@ -176,13 +186,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_init_none_value(self):
     config = run_config_lib.RunConfig(
@@ -193,7 +205,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -202,6 +215,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_init_invalid_values(self):
     with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
@@ -220,6 +234,8 @@ class RunConfigTest(test.TestCase):
       run_config_lib.RunConfig(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       run_config_lib.RunConfig(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      run_config_lib.RunConfig(device_fn=lambda x: "/cpu:0")
 
 
 class RunConfigDistributedSettingTest(test.TestCase):
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 05e603efb7..c8da55d802 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "cluster_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "device_fn"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
@@ -84,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\'], "
   }
   member_method {
     name: "replace"
-- 
GitLab


From 4ec3b601216a9727ebb78a764d34b487286629af Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 15 Apr 2018 19:23:53 -0700
Subject: [PATCH 0837/1262] Allow `~/` in path for transform_graph (#15894)

* Allow `~/` in path for transform_graph

This fix tries to address the issue raised in 13211 where
it was not possible to specify `~` (e.g., `~/`, `~user/`, etc)
for the path used in transform_graph. This fix adds
the support of `~` transform_graph on Linux.

This fix fixes 13211.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Reformat transform_graph.cc with clang-format

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../tools/graph_transforms/transform_graph.cc | 70 ++++++++++++++++---
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index 28387c2b48..8ce8f5e24b 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
+#if !defined(PLATFORM_WINDOWS)
+#include <pwd.h>
+#endif
 
 namespace tensorflow {
 namespace graph_transforms {
@@ -130,16 +133,64 @@ Status ParseTransformParameters(const string& transforms_string,
   return Status::OK();
 }
 
+std::string ExpandPath(const std::string& path_string) {
+#if defined(PLATFORM_WINDOWS)
+  return path_string;
+#else
+  if (path_string.empty() || path_string[0] != '~') {
+    return path_string;
+  }
+
+  const char* home = NULL;
+  std::string::size_type prefix = path_string.find_first_of('/');
+  if (path_string.length() == 1 || prefix == 1) {
+    // The value of $HOME, e.g., ~/foo
+    home = getenv("HOME");
+    if (!home) {
+      // If HOME is not available, get uid
+      struct passwd* pw = getpwuid(getuid());
+      if (pw) {
+        home = pw->pw_dir;
+      }
+    }
+  } else {
+    // The value of ~user, e.g., ~user/foo
+    std::string user(path_string, 1, (prefix == std::string::npos)
+                                         ? std::string::npos
+                                         : prefix - 1);
+    struct passwd* pw = getpwnam(user.c_str());
+    if (pw) {
+      home = pw->pw_dir;
+    }
+  }
+
+  if (!home) {
+    return path_string;
+  }
+
+  string path(home);
+  if (prefix == std::string::npos) {
+    return path;
+  }
+
+  if (path.length() == 0 || path[path.length() - 1] != '/') {
+    path += '/';
+  }
+  path += path_string.substr(prefix + 1);
+  return path;
+#endif
+}
+
 int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
-  string in_graph = "";
-  string out_graph = "";
+  string in_graph_string = "";
+  string out_graph_string = "";
   string inputs_string = "";
   string outputs_string = "";
   string transforms_string = "";
   bool output_as_text = false;
   std::vector<Flag> flag_list = {
-      Flag("in_graph", &in_graph, "input graph file name"),
-      Flag("out_graph", &out_graph, "output graph file name"),
+      Flag("in_graph", &in_graph_string, "input graph file name"),
+      Flag("out_graph", &out_graph_string, "output graph file name"),
       Flag("inputs", &inputs_string, "inputs"),
       Flag("outputs", &outputs_string, "outputs"),
       Flag("transforms", &transforms_string, "list of transforms"),
@@ -166,11 +217,11 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     LOG(ERROR) << "Unknown argument " << argv[1] << ".\n" << usage;
     return -1;
   }
-  if (in_graph.empty()) {
+  if (in_graph_string.empty()) {
     LOG(ERROR) << "in_graph graph can't be empty.\n" << usage;
     return -1;
   }
-  if (out_graph.empty()) {
+  if (out_graph_string.empty()) {
     LOG(ERROR) << "out_graph graph can't be empty.\n" << usage;
     return -1;
   }
@@ -179,6 +230,9 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     return -1;
   }
 
+  string in_graph = ExpandPath(in_graph_string);
+  string out_graph = ExpandPath(out_graph_string);
+
   std::vector<string> inputs = str_util::Split(inputs_string, ',');
   std::vector<string> outputs = str_util::Split(outputs_string, ',');
   TransformParameters transform_params;
@@ -197,7 +251,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
   GraphDef graph_def;
   Status load_status = LoadTextOrBinaryGraphFile(in_graph, &graph_def);
   if (!load_status.ok()) {
-    LOG(ERROR) << "Loading graph '" << in_graph << "' failed with "
+    LOG(ERROR) << "Loading graph '" << in_graph_string << "' failed with "
                << load_status.error_message();
     LOG(ERROR) << usage;
     return -1;
@@ -219,7 +273,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     save_status = WriteBinaryProto(Env::Default(), out_graph, graph_def);
   }
   if (!save_status.ok()) {
-    LOG(ERROR) << "Saving graph '" << out_graph << "' failed with "
+    LOG(ERROR) << "Saving graph '" << out_graph_string << "' failed with "
                << save_status.error_message();
     return -1;
   }
-- 
GitLab


From 47c79c228d91a6b065fc5275b0b696490f6684cd Mon Sep 17 00:00:00 2001
From: "freedom\" Koan-Sin Tan" <koansin.tan@gmail.com>
Date: Mon, 16 Apr 2018 10:24:14 +0800
Subject: [PATCH 0838/1262] export tflite::Intepreter's  UseNNAPI() and
 setNumThreads() to java (#16065)

* export UseNNAPI() and setNumThreads() to java

Export tflite::Intepreter's UseNNAPI() and SetNumThreads() to Java
and modify the Android TfLiteCameraDemo app to use them.

* change CheckedChangeListener accordingly

* add error checking to setNumThreads()
---
 .../Camera2BasicFragment.java                 | 23 +++++++++++
 .../tflitecamerademo/ImageClassifier.java     | 10 +++++
 .../res/layout/fragment_camera2_basic.xml     | 41 +++++++++++++++++--
 .../demo/app/src/main/res/values/strings.xml  |  2 +
 .../java/org/tensorflow/lite/Interpreter.java |  7 ++++
 .../lite/NativeInterpreterWrapper.java        |  6 +++
 .../native/nativeinterpreterwrapper_jni.cc    | 10 +++++
 .../native/nativeinterpreterwrapper_jni.h     | 12 +++++-
 8 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 300786c3ca..18f6465188 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -54,6 +54,9 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
+import android.widget.CompoundButton;
+import android.widget.NumberPicker;
+import android.widget.ToggleButton;
 import android.widget.TextView;
 import android.widget.Toast;
 import java.io.IOException;
@@ -82,6 +85,8 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
+  private ToggleButton toggle;
+  private NumberPicker np;
   private ImageClassifier classifier;
 
   /** Max preview width that is guaranteed by Camera2 API */
@@ -289,6 +294,24 @@ public class Camera2BasicFragment extends Fragment
   public void onViewCreated(final View view, Bundle savedInstanceState) {
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
+    toggle = (ToggleButton) view.findViewById(R.id.button);
+
+    toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+      public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
+        classifier.setUseNNAPI(isChecked);
+      }
+    });
+
+    np = (NumberPicker) view.findViewById(R.id.np);
+    np.setMinValue(1);
+    np.setMaxValue(10);
+    np.setWrapSelectorWheel(true);
+    np.setOnValueChangedListener(new NumberPicker.OnValueChangeListener() {
+      @Override
+      public void onValueChange(NumberPicker picker, int oldVal, int newVal){
+        classifier.setNumThreads(newVal);
+      }
+    });
   }
 
   /** Load the model and labels. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index c57bb348c5..d32c077910 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -142,6 +142,16 @@ public abstract class ImageClassifier {
     }
   }
 
+  public void setUseNNAPI(Boolean nnapi) {
+    if (tflite != null)
+        tflite.setUseNNAPI(nnapi);
+  }
+
+  public void setNumThreads(int num_threads) {
+    if (tflite != null)
+        tflite.setNumThreads(num_threads);
+  }
+
   /** Closes tflite to release resources. */
   public void close() {
     tflite.close();
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index 15305c436e..db557ad62f 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -22,24 +22,59 @@
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
         android:layout_alignParentTop="true" />
 
     <FrameLayout
         android:id="@+id/control"
         android:layout_width="match_parent"
-        android:layout_height="112dp"
+        android:layout_height="135dp"
         android:layout_alignParentBottom="true"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentEnd="true"
+        android:layout_alignParentRight="true"
+        android:layout_marginEnd="150dp"
+        android:layout_marginRight="150dp"
         android:background="@color/control_background">
 
-        <TextView android:id="@+id/text"
+        <TextView
+            android:id="@+id/text"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:paddingLeft="80dp"
+            android:paddingLeft="20dp"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
 
     </FrameLayout>
 
+    <RelativeLayout
+        android:id="@+id/control2"
+        android:layout_width="match_parent"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
+        android:background="@color/control_background">
+
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
+
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
index a08ec3eb62..29a033bcd4 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
@@ -21,4 +21,6 @@
     <string name="toggle_turn_on">NN:On</string>
     <string name="toggle_turn_off">NN:Off</string>
     <string name="toggle">Use NNAPI</string>
+    <string name="tflite">tflite</string>
+    <string name="nnapi">NNAPI</string>
 </resources>
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index a33959dca4..451a1cd248 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -212,6 +212,13 @@ public final class Interpreter implements AutoCloseable {
     }
   }
 
+  public void setNumThreads(int num_threads) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The interpreter has already been closed.");
+    }
+    wrapper.setNumThreads(num_threads);
+  }
+
   /** Release resources associated with the {@code Interpreter}. */
   @Override
   public void close() {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index fc8187acfe..61a552db23 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -153,6 +153,10 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
+  void setNumThreads(int num_threads) {
+    numThreads(interpreterHandle, num_threads);
+  }
+
   /** Gets index of an input given its name. */
   int getInputIndex(String name) {
     if (inputsIndexes == null) {
@@ -321,6 +325,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
+  private static native void numThreads(long interpreterHandle, int num_threads);
+
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 844226203b..4c33a2dba4 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -315,6 +315,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
   interpreter->UseNNAPI(static_cast<bool>(state));
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->SetNumThreads(static_cast<int>(num_threads));
+}
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
     JNIEnv* env, jclass clazz, jint size) {
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index 0e28a77fee..eaa765cb34 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -61,7 +61,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JZ)
+ *  Signature: (JZ)V
  */
 JNIEXPORT void JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
@@ -69,6 +69,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
                                                            jlong handle,
                                                            jboolean state);
 
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads);
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
-- 
GitLab


From 63c6562df68ade3a03481874a71b536a4e02b6f5 Mon Sep 17 00:00:00 2001
From: "Yuan (Terry) Tang" <terrytangyuan@gmail.com>
Date: Sun, 15 Apr 2018 22:30:34 -0400
Subject: [PATCH 0839/1262] Fix embedding_ops doc formatting (#18520)

* Fix embedding_ops doc formatting

* Add missing indentation
---
 tensorflow/python/ops/embedding_ops.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f0120f2957..9e46739bc1 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -331,11 +331,11 @@ def embedding_lookup_sparse(params,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
-    sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
+    sp_ids: N x M `SparseTensor` of int64 ids (typically from FeatureValueToId),
       where N is typically batch size and M is arbitrary.
-    sp_weights: either a SparseTensor of float / double weights, or None to
-      indicate all weights should be taken to be 1. If specified, sp_weights
-      must have exactly the same shape and indices as sp_ids.
+    sp_weights: either a `SparseTensor` of float / double weights, or `None` to
+      indicate all weights should be taken to be 1. If specified, `sp_weights`
+      must have exactly the same shape and indices as `sp_ids`.
     partition_strategy: A string specifying the partitioning strategy, relevant
       if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
       is `"mod"`. See `tf.nn.embedding_lookup` for more details.
@@ -351,39 +351,43 @@ def embedding_lookup_sparse(params,
 
   Returns:
     A dense tensor representing the combined embeddings for the
-    sparse ids. For each row in the dense tensor represented by sp_ids, the op
+    sparse ids. For each row in the dense tensor represented by `sp_ids`, the op
     looks up the embeddings for all ids in that row, multiplies them by the
     corresponding weight, and combines these embeddings as specified.
 
     In other words, if
 
-      shape(combined params) = [p0, p1, ..., pm]
+      `shape(combined params) = [p0, p1, ..., pm]`
 
     and
 
-      shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]
+      `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`
 
     then
 
-      shape(output) = [d0, d1, ..., dn-1, p1, ..., pm].
+      `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`.
 
     For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are
 
+      ```python
       [0, 0]: id 1, weight 2.0
       [0, 1]: id 3, weight 0.5
       [1, 0]: id 0, weight 1.0
       [2, 3]: id 1, weight 3.0
+      ```
 
     with `combiner`="mean", then the output will be a 3x20 matrix where
 
+      ```python
       output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
       output[1, :] = (params[0, :] * 1.0) / 1.0
       output[2, :] = (params[1, :] * 3.0) / 3.0
+      ```
 
   Raises:
-    TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
-      None nor SparseTensor.
-    ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
+    TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is 
+      neither `None` nor `SparseTensor`.
+    ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
   """
   if combiner is None:
     logging.warn("The default value of combiner will change from \"mean\" "
-- 
GitLab


From 9b747794ceb869105a144c965540a31791ce7fc3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 15 Apr 2018 21:52:17 -0700
Subject: [PATCH 0840/1262] Internal change

PiperOrigin-RevId: 192981122
---
 tensorflow/contrib/decision_trees/proto/BUILD | 5 ++++-
 tensorflow/contrib/tensorboard/BUILD          | 5 ++++-
 tensorflow/contrib/training/BUILD             | 9 ++++++---
 tensorflow/core/profiler/BUILD                | 1 +
 tensorflow/python/BUILD                       | 1 +
 5 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index 3b50a48336..d84b1006a2 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -1,4 +1,7 @@
-package(default_visibility = ["//visibility:public"])
+package(
+    allow_proto_library_in_java_rules = 0,
+    default_visibility = ["//visibility:public"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index 2b6a2b2f3c..1c6e41a57c 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -1,7 +1,10 @@
 # Description:
 # TensorBoard module containing volatile or experimental code.
 
-package(default_visibility = ["//tensorflow:internal"])
+package(
+    allow_proto_library_in_java_rules = 0,
+    default_visibility = ["//tensorflow:internal"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 4d2bfd3e43..310b6de0ee 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -5,9 +5,12 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-package(default_visibility = [
-    "//tensorflow:internal",
-])
+package(
+    allow_proto_library_in_java_rules = 0,
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow:tensorflow.bzl", "py_test")
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 3d3203cdaa..737692324b 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -1,4 +1,5 @@
 package(
+    allow_proto_library_in_java_rules = 0,
     default_visibility = ["//visibility:public"],
 )
 
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a683c8cfa6..edc3c80352 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -5,6 +5,7 @@
 #  ":platform" - Low-level and platform-specific Python code.
 
 package(
+    allow_proto_library_in_java_rules = 0,
     default_visibility = [
         "//engedu/ml/tf_from_scratch:__pkg__",
         "//tensorflow:internal",
-- 
GitLab


From 2308bbc4e09392aabb150d5c2df08a212ca61e6b Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Mon, 16 Apr 2018 14:53:17 +0900
Subject: [PATCH 0841/1262] fix typo

---
 .../opt/python/training/moving_average_optimizer_test.py      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index 85e3e8d379..ac04ad9911 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -85,7 +85,7 @@ class MovingAverageOptimizerTest(test.TestCase):
               state_ops.assign_add(ema_var1, [4.0, 4.0])
           ])
 
-          # Test taht saver with missing ema variables will fail.
+          # Test that saver with missing ema variables will fail.
           with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
             opt.swapping_saver(var_list=[var0])
 
@@ -123,7 +123,7 @@ class MovingAverageOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
             self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
             self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previou state.
+            # Restore back to previous state.
             train_saver.restore(sess, save_path)
 
           # If updates are parallel, this is not always true after the 1st step.
-- 
GitLab


From 4abef720d8800229e47a6a414b9378f95ea31218 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 16 Apr 2018 15:39:45 +0800
Subject: [PATCH 0842/1262] Fix the doc strings of nn.sampled_softmax_loss
 since it was deprecated by nn.sampled_softmax_loss_v2

---
 tensorflow/docs_src/performance/quantization.md | 2 +-
 tensorflow/python/keras/_impl/keras/backend.py  | 4 ++--
 tensorflow/python/ops/losses/losses_impl.py     | 2 +-
 tensorflow/python/ops/nn_impl.py                | 8 ++++----
 tensorflow/python/ops/nn_ops.py                 | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 411889cb1c..2fea02d861 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -110,7 +110,7 @@ we've added a separate rewrite for the *eval graph*:
 
 ```
 # Build eval model
-logits = tf.nn.softmax_cross_entropy_with_logits(...)
+logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
 
 # Call the eval rewrite which rewrites the graph in-place with
 # FakeQuantization nodes and fold batchnorm for eval.
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 81a4d2f820..449410fe08 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -3448,7 +3448,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   Returns:
       Output tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.softmax_cross_entropy_with_logits_v2
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
@@ -3512,7 +3512,7 @@ def binary_crossentropy(target, output, from_logits=False):
   Returns:
       A tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.sigmoid_cross_entropy_with_logits
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # transform back to logits
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 19a8eaf22c..93550dfac8 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -694,7 +694,7 @@ def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
+  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
   then the loss is simply scaled by the given value. If `weights` is a
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 1715e5b36a..d0d5ed07ce 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -987,7 +987,7 @@ def _compute_sampled_logits(weights,
         class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1012,7 +1012,7 @@ def _compute_sampled_logits(weights,
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
-        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+        `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
     out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
@@ -1285,7 +1285,7 @@ def sampled_softmax_loss(weights,
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
-    loss = tf.nn.softmax_cross_entropy_with_logits(
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
         labels=labels_one_hot,
         logits=logits)
   ```
@@ -1303,7 +1303,7 @@ def sampled_softmax_loss(weights,
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index bb454b3c3a..ea83ba7748 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1986,7 +1986,7 @@ def sparse_softmax_cross_entropy_with_logits(
   must provide a single specific index for the true class for each row of
   `logits` (each minibatch entry).  For soft softmax classification with
   a probability distribution for each entry, see
-  `softmax_cross_entropy_with_logits`.
+  `softmax_cross_entropy_with_logits_v2`.
 
   **WARNING:** This op expects unscaled logits, since it performs a `softmax`
   on `logits` internally for efficiency.  Do not call this op with the
-- 
GitLab


From 78b6592b9fe824d9634c762d628c7ebe6a6a2c46 Mon Sep 17 00:00:00 2001
From: "wenhao.hu" <wenhao.hu@leapmind.io>
Date: Mon, 16 Apr 2018 18:26:21 +0900
Subject: [PATCH 0843/1262] remove unneeded sealing API code

---
 tensorflow/python/ops/linalg/linalg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index 7e9c3cde18..14319025ff 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -39,7 +39,6 @@ del ops
 del array_ops
 del gen_linalg_ops
 del linalg_ops
-del linalg_ops_impl
 del math_ops
 del special_math_ops
 del tf_export
-- 
GitLab


From 581b56ad4dcedde4eb8f129153e993e8c44e199a Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Mon, 16 Apr 2018 22:08:02 +0800
Subject: [PATCH 0844/1262] Fix tf.argmax warnings on dimension parameter with
 axis

---
 .../contrib/losses/python/metric_learning/metric_loss_ops.py  | 4 ++--
 tensorflow/python/kernel_tests/random/multinomial_op_test.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index 2b9eee4ef7..de76acb51f 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -711,7 +711,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
       candidate_scores, margin_multiplier * nmi_scores)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   return candidate_ids[argmax_index]
 
@@ -811,7 +811,7 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
   candidate_scores = math_ops.add(scores_fac, margin_multiplier * scores_margin)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
   chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index a9dc7b7de0..051c7d86bf 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -46,7 +46,7 @@ def composed_sampler(logits, num_samples):
   logits = array_ops.expand_dims(logits, -1)
 
   # [batch size, num samples]
-  return math_ops.argmax(logits + noise, dimension=1)
+  return math_ops.argmax(logits + noise, axis=1)
 
 
 native_sampler = random_ops.multinomial
-- 
GitLab


From f610284f878b341423bde42afc90f917c337138c Mon Sep 17 00:00:00 2001
From: Harald Husum <harahu@stud.ntnu.no>
Date: Mon, 16 Apr 2018 16:21:23 +0200
Subject: [PATCH 0845/1262] Update pydoc for several tfdbg hooks (#18533)

For classes:
Specify that tfdbg.LocalCLIDebugHook can be used to debug instances
of tf.estimator.Estimator. Remove mentions of
tf.contrib.learn.Estimator and tf.contrib.learn.Experiment, as
they are deprecated.

For __init__ method of LocalCLIDebugHook:
Clarify purpose of ui_type argument.
---
 tensorflow/python/debug/wrappers/hooks.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 6705cd31e2..5e4604fda4 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -31,15 +31,18 @@ from tensorflow.python.training import session_run_hook
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s. Provides a substitute for
+  `tfdbg.LocalCLIDebugWrapperSession` in cases where the session is not directly
+  available.
   """
 
   def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
-      ui_type: (str) user-interface type.
+      ui_type: (`str`) requested user-interface type. Currently supported:
+        (curses | readline).
       dump_root: (`str`) optional path to the dump root directory. Must be a
         directory that does not exist or an empty directory. If the directory
         does not exist, it will be created by the debugger core during debug
@@ -153,8 +156,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
 class DumpingDebugHook(session_run_hook.SessionRunHook):
   """A debugger hook that dumps debug data to filesystem.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
@@ -229,8 +232,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
   When the arguments of debug_utils.watch_graph changes, strongly consider
   changing arguments here too so that features are available to tflearn users.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
-- 
GitLab


From 0b92d7b655e51e107393f9bd4022175fdbc16f1d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 15 Apr 2018 18:42:45 +0000
Subject: [PATCH 0846/1262] Update docs for tf.cast with supported types

This fix tries to address the issue raised in 18529 where
there are some confusion over supported types for `tf.cast`.
This fix updates the docs with explicitly supported numeric
type for `tf.cast`:
```
`uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`,
`float16`, `float32`, `float64`, `complex64`, `complex128`, `bfloat16`.
```

This fix fixes 18529.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index a38ecb2acb..c4ca8c40dc 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -762,8 +762,11 @@ def cast(x, dtype, name=None):
   ```
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
-    dtype: The destination type.
+    x: A `Tensor` or `SparseTensor` of numeric type. It could be
+      `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`,
+      `float16`, `float32`, `float64`, `complex64`, `complex128`, `bfloat16`.
+    dtype: The destination type. The list of supported dtypes is the same
+      as `x`.
     name: A name for the operation (optional).
 
   Returns:
-- 
GitLab


From a3cb195600b7acd1fce8f969d2a8f30d326bc918 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 15:00:51 +0000
Subject: [PATCH 0847/1262] Update docs for return values as well

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index c4ca8c40dc..72cd56d9c1 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -770,7 +770,8 @@ def cast(x, dtype, name=None):
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x`.
+    A `Tensor` or `SparseTensor` with same shape as `x` and
+      same type as `dtype`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `dtype`.
-- 
GitLab


From 670c21e2c2e122807625962ee5152b0e5f763fc6 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Mon, 16 Apr 2018 09:34:27 -0700
Subject: [PATCH 0848/1262] Update for ObjectMemoryBuffer.h rename in upstream
 LLVM.

This will require a version bump in workspace.bzl

PiperOrigin-RevId: 193052084
---
 tensorflow/compiler/aot/embedded_protocol_buffers.cc    | 1 -
 tensorflow/compiler/xla/service/cpu/compiler_functor.cc | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
index 6489929a57..0048eec93b 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <string>
 
 #include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 61b2da7a7d..6a7eb85e3b 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -25,11 +25,11 @@ limitations under the License.
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO.h"
@@ -158,7 +158,7 @@ std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
 
   // Construct ObjectFile from machine code buffer.
   return std::unique_ptr<llvm::MemoryBuffer>(
-      new llvm::ObjectMemoryBuffer(std::move(stream_buffer)));
+      new llvm::SmallVectorMemoryBuffer(std::move(stream_buffer)));
 }
 
 static std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl() {
-- 
GitLab


From 66af9322f2d3840d377f3e69769aacf6ba3b2d22 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 17:17:06 +0000
Subject: [PATCH 0849/1262] Update docs for tf.cast with respect to complex
 types

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 72cd56d9c1..e626a76cbb 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -761,6 +761,14 @@ def cast(x, dtype, name=None):
   tf.cast(x, tf.int32)  # [1, 2], dtype=tf.int32
   ```
 
+  The operation supports data types (for `x` and `dtype`) of
+  `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`, `float16`, `float32`,
+  `float64`, `complex64`, `complex128`, `bfloat16`. In case of casting from
+  complex types (`complex64`, `complex128`) to real types, only the real part
+  of `x` is returned. In case of casting from real types to complex types
+  (`complex64`, `complex128`), the imaginary part of the returned value is set
+  to `0`. The handling of complex types here matches the behavior of numpy.
+
   Args:
     x: A `Tensor` or `SparseTensor` of numeric type. It could be
       `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`,
-- 
GitLab


From 5895fde5492c834fed4f0e1824e70971b23d4ed4 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 16 Apr 2018 10:22:29 -0700
Subject: [PATCH 0850/1262] PiperOrigin-RevId: 193059174

---
 tensorflow/contrib/decision_trees/proto/BUILD |  5 +----
 tensorflow/contrib/tensorboard/BUILD          |  5 +----
 tensorflow/contrib/training/BUILD             |  9 +++------
 tensorflow/core/profiler/BUILD                |  5 +----
 tensorflow/python/BUILD                       | 19 ++++++++-----------
 5 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index d84b1006a2..3b50a48336 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -1,7 +1,4 @@
-package(
-    allow_proto_library_in_java_rules = 0,
-    default_visibility = ["//visibility:public"],
-)
+package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index 1c6e41a57c..2b6a2b2f3c 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -1,10 +1,7 @@
 # Description:
 # TensorBoard module containing volatile or experimental code.
 
-package(
-    allow_proto_library_in_java_rules = 0,
-    default_visibility = ["//tensorflow:internal"],
-)
+package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 310b6de0ee..4d2bfd3e43 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -5,12 +5,9 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-package(
-    allow_proto_library_in_java_rules = 0,
-    default_visibility = [
-        "//tensorflow:internal",
-    ],
-)
+package(default_visibility = [
+    "//tensorflow:internal",
+])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow:tensorflow.bzl", "py_test")
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 737692324b..af034bdd7d 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -1,7 +1,4 @@
-package(
-    allow_proto_library_in_java_rules = 0,
-    default_visibility = ["//visibility:public"],
-)
+package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index edc3c80352..14ce8a57bd 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4,17 +4,14 @@
 # Public targets:
 #  ":platform" - Low-level and platform-specific Python code.
 
-package(
-    allow_proto_library_in_java_rules = 0,
-    default_visibility = [
-        "//engedu/ml/tf_from_scratch:__pkg__",
-        "//tensorflow:internal",
-        "//tensorflow/contrib/lite/toco/python:__pkg__",
-        "//tensorflow_models:__subpackages__",
-        # TODO(aselle): to pass open source test.
-        "//bazel_pip/tensorflow/contrib/lite/toco/python:__pkg__",
-    ],
-)
+package(default_visibility = [
+    "//engedu/ml/tf_from_scratch:__pkg__",
+    "//tensorflow:internal",
+    "//tensorflow/contrib/lite/toco/python:__pkg__",
+    "//tensorflow_models:__subpackages__",
+    # TODO(aselle): to pass open source test.
+    "//bazel_pip/tensorflow/contrib/lite/toco/python:__pkg__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
-- 
GitLab


From 2738d08f6976eed04eec9c92f9cb913168847547 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 16 Apr 2018 09:40:40 -0700
Subject: [PATCH 0851/1262] Add ability to override git tag in __git_version__
 string.

Adding this functionality to make release process smoother. It will
allow us to create the release builds before creating the git
release tag.
---
 tensorflow/tensorflow.bzl              |  2 +-
 tensorflow/tools/git/gen_git_source.py | 37 +++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index bfb28d22a9..51e856bed0 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1703,7 +1703,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 372329b70c..2274d797cd 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -139,7 +139,7 @@ def configure(src_base_path, gen_path, debug=False):
     print("gen_git_source.py: spec is %r" % spec)
 
 
-def get_git_version(git_base_path):
+def get_git_version(git_base_path, git_tag_override):
   """Get the git version from the repository.
 
   This function runs `git describe ...` in the path given as `git_base_path`.
@@ -152,6 +152,9 @@ def get_git_version(git_base_path):
 
   Args:
     git_base_path: where the .git directory is located
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   Returns:
     A bytestring representing the git version
   """
@@ -161,6 +164,14 @@ def get_git_version(git_base_path):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    if git_tag_override:
+      split_val = val.split("-")
+      if len(split_val) != 3:
+        raise Exception(
+            "Expected git version in format 'tag-commits after tag-hash' "
+            "but got '%s'", val)
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -205,7 +216,7 @@ const int tf_monolithic_build() {
   open(filename, "w").write(contents)
 
 
-def generate(arglist):
+def generate(arglist, git_tag_override=None):
   """Generate version_info.cc as given `destination_file`.
 
   Args:
@@ -225,6 +236,10 @@ def generate(arglist):
   `ref_symlink` is unused in this script but passed, because the build
     system uses that file to detect when commits happen.
 
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
+
   Raises:
     RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
   """
@@ -242,11 +257,11 @@ def generate(arglist):
       raise RuntimeError(
           "Run ./configure again, branch was '%s' but is now '%s'" %
           (old_branch, new_branch))
-    git_version = get_git_version(data["path"])
+    git_version = get_git_version(data["path"], git_tag_override)
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file, source_dir):
+def raw_generate(output_file, source_dir, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -255,9 +270,12 @@ def raw_generate(output_file, source_dir):
   Args:
     output_file: Output filename for the version info cc
     source_dir: Base path of the source code
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   """
 
-  git_version = get_git_version(source_dir)
+  git_version = get_git_version(source_dir, git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -279,6 +297,11 @@ parser.add_argument(
     "--gen_root_path", type=str,
     help="Root path to place generated git files (created by --configure).")
 
+parser.add_argument(
+    "--git_tag_override", type=str,
+    help="Override git tag value in the __git_version__ string. Useful when "
+         "creating release builds before the release tag is created.")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -302,12 +325,12 @@ if args.configure is not None:
     raise RuntimeError("Must pass --gen_root_path arg when running --configure")
   configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
-  generate(args.generate)
+  generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
   source_path = "."
   if args.source_dir is not None:
     source_path = args.source_dir
-  raw_generate(args.raw_generate, source_path)
+  raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
-- 
GitLab


From 00592f397f75a51fc6d4f48a61f9fd6b96dd5cab Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 16 Apr 2018 10:41:49 -0700
Subject: [PATCH 0852/1262] Fix Exception syntax.

---
 tensorflow/tools/git/gen_git_source.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 2274d797cd..2151a75e84 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -168,8 +168,8 @@ def get_git_version(git_base_path, git_tag_override):
       split_val = val.split("-")
       if len(split_val) != 3:
         raise Exception(
-            "Expected git version in format 'tag-commits after tag-hash' "
-            "but got '%s'", val)
+            ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
+             "but got '%s'") % val)
       split_val[0] = git_tag_override
       val = bytes("-".join(split_val))
     return val if val else unknown_label
-- 
GitLab


From 1d2bc3318f88b075f6f5f1ec0892d87906da6a91 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 16 Apr 2018 11:20:40 -0700
Subject: [PATCH 0853/1262] Prefix clip_by_value with underscore in
 gen_math_ops so that it doesn't interfere with clip_ops.clip_by_value when
 importing.

PiperOrigin-RevId: 193069700
---
 tensorflow/python/framework/python_op_gen.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index e5e3b82199..ad6c36b4b1 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -98,7 +98,7 @@ bool IsOpWithUnderscorePrefix(const string& s) {
        // TODO(annarev): reduce usage of '*' imports and remove these from the
        // list.
        "fused_batch_norm", "histogram_fixed_width", "stack",
-       "batch_norm_with_global_normalization"});
+       "batch_norm_with_global_normalization", "clip_by_value"});
   return kUnderscoreOps->count(s) > 0;
 }
 
-- 
GitLab


From 1cbf75706031247460e588f281a47f0ae00d6812 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 16 Apr 2018 11:24:43 -0700
Subject: [PATCH 0854/1262] Porting tests for the `decode_proto` and
 `encode_proto` to OS.

PiperOrigin-RevId: 193070420
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   6 +-
 tensorflow/contrib/proto/BUILD                |  16 +
 .../contrib/proto/python/kernel_tests/BUILD   |  86 +++++
 .../proto/python/kernel_tests/build_defs.bzl  |  89 ++++++
 .../kernel_tests/decode_proto_fail_test.py    |  68 ++++
 .../kernel_tests/decode_proto_op_test.py      | 300 ++++++++++++++++++
 .../kernel_tests/encode_proto_op_test.py      | 180 +++++++++++
 .../python/kernel_tests/minmax.TestCase.pbtxt | 161 ++++++++++
 .../python/kernel_tests/nested.TestCase.pbtxt |  16 +
 .../kernel_tests/optional.TestCase.pbtxt      |  20 ++
 .../promote_unsigned.TestCase.pbtxt           |  21 ++
 .../python/kernel_tests/ragged.TestCase.pbtxt |  32 ++
 .../kernel_tests/shaped_batch.TestCase.pbtxt  |  62 ++++
 .../python/kernel_tests/simple.TestCase.pbtxt |  21 ++
 .../proto/python/kernel_tests/test_case.py    |  35 ++
 .../python/kernel_tests/test_example.proto    | 149 +++++++++
 tensorflow/tools/pip_package/BUILD            |   1 +
 19 files changed, 1263 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_case.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 9bef0d8b61..ae68f4aec4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -77,6 +77,7 @@ py_library(
         "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
+        "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index aaddb06fa0..e27ece8fa5 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -64,6 +64,7 @@ from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
+from tensorflow.contrib import proto
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
 from tensorflow.contrib import recurrent
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index ded15b4b66..21f59d2563 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,8 +330,10 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
-GENERATE_PYTHON_OP_LIB("decode_proto_ops")
-GENERATE_PYTHON_OP_LIB("encode_proto_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("encode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index 046652cbc5..3e9b1a0b8d 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "proto",
     srcs = [
@@ -14,3 +16,17 @@ py_library(
         "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
     ],
 )
+
+py_library(
+    name = "proto_pip",
+    data = [
+        "//tensorflow/contrib/proto/python/kernel_tests:test_messages",
+    ] + if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":proto",
+        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..a380a131f8
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -0,0 +1,86 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Much of the work in this BUILD file actually happens in the corresponding
+# build_defs.bzl, which creates an individual testcase for each example .pbtxt
+# file in this directory.
+#
+load(":build_defs.bzl", "decode_proto_test_suite")
+load(":build_defs.bzl", "encode_proto_test_suite")
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :decode_proto_op_tests.
+decode_proto_test_suite(
+    name = "decode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :encode_proto_op_tests.
+encode_proto_test_suite(
+    name = "encode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# Below here are tests that are not tied to an example text proto.
+filegroup(
+    name = "test_messages",
+    srcs = glob(["*.pbtxt"]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_py_test(
+    name = "decode_proto_fail_test",
+    size = "small",
+    srcs = ["decode_proto_fail_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/proto:proto",
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
+
+py_library(
+    name = "test_case",
+    srcs = ["test_case.py"],
+    deps = ["//tensorflow/python:client_testlib"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [
+        ":test_case",
+        ":test_example_proto_py",
+    ],
+)
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
new file mode 100644
index 0000000000..f425601691
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
@@ -0,0 +1,89 @@
+"""BUILD rules for generating file-driven proto test cases.
+
+The decode_proto_test_suite() and encode_proto_test_suite() rules take a list
+of text protos and generates a tf_py_test() for each one.
+"""
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "register_extension_info")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+def _test_name(test, path):
+  return "%s_%s_test" % (test, path.split("/")[-1].split(".")[0])
+
+def decode_proto_test_suite(name, examples):
+  """Build the decode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("decode_proto", test_filename),
+        srcs = ["decode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "decode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("decode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+def encode_proto_test_suite(name, examples):
+  """Build the encode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("encode_proto", test_filename),
+        srcs = ["encode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "encode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+            "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("encode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+register_extension_info(
+    extension_name = "decode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:decode_example_.*",
+    })
+
+register_extension_info(
+    extension_name = "encode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:encode_example_.*",
+    })
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
new file mode 100644
index 0000000000..5298342ee7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -0,0 +1,68 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class DecodeProtoFailTest(test_case.ProtoOpTestCase):
+  """Test failure cases for DecodeToProto."""
+
+  def _TestCorruptProtobuf(self, sanitize):
+    """Test failure cases for DecodeToProto."""
+
+    # The goal here is to check the error reporting.
+    # Testing against a variety of corrupt protobufs is
+    # done by fuzzing.
+    corrupt_proto = 'This is not a binary protobuf'
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(corrupt_proto, dtype=object)
+    msg_type = 'tensorflow.contrib.proto.TestCase'
+    field_names = ['sizes']
+    field_types = [dtypes.int32]
+
+    with self.test_session() as sess:
+      ctensor, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=msg_type,
+          field_names=field_names,
+          output_types=field_types,
+          sanitize=sanitize)
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   'Unable to parse binary protobuf'
+                                   '|Failed to consume entire buffer'):
+        _ = sess.run([ctensor] + vtensor)
+
+  def testCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=False)
+
+  def testSanitizerCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
new file mode 100644
index 0000000000..d1c13c82bc
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -0,0 +1,300 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for decode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+"""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class DecodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def _compareValues(self, fd, vs, evs):
+    """Compare lists/arrays of field values."""
+
+    if len(vs) != len(evs):
+      self.fail('Field %s decoded %d outputs, expected %d' %
+                (fd.name, len(vs), len(evs)))
+    for i, ev in enumerate(evs):
+      # Special case fuzzy match for float32. TensorFlow seems to mess with
+      # MAX_FLT slightly and the test doesn't work otherwise.
+      # TODO(nix): ask on TF list about why MAX_FLT doesn't pass through.
+      if fd.cpp_type == fd.CPPTYPE_FLOAT:
+        # Numpy isclose() is better than assertIsClose() which uses an absolute
+        # value comparison.
+        self.assertTrue(
+            np.isclose(vs[i], ev), 'expected %r, actual %r' % (ev, vs[i]))
+      elif fd.cpp_type == fd.CPPTYPE_STRING:
+        # In Python3 string tensor values will be represented as bytes, so we
+        # reencode the proto values to match that.
+        self.assertEqual(vs[i], ev.encode('ascii'))
+      else:
+        # Doubles and other types pass through unscathed.
+        self.assertEqual(vs[i], ev)
+
+  def _compareRepeatedPrimitiveValue(self, batch_shape, sizes, fields,
+                                     field_dict):
+    """Compare protos of type RepeatedPrimitiveValue.
+
+    Args:
+      batch_shape: the shape of the input tensor of serialized messages.
+      sizes: int matrix of repeat counts returned by decode_proto
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      field_dict: map from field names to decoded numpy tensors of values
+    """
+
+    # Check that expected values match.
+    for field in fields:
+      values = field_dict[field.name]
+      self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
+
+      fd = field.expected.DESCRIPTOR.fields_by_name[field.name]
+
+      # Values has the same shape as the input plus an extra
+      # dimension for repeats.
+      self.assertEqual(list(values.shape)[:-1], batch_shape)
+
+      # Nested messages are represented as TF strings, requiring
+      # some special handling.
+      if field.name == 'message_value':
+        vs = []
+        for buf in values.flat:
+          msg = test_example_pb2.PrimitiveValue()
+          msg.ParseFromString(buf)
+          vs.append(msg)
+        evs = getattr(field.expected, field.name)
+        if len(vs) != len(evs):
+          self.fail('Field %s decoded %d outputs, expected %d' %
+                    (fd.name, len(vs), len(evs)))
+        for v, ev in zip(vs, evs):
+          self.assertEqual(v, ev)
+        continue
+
+      # This can be a little confusing. For testing we are using
+      # RepeatedPrimitiveValue in two ways: it's the proto that we
+      # decode for testing, and it's used in the expected value as a
+      # union type. The two cases are slightly different: this is the
+      # second case.
+      # We may be fetching the uint64_value from the test proto, but
+      # in the expected proto we store it in the int64_value field
+      # because TensorFlow doesn't support unsigned int64.
+      tf_type_to_primitive_value_field = {
+          dtypes.float32:
+              'float_value',
+          dtypes.float64:
+              'double_value',
+          dtypes.int32:
+              'int32_value',
+          dtypes.uint8:
+              'uint8_value',
+          dtypes.int8:
+              'int8_value',
+          dtypes.string:
+              'string_value',
+          dtypes.int64:
+              'int64_value',
+          dtypes.bool:
+              'bool_value',
+          # Unhandled TensorFlow types:
+          # DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+          # DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+      }
+      tf_field_name = tf_type_to_primitive_value_field.get(field.dtype)
+      if tf_field_name is None:
+        self.fail('Unhandled tensorflow type %d' % field.dtype)
+
+      self._compareValues(fd, values.flat,
+                          getattr(field.expected, tf_field_name))
+
+  def _runDecodeProtoTests(self, fields, case_sizes, batch_shape, batch,
+                           message_type, message_format, sanitize,
+                           force_disordered=False):
+    """Run decode tests on a batch of messages.
+
+    Args:
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      case_sizes: expected sizes array
+      batch_shape: the shape of the input tensor of serialized messages
+      batch: list of serialized messages
+      message_type: descriptor name for messages
+      message_format: format of messages, 'text' or 'binary'
+      sanitize: whether to sanitize binary protobuf inputs
+      force_disordered: whether to force fields encoded out of order.
+    """
+
+    if force_disordered:
+      # Exercise code path that handles out-of-order fields by prepending extra
+      # fields with tag numbers higher than any real field. Note that this won't
+      # work with sanitization because that forces reserialization using a
+      # trusted decoder and encoder.
+      assert not sanitize
+      extra_fields = test_example_pb2.ExtraFields()
+      extra_fields.string_value = 'IGNORE ME'
+      extra_fields.bool_value = False
+      extra_msg = extra_fields.SerializeToString()
+      batch = [extra_msg + msg for msg in batch]
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(batch, dtype=object)
+    batch = np.reshape(batch, batch_shape)
+
+    field_names = [f.name for f in fields]
+    output_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=output_types,
+          message_format=message_format,
+          sanitize=sanitize)
+
+      vlist = sess.run([sizes] + vtensor)
+      sizes = vlist[0]
+      # Values is a list of tensors, one for each field.
+      value_tensors = vlist[1:]
+
+      # Check that the repeat sizes are correct.
+      self.assertTrue(
+          np.all(np.array(sizes.shape) == batch_shape + [len(field_names)]))
+
+      # Check that the decoded sizes match the expected sizes.
+      self.assertEqual(len(sizes.flat), len(case_sizes))
+      self.assertTrue(
+          np.all(sizes.flat == np.array(
+              case_sizes, dtype=np.int32)))
+
+      field_dict = dict(zip(field_names, value_tensors))
+
+      self._compareRepeatedPrimitiveValue(batch_shape, sizes, fields,
+                                          field_dict)
+
+  def testBinary(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testBinaryDisordered(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False,
+        force_disordered=True)
+
+  def testPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    packed_batch = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        packed_batch,
+        'tensorflow.contrib.proto.PackedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testText(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Note: float_format='.17g' is necessary to ensure preservation of
+    # doubles and floats in text format.
+    text_batch = [
+        text_format.MessageToString(
+            primitive, float_format='.17g') for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        text_batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'text',
+        sanitize=False)
+
+  def testSanitizerGood(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
new file mode 100644
index 0000000000..30e58e6336
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -0,0 +1,180 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for encode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+
+It tests that encode_proto is a lossless inverse of decode_proto
+(for the specified fields).
+"""
+# Python3 readiness boilerplate
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class EncodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def testBadInputs(self):
+    # Invalid field name
+    with self.test_session():
+      with self.assertRaisesOpError('Unknown field: non_existent_field'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['non_existent_field']).eval()
+
+    # Incorrect types.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Incompatible type for field double_value.'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval()
+
+    # Incorrect shapes of sizes.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          r'sizes should be batch_size \+ \[len\(field_names\)\]'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values = array_ops.placeholder(dtypes.float64)
+        encode_proto_op.encode_proto(
+            sizes=sizes,
+            values=[values],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval(feed_dict={
+                sizes: [[[0, 0]]],
+                values: [[0.0]]
+            })
+
+    # Inconsistent shapes of values.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Values must match up to the last dimension'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values1 = array_ops.placeholder(dtypes.float64)
+        values2 = array_ops.placeholder(dtypes.int32)
+        (encode_proto_op.encode_proto(
+            sizes=[[1, 1]],
+            values=[values1, values2],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value', 'int32_value']).eval(feed_dict={
+                values1: [[0.0]],
+                values2: [[0], [0]]
+            }))
+
+  def _testRoundtrip(self, in_bufs, message_type, fields):
+
+    field_names = [f.name for f in fields]
+    out_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, field_tensors = decode_proto_op.decode_proto(
+          in_bufs,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=out_types)
+
+      out_tensors = encode_proto_op.encode_proto(
+          sizes,
+          field_tensors,
+          message_type=message_type,
+          field_names=field_names)
+
+      out_bufs, = sess.run([out_tensors])
+
+      # Check that the re-encoded tensor has the same shape.
+      self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+      # Compare the input and output.
+      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+        in_obj = test_example_pb2.RepeatedPrimitiveValue()
+        in_obj.ParseFromString(in_buf)
+
+        out_obj = test_example_pb2.RepeatedPrimitiveValue()
+        out_obj.ParseFromString(out_buf)
+
+        # Check that the deserialized objects are identical.
+        self.assertEqual(in_obj, out_obj)
+
+        # Check that the input and output serialized messages are identical.
+        # If we fail here, there is a difference in the serialized
+        # representation but the new serialization still parses. This could
+        # be harmless (a change in map ordering?) or it could be bad (e.g.
+        # loss of packing in the encoding).
+        self.assertEqual(in_buf, out_buf)
+
+  def testRoundtrip(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    in_bufs = [primitive.SerializeToString() for primitive in case.primitive]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.RepeatedPrimitiveValue', case.field)
+
+  def testRoundtripPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    in_bufs = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.PackedPrimitiveValue', case.field)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
new file mode 100644
index 0000000000..b170f89c0f
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
@@ -0,0 +1,161 @@
+primitive {
+  double_value: -1.7976931348623158e+308
+  double_value: 2.2250738585072014e-308
+  double_value: 1.7976931348623158e+308
+  float_value: -3.402823466e+38
+  float_value: 1.175494351e-38
+  float_value: 3.402823466e+38
+  int64_value: -9223372036854775808
+  int64_value: 9223372036854775807
+  uint64_value: 0
+  uint64_value: 18446744073709551615
+  int32_value: -2147483648
+  int32_value: 2147483647
+  fixed64_value: 0
+  fixed64_value: 18446744073709551615
+  fixed32_value: 0
+  fixed32_value: 4294967295
+  bool_value: false
+  bool_value: true
+  string_value: ""
+  string_value: "I refer to the infinite."
+  uint32_value: 0
+  uint32_value: 4294967295
+  sfixed32_value: -2147483648
+  sfixed32_value: 2147483647
+  sfixed64_value: -9223372036854775808
+  sfixed64_value: 9223372036854775807
+  sint32_value: -2147483648
+  sint32_value: 2147483647
+  sint64_value: -9223372036854775808
+  sint64_value: 9223372036854775807
+}
+shape: 1
+sizes: 3
+sizes: 3
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: -1.7976931348623158e+308
+    double_value: 2.2250738585072014e-308
+    double_value: 1.7976931348623158e+308
+  }
+}
+field {
+  name: "float_value"
+  dtype: DT_FLOAT
+  expected {
+    float_value: -3.402823466e+38
+    float_value: 1.175494351e-38
+    float_value: 3.402823466e+38
+  }
+}
+field {
+  name: "int64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "uint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1
+  }
+}
+field {
+  name: "int32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "fixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1  # unsigned is 18446744073709551615
+  }
+}
+field {
+  name: "fixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: false
+    bool_value: true
+  }
+}
+field {
+  name: "string_value"
+  dtype: DT_STRING
+  expected {
+    string_value: ""
+    string_value: "I refer to the infinite."
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "sfixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sfixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "sint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
new file mode 100644
index 0000000000..c664e52851
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
@@ -0,0 +1,16 @@
+primitive {
+  message_value {
+    double_value: 23.5
+  }
+}
+shape: 1
+sizes: 1
+field {
+  name: "message_value"
+  dtype: DT_STRING
+  expected {
+    message_value {
+      double_value: 23.5
+    }
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
new file mode 100644
index 0000000000..125651d7ea
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
@@ -0,0 +1,20 @@
+primitive {
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 0
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 0.0
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
new file mode 100644
index 0000000000..db7555bf2d
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  fixed32_value: 4294967295
+  uint32_value: 4294967295
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "fixed32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
new file mode 100644
index 0000000000..61c7ac53f7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
@@ -0,0 +1,32 @@
+primitive {
+  double_value: 23.5
+  double_value: 123.0
+  bool_value: true
+}
+primitive {
+  double_value: 3.1
+  bool_value: false
+}
+shape: 2
+sizes: 2
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 123.0
+    double_value: 3.1
+    double_value: 0.0
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
new file mode 100644
index 0000000000..f4828076d5
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
@@ -0,0 +1,62 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+primitive {
+  double_value: 44.0
+  bool_value: false
+}
+primitive {
+  double_value: 3.14159
+  bool_value: true
+}
+primitive {
+  double_value: 1.414
+  bool_value: true
+}
+primitive {
+  double_value: -32.2
+  bool_value: false
+}
+primitive {
+  double_value: 0.0001
+  bool_value: true
+}
+shape: 3
+shape: 2
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 44.0
+    double_value: 3.14159
+    double_value: 1.414
+    double_value: -32.2
+    double_value: 0.0001
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+    bool_value: true
+    bool_value: true
+    bool_value: false
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
new file mode 100644
index 0000000000..dc20ac147b
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_case.py b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
new file mode 100644
index 0000000000..b95202c5df
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
@@ -0,0 +1,35 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Test case base for testing proto operations."""
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+from tensorflow.python.platform import test
+
+
+class ProtoOpTestCase(test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(ProtoOpTestCase, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..dc495034ff
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -0,0 +1,149 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.proto;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 376644718f..a0bae23a7c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -74,6 +74,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
-- 
GitLab


From 65cb41eb36412114cdef85de1c3958e2db9831ac Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Mon, 16 Apr 2018 11:27:09 -0700
Subject: [PATCH 0855/1262] Remove the hidden replicate_model_fn copy from
 core.

PiperOrigin-RevId: 193070799
---
 tensorflow/python/estimator/BUILD             |   67 -
 .../python/estimator/replicate_model_fn.py    |  824 --------
 .../estimator/replicate_model_fn_test.py      | 1739 -----------------
 3 files changed, 2630 deletions(-)
 delete mode 100644 tensorflow/python/estimator/replicate_model_fn.py
 delete mode 100644 tensorflow/python/estimator/replicate_model_fn_test.py

diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index a34405c702..7bf4447491 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -7,7 +7,6 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 py_library(
     name = "estimator_py",
@@ -25,7 +24,6 @@ py_library(
         ":linear",
         ":model_fn",
         ":parsing_utils",
-        ":replicate_model_fn",
         ":run_config",
         ":training",
         "//tensorflow/python:util",
@@ -909,68 +907,3 @@ py_test(
         "//tensorflow/python:training",
     ],
 )
-
-py_library(
-    name = "replicate_model_fn",
-    srcs = [
-        "replicate_model_fn.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":export_output",
-        ":model_fn",
-        ":util",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:device_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/losses",
-        "@six_archive//:six",
-    ],
-)
-
-cuda_py_test(
-    name = "replicate_model_fn_test",
-    size = "medium",
-    srcs = ["replicate_model_fn_test.py"],
-    additional_deps = [
-        "//tensorflow/python/estimator",
-        ":dnn",
-        ":export_export",
-        ":export_output",
-        ":model_fn",
-        ":numpy_io",
-        ":optimizers",
-        ":prediction_keys",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/ops/losses",
-        "//tensorflow/python/saved_model:signature_constants",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        ":replicate_model_fn",
-    ],
-    tags = [
-        "multi_gpu",
-        "noasan",  # flaky time outs
-        "notsan",  # flaky
-    ],
-)
diff --git a/tensorflow/python/estimator/replicate_model_fn.py b/tensorflow/python/estimator/replicate_model_fn.py
deleted file mode 100644
index 144d89abf3..0000000000
--- a/tensorflow/python/estimator/replicate_model_fn.py
+++ /dev/null
@@ -1,824 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities to replicate model_fn's over local GPUs.
-
-This file contains util that allow to replicate `Estimator.model_fn` over
-GPUs.  Replicated version of a `model_fn` is returned that can subsequently
-be used with `Estimator`.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import defaultdict
-from contextlib import contextmanager
-import copy
-
-import six
-
-from tensorflow.core.framework import node_def_pb2
-from tensorflow.python.client import device_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import util
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import device as framework_device
-from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import device_setter as device_setter_lib
-from tensorflow.python.training import optimizer as optimizer_lib
-
-
-def _replicate_model_fn(model_fn,
-                        devices=None):
-  """Replicate `Estimator.model_fn` over GPUs.
-
-  The given `model_fn` specifies a single forward pass of a model.  To replicate
-  such a model over GPUs, each GPU gets its own instance of the forward pass
-  (a.k.a. a tower).  The input features and labels get sharded into the chunks
-  that correspond to the number of GPUs.  Each tower computes a loss based
-  on its input.  For each such loss, gradients are computed.  After that, the
-  available losses are aggregated to form aggregated loss.  Available
-  gradients are summed.  Then, they update weights using the specified
-  optimizer.
-
-  If `devices` are `None`, then all available GPUs are going to be used for
-  replication.  If no GPUs are available, then the model is going to be
-  placed on the CPU.
-
-  Two modes of local replication over available GPUs are supported:
-    1)  If exactly 1 GPU is detected, then variables and operations are placed
-        onto the GPU.
-    2)  If more than 1 GPU is detected, then variables are going to be placed on
-        the CPU.  Replicas of operations are placed on each individual GPU.
-
-  Here is an example of how one might use their `model_fn` to run over GPUs:
-    ```python
-       ...
-       def model_fn(...):  # See `model_fn` in `Estimator`.
-         loss = ...
-         optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
-         optimizer = tf.contrib.estimator._TowerOptimizer(optimizer)
-         if mode == tf.estimator.ModeKeys.TRAIN:
-           #  See the section below on `EstimatorSpec.train_op`.
-           return EstimatorSpec(mode=mode, loss=loss,
-                                train_op=optimizer.minimize(loss))
-
-         #  No change for `ModeKeys.EVAL` or `ModeKeys.PREDICT`.
-         return EstimatorSpec(...)
-       ...
-       classifier = tf.estimator.Estimator(
-         model_fn=tf.contrib.estimator.replicate_model_fn(model_fn))
-    ```
-
-  Please see `DNNClassifierIntegrationTest` for an example with a canned
-  Estimator.
-
-  On `EstimatorSpec.train_op`:
-  `model_fn` returns `EstimatorSpec.train_op` for
-  `tf.estimator.GraphKeys.TRAIN`. It is typically derived using an optimizer.
-  Towers are expected to populate it in the same way.  Gradients from all towers
-  are reduced and applied in the last tower.  To achieve that in the case of
-  multiple towers, `_TowerOptimizer` needs to be used.  See `_TowerOptimizer`.
-
-  On sharding input features and labels:
-  Input features and labels are split for consumption by each tower. They are
-  split across the dimension 0.  Features and labels need to be batch major.
-
-  On reduction algorithms:
-  Certain algorithms were chosen for aggregating results of computations on
-  multiple towers:
-    - Losses from all towers are reduced according to `loss_reduction` argument
-      to TowerOptimizer..
-    - Gradients from all towers are reduced according to the `loss_reduction`
-      for each trainable variable.
-    - `eval_metrics_ops` are reduced per metric using `reduce_mean`.
-    - `EstimatorSpec.predictions` and `EstimatorSpec.export_outputs` are
-      reduced using concatenation.
-    - For all other fields of `EstimatorSpec` the values of the first tower
-      are taken.
-
-  On distribution of variables:
-  Variables are not duplicated between towers.  Instead, they are placed on a
-  single device as defined above and shared across towers.
-
-  On overhead:
-  If only one device is specified, then aggregation of loss and gradients
-  doesn't happen. Replication consists of placing `model_fn` onto the
-  specified device.
-
-  On current limitations:
-    - `predictions` are not supported for `ModeKeys.EVAL`.  They are required
-       for `tf.contrib.estimator.add_metrics`.
-
-  Args:
-    model_fn: `model_fn` as defined in `Estimator`.  See the section above about
-      the train_op argument of `EstimatorSpec`.
-    devices: Optional list of devices to replicate the model across.  This
-      argument can be used to replice only on the subset of available GPUs.
-      If `None`, then all available GPUs are going to be used for replication.
-      If no GPUs are available, then the model is going to be placed on the CPU.
-
-  Returns:
-    A replicated version of the supplied `model_fn`. Returned function that
-      conforms to the requirements of `Estimator`'s `model_fn` and can be used
-      instead of the supplied `model_fn`.
-  """
-  return _replicate_model_fn_with_mode(
-      model_fn,
-      devices,
-      # TODO(isaprykin): Query the system configuration to choose modes other
-      # than `SHARED_LOCAL_PARAMETER_SERVER`, even though it is often
-      # appropriate.
-      mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER)
-
-
-class _VariableDistributionMode(object):
-  """Modes for variable distribution used for forcing a particular one.
-
-  Forcing a mode is meant for performance experimentation purposes rather than
-  for general use cases.
-  """
-
-  SHARED_LOCAL_PARAMETER_SERVER = 1
-  """Variables are placed on a single device and shared across all devices.
-
-  Two ways to achieve this distribution over available GPUs are supported:
-    1)  If exactly 1 GPU is detected, then variables and operations are placed
-        onto GPU.
-    2)  If more than 1 GPU is detected, then variables are going to be placed on
-        the CPU.  Replicas of operations are placed on each individual GPU.
-  """
-
-  SHARED_ROUND_ROBIN = 2
-  """Variables are placed on all devices in a round-robin fashion.
-
-  Every subsequent variable is placed on the next device.  There is only one
-  copy of each variable that is shared across all devices.
-  """
-
-
-def _replicate_model_fn_with_mode(
-    model_fn,
-    devices=None,
-    mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER):
-  """A version of `replicate_model_fn` that allows to specify a `mode`."""
-  if not devices:
-    devices = _get_local_devices('GPU') or _get_local_devices('CPU')
-
-  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0].upper()
-  consolidation_device = devices[0] if is_a_single_gpu_case else '/CPU:0'
-
-  ps_devices = [consolidation_device]
-  if mode == _VariableDistributionMode.SHARED_ROUND_ROBIN:
-    ps_devices = devices
-
-  tf_logging.info('Replicating the `model_fn` across {}.  Variables are going '
-                  'to be placed on {}.  Consolidation device is going to be {}.'
-                  .format(devices, ps_devices, consolidation_device))
-
-  def single_device_model_fn(features, labels, mode, params=None, config=None):
-    """`model_fn` on a single device without reduction overhead."""
-    return _get_loss_towers(
-        model_fn=model_fn,
-        mode=mode,
-        features=[features],
-        labels=[labels],
-        params=params,
-        config=config,
-        devices=devices,
-        local_ps_devices=ps_devices)[0]  # One device, so one spec is out.
-
-  def replicated_model_fn(features, labels, mode, params=None, config=None):
-    """Replicated version of `model_fn` to be used instead."""
-    feature_shards, label_shards = _split_batch(
-        features, labels, len(devices), device=consolidation_device)
-    tower_specs = _get_loss_towers(
-        model_fn=model_fn,
-        mode=mode,
-        features=feature_shards,
-        labels=label_shards,
-        params=params,
-        config=config,
-        devices=devices,
-        local_ps_devices=ps_devices)
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      train_op = _minimize_towers(tower_specs)
-      return _train_spec(
-          tower_specs, train_op, aggregation_device=consolidation_device)
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return _eval_spec(tower_specs, aggregation_device=consolidation_device)
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return _predict_spec(tower_specs, aggregation_device=consolidation_device)
-
-  if len(devices) == 1:
-    return single_device_model_fn
-  else:
-    return replicated_model_fn
-
-
-class _TowerOptimizer(optimizer_lib.Optimizer):
-  """Gathers gradients from all towers and reduces them in the last one."""
-
-  COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states'
-
-  def __init__(self, optimizer_or_optimizer_fn,
-               loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE):
-    """Wrap an existing optimizer for gathering gradients across towers.
-
-    Each invocation of model_fn has to call the same optimizers in the same
-    order.
-
-    Multiple optimizers that use the same or different losses are supported.
-
-    If _TowerOptimizer is used but `replicate_model_fn` isn't, then no
-    aggregation will happen.  All calls will simply be forwarded to the
-    underlying optimizer. The behavior is similar if there is only one tower.
-
-    If _TowerOptimizer is used together with SyncReplicasOptimizer that wraps
-    the user's optimizer, then it's the SyncReplicasOptimizer that needs to be
-    wrapped with _TowerOptimizer.
-
-    Args:
-      optimizer_or_optimizer_fn: an instance of optimizer to wrap.  That
-        instance is going to be used for optimizer-specific logic.  This can
-        also be a no-argument function that returns such an optimizer instance.
-      loss_reduction: controls whether losses are summed or averaged.
-    """
-    self._optimizer_or_optimizer_fn = optimizer_or_optimizer_fn
-    self._loss_reduction = loss_reduction
-
-  @staticmethod
-  def has_been_used():
-    return _TowerOptimizer._graph_state().has_tower_optimizer_been_used
-
-  def get_slot(self, *args, **kwargs):
-    return self._get_optimizer().get_slot(*args, **kwargs)
-
-  def get_slot_names(self, *args, **kwargs):
-    return self._get_optimizer().get_slot_names(*args, **kwargs)
-
-  def get_name(self, *args, **kwargs):
-    return self._get_optimizer().get_name(*args, **kwargs)
-
-  def variables(self, *args, **kwargs):
-    return self._get_optimizer().variables(*args, **kwargs)
-
-  def compute_gradients(self, loss, *args, **kwargs):
-    """Compute gradients, but first, if needed, scale the loss."""
-    _TowerOptimizer._graph_state().set_loss_reduction(self._loss_reduction)
-    loss = _scale_loss(loss,
-                       self._loss_reduction,
-                       self._graph_state().number_of_towers)
-    return self._get_optimizer().compute_gradients(loss, *args, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, global_step=None, **kwargs):
-    """Collect gradients updates to apply them with the last tower."""
-    if self._graph_state().number_of_towers == 1:
-      # Avoid the overhead of reduction if there's only one tower.
-      #
-      # There assumed to be only one tower if aggregation-related methods were
-      # not called by `_get_loss_towers`, for example if the model_fn uses
-      # TowerEstimator, but `replicate_model_fn` isn't used.
-      return self._get_optimizer().apply_gradients(grads_and_vars, global_step,
-                                                   **kwargs)
-
-    self._graph_state().collect_gradients(grads_and_vars)
-
-    if not self._graph_state().is_the_last_tower:
-      with ops_lib.control_dependencies(_extract_tensors(grads_and_vars)):
-        return self._construct_no_op_train_op()
-    else:
-      # Gradients need to be gathered and applied in the scope of the first
-      # tower, so that the tensors are accessible via names without prefixes.
-      var_scope, name_scope = self._graph_state().scopes_of_the_first_tower
-      with variable_scope.variable_scope(var_scope):
-        with ops_lib.name_scope(name_scope):
-          return self._apply_gathered_gradients(global_step, **kwargs)
-
-  def _apply_gathered_gradients(self, global_step, **kwargs):
-    graph_state = self._graph_state()
-    optimizer = self._get_optimizer()
-
-    grad_lists = {}
-    for grad, var in graph_state.get_latest_gradients_from_all_towers():
-      if grad is not None:
-        grad_lists.setdefault(var, []).append(grad)
-
-    aggregated_grads = []
-    with ops_lib.name_scope('gradient_aggregating'):
-      for var, grads in six.iteritems(grad_lists):
-        grad = _compute_sum_on_device(grads, var.device)
-        aggregated_grads.append((grad, var))
-    return optimizer.apply_gradients(
-        aggregated_grads, global_step=global_step, **kwargs)
-
-  def _get_optimizer(self):
-    if callable(self._optimizer_or_optimizer_fn):
-      # If optimizer is given as a function then we need to wait till we are
-      # under the right graph context before constructing it.  That's why the
-      # optimizer is constructed in _get_optimizer() rather than __init__().
-      self._optimizer_or_optimizer_fn = self._optimizer_or_optimizer_fn()
-    self._graph_state().has_tower_optimizer_been_used = True
-    return self._optimizer_or_optimizer_fn
-
-  def _construct_no_op_train_op(self):
-    return control_flow_ops.no_op(name='train_op_placeholder')
-
-  @staticmethod
-  def _graph_state():
-    graph_states = ops_lib.get_default_graph().get_collection_ref(
-        _TowerOptimizer.COLLECTION_FOR_GRAPH_STATES)
-    if not graph_states:
-      graph_states.append(_TowerOptimizer._PerGraphState())
-    return graph_states[-1]
-
-  @staticmethod
-  def _did_towers_have_same_optimizer_calls():
-    graph_state = _TowerOptimizer._graph_state()
-    return graph_state.did_towers_have_same_optimizer_calls()
-
-  @staticmethod
-  def _clear_graph_state():
-    # Clearing the Graph collection will prevent _PerGraphState from being
-    # serialized.
-    ops_lib.get_default_graph().clear_collection(
-        _TowerOptimizer.COLLECTION_FOR_GRAPH_STATES)
-
-  class _PerGraphState(object):
-    """Gradient reduction related state of a Tensorflow graph."""
-
-    def __init__(self):
-      self._collected_grads_and_vars = defaultdict(list)
-      self._current_tower_index = 0
-      self._number_of_towers = 1
-      self._loss_reduction = None
-      # Scopes of the first tower that don't have a prefix:
-      self._variable_scope = None
-      self._name_scope = None
-      # If needed, alert that _TowerOptimizer needs to be used with model_fn.
-      self._has_tower_optimizer_been_used = False
-
-    def collect_gradients(self, grads_and_vars):
-      self._collected_grads_and_vars[self._current_tower_index].append(
-          grads_and_vars)
-
-    def get_latest_gradients_from_all_towers(self):
-      """Get gradients across towers for the last called optimizer."""
-      grads_and_vars = []
-      index_of_last_gradients = len(
-          self._collected_grads_and_vars[self._current_tower_index]) - 1
-      for tower_id in range(self._current_tower_index + 1):
-        grads_and_vars.extend(
-            self._collected_grads_and_vars[tower_id][index_of_last_gradients])
-      return grads_and_vars
-
-    def set_number_of_towers(self, number_of_towers):
-      self._number_of_towers = number_of_towers
-
-    def set_loss_reduction(self, loss_reduction):
-      self._loss_reduction = loss_reduction
-
-    @contextmanager
-    def tower(self, tower_id, var_scope, name_scope):
-      if tower_id == 0:
-        self._variable_scope = var_scope
-        self._name_scope = name_scope
-      self._current_tower_index = tower_id
-      yield
-
-    @property
-    def scopes_of_the_first_tower(self):
-      return self._variable_scope, self._name_scope
-
-    @property
-    def is_the_last_tower(self):
-      return self._current_tower_index == (self._number_of_towers - 1)
-
-    @property
-    def number_of_towers(self):
-      return self._number_of_towers
-
-    @property
-    def loss_reduction(self):
-      return self._loss_reduction
-
-    @property
-    def has_tower_optimizer_been_used(self):
-      return self._has_tower_optimizer_been_used
-
-    @has_tower_optimizer_been_used.setter
-    def has_tower_optimizer_been_used(self, value):
-      self._has_tower_optimizer_been_used = value
-
-    def did_towers_have_same_optimizer_calls(self):
-      total_number_of_grads = sum([
-          len(grads)
-          for _, grads in six.iteritems(self._collected_grads_and_vars)
-      ])
-      return total_number_of_grads % self._number_of_towers == 0
-
-
-def _get_local_devices(device_type):
-  local_device_protos = device_lib.list_local_devices()
-  return [
-      device.name
-      for device in local_device_protos
-      if device.device_type == device_type
-  ]
-
-
-def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labes into batches."""
-
-  def ensure_divisible_by_shards(sequence):
-    batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
-    if batch_size % number_of_shards != 0:
-      raise ValueError(
-          'Batch size {} needs to be divisible by the number of GPUs, which '
-          'is {}.'.format(batch_size, number_of_shards))
-
-  def split_dictionary(dictionary):
-    """Split a dictionary into shards."""
-    shards = [{} for _ in range(number_of_shards)]
-    for name, tensor in six.iteritems(dictionary):
-      if isinstance(tensor, sparse_tensor.SparseTensor):
-        for i, shard in enumerate(
-            sparse_ops.sparse_split(
-                sp_input=tensor, num_split=number_of_shards, axis=0)):
-          shards[i][name] = shard
-      else:
-        ensure_divisible_by_shards(tensor)
-        for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
-          shards[i][name] = shard
-    return shards
-
-  with ops_lib.name_scope('split_inputs'):
-    with ops_lib.device(device):
-      if isinstance(features, dict):
-        feature_shards = split_dictionary(features)
-      else:
-        ensure_divisible_by_shards(features)
-        feature_shards = array_ops.split(features, number_of_shards)
-
-      if labels is None:
-        label_shards = None
-      elif isinstance(labels, dict):
-        label_shards = split_dictionary(labels)
-      else:
-        ensure_divisible_by_shards(labels)
-        label_shards = array_ops.split(labels, number_of_shards)
-  return feature_shards, label_shards
-
-
-_DEFAULT_NAME_SCOPE_PATTERN = 'tower_{}'
-
-
-def _get_loss_towers(model_fn,
-                     mode,
-                     features,
-                     labels,
-                     params,
-                     config,
-                     devices,
-                     local_ps_devices,
-                     name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
-  """Replicate the loss computation across devices."""
-  tower_specs = []
-
-  model_fn_args = util.fn_args(model_fn)
-  optional_params = {}
-  if 'params' in model_fn_args:
-    optional_params['params'] = copy.deepcopy(params)
-  if 'config' in model_fn_args:
-    optional_params['config'] = copy.deepcopy(config)
-
-  # pylint: disable=protected-access
-  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
-      num_tasks=len(local_ps_devices))
-  _TowerOptimizer._graph_state().set_number_of_towers(len(devices))
-
-  for i, device in enumerate(devices):
-    is_the_first_tower = (i == 0)
-
-    device_setter = _local_device_setter(
-        worker_device=device,
-        ps_devices=local_ps_devices,
-        ps_strategy=round_robin_strategy)
-
-    # We would like to preserve the names of the variables and ops that the user
-    # might be relying on. Names without a prefix are going to resolve to
-    # variables and ops of the first tower.
-    name_scope = name_scope_pattern
-    if is_the_first_tower:
-      name_scope = ''
-
-    with variable_scope.variable_scope(
-        '', reuse=not is_the_first_tower) as var_scope:
-      with ops_lib.name_scope(name_scope.format(i)) as name_scope:
-        with _TowerOptimizer._graph_state().tower(
-            tower_id=i, var_scope=var_scope, name_scope=name_scope):
-          with ops_lib.device(device_setter):
-            labels_shard = None
-            if labels:
-              labels_shard = labels[i]
-
-            tower_spec = model_fn(
-                mode=mode,
-                features=features[i],
-                labels=labels_shard,
-                **optional_params)
-
-            if (tower_spec.train_op is not None and len(devices) > 1 and
-                not _TowerOptimizer.has_been_used()):
-              raise ValueError('Please wrap optimizers with _TowerOptimizer'
-                               ' in order to use replicate_model_fn with'
-                               ' multiple `devices`.')
-
-            # Scaling the loss here doesn't actually affect gradients.  Another
-            # instance of scaling happens inside the _TowerOptimizer.
-            tower_spec = _scale_tower_loss(
-                tower_spec,
-                _TowerOptimizer._graph_state().loss_reduction,
-                number_of_towers=len(devices))
-            tower_specs.append(tower_spec)
-
-  if not _TowerOptimizer._did_towers_have_same_optimizer_calls():
-    raise ValueError('Each invocation of model_fn was supposed to make the same'
-                     ' optimizer calls.')
-  _TowerOptimizer._clear_graph_state()
-  # pylint: enable=protected-access
-  return tower_specs
-
-
-def _local_device_setter(worker_device, ps_devices, ps_strategy):
-  """A device setter that puts distributes Var/Ops to PS/workers."""
-  ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
-
-  def local_device_chooser(op):
-    current_device = framework_device.DeviceSpec.from_string(op.device or '')
-
-    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
-    if node_def.op in ps_ops:
-      ps_device_spec = framework_device.DeviceSpec.from_string(
-          '{}'.format(ps_devices[ps_strategy(op)]))
-
-      ps_device_spec.merge_from(current_device)
-      return ps_device_spec.to_string()
-    else:
-      worker_device_spec = framework_device.DeviceSpec.from_string(
-          worker_device or '')
-      worker_device_spec.merge_from(current_device)
-      return worker_device_spec.to_string()
-
-  return local_device_chooser
-
-
-def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with approproriately scaled loss."""
-  if tower_spec.loss is None:
-    return tower_spec
-
-  estimator_spec = _asdict(tower_spec)
-  estimator_spec['loss'] = _scale_loss(
-      tower_spec.loss,
-      loss_reduction,
-      number_of_towers,
-      reduced_loss_name='averaged_loss')
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _scale_loss(loss, loss_reduction, number_of_towers, reduced_loss_name=None):
-  """If needed, scale down the loss for averaging loss by summing."""
-  if loss is None:
-    return None
-  if number_of_towers == 1:
-    return loss
-
-  if loss_reduction == losses.Reduction.NONE:
-    raise ValueError('Tower losses need to be reduced in some way, yet {} '
-                     'reduction is specified.'.format(loss_reduction))
-
-  if loss_reduction != losses.Reduction.SUM:
-    return math_ops.div(loss, 1.0 * number_of_towers, name=reduced_loss_name)
-  else:
-    return loss
-
-
-def _minimize_towers(tower_specs):
-  """`train_op` of the last tower applies aggregated gradients."""
-  return tower_specs[-1].train_op
-
-
-def _compute_sum_on_device(values, device, name=None):
-  with ops_lib.device(device):
-    if isinstance(values[0], ops_lib.IndexedSlices):
-      if name:
-        raise ValueError('The name {} is not expected to be given to '
-                         'IndexedSlices {}'.format(name, values))
-
-      values_concat = array_ops.concat([v.values for v in values], axis=0)
-      indices_concat = array_ops.concat([v.indices for v in values], axis=0)
-      return ops_lib.IndexedSlices(values_concat, indices_concat,
-                                   values[0].dense_shape)
-    else:
-      return math_ops.add_n(values, name=name)
-
-
-def _train_spec(tower_specs,
-                train_op,
-                aggregation_device,
-                aggregated_loss_name='loss'):
-  """Populate replicated EstimatorSpec for `GraphKeys.TRAIN`."""
-  # Spec of the last tower is used as the template for the final spec, because
-  # some `EstimatorSpec.training_hooks` rely on calls made in model_fn.  For
-  # example, `SyncReplicasOptimizerHook` validates the
-  # `SyncReplicasOptimizer.apply_gradients` call. `TowerEstimator` makes that
-  # call only in the last tower.
-  estimator_spec = _asdict(tower_specs[-1])
-  estimator_spec['mode'] = model_fn_lib.ModeKeys.TRAIN
-  estimator_spec['train_op'] = train_op
-  estimator_spec['loss'] = _compute_sum_on_device(
-      [spec.loss for spec in tower_specs], aggregation_device,
-      aggregated_loss_name)
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
-  """Populate replicated EstimatorSpec for `GraphKeys.EVAL`."""
-  estimator_spec = _asdict(tower_specs[0])
-  estimator_spec['mode'] = model_fn_lib.ModeKeys.EVAL
-  estimator_spec['loss'] = _compute_sum_on_device(
-      [spec.loss for spec in tower_specs], aggregation_device,
-      aggregated_loss_name)
-
-  update_ops = []
-  for tower_spec in tower_specs:
-    for name, (_, update_op) in six.iteritems(tower_spec.eval_metric_ops):
-      update_ops.append(update_op)
-
-  with ops_lib.control_dependencies(update_ops):
-    reduced_update_op = _reduce_metric_variables(len(tower_specs))
-
-  eval_metric_ops = {}
-  for name, (metric_tensor, _) in six.iteritems(tower_specs[0].eval_metric_ops):
-    eval_metric_ops[name] = (metric_tensor, reduced_update_op)
-  estimator_spec['eval_metric_ops'] = eval_metric_ops
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _reduce_metric_variables(number_of_towers):
-  """Aggregate local variables used in metrics into the first tower."""
-  if number_of_towers == 1:
-    return control_flow_ops.no_op(name='no_eval_metric_reduction')
-
-  metric_variables = ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)
-  variables_per_tower = len(metric_variables) // number_of_towers
-
-  if len(metric_variables) % number_of_towers != 0:
-    raise ValueError(
-        'Different `EstimatorSpec.eval_metric_ops` across `model_fn()` calls.'
-        ' Expected {} local variables, but got {} instead.'.format(
-            variables_per_tower * number_of_towers, len(metric_variables)))
-
-  # `metric_variables` has the size of `variables_per_tower` x
-  #  number_of_towers.  Each tower is produced by calling the same model_fn.
-  #  First `variables_per_tower` correspond to the first tower.  Each such
-  #  variable has an replica at the `(variables_per_tower * i)` position, where
-  #  `i` is `[1.. number_of_towers]`.  We are going to add values from replicas
-  #  to each variable of the first tower.  We then zero out replica values, so
-  #  that `_reduce_metric_variables` operation is idempotent.  If a metric
-  #  is then computed based on local variables from the first tower, then the
-  #  resulting metric is an estimate for all `number_of_towers` towers.
-  ops = []
-  for i in range(0, variables_per_tower):
-    next_replica_id = i + variables_per_tower
-    replicas = [
-        metric_variables[replica_id]
-        for replica_id in range(next_replica_id, len(metric_variables),
-                                variables_per_tower)
-    ]  #  `replicas` doesn't contain the first-tower variable.
-
-    reduce_op = state_ops.assign_add(metric_variables[i],
-                                     math_ops.add_n(replicas))
-
-    with ops_lib.control_dependencies([reduce_op]):
-      for replica in replicas:
-        zeros_for_replica = array_ops.zeros(
-            array_ops.shape(replica), dtype=replica.dtype)
-        zero_out_replica_op = state_ops.assign(replica, zeros_for_replica)
-        ops.append(zero_out_replica_op)
-
-  return control_flow_ops.group(*ops)
-
-
-def _predict_spec(tower_specs, aggregation_device):
-  """Populate replicated EstimatorSpec for `GraphKeys.PREDICT`."""
-  estimator_spec = _asdict(tower_specs[0])
-  estimator_spec['mode'] = model_fn_lib.ModeKeys.PREDICT
-
-  with ops_lib.device(aggregation_device):
-    estimator_spec['predictions'] = _concat_tensor_dicts(
-        *[tower_spec.predictions for tower_spec in tower_specs])
-
-    export_outputs_dict = _dict_concat(
-        *[tower_spec.export_outputs for tower_spec in tower_specs])
-
-    export_outputs = {}
-    for name, export_output_list in six.iteritems(export_outputs_dict):
-      if isinstance(export_output_list[0], export_output_lib.PredictOutput):
-        export_outputs[name] = export_output_lib.PredictOutput(
-            outputs=_concat_tensor_dicts(*[
-                export_output.outputs for export_output in export_output_list
-            ]))
-      elif isinstance(export_output_list[0],
-                      export_output_lib.RegressionOutput):
-        export_outputs[name] = export_output_lib.RegressionOutput(
-            value=array_ops.concat(
-                [export_output.value for export_output in export_output_list],
-                axis=0))
-      elif isinstance(export_output_list[0],
-                      export_output_lib.ClassificationOutput):
-        scores = None
-        if export_output_list[0].scores is not None:
-          scores = array_ops.concat(
-              [export_output.scores for export_output in export_output_list],
-              axis=0)
-
-        classes = None
-        if export_output_list[0].classes is not None:
-          classes = array_ops.stack(
-              [export_output.classes for export_output in export_output_list],
-              axis=0)
-
-        export_outputs[name] = export_output_lib.ClassificationOutput(
-            scores=scores, classes=classes)
-
-  estimator_spec['export_outputs'] = export_outputs
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _concat_tensor_dicts(*tensor_dicts):
-  return {
-      name: array_ops.concat(tensors, axis=0, name=name)
-      for name, tensors in six.iteritems(_dict_concat(*tensor_dicts))
-  }
-
-
-def _extract_tensors(tensors_and_vars):
-  tensors = []
-  for tensor_and_var in tensors_and_vars:
-    tensor, _ = tensor_and_var
-    if isinstance(tensor, ops_lib.IndexedSlices):
-      tensors.append(tensor.values)
-    elif tensor is not None:
-      tensors.append(tensor)
-  return tensors
-
-
-def _dict_concat(*dicts):
-  list_dict = {}
-  for d in dicts:
-    if d is None:
-      continue
-
-    for k, v in six.iteritems(d):
-      list_dict.setdefault(k, []).append(v)
-  return list_dict
-
-
-def _asdict(namedtuple):
-  """Returns a namedtuple as a dictionary.
-
-  This is required because `_asdict()` in Python 3.x.x is broken in classes
-  that inherit from `collections.namedtuple`. See
-  https://bugs.python.org/issue24931 for more details.
-
-  Args:
-    namedtuple: An object that inherits from `collections.namedtuple`.
-
-  Returns:
-    A dictionary version of the tuple.
-  """
-  return {k: getattr(namedtuple, k) for k in namedtuple._fields}
diff --git a/tensorflow/python/estimator/replicate_model_fn_test.py b/tensorflow/python/estimator/replicate_model_fn_test.py
deleted file mode 100644
index ad1f9c02b9..0000000000
--- a/tensorflow/python/estimator/replicate_model_fn_test.py
+++ /dev/null
@@ -1,1739 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for utilities that replicate `Estimator.model_fn` over GPUs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-import shutil
-import tempfile
-import numpy as np
-import six
-
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import replicate_model_fn
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import losses
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import adam
-from tensorflow.python.training import device_setter
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import training
-
-
-# TODO(isaprykin):  Parametrize all the tests on
-#   replicate_model_fn._VariableDistributionMode when it's supported.
-class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def test_complete_flow_with_public_version(self):
-    return self._complete_flow_with_mode(mode=None)
-
-  def test_complete_flow_with_mode_local_ps_server(self):
-    return self._complete_flow_with_mode(
-        replicate_model_fn._VariableDistributionMode.
-        SHARED_LOCAL_PARAMETER_SERVER)
-
-  def test_complete_flow_with_mode_round_robin(self):
-    return self._complete_flow_with_mode(
-        replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN)
-
-  def _complete_flow_with_mode(self, mode):
-    n_classes = 3
-    input_dimension = 2
-    batch_size = 12
-
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    categorical_data = np.random.random_integers(
-        0, len(x_data), size=len(x_data))
-    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data,
-           'categories': categorical_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data,
-           'categories': categorical_data},
-        y=y_data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data,
-           'categories': categorical_data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,)),
-        feature_column.embedding_column(
-            feature_column.categorical_column_with_vocabulary_list(
-                'categories',
-                vocabulary_list=np.linspace(
-                    0., len(x_data), len(x_data), dtype=np.int64)), 1)
-    ]
-
-    def optimizer_fn():
-      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
-
-    estimator = dnn.DNNClassifier(
-        hidden_units=(2, 2),
-        # Adagrad is configured with `get_optimizer_instance`, so the function
-        # form of `TowerOptimizer.__init__` is used.
-        optimizer=replicate_model_fn._TowerOptimizer(
-            optimizer_fn, loss_reduction=losses.Reduction.SUM),
-        feature_columns=feature_columns,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    if not mode:  # Use the public `replicate_model_fn`.
-      model_fn = replicate_model_fn._replicate_model_fn(
-          estimator.model_fn, devices=['/gpu:0', '/gpu:1', '/gpu:2'])
-    else:
-      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
-          estimator.model_fn,
-          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
-          mode=mode)
-
-    estimator = estimator_lib.Estimator(
-        model_fn=model_fn,
-        model_dir=estimator.model_dir,
-        config=estimator.config,
-        params=estimator.params)
-
-    num_steps = 10
-    estimator.train(train_input_fn, steps=num_steps)
-
-    scores = estimator.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in estimator.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
-                                             serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-    # Nothing should be left in the graph so that it doesn't get serialized.
-    self.assertFalse(ops_lib.get_default_graph().get_collection_ref(
-        replicate_model_fn._TowerOptimizer.COLLECTION_FOR_GRAPH_STATES))
-
-  def _as_label(self, data_in_float):
-    return np.rint(data_in_float).astype(np.int64)
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-
-class ReplicateModelTest(test_util.TensorFlowTestCase):
-
-  def create_model_fn_with_loss_reduction(self, loss_reduction):
-
-    def model_fn(mode, features, labels, params):
-      c = variable_scope.get_variable(
-          'c',
-          initializer=constant_op.constant(10, dtype=dtypes.float64),
-          dtype=dtypes.float64)
-
-      predictions = math_ops.multiply(features, c)
-
-      loss = losses.absolute_difference(
-          labels=labels,
-          predictions=predictions,
-          reduction=losses.Reduction.SUM)
-      loss = math_ops.reduce_sum(loss)
-
-      metrics = {
-          'accuracy': metrics_lib.accuracy(labels, predictions),
-          'auc': metrics_lib.auc(labels, predictions)
-      }
-
-      optimizer = replicate_model_fn._TowerOptimizer(
-          gradient_descent.GradientDescentOptimizer(params['learning_rate']),
-          loss_reduction=loss_reduction)
-
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metric_ops=metrics,
-          predictions={'probabilities': predictions},
-          train_op=optimizer.minimize(loss))
-
-    return model_fn
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # derivative of loss = (1*c - 1) + (2*c - 2) is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(7.0, session.run(c))
-
-  def test_train_with_mean_reduction(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session() as session:
-      # Add another trainable variable that doesn't produce a gradient to
-      # verify that None gradients are supported.
-      _ = variable_scope.get_variable(
-          'another_variable',
-          initializer=constant_op.constant(1, dtype=dtypes.float64),
-          dtype=dtypes.float64)
-
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.MEAN),
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)) / 2.0
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # derivative of loss = (1*c - 1)/2 + (2*c - 2)/2 is 1.5.
-      # It's the same computation as without mean reduction, but the
-      # loss from every tower is scaled by 1/<number of towers>.
-      # new value of c = 10 - learning rate * 1.5 = 8.5
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(8.5, session.run(c))
-
-  def test_train_two_steps_collected_gradients_are_reset_between_steps(self):
-    with ops_lib.Graph().as_default():
-      features = array_ops.placeholder(dtypes.float64)
-      labels = array_ops.placeholder(dtypes.float64)
-
-      feature_inputs = np.array([[1.0], [2.0]]), np.array([[1.5], [2.5]])
-      label_inputs = np.array([[1.0], [2.0]]), np.array([[1.5], [2.5]])
-
-      # loss = feature * c - label
-      expected_losses = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0),
-                         (1.5 * 7.0 - 1.5) + (2.5 * 7.0 - 2.5))
-      # Derivative of the loss is 1.0 + 2.0 for the first step and 1.5 + 2.5
-      # for the second.
-      expected_c = 10.0 - 3.0, 7.0 - 4.0
-
-      with self.test_session() as session, variable_scope.variable_scope(
-          '', reuse=variable_scope.AUTO_REUSE):
-        replicated_model_fn = replicate_model_fn._replicate_model_fn(
-            self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-            devices=['/gpu:0', '/gpu:1'])
-        estimator_spec = replicated_model_fn(
-            features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-        session.run(variables.global_variables_initializer())
-
-        for feature_input, label_input, loss, weight in zip(
-            feature_inputs, label_inputs, expected_losses, expected_c):
-          feeds = {features: feature_input, labels: label_input}
-
-          self.assertEqual(loss, session.run(estimator_spec.loss, feeds))
-
-          session.run(estimator_spec.train_op, feeds)
-          c = variable_scope.get_variable('c', dtype=dtypes.float64)
-          self.assertEqual(weight, session.run(c, feeds))
-
-  def test_eval(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
-      session.run(variables.local_variables_initializer())
-      session.run(variables.global_variables_initializer())
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      # loss[i] = features[i] * 10 - labels[i].
-      # Accuracy is 0.0 (no match) in the first tower.
-      # Accuracy is 1.0 (match) in the second tower, since the feature
-      # times weight "c" happened to be equal to the label.
-      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02))
-
-      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
-
-  def test_eval_with_mean_reduction(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.MEAN),
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
-      session.run(variables.local_variables_initializer())
-      session.run(variables.global_variables_initializer())
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      # loss[i] = features[i] * 10 - labels[i].
-      # Accuracy is 0.0 (no match) in the first tower.
-      # Accuracy is 1.0 (match) in the second tower, since the feature
-      # times weight "c" happened to be equal to the label.
-      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02)) / 2.0
-
-      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
-
-  def test_predict(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
-      session.run(variables.global_variables_initializer())
-
-      self.assertAllClose({
-          'probabilities': np.array([[0.1], [0.02]])
-      }, session.run(estimator_spec.predictions))
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # loss' of c is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(7.0, session.run(c))
-
-  def test_eval_single_tower(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
-      session.run(variables.local_variables_initializer())
-      session.run(variables.global_variables_initializer())
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      # Accuracy is 0.0 (no match) in the first tower.
-      # Accuracy is 1.0 (match) in the second tower, since the feature
-      # times weight "c" happened to be equal to the label.
-      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02))
-
-      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
-
-  def test_predict_single_tower(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
-      session.run(variables.global_variables_initializer())
-
-      self.assertAllClose({
-          'probabilities': np.array([[0.1], [0.02]])
-      }, session.run(estimator_spec.predictions))
-
-  def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self):
-    features = np.array([[1.0], [2.0], [3.0]])
-    labels = np.array([[1.0], [2.0], [3.0]])
-
-    with self.assertRaisesRegexp(
-        ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'):
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0', '/gpu:1'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-  def test_unsupported_loss_reduction(self):
-    features = np.array([[1.0], [2.0], [3.0]])
-    labels = np.array([[1.0], [2.0], [3.0]])
-
-    with self.assertRaisesRegexp(ValueError,
-                                 '.+none.+reduction.+is.+specified.+'):
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.NONE),
-          devices=['/gpu:0', '/gpu:1', '/gpu:2'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-  def test_places_on_gpu_with_upper_case_spelling(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session():
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/GPU:0'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', c.device)
-
-  def test_places_on_gpu_with_lower_case_spelling(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.test_session():
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          devices=['/gpu:0'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', c.device)
-
-
-class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
-    test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = gradient_descent.GradientDescentOptimizer(
-        params['learning_rate'])
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.model_fn, devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # loss' of c is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(7.0, session.run(c))
-
-
-class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    features = features['features']
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = replicate_model_fn._TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(params['learning_rate']))
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'features': features}, y=labels, batch_size=2, shuffle=False)
-
-    with self.test_session():
-      estimator = estimator_lib.Estimator(
-          model_fn=self.model_fn,
-          model_dir=tempfile.mkdtemp(),
-          params=self.params)
-      estimator.train(train_input_fn, steps=1)
-
-      self.assertEqual(7.0, estimator.get_variable_value('c'))
-
-
-class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    features = features['features']
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = gradient_descent.GradientDescentOptimizer(
-        params['learning_rate'])
-    optimizer = training.SyncReplicasOptimizer(
-        optimizer, replicas_to_aggregate=1)
-    sync_hook = optimizer.make_session_run_hook(True)
-    optimizer = replicate_model_fn._TowerOptimizer(
-        optimizer, loss_reduction=losses.Reduction.SUM)
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        training_hooks=[sync_hook],
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(
-            loss, global_step=training.get_global_step()))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_multiple_towers(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'features': features}, y=labels, batch_size=2, shuffle=False)
-
-    model_fn = replicate_model_fn._replicate_model_fn(
-        self.model_fn,
-        devices=['/gpu:0', '/gpu:1'])
-
-    estimator = estimator_lib.Estimator(
-        model_fn=model_fn, model_dir=tempfile.mkdtemp(), params=self.params)
-    estimator.train(train_input_fn, steps=1)
-
-    self.assertEqual(7.0, estimator.get_variable_value('c'))
-
-
-class ReplicateWithTwoOptimizersTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    side_effects = variable_scope.get_variable(
-        'side_effects',
-        initializer=constant_op.constant(0, dtype=dtypes.float64),
-        dtype=dtypes.float64,
-        use_resource=True,
-        trainable=False)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    first_optimizer = replicate_model_fn._TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(1.0),
-        loss_reduction=losses.Reduction.SUM)
-    second_optimizer = replicate_model_fn._TowerOptimizer(
-        adam.AdamOptimizer(1.0), loss_reduction=losses.Reduction.SUM)
-
-    with ops_lib.control_dependencies([side_effects.assign_add(1.0)]):
-      first_grads_and_vars = first_optimizer.compute_gradients(loss)
-
-    train_op = control_flow_ops.group(
-        [first_optimizer.apply_gradients(first_grads_and_vars),
-         second_optimizer.minimize(loss)])
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=train_op)
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.model_fn,
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(features, labels,
-                                           model_fn_lib.ModeKeys.TRAIN, {})
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # loss' of c is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      # Adam subtracts another ~1.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertNear(6.0, session.run(c), 0.000001)
-
-        side_effects = variable_scope.get_variable(
-            'side_effects', dtype=dtypes.float64)
-        self.assertNear(2.0, session.run(side_effects), 0.000001)
-
-
-class ReplicateWithTwoLossesAndOneOptimizer(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._should_skip_optimizer = False
-    self._towers_left_before_skipping_optimizer = -1
-
-  def incorrectly_skip_optimizer_for_tower(self, tower_number):
-    self._should_skip_optimizer = True
-    self._towers_left_before_skipping_optimizer = tower_number
-
-  def should_skip_optimizer(self):
-    if not self._should_skip_optimizer:
-      return False
-    if self._towers_left_before_skipping_optimizer == 0:
-      return True
-    else:
-      self._towers_left_before_skipping_optimizer -= 1
-      return False
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-    d = variable_scope.get_variable(
-        'd',
-        initializer=constant_op.constant(2, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    another_predictions = math_ops.multiply(features, d)
-    another_loss = losses.absolute_difference(
-        labels=labels,
-        predictions=another_predictions,
-        reduction=losses.Reduction.SUM)
-    another_loss = math_ops.reduce_sum(another_loss)
-
-    total_loss = math_ops.add(loss, another_loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    train_ops = []
-
-    optimizer = replicate_model_fn._TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(1.0),
-        loss_reduction=losses.Reduction.SUM)
-    train_ops.append(optimizer.minimize(loss, var_list=[c]))
-    if not self.should_skip_optimizer():
-      another_optimizer = replicate_model_fn._TowerOptimizer(
-          gradient_descent.GradientDescentOptimizer(1.0),
-          loss_reduction=losses.Reduction.SUM)
-      train_ops.append(another_optimizer.minimize(another_loss, var_list=[d]))
-
-    train_op = control_flow_ops.group(train_ops)
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=total_loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=train_op)
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with ops_lib.Graph().as_default(), self.test_session() as session:
-      replicated_model_fn = replicate_model_fn._replicate_model_fn(
-          self.model_fn,
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(features, labels,
-                                           model_fn_lib.ModeKeys.TRAIN, {})
-      session.run(variables.global_variables_initializer())
-
-      # For each tower, loss = (feature * c - label) + (feature * d - label).
-      total_loss = (1.0 * 10 - 1.0 + 1.0 * 2.0 - 1.0) + (
-          2.0 * 10 - 2.0 + 2.0 * 2.0 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      session.run(estimator_spec.train_op)
-
-      # loss' of c or loss' of d is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      # new value of d = 2  - learning rate * 3 = -1.0.
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertNear(7.0, session.run(c), 0.000001)
-        d = variable_scope.get_variable('d', dtype=dtypes.float64)
-        self.assertNear(-1.0, session.run(d), 0.000001)
-
-  def test_different_optimizer_calls_within_towers(self):
-    self.incorrectly_skip_optimizer_for_tower(1)
-
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session(), ops_lib.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError, '.+was.+supposed.+to.+make.+same.+optimizer.+calls.+'):
-        replicated_model_fn = replicate_model_fn._replicate_model_fn(
-            self.model_fn, devices=['/gpu:0', '/gpu:1'])
-        _ = replicated_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN,
-                                {})
-
-
-class FailToWrapOptimizerInTheModelFn(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = gradient_descent.GradientDescentOptimizer(1.0)
-    train_op = optimizer.minimize(loss)
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=train_op)
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.test_session():
-      with self.assertRaisesRegexp(ValueError,
-                                   'Please.+wrap.+with.+TowerOptimizer'):
-        replicated_model_fn = replicate_model_fn._replicate_model_fn(
-            self.model_fn, devices=['/gpu:0', '/gpu:1'])
-        _ = replicated_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN,
-                                {})
-
-
-class GetLossTowersTest(test_util.TensorFlowTestCase):
-
-  def create_model_fn_with_loss_reduction(self, loss_reduction):
-
-    def model_fn(mode, features, labels, params):
-      del params
-      c = variable_scope.get_variable(
-          'c',
-          initializer=constant_op.constant(0.25, dtype=dtypes.float64),
-          dtype=dtypes.float64)
-
-      predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
-      labels = np.array([0.1, 0.2, 0.3, labels[0]])
-
-      loss = losses.absolute_difference(
-          labels=labels,
-          predictions=predictions,
-          reduction=losses.Reduction.SUM)
-
-      optimizer = replicate_model_fn._TowerOptimizer(
-          gradient_descent.GradientDescentOptimizer(1.0),
-          loss_reduction)
-
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=math_ops.reduce_sum(loss),
-          train_op=optimizer.minimize(loss))
-
-    return model_fn
-
-  def test_gradients_are_computed(self):
-    with self.test_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.SUM),
-          mode=None,
-          features=[[0.6], [1.6]],
-          labels=[[0.6], [0.6]],
-          params=None,
-          config=None,
-          devices=['/gpu:0', '/gpu:1'],
-          local_ps_devices=['/gpu:0'],
-          name_scope_pattern='test_tower_{}')
-      session.run(variables.global_variables_initializer())
-
-      self.assertEqual(len(tower_specs), 2)
-
-      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
-      self.assertEqual('Sum:0', tower_specs[0].loss.name)
-      self.assertEqual(1.0, session.run(tower_specs[0].loss))
-
-      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
-      self.assertEqual('test_tower_1/Sum:0', tower_specs[1].loss.name)
-      # The input batch for the second tower had a loss that is 1.0
-      # bigger: 0.6 vs 1.6.
-      self.assertEqual(2.0, session.run(tower_specs[1].loss))
-
-      self.assertEqual(1, len(variables.global_variables()))
-      self.assertEqual(1, len(variables.trainable_variables()))
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(0.25, session.run(c))
-
-  def test_gradients_are_computed_with_mean_reduction(self):
-    with self.test_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          self.create_model_fn_with_loss_reduction(losses.Reduction.MEAN),
-          mode=model_fn_lib.ModeKeys.EVAL,
-          features=[[0.6], [1.6]],
-          labels=[[0.6], [0.6]],
-          params=None,
-          config=None,
-          devices=['/gpu:0', '/gpu:1'],
-          local_ps_devices=['/gpu:0'],
-          name_scope_pattern='test_tower_{}')
-      session.run(variables.global_variables_initializer())
-
-      self.assertEqual(len(tower_specs), 2)
-
-      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
-      self.assertEqual('averaged_loss:0', tower_specs[0].loss.name)
-      self.assertEqual(0.5, session.run(tower_specs[0].loss))
-
-      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
-      self.assertEqual('test_tower_1/averaged_loss:0', tower_specs[1].loss.name)
-      # The input batch for the second tower had a loss that is 1.0
-      # bigger: 0.6 vs 1.6.
-      self.assertEqual(1.0, session.run(tower_specs[1].loss))
-
-      self.assertEqual(1, len(variables.global_variables()))
-      self.assertEqual(1, len(variables.trainable_variables()))
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(0.25, session.run(c))
-
-  def test_variables_are_round_robined_correctly(self):
-    """Test that creates multiple variables and tests round-robin placement."""
-
-    def model_fn(mode, features, labels, params):
-      del params
-      for variable_name in ['a', 'b', 'c', 'd']:
-        c = variable_scope.get_variable(
-            variable_name,
-            initializer=constant_op.constant(0.25, dtype=dtypes.float64),
-            dtype=dtypes.float64)
-
-      predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
-      labels = np.array([0.1, 0.2, 0.3, labels[0]])
-      loss = losses.absolute_difference(
-          labels=labels,
-          predictions=predictions,
-          reduction=losses.Reduction.SUM)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode, loss=math_ops.reduce_sum(loss))
-
-    with self.test_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          model_fn,
-          mode=None,
-          features=[[0.6], [1.6], [2.6]],
-          labels=[[0.6], [0.6], [2.6]],
-          params=None,
-          config=None,
-          devices=['/gpu:0', '/gpu:1', '/gpu:3'],
-          local_ps_devices=['/gpu:0', '/gpu:1', '/gpu:3'],
-          name_scope_pattern='test_tower_{}')
-      session.run(variables.global_variables_initializer())
-
-      self.assertEqual(len(tower_specs), 3)
-      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
-      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
-      self.assertEqual('/device:GPU:3', tower_specs[2].loss.device)
-
-      with variable_scope.variable_scope('', reuse=True):
-        a = variable_scope.get_variable('a', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', a.device)
-        b = variable_scope.get_variable('b', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:1', b.device)
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:3', c.device)
-        d = variable_scope.get_variable('d', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', d.device)
-
-
-class SplitBatchTest(test_util.TensorFlowTestCase):
-
-  def evaluate_shards(self, first_list, second_list):
-    evaluate_items = lambda x: x.eval()
-    return list(map(evaluate_items, first_list)), list(
-        map(evaluate_items, second_list))
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def test_simple_half_split(self):
-    with self.test_session():
-      features = [0.0, 1.0, 2.0, 3.0]
-      labels = [10.0, 11.0, 12.0, 13.0]
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      feature_shards, label_shards = self.evaluate_shards(
-          feature_shards, label_shards)
-
-      self.assertAllEqual([[0.0, 1.0], [2.0, 3.0]], feature_shards)
-      self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
-
-  def test_to_each_their_own(self):
-    with self.test_session():
-      features = [0.0, 1.0, 2.0, 3.0]
-      labels = [10.0, 11.0, 12.0, 13.0]
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 4, device='/gpu:0')
-
-      feature_shards, label_shards = self.evaluate_shards(
-          feature_shards, label_shards)
-
-      self.assertAllEqual([[0.0], [1.0], [2.0], [3.0]], feature_shards)
-      self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
-
-  def test_one_batch(self):
-    with self.test_session():
-      features = [0.0, 1.0, 2.0, 3.0]
-      labels = [10.0, 11.0, 12.0, 13.0]
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 1, device='/gpu:0')
-
-      feature_shards, label_shards = self.evaluate_shards(
-          feature_shards, label_shards)
-
-      self.assertAllEqual([[0.0, 1.0, 2.0, 3.0]], feature_shards)
-      self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
-
-  def test_half_split_in_dictionary(self):
-    with self.test_session():
-      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-      labels = [10.0, 11.0, 12.0, 13.0]
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertAllEqual([0.0, 1.0], feature_shards[0]['first'].eval())
-      self.assertAllEqual([4.0, 5.0], feature_shards[0]['second'].eval())
-      self.assertAllEqual([2.0, 3.0], feature_shards[1]['first'].eval())
-      self.assertAllEqual([6.0, 7.0], feature_shards[1]['second'].eval())
-      self.assertAllEqual([10.0, 11.0], label_shards[0].eval())
-      self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
-
-  def test_sparse_tensor_can_be_split_unevenly(self):
-    with self.test_session():
-      features = {
-          'x':
-              sparse_tensor.SparseTensor(
-                  indices=[[0, 0], [1, 2], [2, 2]],
-                  values=[1.0, 2.0, 3.0],
-                  dense_shape=[3, 4])
-      }
-      labels = np.array([[1.0], [2.0]])
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertSparseValuesEqual(
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]),
-          feature_shards[0]['x'].eval())
-      self.assertSparseValuesEqual(
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 2]], values=[3.], dense_shape=[1, 4]),
-          feature_shards[1]['x'].eval())
-      self.assertAllEqual([[1.0]], label_shards[0].eval())
-      self.assertAllEqual([[2.0]], label_shards[1].eval())
-
-  def test_sparse_tensor_can_be_split_unevenly_repeated_row(self):
-    with self.test_session():
-      features = {
-          'x':
-              sparse_tensor.SparseTensor(
-                  indices=[[0, 0], [1, 0], [1, 1]],
-                  values=[1.0, 2.0, 3.0],
-                  dense_shape=[3, 4])
-      }
-      labels = np.array([[1.0], [2.0]])
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertSparseValuesEqual(
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [1, 0], [1, 1]],
-              values=[1., 2., 3.],
-              dense_shape=[2, 4]), feature_shards[0]['x'].eval())
-
-      second_batch = feature_shards[1]['x'].eval()
-      self.assertFalse(len(second_batch.indices))
-      self.assertFalse(len(second_batch.values))
-      self.assertAllEqual([1, 4], second_batch.dense_shape)
-      self.assertAllEqual([[1.0]], label_shards[0].eval())
-      self.assertAllEqual([[2.0]], label_shards[1].eval())
-
-  def test_one_batch_in_dictionary(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
-      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-      labels = [10.0, 11.0, 12.0, 13.0]
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 1, device='/gpu:0')
-
-      self.assertAllEqual([0.0, 1.0, 2.0, 3.0],
-                          feature_shards[0]['first'].eval())
-      self.assertAllEqual([4.0, 5.0, 6.0, 7.0],
-                          feature_shards[0]['second'].eval())
-      self.assertAllEqual([10.0, 11.0, 12.0, 13.0], label_shards[0].eval())
-
-  def test_feature_and_label_dictionaries(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
-      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-      labels = {'first': [10.0, 11.0], 'second': [12.0, 13.0]}
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertAllEqual([0.0, 1.0], feature_shards[0]['first'].eval())
-      self.assertAllEqual([4.0, 5.0], feature_shards[0]['second'].eval())
-      self.assertAllEqual([2.0, 3.0], feature_shards[1]['first'].eval())
-      self.assertAllEqual([6.0, 7.0], feature_shards[1]['second'].eval())
-      self.assertAllEqual([10.0], label_shards[0]['first'].eval())
-      self.assertAllEqual([12.0], label_shards[0]['second'].eval())
-      self.assertAllEqual([11], label_shards[1]['first'].eval())
-      self.assertAllEqual([13.0], label_shards[1]['second'].eval())
-
-
-class TrainSpecTest(test_util.TensorFlowTestCase):
-
-  expected_predictions = {}
-
-  def create_estimator_spec(self, loss):
-    return model_fn_lib.EstimatorSpec(
-        mode=model_fn_lib.ModeKeys.TRAIN,
-        loss=loss,
-        train_op=loss,  # Not used; currently required.
-        predictions=self.expected_predictions)
-
-  def create_constant_loss(self, loss_value):
-    return constant_op.constant(loss_value, dtype=dtypes.float64)
-
-  def test_example(self):
-    with self.test_session() as session:
-      tower_losses = list(map(self.create_constant_loss, [2, 4, 6]))
-      tower_specs = list(map(self.create_estimator_spec, tower_losses))
-
-      expected_train_op = tower_losses[1]
-
-      estimator_spec = replicate_model_fn._train_spec(
-          tower_specs, expected_train_op, aggregation_device='/gpu:0')
-
-      self.assertEqual(expected_train_op, estimator_spec.train_op)
-      self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
-      self.assertEqual(self.expected_predictions, estimator_spec.predictions)
-
-
-class EvalSpecTest(test_util.TensorFlowTestCase):
-
-  def create_estimator_spec(self, loss, metrics):
-    return model_fn_lib.EstimatorSpec(
-        mode=model_fn_lib.ModeKeys.EVAL, loss=loss, eval_metric_ops=metrics)
-
-  def create_constant_loss(self, loss_value):
-    return constant_op.constant(loss_value, dtype=dtypes.float64)
-
-  def create_eval_metrics(self, noise):
-    predictions = np.array([0.1, 0.2, 0.3, 0.6 + noise])
-    labels = np.array([0.1, 0.2, 0.3, 0.6])
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-    return metrics
-
-  def test_example(self):
-    with self.test_session() as session:
-      tower_losses = map(self.create_constant_loss, [2, 4, 6])
-      tower_metrics = map(self.create_eval_metrics, [0, 0.2, 0.3])
-      tower_specs = [
-          self.create_estimator_spec(l, m)
-          for l, m in zip(tower_losses, tower_metrics)
-      ]
-      session.run(variables.local_variables_initializer())
-
-      estimator_spec = replicate_model_fn._eval_spec(
-          tower_specs, aggregation_device='/device:GPU:0')
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      self.assertEqual('/device:CPU:0', accuracy.device)
-      self.assertEqual('/device:CPU:0', auc.device)
-
-      session.run([a, b])
-      accuracy, auc = session.run([accuracy, auc])
-
-      self.assertNear((12 - 2) / 12, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
-
-  def test_handles_single_tower(self):
-    with self.test_session() as session:
-      tower_losses = map(self.create_constant_loss, [5])
-      tower_metrics = map(self.create_eval_metrics, [0.2])
-      tower_specs = [
-          self.create_estimator_spec(l, m)
-          for l, m in zip(tower_losses, tower_metrics)
-      ]
-      session.run(variables.local_variables_initializer())
-
-      estimator_spec = replicate_model_fn._eval_spec(
-          tower_specs, aggregation_device='/device:GPU:0')
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      self.assertEqual('/device:CPU:0', accuracy.device)
-      self.assertEqual('/device:CPU:0', auc.device)
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      self.assertNear((4 - 1) / 4, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertEqual(5, session.run(estimator_spec.loss))
-
-
-class PredictSpecTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(0.25, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.add(np.array([features[0], features[0]]), c)
-
-    return model_fn_lib.EstimatorSpec(
-        mode=model_fn_lib.ModeKeys.PREDICT,
-        predictions={
-            'probabilities': predictions
-        })
-
-  def test_example(self):
-    with self.test_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          self.model_fn,
-          mode=None,
-          features=[[0.1], [0.2]],
-          labels=[[], []],
-          params=None,
-          config=None,
-          devices=['/gpu:0', '/gpu:1'],
-          local_ps_devices=['/gpu:0'],
-      )
-      session.run(variables.global_variables_initializer())
-
-      estimator_spec = replicate_model_fn._predict_spec(
-          tower_specs, aggregation_device='/gpu:0')
-
-      self.assertEqual('/device:GPU:0',
-                       estimator_spec.predictions['probabilities'].device)
-      self.assertAllClose({
-          'probabilities': np.array([0.35, 0.35, 0.45, 0.45])
-      }, session.run(estimator_spec.predictions))
-
-
-class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
-
-  def create_metric_variable(self, initial_value, name):
-    return variable_scope.variable(
-        initial_value,
-        trainable=False,
-        collections=[ops_lib.GraphKeys.METRIC_VARIABLES],
-        validate_shape=True,
-        name=name)
-
-  def create_tower_metrics(self, tower_id):
-    with variable_scope.variable_scope('', reuse=(tower_id != 0)):
-      self.create_metric_variable(1.3 * (tower_id + 1), 'total')
-      self.create_metric_variable(2.3 * (tower_id + 1), 'count')
-      self.create_metric_variable(
-          np.array([3.3, 3.5, 3.7]) * (tower_id + 1), 'total')
-
-  def test_example(self):
-    with self.test_session() as session:
-      for tower_id in range(3):
-        self.create_tower_metrics(tower_id)
-
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      session.run(
-          replicate_model_fn._reduce_metric_variables(number_of_towers=3))
-
-      # 1st tower = 1.3, 2.3,  [3.3, 3.5, 3.7]
-      # 2nd tower = 2.6, 4.6,  [6.6, 7.0, 7.4]
-      # 3rd tower = 3.9, 6.9,  [9.9, 10.5, 11.1]
-      # Reduced =   7.8, 13.8, [19.8, 21.0, 22.2]
-      # Towers are accumulated in the first tower.
-      local_metrics = session.run(
-          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
-
-      self.assertNear(7.8, local_metrics[0], 0.01)
-      self.assertNear(13.8, local_metrics[1], 0.01)
-      self.assertAllClose([19.8, 21., 22.1], local_metrics[2], 0.01)
-      self.assertNear(0.0, local_metrics[3], 0.01)
-      self.assertNear(0.0, local_metrics[4], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[5], 0.01)
-      self.assertNear(0.0, local_metrics[6], 0.01)
-      self.assertNear(0.0, local_metrics[7], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
-
-  def test_reduce_is_idempotent(self):
-    with self.test_session() as session:
-      for tower_id in range(3):
-        self.create_tower_metrics(tower_id)
-
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      for _ in range(20):
-        session.run(
-            replicate_model_fn._reduce_metric_variables(number_of_towers=3))
-
-      local_metrics = session.run(
-          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
-
-      self.assertNear(7.8, local_metrics[0], 0.01)
-      self.assertNear(13.8, local_metrics[1], 0.01)
-      self.assertAllClose([19.8, 21., 22.1], local_metrics[2], 0.01)
-      self.assertNear(0.0, local_metrics[3], 0.01)
-      self.assertNear(0.0, local_metrics[4], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[5], 0.01)
-      self.assertNear(0.0, local_metrics[6], 0.01)
-      self.assertNear(0.0, local_metrics[7], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
-
-  def test_handles_single_tower(self):
-    with self.test_session() as session:
-      self.create_tower_metrics(0)
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      session.run(
-          replicate_model_fn._reduce_metric_variables(number_of_towers=1))
-
-      local_metrics = session.run(
-          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
-
-      self.assertNear(1.3, local_metrics[0], 0.01)
-      self.assertNear(2.3, local_metrics[1], 0.01)
-      self.assertAllClose([3.3, 3.5, 3.7], local_metrics[2], 0.01)
-
-  def test_doesnt_accept_uneven_number_of_variables(self):
-    with self.test_session() as session:
-      for tower_id in range(3):
-        self.create_tower_metrics(tower_id)
-      self.create_metric_variable(-1.0, 'oddball')
-
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      with self.assertRaisesRegexp(
-          ValueError, '.+Expected.+local.+variables.+but.+got.+instead.+'):
-        session.run(
-            replicate_model_fn._reduce_metric_variables(number_of_towers=3))
-
-
-class MergeExportOutputsTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = {'probabilities': math_ops.multiply(features, c)}
-    loss = losses.absolute_difference(
-        labels=labels,
-        predictions=predictions['probabilities'],
-        reduction=losses.Reduction.SUM)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions['probabilities']),
-        'auc': metrics_lib.auc(labels, predictions['probabilities'])
-    }
-    tensor_string_repr = str(features)
-    classes = constant_op.constant(
-        re.search('(split_inputs/split:[0-9])', tensor_string_repr).group(1),
-        dtype=dtypes.string)
-
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            export_output.PredictOutput(predictions),
-        'classification_output':
-            export_output.ClassificationOutput(predictions['probabilities'],
-                                               classes),
-        'classification_scores':
-            export_output.ClassificationOutput(
-                scores=predictions['probabilities']),
-        'classification_classes':
-            export_output.ClassificationOutput(classes=classes),
-        'regression_output':
-            export_output.RegressionOutput(predictions['probabilities']),
-    }
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=math_ops.reduce_sum(loss),
-        eval_metric_ops=metrics,
-        predictions=predictions,
-        export_outputs=export_outputs)
-
-  def replicate_estimator_spec(self, session):
-    features = np.array([0.01, 0.002])
-    labels = np.array([0.01, 0.02])
-
-    replicated_model_fn = replicate_model_fn._replicate_model_fn(
-        self.model_fn, devices=['/gpu:0', '/gpu:1'])
-    estimator_spec = replicated_model_fn(features, labels,
-                                         model_fn_lib.ModeKeys.PREDICT, {})
-    session.run(variables.global_variables_initializer())
-    return estimator_spec
-
-  def test_merge_predict_output(self):
-    with self.test_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          {
-              'probabilities': np.array([0.1, 0.02])
-          },
-          session.run(estimator_spec.export_outputs[
-              signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs))
-
-  def test_merge_classification_output_scores_classes(self):
-    with self.test_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          [0.1, 0.02],
-          session.run(
-              estimator_spec.export_outputs['classification_output'].scores))
-      self.assertAllEqual(
-          [b'split_inputs/split:0', b'split_inputs/split:1'],
-          session.run(
-              estimator_spec.export_outputs['classification_output'].classes))
-
-  def test_merge_classification_output_scores(self):
-    with self.test_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          [0.1, 0.02],
-          session.run(
-              estimator_spec.export_outputs['classification_scores'].scores))
-      self.assertEqual(
-          None, estimator_spec.export_outputs['classification_scores'].classes)
-
-  def test_merge_classification_output_classes(self):
-    with self.test_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllEqual(
-          [b'split_inputs/split:0', b'split_inputs/split:1'],
-          session.run(
-              estimator_spec.export_outputs['classification_classes'].classes))
-      self.assertEqual(
-          None, estimator_spec.export_outputs['classification_classes'].scores)
-
-  def test_merge_regression_output(self):
-    with self.test_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          [0.1, 0.02],
-          session.run(estimator_spec.export_outputs['regression_output'].value))
-
-
-class GetLocalDevicesTest(test_util.TensorFlowTestCase):
-
-  def test_there_is_at_least_a_cpu(self):
-    self.assertTrue(replicate_model_fn._get_local_devices('CPU'))
-
-  def test_there_is_no_xpu(self):
-    self.assertFalse(
-        replicate_model_fn._get_local_devices('XPU'))  # XPU doesn't exist.
-
-  def test_whether_there_is_a_gpu(self):
-    if test.is_gpu_available():
-      self.assertTrue(len(replicate_model_fn._get_local_devices('GPU')))
-
-
-class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
-
-  def test_vars_are_on_ps_but_ops_are_on_workers(self):
-    ps_devices = ['/device:GPU:3']
-    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
-
-    local_device_setter = replicate_model_fn._local_device_setter(
-        ps_devices=ps_devices,
-        ps_strategy=round_robin,
-        worker_device='/device:GPU:2')
-
-    with ops_lib.device(local_device_setter):
-      a = variables.Variable(0.01)
-      self.assertEqual('/device:GPU:3', a.device)
-
-      b = variables.Variable(0.02)
-      self.assertEqual('/device:GPU:3', b.device)
-
-      c = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:3', c.device)
-
-      a_op = array_ops.concat(a, axis=0)
-      self.assertEqual('/device:GPU:2', a_op.device)
-
-      b_op = array_ops.concat(b, axis=0)
-      self.assertEqual('/device:GPU:2', b_op.device)
-
-  def test_round_robin_placement(self):
-    ps_devices = [
-        '/device:GPU:0', '/device:GPU:1', '/device:GPU:3', '/device:GPU:4'
-    ]
-    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
-
-    local_device_setter = replicate_model_fn._local_device_setter(
-        ps_devices=ps_devices,
-        ps_strategy=round_robin,
-        worker_device='/device:GPU:2')
-
-    with ops_lib.device(local_device_setter):
-      a = variables.Variable(0.01)
-      self.assertEqual('/device:GPU:0', a.device)
-
-      b = variables.Variable(0.02)
-      self.assertEqual('/device:GPU:1', b.device)
-
-      c = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:3', c.device)
-
-      a_op = array_ops.concat(a, axis=0)
-      self.assertEqual('/device:GPU:2', a_op.device)
-
-      b_op = array_ops.concat(b, axis=0)
-      self.assertEqual('/device:GPU:2', b_op.device)
-
-      c = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:4', c.device)
-
-      d = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:0', d.device)
-
-      c_op = array_ops.concat(c, axis=0)
-      self.assertEqual('/device:GPU:2', c_op.device)
-
-
-class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
-
-  def test_vectors(self):
-    with self.test_session() as session:
-      total = replicate_model_fn._compute_sum_on_device(
-          [1.0, 2.0, 3.0, 4.0], device='/device:GPU:0', name='test_sum')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertEqual('test_sum', total.op.name)
-      self.assertEqual(10.0, session.run(total))
-
-  def test_tensors(self):
-    with self.test_session() as session:
-      total = replicate_model_fn._compute_sum_on_device(
-          [[1.0, 2.0], [3.0, 4.0]], device='/device:GPU:0', name='test_sum')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertEqual('test_sum', total.op.name)
-      self.assertAllEqual([4.0, 6.0], session.run(total))
-
-  def test_indexedslices(self):
-    with self.test_session() as session:
-      a = ops_lib.IndexedSlices(
-          constant_op.constant([1.0, 2.0]), [0, 1],
-          dense_shape=constant_op.constant([2]))
-      b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
-      total = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertAllEqual([4.0, 6.0],
-                          session.run(ops_lib.convert_to_tensor(total)))
-
-  def test_indexedslices_higher_dimensions(self):
-    with self.test_session() as session:
-      a = ops_lib.IndexedSlices(
-          constant_op.constant([[1.0, 5.0], [2.0, 6.0]]), [0, 1],
-          dense_shape=constant_op.constant([2, 4]))
-      b = ops_lib.IndexedSlices(
-          constant_op.constant([[3.0, 7.0], [4.0, 8.0]]), [0, 1])
-
-      total = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertAllEqual([[4.0, 12.0], [6.0, 14.0]],
-                          session.run(ops_lib.convert_to_tensor(total)))
-
-  def test_indexedslices_some_dont_overlap(self):
-    with self.test_session() as session:
-      a = ops_lib.IndexedSlices(
-          constant_op.constant([1.0, 2.0]), [0, 3],
-          dense_shape=constant_op.constant([4]))
-      b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
-      total = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertAllEqual([4.0, 4.0, 0.0, 2.0],
-                          session.run(ops_lib.convert_to_tensor(total)))
-
-  def test_no_name_for_indexslices(self):
-    a = ops_lib.IndexedSlices(
-        constant_op.constant([1.0, 2.0]), [0, 1],
-        dense_shape=constant_op.constant([2]))
-    b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
-    with self.assertRaisesRegexp(ValueError, '.+name.+not.+expected.+'):
-      _ = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0', name='cant_name_indexslices')
-
-
-class ConcatTensorDictsTest(test_util.TensorFlowTestCase):
-
-  def test_example(self):
-    tensor_dicts = [
-        {
-            'a': np.array([1.0, 2.0]),
-            'b': np.array([11.0]),
-            'c': np.array([21.0]),
-        },
-        {
-            'a': np.array([3.0]),
-            'b': np.array([12.0, 13.0]),
-        },
-        {
-            'b': np.array([14.0]),
-        },
-    ]
-
-    with self.test_session() as session:
-      self.assertAllClose({
-          'a': np.array([1.0, 2.0, 3.0]),
-          'b': np.array([11.0, 12.0, 13.0, 14.0]),
-          'c': np.array([21.0]),
-      }, session.run(replicate_model_fn._concat_tensor_dicts(*tensor_dicts)))
-
-
-if __name__ == '__main__':
-  test.main()
-- 
GitLab


From 0d54d983a079f1d6541da91ac0dfcbbd4959eba4 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Mon, 16 Apr 2018 11:33:14 -0700
Subject: [PATCH 0856/1262] Internal testing changes

PiperOrigin-RevId: 193071881
---
 tensorflow/contrib/lite/kernels/BUILD | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index ac7c3f071f..8cfa7e53d1 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -825,8 +825,7 @@ tf_cc_test(
         "comparisons_test.cc",
     ],
     tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
+        "tflite_not_portable_ios",
     ],
     deps = [
         ":builtin_ops",
-- 
GitLab


From 70738af3f685531a7d9fa169f35640c0810dfd2e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 11:47:11 -0700
Subject: [PATCH 0857/1262] Refactoring: Rename the __ops module to ag__
 (double underscore prefix has special meaning in Python). Consolidate all
 internal API calls through the ag__ module.

PiperOrigin-RevId: 193074379
---
 tensorflow/contrib/autograph/__init__.py      |  1 +
 .../autograph/converters/builtin_functions.py |  8 ++-----
 .../autograph/converters/call_trees.py        |  6 +++---
 .../autograph/converters/control_flow.py      |  8 +++----
 .../converters/converter_test_base.py         |  7 ++++---
 .../contrib/autograph/converters/ifexp.py     |  2 +-
 .../contrib/autograph/converters/lists.py     |  4 ++--
 .../converters/side_effect_guards.py          |  6 +++---
 .../contrib/autograph/impl/conversion.py      | 21 +++++++++++++------
 .../contrib/autograph/impl/conversion_test.py | 10 ++++++---
 10 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index a39f44b21a..3386c4eca4 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -21,6 +21,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# TODO(mdan): Bring only the relevant symbols to the top level.
 from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.impl.api import convert
 from tensorflow.contrib.autograph.impl.api import converted_call
diff --git a/tensorflow/contrib/autograph/converters/builtin_functions.py b/tensorflow/contrib/autograph/converters/builtin_functions.py
index 0349ce29ce..cd889cb663 100644
--- a/tensorflow/contrib/autograph/converters/builtin_functions.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions.py
@@ -34,17 +34,15 @@ class BuiltinFunctionTransformer(transformer.Base):
   def __init__(self, context):
     super(BuiltinFunctionTransformer, self).__init__(context)
 
-  # pylint:disable=invalid-name
-
   def _convert_builtin(self, node):
     template = """
-      autograph_utils.dynamic_builtin(func, args)
+      ag__.utils.dynamic_builtin(func, args)
     """
     return templates.replace(template, func=node.func, args=node.args)[0].value
 
   def _convert_print(self, node):
     template = """
-      autograph_utils.dynamic_print(args)
+      ag__.utils.dynamic_print(args)
     """
     return templates.replace(template, args=node.args)[0].value
 
@@ -70,8 +68,6 @@ class BuiltinFunctionTransformer(transformer.Base):
     function_call = templates.replace(template, fname='print', args=args)[0]
     return self.visit(function_call)
 
-  # pylint:enable=invalid-name
-
 
 def transform(node, context):
   return BuiltinFunctionTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 9424966696..e390d1a262 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -198,7 +198,7 @@ class CallTreeTransformer(transformer.Base):
   def _wrap_to_py_func_no_return(self, node):
     # TODO(mdan): Properly handle varargs, etc.
     template = """
-      autograph_utils.wrap_py_func(func, None, (args,), kwargs, True)
+      ag__.utils.wrap_py_func(func, None, (args,), kwargs, True)
     """
     return templates.replace(
         template,
@@ -209,7 +209,7 @@ class CallTreeTransformer(transformer.Base):
   def _wrap_to_py_func_single_return(self, node, dtype):
     # TODO(mdan): Properly handle varargs, etc.
     template = """
-      autograph_utils.wrap_py_func(func, dtype, (args,), kwargs, False)
+      ag__.utils.wrap_py_func(func, dtype, (args,), kwargs, False)
     """
     return templates.replace_as_expression(
         template,
@@ -237,7 +237,7 @@ class CallTreeTransformer(transformer.Base):
     # Before we could convert all the time though, we'd need a reasonable
     # caching mechanism.
     template = """
-      autograph_api.converted_call(func, True, False, {}, args)
+      ag__.converted_call(func, True, False, {}, args)
     """
     call_expr = templates.replace(template, func=node.func, args=node.args)
     new_call = call_expr[0].value
diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
index 55a28e8ac3..2e26cdb3d9 100644
--- a/tensorflow/contrib/autograph/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -78,7 +78,7 @@ class ControlFlowTransformer(transformer.Base):
   def _create_cond_expr(self, results, test, body_name, orelse_name):
     if results is not None:
       template = """
-        results = autograph_utils.run_cond(test, body_name, orelse_name)
+        results = ag__.utils.run_cond(test, body_name, orelse_name)
       """
       return templates.replace(
           template,
@@ -88,7 +88,7 @@ class ControlFlowTransformer(transformer.Base):
           orelse_name=orelse_name)
     else:
       template = """
-        autograph_utils.run_cond(test, body_name, orelse_name)
+        ag__.utils.run_cond(test, body_name, orelse_name)
       """
       return templates.replace(
           template, test=test, body_name=body_name, orelse_name=orelse_name)
@@ -207,7 +207,7 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = __ops.while_loop(
+      state_ast_tuple = ag__.while_loop(
           test_name, body_name, (state,), (extra_deps,))
     """
     node = templates.replace(
@@ -264,7 +264,7 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(iterate, state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = __ops.for_loop(
+      state_ast_tuple = ag__.for_loop(
           iterated, extra_cond_name, body_name, (state,))
     """
     node = templates.replace(
diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 6f75e9a529..23b61cf781 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -76,9 +76,10 @@ class TestCase(test.TestCase):
     try:
       result, source = compiler.ast_to_object(node)
       result.tf = self.make_fake_mod('fake_tf', *symbols)
-      result.autograph_utils = utils
-      result.autograph_api = self.make_fake_mod('fake_api', converted_call)
-      result.__dict__['__ops'] = operators
+      fake_ag = self.make_fake_mod('fake_ag', converted_call)
+      fake_ag.__dict__.update(operators.__dict__)
+      fake_ag.__dict__['utils'] = utils
+      result.__dict__['ag__'] = fake_ag
       yield result
     except Exception:  # pylint:disable=broad-except
       if source is None:
diff --git a/tensorflow/contrib/autograph/converters/ifexp.py b/tensorflow/contrib/autograph/converters/ifexp.py
index bb0c0a36a7..616d222762 100644
--- a/tensorflow/contrib/autograph/converters/ifexp.py
+++ b/tensorflow/contrib/autograph/converters/ifexp.py
@@ -27,7 +27,7 @@ class IfExp(transformer.Base):
 
   def visit_IfExp(self, node):
     template = """
-        autograph_utils.run_cond(test, lambda: (body,), lambda: (orelse,))
+        ag__.utils.run_cond(test, lambda: (body,), lambda: (orelse,))
     """
     desugared_ifexp = templates.replace_as_expression(
         template, test=node.test, body=node.body, orelse=node.orelse)
diff --git a/tensorflow/contrib/autograph/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py
index 234a0a7487..6dda554acc 100644
--- a/tensorflow/contrib/autograph/converters/lists.py
+++ b/tensorflow/contrib/autograph/converters/lists.py
@@ -45,7 +45,7 @@ class ListTransformer(transformer.Base):
     if not anno.hasanno(node, 'element_type'):
       raise NotImplementedError(
           'type inference for empty lists is not yet supported; '
-          'use utils.set_element_type(<list>, <dtype>) to continue')
+          'use set_element_type(<list>, <dtype>) to continue')
     dtype = anno.getanno(node, 'element_type')
     if not isinstance(dtype, dtypes.DType):
       # TODO(mdan): Allow non-TF dtypes?
@@ -74,7 +74,7 @@ class ListTransformer(transformer.Base):
 
       if qn.qn[-1] == 'append' and (len(call_node.args) == 1):
         template = """
-          target = autograph_utils.dynamic_list_append(target, element)
+          target = ag__.utils.dynamic_list_append(target, element)
         """
         node = templates.replace(
             template,
diff --git a/tensorflow/contrib/autograph/converters/side_effect_guards.py b/tensorflow/contrib/autograph/converters/side_effect_guards.py
index 1c1293d2c4..3bcb2d3c42 100644
--- a/tensorflow/contrib/autograph/converters/side_effect_guards.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards.py
@@ -160,8 +160,8 @@ class SideEffectGuardTransformer(transformer.Base):
               [alias_map.get(s, s).ast() for s in guarded_args], None)
 
         template = """
-          with autograph_utils.control_dependency_on_returns(call):
-            aliased_guarded_args = autograph_utils.alias_tensors(guarded_args)
+          with ag__.utils.control_dependency_on_returns(call):
+            aliased_guarded_args = ag__.utils.alias_tensors(guarded_args)
         """
         control_deps_guard = templates.replace(
             template,
@@ -172,7 +172,7 @@ class SideEffectGuardTransformer(transformer.Base):
         alias_map = {}
 
         template = """
-          with autograph_utils.control_dependency_on_returns(call):
+          with ag__.utils.control_dependency_on_returns(call):
             pass
         """
         control_deps_guard = templates.replace(template, call=node.value)[-1]
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 373dc1602b..11bbe7888a 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import imp
+
 import gast
 
 from tensorflow.contrib.autograph import operators
@@ -221,12 +223,17 @@ def _add_reserved_symbol(namespace, name, entity):
 
 
 def _add_self_references(namespace, api_module):
-  # Manually add the utils namespace which may be used from generated code.
-  _add_reserved_symbol(namespace, 'autograph_utils', utils)
-  _add_reserved_symbol(namespace, '__ops', operators)
-  # We also make reference to the api module for dynamic conversion, but
-  # to avoid circular references we don't import it here.
-  _add_reserved_symbol(namespace, 'autograph_api', api_module)
+  # Craft a module that exposes parts of the external API as well as certain
+  # internal modules.
+  ag_internal = imp.new_module('autograph')
+  ag_internal.converted_call = api_module.converted_call
+  ag_internal.utils = utils
+  # TODO(mdan): Add safeguards against name clashes.
+  # We don't want to create a submodule because we want the operators to be
+  # accessible as ag__.<operator>
+  ag_internal.__dict__.update(operators.__dict__)
+
+  _add_reserved_symbol(namespace, 'ag__', ag_internal)
 
 
 def function_to_graph(f, conversion_map, arg_values, arg_types,
@@ -312,6 +319,8 @@ def node_to_graph(node, ctx, nocompile_decorators):
   node = ifexp.transform(node, ctx)
   node, deps = decorators.transform(node, nocompile_decorators)
   node = break_statements.transform(node, ctx)
+  node = _static_analysis_pass(node, ctx)
+
   node = asserts.transform(node, ctx)
 
   # Note: sequencing continue canonicalization before for loop one avoids
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index 962009c71f..f0b597c12f 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.impl import api
 from tensorflow.contrib.autograph.impl import conversion
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
@@ -28,6 +29,9 @@ from tensorflow.python.platform import test
 
 class ConversionTest(test.TestCase):
 
+  def _simple_conversion_map(self):
+    return conversion.ConversionMap(True, (), (), api)
+
   def test_is_whitelisted_for_graph(self):
 
     def test_fn():
@@ -39,7 +43,7 @@ class ConversionTest(test.TestCase):
 
   def test_entity_to_graph_unsupported_types(self):
     with self.assertRaises(ValueError):
-      conversion_map = conversion.ConversionMap(True, (), (), None)
+      conversion_map = self._simple_conversion_map()
       conversion.entity_to_graph('dummy', conversion_map, None, None)
 
   def test_entity_to_graph_callable(self):
@@ -47,7 +51,7 @@ class ConversionTest(test.TestCase):
     def f(a):
       return a + b
 
-    conversion_map = conversion.ConversionMap(True, (), (), None)
+    conversion_map = self._simple_conversion_map()
     ast, name, ns = conversion.entity_to_graph(f, conversion_map, None, None)
     self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
     self.assertEqual('tf__f', name)
@@ -61,7 +65,7 @@ class ConversionTest(test.TestCase):
     def f(a):
       return g(a)
 
-    conversion_map = conversion.ConversionMap(True, (), (), None)
+    conversion_map = self._simple_conversion_map()
     conversion.entity_to_graph(f, conversion_map, None, None)
 
     self.assertTrue(f in conversion_map.dependency_cache)
-- 
GitLab


From 0fdad03d31854ad37ad8e8a2cf5df9283a2ee050 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 11:56:15 -0700
Subject: [PATCH 0858/1262] Mark the parent list as modified for index writes.
 Add special case for constructors where we know setting an attribute actually
 creates a new symbol. Clean up the tests a bit.

PiperOrigin-RevId: 193075909
---
 .../pyct/static_analysis/activity.py          |  49 ++++-
 .../pyct/static_analysis/activity_test.py     | 189 +++++++++++++++---
 2 files changed, 201 insertions(+), 37 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index 6dd53091fa..b81f5c7f87 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -133,18 +133,18 @@ class Scope(object):
   def mark_param(self, name):
     self.params.add(name)
 
-  def mark_creation(self, name):
+  def mark_creation(self, name, writes_create_symbol=False):
     if name.is_composite():
       parent = name.parent
       if self.has(parent):
-        # This is considered mutation of the parent, not creation.
-        # TODO(mdan): Is that really so?
-        return
+        if not writes_create_symbol:
+          return
       else:
         raise ValueError('Unknown symbol "%s".' % parent)
     self.created.add(name)
 
   def mark_write(self, name):
+    """Marks the given symbol as modified in the current scope."""
     self.modified.add(name)
     if self.isolated:
       self.mark_creation(name)
@@ -170,15 +170,37 @@ class ActivityAnalizer(transformer.Base):
     self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
-  def _track_symbol(self, node):
-    # This can happen when we have an attribute (or subscript) on a function
-    # call.  Example: a().b
+  @property
+  def _in_constructor(self):
+    innermost = self.enclosing_entities[-1]
+    if len(self.enclosing_entities) > 1:
+      parent = self.enclosing_entities[-2]
+      return isinstance(parent, gast.ClassDef) and innermost.name == '__init__'
+    return False
+
+  def _node_sets_self_attribute(self, node):
+    if anno.hasanno(node, anno.Basic.QN):
+      qn = anno.getanno(node, anno.Basic.QN)
+      # TODO(mdan): The 'self' argument is not guaranteed to be called 'self'.
+      if qn.has_attr and qn.parent.qn == ('self',):
+        return True
+
+  def _track_symbol(self,
+                    node,
+                    composite_writes_alter_parent=False,
+                    writes_create_symbol=False):
+    # A QN may be missing when we have an attribute (or subscript) on a function
+    # call. Example: a().b
     if not anno.hasanno(node, anno.Basic.QN):
       return
     qn = anno.getanno(node, anno.Basic.QN)
 
     if isinstance(node.ctx, gast.Store):
       self.scope.mark_write(qn)
+      if qn.is_composite and composite_writes_alter_parent:
+        self.scope.mark_write(qn.parent)
+      if writes_create_symbol:
+        self.scope.mark_creation(qn, writes_create_symbol=True)
     elif isinstance(node.ctx, gast.Load):
       self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Param):
@@ -207,7 +229,18 @@ class ActivityAnalizer(transformer.Base):
 
   def visit_Attribute(self, node):
     self.generic_visit(node)
-    self._track_symbol(node)
+    if self._in_constructor and self._node_sets_self_attribute(node):
+      self._track_symbol(
+          node, composite_writes_alter_parent=True, writes_create_symbol=True)
+    else:
+      self._track_symbol(node)
+    return node
+
+  def visit_Subscript(self, node):
+    self.generic_visit(node)
+    # Subscript writes (e.g. a[b] = "value") are considered to modify
+    # both the element itself (a[b]) and its parent (a).
+    self._track_symbol(node, composite_writes_alter_parent=True)
     return node
 
   def visit_Print(self, node):
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index 1e6c686b01..d1c4a94b14 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -144,10 +144,21 @@ class ActivityAnalizerTest(test.TestCase):
         anno.getanno(node.body[0].body[2].value,
                      NodeAnno.IS_LOCAL))  # b in return b
 
+  def assertSymbolSetsAre(self, expected, actual, name):
+    expected = set(expected)
+    actual = set(str(s) for s in actual)
+    self.assertSetEqual(
+        expected, actual, 'for symbol set: %s\n'
+        '  Expected: %s\n'
+        '  Got:      %s\n'
+        '  Missing:  %s\n'
+        '  Extra:    %s\n' % (name.upper(), expected, actual,
+                              expected - actual, actual - expected))
+
   def assertScopeIsRmc(self, scope, used, modified, created):
-    self.assertItemsEqual(used, tuple(str(s) for s in scope.used))
-    self.assertItemsEqual(modified, tuple(str(s) for s in scope.modified))
-    self.assertItemsEqual(created, tuple(str(s) for s in scope.created))
+    self.assertSymbolSetsAre(used, scope.used, 'read')
+    self.assertSymbolSetsAre(modified, scope.modified, 'modified')
+    self.assertSymbolSetsAre(created, scope.created, 'created')
 
   def test_print_statement(self):
 
@@ -172,7 +183,7 @@ class ActivityAnalizerTest(test.TestCase):
     # arguments.
     self.assertScopeIsRmc(print_args_scope, ('a', 'b'), (), ())
 
-  def test_call(self):
+  def test_call_args(self):
 
     def test_fn(a):
       b = 0
@@ -187,6 +198,57 @@ class ActivityAnalizerTest(test.TestCase):
     self.assertScopeIsRmc(
         anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'b'), (), ())
 
+  def test_call_args_attributes(self):
+
+    def foo(*_):
+      pass
+
+    def test_fn(a):
+      a.c = 0
+      foo(a.b, a.c)
+      return a.d
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[1].value
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
+        ('a', 'a.b', 'a.c'),
+        (),
+        (),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE).parent,
+        ('a', 'a.b', 'a.c', 'a.d', 'foo'),
+        ('a.c',),
+        ('a',),
+    )
+
+  def test_call_args_subscripts(self):
+
+    def foo(*_):
+      pass
+
+    def test_fn(a):
+      b = 1
+      c = 2
+      foo(a[0], a[b])
+      return a[c]
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[2].value
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
+        ('a', 'a[0]', 'a[b]', 'b'),
+        (),
+        (),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE).parent,
+        ('a', 'a[0]', 'a[b]', 'a[c]', 'b', 'c', 'foo'),
+        ('b', 'c'),
+        ('a', 'b', 'c'),
+    )
+
   def test_while(self):
 
     def test_fn(a):
@@ -253,7 +315,72 @@ class ActivityAnalizerTest(test.TestCase):
         anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent, ('x', 'z', 'u'),
         ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
 
-  def test_nested_if_else_creation(self):
+  def test_if_attributes(self):
+
+    def test_fn(a):
+      if a > 0:
+        a.b = -a.c
+        d = 2 * a
+      else:
+        a.b = a.c
+        d = 1
+      return d
+
+    node = self._parse_and_analyze(test_fn)
+    if_node = node.body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE),
+        ('a', 'a.c'),
+        ('a.b', 'd'),
+        ('d',),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
+        ('a', 'a.c'),
+        ('a.b', 'd'),
+        ('d',),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent,
+        ('a', 'a.c', 'd'),
+        ('a.b', 'd'),
+        ('a', 'd'),
+    )
+
+  def test_if_subscripts(self):
+
+    def test_fn(a, b, c, e):
+      if a > 0:
+        a[b] = -a[c]
+        d = 2 * a
+      else:
+        a[0] = e
+        d = 1
+      return d
+
+    node = self._parse_and_analyze(test_fn)
+    if_node = node.body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE),
+        ('a', 'b', 'c', 'a[c]'),
+        ('a', 'a[b]', 'd'),
+        ('d',),
+    )
+    # TODO(mdan): Should subscript writes (a[0] = 1) be considered to read "a"?
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
+        ('a', 'e'),
+        ('a', 'a[0]', 'd'),
+        ('d',),
+    )
+    self.assertScopeIsRmc(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent,
+        ('a', 'b', 'c', 'd', 'e', 'a[c]'),
+        ('a', 'd', 'a[b]', 'a[0]'),
+        ('a', 'b', 'c', 'd', 'e'),
+    )
+
+  def test_nested_if(self):
 
     def test_fn(b):
       if b > 0:
@@ -272,7 +399,7 @@ class ActivityAnalizerTest(test.TestCase):
         anno.getanno(inner_if_node, NodeAnno.ORELSE_SCOPE), ('b',), ('a',),
         ('a',))
 
-  def test_function_def(self):
+  def test_nested_function(self):
 
     def test_fn(a):
 
@@ -287,44 +414,48 @@ class ActivityAnalizerTest(test.TestCase):
       return b, c
 
     node = self._parse_and_analyze(test_fn)
-    fndef_node = node.body[0].body[0]
+    fn_def_node = node.body[0].body[0]
 
     self.assertScopeIsRmc(
-        anno.getanno(fndef_node,
+        anno.getanno(fn_def_node,
                      NodeAnno.BODY_SCOPE).parent, ('b', 'i', 'f', 'c', 'a'),
         ('f', 'b', 'c', 'i'), ('f', 'a', 'b', 'c', 'i'))
     self.assertScopeIsRmc(
-        anno.getanno(fndef_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('y',), (
+        anno.getanno(fn_def_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('y',), (
             'x',
             'y',
         ))
 
-  def test_call_with_composite_names(self):
+  def test_constructor_attributes(self):
 
-    def foo(*_):
-      pass
+    class TestClass(object):
+
+      def __init__(self, a):
+        self.b = a
+        self.b.c = 1
+
+    node = self._parse_and_analyze(TestClass)
+    init_node = node.body[0].body[0]
+    self.assertScopeIsRmc(
+        anno.getanno(init_node, NodeAnno.BODY_SCOPE),
+        ('self', 'a', 'self.b'),
+        ('self', 'self.b', 'self.b.c'),
+        ('self', 'a', 'self.b'),
+    )
+
+  def test_aug_assign_subscripts(self):
 
     def test_fn(a):
-      foo(a.b, a.c)
-      if a > 0:
-        a.b = 2
-      else:
-        d = 2
-        d.e = a.c
-        f = d.e + 1
-        a.c = f
+      a[0] += 1
 
     node = self._parse_and_analyze(test_fn)
-    call_node = node.body[0].body[0].value
+    fn_node = node.body[0]
     self.assertScopeIsRmc(
-        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'a.b', 'a.c'), (),
-        ())
-    if_node = node.body[0].body[1]
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a',), ('a.b',), ())
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
-        ('a', 'a.c', 'd', 'd.e', 'f'), ('a.c', 'd', 'd.e', 'f'), ('d', 'f'))
+        anno.getanno(fn_node, NodeAnno.BODY_SCOPE),
+        ('a',),
+        ('a', 'a[0]'),
+        ('a',),
+    )
 
 
 if __name__ == '__main__':
-- 
GitLab


From 345ccea1ea751e426a2d2d8e8d44455c43336d8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 12:09:24 -0700
Subject: [PATCH 0859/1262] Remove obsolete tests. Patch the unexpected print
 output in Python 3.

PiperOrigin-RevId: 193078330
---
 .../converters/builtin_functions_test.py      | 38 +++----------------
 .../contrib/autograph/utils/builtins.py       | 10 ++++-
 2 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/builtin_functions_test.py b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
index ac7e756c47..30272409df 100644
--- a/tensorflow/contrib/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
@@ -26,8 +26,6 @@ from tensorflow.contrib.autograph.converters import builtin_functions
 from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
@@ -49,7 +47,7 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
 
         self.assertEqual(3, result.test_fn([0, 0, 0]))
 
-  def test_print_with_op(self):
+  def test_print(self):
 
     def test_fn(a):
       print(a)
@@ -57,14 +55,12 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {'print': print})
     node = builtin_functions.transform(node, self.ctx)
 
-    # Note: it's relevant not to include script_ops.py_func here, to verify
-    # that tf.Print is used.
-    with self.compiled(node, logging_ops.Print) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         try:
           out_capturer = six.StringIO()
           sys.stdout = out_capturer
-          result.test_fn('a')
+          result.test_fn(constant_op.constant('a'))
           sess.run(sess.graph.get_operations())
           self.assertEqual(out_capturer.getvalue(), 'a\n')
         finally:
@@ -72,41 +68,19 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
 
   def test_print_with_op_multiple_values(self):
 
-    def test_fn(a, b):
-      print(a, b)
-
-    node = self.parse_and_analyze(test_fn, {'print': print})
-    node = builtin_functions.transform(node, self.ctx)
-
-    # Note: it's relevant not to include script_ops.py_func here, to verify
-    # that tf.Print is used.
-    with self.compiled(node, logging_ops.Print) as result:
-      with self.test_session() as sess:
-        try:
-          out_capturer = six.StringIO()
-          sys.stdout = out_capturer
-          result.test_fn('a', 1)
-          sess.run(sess.graph.get_operations())
-          self.assertEqual(out_capturer.getvalue(), 'a 1\n')
-        finally:
-          sys.stdout = sys.__stdout__
-
-  def test_print_with_py_func(self):
-
     def test_fn(a, b, c):
       print(a, b, c)
 
     node = self.parse_and_analyze(test_fn, {'print': print})
     node = builtin_functions.transform(node, self.ctx)
 
-    # Note: it's relevant not to include logging_ops.Print here, to verify
-    # that py_func is used.
-    with self.compiled(node, script_ops.py_func) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         try:
           out_capturer = six.StringIO()
           sys.stdout = out_capturer
-          result.test_fn('a', 1, [2, 3])
+          result.test_fn(
+              constant_op.constant('a'), constant_op.constant(1), [2, 3])
           sess.run(sess.graph.get_operations())
           self.assertEqual(out_capturer.getvalue(), 'a 1 [2, 3]\n')
         finally:
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 7fbb7c09d8..349b7b6f2a 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -98,9 +98,15 @@ def dynamic_print(*values):
   if all(map(is_tf_print_compatible, values)):
     return logging_ops.Print(1, values)
 
-  def flushed_print(*vals):
+  def print_wrapper(*vals):
+    if six.PY3:
+      # TensorFlow doesn't seem to generate Unicode when passing strings to
+      # py_func. This causes the print to add a "b'" wrapper to the output,
+      # which is probably never what you want.
+      vals = tuple(v.decode() if isinstance(v, bytes) else v for v in vals)
     print(*vals)
+    # The flush helps avoid garbled output in IPython.
     sys.stdout.flush()
 
   return py_func.wrap_py_func(
-      flushed_print, None, values, use_dummy_return=True)
+      print_wrapper, None, values, use_dummy_return=True)
-- 
GitLab


From d3fb437da12fc326d8229bdb955580c63eaccb5f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 12:09:33 -0700
Subject: [PATCH 0860/1262] Copy the if statement handlers over to the
 operators module. They will enabled in a follow-up CL.

PiperOrigin-RevId: 193078348
---
 .../autograph/operators/control_flow.py       | 32 +++++++++++++++++++
 .../autograph/operators/control_flow_test.py  | 29 +++++++++++++----
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
index 81ae64f110..d9d8b0d593 100644
--- a/tensorflow/contrib/autograph/operators/control_flow.py
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -25,6 +25,9 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
 
+# TODO(mdan): Rename _loop to _stmt to follow Python nomenclature.
+# TODO(mdan): Rename arguments to match the AST names.
+
 
 def for_loop(iterated, extra_cond, loop_body, init_state):
   """Functional form of a for statement.
@@ -182,3 +185,32 @@ def _py_while_loop(loop_cond, loop_body, init_state, opts):
   while loop_cond(*state):
     state = loop_body(*state)
   return state
+
+
+def if_stmt(cond, body, orelse):
+  """Functional form of an if statement.
+
+  Args:
+    cond: Boolean.
+    body: Callable with no arguments, and outputs of the positive (if) branch
+        as return type.
+    orelse: Callable with no arguments, and outputs of the negative (else)
+        branch as return type.
+
+  Returns:
+    Tuple containing the statement outputs.
+  """
+  if tensor_util.is_tensor(cond):
+    return _tf_if_stmt(cond, body, orelse)
+  else:
+    return _py_if_stmt(cond, body, orelse)
+
+
+def _tf_if_stmt(cond, body, orelse):
+  """Overload of if_stmt that stages a TF cond."""
+  return control_flow_ops.cond(cond, body, orelse)
+
+
+def _py_if_stmt(cond, body, orelse):
+  """Overload of if_stmt that executes a Python if statement."""
+  return body() if cond else orelse()
diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py
index 9112b1627f..a0cd0bfa82 100644
--- a/tensorflow/contrib/autograph/operators/control_flow_test.py
+++ b/tensorflow/contrib/autograph/operators/control_flow_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.autograph import operators
+from tensorflow.contrib.autograph.operators import control_flow
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 class ForLoopTest(test.TestCase):
 
   def test_tensor(self):
-    s = operators.for_loop(
+    s = control_flow.for_loop(
         constant_op.constant([1, 2, 3, 4]),
         extra_cond=lambda s: True,
         loop_body=lambda i, s: (s + i,),
@@ -38,7 +38,7 @@ class ForLoopTest(test.TestCase):
       self.assertEqual((10,), sess.run(s))
 
   def test_python(self):
-    s = operators.for_loop(
+    s = control_flow.for_loop(
         range(5),
         extra_cond=lambda s: True,
         loop_body=lambda i, s: (s + i,),
@@ -47,7 +47,7 @@ class ForLoopTest(test.TestCase):
 
   def test_dataset(self):
     to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
-    s = operators.for_loop(
+    s = control_flow.for_loop(
         dataset_ops.Dataset.range(5).map(to_int32),
         extra_cond=lambda s: True,
         loop_body=lambda i, s: (s + i,),
@@ -60,7 +60,7 @@ class WhileLoopTest(test.TestCase):
 
   def test_tensor(self):
     n = constant_op.constant(5)
-    results = operators.while_loop(
+    results = control_flow.while_loop(
         loop_cond=lambda i, s: i < n,
         loop_body=lambda i, s: (i + 1, s + i,),
         init_state=(0, 0),
@@ -70,7 +70,7 @@ class WhileLoopTest(test.TestCase):
 
   def test_python(self):
     n = 5
-    results = operators.while_loop(
+    results = control_flow.while_loop(
         loop_cond=lambda i, s: i < n,
         loop_body=lambda i, s: (i + 1, s + i),
         init_state=(0, 0),
@@ -78,5 +78,22 @@ class WhileLoopTest(test.TestCase):
     self.assertEqual((5, 10), results)
 
 
+class IfStmtTest(test.TestCase):
+
+  def test_tensor(self):
+    def test_if_stmt(cond):
+      return control_flow.if_stmt(
+          cond=cond,
+          body=lambda: 1,
+          orelse=lambda: -1)
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(test_if_stmt(constant_op.constant(True))))
+      self.assertEqual(-1, sess.run(test_if_stmt(constant_op.constant(False))))
+
+  def test_python(self):
+    self.assertEqual(1, control_flow.if_stmt(True, lambda: 1, lambda: -1))
+    self.assertEqual(-1, control_flow.if_stmt(False, lambda: 1, lambda: -1))
+
+
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 2343304e1757942c47645d985615defdb48e3f21 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 12:09:36 -0700
Subject: [PATCH 0861/1262] Add a common transformer feature that allows
 keeping temporary state across nodes. To be used in the break, continue and
 return canonicalizers.

PiperOrigin-RevId: 193078359
---
 .../contrib/autograph/pyct/transformer.py     |  34 +++++-
 .../autograph/pyct/transformer_test.py        | 102 ++++++++++++++++--
 2 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index b38d52c5b2..3e414d7ba5 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -40,7 +40,13 @@ def try_ast_to_source(node):
 
 
 class Base(gast.NodeTransformer):
-  """Base class for specialized transformers."""
+  """Base class for specialized transformers.
+
+  Scope-local state tracking: to keep state across nodes, at the level of
+  (possibly nested) scopes, use enter/exit_local_scope and set/get_local.
+  You must call enter/exit_local_scope manually, but the transformer detects
+  when they are not properly paired.
+  """
 
   def __init__(self, context):
     """Initialize the transformer. Subclasses should call this.
@@ -53,10 +59,28 @@ class Base(gast.NodeTransformer):
     self.context = context
     self._enclosing_entities = []
 
+    # A stack that allows keeping mutable, scope-local state where scopes may be
+    # nested. For example, it can be used to track the usage of break
+    # statements in each loop, where loops may be nested.
+    self._local_scope_state = []
+    self.enter_local_scope()
+
   @property
   def enclosing_entities(self):
     return tuple(self._enclosing_entities)
 
+  def enter_local_scope(self):
+    self._local_scope_state.append({})
+
+  def exit_local_scope(self):
+    return self._local_scope_state.pop()
+
+  def set_local(self, name, value):
+    self._local_scope_state[-1][name] = value
+
+  def get_local(self, name, default=None):
+    return self._local_scope_state[-1].get(name, default)
+
   def debug_print(self, node):
     """Helper method useful for debugging."""
     if __debug__:
@@ -67,6 +91,7 @@ class Base(gast.NodeTransformer):
     source_code = self.context.source_code
     source_file = self.context.source_file
     did_enter_function = False
+    local_scope_state_size = len(self._local_scope_state)
 
     try:
       if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
@@ -97,3 +122,10 @@ class Base(gast.NodeTransformer):
     finally:
       if did_enter_function:
         self._enclosing_entities.pop()
+
+      if local_scope_state_size != len(self._local_scope_state):
+        raise AssertionError(
+            'Inconsistent local scope stack. Before entering node %s, the'
+            ' stack had length %d, after exit it has length %d. This'
+            ' indicates enter_local_scope and exit_local_scope are not'
+            ' well paired.')
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
index 57f1c31ef6..f96b0dc377 100644
--- a/tensorflow/contrib/autograph/pyct/transformer_test.py
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -27,6 +27,17 @@ from tensorflow.python.platform import test
 
 class TransformerTest(test.TestCase):
 
+  def _context_for_nodetesting(self):
+    return context.EntityContext(
+        namer=None,
+        source_code=None,
+        source_file=None,
+        namespace=None,
+        arg_values=None,
+        arg_types=None,
+        owner_type=None,
+        recursive=False)
+
   def test_entity_scope_tracking(self):
 
     class TestTransformer(transformer.Base):
@@ -42,16 +53,7 @@ class TransformerTest(test.TestCase):
         anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
         return self.generic_visit(node)
 
-    tr = TestTransformer(
-        context.EntityContext(
-            namer=None,
-            source_code=None,
-            source_file=None,
-            namespace=None,
-            arg_values=None,
-            arg_types=None,
-            owner_type=None,
-            recursive=False))
+    tr = TestTransformer(self._context_for_nodetesting())
 
     def test_function():
       a = 0
@@ -92,6 +94,86 @@ class TransformerTest(test.TestCase):
                       inner_function, lambda_node),
                      anno.getanno(lambda_expr, 'enclosing_entities'))
 
+  def test_statement_info_stack(self):
+
+    class TestTransformer(transformer.Base):
+
+      # Extract all string constants from the block.
+      def visit_Str(self, node):
+        self.set_local('string', self.get_local('string', default='') + node.s)
+        return self.generic_visit(node)
+
+      def _annotate_result(self, node):
+        self.enter_local_scope()
+        node = self.generic_visit(node)
+        anno.setanno(node, 'test', self.get_local('string'))
+        self.exit_local_scope()
+        return node
+
+      def visit_While(self, node):
+        return self._annotate_result(node)
+
+      def visit_For(self, node):
+        return self._annotate_result(node)
+
+    tr = TestTransformer(self._context_for_nodetesting())
+
+    def test_function(a):
+      """Docstring."""
+      assert a == 'This should not be counted'
+      for i in range(3):
+        _ = 'a'
+        if i > 2:
+          return 'b'
+        else:
+          _ = 'c'
+          while True:
+            raise '1'
+      return 'nor this'
+
+    node, _ = parser.parse_entity(test_function)
+    node = tr.visit(node)
+
+    for_node = node.body[0].body[2]
+    while_node = for_node.body[1].orelse[1]
+
+    self.assertFalse(anno.hasanno(for_node, 'string'))
+    self.assertEqual('abc', anno.getanno(for_node, 'test'))
+    self.assertFalse(anno.hasanno(while_node, 'string'))
+    self.assertEqual('1', anno.getanno(while_node, 'test'))
+
+  def test_statement_info_stack_checks_integrity(self):
+
+    class TestTransformer(transformer.Base):
+
+      def visit_If(self, node):
+        self.enter_local_scope()
+        return self.generic_visit(node)
+
+      def visit_For(self, node):
+        node = self.generic_visit(node)
+        self.exit_local_scope()
+        return node
+
+    tr = TestTransformer(self._context_for_nodetesting())
+
+    def no_exit(a):
+      if a > 0:
+        print(a)
+      return None
+
+    node, _ = parser.parse_entity(no_exit)
+    with self.assertRaises(AssertionError):
+      tr.visit(node)
+
+    def no_entry(a):
+      for _ in a:
+        print(a)
+
+    node, _ = parser.parse_entity(no_entry)
+    with self.assertRaises(AssertionError):
+      tr.visit(node)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From aab497b2f520954d26a48f871548c7fd1ac41441 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 12:12:46 -0700
Subject: [PATCH 0862/1262] Tighten label check in
 BinaryLogisticHeadWithSigmoidCrossEntropyLoss

PiperOrigin-RevId: 193078844
---
 tensorflow/python/estimator/canned/head.py    | 10 ++++-----
 .../python/estimator/canned/head_test.py      | 22 +++++++++++++++++--
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 189b81aeea..c365ea8b4a 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -1039,7 +1039,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           vocabulary_list=tuple(self._label_vocabulary),
           name='class_id_lookup').lookup(labels)
     labels = math_ops.to_float(labels)
-    labels = _assert_range(labels, 2)
+    labels = _assert_range(labels, n_classes=2)
     if self._loss_fn:
       unweighted_loss = _call_loss_fn(
           loss_fn=self._loss_fn, labels=labels, logits=logits,
@@ -1447,12 +1447,12 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
 
 def _assert_range(labels, n_classes, message=None):
   with ops.name_scope(None, 'assert_range', (labels,)):
-    assert_less = check_ops.assert_less(
+    assert_less = check_ops.assert_less_equal(
         labels,
-        ops.convert_to_tensor(n_classes, dtype=labels.dtype),
-        message=message or 'Label IDs must < n_classes')
+        ops.convert_to_tensor(n_classes - 1, dtype=labels.dtype),
+        message=message or 'Labels must <= n_classes - 1')
     assert_greater = check_ops.assert_non_negative(
-        labels, message=message or 'Label IDs must >= 0')
+        labels, message=message or 'Labels must >= 0')
     with ops.control_dependencies((assert_less, assert_greater)):
       return array_ops.identity(labels)
 
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index fe6ee07529..7da3df01dc 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -255,14 +255,14 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
         logits=logits_placeholder,
         labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesOpError('Label IDs must < n_classes'):
+      with self.assertRaisesOpError('Labels must <= n_classes - 1'):
         training_loss.eval({
             labels_placeholder: labels_2x1_with_large_id,
             logits_placeholder: logits_2x3
         })
 
     with self.test_session():
-      with self.assertRaisesOpError('Label IDs must >= 0'):
+      with self.assertRaisesOpError('Labels must >= 0'):
         training_loss.eval({
             labels_placeholder: labels_2x1_with_negative_id,
             logits_placeholder: logits_2x3
@@ -2090,6 +2090,24 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
               expected_regularization_loss),
       }, summary_str)
 
+  def test_float_labels_invalid_values(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
+    labels = np.array([[1.2], [0.4]], dtype=np.float32)
+    features = {'x': np.array([[42]], dtype=np.float32)}
+    training_loss = head.create_loss(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)[0]
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r'Labels must <= n_classes - 1'):
+      with self.test_session():
+        _initialize_variables(self, monitored_session.Scaffold())
+        training_loss.eval()
+
   def test_float_labels_train_create_loss(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
 
-- 
GitLab


From 249a00e8d72983b1aa0cd061ee6298238a5cfbfe Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Mon, 16 Apr 2018 12:21:15 -0700
Subject: [PATCH 0863/1262] Early TPU distribution strategy and the associated
 testing infrastructure.

PiperOrigin-RevId: 193080098
---
 tensorflow/contrib/distribute/python/BUILD    | 37 +++++++--
 .../contrib/distribute/python/combinations.py | 17 +++-
 .../distribute/python/minimize_loss_test.py   | 35 +++++++-
 .../contrib/distribute/python/tpu_strategy.py | 82 +++++++++++++++++++
 4 files changed, 161 insertions(+), 10 deletions(-)
 create mode 100644 tensorflow/contrib/distribute/python/tpu_strategy.py

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 5aad21cccd..837a1f1348 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -131,6 +131,7 @@ py_library(
     deps = [
         ":mirrored_strategy",
         ":one_device_strategy",
+        ":tpu_strategy",
         "//tensorflow/contrib/optimizer_v2:training",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
@@ -225,14 +226,30 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "minimize_loss_test",
+py_library(
+    name = "tpu_strategy",
+    srcs = ["tpu_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/contrib/distribute/python:one_device_strategy",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/contrib/tpu",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "minimize_loss_test_lib",
+    testonly = 1,
     srcs = ["minimize_loss_test.py"],
-    additional_deps = [
+    deps = [
         ":combinations",
         ":single_loss_example",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
@@ -240,6 +257,16 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/ops/losses",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "minimize_loss_test",
+    srcs = ["minimize_loss_test.py"],
+    additional_deps = [
+        ":minimize_loss_test_lib",
     ],
     tags = [
         "multi_and_single_gpu",
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 02b1e7ef9f..1f66997e6e 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -45,6 +45,7 @@ from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import tpu_strategy
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
 from tensorflow.python.eager import context
@@ -55,6 +56,7 @@ from tensorflow.python.util import tf_inspect
 
 
 GPU_TEST = "test_gpu" in sys.argv[0]
+TPU_TEST = "test_tpu" in sys.argv[0]
 
 
 def generate(combinations):
@@ -108,6 +110,11 @@ def generate(combinations):
       if "distribution" in kwargs:
         distribution = kwargs["distribution"]
         kwargs["distribution"] = distribution.strategy
+        if distribution.required_tpu and not TPU_TEST:
+          self.skipTest("Test requires a TPU, but it's not available.")
+        if not distribution.required_tpu and TPU_TEST:
+          self.skipTest("Test that doesn't require a TPU.")
+
         if not distribution.required_gpus:
           if GPU_TEST:
             self.skipTest("Test that doesn't require GPUs.")
@@ -232,10 +239,12 @@ class NamedObject(object):
 class NamedDistribution(object):
   """Translates DistributionStrategy and its data into a good name."""
 
-  def __init__(self, name, distribution, required_gpus):
+  def __init__(self, name, distribution, required_gpus=None,
+               required_tpu=False):
     self._distribution = distribution
     self._name = name
     self._required_gpus = required_gpus
+    self._required_tpu = required_tpu
 
   def __repr__(self):
     return self._name
@@ -248,10 +257,16 @@ class NamedDistribution(object):
   def required_gpus(self):
     return self._required_gpus
 
+  @property
+  def required_tpu(self):
+    return self._required_tpu
+
 
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
     None)
+tpu_strategy = NamedDistribution(
+    "TPU", tpu_strategy.TpuStrategy(), required_tpu=True)
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
     mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 0fa90df79b..4219d54cbd 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
 from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
@@ -42,24 +43,46 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
           combinations.combine(mode=["graph"], use_callable_loss=[True, False])
-          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
-  def testTrainNetwork(self, distribution, optimizer_fn,
-                       use_callable_loss=True):
+          + combinations.combine(mode=["eager"], use_callable_loss=[True]),
+          combinations.combine(is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=[combinations.adam_optimizer_v1_fn],
+          mode=["graph"],
+          use_callable_loss=[False],
+          is_tpu=[True]))
+  def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
+                       is_tpu):
     with distribution.scope():
       model_fn, dataset, layer = minimize_loss_example(
           optimizer_fn,
           use_bias=True,
           use_callable_loss=use_callable_loss)
 
+      # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
+      # `DistributionStrategy.create_monitor` so that each DistributionStrategy
+      # could influence its training loop. That method would return an instance
+      # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
+      # tpu.shutdown_system().
+      if is_tpu:
+        dataset = dataset.batch(2)
+
       iterator = distribution.distribute_dataset(dataset)
 
       def run_step():
+        # TODO(isaprykin): Make iterator get_next() return a list of sub-
+        # batches for each iteration. Pass iterator.get_next() and not iterator
+        # to call_for_each_tower.
         return distribution.group(
             distribution.call_for_each_tower(
-                model_fn, iterator.get_next(), run_concurrently=layer.built))
+                model_fn,
+                iterator.get_next() if not is_tpu else iterator,
+                run_concurrently=layer.built))
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
@@ -70,6 +93,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         weights.append(self.evaluate(distribution.fetch(layer.kernel)))
         biases.append(self.evaluate(distribution.fetch(layer.bias)))
 
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
new file mode 100644
index 0000000000..0ac307dd6a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU Distribution Strategy.
+
+This is experimental.  It's not ready for general use.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import tpu
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+
+
+# TODO(isaprykin):  Consider whether inheriting is really appropriate.
+class TpuStrategy(one_device_strategy.OneDeviceStrategy):
+
+  def __init__(self, master=None, iterations=None, model_dir=None):
+    super(TpuStrategy, self).__init__('/cpu:0')
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    kwargs.pop('run_concurrently', None)
+
+    # TODO(isaprykin): Give an API for many iterations per step.
+    iterations = 1
+
+    # TODO(isaprykin): Do not hard code shapes and input format :)
+    # TODO(isaprykin): Detect the number of TPU cores automatically.
+
+    def dequeueing_fn(*args, **kwargs):
+      del args, kwargs
+      x, = tpu.infeed_dequeue_tuple(dtypes=[dtypes.float32], shapes=[[1, 1, 1]])
+      return fn(x)
+
+    iterator = args[0]
+
+    def infeed_input(i):
+      """Get input, split it and then enqueue."""
+      batches = iterator.get_next()
+      batches = array_ops.split(batches, 2)
+
+      infeeds = [
+          tpu_ops.infeed_enqueue_tuple(
+              inputs=[batches[j]], shapes=[[1, 1, 1]], device_ordinal=j)
+          for j in range(2)
+      ]
+
+      with ops.control_dependencies(infeeds):
+        return i + 1
+
+    with ops.device('/task:0/device:CPU:0'):
+      enqueue_ops = control_flow_ops.while_loop(
+          lambda i: i < iterations,
+          infeed_input, [constant_op.constant(0)],
+          parallel_iterations=1)
+
+    def iterate_on_tpu():
+      return tpu.repeat(iterations, dequeueing_fn, [])
+
+    with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
+      tpu_result = tpu.batch_parallel(iterate_on_tpu, [], num_shards=2)
+
+    return control_flow_ops.group(tpu_result, enqueue_ops)
-- 
GitLab


From 21ba571a5ca4072de772cd81a759a3d7a869fd8a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 14 Apr 2018 19:00:36 +0000
Subject: [PATCH 0864/1262] Update libpng to v1.6.34 for cmake

The libpng has been updated from v1.2.53 to v1.6.34 in PR 18299.
However, the cmake version of libpng has not been updated yet.
This fix updates the libpng for cmake to v1.6.34.

The fix is tested with cmake on linux:
```
tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cmake/external/png.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 6cd66a6599..9cabecd788 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive)
-set(png_URL https://storage.googleapis.com/libpng-public-archive/libpng-1.2.53.tar.gz)
-set(png_HASH SHA256=e05c9056d7f323088fd7824d8c6acc03a4a758c4b4916715924edc5dd3223a72)
+set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz)
+set(png_HASH SHA256=e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef)
 set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
-- 
GitLab


From 2e641b67c328826f8f523a741cf24a4ee439cab9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 14 Apr 2018 19:04:52 +0000
Subject: [PATCH 0865/1262] Update library file names `libpng12` -> `libpng16`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cmake/external/png.cmake | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 9cabecd788..558d73dbda 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -23,24 +23,24 @@ set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(png_STATIC_LIBRARIES 
-      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
   else()
     if(CMAKE_BUILD_TYPE EQUAL Debug)
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib)
     else()
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
     endif()
   endif()
 else()
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
+  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng16.a)
 endif()
 
 set(png_HEADERS
-    "${png_INSTALL}/include/libpng12/png.h"
-    "${png_INSTALL}/include/libpng12/pngconf.h"
+    "${png_INSTALL}/include/libpng16/png.h"
+    "${png_INSTALL}/include/libpng16/pngconf.h"
 )
 
 ExternalProject_Add(png
-- 
GitLab


From c6ad4136813107e5adb0c2e62d9a73f720e8ccd3 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 14 Apr 2018 19:05:28 +0000
Subject: [PATCH 0866/1262] Add missing header pnglibconf.h

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cmake/external/png.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 558d73dbda..ad2af01bc0 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -41,6 +41,7 @@ endif()
 set(png_HEADERS
     "${png_INSTALL}/include/libpng16/png.h"
     "${png_INSTALL}/include/libpng16/pngconf.h"
+    "${png_INSTALL}/include/libpng16/pnglibconf.h"
 )
 
 ExternalProject_Add(png
-- 
GitLab


From 7810e47e7d7c90b0e3df8e251964a38ebff9d978 Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Mon, 16 Apr 2018 11:56:46 -0700
Subject: [PATCH 0867/1262] Merge pull request #18568 from
 case540/enable_git_tag_override

Add ability to override git tag in __git_version__ string.
---
 tensorflow/tensorflow.bzl              |  2 +-
 tensorflow/tools/git/gen_git_source.py | 37 +++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 528f811b40..b286834ded 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1704,7 +1704,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b80..db2580755b 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -139,7 +139,7 @@ def configure(src_base_path, gen_path, debug=False):
     print("gen_git_source.py: spec is %r" % spec)
 
 
-def get_git_version(git_base_path):
+def get_git_version(git_base_path, git_tag_override):
   """Get the git version from the repository.
 
   This function runs `git describe ...` in the path given as `git_base_path`.
@@ -152,6 +152,9 @@ def get_git_version(git_base_path):
 
   Args:
     git_base_path: where the .git directory is located
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   Returns:
     A bytestring representing the git version
   """
@@ -161,6 +164,14 @@ def get_git_version(git_base_path):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    if git_tag_override:
+      split_val = val.split("-")
+      if len(split_val) != 3:
+        raise Exception(
+            ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
+             "but got '%s'") % val)
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -197,7 +208,7 @@ const int tf_monolithic_build() {
   open(filename, "w").write(contents)
 
 
-def generate(arglist):
+def generate(arglist, git_tag_override=None):
   """Generate version_info.cc as given `destination_file`.
 
   Args:
@@ -217,6 +228,10 @@ def generate(arglist):
   `ref_symlink` is unused in this script but passed, because the build
     system uses that file to detect when commits happen.
 
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
+
   Raises:
     RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
   """
@@ -234,11 +249,11 @@ def generate(arglist):
       raise RuntimeError(
           "Run ./configure again, branch was '%s' but is now '%s'" %
           (old_branch, new_branch))
-    git_version = get_git_version(data["path"])
+    git_version = get_git_version(data["path"], git_tag_override)
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -246,9 +261,12 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(".", git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -270,6 +288,11 @@ parser.add_argument(
     "--gen_root_path", type=str,
     help="Root path to place generated git files (created by --configure).")
 
+parser.add_argument(
+    "--git_tag_override", type=str,
+    help="Override git tag value in the __git_version__ string. Useful when "
+         "creating release builds before the release tag is created.")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -288,9 +311,9 @@ if args.configure is not None:
     raise RuntimeError("Must pass --gen_root_path arg when running --configure")
   configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
-  generate(args.generate)
+  generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  raw_generate(args.raw_generate, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
-- 
GitLab


From b69d1d44a073389a44ed807b4e7ded137be5bf69 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 16 Apr 2018 12:56:14 -0700
Subject: [PATCH 0868/1262] boosted_trees: Make some regularizer/hyper-params
 as inputs instead of attributes.

PiperOrigin-RevId: 193085059
---
 ...tedTreesCalculateBestGainsPerFeature.pbtxt | 38 +++++++--------
 .../api_def_BoostedTreesPredict.pbtxt         |  6 ---
 .../api_def_BoostedTreesTrainingPredict.pbtxt |  6 ---
 .../api_def_BoostedTreesUpdateEnsemble.pbtxt  |  4 +-
 .../kernels/boosted_trees/prediction_ops.cc   | 16 +++----
 .../core/kernels/boosted_trees/stats_ops.cc   | 44 ++++++++++--------
 .../kernels/boosted_trees/training_ops.cc     | 19 ++++----
 tensorflow/core/ops/boosted_trees_ops.cc      | 36 +++++----------
 .../core/ops/compat/ops_history.v1.pbtxt      | 46 +++++++------------
 .../python/estimator/canned/boosted_trees.py  |  6 +--
 .../boosted_trees/prediction_ops_test.py      | 14 +-----
 11 files changed, 96 insertions(+), 139 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index 62876a293c..7f18c64574 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -11,6 +11,24 @@ END
     name: "stats_summary_list"
     description: <<END
 A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+l1 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+l2 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "tree_complexity"
+    description: <<END
+adjustment to the gain, per leaf based.
 END
   }
   out_arg {
@@ -41,24 +59,6 @@ END
     name: "right_node_contribs_list"
     description: <<END
 A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-END
-  }
-  attr {
-    name: "l1"
-    description: <<END
-l1 regularization factor on leaf weights, per instance based.
-END
-  }
-  attr {
-    name: "l2"
-    description: <<END
-l2 regularization factor on leaf weights, per instance based.
-END
-  }
-  attr {
-    name: "tree_complexity"
-    description: <<END
-adjustment to the gain, per leaf based.
 END
   }
   attr {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
\ No newline at end of file
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
index b23e77a1fa..60ad9b4640 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -25,12 +25,6 @@ END
     description: <<END
 scalar, dimension of the logits, to be used for partial logits
 shape.
-END
-  }
-  attr {
-    name: "max_depth"
-    description: <<END
-scalar, max depth of trees. To be used for parallelization costs.
 END
   }
   summary: "Runs multiple additive regression ensemble predictors on input instances and"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
index 7203d3cb58..f8a3639c9b 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -52,12 +52,6 @@ END
     description: <<END
 scalar, dimension of the logits, to be used for partial logits
 shape.
-END
-  }
-  attr {
-    name: "max_depth"
-    description: <<END
-scalar, max depth of trees. To be used for parallelization costs.
 END
   }
   summary: "Runs multiple additive regression ensemble predictors on input instances and"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
index 00f8953875..3cf486d087 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -51,13 +51,13 @@ of the feature's splits. Will be added to the previous node values to constitute
 the values of the right nodes.
 END
   }
-  attr {
+  in_arg {
     name: "max_depth"
     description: <<END
 Max depth of the tree to build.
 END
   }
-  attr {
+  in_arg {
     name: "learning_rate"
     description: <<END
 shrinkage const for each new tree.
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index b13a450546..1b5ce32b7b 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -50,7 +50,6 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
     OP_REQUIRES(context, logits_dimension_ == 1,
                 errors::InvalidArgument(
                     "Currently only one dimensional outputs are supported."));
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -155,9 +154,10 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           output_partial_logits(i, 0) = partial_all_logit;
         }
       };
-      // Assume we will not go over more than one full tree. 4 is a magic
-      // number.
-      const int64 cost = 4 * max_depth_;
+      // 30 is the magic number. The actual value might be a function of (the
+      // number of layers) * (cpu cycles spent on each layer), but this value
+      // would work for many cases. May be tuned later.
+      const int64 cost = 30;
       thread::ThreadPool* const worker_threads =
           context->device()->tensorflow_cpu_worker_threads()->workers;
       Shard(worker_threads->NumThreads(), worker_threads, batch_size,
@@ -168,7 +168,6 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
  private:
   int32 logits_dimension_;         // the size of the output prediction vector.
   int32 num_bucketized_features_;  // Indicates the number of features.
-  int32 max_depth_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BoostedTreesTrainingPredict").Device(DEVICE_CPU),
@@ -186,7 +185,6 @@ class BoostedTreesPredictOp : public OpKernel {
     OP_REQUIRES(context, logits_dimension_ == 1,
                 errors::InvalidArgument(
                     "Currently only one dimensional outputs are supported."));
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -243,7 +241,10 @@ class BoostedTreesPredictOp : public OpKernel {
         output_logits(i, 0) = tree_logit;
       }
     };
-    const int64 cost = (latest_tree + 1) * max_depth_;
+    // 10 is the magic number. The actual number might depend on (the number of
+    // layers in the trees) and (cpu cycles spent on each layer), but this
+    // value would work for many cases. May be tuned later.
+    const int64 cost = (latest_tree + 1) * 10;
     thread::ThreadPool* const worker_threads =
         context->device()->tensorflow_cpu_worker_threads()->workers;
     Shard(worker_threads->NumThreads(), worker_threads, batch_size,
@@ -254,7 +255,6 @@ class BoostedTreesPredictOp : public OpKernel {
   int32
       logits_dimension_;  // Indicates the size of the output prediction vector.
   int32 num_bucketized_features_;  // Indicates the number of features.
-  int32 max_depth_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BoostedTreesPredict").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 16e65cf284..40f50333d3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -29,10 +29,6 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   explicit BoostedTreesCalculateBestGainsPerFeatureOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("l1", &l1_));
-    OP_REQUIRES_OK(context, context->GetAttr("l2", &l2_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("tree_complexity", &tree_complexity_));
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
   }
@@ -54,6 +50,16 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     for (const auto& tensor : stats_summary_list) {
       stats_summary.emplace_back(tensor.tensor<float, 3>());
     }
+    const Tensor* l1_t;
+    OP_REQUIRES_OK(context, context->input("l1", &l1_t));
+    const auto l1 = l1_t->scalar<float>()();
+    const Tensor* l2_t;
+    OP_REQUIRES_OK(context, context->input("l2", &l2_t));
+    const auto l2 = l2_t->scalar<float>()();
+    const Tensor* tree_complexity_t;
+    OP_REQUIRES_OK(context,
+                   context->input("tree_complexity", &tree_complexity_t));
+    const auto tree_complexity = tree_complexity_t->scalar<float>()();
 
     // Allocate output lists of tensors:
     OpOutputList output_node_ids_list;
@@ -106,7 +112,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
         // Parent gain.
         float parent_gain;
         float unused;
-        CalculateWeightsAndGains(total_grad, total_hess, &unused, &parent_gain);
+        CalculateWeightsAndGains(total_grad, total_hess, l1, l2, &unused,
+                                 &parent_gain);
 
         for (int bucket = 0; bucket < num_buckets; ++bucket) {
           const float cum_grad_bucket = cum_grad[bucket];
@@ -114,13 +121,13 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
           // Left child.
           float contrib_for_left;
           float gain_for_left;
-          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket,
+          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket, l1, l2,
                                    &contrib_for_left, &gain_for_left);
           // Right child.
           float contrib_for_right;
           float gain_for_right;
           CalculateWeightsAndGains(total_grad - cum_grad_bucket,
-                                   total_hess - cum_hess_bucket,
+                                   total_hess - cum_hess_bucket, l1, l2,
                                    &contrib_for_right, &gain_for_right);
 
           if (gain_for_left + gain_for_right > best_gain) {
@@ -173,7 +180,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       for (int i = 0; i < num_nodes; ++i) {
         output_node_ids_vec(i) = output_node_ids[i];
         // Adjust the gains to penalize by tree complexity.
-        output_gains_vec(i) = output_gains[i] - tree_complexity_;
+        output_gains_vec(i) = output_gains[i] - tree_complexity;
         output_thresholds_vec(i) = output_thresholds[i];
         // Logits are 1-dimensional for now.
         // TODO(nponomareva): Consider multi-dimensional logits.
@@ -184,8 +191,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   }
 
  private:
-  void CalculateWeightsAndGains(const float g, const float h, float* weight,
-                                float* gain) {
+  void CalculateWeightsAndGains(const float g, const float h, const float l1,
+                                const float l2, float* weight, float* gain) {
     //
     // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
     // (g+l1*sgn(w))^2/(h+l2).
@@ -196,11 +203,11 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
     // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
     // For g from (-l1, l1), thus there is no solution => set to 0.
-    if (l1_ > 0) {
-      if (g > l1_) {
-        g_with_l1 -= l1_;
-      } else if (g < -l1_) {
-        g_with_l1 += l1_;
+    if (l1 > 0) {
+      if (g > l1) {
+        g_with_l1 -= l1;
+      } else if (g < -l1) {
+        g_with_l1 += l1;
       } else {
         *weight = 0.0;
         *gain = 0.0;
@@ -208,19 +215,16 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       }
     }
     // Apply L2 regularization.
-    if (h + l2_ <= kEps) {
+    if (h + l2 <= kEps) {
       // Avoid division by 0 or infinitesimal.
       *weight = 0;
       *gain = 0;
     } else {
-      *weight = -g_with_l1 / (h + l2_);
+      *weight = -g_with_l1 / (h + l2);
       *gain = -g_with_l1 * (*weight);
     }
   }
 
-  float l1_;
-  float l2_;
-  float tree_complexity_;
   int max_splits_;
   int num_features_;
 };
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 67cac14c52..a14fd4a133 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -43,8 +43,6 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
-    OP_REQUIRES_OK(context, context->GetAttr("learning_rate", &learning_rate_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
 
     int32 pruning_index;
@@ -79,8 +77,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
     const Tensor* feature_ids_t;
     OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+    const auto feature_ids = feature_ids_t->vec<int32>();
 
-    auto feature_ids = feature_ids_t->vec<int32>();
+    const Tensor* max_depth_t;
+    OP_REQUIRES_OK(context, context->input("max_depth", &max_depth_t));
+    const auto max_depth = max_depth_t->scalar<int32>()();
+
+    const Tensor* learning_rate_t;
+    OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    const auto learning_rate = learning_rate_t->scalar<float>()();
 
     // Find best splits for each active node.
     std::map<int32, SplitCandidate> best_splits;
@@ -125,10 +130,10 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       // For now assume that the weights vectors are one dimensional.
       // TODO(nponomareva): change here for multiclass.
       const float left_contrib =
-          learning_rate_ *
+          learning_rate *
           left_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
       const float right_contrib =
-          learning_rate_ *
+          learning_rate *
           right_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
 
       // unused.
@@ -145,7 +150,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
-      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth) {
         // If the tree is finalized, next growing will start from node 0;
         node_id_start = 0;
         node_id_end = 1;
@@ -216,8 +221,6 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
  private:
   int32 num_features_;
-  float learning_rate_;
-  int32 max_depth_;
   PruningMode pruning_mode_;
 };
 
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 8af4903418..4d74e6d63a 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -37,9 +37,9 @@ REGISTER_OP("IsBoostedTreesEnsembleInitialized")
 REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
     .Input("node_id_range: int32")
     .Input("stats_summary_list: num_features * float32")
-    .Attr("l1: float")
-    .Attr("l2: float")
-    .Attr("tree_complexity: float")
+    .Input("l1: float")
+    .Input("l2: float")
+    .Input("tree_complexity: float")
     .Attr("max_splits: int >= 1")
     .Attr("num_features: int >= 1")  // not passed but populated automatically.
     .Output("node_ids_list: num_features * int32")
@@ -51,19 +51,6 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
       // Confirms the rank of the inputs and sets the shape of the outputs.
       int max_splits;
       int num_features;
-      float l1, l2, tree_complexity;
-      TF_RETURN_IF_ERROR(c->GetAttr("l1", &l1));
-      if (l1 < 0) {
-        return errors::InvalidArgument("l1 must be non-negative.");
-      }
-      TF_RETURN_IF_ERROR(c->GetAttr("l2", &l2));
-      if (l2 < 0) {
-        return errors::InvalidArgument("l2 must be non-negative.");
-      }
-      TF_RETURN_IF_ERROR(c->GetAttr("tree_complexity", &tree_complexity));
-      if (tree_complexity < 0) {
-        return errors::InvalidArgument("Tree complexity must be non-negative.");
-      }
       TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
       TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
       shape_inference::ShapeHandle node_id_range_shape;
@@ -83,6 +70,12 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
         TF_RETURN_IF_ERROR(
             c->Merge(summary_shape_base, summary_shape, &unused_shape));
       }
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 1), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 2), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 3), 0, &unused_shape));
       // Sets the output lists.
       std::vector<shape_inference::ShapeHandle> output_shapes_vec(
           num_features, c->MakeShape({-1}));
@@ -185,9 +178,8 @@ REGISTER_OP("BoostedTreesMakeStatsSummary")
 REGISTER_OP("BoostedTreesPredict")
     .Input("tree_ensemble_handle: resource")
     .Input("bucketized_features: num_bucketized_features * int32")
-    .Attr("num_bucketized_features: int >= 1")
+    .Attr("num_bucketized_features: int >= 1")  // Inferred.
     .Attr("logits_dimension: int")
-    .Attr("max_depth: int >= 1")
     .Output("logits: float")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle feature_shape;
@@ -229,7 +221,6 @@ REGISTER_OP("BoostedTreesTrainingPredict")
     .Input("bucketized_features: num_bucketized_features * int32")
     .Attr("num_bucketized_features: int >= 1")
     .Attr("logits_dimension: int")
-    .Attr("max_depth: int >= 1")
     .Output("partial_logits: float")
     .Output("tree_ids: int32")
     .Output("node_ids: int32")
@@ -239,9 +230,6 @@ REGISTER_OP("BoostedTreesTrainingPredict")
       TF_RETURN_IF_ERROR(
           c->GetAttr("num_bucketized_features", &num_bucketized_features));
 
-      int max_depth;
-      TF_RETURN_IF_ERROR(c->GetAttr("max_depth", &max_depth));
-
       shape_inference::ShapeHandle unused_input;
       for (int i = 0; i < num_bucketized_features; ++i) {
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 3), 1, &feature_shape));
@@ -273,8 +261,8 @@ REGISTER_OP("BoostedTreesUpdateEnsemble")
     .Input("thresholds: num_features * int32")
     .Input("left_node_contribs: num_features * float")
     .Input("right_node_contribs: num_features * float")
-    .Attr("max_depth: int >= 1")
-    .Attr("learning_rate: float")
+    .Input("max_depth: int32")
+    .Input("learning_rate: float")
     .Attr("pruning_mode: int >=0")
     .Attr("num_features: int >= 0")  // Inferred.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 083119662b..0af560010f 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10855,6 +10855,18 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "tree_complexity"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
@@ -10880,18 +10892,6 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
-    name: "l1"
-    type: "float"
-  }
-  attr {
-    name: "l2"
-    type: "float"
-  }
-  attr {
-    name: "tree_complexity"
-    type: "float"
-  }
   attr {
     name: "max_splits"
     type: "int"
@@ -11054,12 +11054,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -11119,12 +11113,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -11162,15 +11150,13 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
+  input_arg {
     name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "learning_rate"
-    type: "float"
+    type: DT_FLOAT
   }
   attr {
     name: "pruning_mode"
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 0ecc8c7089..d099d308f5 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -325,8 +325,7 @@ def _bt_model_fn(
           # so no local copy is needed; using tree_ensemble directly.
           tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
-          logits_dimension=head.logits_dimension,
-          max_depth=tree_hparams.max_depth)
+          logits_dimension=head.logits_dimension)
     else:
       if is_single_machine:
         local_tree_ensemble = tree_ensemble
@@ -361,8 +360,7 @@ def _bt_model_fn(
             cached_tree_ids=cached_tree_ids,
             cached_node_ids=cached_node_ids,
             bucketized_features=input_feature_list,
-            logits_dimension=head.logits_dimension,
-            max_depth=tree_hparams.max_depth)
+            logits_dimension=head.logits_dimension)
       logits = cached_logits + partial_logits
 
     # Create training graph.
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index d132f15e51..54f33f3360 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -49,7 +49,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -116,7 +115,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values],
@@ -189,7 +187,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=4,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -299,7 +296,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=4,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -429,7 +425,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -562,7 +557,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=3,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -705,7 +699,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=3,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -782,7 +775,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=1,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -905,8 +897,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1,
-          max_depth=2)
+          logits_dimension=1)
 
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
@@ -915,8 +906,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1,
-          max_depth=2)
+          logits_dimension=1)
 
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
-- 
GitLab


From 35b8a8cfebe910687f3cc038c00a6e33ba09637a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 1 Apr 2018 02:01:47 +0000
Subject: [PATCH 0869/1262] Fix the issue with Bahdanau attention when
 normalized=True and dtype = float16/32

While revisiting 18016 I noticed that Bahdanau attention has a similiar
dtype mismatch issue when normalized=True. The issue comes from:
```
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
         initializer=math.sqrt((1. / num_units)))
```
where the initializer value does not work well with differnt dtype.

This fix converts changes the initializer to `init_ops.constant_initializer`
to address the issue, and adds additional test cases for it.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 9ba541ce23..867e49b565 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -472,7 +472,7 @@ def _bahdanau_score(processed_query, keys, normalize):
     # Scalar used in weight normalization
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
-        initializer=math.sqrt((1. / num_units)))
+        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))), shape=())
     # Bias added prior to the nonlinearity
     b = variable_scope.get_variable(
         "attention_b", [num_units], dtype=dtype,
-- 
GitLab


From fe4ab63ab258d67f37844f374db265130ceecf2a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 1 Apr 2018 02:07:30 +0000
Subject: [PATCH 0870/1262] Add test case for Bahdanau attention when
 normalized=True and dtype = float16/32

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../kernel_tests/attention_wrapper_test.py    | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 84a7b45b5a..6781433a1f 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -281,6 +281,41 @@ class AttentionWrapperTest(test.TestCase):
             expected_final_alignment_history,
             final_alignment_history_info)
 
+  def testBahdanauNormalizedDType(self):
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.BahdanauAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          normalize=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
-- 
GitLab


From d744b314682d2313bd3e8ffe0b34e022cbeacb7b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 1 Apr 2018 02:08:53 +0000
Subject: [PATCH 0871/1262] Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 867e49b565..a0f57417b8 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -472,7 +472,8 @@ def _bahdanau_score(processed_query, keys, normalize):
     # Scalar used in weight normalization
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
-        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))), shape=())
+        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
+        shape=())
     # Bias added prior to the nonlinearity
     b = variable_scope.get_variable(
         "attention_b", [num_units], dtype=dtype,
-- 
GitLab


From 457eaab8d9a3a08de57b5b2f11bf36a5030c2304 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 13:04:23 -0700
Subject: [PATCH 0872/1262] Simplify the implementation of break_statements.py

PiperOrigin-RevId: 193086371
---
 .../autograph/converters/break_statements.py  | 92 +++++++------------
 1 file changed, 33 insertions(+), 59 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 62115d4005..5dfb7a59d5 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gast
-
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
@@ -35,86 +33,62 @@ class BreakCanonicalizationTransformer(transformer.Base):
     # Each item is a list [break_used, break_variable_name]
     self.break_uses = []
 
-  def _create_break_check(self):
-    template = """
-      (not var_name)
-    """
-    expr, = templates.replace(template, var_name=self.break_uses[-1][1])
-    return expr.value
-
-  def _create_break_trigger(self):
+  def visit_Break(self, node):
+    self.break_uses[-1][0] = True
     template = """
       var_name = True
+      continue
     """
-    block = templates.replace(template, var_name=self.break_uses[-1][1])
-    block.append(gast.Continue())
-    return block
-
-  def _create_break_init(self):
-    template = """
-      var_name = False
-    """
-    assign, = templates.replace(template, var_name=self.break_uses[-1][1])
-    return assign
-
-  # TODO(mdan): Surely the transformer supports this better?
-  def _manual_visit_list(self, block):
-    new_block = []
-    for n in block:
-      new_n = self.visit(n)
-      if isinstance(new_n, list):
-        new_block.extend(new_n)
-      else:
-        new_block.append(new_n)
-    return new_block
+    return templates.replace(template, var_name=self.break_uses[-1][1])
 
   def visit_While(self, node):
-    self.generic_visit(node.test)
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-
     break_var = self.context.namer.new_symbol('break_requested',
                                               scope.referenced)
+
     self.break_uses.append([False, break_var])
-    node.body = self._manual_visit_list(node.body)
+    node = self.generic_visit(node)
     if self.break_uses[-1][0]:
-      node.test = gast.BoolOp(gast.And(), [
-          node.test,
-          gast.UnaryOp(gast.Not(), gast.Name(break_var, gast.Load(), None))
-      ])
-      final_nodes = [self._create_break_init(), node]
-    else:
-      final_nodes = node
+      template = """
+        var_name = False
+        while original_test and not var_name:
+          original_body
+        else:
+          original_orelse
+      """
+      node = templates.replace(
+          template,
+          var_name=break_var,
+          original_test=node.test,
+          original_body=node.body,
+          original_orelse=node.orelse)
     self.break_uses.pop()
 
-    for n in node.orelse:
-      self.generic_visit(n)
-    return final_nodes
+    return node
 
   def visit_For(self, node):
-    self.generic_visit(node.target)
-    self.generic_visit(node.iter)
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-
     break_var = self.context.namer.new_symbol('break_requested',
                                               scope.referenced)
+
     self.break_uses.append([False, break_var])
-    node.body = self._manual_visit_list(node.body)
+    node = self.generic_visit(node)
     if self.break_uses[-1][0]:
+      template = """
+        var_name = False
+        original_for
+      """
+      node = templates.replace(
+          template,
+          var_name=break_var,
+          original_for=node)
       extra_cond = templates.replace_as_expression(
           'not var_name', var_name=break_var)
-      anno.setanno(node, 'extra_cond', extra_cond)
-      final_nodes = [self._create_break_init(), node]
-    else:
-      final_nodes = node
+      new_for_node = node[1]
+      anno.setanno(new_for_node, 'extra_cond', extra_cond)
     self.break_uses.pop()
 
-    for n in node.orelse:
-      self.generic_visit(n)
-    return final_nodes
-
-  def visit_Break(self, node):
-    self.break_uses[-1][0] = True
-    return self._create_break_trigger()
+    return node
 
 
 def transform(node, context):
-- 
GitLab


From c6903c9a35d5035e9c26931571124bda4977bb57 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 16:27:50 +0000
Subject: [PATCH 0873/1262] Update gemmlowp version for cmake build

The gemmlowp has been updated in bazel, though
cmake version was not updated. This fix updates
gemmlowp in cmake so that cmake and bazel versions
are synced.

The fix has been tested on Linux:
```
tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cmake/external/gemmlowp.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index a235442dc5..cdaa6b73b9 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
-set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip)
+set(gemmlowp_HASH SHA256=b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
-- 
GitLab


From 17b1f7a17441ead6460dd9a14885df9d1af76870 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 19:25:05 +0000
Subject: [PATCH 0874/1262] Update gemmlowp on bazel to sync cmake changes

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f775491e4a..5746f32826 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -167,11 +167,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip",
-          "https://github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip",
+          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+          "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
-      sha256 = "b852cc90259a7357c8a323f108f2cec6e85979fc3b18b5590b99e0130044b2cf",
-      strip_prefix = "gemmlowp-7c7c744640ddc3d0af18fb245b4d23228813a71b",
+      sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
+      strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98",
   )
 
   tf_http_archive(
-- 
GitLab


From 3fea5138e87f59db0342165a75d0b475c8d36f83 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 19:40:43 +0000
Subject: [PATCH 0875/1262] Temporary comment out mirror.bazel.build for
 gemmlowp

Will reenable once the mirror is propagated.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/workspace.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5746f32826..2a85be08e7 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -167,7 +167,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+          # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
           "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
       sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
-- 
GitLab


From acb3239130d8810bba60011f6d3bddbf5d67c1df Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 20:02:46 +0000
Subject: [PATCH 0876/1262] Add gemmlowp to whitelist

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 third_party/repo.bzl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index aa178fa8ca..8202dafac8 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -17,6 +17,7 @@
 _SINGLE_URL_WHITELIST = depset([
     "arm_compiler",
     "ortools_archive",
+    "gemmlowp",
 ])
 
 def _is_windows(ctx):
-- 
GitLab


From daf9ef0c4016350d67de03125eb1d45f6c48edf3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 13:22:58 -0700
Subject: [PATCH 0877/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193089301
---
 tensorflow/core/ops/ops.pbtxt | 46 ++++++++++++-----------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 4c483125cc..1659adc9fe 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4009,6 +4009,18 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "tree_complexity"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
@@ -4034,18 +4046,6 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
-    name: "l1"
-    type: "float"
-  }
-  attr {
-    name: "l2"
-    type: "float"
-  }
-  attr {
-    name: "tree_complexity"
-    type: "float"
-  }
   attr {
     name: "max_splits"
     type: "int"
@@ -4208,12 +4208,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -4273,12 +4267,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -4316,15 +4304,13 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
+  input_arg {
     name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "learning_rate"
-    type: "float"
+    type: DT_FLOAT
   }
   attr {
     name: "pruning_mode"
-- 
GitLab


From c877eb3fcdff70ed43bfbd54df9eb678e3268eb5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 13:32:12 -0700
Subject: [PATCH 0878/1262] Adding several utility functions to TF2XLA to help
 with the Cholesky refactor.  Mainly responsible for handling batching
 properly.

PiperOrigin-RevId: 193090634
---
 tensorflow/compiler/tf2xla/lib/BUILD        |  24 ++++
 tensorflow/compiler/tf2xla/lib/util.cc      |  63 ++++++++-
 tensorflow/compiler/tf2xla/lib/util.h       |  30 +++-
 tensorflow/compiler/tf2xla/lib/util_test.cc | 145 ++++++++++++++++++++
 4 files changed, 258 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/compiler/tf2xla/lib/util_test.cc

diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 344773c8c5..ea6e1a4c89 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -126,6 +126,30 @@ cc_library(
     ],
 )
 
+xla_test(
+    name = "util_test",
+    srcs = ["util_test.cc"],
+    deps = [
+        ":batch_dot",
+        ":util",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "while_loop",
     srcs = ["while_loop.cc"],
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index f579669bbd..31d823ca33 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -140,13 +140,47 @@ xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
   return builder->Slice(x, padded_start, padded_end, strides);
 }
 
+std::vector<int64> PrependMajorDims(xla::ComputationBuilder* builder,
+                                    const gtl::ArraySlice<int64>& major_dims,
+                                    const gtl::ArraySlice<int64>& indices) {
+  std::vector<int64> output(indices.size() + major_dims.size());
+  std::copy(major_dims.begin(), major_dims.end(), output.begin());
+  std::copy(indices.begin(), indices.end(), output.begin() + major_dims.size());
+  return output;
+}
+
+xla::StatusOr<xla::ComputationDataHandle> DynamicSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts,
+    const gtl::ArraySlice<int64>& sizes) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  int64 n_minor_dims = starts.size();
+  TF_RET_CHECK(n_minor_dims == sizes.size());
+  TF_RET_CHECK(n_minor_dims <= n_dims);
+  gtl::ArraySlice<int64> major_dims(xla::AsInt64Slice(shape->dimensions()),
+                                    /*pos=*/0,
+                                    /*len=*/n_dims - sizes.size());
+  TF_ASSIGN_OR_RETURN(auto padded_starts,
+                      PrependZerosInMajorDims(builder, x, starts));
+  auto padded_sizes = PrependMajorDims(builder, major_dims, sizes);
+  return builder->DynamicSlice(x, padded_starts, padded_sizes);
+}
+
 xla::StatusOr<xla::ComputationDataHandle> UpdateSlice(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start) {
   // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
   std::vector<int32> start_as_int32(start.begin(), start.end());
-  return builder->DynamicUpdateSlice(
-      x, update, builder->ConstantR1<int32>(start_as_int32));
+  auto start_constant = builder->ConstantR1<int32>(start_as_int32);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> start_constant_shape,
+                      builder->GetShape(start_constant));
+  const int64 start_length =
+      xla::ShapeUtil::GetDimension(*start_constant_shape, -1);
+  TF_RET_CHECK(start_length == n_dims);
+  return builder->DynamicUpdateSlice(x, update, start_constant);
 }
 
 xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
@@ -162,6 +196,29 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
   return UpdateSlice(builder, x, update, padded_start);
 }
 
+xla::StatusOr<xla::ComputationDataHandle> DynamicUpdateSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update,
+    const std::vector<xla::ComputationDataHandle>& starts) {
+  TF_ASSIGN_OR_RETURN(auto padded_starts,
+                      PrependZerosInMajorDims(builder, x, starts));
+  return builder->DynamicUpdateSlice(x, update, padded_starts);
+}
+
+xla::StatusOr<xla::ComputationDataHandle> PrependZerosInMajorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  auto zero = builder->Reshape(builder->ConstantR0<int32>(0), {1});
+  std::vector<xla::ComputationDataHandle> padded_starts(n_dims, zero);
+  for (int i = 0; i < starts.size(); ++i) {
+    padded_starts[n_dims - starts.size() + i] =
+        builder->Reshape(starts[i], {1});
+  }
+  return builder->ConcatInDim(padded_starts, 0);
+}
+
 xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 51f8baaf00..b684123f13 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -32,16 +32,39 @@ xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
 xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
                                         xla::PrimitiveType type, double value);
 
+// Makes a 1D tensor [0, ..., x, y] from two tensors x and y with zeros
+// prepended until the array is length n_dims.
+xla::ComputationDataHandle PrependZerosInMajorDims(
+    xla::ComputationBuilder* builder,
+    gtl::ArraySlice<xla::ComputationDataHandle> starts);
+
 // Returns a integer scalar constant of 'type' with 'value'.
 // If 'type' is complex, returns a real value with zero imaginary component.
 xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder,
                                           xla::PrimitiveType type, int64 value);
 
+// Builds a vector of zeros of length rank(x) with the last two values being
+// those in `starts`.
+xla::StatusOr<xla::ComputationDataHandle> PrependZerosInMajorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts);
+
 // Performs a slice in the minor dimensions of a Tensor.
 xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     gtl::ArraySlice<int64> start, gtl::ArraySlice<int64> end);
 
+// Builds a 1-d vector out of a concatenation of `major_dims` and `starts`.
+std::vector<int64> PrependMajorDims(xla::ComputationBuilder* builder,
+                                    const gtl::ArraySlice<int64>& major_dims,
+                                    const gtl::ArraySlice<int64>& indices);
+
+// Performs a dynamic slice in the minor dimensions of a Tensor.
+xla::StatusOr<xla::ComputationDataHandle> DynamicSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const std::vector<xla::ComputationDataHandle>& starts,
+    const gtl::ArraySlice<int64>& sizes);
+
 // Updates a slice of 'x', i.e.,
 // x[start[0], ..., start[n]] = update
 xla::StatusOr<xla::ComputationDataHandle> UpdateSlice(
@@ -54,6 +77,11 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
 
+xla::StatusOr<xla::ComputationDataHandle> DynamicUpdateSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update,
+    const std::vector<xla::ComputationDataHandle>& starts);
+
 // Transposes a stack of matrices `x` by swapping the last two dimensions.
 xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x);
diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc
new file mode 100644
index 0000000000..b6bd33af2e
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/util_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+using UtilTest = xla::ClientLibraryTestBase;
+using UtilLeftLookingTest = xla::ClientLibraryTestBase;
+
+xla::Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<float> AValsFull() {
+  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+xla::Array3D<float> BatchedAValsFull() {
+  return {{
+              {2, 0, 1, 2},
+              {3, 6, 0, 1},
+              {4, 7, 9, 0},
+              {5, 8, 10, 11},
+          },
+          {
+              {16, 24, 8, 12},
+              {24, 61, 82, 48},
+              {8, 82, 456, 106},
+              {12, 48, 106, 62},
+          }};
+}
+
+XLA_TEST_F(UtilTest, Simple2dLookup) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, x, y;
+  auto a_data = CreateR2Parameter<float>(BValsRight(), 0, "a", &builder, &a);
+  auto x_data = CreateR0Parameter<int>(2, 1, "x", &builder, &x);
+  auto y_data = CreateR0Parameter<int>(1, 2, "y", &builder, &y);
+  auto result = DynamicSliceInMinorDims(&builder, a, {x, y}, {1, 1});
+  TF_ASSERT_OK(result.status());
+
+  ComputeAndCompareR2<float>(&builder, {{10}},
+                             {a_data.get(), x_data.get(), y_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(UtilTest, Simple3dLookup) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto index_data = CreateR0Parameter<int>(1, 1, "index", &builder, &index);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto l_index,
+      DynamicSliceInMinorDims(&builder, a,
+                              {index, builder.ConstantR0<int32>(0)}, {1, 4}));
+
+  ComputeAndCompareR3<float>(&builder, {{{3, 6, 0, 1}}, {{24, 61, 82, 48}}},
+                             {a_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b, x, y;
+  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>({{9, 1, -10}}, 1, "b", &builder, &b);
+  auto x_data = CreateR0Parameter<int>(2, 2, "x", &builder, &x);
+  auto y_data = CreateR0Parameter<int>(1, 3, "y", &builder, &y);
+
+  auto result = DynamicUpdateSliceInMinorDims(&builder, a, b, {x, y});
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected(
+      {{{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 9, 1, -10}, {5, 8, 10, 11}}});
+
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
+}
+
+XLA_TEST_F(UtilTest, RowBatchDot) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  int n = 4;
+
+  xla::ComputationDataHandle a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto l_index,
+      DynamicSliceInMinorDims(&builder, a,
+                              {index, builder.ConstantR0<int32>(0)}, {1, n}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto dot, BatchDot(&builder, l_index, row,
+                         /*transpose_x=*/false, /*transpose_y=*/true));
+
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From d54acdc8f669079301e4858b7675f2bbedea8190 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 16 Apr 2018 13:36:55 -0700
Subject: [PATCH 0879/1262] [XLA] Document and enforce reduction order of init
 value

All existing backends apply the init_value on the lhs, except for the evaluator.  This causes problems for reductions which apply an identity function to a reduce or reduce window.

PiperOrigin-RevId: 193091323
---
 .../xla/service/algebraic_simplifier.cc       |  2 +-
 .../xla/service/cpu/elemental_ir_emitter.cc   |  2 +-
 .../compiler/xla/service/hlo_evaluator.cc     |  6 ++--
 tensorflow/compiler/xla/tests/reduce_test.cc  | 32 +++++++++++++++++++
 .../compiler/xla/tests/reduce_window_test.cc  | 17 ++++++++++
 .../performance/xla/operation_semantics.md    | 19 +++++------
 6 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index cd5737e4f9..8d26938c6e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1695,7 +1695,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
         HloInstruction::CreateReshape(reduce->shape(), arg));
     return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateMap(reduce->shape(),
-                                          {reshape, init_value}, function));
+                                          {init_value, reshape}, function));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index 99c5e16db7..e97113dfa0 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -115,7 +115,7 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
       for (int i = 0; i < hlo->operand_count(); i++) {
         TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
                             operand_to_generator.at(hlo->operand(i))(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
+                                ElementwiseSourceIndex(index, *hlo, i)));
         operands.push_back(operand_value);
       }
       return ir_emitter_->EmitScalarCall(hlo->shape().element_type(),
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index b4f9a9db9c..52bc2c0448 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1604,8 +1604,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             // Evaluate computation with specified literal operands.
             auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
             auto result_val_literal = Literal::CreateR0<ReturnT>(result_val);
-            std::vector<const Literal*> args = {curr_val_literal.get(),
-                                                result_val_literal.get()};
+            std::vector<const Literal*> args = {result_val_literal.get(),
+                                                curr_val_literal.get()};
 
             std::unique_ptr<Literal> computed_result =
                 embedded_evaluator.Evaluate<const Literal*>(*function, args)
@@ -1804,7 +1804,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 const auto result_val_literal =
                     Literal::CreateR0<ReturnT>(result_val);
                 const std::vector<const Literal*> args = {
-                    curr_val_literal.get(), result_val_literal.get()};
+                    result_val_literal.get(), curr_val_literal.get()};
                 std::unique_ptr<Literal> computed_result =
                     embedded_evaluator.Evaluate<const Literal*>(*function, args)
                         .ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 768beec15e..423ccadb5b 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -934,5 +935,36 @@ XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) {
   DoTest<uint64>(1234556789123, 1024);
 }
 
+// Test the operational semantic that the init value is passed on the lhs for
+// reduces. Can be tested by performing an "identity" reduce (that simply
+// returns one of the parameters). In this case, we return the rhs, which for
+// a 1D array with one element, should not be the init value.
+XLA_TEST_F(ReduceTest, ReduceIdentity) {
+  ComputationBuilder builder(client_, TestName());
+  Shape single_float = ShapeUtil::MakeShape(F32, {});
+  builder.Parameter(0, single_float, "lhs-unused");
+  builder.Parameter(1, single_float, "rhs-used");
+  auto computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  Shape operand_shape = ShapeUtil::MakeShape(F32, {1});
+  builder.Reduce(builder.Parameter(0, operand_shape, "operand"),
+                 builder.Parameter(1, single_float, "init"),
+                 computation_status.ValueOrDie(), {0});
+
+  float operand[] = {42.0f};
+  float init = 58.5f;
+  float expected = 42.0f;
+  std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(operand);
+  std::unique_ptr<GlobalData> input_global_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  std::unique_ptr<Literal> input_literal2 = Literal::CreateR0<float>(init);
+  std::unique_ptr<GlobalData> input_global_data2 =
+      client_->TransferToServer(*input_literal2).ConsumeValueOrDie();
+  ComputeAndCompareR0<float>(
+      &builder, expected, {input_global_data.get(), input_global_data2.get()},
+      ErrorSpec(0.0001));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 6a054a5dd3..0a09766722 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -1435,5 +1435,22 @@ ENTRY R3Window {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
+TEST_F(HloTestBase, ReduceWindowIdentity) {
+  const string& hlo_string = R"(
+HloModule ReduceWindowIdentity
+identity.pad_to_reduce_window {
+  param0 = f32[] parameter(0)
+  ROOT param1 = f32[] parameter(1)
+}
+ENTRY reduce-window-identity {
+  operand = f32[1,32,64]{2,1,0} parameter(0)
+  constant.4466 = f32[] constant(0)
+  ROOT reduce-window = f32[1,33,64]{2,1,0} reduce-window(operand, constant.4466),     window={size=1x1x1 pad=0_0x1_0x0_0}, to_apply=identity.pad_to_reduce_window
+}
+
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 3963d5faa7..8373a1219d 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -1417,12 +1417,12 @@ Applies a reduction function to an array.
 | `dimensions`  | `int64` array           | unordered array of dimensions to |
 :               :                         : reduce                           :
 
-Conceptually, this operation reduces one or more dimensions in the input array
-into scalars. The rank of the result array is `rank(operand) - len(dimensions)`.
-`init_value` is the initial value used for every reduction and may also be
-inserted anywhere during computation if the back-end chooses to do so. So in
-most cases `init_value` should be an identity of the reduction function (for
-example, 0 for addition).
+This operation reduces one or more dimensions of the input array into scalars.
+The rank of the returned array is `rank(operand) - len(dimensions)`.
+`init_value` is the initial value used for every reduction and may be inserted
+anywhere during computation by the back-end. In most cases, `init_value` is an
+identity of the reduction function (for example, 0 for addition). The applied
+`computation` is always passed the `init_value` on the left-hand side.
 
 The evaluation order of the reduction function is arbitrary and may be
 non-deterministic. Therefore, the reduction function should not be overly
@@ -1442,8 +1442,7 @@ could be computed as
 
 but there are also many other possibilities, e.g.
 
-`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(13,
-init_value))))`
+`f(init_value, f(f(10, f(init_value, 11)), f(f(init_value, 12), f(init_value, 13))))`
 
 The following is a rough pseudo-code example of how reduction could be
 implemented, using summation as the reduction computation with an initial value
@@ -1561,7 +1560,9 @@ See also
 Applies a reduction function to all elements in each window of the input
 multi-dimensional array, producing an output multi-dimensional array with the
 same number of elements as the number of valid positions of the window. A
-pooling layer can be expressed as a `ReduceWindow`.
+pooling layer can be expressed as a `ReduceWindow`. Similar to
+[`Reduce`](#reduce), the applied `computation` is always passed the `init_value`
+on the left-hand side.
 
 <b> `ReduceWindow(operand, init_value, computation, window_dimensions,
 window_strides, padding)` </b>
-- 
GitLab


From 3d4cddf87d544f4f5868497caf5c6ab3e25aea2b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 13:54:03 -0700
Subject: [PATCH 0880/1262] Simplify the recursion when processing unpackings.

PiperOrigin-RevId: 193094078
---
 .../pyct/static_analysis/type_info.py         | 37 ++++++++++---------
 .../pyct/static_analysis/type_info_test.py    | 19 +++++++---
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 203aa3c3d1..a75ba7a272 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -168,16 +168,8 @@ class TypeInfoResolver(transformer.Base):
                      anno.getanno(definition, 'element_type'))
     return node
 
-  def _process_tuple_assignment(self, source, t):
-    for i, e in enumerate(t.elts):
-      if isinstance(e, gast.Tuple):
-        self._process_tuple_assignment(source, e)
-      else:
-        self.scope.setval(
-            anno.getanno(e, anno.Basic.QN),
-            gast.Subscript(source, gast.Index(i), ctx=gast.Store()))
-
   def _process_variable_assignment(self, source, targets):
+    # Special case: constructors.
     if isinstance(source, gast.Call):
       func = source.func
       if anno.hasanno(func, 'live_val'):
@@ -190,15 +182,26 @@ class TypeInfoResolver(transformer.Base):
           # We can have a whitelist of no-side-effects constructors.
           # We can also step inside the constructor and further analyze.
 
-    for t in targets:
-      if isinstance(t, gast.Tuple):
-        # need to recurse on the case of assigning nested tuples,
-        # ex. a, (b, c) = f()
-        self._process_tuple_assignment(source, t)
-      elif isinstance(t, (gast.Name, gast.Attribute)):
-        self.scope.setval(anno.getanno(t, anno.Basic.QN), source)
+    # Multiple targets mean multiple assignment.
+    for target in targets:
+      # Tuple target means unpacking.
+      if isinstance(target, gast.Tuple):
+        for i, target_item in enumerate(target.elts):
+          # Two cases here:
+          #   1. Static unpacking, e.g. a, b = c, d
+          #   2. Dynamic unpacking, e.g. a, b = c
+          # The former case is optimized away.
+          if isinstance(source, (gast.Tuple, gast.List)):
+            source_item = source.elts[i]
+          else:
+            source_item = gast.Subscript(source, gast.Index(i), ctx=None)
+          self._process_variable_assignment(source_item, (target_item,))
+      elif isinstance(target, (gast.Name, gast.Attribute)):
+        target_symbol = anno.getanno(target, anno.Basic.QN)
+        self.scope.setval(target_symbol, source)
       else:
-        raise ValueError('Dont know how to handle assignment to %s' % t)
+        raise ValueError(
+            'assignment target has unknown type: %s' % target_item)
 
   def visit_With(self, node):
     for wi in node.items:
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
index c0de4a6043..4f53923275 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
@@ -196,19 +196,26 @@ class TypeInfoResolverTest(test.TestCase):
     f_ref = node.body[0].body[1].value
     self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo)
 
-  def test_nested_assignment(self):
+  def test_nested_unpacking(self):
 
-    def test_fn(foo):
-      a, (b, c) = foo
+    class Foo(object):
+      pass
+
+    class Bar(object):
+      pass
+
+    def test_fn():
+      a, (b, c) = (Foo(), (Bar(), Foo()))
       return a, b, c
 
-    node = self._parse_and_analyze(test_fn, {'foo': (1, 2, 3)})
+    node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'Bar': Bar})
     lhs = node.body[0].body[1].value.elts
     a = lhs[0]
     b = lhs[1]
     c = lhs[2]
-    # TODO(mdan): change these once we have the live values propagating
-    # correctly
+    self.assertEquals(Foo, anno.getanno(a, 'type'))
+    self.assertEquals(Bar, anno.getanno(b, 'type'))
+    self.assertEquals(Foo, anno.getanno(c, 'type'))
     self.assertFalse(anno.hasanno(a, 'live_val'))
     self.assertFalse(anno.hasanno(b, 'live_val'))
     self.assertFalse(anno.hasanno(c, 'live_val'))
-- 
GitLab


From bc410d9c0133673e7b93a49487d7e14758cba280 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 14:13:52 -0700
Subject: [PATCH 0881/1262] Use fixed sized tensor arrays and max loop
 iterations in dynamic_decode if the user supplies it and if the inputs were
 created in an XLA context.

PiperOrigin-RevId: 193097293
---
 tensorflow/contrib/seq2seq/BUILD              |  8 +++-
 .../python/kernel_tests/decoder_test.py       |  4 ++
 .../contrib/seq2seq/python/ops/decoder.py     | 39 ++++++++++++++-----
 3 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index a62069a252..1a1591d798 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -3,9 +3,12 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
+package(default_visibility = [
+    "//learning/brain/google/xla/tests:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
 
-package(default_visibility = ["//tensorflow:__subpackages__"])
+exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
@@ -38,6 +41,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:check_ops",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:functional_ops",
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
index ac830ae98e..b549cbf568 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
@@ -92,14 +92,18 @@ class DynamicDecodeRNNTest(test.TestCase):
 
       # Mostly a smoke test
       time_steps = max_out
+      expected_length = sequence_length
       if maximum_iterations is not None:
         time_steps = min(max_out, maximum_iterations)
+        expected_length = [min(x, maximum_iterations) for x in expected_length]
       self.assertEqual(
           _t((batch_size, time_steps, cell_depth)),
           sess_results["final_outputs"].rnn_output.shape)
       self.assertEqual(
           _t((batch_size, time_steps)),
           sess_results["final_outputs"].sample_id.shape)
+      self.assertItemsEqual(expected_length,
+                            sess_results["final_sequence_length"])
 
   def testDynamicDecodeRNNBatchMajor(self):
     self._testDynamicDecodeRNN(time_major=False)
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index 898493662d..e69725ff8a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell_impl
@@ -181,6 +182,15 @@ def dynamic_decode(decoder,
     raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                     type(decoder))
 
+  def _is_xla_tensor(tensor):
+    try:
+      op = tensor.op
+    except AttributeError:
+      return False
+    if control_flow_util.IsInXLAContext(op):
+      return True
+    return False
+
   with variable_scope.variable_scope(scope, "decoder") as varscope:
     # Properly cache variable values inside the while_loop
     if varscope.caching_device is None:
@@ -198,6 +208,11 @@ def dynamic_decode(decoder,
                                         decoder.output_dtype,
                                         decoder.batch_size)
 
+    is_xla = False
+    if any([_is_xla_tensor(i) for i in nest.flatten(initial_inputs)]):
+      is_xla = True
+    if is_xla and maximum_iterations is None:
+      raise ValueError("maximum_iterations is required for XLA compilation.")
     if maximum_iterations is not None:
       initial_finished = math_ops.logical_or(
           initial_finished, 0 >= maximum_iterations)
@@ -215,11 +230,13 @@ def dynamic_decode(decoder,
                 batch_size, name="batch_size"))
         return tensor_shape.TensorShape([batch_size]).concatenate(from_shape)
 
+    dynamic_size = maximum_iterations is None or not is_xla
+
     def _create_ta(s, d):
       return tensor_array_ops.TensorArray(
           dtype=d,
-          size=0,
-          dynamic_size=True,
+          size=0 if dynamic_size else maximum_iterations,
+          dynamic_size=dynamic_size,
           element_shape=_shape(decoder.batch_size, s))
 
     initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size,
@@ -251,11 +268,8 @@ def dynamic_decode(decoder,
         next_finished = decoder_finished
       else:
         next_finished = math_ops.logical_or(decoder_finished, finished)
-      if maximum_iterations is not None:
-        next_finished = math_ops.logical_or(
-            next_finished, time + 1 >= maximum_iterations)
       next_sequence_lengths = array_ops.where(
-          math_ops.logical_and(math_ops.logical_not(finished), next_finished),
+          math_ops.logical_not(finished),
           array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
           sequence_lengths)
 
@@ -296,11 +310,16 @@ def dynamic_decode(decoder,
     res = control_flow_ops.while_loop(
         condition,
         body,
-        loop_vars=[
-            initial_time, initial_outputs_ta, initial_state, initial_inputs,
-            initial_finished, initial_sequence_lengths,
-        ],
+        loop_vars=(
+            initial_time,
+            initial_outputs_ta,
+            initial_state,
+            initial_inputs,
+            initial_finished,
+            initial_sequence_lengths,
+        ),
         parallel_iterations=parallel_iterations,
+        maximum_iterations=maximum_iterations,
         swap_memory=swap_memory)
 
     final_outputs_ta = res[1]
-- 
GitLab


From a72e6139104d426e347254850b3ccdbba32c2e6e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 20:16:06 +0000
Subject: [PATCH 0882/1262] Update bazel

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/workspace.bzl | 1 +
 third_party/repo.bzl     | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2a85be08e7..01c1b962b7 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -167,6 +167,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
+          # TODO (yongtang): uncomment once mirror.bazel.build is propagated.
           # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
           "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index 8202dafac8..36f5aa5bde 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -69,7 +69,7 @@ def _apply_delete(ctx, paths):
   _execute_and_check_ret_code(ctx, cmd)
 
 def _tf_http_archive(ctx):
-  if ("mirror.bazel.build" not in ctx.attr.urls[0] or
+  if ("mirror.bazel.build" not in ctx.attr.urls[0] and
       (len(ctx.attr.urls) < 2 and
        ctx.attr.name not in _SINGLE_URL_WHITELIST)):
     fail("tf_http_archive(urls) must have redundant URLs. The " +
-- 
GitLab


From 255de90197f3da6b9c014aac7a2aa3105221b593 Mon Sep 17 00:00:00 2001
From: Olivia Nordquist <nolivia@google.com>
Date: Mon, 16 Apr 2018 14:17:02 -0700
Subject: [PATCH 0883/1262] disabling test that fails tensorflow.asan

PiperOrigin-RevId: 193097794
---
 tensorflow/core/grappler/optimizers/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 96342fedc1..3070eb1799 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -112,6 +112,7 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
+    tags = ["noasan"],
     deps = [
         ":constant_folding",
         "//tensorflow/cc:cc_ops",
-- 
GitLab


From e9e5356b206e9399b5d06b618fc77f460e9613bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 28 Mar 2018 10:03:37 -0700
Subject: [PATCH 0884/1262] Enable the Grappler arithmetic optimizer by default
 in Python tests.

PiperOrigin-RevId: 190787954
---
 tensorflow/python/framework/test_util.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 990fa429a1..bf00fa6439 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -974,8 +974,6 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
-      config.graph_options.rewrite_options.arithmetic_optimization = (
-          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
-- 
GitLab


From 9e4818375f3853c1a8cdd18fe22d1b1f447cfaef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 13 Apr 2018 10:30:32 -0700
Subject: [PATCH 0885/1262] Disable x * x -> square(x) Grapler rewrite for
 complex types unless the op is on CPU. Square is not registered for complex
 types on GPU, and doing so produces a crash in with CUDA_ILLEGAL_INSTRUCTION
 when running it on open source ubuntu.

PiperOrigin-RevId: 192788160
---
 .../optimizers/arithmetic_optimizer.cc        | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index fa0f7c1c6e..a8fa4a10cb 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1732,13 +1732,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
 
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
       !OptimizedNodeExists(*node, "square")) {
-    NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
-    new_square_node->set_op("Square");
-    for (int i = 1; i < new_square_node->input_size(); ++i) {
-      new_square_node->set_input(i - 1, new_square_node->input(i));
+    const DataType type = GetDataTypeFromAttr(*node, "T");
+    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    string dontcare;
+    string device;
+    bool is_on_cpu =
+        DeviceNameUtils::SplitDeviceName(node->device(), &dontcare, &device) &&
+        str_util::StrContains(device, DEVICE_CPU);
+    if (!is_complex || is_on_cpu) {
+      NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
+      new_square_node->set_op("Square");
+      for (int i = 1; i < new_square_node->input_size(); ++i) {
+        new_square_node->set_input(i - 1, new_square_node->input(i));
+      }
+      new_square_node->mutable_input()->RemoveLast();
+      return new_square_node->name();
     }
-    new_square_node->mutable_input()->RemoveLast();
-    return new_square_node->name();
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-- 
GitLab


From 51f451d9b6eec17cf3f18f928b48baecb0885ec6 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 16 Apr 2018 21:34:38 +0000
Subject: [PATCH 0886/1262] Fix lite and makefile issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/lite/download_dependencies.sh     | 4 +++-
 tensorflow/contrib/makefile/download_dependencies.sh | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index a93ed201d6..0a34da2dbc 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,7 +30,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 8b415e6527..4d3de36e2a 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,7 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-- 
GitLab


From e0c50ac5bb843178742273ba3b651397553f3eb7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 14:41:21 -0700
Subject: [PATCH 0887/1262] Exposes InputPipelineConfig as
 tf.contrib.tpu.InputPipelineConfig. This type is expected by the
 `per_host_input_for_training` argument of the TPUConfig constructor, but is
 not currently visible.

PiperOrigin-RevId: 193101540
---
 tensorflow/contrib/tpu/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index bb60f3e2d7..dc90668559 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -43,6 +43,7 @@
 @@TPUEstimator
 @@TPUEstimatorSpec
 @@RunConfig
+@@InputPipelineConfig
 @@TPUConfig
 """
 
-- 
GitLab


From be86852d8b63e0c655bd55728c8dc8d4f6dabaeb Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 16 Apr 2018 14:47:31 -0700
Subject: [PATCH 0888/1262] Porting tests for `rpc_op` to OS.

PiperOrigin-RevId: 193102564
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   3 +-
 tensorflow/contrib/rpc/BUILD                  |  16 +
 .../contrib/rpc/python/kernel_tests/BUILD     |  80 +++++
 .../rpc/python/kernel_tests/rpc_op_test.py    |  71 ++++
 .../python/kernel_tests/rpc_op_test_base.py   | 336 ++++++++++++++++++
 .../kernel_tests/rpc_op_test_servicer.py      | 101 ++++++
 .../python/kernel_tests/test_example.proto    | 171 +++++++++
 .../core/platform/default/build_config.bzl    |  86 ++++-
 tensorflow/tools/pip_package/BUILD            |   1 +
 tensorflow/workspace.bzl                      |   4 +
 12 files changed, 867 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index ae68f4aec4..7e47516550 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -87,6 +87,7 @@ py_library(
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/rpc",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
         "//tensorflow/contrib/signal:signal_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e27ece8fa5..36cc5144d0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -71,6 +71,7 @@ from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
+from tensorflow.contrib import rpc
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
 from tensorflow.contrib import signal
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 21f59d2563..f6aaf41f73 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -347,7 +347,8 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
-GENERATE_PYTHON_OP_LIB("rpc_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
index 597f18c771..dbd311a276 100644
--- a/tensorflow/contrib/rpc/BUILD
+++ b/tensorflow/contrib/rpc/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "rpc",
     srcs = [
@@ -11,3 +13,17 @@ py_library(
     ],
     deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
 )
+
+py_library(
+    name = "rpc_pip",
+    data = if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/rpc/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":rpc",
+        "//tensorflow/contrib/rpc/python/kernel_tests:py_test_deps",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_base",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_servicer",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..2311c15a68
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -0,0 +1,80 @@
+# TODO(b/76425722): Port everything in here to OS (currently excluded).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+# Placeholder for loading internal BUILD rule.
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [":test_example_proto_py"],
+)
+
+py_library(
+    name = "rpc_op_test_base",
+    srcs = ["rpc_op_test_base.py"],
+    deps = [
+        ":test_example_proto_py",
+        "//tensorflow/contrib/proto",
+        "//tensorflow/contrib/rpc",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "rpc_op_test_servicer",
+    srcs = ["rpc_op_test_servicer.py"],
+    deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
+
+tf_py_test(
+    name = "rpc_op_test",
+    size = "small",
+    srcs = ["rpc_op_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        ":rpc_op_test_servicer",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
new file mode 100644
index 0000000000..e2e0dbc7a2
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for RpcOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+import grpc
+from grpc.framework.foundation import logging_pool
+import portpicker
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_servicer
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+from tensorflow.python.platform import test
+
+
+class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
+  _protocol = 'grpc'
+
+  invalid_method_string = 'Method not found'
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(RpcOpTest, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
+
+  def get_method_name(self, suffix):
+    return '/tensorflow.contrib.rpc.TestCaseService/%s' % suffix
+
+  def setUp(self):
+    super(RpcOpTest, self).setUp()
+
+    service_port = portpicker.pick_unused_port()
+
+    server = grpc.server(logging_pool.pool(max_workers=25))
+    servicer = rpc_op_test_servicer.RpcOpTestServicer()
+    test_example_pb2_grpc.add_TestCaseServiceServicer_to_server(
+        servicer, server)
+    self._address = 'localhost:%d' % service_port
+    server.add_insecure_port(self._address)
+    server.start()
+    self._server = server
+
+  def tearDown(self):
+    # TODO(ebrevdo): Figure out why this sometimes times out.
+    #    self._service.ExitLoop()
+    #    self._service_thread.join()
+    # self._server.stop()
+    super(RpcOpTest, self).tearDown()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
new file mode 100644
index 0000000000..89f3ee1a1c
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -0,0 +1,336 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Base class for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.rpc.python.ops import rpc_op
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+
+__all__ = ['I_WARNED_YOU', 'RpcOpTestBase']
+
+I_WARNED_YOU = 'I warned you!'
+
+
+class RpcOpTestBase(object):
+  # pylint: disable=missing-docstring,invalid-name
+  """Base class for RpcOp tests."""
+
+  def get_method_name(self, suffix):
+    raise NotImplementedError
+
+  def rpc(self, *args, **kwargs):
+    return rpc_op.rpc(*args, protocol=self._protocol, **kwargs)
+
+  def try_rpc(self, *args, **kwargs):
+    return rpc_op.try_rpc(*args, protocol=self._protocol, **kwargs)
+
+  def testScalarHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, ())
+      response_values = sess.run(response_tensors)
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+
+  def testScalarHostPortTryRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      self.assertEqual(response_tensors.shape, ())
+      response_values, status_code_values, status_message_values = (
+          sess.run((response_tensors, status_code, status_message)))
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+    # For the base Rpc op, don't expect to get error status back.
+    self.assertEqual(errors.OK, status_code_values)
+    self.assertEqual(b'', status_message_values)
+
+  def testEmptyHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = []
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertAllEqual(response_tensors.shape, [0])
+      response_values = sess.run(response_tensors)
+    self.assertAllEqual(response_values.shape, [0])
+
+  def testInvalidAddresses(self):
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method='/InvalidService.IncrementTestShapes',
+                address=self._address,
+                request=''))
+
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('InvalidMethodName'),
+                address=self._address,
+                request=''))
+
+      # This also covers the case of address=''
+      # and address='localhost:293874293874'
+      with self.assertRaises(errors.UnavailableError):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('IncrementTestShapes'),
+                address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@',
+                request=''))
+
+      # Test invalid method with the TryRpc op
+      _, status_code_value, status_message_value = sess.run(
+          self.try_rpc(
+              method=self.get_method_name('InvalidMethodName'),
+              address=self._address,
+              request=''))
+      self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+      self.assertTrue(
+          self.invalid_method_string in status_message_value.decode('ascii'))
+
+  def testAlwaysFailingMethod(self):
+    with self.test_session() as sess:
+      response_tensors = self.rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+  def testSometimesFailingMethodWithManyRequests(self):
+    with self.test_session() as sess:
+      # Fail hard by default.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+      # Don't fail hard, use TryRpc - return the failing status instead.
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values, status_message_values = sess.run((status_code,
+                                                            status_message))
+      self.assertTrue([
+          x in (errors.OK, errors.INVALID_ARGUMENT) for x in status_code_values
+      ])
+      expected_message_values = np.where(
+          status_code_values == errors.INVALID_ARGUMENT,
+          I_WARNED_YOU.encode('ascii'), b'')
+      self.assertAllEqual(expected_message_values, status_message_values)
+
+  def testVecHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, (20,))
+      response_values = sess.run(response_tensors)
+    self.assertEqual(response_values.shape, (20,))
+    for i in range(20):
+      response_message = test_example_pb2.TestCase()
+      self.assertTrue(response_message.ParseFromString(response_values[i]))
+      self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortManyParallelRpcs(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      many_response_tensors = [
+          self.rpc(
+              method=self.get_method_name('IncrementTestShapes'),
+              address=self._address,
+              request=request_tensors) for _ in range(10)
+      ]
+      # Launch parallel 10 calls to the RpcOp, each containing
+      # 20 rpc requests.
+      many_response_values = sess.run(many_response_tensors)
+    self.assertEqual(10, len(many_response_values))
+    for response_values in many_response_values:
+      self.assertEqual(response_values.shape, (20,))
+      for i in range(20):
+        response_message = test_example_pb2.TestCase()
+        self.assertTrue(response_message.ParseFromString(response_values[i]))
+        self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortRpcUsingEncodeAndDecodeProto(self):
+    with self.test_session() as sess:
+      request_tensors = encode_proto_op.encode_proto(
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          sizes=[[3]] * 20,
+          values=[
+              [[i, i + 1, i + 2] for i in range(20)],
+          ])
+      response_tensor_strings = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      _, (response_shape,) = decode_proto_op.decode_proto(
+          bytes=response_tensor_strings,
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          output_types=[dtypes.int32])
+      response_shape_values = sess.run(response_shape)
+    self.assertAllEqual([[i + 1, i + 2, i + 3]
+                         for i in range(20)], response_shape_values)
+
+  def testVecHostPortRpcCancelsUponSessionTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          request=request_tensors)
+      for timeout_ms in [1, 500, 1000]:
+        options = config_pb2.RunOptions(timeout_in_ms=timeout_ms)
+        with self.assertRaises((errors.UnavailableError,
+                                errors.DeadlineExceededError)):
+          sess.run(response_tensors, options=options)
+
+  def testVecHostPortRpcCancelsUponConfiguredTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          timeout_in_ms=1000,
+          request=request_tensors)
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(response_tensors)
+
+  def testTryRpcPropagatesDeadlineErrorWithSometimesTimingOutRequests(self):
+    with self.test_session() as sess:
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesSleepForever'),
+          timeout_in_ms=1000,
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values = sess.run(status_code)
+      self.assertTrue([
+          x in (errors.OK, errors.DEADLINE_EXCEEDED) for x in status_code_values
+      ])
+
+  def testTryRpcWithMultipleAddressesSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleMethodsSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      methods = flatten(
+          [[self.get_method_name('IncrementTestShapes'), 'InvalidMethodName']
+           for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=methods, address=self._address, request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNIMPLEMENTED] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleAddressesAndRequests(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      requests = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=requests)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(20):
+        if i % 2 == 1:
+          self.assertFalse(response_tensors_values[i])
+        else:
+          response_message = test_example_pb2.TestCase()
+          self.assertTrue(
+              response_message.ParseFromString(response_tensors_values[i]))
+          self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
new file mode 100644
index 0000000000..7cbd636cb1
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Test servicer for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import time
+
+import grpc
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+
+
+class RpcOpTestServicer(test_example_pb2_grpc.TestCaseServiceServicer):
+  """Test servicer for RpcOp tests."""
+
+  def IncrementTestShapes(self, request, context):
+    """Increment the entries in the shape attribute of request.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    for i in range(len(request.shape)):
+      request.shape[i] += 1
+    return request
+
+  def AlwaysFailWithInvalidArgument(self, request, context):
+    """Always fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    del request
+    context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+    context.set_details(rpc_op_test_base.I_WARNED_YOU)
+
+  def SometimesFailWithInvalidArgument(self, request, context):
+    """Sometimes fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+      context.set_details(rpc_op_test_base.I_WARNED_YOU)
+    return request
+
+  def SleepForever(self, request, context):
+    """Sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    # TODO(ebrevdo): Make this async wait like the stubby version.
+    time.sleep(5)
+
+  def SometimesSleepForever(self, request, context):
+    """Sometimes sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      time.sleep(5)
+    return request
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..96f4550f62
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
@@ -0,0 +1,171 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.rpc;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+service TestCaseService {
+  // Copy input, and increment each entry in 'shape' by 1.
+  rpc IncrementTestShapes(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever.
+  rpc SleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever 50% of the time, return immediately the other 50%.
+  rpc SometimesSleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Always fails with InvalidArgument.
+  rpc AlwaysFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+
+  // Fails with InvalidArgument 50% of the time.
+  rpc SometimesFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 4cfa25bf66..44356e3438 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,7 +1,6 @@
 # Platform-specific build configurations.
 
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
-load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
@@ -110,6 +109,12 @@ def _proto_cc_srcs(srcs, use_grpc_plugin=False):
     ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
   return ret
 
+def _proto_py_outs(srcs, use_grpc_plugin=False):
+  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+  if use_grpc_plugin:
+    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+  return ret
+
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
@@ -217,6 +222,80 @@ def cc_proto_library(
       hdrs=gen_hdrs,
       **kargs)
 
+# Re-defined protocol buffer rule to bring in the change introduced in commit
+# https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
+# which was not part of a stable protobuf release in 04/2018.
+# TODO(jsimsa): Remove this once the protobuf dependency version is updated
+# to include the above commit.
+def py_proto_library(
+        name,
+        srcs=[],
+        deps=[],
+        py_libs=[],
+        py_extra_srcs=[],
+        include=None,
+        default_runtime="@protobuf_archive//:protobuf_python",
+        protoc="@protobuf_archive//:protoc",
+        use_grpc_plugin=False,
+        **kargs):
+  """Bazel rule to create a Python protobuf library from proto source files
+
+  NOTE: the rule is only an internal workaround to generate protos. The
+  interface may change and the rule may be removed when bazel has introduced
+  the native rule.
+
+  Args:
+    name: the name of the py_proto_library.
+    srcs: the .proto files of the py_proto_library.
+    deps: a list of dependency labels; must be py_proto_library.
+    py_libs: a list of other py_library targets depended by the generated
+        py_library.
+    py_extra_srcs: extra source files that will be added to the output
+        py_library. This attribute is used for internal bootstrapping.
+    include: a string indicating the include path of the .proto files.
+    default_runtime: the implicitly default runtime which will be depended on by
+        the generated py_library target.
+    protoc: the label of the protocol compiler to generate the sources.
+    use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+        when processing the proto files.
+    **kargs: other keyword arguments that are passed to cc_library.
+  """
+  outs = _proto_py_outs(srcs, use_grpc_plugin)
+
+  includes = []
+  if include != None:
+    includes = [include]
+
+  grpc_python_plugin = None
+  if use_grpc_plugin:
+    grpc_python_plugin = "//external:grpc_python_plugin"
+    # Note: Generated grpc code depends on Python grpc module. This dependency
+    # is not explicitly listed in py_libs. Instead, host system is assumed to
+    # have grpc installed.
+
+  proto_gen(
+      name=name + "_genproto",
+      srcs=srcs,
+      deps=[s + "_genproto" for s in deps],
+      includes=includes,
+      protoc=protoc,
+      gen_py=1,
+      outs=outs,
+      visibility=["//visibility:public"],
+      plugin=grpc_python_plugin,
+      plugin_language="grpc"
+  )
+
+  if default_runtime and not default_runtime in py_libs + deps:
+    py_libs = py_libs + [default_runtime]
+
+  native.py_library(
+      name=name,
+      srcs=outs+py_extra_srcs,
+      deps=py_libs+deps,
+      imports=includes,
+      **kargs)
+
 def tf_proto_library_cc(name, srcs = [], has_services = None,
                         protodeps = [],
                         visibility = [], testonly = 0,
@@ -261,8 +340,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   )
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0,
-                        srcs_version="PY2AND3"):
+                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
   py_proto_library(
       name = name + "_py",
       srcs = srcs,
@@ -272,6 +350,7 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
+      use_grpc_plugin = use_grpc_plugin,
   )
 
 def tf_jspb_proto_library(**kwargs):
@@ -310,6 +389,7 @@ def tf_proto_library(name, srcs = [], has_services = None,
       srcs_version = "PY2AND3",
       testonly = testonly,
       visibility = visibility,
+      use_grpc_plugin = has_services,
   )
 
 def tf_additional_lib_hdrs(exclude = []):
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index a0bae23a7c..2ef105755f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -76,6 +76,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/predictor:predictor_pip",
     "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
     "//tensorflow/contrib/signal:test_util",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 79730f591f..fe3619d5cd 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -752,6 +752,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "grpc_cpp_plugin",
       actual = "@grpc//:grpc_cpp_plugin",
   )
+  native.bind(
+      name = "grpc_python_plugin",
+      actual = "@grpc//:grpc_python_plugin",
+  )
 
   # gRPC has three empty C++ functions which it wants the user to define
   # at build time. https://github.com/grpc/grpc/issues/13590
-- 
GitLab


From 90ee831014a6380f1ca0c14304979b26a62ea7d8 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Mon, 16 Apr 2018 14:52:41 -0700
Subject: [PATCH 0889/1262] Increase softmax gpu unittest numeric stability

PiperOrigin-RevId: 193103363
---
 tensorflow/python/kernel_tests/softmax_op_test.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 981f96b74d..dc4d4dbeab 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -39,6 +39,10 @@ class SoftmaxTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
+    is_fp16 = features.dtype == np.float16
+    if is_fp16:
+      # Do the compute in fp32 and cast the input back to fp32.
+      features = features.astype(np.float32)
     e = np.exp(features - np.reshape(
         np.amax(
             features, axis=dim), one_only_on_dim))
@@ -47,6 +51,8 @@ class SoftmaxTest(test.TestCase):
       res = np.log(softmax)
     else:
       res = softmax
+    if is_fp16:
+      res = res.astype(np.float16)
     return res
 
   def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
@@ -125,8 +131,8 @@ class SoftmaxTest(test.TestCase):
                        "Test only applicable when running on GPUs")
   def testFloatGPU(self):
     if test.is_gpu_available(cuda_only=True):
-      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
-      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
       for row, col in zip(rows, cols):
         logging.info("Testing softmax float dtype in shape [%d, %d]", row, col)
         data = np.random.rand(row, col)
@@ -140,8 +146,8 @@ class SoftmaxTest(test.TestCase):
                        "Test only applicable when running on GPUs")
   def testHalfGPU(self):
     if test.is_gpu_available(cuda_only=True):
-      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
-      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
       for row, col in zip(rows, cols):
         logging.info("Testing softmax half dtype in shape [%d, %d]", row, col)
         data = np.random.rand(row, col)
-- 
GitLab


From f1f1d5172fe5bfeaeb2cf657ffc43ba744187bee Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Mon, 16 Apr 2018 15:34:50 -0700
Subject: [PATCH 0890/1262] Fix TFLite Makefile FFT2D dependency.

FFT2D dependency was introduced a while ago so Makefile no longer works
until this fix.
---
 tensorflow/contrib/lite/Makefile                 | 3 ++-
 tensorflow/contrib/lite/download_dependencies.sh | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index b4504f246a..65fba52d46 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -90,7 +90,8 @@ $(wildcard tensorflow/contrib/lite/kernels/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc)
+$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c)
 # Remove any duplicates.
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
 CORE_CC_EXCLUDE_SRCS := \
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index a93ed201d6..840015a7fa 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -36,6 +36,7 @@ ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
+FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -91,6 +92,7 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
-- 
GitLab


From a4570ad1cf8dab5a77b0c460fba2da30fd0c8bb6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 15:46:42 -0700
Subject: [PATCH 0891/1262] Internal change.

PiperOrigin-RevId: 193112205
---
 tensorflow/contrib/lite/interpreter.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 31b874a6a6..ff8524f12e 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -245,11 +245,8 @@ TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
         // Initialize the output tensors's delegate-related fields.
         for (int tensor_index : subgraph.output_tensors) {
           TfLiteTensor* tensor = &tensors_[tensor_index];
-          TF_LITE_ENSURE_EQ(&context_, tensor->delegate, nullptr);
-          TF_LITE_ENSURE_EQ(&context_, tensor->buffer_handle,
-                            kTfLiteNullBufferHandle);
-          // buffer_handle will be filled in delegate's `Prepare`
-          // function.
+          TF_LITE_ENSURE(&context_, tensor->delegate == nullptr ||
+                                        tensor->delegate == delegate);
           tensor->delegate = delegate;
         }
 
-- 
GitLab


From 73634357b68cb162977eb406cadd29d1b2584c5e Mon Sep 17 00:00:00 2001
From: Sam Sendelbach <sbsends@gmail.com>
Date: Mon, 16 Apr 2018 17:51:29 -0500
Subject: [PATCH 0892/1262] Added support for saved_model_cli input files
 stored on GCS/AWS.

---
 tensorflow/python/tools/saved_model_cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index b88be4ae04..73ea85ab0c 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -41,6 +41,7 @@ from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.framework import meta_graph as meta_graph_lib
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.platform import app  # pylint: disable=unused-import
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import loader
 from tensorflow.python.tools import saved_model_utils
 
@@ -543,7 +544,7 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str,
   input_examples = preprocess_input_examples_arg_string(input_examples_str)
 
   for input_tensor_key, (filename, variable_name) in inputs.items():
-    data = np.load(filename)
+    data = np.load(file_io.FileIO(filename, mode='r'))
 
     # When a variable_name key is specified for the input file
     if variable_name:
-- 
GitLab


From b2b7d56869d38bf68873a097251d1463e3df640d Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Mon, 16 Apr 2018 15:53:08 -0700
Subject: [PATCH 0893/1262] Curly-brace id's are inconsistently supported.

Curly-brace id's are inconsistently supported.
linking to the id of an html tag seems to be supported everywhere.
---
 tensorflow/docs_src/tutorials/layers.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index b24d3f4cad..6f88c5420a 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -192,7 +192,7 @@ dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
 you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
-skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).
+skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist).
 
 ### Input Layer
 
@@ -549,7 +549,8 @@ return tf.estimator.EstimatorSpec(
     mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 ```
 
-## Training and Evaluating the CNN MNIST Classifier {#training_and_evaluating_the_cnn_mnist_classifier}
+<a id="train_eval_mnist"></a>
+## Training and Evaluating the CNN MNIST Classifier
 
 We've coded our MNIST CNN model function; now we're ready to train and evaluate
 it.
-- 
GitLab


From 39635af4b97d843228e0ab9f731fc98d8a4ec5d7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 15:52:54 -0700
Subject: [PATCH 0894/1262] Fix trace collection to properly remove the suffix.

PiperOrigin-RevId: 193113074
---
 tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index b53f9be2e2..5e85a967ad 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -128,6 +128,7 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   // Dumps profile data to <logdir>/plugins/profile/<run>/.
   string host_prefix = host.empty() ? "" : StrCat(host, ".");
   string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
+  *os << "Creating directory: " << profile_run_dir;
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
-- 
GitLab


From f59a82f2b08dca1641d5766fdd2234d3b665a862 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 16:04:43 -0700
Subject: [PATCH 0895/1262] Replacing the current inner Cholesky decomposition
 loop with a While loop rolled version.

This will allow for much larger Cholesky decompositions (and thus matrix inversions) than previously possible on TPU because of the use of rolled While loops so XLA compilation will no longer timeout.

While there is a minor runtime performance decrease (now 25ms vs 15ms for a 500x500 matrix) the compilation time is significantly faster (12.8s vs 55.2s for a 500x500 matrix.)

PiperOrigin-RevId: 193114816
---
 tensorflow/compiler/tf2xla/lib/BUILD       |   1 +
 tensorflow/compiler/tf2xla/lib/cholesky.cc | 159 ++++++++++++++-------
 tensorflow/compiler/tf2xla/lib/cholesky.h  |   4 +-
 3 files changed, 110 insertions(+), 54 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index ea6e1a4c89..fde1977c1b 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -39,6 +39,7 @@ cc_library(
         ":batch_dot",
         ":triangular_solve",
         ":util",
+        ":while_loop",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index e795701181..203365e2ab 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
 #include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -31,68 +32,122 @@ namespace tensorflow {
 
 namespace {
 
+// The Cholesky–Banachiewicz algorithm. See
+// https://en.wikipedia.org/wiki/Cholesky_decomposition#The_Cholesky–Banachiewicz_and_Cholesky–Crout_algorithms
+// for a description.
+//
 // def cholesky_unblocked(a):
 //   assert len(a.shape) == 2 and a.shape[-2] == a.shape[-1]
 //   n = a.shape[-2]
 //   l = np.zeros_like(a)
 //   for j in xrange(n):
-//     r = l[..., j, :j]
-//     l[..., j, j] = np.sqrt(a[..., j, j] - np.dot(r, r))
-//     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j],
-//         np.transpose(r))) / l[..., j, j]
+//     row = l[..., j, :j]
+//     row_t = np.swapaxes(row, -1, -2)
+//     l[..., j, j] = np.sqrt(a[..., j, j] - np.dot(row, row_t))
+//     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) /
+//                       l[..., j, j]
 //   return l
 xla::StatusOr<xla::ComputationDataHandle> CholeskyUnblocked(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(a));
-  xla::ComputationDataHandle l = Zeros(builder, *shape);
-  const int64 n = xla::ShapeUtil::GetDimension(*shape, -2);
-  for (int j = 0; j < n; ++j) {
-    // Picture of block structure:
-    // ...   \
-    //        \
-    // -- r -- d
-    //         |\
-    //    B    c \
-    //         |  \
-    //         |  ...
-    //
-    //         ^
-    //      column j
-    TF_ASSIGN_OR_RETURN(auto d,
-                        SliceInMinorDims(builder, a, {j, j}, {j + 1, j + 1}));
-    TF_ASSIGN_OR_RETURN(auto c,
-                        SliceInMinorDims(builder, a, {j + 1, j}, {n, j + 1}));
-    xla::ComputationDataHandle new_d_squared = d;
-    xla::ComputationDataHandle br;
-    if (j > 0) {
-      TF_ASSIGN_OR_RETURN(auto r,
-                          SliceInMinorDims(builder, l, {j, 0}, {j + 1, j}));
-      TF_ASSIGN_OR_RETURN(auto b,
-                          SliceInMinorDims(builder, l, {j + 1, 0}, {n, j}));
-      TF_ASSIGN_OR_RETURN(auto r_squared,
-                          BatchDot(builder, r, r, /*transpose_x=*/false,
-                                   /*transpose_y=*/true, /*conjugate_x=*/false,
-                                   /*conjugate_y=*/false));
-      new_d_squared = builder->Sub(new_d_squared, r_squared);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  const int n_dims = xla::ShapeUtil::Rank(*a_shape);
+  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
+  gtl::ArraySlice<int64> major_dims(xla::AsInt64Slice(a_shape->dimensions()),
+                                    /*pos=*/0,
+                                    /*len=*/n_dims - 2);
 
-      TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false,
-                                       /*transpose_y=*/true,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false));
-    }
-    auto new_d_inv = builder->Pow(
-        new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5));
-    auto new_d = builder->Mul(new_d_inv, new_d_squared);
-    TF_ASSIGN_OR_RETURN(l, UpdateSliceInMinorDims(builder, l, new_d, {j, j}));
+  xla::ComputationDataHandle l = Zeros(builder, *a_shape);
 
-    if (j > 0) {
-      c = builder->Sub(c, br);
+  // Construct the for loop body to iterate over rows.
+  auto body_fn = [&](xla::ComputationDataHandle i,
+                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
+                     xla::ComputationBuilder* body_builder)
+      -> xla::StatusOr<std::vector<xla::ComputationDataHandle>> {
+    xla::Shape col_shape;
+    xla::Shape row_shape;
+    for (int64 d : major_dims) {
+      row_shape.add_dimensions(d);
+      col_shape.add_dimensions(d);
     }
-    auto new_c = builder->Mul(c, new_d_inv);
-    TF_ASSIGN_OR_RETURN(l,
-                        UpdateSliceInMinorDims(builder, l, new_c, {j + 1, j}));
-  }
-  return l;
+    row_shape.add_dimensions(1);
+    row_shape.add_dimensions(n);
+    row_shape.set_element_type(a_shape->element_type());
+    auto mask_zeros_row = Zeros(body_builder, row_shape);
+
+    col_shape.add_dimensions(n);
+    col_shape.add_dimensions(1);
+    col_shape.set_element_type(a_shape->element_type());
+    auto mask_zeros_col = Zeros(body_builder, col_shape);
+
+    std::vector<int32> mask_vector(n);
+    std::iota(mask_vector.begin(), mask_vector.end(), 0);
+    auto mask_range = body_builder->ConstantR1<int32>(mask_vector);
+    auto mask_range_row = body_builder->Broadcast(
+        body_builder->Reshape(mask_range, {0}, {1, n}), major_dims);
+    auto mask_range_col = body_builder->Broadcast(
+        body_builder->Reshape(mask_range, {0}, {n, 1}), major_dims);
+    auto body_a = loop_vars[0];
+    auto body_l = loop_vars[1];
+
+    // row = l[..., i, :i]
+    // select the whole i-th row, then mask out all columns past i-1
+    auto zero = body_builder->ConstantR0<int32>(0);
+    TF_ASSIGN_OR_RETURN(auto l_i, DynamicSliceInMinorDims(body_builder, body_l,
+                                                          {i, zero}, {1, n}));
+    auto row = body_builder->Select(body_builder->Ge(mask_range_row, i),
+                                    mask_zeros_row, l_i);
+    // a[..., i, i]
+    TF_ASSIGN_OR_RETURN(auto a_ii, DynamicSliceInMinorDims(body_builder, body_a,
+                                                           {i, i}, {1, 1}));
+    // np.dot(row, np.swapaxes(row, -1, -2))
+    xla::ComputationDataHandle diag_dot;
+    TF_ASSIGN_OR_RETURN(diag_dot, BatchDot(body_builder, row, row,
+                                           /*transpose_x=*/false,
+                                           /*transpose_y=*/true));
+    // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
+    //                                              np.swapaxes(row, -1, -2)))
+    auto l_ii = body_builder->Pow(
+        body_builder->Sub(a_ii, diag_dot),
+        FloatLiteral(body_builder, a_shape->element_type(), 0.5));
+
+    // a[..., i+1:, i]
+    auto ip1 = body_builder->Add(i, body_builder->ConstantR0<int32>(1));
+    // select the whole i-th column, then mask out all rows above i+1
+    TF_ASSIGN_OR_RETURN(
+        auto a_0i, DynamicSliceInMinorDims(body_builder, body_a, {i}, {1}));
+    auto a_ip1i = body_builder->Select(body_builder->Le(mask_range_col, i),
+                                       mask_zeros_col, a_0i);
+
+    // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) /
+    //                   l[..., i, i]
+    // The columns in [i, n] are zeroed out in `row`, so we just have to
+    // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i],
+    // r.T)
+    TF_ASSIGN_OR_RETURN(auto dot, BatchDot(body_builder, body_l, row,
+                                           /*transpose_x=*/false,
+                                           /*transpose_y=*/true));
+    // np.dot(l[..., i+1:, :i], r.T)
+    auto dot_ip1 = body_builder->Select(body_builder->Le(mask_range_col, i),
+                                        mask_zeros_col, dot);
+
+    auto col_update =
+        body_builder->Div(body_builder->Sub(a_ip1i, dot_ip1), l_ii);
+    TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims(
+                                    body_builder, body_l, col_update, {i}));
+    // Assign the diagonal after the rest of the column because otherwise the
+    // column assign will wrap around and overwrite the diagonal assign.
+    TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims(
+                                    body_builder, body_l, l_ii, {i, i}));
+
+    return std::vector<xla::ComputationDataHandle>{body_a, body_l};
+  };
+
+  TF_ASSIGN_OR_RETURN(
+      auto cholesky_while,
+      XlaForEachIndex(n, xla::S32, body_fn, {a, l}, "unblocked", builder));
+
+  return cholesky_while[1];
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index e083a383be..17da8d8b22 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -29,7 +29,7 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): check for negative values on the diagonal and return an
 // error, instead of silently yielding NaNs.
-// TODO(mattjj): handle the complex Hermitian case
+// TODO(znado): handle the complex Hermitian case
 xla::StatusOr<xla::ComputationDataHandle> Cholesky(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
     int64 block_size = 256);
-- 
GitLab


From 4f64c4bfb04038459d9551caf018890e2e7d5c41 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 16:15:19 -0700
Subject: [PATCH 0896/1262] Create copy of locals() before copying, since
 modifying locals does not always affect the values.
 https://docs.python.org/2/library/functions.html#locals.

PiperOrigin-RevId: 193116254
---
 tensorflow/contrib/gan/python/train.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 73acd05b60..6fa43059f3 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -710,7 +710,10 @@ def gan_train_ops(
     be used to train a generator/discriminator pair.
   """
   if isinstance(model, namedtuples.CycleGANModel):
-    saved_params = locals()
+    # Get and store all arguments other than model and loss from locals.
+    # Contents of locals should not be modified, may not affect values. So make
+    # a copy. https://docs.python.org/2/library/functions.html#locals.
+    saved_params = dict(locals())
     saved_params.pop('model', None)
     saved_params.pop('loss', None)
     kwargs = saved_params.pop('kwargs', {})
-- 
GitLab


From a5f8b3885dbab62e093d6c729354b8537f775b72 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 16:22:03 -0700
Subject: [PATCH 0897/1262] Internal change

PiperOrigin-RevId: 193117142
---
 tensorflow/core/BUILD | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7ea8a38834..01bda8e09b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -159,6 +159,7 @@ exports_files(["ops/ops.pbtxt"])
 #
 # Note that some protos are in neither additional_core_proto_srcs nor this
 # filegroup; e.g.  ones with individual proto_library targets.
+# LINT.IfChange
 CORE_PROTO_SRCS = [
     "example/example.proto",
     "example/feature.proto",
@@ -200,6 +201,7 @@ CORE_PROTO_SRCS = [
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
 ]
+# LINT.ThenChange(//tensorflow/core/android_proto_config.asciipb)
 
 # Protos which are not needed on mobile builds, but should be included in
 # protos_all.
-- 
GitLab


From bcfe946780034f2ff757e82c758cf8075f1132df Mon Sep 17 00:00:00 2001
From: Olivia Nordquist <nolivia@google.com>
Date: Mon, 16 Apr 2018 16:24:19 -0700
Subject: [PATCH 0898/1262] disabling flaky asan test

PiperOrigin-RevId: 193117611
---
 tensorflow/python/kernel_tests/BUILD | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index e82d738f14..11adb1ccfc 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1603,6 +1603,7 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    tags = ["noasan"],
 )
 
 cuda_py_test(
@@ -2870,7 +2871,10 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 10,
-    tags = ["no_windows_gpu"],
+    tags = [
+        "no_windows_gpu",
+        "noasan",
+    ],
 )
 
 tf_py_test(
-- 
GitLab


From 04310bea2a9585bfdbe43be5da8510649fa47dfa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 16:34:18 -0700
Subject: [PATCH 0899/1262] Port the list append into the operators module. Not
 enabled yet.

PiperOrigin-RevId: 193118940
---
 tensorflow/contrib/autograph/operators/BUILD  | 13 +++++
 .../autograph/operators/data_structures.py    | 56 +++++++++++++++++++
 .../operators/data_structures_test.py         | 44 +++++++++++++++
 3 files changed, 113 insertions(+)
 create mode 100644 tensorflow/contrib/autograph/operators/data_structures.py
 create mode 100644 tensorflow/contrib/autograph/operators/data_structures_test.py

diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
index 4c62468575..efb8d441dd 100644
--- a/tensorflow/contrib/autograph/operators/BUILD
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -21,11 +21,24 @@ py_library(
     srcs = [
         "__init__.py",
         "control_flow.py",
+        "data_structures.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/contrib/autograph/utils",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
+    name = "data_structures_test",
+    srcs = ["data_structures_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
diff --git a/tensorflow/contrib/autograph/operators/data_structures.py b/tensorflow/contrib/autograph/operators/data_structures.py
new file mode 100644
index 0000000000..c862306baa
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/data_structures.py
@@ -0,0 +1,56 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators specific to data structures: list append, subscripts, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import tensor_array_ops
+
+# TODO(mdan): Add support for TensorList once functional.
+# TODO(mdan): Add primitives for empty list, list with elements.
+
+
+def append(target, element):
+  """The list append function.
+
+  Note: it is unspecified where target will be mutated or not. If target is
+  a TensorFlow entity, it will not be typically mutated. If target is a plain
+  list, it will be. In general, if the target is mutated then the return value
+  should point to the original entity.
+
+  Args:
+    target: An entity that supports append semantics.
+    element: The element to append.
+
+  Returns:
+    Same as target, after the append was performed.
+  """
+  if isinstance(target, tensor_array_ops.TensorArray):
+    return _tf_tensorarray_append(target, element)
+  else:
+    return _py_append(target, element)
+
+
+def _tf_tensorarray_append(target, element):
+  """Overload of append that stages a TensorArray write at the last position."""
+  return target.write(target.size(), element)
+
+
+def _py_append(target, element):
+  """Overload of append that executes a Python list append."""
+  target.append(element)
+  return target
diff --git a/tensorflow/contrib/autograph/operators/data_structures_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py
new file mode 100644
index 0000000000..577d28c34d
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/data_structures_test.py
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for data_structures module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.operators import data_structures
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.platform import test
+
+
+class AppendTest(test.TestCase):
+
+  def test_tf_tensorarray(self):
+    l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
+    l1 = data_structures.append(l, 1)
+    l2 = data_structures.append(l1, 2)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(l1.stack()), [1])
+      self.assertAllEqual(sess.run(l2.stack()), [1, 2])
+
+  def test_python(self):
+    l = []
+    self.assertAllEqual(data_structures.append(l, 1), [1])
+    self.assertAllEqual(data_structures.append(l, 2), [1, 2])
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 493eb20b71715e1b72dfc8a494e2e0c2e824a334 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 16 Apr 2018 16:41:12 -0700
Subject: [PATCH 0900/1262] Internal change.

PiperOrigin-RevId: 193119953
---
 tensorflow/workspace.bzl    | 8 ++++----
 third_party/llvm/llvm.BUILD | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index fe3619d5cd..d7bd2a2be0 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/15535accd9e1e9d7772202ce51c8428c1994a04b.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/15535accd9e1e9d7772202ce51c8428c1994a04b.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
       ],
-      sha256 = "3470c2dde055dc974e859e707aa6cd1d22eadd4f3a1f282e74c3cf1f7dc9510a",
-      strip_prefix = "llvm-15535accd9e1e9d7772202ce51c8428c1994a04b",
+      sha256 = "017d7db029cc175634d75416c326770139c76590575ed44a3794c11ab160c955",
+      strip_prefix = "llvm-3210e64b499a31193051208f2f8922dadfc4bb6f",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 097bbf5d42..cbb1b2fe42 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -2006,7 +2006,6 @@ cc_library(
     ]) + [
         "include/llvm/BinaryFormat/MachO.def",
         "include/llvm/Support/VCSRevision.h",
-        "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
     ],
     deps = [
         ":config",
-- 
GitLab


From b358c9932e0d2f50e50baa5f1a9441e3594244c4 Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj <sourabhbajaj@google.com>
Date: Wed, 11 Apr 2018 15:20:11 -0700
Subject: [PATCH 0901/1262] GCS Filesystem should not cache checkpoint file as
 we need to read the updated checkpoints from the contents.

PiperOrigin-RevId: 192517819
(cherry picked from commit 079d63d59b75bdfd25f7371efda25ec5f6739b78)
---
 .../core/platform/cloud/gcs_file_system.cc    |  8 ++++
 .../platform/cloud/gcs_file_system_test.cc    | 48 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 3c0dc13d75..6ed1d5dad2 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -301,6 +301,14 @@ class GcsRandomAccessFile : public RandomAccessFile {
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
                                                &bytes_transferred));
     *result = StringPiece(scratch, bytes_transferred);
+    string checkpoint_ending = "/checkpoint";
+    // Check if the file is the checkpoint file as we should not be caching
+    // that. As it's contents are updated and used for iterating checkpoints.
+    if (std::equal(checkpoint_ending.rbegin(), checkpoint_ending.rend(),
+                   filename_.rbegin())) {
+      // Remove the checkpoint file from the cache
+      file_block_cache_->RemoveFile(filename_);
+    }
     if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 2fbde9b6a7..e9eca04fef 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -198,6 +198,54 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_CheckpointFile_WithBlockCache) {
+  // Our underlying file in this test changes as new data comes in
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "abcdefghi")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  char scratch[100];
+  StringPiece result;
+  {
+    // We are instantiating this in an enclosed scope to make sure after the
+    // unique ptr goes out of scope, we can still access result.
+    std::unique_ptr<RandomAccessFile> file;
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/checkpoint", &file));
+
+    // Read the first chunk. The cache will be populated with the first block of
+    // 9 bytes.
+    scratch[5] = 'x';
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("0123", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+
+    // The second chunk should not be in cache so we make a new request
+    // As the checkpoint file should not be cached
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("abcd", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+  }
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
   // Our underlying file in this test is a 15 byte file with contents
   // "0123456789abcde".
-- 
GitLab


From 0d05b309d09d519830782ac21176ea1a0bb24e89 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 17:05:18 -0700
Subject: [PATCH 0902/1262] Add LinearOperatorKronecker, representing the
 Kronecker product.

PiperOrigin-RevId: 193123894
---
 tensorflow/contrib/linalg/BUILD               |  19 +
 tensorflow/contrib/linalg/__init__.py         |   2 +
 .../linear_operator_kronecker_test.py         | 194 ++++++
 .../python/ops/linear_operator_kronecker.py   | 560 ++++++++++++++++++
 .../ops/linalg/linear_operator_test_util.py   |  12 +
 5 files changed, 787 insertions(+)
 create mode 100644 tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py
 create mode 100644 tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py

diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 8b7ff75ba5..2c5fa7af89 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -61,3 +61,22 @@ cuda_py_test(
     shard_count = 5,
     tags = ["noasan"],
 )
+
+cuda_py_test(
+    name = "linear_operator_kronecker_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/linear_operator_kronecker_test.py"],
+    additional_deps = [
+        ":linalg_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 8,
+    tags = ["noasan"],
+)
diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py
index 14cc3b2b49..38bd66b13f 100644
--- a/tensorflow/contrib/linalg/__init__.py
+++ b/tensorflow/contrib/linalg/__init__.py
@@ -22,6 +22,7 @@ See the @{$python/contrib.linalg} guide.
 @@LinearOperatorIdentity
 @@LinearOperatorScaledIdentity
 @@LinearOperatorFullMatrix
+@@LinearOperatorKronecker
 @@LinearOperatorLowerTriangular
 @@LinearOperatorLowRankUpdate
 @@LinearOperatorComposition
@@ -36,6 +37,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.linalg.python.ops.linear_operator_addition import *
 from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import *
+from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import *
 from tensorflow.python.ops.linalg.linear_operator import *
 from tensorflow.python.ops.linalg.linear_operator_composition import *
 from tensorflow.python.ops.linalg.linear_operator_diag import *
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py
new file mode 100644
index 0000000000..6574da22a1
--- /dev/null
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_kronecker_test.py
@@ -0,0 +1,194 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.linalg.python.ops import linear_operator_kronecker as kronecker
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+random_seed.set_random_seed(23)
+rng = np.random.RandomState(0)
+
+
+def _kronecker_dense(factors):
+  """Convert a list of factors, into a dense Kronecker product."""
+  product = factors[0]
+  for factor in factors[1:]:
+    product = product[..., array_ops.newaxis, :, array_ops.newaxis]
+    factor_to_mul = factor[..., array_ops.newaxis, :, array_ops.newaxis, :]
+    product *= factor_to_mul
+    product = array_ops.reshape(
+        product,
+        shape=array_ops.concat(
+            [array_ops.shape(product)[:-4],
+             [array_ops.shape(product)[-4] * array_ops.shape(product)[-3],
+              array_ops.shape(product)[-2] * array_ops.shape(product)[-1]]
+            ], axis=0))
+
+  return product
+
+
+class KroneckerDenseTest(test.TestCase):
+
+  def testKroneckerDenseMatrix(self):
+    x = ops.convert_to_tensor([[2., 3.], [1., 2.]], dtype=dtypes.float32)
+    y = ops.convert_to_tensor([[1., 2.], [5., -1.]], dtype=dtypes.float32)
+    # From explicitly writing out the kronecker product of x and y.
+    z = ops.convert_to_tensor([
+        [2., 4., 3., 6.],
+        [10., -2., 15., -3.],
+        [1., 2., 2., 4.],
+        [5., -1., 10., -2.]], dtype=dtypes.float32)
+    # From explicitly writing out the kronecker product of y and x.
+    w = ops.convert_to_tensor([
+        [2., 3., 4., 6.],
+        [1., 2., 2., 4.],
+        [10., 15., -2., -3.],
+        [5., 10., -1., -2.]], dtype=dtypes.float32)
+
+    with self.test_session():
+      self.assertAllClose(_kronecker_dense([x, y]).eval(), z.eval())
+      self.assertAllClose(_kronecker_dense([y, x]).eval(), w.eval())
+
+
+class SquareLinearOperatorKroneckerTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    # Increase from 1e-6 to 1e-4
+    self._atol[dtypes.float32] = 1e-4
+    self._atol[dtypes.complex64] = 1e-4
+    self._rtol[dtypes.float32] = 1e-4
+    self._rtol[dtypes.complex64] = 1e-4
+
+  @property
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
+    return [
+        build_info((1, 1), factors=[(1, 1), (1, 1)]),
+        build_info((8, 8), factors=[(2, 2), (2, 2), (2, 2)]),
+        build_info((12, 12), factors=[(2, 2), (3, 3), (2, 2)]),
+        build_info((1, 3, 3), factors=[(1, 1), (1, 3, 3)]),
+        build_info((3, 6, 6), factors=[(3, 1, 1), (1, 2, 2), (1, 3, 3)]),
+    ]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = list(build_info.shape)
+    expected_factors = build_info.__dict__["factors"]
+    matrices = [
+        linear_operator_test_util.random_positive_definite_matrix(
+            block_shape, dtype, force_well_conditioned=True)
+        for block_shape in expected_factors
+    ]
+
+    if use_placeholder:
+      matrices_ph = [
+          array_ops.placeholder(dtype=dtype) for _ in expected_factors
+      ]
+      # Evaluate here because (i) you cannot feed a tensor, and (ii)
+      # values are random and we want the same value used for both mat and
+      # feed_dict.
+      matrices = self.evaluate(matrices)
+      operator = kronecker.LinearOperatorKronecker(
+          [linalg.LinearOperatorFullMatrix(
+              m_ph, is_square=True) for m_ph in matrices_ph],
+          is_square=True)
+      feed_dict = {m_ph: m for (m_ph, m) in zip(matrices_ph, matrices)}
+    else:
+      operator = kronecker.LinearOperatorKronecker(
+          [linalg.LinearOperatorFullMatrix(
+              m, is_square=True) for m in matrices])
+      feed_dict = None
+      # Should be auto-set.
+      self.assertTrue(operator.is_square)
+
+    matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
+
+    kronecker_dense = _kronecker_dense(matrices)
+
+    if not use_placeholder:
+      kronecker_dense.set_shape(shape)
+
+    return operator, kronecker_dense, feed_dict
+
+  def test_is_x_flags(self):
+    # Matrix with two positive eigenvalues, 1, and 1.
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [linalg.LinearOperatorFullMatrix(matrix),
+         linalg.LinearOperatorFullMatrix(matrix)],
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator.is_positive_definite)
+    self.assertTrue(operator.is_non_singular)
+    self.assertFalse(operator.is_self_adjoint)
+
+  def test_is_non_singular_auto_set(self):
+    # Matrix with two positive eigenvalues, 11 and 8.
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[11., 0.], [1., 8.]]
+    operator_1 = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=True)
+    operator_2 = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=True)
+
+    operator = kronecker.LinearOperatorKronecker(
+        [operator_1, operator_2],
+        is_positive_definite=False,  # No reason it HAS to be False...
+        is_non_singular=None)
+    self.assertFalse(operator.is_positive_definite)
+    self.assertTrue(operator.is_non_singular)
+
+    with self.assertRaisesRegexp(ValueError, "always non-singular"):
+      kronecker.LinearOperatorKronecker(
+          [operator_1, operator_2], is_non_singular=False)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator_1 = linalg.LinearOperatorFullMatrix(matrix, name="left")
+    operator_2 = linalg.LinearOperatorFullMatrix(matrix, name="right")
+
+    operator = kronecker.LinearOperatorKronecker([operator_1, operator_2])
+
+    self.assertEqual("left_x_right", operator.name)
+
+  def test_different_dtypes_raises(self):
+    operators = [
+        linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 3)),
+        linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 3).astype(np.float32))
+    ]
+    with self.assertRaisesRegexp(TypeError, "same dtype"):
+      kronecker.LinearOperatorKronecker(operators)
+
+  def test_empty_or_one_operators_raises(self):
+    with self.assertRaisesRegexp(ValueError, ">=1 operators"):
+      kronecker.LinearOperatorKronecker([])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py b/tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py
new file mode 100644
index 0000000000..79080d194f
--- /dev/null
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_kronecker.py
@@ -0,0 +1,560 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Construct the Kronecker product of one or more `LinearOperators`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+
+
+def _vec(x):
+  """Stacks column of matrix to form a single column."""
+  return array_ops.reshape(
+      array_ops.matrix_transpose(x),
+      array_ops.concat(
+          [array_ops.shape(x)[:-2], [-1]], axis=0))
+
+
+def _unvec_by(y, num_col):
+  """Unstack vector to form a matrix, with a specified amount of columns."""
+  return array_ops.matrix_transpose(
+      array_ops.reshape(
+          y,
+          array_ops.concat(
+              [array_ops.shape(y)[:-1], [num_col, -1]], axis=0)))
+
+
+def _rotate_last_dim(x, rotate_right=False):
+  """Rotate the last dimension either left or right."""
+  ndims = array_ops.rank(x)
+  if rotate_right:
+    transpose_perm = array_ops.concat(
+        [[ndims - 1], math_ops.range(0, ndims - 1)], axis=0)
+  else:
+    transpose_perm = array_ops.concat(
+        [math_ops.range(1, ndims), [0]], axis=0)
+  return array_ops.transpose(x, transpose_perm)
+
+
+class LinearOperatorKronecker(linear_operator.LinearOperator):
+  """Kronecker product between two `LinearOperators`.
+
+  This operator composes one or more linear operators `[op1,...,opJ]`,
+  building a new `LinearOperator` representing the Kronecker product:
+  `op1 x op2 x .. opJ` (we omit parentheses as the Kronecker product is
+  associative).
+
+  If `opj` has shape `batch_shape_j` + [M_j, N_j`, then the composed operator
+  will have shape equal to `broadcast_batch_shape + [prod M_j, prod N_j]`,
+  where the product is over all operators.
+
+  ```python
+  # Create a 4 x 4 linear operator composed of two 2 x 2 operators.
+  operator_1 = LinearOperatorFullMatrix([[1., 2.], [3., 4.]])
+  operator_2 = LinearOperatorFullMatrix([[1., 0.], [2., 1.]])
+  operator = LinearOperatorKronecker([operator_1, operator_2])
+
+  operator.to_dense()
+  ==> [[1., 2., 0., 0.],
+       [3., 4., 0., 0.],
+       [2., 4., 1., 2.],
+       [6., 8., 3., 4.]]
+
+  operator.shape
+  ==> [4, 4]
+
+  operator.log_abs_determinant()
+  ==> scalar Tensor
+
+  x = ... Shape [4, 2] Tensor
+  operator.matmul(x)
+  ==> Shape [4, 2] Tensor
+
+  # Create a [2, 3] batch of 4 x 5 linear operators.
+  matrix_45 = tf.random_normal(shape=[2, 3, 4, 5])
+  operator_45 = LinearOperatorFullMatrix(matrix)
+
+  # Create a [2, 3] batch of 5 x 6 linear operators.
+  matrix_56 = tf.random_normal(shape=[2, 3, 5, 6])
+  operator_56 = LinearOperatorFullMatrix(matrix_56)
+
+  # Compose to create a [2, 3] batch of 20 x 30 operators.
+  operator_large = LinearOperatorKronecker([operator_45, operator_56])
+
+  # Create a shape [2, 3, 20, 2] vector.
+  x = tf.random_normal(shape=[2, 3, 6, 2])
+  operator_large.matmul(x)
+  ==> Shape [2, 3, 30, 2] Tensor
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorKronecker` on any operation is equal to
+  the sum of the individual operators' operations.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operators,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorKronecker`.
+
+    `LinearOperatorKronecker` is initialized with a list of operators
+    `[op_1,...,op_J]`.
+
+    Args:
+      operators:  Iterable of `LinearOperator` objects, each with
+        the same `dtype` and composable shape, representing the Kronecker
+        factors.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix\
+            #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`.  Default is the individual
+        operators names joined with `_x_`.
+
+    Raises:
+      TypeError:  If all operators do not have the same `dtype`.
+      ValueError:  If `operators` is empty.
+    """
+    # Validate operators.
+    check_ops.assert_proper_iterable(operators)
+    operators = list(operators)
+    if not operators:
+      raise ValueError(
+          "Expected a list of >=1 operators. Found: %s" % operators)
+    self._operators = operators
+
+    # Validate dtype.
+    dtype = operators[0].dtype
+    for operator in operators:
+      if operator.dtype != dtype:
+        name_type = (str((o.name, o.dtype)) for o in operators)
+        raise TypeError(
+            "Expected all operators to have the same dtype.  Found %s"
+            % "   ".join(name_type))
+
+    # Auto-set and check hints.
+    # A Kronecker product is invertible, if and only if all factors are
+    # invertible.
+    if all(operator.is_non_singular for operator in operators):
+      if is_non_singular is False:
+        raise ValueError(
+            "The Kronecker product of non-singular operators is always "
+            "non-singular.")
+      is_non_singular = True
+
+    if all(operator.is_self_adjoint for operator in operators):
+      if is_self_adjoint is False:
+        raise ValueError(
+            "The Kronecker product of self-adjoint operators is always "
+            "self-adjoint.")
+      is_self_adjoint = True
+
+    # The eigenvalues of a Kronecker product are equal to the products of eigen
+    # values of the corresponding factors.
+    if all(operator.is_positive_definite for operator in operators):
+      if is_positive_definite is False:
+        raise ValueError("The Kronecker product of positive-definite operators "
+                         "is always positive-definite.")
+      is_positive_definite = True
+
+    # Initialization.
+    graph_parents = []
+    for operator in operators:
+      graph_parents.extend(operator.graph_parents)
+
+    if name is None:
+      name = operators[0].name
+      for operator in operators[1:]:
+        name += "_x_" + operator.name
+    with ops.name_scope(name, values=graph_parents):
+      super(LinearOperatorKronecker, self).__init__(
+          dtype=dtype,
+          graph_parents=graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operators(self):
+    return self._operators
+
+  def _shape(self):
+    # Get final matrix shape.
+    domain_dimension = self.operators[0].domain_dimension
+    for operator in self.operators[1:]:
+      domain_dimension *= operator.domain_dimension
+
+    range_dimension = self.operators[0].range_dimension
+    for operator in self.operators[1:]:
+      range_dimension *= operator.range_dimension
+
+    matrix_shape = tensor_shape.TensorShape([
+        range_dimension, domain_dimension])
+
+    # Get broadcast batch shape.
+    # broadcast_shape checks for compatibility.
+    batch_shape = self.operators[0].batch_shape
+    for operator in self.operators[1:]:
+      batch_shape = common_shapes.broadcast_shape(
+          batch_shape, operator.batch_shape)
+
+    return batch_shape.concatenate(matrix_shape)
+
+  def _shape_tensor(self):
+    domain_dimension = self.operators[0].domain_dimension_tensor()
+    for operator in self.operators[1:]:
+      domain_dimension *= operator.domain_dimension_tensor()
+
+    range_dimension = self.operators[0].range_dimension_tensor()
+    for operator in self.operators[1:]:
+      range_dimension *= operator.range_dimension_tensor()
+
+    matrix_shape = [range_dimension, domain_dimension]
+
+    # Get broadcast batch shape.
+    # broadcast_shape checks for compatibility.
+    batch_shape = self.operators[0].batch_shape_tensor()
+    for operator in self.operators[1:]:
+      batch_shape = array_ops.broadcast_dynamic_shape(
+          batch_shape, operator.batch_shape_tensor())
+
+    return array_ops.concat((batch_shape, matrix_shape), 0)
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    # Here we heavily rely on Roth's column Lemma [1]:
+    # (A x B) * vec X = vec BXA^T,
+    # where vec stacks all the columns of the matrix under each other. In our
+    # case, x represents a batch of vec X (i.e. we think of x as a batch of
+    # column vectors, rather than a matrix). Each member of the batch can be
+    # reshaped to a matrix (hence we get a batch of matrices).
+    # We can iteratively apply this lemma by noting that if B is a Kronecker
+    # product, then we can apply the lemma again.
+
+    # [1] W. E. Roth, "On direct product matrices,"
+    # Bulletin of the American Mathematical Society, vol. 40, pp. 461-468,
+    # 1934
+
+    # Efficiency
+
+    # Naively doing the Kronecker product, by calculating the dense matrix and
+    # applying it will can take cubic time in  the size of domain_dimension
+    # (assuming a square matrix). The other issue is that calculating the dense
+    # matrix can be prohibitively expensive, in that it can take a large amount
+    # of memory.
+    #
+    # This implementation avoids this memory blow up by only computing matmuls
+    # with the factors. In this way, we don't have to realize the dense matrix.
+    # In terms of complexity, if we have Kronecker Factors of size:
+    # (n1, n1), (n2, n2), (n3, n3), ... (nJ, nJ), with N = \prod n_i, and we
+    # have as input a [N, M] matrix, the naive approach would take O(N^2 M).
+    # With this approach (ignoring reshaping of tensors and transposes for now),
+    # the time complexity can be O(M * (\sum n_i) * N). There is also the
+    # benefit of batched multiplication (In this example, the batch size is
+    # roughly M * N) so this can be much faster. However, not factored in are
+    # the costs of the several transposing of tensors, which can affect cache
+    # behavior.
+
+    # Below we document the shape manipulation for adjoint=False,
+    # adjoint_arg=False, but the general case of different adjoints is still
+    # handled.
+
+    if adjoint_arg:
+      x = linalg.adjoint(x)
+
+    # Always add a batch dimension to enable broadcasting to work.
+    batch_shape = array_ops.concat(
+        [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
+    x += array_ops.zeros(batch_shape, dtype=x.dtype.base_dtype)
+
+    # x has shape [B, R, C], where B represent some number of batch dimensions,
+    # R represents the number of rows, and C represents the number of columns.
+    # In order to apply Roth's column lemma, we need to operate on a batch of
+    # column vectors, so we reshape into a batch of column vectors. We put it
+    # at the front to ensure that broadcasting between operators to the batch
+    # dimensions B still works.
+    output = _rotate_last_dim(x, rotate_right=True)
+
+    # Also expand the shape to be [A, C, B, R]. The first dimension will be
+    # used to accumulate dimensions from each operator matmul.
+    output = output[array_ops.newaxis, ...]
+
+    # In this loop, A is going to refer to the value of the accumulated
+    # dimension. A = 1 at the start, and will end up being self.range_dimension.
+    # V will refer to the last dimension. V = R at the start, and will end up
+    # being 1 in the end.
+    for operator in self.operators[:-1]:
+      # Reshape output from [A, C, B, V] to be
+      # [A, C, B, V / op.domain_dimension, op.domain_dimension]
+      if adjoint:
+        operator_dimension = operator.range_dimension_tensor()
+      else:
+        operator_dimension = operator.domain_dimension_tensor()
+
+      output = _unvec_by(output, operator_dimension)
+
+      # We are computing (XA^T) = (AX^T)^T.
+      # output has [A, C, B, V / op.domain_dimension, op.domain_dimension],
+      # which is being converted to:
+      # [A, C, B, V / op.domain_dimension, op.range_dimension]
+      output = array_ops.matrix_transpose(output)
+      output = operator.matmul(output, adjoint=adjoint, adjoint_arg=False)
+      output = array_ops.matrix_transpose(output)
+      # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension]
+      output = _rotate_last_dim(output, rotate_right=False)
+      output = _vec(output)
+      output = _rotate_last_dim(output, rotate_right=True)
+
+    # After the loop, we will have
+    # A = self.range_dimension / op[-1].range_dimension
+    # V = op[-1].domain_dimension
+
+    # We convert that using matvec to get:
+    # [A, C, B, op[-1].range_dimension]
+    output = self.operators[-1].matvec(output, adjoint=adjoint)
+    # Rearrange shape to be [B1, ... Bn, self.range_dimension, C]
+    output = _rotate_last_dim(output, rotate_right=False)
+    output = _vec(output)
+    output = _rotate_last_dim(output, rotate_right=False)
+
+    if x.shape.is_fully_defined():
+      column_dim = x.shape[-1]
+      broadcast_batch_shape = common_shapes.broadcast_shape(
+          x.shape[:-2], self.batch_shape)
+      if adjoint:
+        matrix_dimensions = [self.domain_dimension, column_dim]
+      else:
+        matrix_dimensions = [self.range_dimension, column_dim]
+
+      print("x: ", x)
+      print("bathc_shape:", self.batch_shape)
+      print("self.shape:", self.shape)
+      print("output: ", output)
+      output.set_shape(broadcast_batch_shape.concatenate(
+          matrix_dimensions))
+
+    return output
+
+  def _determinant(self):
+    # Note that we have |X1 x X2| = |X1| ** n * |X2| ** m, where X1 is an m x m
+    # matrix, and X2 is an n x n matrix. We can iteratively apply this property
+    # to get the determinant of |X1 x X2 x X3 ...|. If T is the product of the
+    # domain dimension of all operators, then we have:
+    # |X1 x X2 x X3 ...| =
+    #    |X1| ** (T / m) * |X2 x X3 ... | ** m =
+    #    |X1| ** (T / m) * |X2| ** (m * (T / m) / n) *  ... =
+    #    |X1| ** (T / m) * |X2| ** (T / n) * | X3 x X4... | ** (m * n)
+    #    And by doing induction we have product(|X_i| ** (T / dim(X_i))).
+    total = self.domain_dimension_tensor()
+    determinant = 1.
+    for operator in self.operators:
+      determinant *= operator.determinant() ** math_ops.cast(
+          total / operator.domain_dimension_tensor(),
+          dtype=operator.dtype)
+    return determinant
+
+  def _log_abs_determinant(self):
+    # This will be sum((total / dim(x_i)) * log |X_i|)
+    total = self.domain_dimension_tensor()
+    log_abs_det = 0.
+    for operator in self.operators:
+      log_abs_det += operator.log_abs_determinant() * math_ops.cast(
+          total / operator.domain_dimension_tensor(),
+          dtype=operator.dtype)
+    return log_abs_det
+
+  def _trace(self):
+    # tr(A x B) = tr(A) * tr(B)
+    trace = 1.
+    for operator in self.operators:
+      trace *= operator.trace()
+    return trace
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    # Here we follow the same use of Roth's column lemma as in `matmul`, with
+    # the key difference that we replace all `matmul` instances with `solve`.
+    # This follows from the property that inv(A x B) = inv(A) x inv(B).
+
+    # Below we document the shape manipulation for adjoint=False,
+    # adjoint_arg=False, but the general case of different adjoints is still
+    # handled.
+
+    if adjoint_arg:
+      rhs = linalg.adjoint(rhs)
+
+    # Always add a batch dimension to enable broadcasting to work.
+    batch_shape = array_ops.concat(
+        [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
+    rhs += array_ops.zeros(batch_shape, dtype=rhs.dtype.base_dtype)
+
+    # rhs has shape [B, R, C], where B represent some number of batch
+    # dimensions,
+    # R represents the number of rows, and C represents the number of columns.
+    # In order to apply Roth's column lemma, we need to operate on a batch of
+    # column vectors, so we reshape into a batch of column vectors. We put it
+    # at the front to ensure that broadcasting between operators to the batch
+    # dimensions B still works.
+    output = _rotate_last_dim(rhs, rotate_right=True)
+
+    # Also expand the shape to be [A, C, B, R]. The first dimension will be
+    # used to accumulate dimensions from each operator matmul.
+    output = output[array_ops.newaxis, ...]
+
+    # In this loop, A is going to refer to the value of the accumulated
+    # dimension. A = 1 at the start, and will end up being self.range_dimension.
+    # V will refer to the last dimension. V = R at the start, and will end up
+    # being 1 in the end.
+    for operator in self.operators[:-1]:
+      # Reshape output from [A, C, B, V] to be
+      # [A, C, B, V / op.domain_dimension, op.domain_dimension]
+      if adjoint:
+        operator_dimension = operator.range_dimension_tensor()
+      else:
+        operator_dimension = operator.domain_dimension_tensor()
+
+      output = _unvec_by(output, operator_dimension)
+
+      # We are computing (XA^-1^T) = (A^-1 X^T)^T.
+      # output has [A, C, B, V / op.domain_dimension, op.domain_dimension],
+      # which is being converted to:
+      # [A, C, B, V / op.domain_dimension, op.range_dimension]
+      output = array_ops.matrix_transpose(output)
+      output = operator.solve(output, adjoint=adjoint, adjoint_arg=False)
+      output = array_ops.matrix_transpose(output)
+      # Rearrange it to [A * op.range_dimension, C, B, V / op.domain_dimension]
+      output = _rotate_last_dim(output, rotate_right=False)
+      output = _vec(output)
+      output = _rotate_last_dim(output, rotate_right=True)
+
+    # After the loop, we will have
+    # A = self.range_dimension / op[-1].range_dimension
+    # V = op[-1].domain_dimension
+
+    # We convert that using matvec to get:
+    # [A, C, B, op[-1].range_dimension]
+    output = self.operators[-1].solvevec(output, adjoint=adjoint)
+    # Rearrange shape to be [B1, ... Bn, self.range_dimension, C]
+    output = _rotate_last_dim(output, rotate_right=False)
+    output = _vec(output)
+    output = _rotate_last_dim(output, rotate_right=False)
+
+    if rhs.shape.is_fully_defined():
+      column_dim = rhs.shape[-1]
+      broadcast_batch_shape = common_shapes.broadcast_shape(
+          rhs.shape[:-2], self.batch_shape)
+      if adjoint:
+        matrix_dimensions = [self.domain_dimension, column_dim]
+      else:
+        matrix_dimensions = [self.range_dimension, column_dim]
+
+      output.set_shape(broadcast_batch_shape.concatenate(
+          matrix_dimensions))
+
+    return output
+
+  def _diag_part(self):
+    diag_part = self.operators[0].diag_part()
+    for operator in self.operators[1:]:
+      diag_part = diag_part[..., :, array_ops.newaxis]
+      op_diag_part = operator.diag_part()[..., array_ops.newaxis, :]
+      diag_part *= op_diag_part
+      diag_part = array_ops.reshape(
+          diag_part,
+          shape=array_ops.concat(
+              [array_ops.shape(diag_part)[:-2], [-1]], axis=0))
+    if self.range_dimension > self.domain_dimension:
+      diag_dimension = self.domain_dimension
+    else:
+      diag_dimension = self.range_dimension
+    diag_part.set_shape(
+        self.batch_shape.concatenate(diag_dimension))
+    return diag_part
+
+  def _to_dense(self):
+    product = self.operators[0].to_dense()
+    for operator in self.operators[1:]:
+      # Product has shape [B, R1, 1, C1].
+      product = product[
+          ..., :, array_ops.newaxis, :, array_ops.newaxis]
+      # Operator has shape [B, 1, R2, 1, C2].
+      op_to_mul = operator.to_dense()[
+          ..., array_ops.newaxis, :, array_ops.newaxis, :]
+      # This is now [B, R1, R2, C1, C2].
+      product *= op_to_mul
+      # Now merge together dimensions to get [B, R1 * R2, C1 * C2].
+      product = array_ops.reshape(
+          product,
+          shape=array_ops.concat(
+              [array_ops.shape(product)[:-4],
+               [array_ops.shape(product)[-4] * array_ops.shape(product)[-3],
+                array_ops.shape(product)[-2] * array_ops.shape(product)[-1]]
+              ], axis=0))
+    product.set_shape(self.shape)
+    return product
+
+  def _assert_non_singular(self):
+    if all(operator.is_square for operator in self.operators):
+      asserts = [operator.assert_non_singular() for operator in self.operators]
+      return control_flow_ops.group(asserts)
+    else:
+      raise errors.InvalidArgumentError(
+          node_def=None, op=None, message="All Kronecker factors must be "
+          "square for the product to be invertible.")
+
+  def _assert_self_adjoint(self):
+    if all(operator.is_square for operator in self.operators):
+      asserts = [operator.assert_self_adjoint() for operator in self.operators]
+      return control_flow_ops.group(asserts)
+    else:
+      raise errors.InvalidArgumentError(
+          node_def=None, op=None, message="All Kronecker factors must be "
+          "square for the product to be self adjoint.")
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 9c8abb9740..7e4fb6a6fc 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -233,6 +233,12 @@ class LinearOperatorDerivedClassTest(test.TestCase):
   def _test_matmul(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
+        # If batch dimensions are omitted, but there are
+        # no batch dimensions for the linear operator, then
+        # skip the test case. This is already checked with
+        # with_batch=True.
+        if not with_batch and len(build_info.shape) <= 2:
+          continue
         for dtype in self._dtypes_to_test:
           for adjoint in self._adjoint_options:
             for adjoint_arg in self._adjoint_arg_options:
@@ -270,6 +276,12 @@ class LinearOperatorDerivedClassTest(test.TestCase):
   def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
+        # If batch dimensions are omitted, but there are
+        # no batch dimensions for the linear operator, then
+        # skip the test case. This is already checked with
+        # with_batch=True.
+        if not with_batch and len(build_info.shape) <= 2:
+          continue
         for dtype in self._dtypes_to_test:
           for adjoint in self._adjoint_options:
             for adjoint_arg in self._adjoint_arg_options:
-- 
GitLab


From 451070ab9e648db68830d7a13eeabaf630a1774d Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 16 Apr 2018 17:12:05 -0700
Subject: [PATCH 0903/1262] Don't rely on graph contruction for an initial
 shape inference.

PiperOrigin-RevId: 193124836
---
 tensorflow/core/grappler/costs/BUILD          |   1 +
 .../core/grappler/costs/graph_properties.cc   | 132 +++++++++++++-----
 .../core/grappler/costs/graph_properties.h    |  16 ++-
 .../grappler/costs/graph_properties_test.cc   |   4 +-
 4 files changed, 112 insertions(+), 41 deletions(-)

diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 33949319d5..ddbf7f3697 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -41,6 +41,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
+        "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 9fa2b7a259..a9c777e551 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
@@ -355,6 +356,8 @@ void VerboseLogUnknownDimensionSources(
 // information is refined.
 class TopoQueue {
  public:
+  explicit TopoQueue(const std::unordered_map<const Node*, int>& topo_order)
+      : queue_(CompareNodes(topo_order)) {}
   void push(const Node* n) { queue_.insert(n); }
   const Node* pop() {
     CHECK(!empty());
@@ -371,9 +374,15 @@ class TopoQueue {
   // Graph nodes are created in (roughly) topological order. Therefore we can
   // use their id to ensure they're sorted topologically.
   struct CompareNodes {
+    explicit CompareNodes(
+        const std::unordered_map<const Node*, int>& topo_ordering)
+        : topo_order(topo_ordering) {}
     bool operator()(const Node* lhs, const Node* rhs) const {
-      return lhs->id() < rhs->id();
+      return topo_order.at(lhs) < topo_order.at(rhs);
     }
+
+   private:
+    const std::unordered_map<const Node*, int>& topo_order;
   };
   std::set<const Node*, CompareNodes> queue_;
 };
@@ -689,9 +698,36 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
 // nodes to propagate any known shape from the Merge node.
 Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
                                         const Node* node, bool relax,
-                                        TopoQueue* new_shapes) {
+                                        bool* new_shapes) const {
   InferenceContext* c = shape_refiner->GetContext(node);
-  CHECK_NE(c, nullptr);
+  if (!c) {
+    // The shape refiner can't handle loops. Therefore we first need to remove
+    // all edges
+    std::vector<Edge> edges;
+    std::vector<const Edge*> edge_ptrs;
+    for (const Edge* edge : node->in_edges()) {
+      if (!edge->IsControlEdge()) {
+        edges.push_back(*edge);
+        edge_ptrs.push_back(edge);
+      }
+    }
+    for (const Edge* edge : edge_ptrs) {
+      if (!edge->IsControlEdge()) {
+        graph_->RemoveEdge(edge);
+      }
+    }
+    // Now we can run shape inference
+    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, relax, new_shapes));
+    // And add all the edges back
+    for (const Edge& edge : edges) {
+      graph_->AddEdge(edge.src(), edge.src_output(), edge.dst(),
+                      edge.dst_input());
+    }
+
+    c = shape_refiner->GetContext(node);
+    *new_shapes = true;
+    CHECK_NE(c, nullptr);
+  }
 
   ShapeHandle out1;
   TF_RETURN_IF_ERROR(c->WithRank(c->output(1), 0, &out1));
@@ -711,6 +747,11 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
     }
 
     InferenceContext* in = shape_refiner->GetContext(e->src());
+    if (!relax && !in) {
+      // Handling a loop for the first time, the back edge won't have any shape
+      // info.
+      continue;
+    }
     ShapeHandle input = in->output(e->src_output());
     if (relax) {
       c->RelaxInput(e->dst_input(), input);
@@ -731,7 +772,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 
   if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
     c->set_output(0, out);
-    new_shapes->push(node);
+    *new_shapes = true;
   }
 
   return Status::OK();
@@ -740,7 +781,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 Status GraphProperties::OverwriteFedPorts(
     SymbolicShapeRefiner* shape_refiner,
     const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-    const Node* node, TopoQueue* new_shapes) const {
+    const Node* node, bool* new_shapes) const {
   auto it = fed_ports.find(node->name());
   Status status;
   if (it != fed_ports.end()) {
@@ -749,7 +790,7 @@ Status GraphProperties::OverwriteFedPorts(
     for (const int output_port : it->second) {
       status.Update(shape_refiner->SetUnknownShape(node, output_port));
     }
-    new_shapes->push(node);
+    *new_shapes = true;
   }
   return status;
 }
@@ -758,9 +799,12 @@ Status GraphProperties::OverwriteFedPorts(
 // outputs.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                                     const Node* node, bool relax,
-                                    TopoQueue* new_shapes) {
+                                    bool* new_shapes) {
   auto enter_ctx = shape_refiner->GetContext(node);
-  CHECK_NE(enter_ctx, nullptr);
+  if (!enter_ctx) {
+    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, relax, new_shapes));
+    enter_ctx = shape_refiner->GetContext(node);
+  }
 
   for (const Edge* e : node->in_edges()) {
     if (e->IsControlEdge()) {
@@ -775,7 +819,7 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
         enter_ctx->MergeInput(0, input);
       }
       enter_ctx->set_output(0, input);
-      new_shapes->push(node);
+      *new_shapes = true;
     }
   }
   return Status::OK();
@@ -784,7 +828,7 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
 Status GraphProperties::UpdateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax,
     const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-    const Node* n, TopoQueue* new_shapes) const {
+    const Node* n, bool* new_shapes) const {
   if (n->IsEnter()) {
     // The Enter shape function always forwards an UnknownShape, so do the right
     // thing here.
@@ -800,7 +844,7 @@ Status GraphProperties::UpdateShapes(
       // We want to avoid propagating through loops on the merge pass because
       // the shapes are not guaranteed to converge.
       if (relax || !n->IsNextIteration()) {
-        new_shapes->push(n);
+        *new_shapes = true;
       }
     }
   }
@@ -837,11 +881,15 @@ Status GraphProperties::PropagateShapes(
     while (!new_shapes->empty() &&
            num_loop_iterations++ < max_loop_iterations) {
       const Node* n = new_shapes->pop();
-      for (const Edge* e : n->out_edges()) {
-        if (!e->IsControlEdge()) {
-          const Node* fanout = e->dst();
-          TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, fed_ports,
-                                          fanout, new_shapes));
+      bool updated = false;
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(shape_refiner, relax, fed_ports, n, &updated));
+      if (updated) {
+        for (const Edge* e : n->out_edges()) {
+          if (!e->IsControlEdge()) {
+            const Node* fanout = e->dst();
+            new_shapes->push(fanout);
+          }
         }
       }
     }
@@ -913,7 +961,12 @@ Status GraphProperties::UpdateResource(
                                                queue_shapes_and_types)) {
     qctx->set_output_handle_shapes_and_types(0, queue_shapes_and_types);
 
-    new_shapes->push(qnode);
+    for (const Edge* e : qnode->out_edges()) {
+      if (!e->IsControlEdge()) {
+        const Node* fanout = e->dst();
+        new_shapes->push(fanout);
+      }
+    }
   }
 
   return Status::OK();
@@ -923,6 +976,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
   Graph graph(function_library);
+  graph_ = &graph;
   ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   shape_refiner.set_require_shape_inference_fns(false);
   shape_refiner.set_disable_constant_propagation(true);
@@ -932,6 +986,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // the device placement of nodes has also completed, so there
   // is no need to validate colocation constraints again.
   options.validate_colocation_constraints = false;
+  options.validate_shape = false;
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
@@ -944,14 +999,29 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order));
+
+  std::unordered_map<string, int> order_by_name;
+  for (const auto topo : topo_order) {
+    order_by_name[topo.first->name()] = topo.second;
+  }
+
   // List the resources and the nodes using them. Also collect the Enter and
   // Merge nodes.
+  std::unordered_map<const Node*, int> graph_topo_order;
   std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
-  std::unordered_set<const Node*> enter_nodes;
   std::unordered_set<const Node*> merge_nodes;
   std::unordered_set<const Node*> fed_nodes;
+  std::unordered_set<const Node*> primary_inputs;
   int num_loops = 0;
   for (const Node* const node : graph.nodes()) {
+    auto it = order_by_name.find(node->name());
+    if (it == order_by_name.end()) {
+      continue;
+    }
+    graph_topo_order[node] = it->second;
+
     for (int i = 0; i < node->num_inputs(); ++i) {
       if (node->input_type(i) == DataType::DT_RESOURCE) {
         const Node* resource;
@@ -959,8 +1029,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
         resources[resource].insert(node);
       }
     }
-    if (node->IsEnter()) {
-      enter_nodes.insert(node);
+    if (node->num_inputs() == 0) {
+      primary_inputs.insert(node);
     } else if (node->IsMerge()) {
       merge_nodes.insert(node);
     } else if (node->IsNextIteration()) {
@@ -979,22 +1049,20 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // we exclusively relax shapes and propagate shapes through loops until
   // reaching fixed point.
   for (int relax = 0; relax < 2; relax++) {
-    TopoQueue new_shapes;
-    // Force the propagation of shapes of Enter nodes manually (the Enter shape
-    // function always forwards an UnknownShape).
-    for (const Node* node : enter_nodes) {
-      TF_RETURN_IF_ERROR(
-          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
-    }
+    TopoQueue new_shapes(graph_topo_order);
     // Seed the propagation of shapes through merge nodes.
-    for (const Node* node : merge_nodes) {
-      TF_RETURN_IF_ERROR(
-          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
+    if (relax) {
+      for (const Node* node : merge_nodes) {
+        new_shapes.push(node);
+      }
+    }
+    // Also seed the propagation of shapes in the fanout of primary inputs.
+    for (const Node* node : primary_inputs) {
+      new_shapes.push(node);
     }
     // Also seed the propagation of shapes in the fanout of fed nodes.
     for (const Node* node : fed_nodes) {
-      TF_RETURN_IF_ERROR(
-          OverwriteFedPorts(&refiner, fed_ports, node, &new_shapes));
+      new_shapes.push(node);
     }
     // Propagate shapes normally.
     TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 8ff572fe4f..30351f58fd 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -24,6 +24,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 
 namespace tensorflow {
+class Graph;
+
 namespace grappler {
 
 class SymbolicShapeRefiner;
@@ -95,24 +97,22 @@ class GraphProperties {
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.
-  static Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                                const Node* node, bool relax,
-                                TopoQueue* new_shapes);
+  Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, const Node* node,
+                         bool relax, bool* new_shapes) const;
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
-                            const Node* node, bool relax,
-                            TopoQueue* new_shapes);
+                            const Node* node, bool relax, bool* new_shapes);
   // Process a node that is used to feed the model.
   Status OverwriteFedPorts(
       SymbolicShapeRefiner* shape_refiner,
       const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-      const Node* node, TopoQueue* new_shapes) const;
+      const Node* node, bool* new_shapes) const;
   // Update the shapes for node 'n'. If output shapes for n have changed,
   // enqueue its fanout in 'new_shapes'.
   Status UpdateShapes(
       SymbolicShapeRefiner* shape_refiner, bool relax,
       const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-      const Node* n, TopoQueue* new_shapes) const;
+      const Node* n, bool* new_shapes) const;
   // Propagate the shapes for the nodes enqueued in new_shapes and their
   // transitive fanout until a fixed point is reached.
   Status PropagateShapes(
@@ -127,6 +127,8 @@ class GraphProperties {
   std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
   std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
   const std::vector<OpInfo::TensorProperties> missing_properties_;
+
+  Graph* graph_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index d3d89b59af..3de697bd37 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -303,9 +303,9 @@ TEST_F(GraphPropertiesTest, Queues) {
       root.WithOpName("Queue5"),
       {DataType::DT_FLOAT, DataType::DT_DOUBLE, DataType::DT_FLOAT});
   Output rnd2 =
-      ops::RandomNormal(root.WithOpName("rnd"), {10}, DataType::DT_DOUBLE);
+      ops::RandomNormal(root.WithOpName("rnd2"), {10}, DataType::DT_DOUBLE);
   Output rnd3 =
-      ops::RandomNormal(root.WithOpName("rnd"), {1, 2, 3}, DataType::DT_FLOAT);
+      ops::RandomNormal(root.WithOpName("rnd3"), {1, 2, 3}, DataType::DT_FLOAT);
   auto enqueue5 =
       ops::QueueEnqueue(root.WithOpName("Enqueue5"), q5, {rnd, rnd2, rnd3});
   auto dequeue5 = ops::QueueDequeue(
-- 
GitLab


From d0345d2d863d50e9db56dc03b1792ec3c4e193c1 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 16 Apr 2018 17:25:12 -0700
Subject: [PATCH 0904/1262] [tf.data] Sort the results of `tf.matching_files()`
 to enable `Dataset.list_files()` to be determinstic.

PiperOrigin-RevId: 193126572
---
 tensorflow/core/kernels/matching_files_op.cc  |  1 +
 .../list_files_dataset_op_test.py             | 48 +++++++++++++++++++
 tensorflow/python/data/ops/dataset_ops.py     | 16 ++++---
 .../api/golden/tensorflow.data.-dataset.pbtxt |  2 +-
 ...ow.data.-fixed-length-record-dataset.pbtxt |  2 +-
 .../tensorflow.data.-t-f-record-dataset.pbtxt |  2 +-
 .../tensorflow.data.-text-line-dataset.pbtxt  |  2 +-
 7 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index cdff7bad5f..7912ca1563 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -60,6 +60,7 @@ class MatchingFilesOp : public OpKernel {
         output(index++) = all_fnames[i][j];
       }
     }
+    std::sort(&output(0), &output(0) + num_files);
   }
 };
 
diff --git a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
index 6442eb9ff5..f7d7d085c9 100644
--- a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
@@ -69,6 +69,54 @@ class ListFilesDatasetOpTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(itr.get_next())
 
+  def testSimpleDirectoryNotShuffled(self):
+    filenames = ['b', 'c', 'a']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False)
+    with self.test_session() as sess:
+      itr = dataset.make_one_shot_iterator()
+      next_element = itr.get_next()
+
+      for filename in sorted(filenames):
+        self.assertEqual(compat.as_bytes(path.join(self.tmp_dir, filename)),
+                         sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testFixedSeedResultsInRepeatableOrder(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
+    with self.test_session() as sess:
+      itr = dataset.make_initializable_iterator()
+      next_element = itr.get_next()
+
+      full_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename))
+                        for filename in filenames]
+
+      all_produced_filenames = []
+      for _ in range(3):
+        produced_filenames = []
+        sess.run(itr.initializer)
+        try:
+          while True:
+            produced_filenames.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+        all_produced_filenames.append(produced_filenames)
+
+      # Each run should produce the same set of filenames, which may be
+      # different from the order of `full_filenames`.
+      self.assertItemsEqual(full_filenames, all_produced_filenames[0])
+      # However, the different runs should produce filenames in the same order
+      # as each other.
+      self.assertEqual(all_produced_filenames[0], all_produced_filenames[1])
+      self.assertEqual(all_produced_filenames[0], all_produced_filenames[2])
+
   def testEmptyDirectoryInitializer(self):
     filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
     dataset = dataset_ops.Dataset.list_files(filename_placeholder)
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 406f172e59..bd9686f692 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -571,9 +571,13 @@ class Dataset(object):
     return PrefetchDataset(self, buffer_size)
 
   @staticmethod
-  def list_files(file_pattern, shuffle=None):
+  def list_files(file_pattern, shuffle=None, seed=None):
     """A dataset of all files matching a pattern.
 
+    NOTE: The default behavior of this method is to return filenames in
+    a non-deterministic random shuffled order. Pass a `seed` or `shuffle=False`
+    to get results in a deterministic order.
+
     Example:
       If we had the following files on our filesystem:
         - /path/to/dir/a.txt
@@ -584,20 +588,18 @@ class Dataset(object):
         - /path/to/dir/b.py
         - /path/to/dir/c.py
 
-    NOTE: The order of the file names returned can be non-deterministic even
-    when `shuffle` is `False`.
-
     Args:
       file_pattern: A string or scalar string `tf.Tensor`, representing
         the filename pattern that will be matched.
       shuffle: (Optional.) If `True`, the file names will be shuffled randomly.
         Defaults to `True`.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        random seed that will be used to create the distribution. See
+        @{tf.set_random_seed} for behavior.
 
     Returns:
      Dataset: A `Dataset` of strings corresponding to file names.
     """
-    # TODO(b/73959787): Add a `seed` argument and make the `shuffle=False`
-    # behavior deterministic (e.g. by sorting the filenames).
     if shuffle is None:
       shuffle = True
     matching_files = gen_io_ops.matching_files(file_pattern)
@@ -607,7 +609,7 @@ class Dataset(object):
       # list of files might be empty.
       buffer_size = math_ops.maximum(
           array_ops.shape(matching_files, out_type=dtypes.int64)[0], 1)
-      dataset = dataset.shuffle(buffer_size)
+      dataset = dataset.shuffle(buffer_size, seed=seed)
     return dataset
 
   def repeat(self, count=None):
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
index 0900adaf76..cbbd077c97 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
@@ -64,7 +64,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\', \'shuffle\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 7b16ac90c9..9a56ae8675 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -65,7 +65,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\', \'shuffle\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
index 9cf5f2ae20..e5ec824bb8 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -65,7 +65,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\', \'shuffle\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
index 8c3d669143..008239789c 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
@@ -65,7 +65,7 @@ tf_class {
   }
   member_method {
     name: "list_files"
-    argspec: "args=[\'file_pattern\', \'shuffle\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "make_initializable_iterator"
-- 
GitLab


From 7e073a01639f8424408776771dbb0d634fccc3f2 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Mon, 16 Apr 2018 17:36:07 -0700
Subject: [PATCH 0905/1262] Fixes for review requests

---
 tensorflow/contrib/tensorrt/BUILD             | 19 ++++---
 ...egration.py => tf_trt_integration_test.py} | 52 +++++--------------
 2 files changed, 26 insertions(+), 45 deletions(-)
 rename tensorflow/contrib/tensorrt/test/{test_integration.py => tf_trt_integration_test.py} (77%)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index d116114db0..d382adb986 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -139,6 +139,7 @@ tf_custom_op_py_library(
     ]),
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:resources",
     ],
@@ -173,6 +174,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":wrap_conversion",
+        "//tensorflow/python:tf_optimizer",
     ],
 )
 
@@ -274,10 +276,13 @@ tf_cc_test(
 )
 
 py_test(
-  name = "tf_trt_integration_test",
-  srcs = ["test/test_integration.py"],
-  srcs_version = "PY2AND3",
-  deps = [
-    ":init_py"
-  ]
-)
\ No newline at end of file
+    name = "tf_trt_integration_test",
+    srcs = ["test/tf_trt_integration_test.py"],
+    main = "test/tf_trt_integration_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":init_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
diff --git a/tensorflow/contrib/tensorrt/test/test_integration.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
similarity index 77%
rename from tensorflow/contrib/tensorrt/test/test_integration.py
rename to tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
index 97915c2659..b17fdd52b2 100644
--- a/tensorflow/contrib/tensorrt/test/test_integration.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -18,29 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 import warnings
+import numpy as np
 
 from tensorflow.contrib import tensorrt as trt
 from tensorflow.core.protobuf import config_pb2 as cpb2
-from tensorflow.python.client import session as csess
-from tensorflow.python.framework import test_util
 from tensorflow.python.framework import constant_op as cop
 from tensorflow.python.framework import dtypes as dtypes
 from tensorflow.python.framework import importer as importer
 from tensorflow.python.framework import ops as ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops as aops
 from tensorflow.python.ops import nn as nn
 from tensorflow.python.ops import nn_ops as nn_ops
 from tensorflow.python.platform import googletest
-from tensorflow.python.platform import test
 
 
 @test_util.with_c_api
 class IntegrationTest(test_util.TensorFlowTestCase):
-
+  """Class to test Tensorflow-TensorRT integration."""
   def setUp(self):
-    """ Setup method """
+    """Setup method."""
     super(IntegrationTest, self).setUp()
     warnings.simplefilter('always')
     inp_dims = (100, 24, 24, 2)
@@ -103,8 +101,8 @@ class IntegrationTest(test_util.TensorFlowTestCase):
           graph_def=gdef, return_elements=["input", "output"])
       inp = inp.outputs[0]
       out = out.outputs[0]
-      # run over real calibration data here, we are mimicking a calibration set of
-      # 30 different batches. Use as much calibration data as you want
+      # run over real calibration data here, we are mimicking a calibration
+      # set of 30 different batches. Use as much calibration data as you want
     with self.test_session(
         graph=g, config=self._config, use_gpu=True,
         force_gpu=True) as sess:
@@ -113,42 +111,20 @@ class IntegrationTest(test_util.TensorFlowTestCase):
     return val
 
   def get_trt_graph(self, mode):
-    """  return trt converted graph """
-    if mode == "FP32":
-      return trt.create_inference_graph(
-          input_graph_def=self._original_graph,
-          outputs=["output"],
-          max_batch_size=self._input.shape[0],
-          max_workspace_size_bytes=1 << 25,
-          precision_mode=
-          "FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
-          minimum_segment_size=2  # minimum number of nodes in an engine
-          )
-    elif mode == "FP16":
-      return trt.create_inference_graph(
-          input_graph_def=self._original_graph,
-          outputs=["output"],
-          max_batch_size=self._input.shape[0],
-          max_workspace_size_bytes=1 << 25,
-          precision_mode=
-          "FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
-          minimum_segment_size=2  # minimum number of nodes in an engine
-          )
-    elif mode == "INT8":
+    """Return trt converted graph."""
+    if mode in  ["FP32", "FP16", "INT8"]:
       return trt.create_inference_graph(
           input_graph_def=self._original_graph,
           outputs=["output"],
           max_batch_size=self._input.shape[0],
           max_workspace_size_bytes=1 << 25,
-          precision_mode=
-          "INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
+          precision_mode=mode,  # TRT Engine precision "FP32","FP16" or "INT8"
           minimum_segment_size=2  # minimum number of nodes in an engine
           )
-
     return None
 
   def testFP32(self):
-    """ Test FP32 conversion. Results should be identical to native case """
+    """Test FP32 conversion. Results should be identical to native case."""
     trt_graph = self.get_trt_graph("FP32")
     result = self.run_graph(trt_graph, self._input)
     self.assertAllEqual(self._reference, result)
@@ -156,21 +132,21 @@ class IntegrationTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(result1, result)
 
   def testFP16(self):
-    """ Test FP16 conversion. Results may be different from native case """
+    """Test FP16 conversion. Results may be different from native case."""
     trt_graph = self.get_trt_graph("FP16")
     result = self.run_graph(trt_graph, self._input)
-    self.assertAllClose(self._reference, result,rtol=1.e-03)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
     result1 = self.run_graph(trt_graph, self._input)
     self.assertAllEqual(result1, result)
 
   def testINT8(self):
-    """ Test INT8 conversion. Results may be different from native case """
+    """Test INT8 conversion. Results may be different from native case."""
     calib_graph = self.get_trt_graph("INT8")
     result = self.run_calibration(calib_graph, self._input)
     self.assertAllEqual(self._reference, result)
     int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
     result = self.run_graph(int8_graph, self._input)
-    self.assertAllClose(self._reference, result,rtol=1.e-03)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
     result1 = self.run_graph(int8_graph, self._input)
     self.assertAllEqual(result1, result)
 
-- 
GitLab


From 7ee54c2f7cdbc7098627a56b4f084f7b6654b662 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Mon, 16 Apr 2018 17:34:01 -0700
Subject: [PATCH 0906/1262] Remove deprecated/unused python related Bazel
 options.

Since py_runtime was introduced, Bazel ignores options such as
--force_python2 and --python2_path. Deleting to clean stuff up and
make sure people are not misled.

PiperOrigin-RevId: 193127681
---
 configure.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/configure.py b/configure.py
index 8fb8979111..b745e374a2 100644
--- a/configure.py
+++ b/configure.py
@@ -226,8 +226,6 @@ def setup_python(environ_cp):
   # Set-up env variables used by python_configure.bzl
   write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path)
   write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path)
-  write_to_bazelrc('build --force_python=py%s' % python_major_version)
-  write_to_bazelrc('build --host_force_python=py%s' % python_major_version)
   write_to_bazelrc('build --python_path=\"%s"' % python_bin_path)
   environ_cp['PYTHON_BIN_PATH'] = python_bin_path
 
-- 
GitLab


From 1516756b0297b3642689b06128358aeefd67a321 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 18:03:05 -0700
Subject: [PATCH 0907/1262] Adding min node weight regularization

PiperOrigin-RevId: 193131300
---
 .../python/estimator/boosted_trees.py         | 18 +++-
 ...tedTreesCalculateBestGainsPerFeature.pbtxt |  8 +-
 .../core/kernels/boosted_trees/stats_ops.cc   |  9 ++
 tensorflow/core/ops/boosted_trees_ops.cc      |  1 +
 .../core/ops/compat/ops_history.v1.pbtxt      |  4 +
 .../python/estimator/canned/boosted_trees.py  | 85 ++++++++++---------
 .../estimator/canned/boosted_trees_test.py    |  3 +-
 .../boosted_trees/stats_ops_test.py           | 51 +++++++++++
 ....estimator.-boosted-trees-classifier.pbtxt |  2 +-
 ...w.estimator.-boosted-trees-regressor.pbtxt |  2 +-
 10 files changed, 138 insertions(+), 45 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 314c54ed00..00356ce0ca 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -36,6 +36,7 @@ class _BoostedTreesEstimator(estimator.Estimator):
                l1_regularization=0.,
                l2_regularization=0.,
                tree_complexity=0.,
+               min_node_weight=0.,
                config=None):
     """Initializes a `BoostedTreesEstimator` instance.
 
@@ -65,13 +66,16 @@ class _BoostedTreesEstimator(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
     """
     # pylint:disable=protected-access
     # HParams for the model.
     tree_hparams = canned_boosted_trees._TreeHParams(
         n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+        tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return canned_boosted_trees._bt_model_fn(
@@ -96,6 +100,7 @@ def boosted_trees_classifier_train_in_memory(
     l1_regularization=0.,
     l2_regularization=0.,
     tree_complexity=0.,
+    min_node_weight=0.,
     config=None,
     train_hooks=None):
   """Trains a boosted tree classifier with in memory dataset.
@@ -162,6 +167,9 @@ def boosted_trees_classifier_train_in_memory(
     l2_regularization: regularization multiplier applied to the square weights
       of the tree leafs.
     tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
     config: `RunConfig` object to configure the runtime settings.
     train_hooks: a list of Hook instances to be passed to estimator.train().
 
@@ -184,7 +192,7 @@ def boosted_trees_classifier_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity)
+      tree_complexity, min_node_weight)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
@@ -220,6 +228,7 @@ def boosted_trees_regressor_train_in_memory(
     l1_regularization=0.,
     l2_regularization=0.,
     tree_complexity=0.,
+    min_node_weight=0.,
     config=None,
     train_hooks=None):
   """Trains a boosted tree regressor with in memory dataset.
@@ -279,6 +288,9 @@ def boosted_trees_regressor_train_in_memory(
     l2_regularization: regularization multiplier applied to the square weights
       of the tree leafs.
     tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
     config: `RunConfig` object to configure the runtime settings.
     train_hooks: a list of Hook instances to be passed to estimator.train().
 
@@ -300,7 +312,7 @@ def boosted_trees_regressor_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity)
+      tree_complexity, min_node_weight)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index 7f18c64574..3f181e91ce 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -29,6 +29,12 @@ END
     name: "tree_complexity"
     description: <<END
 adjustment to the gain, per leaf based.
+END
+  }
+  in_arg {
+    name: "min_node_weight"
+    description: <<END
+mininum avg of hessians in a node before required for the node to be considered for splitting.
 END
   }
   out_arg {
@@ -84,4 +90,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 40f50333d3..6dfcd63ab3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -60,6 +60,10 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->input("tree_complexity", &tree_complexity_t));
     const auto tree_complexity = tree_complexity_t->scalar<float>()();
+    const Tensor* min_node_weight_t;
+    OP_REQUIRES_OK(context,
+                   context->input("min_node_weight", &min_node_weight_t));
+    const auto min_node_weight = min_node_weight_t->scalar<float>()();
 
     // Allocate output lists of tensors:
     OpOutputList output_node_ids_list;
@@ -105,6 +109,11 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
           cum_grad.push_back(total_grad);
           cum_hess.push_back(total_hess);
         }
+        // Check if node has enough of average hessian.
+        if (total_hess < min_node_weight) {
+          // Do not split the node because not enough avg hessian.
+          continue;
+        }
         float best_gain = std::numeric_limits<float>::lowest();
         float best_bucket = 0;
         float best_contrib_for_left = 0.0;
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 4d74e6d63a..88d6eaf819 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -40,6 +40,7 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
     .Input("l1: float")
     .Input("l2: float")
     .Input("tree_complexity: float")
+    .Input("min_node_weight: float")
     .Attr("max_splits: int >= 1")
     .Attr("num_features: int >= 1")  // not passed but populated automatically.
     .Output("node_ids_list: num_features * int32")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 0af560010f..5bd37efac8 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10867,6 +10867,10 @@ op {
     name: "tree_complexity"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index d099d308f5..536bd2bf81 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -40,9 +40,11 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
-_TreeHParams = collections.namedtuple(
-    'TreeHParams',
-    ['n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity'])
+# TODO(nponomareva): Reveal pruning params here.
+_TreeHParams = collections.namedtuple('TreeHParams', [
+    'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
+    'min_node_weight'
+])
 
 _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
 _HOLD_FOR_MULTI_DIM_SUPPORT = object()
@@ -397,6 +399,7 @@ def _bt_model_fn(
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
                  tree_complexity=tree_hparams.tree_complexity,
+                 min_node_weight=tree_hparams.min_node_weight,
                  max_splits=max_splits))
         grow_op = boosted_trees_ops.update_ensemble(
             # Confirm if local_tree_ensemble or tree_ensemble should be used.
@@ -515,21 +518,21 @@ def _create_regression_head(label_dimension, weight_column=None):
 class BoostedTreesClassifier(estimator.Estimator):
   """A Classifier for Tensorflow Boosted Trees models."""
 
-  def __init__(
-      self,
-      feature_columns,
-      n_batches_per_layer,
-      model_dir=None,
-      n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
-      weight_column=None,
-      label_vocabulary=None,
-      n_trees=100,
-      max_depth=6,
-      learning_rate=0.1,
-      l1_regularization=0.,
-      l2_regularization=0.,
-      tree_complexity=0.,
-      config=None):
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
+               weight_column=None,
+               label_vocabulary=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
     """Initializes a `BoostedTreesClassifier` instance.
 
     Example:
@@ -593,6 +596,9 @@ class BoostedTreesClassifier(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
 
     Raises:
@@ -606,9 +612,9 @@ class BoostedTreesClassifier(estimator.Estimator):
         n_classes, weight_column, label_vocabulary=label_vocabulary)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
@@ -630,20 +636,20 @@ class BoostedTreesClassifier(estimator.Estimator):
 class BoostedTreesRegressor(estimator.Estimator):
   """A Regressor for Tensorflow Boosted Trees models."""
 
-  def __init__(
-      self,
-      feature_columns,
-      n_batches_per_layer,
-      model_dir=None,
-      label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
-      weight_column=None,
-      n_trees=100,
-      max_depth=6,
-      learning_rate=0.1,
-      l1_regularization=0.,
-      l2_regularization=0.,
-      tree_complexity=0.,
-      config=None):
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
     """Initializes a `BoostedTreesRegressor` instance.
 
     Example:
@@ -700,6 +706,9 @@ class BoostedTreesRegressor(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
 
     Raises:
@@ -712,9 +721,9 @@ class BoostedTreesRegressor(estimator.Estimator):
     head = _create_regression_head(label_dimension, weight_column)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 7823ef8410..56e67a6707 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -188,7 +188,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         learning_rate=0.1,
         l1=0.,
         l2=0.01,
-        tree_complexity=0.)
+        tree_complexity=0.,
+        min_node_weight=0.)
 
   def _get_expected_ensembles_for_classification(self):
     first_round = """
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index 4d09cf94d4..f0bb84e69a 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -59,6 +59,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=0.0,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -106,6 +107,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=0.1,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -154,6 +156,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=l1,
           l2=0.0,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
@@ -205,6 +208,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=l2,
           tree_complexity=tree_complexity,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -220,6 +224,53 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
                           sess.run(right_node_contribs_list))
 
+  def testCalculateBestGainsWithMinNodeWEight(self):
+    """Testing Gain calculation without any regularization."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .036], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .68], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .6], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .03], [-.4, .05], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          min_node_weight=1,
+          max_splits=max_splits)
+
+      # We can't split node 1 on feature 1 and node 2 on feature 2 because of
+      # the min node weight.
+      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
+      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllClose([[[0.4852941]], [[-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-0.75]], [[-0.014925]]],
+                          sess.run(right_node_contribs_list))
+
   def testMakeStatsSummarySimple(self):
     """Simple test for MakeStatsSummary."""
     with self.test_session():
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index fd9be8c759..53a903c239 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 6b305be43f..ba17c90de2 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "evaluate"
-- 
GitLab


From 10467d29e05d9957a6e3cb2335f8eeba1fd8896e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 00:52:20 +0000
Subject: [PATCH 0908/1262] Improve shape function check for `tf.roll`

The `tf.roll` op has requirements for the shape of inputs. However,
the shape of the inputs are only done at the runtime inside the kernel.
This fix improve the shape function so that the check could be
done early if shape is already known in the shape function.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/manip_ops.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index 95b4774fe6..1cc9182389 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -28,6 +28,12 @@ REGISTER_OP("Roll")
     .Attr("T: type")
     .Attr("Tshift: {int32,int64}")
     .Attr("Taxis: {int32,int64}")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // The `input` must be 1-D or higher
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &unused));
+
+      return shape_inference::UnchangedShape(c);
+    });
 
 }  // namespace tensorflow
-- 
GitLab


From 894af557bcb6a375990f2fe067e1fc9cb27631d2 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 00:55:37 +0000
Subject: [PATCH 0909/1262] Add test case for input shape of tf.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/manip_ops_test.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index 7948a475bb..0ef02ea10a 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import manip_ops
 from tensorflow.python.platform import test as test_lib
@@ -98,14 +100,20 @@ class RollTest(test_util.TensorFlowTestCase):
         manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
                        3, -10).eval()
 
+  def testInvalidInputShape(self):
+    # The input should be 1-D or higher, checked in shape function.
+    with self.assertRaisesRegexp(ValueError, "Shape must be at least rank 1 but is rank 0"):
+      roll = manip_ops.roll(7, 1, 0)
+
   def testRollInputMustVectorHigherRaises(self):
-    tensor = 7
+    # The input should be 1-D or higher, checked is done in kernel.
+    tensor = array_ops.placeholder(dtype=dtypes.int32)
     shift = 1
     axis = 0
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "input must be 1-D or higher"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7})
 
   def testRollAxisMustBeScalarOrVectorRaises(self):
     tensor = [[1, 2], [3, 4]]
-- 
GitLab


From 3f796ff8c9e6d7ff88f99c056b78e88fb0b31114 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:01:50 +0000
Subject: [PATCH 0910/1262] Add axis shape check for tf.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/manip_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index 1cc9182389..3dd6dfabfc 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -32,6 +32,8 @@ REGISTER_OP("Roll")
       shape_inference::ShapeHandle unused;
       // The `input` must be 1-D or higher
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &unused));
+      // The `axis` must be scalar or 1-D.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
 
       return shape_inference::UnchangedShape(c);
     });
-- 
GitLab


From 2b86827637d09e0c231db2ff481a7f083566f4ed Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:02:03 +0000
Subject: [PATCH 0911/1262] Add test case for axis shape check with tf.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/manip_ops_test.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index 0ef02ea10a..b6b3b9260b 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -115,14 +115,20 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "input must be 1-D or higher"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7})
 
+  def testInvalidAxisShape(self):
+    # The axis should be a scalar or 1-D, checked in shape function.
+    with self.assertRaisesRegexp(ValueError, "Shape must be at most rank 1 but is rank 2"):
+      roll = manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
+
   def testRollAxisMustBeScalarOrVectorRaises(self):
+    # The axis should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
     shift = 1
-    axis = [[0, 1]]
+    axis = array_ops.placeholder(dtype=dtypes.int32)
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "axis must be a scalar or a 1-D vector"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]})
 
   def testRollShiftMustBeScalarOrVectorRaises(self):
     tensor = [[1, 2], [3, 4]]
-- 
GitLab


From 851177fee860211e2fabcb019d644e75b7f701b0 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:04:40 +0000
Subject: [PATCH 0912/1262] Add shape check for shift of tf.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/manip_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index 3dd6dfabfc..8461b1db9f 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -32,6 +32,8 @@ REGISTER_OP("Roll")
       shape_inference::ShapeHandle unused;
       // The `input` must be 1-D or higher
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &unused));
+      // The `shift` must be scalar or 1-D.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
       // The `axis` must be scalar or 1-D.
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
 
-- 
GitLab


From 2622adbb4d8f5d6a5a545df7a2fa46eb7de6384b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:05:17 +0000
Subject: [PATCH 0913/1262] Add test case for shape check with shift in tf.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/manip_ops_test.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b6b3b9260b..4539dd5c2c 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -130,14 +130,20 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "axis must be a scalar or a 1-D vector"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]})
 
+  def testInvalidShiftShape(self):
+    # The shift should be a scalar or 1-D, checked in shape function.
+    with self.assertRaisesRegexp(ValueError, "Shape must be at most rank 1 but is rank 2"):
+      roll = manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
+
   def testRollShiftMustBeScalarOrVectorRaises(self):
+    # The shift should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
-    shift = [[0, 1]]
+    shift = array_ops.placeholder(dtype=dtypes.int32)
     axis = 1
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "shift must be a scalar or a 1-D vector"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]})
 
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
     tensor = [[1, 2], [3, 4]]
-- 
GitLab


From 1dbc6712045108d0d50f6a3b7d5a749322b6843a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:14:03 +0000
Subject: [PATCH 0914/1262] Check in shape function that axis and shift are
 same size

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/manip_ops.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index 8461b1db9f..c90b2b22cf 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -36,7 +36,8 @@ REGISTER_OP("Roll")
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
       // The `axis` must be scalar or 1-D.
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
-
+      // Validate 'shift' is the same shape as axis'.
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &unused) );
       return shape_inference::UnchangedShape(c);
     });
 
-- 
GitLab


From 59275fe1327d1611d717578b0983b59f845b943b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:14:25 +0000
Subject: [PATCH 0915/1262] Add test case for axis and shift shape equal check
 for tf.roll

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/manip_ops_test.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index 4539dd5c2c..786df5cc7b 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -145,14 +145,20 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "shift must be a scalar or a 1-D vector"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]})
 
+  def testInvalidShiftAndAxisNotEqualShape(self):
+    # The shift and axis must be same size, checked in shape function.
+    with self.assertRaisesRegexp(ValueError, "both shapes must be equal"):
+      roll = manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
+
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    # The shift and axis must be same size, checked in kernel.
     tensor = [[1, 2], [3, 4]]
-    shift = [1]
+    shift = array_ops.placeholder(dtype=dtypes.int32)
     axis = [0, 1]
     with self.test_session():
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "shift and axis must have the same size"):
-        manip_ops.roll(tensor, shift, axis).eval()
+        manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [1]})
 
   def testRollAxisOutOfRangeRaises(self):
     tensor = [1, 2]
-- 
GitLab


From 99345da1fe6079b263612ce1dd9b1cafc87eb146 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:15:00 +0000
Subject: [PATCH 0916/1262] Sanitize with clang-format -i --style=Google

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/manip_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
index c90b2b22cf..e180f3d5f6 100644
--- a/tensorflow/core/ops/manip_ops.cc
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -37,7 +37,7 @@ REGISTER_OP("Roll")
       // The `axis` must be scalar or 1-D.
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
       // Validate 'shift' is the same shape as axis'.
-      TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &unused) );
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &unused));
       return shape_inference::UnchangedShape(c);
     });
 
-- 
GitLab


From f5fafb421e2a951180acacc2612204a7a66720fb Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 12 Apr 2018 22:58:29 +0000
Subject: [PATCH 0917/1262] Using xrange from six

In python 2 vs 3 xrange is different. This fix is an enhancement
to use xrange from six, instead of additional logic of handling
xrange in python 2 vs python 3.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/confusion_matrix_test.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index 670a625f0f..e05355ac03 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-
+from six.moves import xrange
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -104,10 +104,7 @@ class ConfusionMatrixTest(test.TestCase):
       d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, s: 1.0})
 
       truth = np.zeros([2, 2], dtype=np_dtype)
-      try:
-        range_builder = xrange
-      except NameError:  # In Python 3.
-        range_builder = range
+      range_builder = xrange
       for i in range_builder(len(d)):
         truth[l[i], d[i]] += 1
 
-- 
GitLab


From b7d01e6d99f3b7e2fc14a0a28e50c7622f73085c Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:22:02 +0000
Subject: [PATCH 0918/1262] Pylint issue fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/confusion_matrix_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index e05355ac03..9fe4dd0a67 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 from six.moves import xrange
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
-- 
GitLab


From 508361ae6c09dac7e1de2f8e2de0ef832ce4bca4 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:23:00 +0000
Subject: [PATCH 0919/1262] Disable pylint: disable=redefined-builtin

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/confusion_matrix_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index 9fe4dd0a67..116e5e4e5a 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from six.moves import xrange
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-- 
GitLab


From 7586dee9aa8b4b63143ab658ca59658aaed0df97 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:28:30 +0000
Subject: [PATCH 0920/1262] Add shape check to TFRecordDataset

The inputs of TFRecordDataset have the requirements for shapes.
However, the check was not done in the shape function. This fix
adds shape checks whenever possible.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 8a7185e005..47a0c0b88f 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -417,7 +417,12 @@ REGISTER_OP("TFRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("Iterator")
     .Output("handle: resource")
-- 
GitLab


From d97ffbdf362fa7d06ef8d946c8620ff7a3a50a08 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:30:42 +0000
Subject: [PATCH 0921/1262] Add shape check for compression_type in
 TFrecordDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 47a0c0b88f..ce28a9c798 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -421,6 +421,8 @@ REGISTER_OP("TFRecordDataset")
       shape_inference::ShapeHandle unused;
       // `filenames` must be a scalar or a vector.
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // `compression_type` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused) );
       return shape_inference::ScalarShape(c);
     });
 
-- 
GitLab


From c4dea2255c71037c9cade9cbd1d7820b3429b3fa Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:31:54 +0000
Subject: [PATCH 0922/1262] Add shape check for buffer_size with
 TFRecordDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index ce28a9c798..c551eb0e1a 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -423,6 +423,8 @@ REGISTER_OP("TFRecordDataset")
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
       // `compression_type` could only be a scalar.
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused) );
+      // `buffer_size` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused) );
       return shape_inference::ScalarShape(c);
     });
 
-- 
GitLab


From 6ad2fcaabd88c876de61c6c3804d7075f0e65b3f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:32:41 +0000
Subject: [PATCH 0923/1262] Sanitize with clan-format -i --style=Google

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index c551eb0e1a..7f4d63b024 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -422,9 +422,9 @@ REGISTER_OP("TFRecordDataset")
       // `filenames` must be a scalar or a vector.
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
       // `compression_type` could only be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused) );
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       // `buffer_size` could only be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused) );
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return shape_inference::ScalarShape(c);
     });
 
-- 
GitLab


From dfae914b3e1564ea61cbd8934c0184401ae66e9a Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Mon, 16 Apr 2018 18:31:22 -0700
Subject: [PATCH 0924/1262] Add a simple Profiler and instrument operator
 invocations in Interpreter.

PiperOrigin-RevId: 193133955
---
 tensorflow/contrib/lite/BUILD                 |   1 +
 tensorflow/contrib/lite/interpreter.cc        |   4 +
 tensorflow/contrib/lite/interpreter.h         |  12 +-
 tensorflow/contrib/lite/profiling/BUILD       |  44 +++++
 .../contrib/lite/profiling/profile_buffer.h   | 150 +++++++++++++++
 .../lite/profiling/profile_buffer_test.cc     | 102 ++++++++++
 tensorflow/contrib/lite/profiling/profiler.h  | 174 ++++++++++++++++++
 .../contrib/lite/profiling/profiler_test.cc   | 105 +++++++++++
 8 files changed, 591 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/lite/profiling/BUILD
 create mode 100644 tensorflow/contrib/lite/profiling/profile_buffer.h
 create mode 100644 tensorflow/contrib/lite/profiling/profile_buffer_test.cc
 create mode 100644 tensorflow/contrib/lite/profiling/profiler.h
 create mode 100644 tensorflow/contrib/lite/profiling/profiler_test.cc

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 9c4533079c..1534f97d76 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -137,6 +137,7 @@ cc_library(
         "//tensorflow/contrib/lite/kernels:eigen_support",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/nnapi:nnapi_lib",
+        "//tensorflow/contrib/lite/profiling:profiler",
         "//tensorflow/contrib/lite/schema:schema_fbs",
     ],
 )
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index ff8524f12e..91b6c414bf 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/interpreter.h"
+
 #include <cassert>
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+
 #include "tensorflow/contrib/lite/arena_planner.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
@@ -26,6 +28,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
 #include "tensorflow/contrib/lite/memory_planner.h"
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
+#include "tensorflow/contrib/lite/profiling/profiler.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/util.h"
 
@@ -544,6 +547,7 @@ TfLiteStatus Interpreter::Invoke() {
     TfLiteNode& node = nodes_and_registration_[node_index].first;
     const TfLiteRegistration& registration =
         nodes_and_registration_[node_index].second;
+    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
 
     // TODO(ycling): This is an extra loop through inputs to check if the data
     // need to be copied from Delegate buffer to raw memory, which is often not
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 3c776aacb6..a49134b95e 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -20,10 +20,12 @@ limitations under the License.
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
+
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/memory_planner.h"
+#include "tensorflow/contrib/lite/profiling/profiler.h"
 
 namespace tflite {
 
@@ -321,6 +323,12 @@ class Interpreter {
                                TfLiteBufferHandle* buffer_handle,
                                TfLiteDelegate** delegate);
 
+  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+
+  profiling::Profiler* GetProfiler(profiling::Profiler* profiler) {
+    return profiler_;
+  }
+
   // The default capacity of `tensors_` vector.
   static constexpr int kTensorsReservedCapacity = 128;
   // The capacity headroom of `tensors_` vector before calling ops'
@@ -532,8 +540,10 @@ class Interpreter {
 
   std::unique_ptr<MemoryPlanner> memory_planner_;
 
-  // WARNING: This is an experimental interface that is subject to change.
   bool allow_buffer_handle_output_ = false;
+
+  // Profiler for this interpreter instance.
+  profiling::Profiler* profiler_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
new file mode 100644
index 0000000000..15999e5d41
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+common_copts = [
+    "-Wall",
+]
+
+cc_library(
+    name = "profiler",
+    hdrs = ["profiler.h"],
+    copts = common_copts,
+    deps = [":profile_buffer"],
+)
+
+cc_test(
+    name = "profiler_test",
+    srcs = ["profiler_test.cc"],
+    copts = ["-DTFLITE_PROFILING_ENABLED"],
+    defines = ["TFLITE_PROFILING_ENABLED"],
+    deps = [
+        ":profiler",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "profile_buffer",
+    hdrs = ["profile_buffer.h"],
+    copts = common_copts,
+)
+
+cc_test(
+    name = "profile_buffer_test",
+    srcs = ["profile_buffer_test.cc"],
+    copts = ["-DTFLITE_PROFILING_ENABLED"],
+    defines = ["TFLITE_PROFILING_ENABLED"],
+    deps = [
+        ":profile_buffer",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h
new file mode 100644
index 0000000000..3bfe02571b
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profile_buffer.h
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace tflite {
+namespace profiling {
+
+// A profiling event.
+struct ProfileEvent {
+  // Describes the type of event.
+  // The event_metadata field may contain additional data for interpreting
+  // the event.
+  enum class EventType {
+    // Default event type, the metadata field has no special significance.
+    DEFAULT = 0,
+    // The event is an operator invocation and the event_metadata field is the
+    // index of operator node.
+    OPERATOR_INVOKE_EVENT = 1
+  };
+
+  // Label of the event. This usually describes the event.
+  const char* tag;
+  // Timestamp in microseconds when the event began.
+  int64_t begin_timestamp_ms;
+  // Timestamp in microseconds when the event ended.
+  int64_t end_timestamp_ms;
+  // The field containing the type of event. This must be one of the event types
+  // in EventType.
+  EventType event_type;
+  // Extra data describing the details of the event.
+  uint32_t event_metadata;
+};
+}  // namespace profiling
+}  // namespace tflite
+
+#ifdef TFLITE_PROFILING_ENABLED
+
+#include <sys/time.h>
+#include <vector>
+
+namespace tflite {
+namespace profiling {
+constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1;
+
+// A ring buffer of profile events.
+// This class is not thread safe.
+class ProfileBuffer {
+ public:
+  ProfileBuffer(uint32_t max_num_entries, bool enabled)
+      : enabled_(enabled), current_index_(0), event_buffer_(max_num_entries) {}
+
+  // Adds an event to the buffer with begin timestamp set to the current
+  // timestamp. Returns a handle to event that can be used to call EndEvent. If
+  // buffer is disabled this has no affect.
+  // The tag of the event should remain valid till the buffer is valid.
+  uint32_t BeginEvent(const char* tag, ProfileEvent::EventType event_type,
+                      uint32_t event_metadata) {
+    if (!enabled_) {
+      return kInvalidEventHandle;
+    }
+    int64_t timestamp = NowMicros();
+    int index = current_index_ % event_buffer_.size();
+    event_buffer_[index].tag = tag;
+    event_buffer_[index].event_type = event_type;
+    event_buffer_[index].event_metadata = event_metadata;
+    event_buffer_[index].begin_timestamp_ms = timestamp;
+    event_buffer_[index].end_timestamp_ms = 0;
+    current_index_++;
+    return index;
+  }
+
+  // Sets the enabled state of buffer to |enabled|
+  void SetEnabled(bool enabled) { enabled_ = enabled; }
+
+  // Sets the end timestamp for event for the handle to current time.
+  // If the buffer is disabled or previous event has been overwritten this
+  // operation has not effect.
+  void EndEvent(uint32_t event_handle) {
+    if (!enabled_ || event_handle == kInvalidEventHandle ||
+        event_handle > current_index_) {
+      return;
+    }
+    const uint32_t max_size = event_buffer_.size();
+    if (current_index_ > (max_size + event_handle)) {
+      // Ignore, buffer has already overflowed.
+      return;
+    }
+
+    int event_index = event_handle % max_size;
+    event_buffer_[event_index].end_timestamp_ms = NowMicros();
+  }
+
+  // Returns the size of the buffer.
+  size_t Size() const {
+    return (current_index_ >= event_buffer_.size()) ? event_buffer_.size()
+                                                    : current_index_;
+  }
+
+  // Resets the buffer.
+  void Reset() {
+    enabled_ = false;
+    current_index_ = 0;
+  }
+
+  // Returns the profile event at the given index. If the index is invalid a
+  // nullptr is returned. The return event may get overwritten if more events
+  // are added to buffer.
+  const struct ProfileEvent* const At(int index) const {
+    size_t size = Size();
+    if (index >= size) {
+      return nullptr;
+    }
+    const uint32_t max_size = event_buffer_.size();
+    uint32_t start =
+        (current_index_ > max_size) ? current_index_ % max_size : max_size;
+    index = (index + start) % max_size;
+    return &event_buffer_[index];
+  }
+
+ private:
+  static int64_t NowMicros() {
+    // TODO(shashishekhar): Refactor this to a separate file.
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+  bool enabled_;
+  uint32_t current_index_;
+  std::vector<ProfileEvent> event_buffer_;
+};
+}  // namespace profiling
+}  // namespace tflite
+#endif  // TFLITE_PROFILING_ENABLED
+#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
new file mode 100644
index 0000000000..0c5f0cd314
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/profiling/profile_buffer.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace profiling {
+
+namespace {
+
+std::vector<const ProfileEvent*> GetProfileEvents(const ProfileBuffer& buffer) {
+  std::vector<const ProfileEvent*> events;
+  for (auto i = 0; i < buffer.Size(); i++) {
+    events.push_back(buffer.At(i));
+  }
+  return events;
+}
+
+TEST(ProfileBufferTest, Empty) {
+  ProfileBuffer buffer(/*max_size*/ 0, /*enabled*/ true);
+  EXPECT_EQ(0, buffer.Size());
+}
+
+TEST(ProfileBufferTest, AddEvent) {
+  ProfileBuffer buffer(/*max_size*/ 10, /*enabled*/ true);
+  EXPECT_EQ(0, buffer.Size());
+  auto event_handle = buffer.BeginEvent(
+      "hello", ProfileEvent::EventType::DEFAULT, /* event_metadata */ 42);
+
+  EXPECT_GE(event_handle, 0);
+  EXPECT_EQ(1, buffer.Size());
+
+  auto event = GetProfileEvents(buffer)[0];
+  EXPECT_EQ(event->tag, "hello");
+  EXPECT_GT(event->begin_timestamp_ms, 0);
+  EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT);
+  EXPECT_EQ(event->event_metadata, 42);
+
+  buffer.EndEvent(event_handle);
+  EXPECT_EQ(1, buffer.Size());
+  EXPECT_GE(event->end_timestamp_ms, event->begin_timestamp_ms);
+}
+
+TEST(ProfileBufferTest, OverFlow) {
+  const int max_size = 4;
+  ProfileBuffer buffer{max_size, true};
+  std::vector<std::string> eventNames = {"first", "second", "third", "fourth"};
+  for (int i = 0; i < 2 * max_size; i++) {
+    buffer.BeginEvent(eventNames[i % 4].c_str(),
+                      ProfileEvent::EventType::DEFAULT, i);
+    size_t expected_size = std::min(i + 1, max_size);
+    EXPECT_EQ(expected_size, buffer.Size());
+  }
+  EXPECT_EQ(max_size, buffer.Size());
+  for (int j = 0; j < buffer.Size(); ++j) {
+    auto event = buffer.At(j);
+    EXPECT_EQ(eventNames[j % 4], event->tag);
+    EXPECT_EQ(ProfileEvent::EventType::DEFAULT, event->event_type);
+    EXPECT_EQ(4 + j, event->event_metadata);
+  }
+}
+
+TEST(ProfileBufferTest, Enable) {
+  ProfileBuffer buffer(/*max_size*/ 10, /*enabled*/ false);
+  EXPECT_EQ(0, buffer.Size());
+  auto event_handle = buffer.BeginEvent(
+      "hello", ProfileEvent::EventType::DEFAULT, /* event_metadata */ 42);
+  EXPECT_EQ(kInvalidEventHandle, event_handle);
+  EXPECT_EQ(0, buffer.Size());
+  buffer.SetEnabled(true);
+  event_handle = buffer.BeginEvent("hello", ProfileEvent::EventType::DEFAULT,
+                                   /* event_metadata */ 42);
+  EXPECT_GE(event_handle, 0);
+  EXPECT_EQ(1, buffer.Size());
+}
+
+}  // namespace
+}  // namespace profiling
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/profiling/profiler.h b/tensorflow/contrib/lite/profiling/profiler.h
new file mode 100644
index 0000000000..dfa98a6708
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profiler.h
@@ -0,0 +1,174 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/profiling/profile_buffer.h"
+
+#ifdef TFLITE_PROFILING_ENABLED
+
+namespace tflite {
+namespace profiling {
+class ScopedProfile;
+class ScopedOperatorProfile;
+
+// Controls whether profiling is enabled or disabled and collects profiles.
+// TFLite is used on platforms that don't have posix threads, so the profiler is
+// kept as simple as possible. It is designed to be used only on a single
+// thread.
+//
+// Profiles are collected using Scoped*Profile objects that begin and end a
+// profile event.
+// An example usage is shown in the example below:
+//
+// Say Worker class has a DoWork method and we are interested in profiling
+// the overall execution time for DoWork and time spent in Task1 and Task2
+// functions.
+//
+// class Worker {
+//  public:
+//   void DoWork() {
+//    ScopedProfile(&controller, "DoWork");
+//    Task1();
+//    Task2();
+//    .....
+//   }
+//
+//   void Task1() {
+//    ScopedProfile(&controller, "Task1");
+//    ....
+//   }
+//
+//   void Task2() {
+//    ScopedProfile(&controller, "Task2");
+//   }
+//
+//    Profiler profiler;
+// }
+//
+// We instrument the functions that need to be profiled.
+//
+// Profile can be collected by enable profiling and then getting profile
+// events.
+//
+//  void ProfileWorker() {
+//    Worker worker;
+//    worker.profiler.EnableProfiling();
+//    worker.DoWork();
+//    worker.profiler.DisableProfiling();
+//    // Profiling is complete, extract profiles.
+//    auto profile_events = worker.profiler.GetProfiles();
+//  }
+//
+//
+class Profiler {
+ public:
+  Profiler() : buffer_(1024, false) {}
+
+  void StartProfiling() { buffer_.SetEnabled(true); }
+  void StopProfiling() { buffer_.SetEnabled(false); }
+  void Reset() { buffer_.Reset(); }
+  std::vector<const ProfileEvent*> GetProfileEvents() {
+    std::vector<const ProfileEvent*> profile_events;
+    profile_events.reserve(buffer_.Size());
+    for (int i = 0; i < buffer_.Size(); i++) {
+      profile_events.push_back(buffer_.At(i));
+    }
+    return profile_events;
+  }
+
+ private:
+  friend class ScopedProfile;
+  friend class ScopedOperatorProfile;
+  ProfileBuffer* GetProfileBuffer() { return &buffer_; }
+  ProfileBuffer buffer_;
+};
+
+class ScopedProfile {
+ public:
+  // Adds a profile event to profile that begins with the construction
+  // of object and ends when the object goes out of scope.
+  // The lifetime of tag should be at least the lifetime of profiler.
+  ScopedProfile(Profiler* profiler, const char* tag) {
+    if (profiler) {
+      buffer_ = profiler->GetProfileBuffer();
+      event_handle_ =
+          buffer_->BeginEvent(tag, ProfileEvent::EventType::DEFAULT, 0);
+    }
+  }
+  ~ScopedProfile() {
+    if (buffer_) {
+      buffer_->EndEvent(event_handle_);
+    }
+  }
+
+ private:
+  ProfileBuffer* buffer_;
+  int32_t event_handle_;
+};
+
+class ScopedOperatorProfile {
+ public:
+  // Adds a profile event to profile that begins with the construction
+  // of object and ends when the object goes out of scope.
+  // The lifetime of tag should be at least the lifetime of profiler.
+  ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index) {
+    if (profiler) {
+      buffer_ = profiler->GetProfileBuffer();
+      event_handle_ = buffer_->BeginEvent(
+          tag, ProfileEvent::EventType::OPERATOR_INVOKE_EVENT, node_index);
+    }
+  }
+
+  ~ScopedOperatorProfile() {
+    if (buffer_) {
+      buffer_->EndEvent(event_handle_);
+    }
+  }
+
+ private:
+  ProfileBuffer* buffer_;
+  int32_t event_handle_;
+};
+
+}  // namespace profiling
+}  // namespace tflite
+
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index)                       \
+  tflite::profiling::ScopedOperatorProfile _profile((profiler), "OpInvoke", \
+                                                    (node_index))
+#else
+
+namespace tflite {
+namespace profiling {
+// A noop version of profiler when profiling is disabled.
+class Profiler {
+ public:
+  Profiler() {}
+  void StartProfiling() {}
+  void StopProfiling() {}
+  void Reset() {}
+  std::vector<const ProfileEvent*> GetProfileEvents() { return {}; }
+};
+}  // namespace profiling
+}  // namespace tflite
+
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index)
+
+#endif  // TFLITE_PROFILING_ENABLED
+
+#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc
new file mode 100644
index 0000000000..994523a8fb
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/profiler_test.cc
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+
+#include <chrono>  // NOLINT(build/c++11)
+#include <cmath>
+#include <thread>  // NOLINT(build/c++11)
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/profiling/profiler.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace profiling {
+namespace {
+
+void AssertDurationOfEventAroundMs(const ProfileEvent* event,
+                                   double expected_ms, double eps_ms) {
+  double duration_ms =
+      (event->end_timestamp_ms - event->begin_timestamp_ms) / 1e3;
+  EXPECT_NEAR(expected_ms, duration_ms, eps_ms);
+}
+
+void SleepForQuarterSecond(Profiler* profiler) {
+  ScopedProfile profile(profiler, "SleepForQuarter");
+  std::this_thread::sleep_for(std::chrono::milliseconds(250));
+}
+
+void ChildFunction(Profiler* profiler) {
+  ScopedProfile profile(profiler, "Child");
+  SleepForQuarterSecond(profiler);
+}
+
+void ParentFunction(Profiler* profiler) {
+  ScopedProfile profile(profiler, "Parent");
+  for (int i = 0; i < 2; i++) {
+    ChildFunction(profiler);
+  }
+}
+
+TEST(ProfilerTest, NoProfilesAreCollectedWhenDisabled) {
+  Profiler profiler;
+  ParentFunction(&profiler);
+  auto profile_events = profiler.GetProfileEvents();
+  EXPECT_EQ(0, profile_events.size());
+}
+
+TEST(ProfilingTest, ProfilesAreCollected) {
+  Profiler profiler;
+  profiler.StartProfiling();
+  ParentFunction(&profiler);
+  profiler.StopProfiling();
+  auto profile_events = profiler.GetProfileEvents();
+  // ParentFunction calls the ChildFunction 2 times.
+  // Each ChildFunction calls SleepForQuarterSecond once.
+  // We expect 1 entry for ParentFunction, 2 for ChildFunction and 2 for
+  // SleepForQuarterSecond: Total: 1+ 2 + 2 = 5
+  //  Profiles should look like:
+  //  Parent ~ 500 ms (due to 2 Child calls)
+  //   - Child ~ 250 ms (due to SleepForQuarter calls)
+  //       - SleepForQuarter ~ 250ms
+  //   - Child ~ 250 ms (due to SleepForQuarter calls)
+  //      - SleepForQuarter ~ 250ms
+  //
+  ASSERT_EQ(5, profile_events.size());
+  EXPECT_EQ("Parent", profile_events[0]->tag);
+  EXPECT_EQ("Child", profile_events[1]->tag);
+  EXPECT_EQ("SleepForQuarter", profile_events[2]->tag);
+  EXPECT_EQ("Child", profile_events[3]->tag);
+  EXPECT_EQ("SleepForQuarter", profile_events[4]->tag);
+
+  AssertDurationOfEventAroundMs(profile_events[0], /*expected_ms*/ 500,
+                                /*eps_ms*/ 2);
+  AssertDurationOfEventAroundMs(profile_events[1], /*expected_ms*/ 250,
+                                /*eps_ms*/ 2);
+  AssertDurationOfEventAroundMs(profile_events[2], /*expected_ms*/ 250,
+                                /*eps_ms*/ 2);
+  AssertDurationOfEventAroundMs(profile_events[3], /*expected_ms*/ 250,
+                                /*eps_ms*/ 2);
+  AssertDurationOfEventAroundMs(profile_events[4], /*expected_ms*/ 250,
+                                /*eps_ms*/ 2);
+}
+
+}  // namespace
+}  // namespace profiling
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
-- 
GitLab


From 9b24fb8d04c37d488eb5066a61f8c56171cbe0f0 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Mon, 16 Apr 2018 18:41:28 -0700
Subject: [PATCH 0925/1262] Remove proto imports in header files for
 core/kernels/hexagon.

The goal is to make kernels mostly independent of proto headers, which will let us lock down our .so imports.

PiperOrigin-RevId: 193134710
---
 .../core/framework/graph_transfer_info.proto  |  91 ++++++-------
 tensorflow/core/kernels/hexagon/BUILD         |   1 +
 .../kernels/hexagon/graph_transfer_utils.cc   |   2 +
 .../kernels/hexagon/graph_transfer_utils.h    |   4 +-
 .../core/kernels/hexagon/graph_transferer.cc  | 126 +++++++++---------
 .../core/kernels/hexagon/graph_transferer.h   |  21 +--
 .../kernels/hexagon/graph_transferer_test.cc  |  37 +++--
 .../hexagon/hexagon_control_wrapper.cc        |  36 +++--
 .../kernels/hexagon/hexagon_control_wrapper.h |   4 +-
 .../hexagon/hexagon_graph_execution_test.cc   |  29 ++--
 10 files changed, 179 insertions(+), 172 deletions(-)

diff --git a/tensorflow/core/framework/graph_transfer_info.proto b/tensorflow/core/framework/graph_transfer_info.proto
index 016259ddbf..41dd54d78c 100644
--- a/tensorflow/core/framework/graph_transfer_info.proto
+++ b/tensorflow/core/framework/graph_transfer_info.proto
@@ -8,6 +8,46 @@ option java_package = "org.tensorflow.framework";
 
 import "tensorflow/core/framework/types.proto";
 
+message GraphTransferNodeInput {
+  int32 node_id = 1;
+  int32 output_port = 2;
+}
+message GraphTransferNodeInfo {
+  string name = 1;
+  int32 node_id = 2;
+  string type_name = 3;
+  int32 soc_op_id = 4;
+  int32 padding_id = 5;
+  int32 input_count = 6;
+  int32 output_count = 7;
+};
+message GraphTransferConstNodeInfo {
+  string name = 1;
+  int32 node_id = 2;
+  repeated int64 shape = 3;
+  bytes data = 4;
+  DataType dtype = 5;
+};
+message GraphTransferNodeInputInfo {
+  int32 node_id = 1;
+  repeated GraphTransferNodeInput node_input = 2;
+};
+message GraphTransferNodeOutputInfo {
+  int32 node_id = 1;
+  repeated int32 max_byte_size = 2;
+};
+message GraphTransferGraphInputNodeInfo {
+  string name = 1;
+  repeated int64 shape = 2;
+  DataType dtype = 3;
+}
+
+message GraphTransferGraphOutputNodeInfo {
+  string name = 1;
+  repeated int64 shape = 2;
+  DataType dtype = 3;
+}
+
 // Protocol buffer representing a handle to a tensorflow resource. Handles are
 // not valid across executions, but can be serialized back and forth from within
 // a single run.
@@ -16,53 +56,14 @@ message GraphTransferInfo {
     NOP = 0;
     HEXAGON = 1;
   }
-  message NodeInput {
-    int32 node_id = 1;
-    int32 output_port = 2;
-  }
-  message NodeInfo {
-    string name = 1;
-    int32 node_id = 2;
-    string type_name = 3;
-    int32 soc_op_id = 4;
-    int32 padding_id = 5;
-    int32 input_count = 6;
-    int32 output_count = 7;
-  };
-  message ConstNodeInfo {
-    string name = 1;
-    int32 node_id = 2;
-    repeated int64 shape = 3;
-    bytes data = 4;
-    DataType dtype = 5;
-  };
-  message NodeInputInfo {
-    int32 node_id = 1;
-    repeated NodeInput node_input = 2;
-  };
-  message NodeOutputInfo {
-    int32 node_id = 1;
-    repeated int32 max_byte_size = 2;
-  };
-  message GraphInputNodeInfo {
-    string name = 1;
-    repeated int64 shape = 2;
-    DataType dtype = 3;
-  }
-
-  message GraphOutputNodeInfo {
-    string name = 1;
-    repeated int64 shape = 2;
-    DataType dtype = 3;
-  }
 
-  repeated NodeInfo node_info = 1;
-  repeated ConstNodeInfo const_node_info = 2;
-  repeated NodeInputInfo node_input_info = 3;
-  repeated NodeOutputInfo node_output_info = 4;
+  repeated GraphTransferNodeInfo node_info = 1;
+  repeated GraphTransferConstNodeInfo const_node_info = 2;
+  repeated GraphTransferNodeInputInfo node_input_info = 3;
+  repeated GraphTransferNodeOutputInfo node_output_info = 4;
   // Input Node parameters of transferred graph
-  repeated GraphInputNodeInfo graph_input_node_info = 5;
-  repeated GraphOutputNodeInfo graph_output_node_info = 6;
+  repeated GraphTransferGraphInputNodeInfo graph_input_node_info = 5;
+  repeated GraphTransferGraphOutputNodeInfo graph_output_node_info = 6;
   // Destination of graph transfer
   Destination destination = 7;
 };
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 4870d9ae20..66aeec5105 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -70,6 +70,7 @@ tf_kernel_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index 4040bf52bf..40bf5a4dc7 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/const_op.h"
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
index 352d548bd3..ada96ae4ea 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
@@ -20,14 +20,14 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/hexagon/graph_transferer.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
+class RemoteFusedGraphExecuteInfo;
+
 class GraphTransferUtils {
  public:
   static std::priority_queue<std::tuple<float, int, string>>
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index 0963dff5fa..7960cb4b05 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <algorithm>
 #include <cinttypes>
 
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -73,6 +75,12 @@ static Node* FindMutableNodeByName(const string& name, Graph* graph) {
   return nullptr;
 }
 
+GraphTransferer::GraphTransferer() {
+  graph_transfer_info_ = new GraphTransferInfo();
+}
+
+GraphTransferer::~GraphTransferer() { delete graph_transfer_info_; }
+
 /**
  * graph loading functions
  * - LoadGraphFromProto
@@ -142,8 +150,8 @@ Status GraphTransferer::LoadGraphFromProto(
 
   for (const std::pair<string, Tensor>& input_node_info :
        input_node_info_list) {
-    GraphTransferInfo::GraphInputNodeInfo& graph_input_node_info =
-        *graph_transfer_info_.add_graph_input_node_info();
+    GraphTransferGraphInputNodeInfo& graph_input_node_info =
+        *graph_transfer_info_->add_graph_input_node_info();
     graph_input_node_info.set_name(input_node_info.first);
     graph_input_node_info.set_dtype(input_node_info.second.dtype());
     for (const int64 dim : ToTensorShapeArray(input_node_info.second.shape())) {
@@ -159,8 +167,8 @@ Status GraphTransferer::LoadGraphFromProto(
     const Node* node = node_name_cache_list_.at(node_id);
     CHECK_NOTNULL(node);
 
-    GraphTransferInfo::GraphOutputNodeInfo& graph_output_node_info =
-        *graph_transfer_info_.add_graph_output_node_info();
+    GraphTransferGraphOutputNodeInfo& graph_output_node_info =
+        *graph_transfer_info_->add_graph_output_node_info();
     graph_output_node_info.set_name(strings::StrCat(node_name, ":", port));
 
     // Get output tensor shape type
@@ -231,17 +239,17 @@ Status GraphTransferer::LoadGraphFromProtoFile(
 
 void GraphTransferer::SortParams(const std::vector<string>& output_node_names) {
   // TODO(satok): optimize complexity
-  std::unordered_map<int, GraphTransferInfo::NodeInputInfo*> input_map;
-  for (GraphTransferInfo::NodeInputInfo& input :
-       *graph_transfer_info_.mutable_node_input_info()) {
+  std::unordered_map<int, GraphTransferNodeInputInfo*> input_map;
+  for (GraphTransferNodeInputInfo& input :
+       *graph_transfer_info_->mutable_node_input_info()) {
     input_map.emplace(input.node_id(), &input);
   }
 
   // Setup dependency map placeholder
   std::vector<int> output_node_ids;
   std::unordered_map<int, std::unordered_set<int>> dependency_map;
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info_.node_info()) {
+  for (const GraphTransferNodeInfo& params :
+       graph_transfer_info_->node_info()) {
     const int node_id = params.node_id();
     for (const string& output_node_name : output_node_names) {
       if (params.name() == output_node_name) {
@@ -255,7 +263,7 @@ void GraphTransferer::SortParams(const std::vector<string>& output_node_names) {
       continue;
     }
     CHECK_EQ(input_map.count(node_id), 1);
-    for (const GraphTransferInfo::NodeInput& node_input :
+    for (const GraphTransferNodeInput& node_input :
          input_map.at(node_id)->node_input()) {
       dependency_map.at(node_id).emplace(node_input.node_id());
     }
@@ -267,8 +275,8 @@ void GraphTransferer::SortParams(const std::vector<string>& output_node_names) {
     FillDependencyRec(output_node_id, dependency_map, completed);
   }
 
-  std::sort(graph_transfer_info_.mutable_node_info()->begin(),
-            graph_transfer_info_.mutable_node_info()->end(),
+  std::sort(graph_transfer_info_->mutable_node_info()->begin(),
+            graph_transfer_info_->mutable_node_info()->end(),
             TransferParamsComparator(dependency_map));
 }
 
@@ -278,15 +286,15 @@ void GraphTransferer::EnableStrictCheckMode(const bool enable) {
 
 void GraphTransferer::SetSerializedGraphTransferInfo(
     const string& serialized_proto) {
-  graph_transfer_info_.ParseFromString(serialized_proto);
+  graph_transfer_info_->ParseFromString(serialized_proto);
 }
 
 const GraphTransferInfo& GraphTransferer::GetGraphTransferInfo() const {
-  return graph_transfer_info_;
+  return *graph_transfer_info_;
 }
 
 GraphTransferInfo& GraphTransferer::GetMutableGraphTransferInfo() {
-  return graph_transfer_info_;
+  return *graph_transfer_info_;
 }
 
 void GraphTransferer::CacheNode(const Node& node) {
@@ -473,8 +481,8 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
   data_size = max_bytes_per_data * num_output_elements;
   shape_array = BuildShapeArray(shape_handle, context);
 
-  GraphTransferInfo::ConstNodeInfo& const_node_info =
-      *graph_transfer_info_.add_const_node_info();
+  GraphTransferConstNodeInfo& const_node_info =
+      *graph_transfer_info_->add_const_node_info();
   const_node_info.set_name(node.name());
   const_node_info.set_node_id(id);
   // TODO(satok): Make this generic. Never assume rank is 4.
@@ -505,8 +513,8 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
     node_name_cache_list_.emplace_back(nullptr);
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(shape_name, id);
-    GraphTransferInfo::ConstNodeInfo& const_node_info =
-        *graph_transfer_info_.add_const_node_info();
+    GraphTransferConstNodeInfo& const_node_info =
+        *graph_transfer_info_->add_const_node_info();
     const_node_info.set_name(shape_name);
     const_node_info.set_node_id(id);
     // TODO(satok): Make this generic. Never assume rank is 5.
@@ -528,8 +536,8 @@ int GraphTransferer::RegisterConstTensor(const Tensor& tensor,
     node_name_cache_list_.emplace_back(nullptr);
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(node_name, id);
-    GraphTransferInfo::ConstNodeInfo& const_node_info =
-        *graph_transfer_info_.add_const_node_info();
+    GraphTransferConstNodeInfo& const_node_info =
+        *graph_transfer_info_->add_const_node_info();
     const_node_info.set_name(node_name);
     const_node_info.set_node_id(id);
     CHECK_EQ(4, SHAPE_ARRAY_SIZE);
@@ -558,8 +566,8 @@ int GraphTransferer::RegisterConstScalar(const DataType dt, const int val,
     node_name_cache_list_.emplace_back(nullptr);
     const int id = node_name_cache_list_.size() - 1;
     node_name_to_id_cache_map_.emplace(val_name, id);
-    GraphTransferInfo::ConstNodeInfo& const_node_info =
-        *graph_transfer_info_.add_const_node_info();
+    GraphTransferConstNodeInfo& const_node_info =
+        *graph_transfer_info_->add_const_node_info();
     const_node_info.set_name(val_name);
     const_node_info.set_node_id(id);
     // TODO(satok): Do not assume rank is 4 here.
@@ -715,8 +723,8 @@ void GraphTransferer::RegisterPadNode(
 
   CHECK_EQ(2, node.num_inputs());
 
-  GraphTransferInfo::NodeInputInfo& node_input_info =
-      *graph_transfer_info_.add_node_input_info();
+  GraphTransferNodeInputInfo& node_input_info =
+      *graph_transfer_info_->add_node_input_info();
   node_input_info.set_node_id(id);
 
   AddNodeInputByInputIndex(node, 0, &node_input_info);
@@ -761,8 +769,7 @@ void GraphTransferer::RegisterPadNode(
         new_const_tensor,
         strings::StrCat(input_node->name(), "_", node.name(), "_1"));
 
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
+    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
     node_input.set_node_id(id);
     node_input.set_output_port(0);
   } else {
@@ -849,8 +856,7 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
                                        const int padding, const int inputs_size,
                                        const std::vector<int>& extra_inputs,
                                        const int outputs_size) {
-  GraphTransferInfo::NodeInfo& node_info =
-      *graph_transfer_info_.add_node_info();
+  GraphTransferNodeInfo& node_info = *graph_transfer_info_->add_node_info();
   node_info.set_name(name);
   node_info.set_node_id(id);
   node_info.set_type_name(type);
@@ -863,7 +869,7 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
 
 void GraphTransferer::AddNodeInputByInputIndex(
     const Node& node, const int idx,
-    GraphTransferInfo::NodeInputInfo* node_input_info) {
+    GraphTransferNodeInputInfo* node_input_info) {
   const Edge* edge = nullptr;
   TF_CHECK_OK(node.input_edge(idx, &edge));
   const Node* input_node = edge->src();
@@ -873,7 +879,7 @@ void GraphTransferer::AddNodeInputByInputIndex(
   const std::string& op_name = input_node->name();
   CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name;
   const int src_id = node_name_to_id_cache_map_[op_name];
-  GraphTransferInfo::NodeInput& node_input = *node_input_info->add_node_input();
+  GraphTransferNodeInput& node_input = *node_input_info->add_node_input();
   node_input.set_node_id(src_id);
   node_input.set_output_port(port);
 }
@@ -882,15 +888,14 @@ void GraphTransferer::AppendNodeInputParams(
     const int id, const Node& node, const std::vector<int>& extra_inputs) {
   VLOG(1) << "Append input params: " << node.name() << ", " << node.num_inputs()
           << ", " << extra_inputs.size();
-  GraphTransferInfo::NodeInputInfo& node_input_info =
-      *graph_transfer_info_.add_node_input_info();
+  GraphTransferNodeInputInfo& node_input_info =
+      *graph_transfer_info_->add_node_input_info();
   node_input_info.set_node_id(id);
   for (int i = 0; i < node.num_inputs(); ++i) {
     AddNodeInputByInputIndex(node, i, &node_input_info);
   }
   for (const int extra_input : extra_inputs) {
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
+    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
     node_input.set_node_id(extra_input);
     node_input.set_output_port(0);
   }
@@ -900,8 +905,8 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
                                              const int id, const Node& node) {
   VLOG(1) << "Append output params: " << node.name() << ", "
           << node.num_outputs();
-  GraphTransferInfo::NodeOutputInfo& node_output_info =
-      *graph_transfer_info_.add_node_output_info();
+  GraphTransferNodeOutputInfo& node_output_info =
+      *graph_transfer_info_->add_node_output_info();
   node_output_info.set_node_id(id);
 
   std::vector<DataType> data_types;
@@ -1030,8 +1035,7 @@ GraphTransferer::TransferParamsComparator::TransferParamsComparator(
     : dependency_map_(dep_map) {}
 
 bool GraphTransferer::TransferParamsComparator::operator()(
-    const GraphTransferInfo::NodeInfo& obj0,
-    const GraphTransferInfo::NodeInfo& obj1) {
+    const GraphTransferNodeInfo& obj0, const GraphTransferNodeInfo& obj1) {
   const int node_id0 = obj0.node_id();
   const int node_id1 = obj1.node_id();
   bool obj0_uses_obj1 = false;
@@ -1114,8 +1118,8 @@ void GraphTransferer::ClearCache() {
 
 void GraphTransferer::DumpNodeTransferParams() const {
   LOG(INFO) << "*** Const Nodes ***";
-  for (const GraphTransferInfo::ConstNodeInfo& params :
-       graph_transfer_info_.const_node_info()) {
+  for (const GraphTransferConstNodeInfo& params :
+       graph_transfer_info_->const_node_info()) {
     // TODO(satok): Stop assuming shape size is 4.
     CHECK_EQ(params.shape_size(), 4);
     LOG(INFO) << "[ " << params.node_id() << " \"" << params.name()
@@ -1131,8 +1135,8 @@ void GraphTransferer::DumpNodeTransferParams() const {
   }
   LOG(INFO) << "******\n";
   LOG(INFO) << "*** Op Nodes ***";
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info_.node_info()) {
+  for (const GraphTransferNodeInfo& params :
+       graph_transfer_info_->node_info()) {
     LOG(INFO) << "[ " << params.node_id() << " \"" << params.name();
     LOG(INFO) << "  type: " << params.type_name();
     LOG(INFO) << "  padding: " << ToPaddingDebugString(params.padding_id());
@@ -1146,18 +1150,18 @@ void GraphTransferer::DumpNodeTransferParams() const {
   }
   LOG(INFO) << "******\n";
   LOG(INFO) << "*** Node input params ***";
-  for (const GraphTransferInfo::NodeInputInfo& params :
-       graph_transfer_info_.node_input_info()) {
+  for (const GraphTransferNodeInputInfo& params :
+       graph_transfer_info_->node_input_info()) {
     LOG(INFO) << "[ " << params.node_id() << " ]";
-    for (const GraphTransferInfo::NodeInput& node_input : params.node_input()) {
+    for (const GraphTransferNodeInput& node_input : params.node_input()) {
       LOG(INFO) << "    src node id = " << node_input.node_id()
                 << ", output port = " << node_input.output_port();
     }
   }
   LOG(INFO) << "******\n";
   LOG(INFO) << "*** Node output params ***";
-  for (const GraphTransferInfo::NodeOutputInfo& params :
-       graph_transfer_info_.node_output_info()) {
+  for (const GraphTransferNodeOutputInfo& params :
+       graph_transfer_info_->node_output_info()) {
     LOG(INFO) << "[ " << params.node_id() << " ]";
     for (const int max_size : params.max_byte_size()) {
       LOG(INFO) << "    max_size = " << max_size;
@@ -1167,8 +1171,8 @@ void GraphTransferer::DumpNodeTransferParams() const {
 }
 
 void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
-  for (const GraphTransferInfo::ConstNodeInfo& params :
-       graph_transfer_info_.const_node_info()) {
+  for (const GraphTransferConstNodeInfo& params :
+       graph_transfer_info_->const_node_info()) {
     std::stringstream sstream;
     // TODO(satok): Stop assuming shape size is 4.
     CHECK_EQ(params.shape_size(), 4);
@@ -1182,9 +1186,9 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Const node count = "
-            << graph_transfer_info_.const_node_info_size();
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info_.node_info()) {
+            << graph_transfer_info_->const_node_info_size();
+  for (const GraphTransferNodeInfo& params :
+       graph_transfer_info_->node_info()) {
     std::stringstream sstream;
     sstream << "---(OP) [" << params.name().c_str() << "," << std::hex
             << params.node_id() << std::dec << "," << params.soc_op_id() << ","
@@ -1197,12 +1201,12 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
             << "," << params.output_count() << "," << params.type_name() << "]";
     LOG(INFO) << sstream.str();
   }
-  LOG(INFO) << "Op node count = " << graph_transfer_info_.node_info_size();
-  for (const GraphTransferInfo::NodeInputInfo& params :
-       graph_transfer_info_.node_input_info()) {
+  LOG(INFO) << "Op node count = " << graph_transfer_info_->node_info_size();
+  for (const GraphTransferNodeInputInfo& params :
+       graph_transfer_info_->node_input_info()) {
     std::stringstream sstream;
     sstream << "---(INPUT) [" << std::hex << params.node_id() << std::dec;
-    for (const GraphTransferInfo::NodeInput& node_input : params.node_input()) {
+    for (const GraphTransferNodeInput& node_input : params.node_input()) {
       sstream << "," << std::hex << node_input.node_id() << std::dec << ","
               << node_input.output_port();
     }
@@ -1210,9 +1214,9 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Input params count = "
-            << graph_transfer_info_.node_input_info_size();
-  for (const GraphTransferInfo::NodeOutputInfo& params :
-       graph_transfer_info_.node_output_info()) {
+            << graph_transfer_info_->node_input_info_size();
+  for (const GraphTransferNodeOutputInfo& params :
+       graph_transfer_info_->node_output_info()) {
     std::stringstream sstream;
     sstream << "---(OUTPUT) [" << std::hex << params.node_id() << std::dec;
     for (const int max_size : params.max_byte_size()) {
@@ -1222,7 +1226,7 @@ void GraphTransferer::DumpVerificationStringOfNodeTransferParams() const {
     LOG(INFO) << sstream.str();
   }
   LOG(INFO) << "Output params count = "
-            << graph_transfer_info_.node_output_info_size();
+            << graph_transfer_info_->node_output_info_size();
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index 0d43d028cd..86c1c5625f 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -22,8 +22,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/shape_refiner.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
@@ -34,6 +32,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+class GraphTransferInfo;
+class GraphTransferNodeInfo;
+class GraphTransferNodeInputInfo;
+
 // GraphTransferer transfers graph definitions into SoC memory.
 // This functionality is effective if SoC is capable to run
 // the graph on that chip.
@@ -47,7 +49,9 @@ class GraphTransferer {
   static constexpr int SHAPE_ARRAY_SIZE = MAX_SUPPORTED_RANK;
   using TensorShapeMap = RemoteFusedGraphExecuteUtils::TensorShapeMap;
 
-  GraphTransferer() = default;
+  GraphTransferer();
+
+  ~GraphTransferer();
 
   // Load graph structure into GraphTransferer
   // TODO(satok): Pass a pair of TensorShape and DataType instead of
@@ -96,8 +100,8 @@ class GraphTransferer {
    public:
     TransferParamsComparator(
         const std::unordered_map<int, std::unordered_set<int>>& dep_map);
-    bool operator()(const GraphTransferInfo::NodeInfo& obj0,
-                    const GraphTransferInfo::NodeInfo& obj1);
+    bool operator()(const GraphTransferNodeInfo& obj0,
+                    const GraphTransferNodeInfo& obj1);
     const std::unordered_map<int, std::unordered_set<int>>& dependency_map_;
   };
 
@@ -174,9 +178,8 @@ class GraphTransferer {
                         const std::vector<int>& extra_inputs,
                         const int outputs_size);
 
-  void AddNodeInputByInputIndex(
-      const Node& node, const int idx,
-      GraphTransferInfo::NodeInputInfo* node_input_info);
+  void AddNodeInputByInputIndex(const Node& node, const int idx,
+                                GraphTransferNodeInputInfo* node_input_info);
 
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
@@ -211,7 +214,7 @@ class GraphTransferer {
   // Dump pretty print of parameters
   void DumpNodeTransferParams() const;
 
-  GraphTransferInfo graph_transfer_info_{};
+  GraphTransferInfo* graph_transfer_info_;
 
   std::vector<const Node*> node_name_cache_list_{};
   std::unordered_map<string, int> node_name_to_id_cache_map_{};
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 20b09f144b..765795b1f4 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -191,9 +191,9 @@ static GraphDef CreatePoolGraphDef() {
   return def;
 }
 
-static const GraphTransferInfo::ConstNodeInfo* FindConstNodeInfo(
+static const GraphTransferConstNodeInfo* FindConstNodeInfo(
     const GraphTransferer& gt, const string& name) {
-  for (const GraphTransferInfo::ConstNodeInfo& params :
+  for (const GraphTransferConstNodeInfo& params :
        gt.GetGraphTransferInfo().const_node_info()) {
     if (params.name() == name) {
       return &params;
@@ -202,9 +202,9 @@ static const GraphTransferInfo::ConstNodeInfo* FindConstNodeInfo(
   return nullptr;
 }
 
-static const GraphTransferInfo::NodeInfo* FindNodeInfo(
-    const GraphTransferer& gt, const string& name) {
-  for (const GraphTransferInfo::NodeInfo& params :
+static const GraphTransferNodeInfo* FindNodeInfo(const GraphTransferer& gt,
+                                                 const string& name) {
+  for (const GraphTransferNodeInfo& params :
        gt.GetGraphTransferInfo().node_info()) {
     if (params.name() == name) {
       return &params;
@@ -213,9 +213,9 @@ static const GraphTransferInfo::NodeInfo* FindNodeInfo(
   return nullptr;
 }
 
-static const GraphTransferInfo::NodeInputInfo* FindNodeInputInfo(
+static const GraphTransferNodeInputInfo* FindNodeInputInfo(
     const GraphTransferer& gt, const int node_id) {
-  for (const GraphTransferInfo::NodeInputInfo& params :
+  for (const GraphTransferNodeInputInfo& params :
        gt.GetGraphTransferInfo().node_input_info()) {
     if (params.node_id() == node_id) {
       return &params;
@@ -224,9 +224,9 @@ static const GraphTransferInfo::NodeInputInfo* FindNodeInputInfo(
   return nullptr;
 }
 
-static const GraphTransferInfo::NodeOutputInfo* FindNodeOutputInfo(
+static const GraphTransferNodeOutputInfo* FindNodeOutputInfo(
     const GraphTransferer& gt, const int node_id) {
-  for (const GraphTransferInfo::NodeOutputInfo& params :
+  for (const GraphTransferNodeOutputInfo& params :
        gt.GetGraphTransferInfo().node_output_info()) {
     if (params.node_id() == node_id) {
       return &params;
@@ -236,21 +236,21 @@ static const GraphTransferInfo::NodeOutputInfo* FindNodeOutputInfo(
 }
 
 static void SanityCheckNodes(const GraphTransferer& gt) {
-  for (const GraphTransferInfo::NodeInfo& params :
+  for (const GraphTransferNodeInfo& params :
        gt.GetGraphTransferInfo().node_info()) {
     if (params.input_count() > 0) {
-      const GraphTransferInfo::NodeInputInfo* input_params =
+      const GraphTransferNodeInputInfo* input_params =
           FindNodeInputInfo(gt, params.node_id());
       ASSERT_NE(nullptr, input_params);
       EXPECT_EQ(params.input_count(), input_params->node_input_size());
       EXPECT_EQ(params.node_id(), input_params->node_id());
-      for (const GraphTransferInfo::NodeInput& node_input :
+      for (const GraphTransferNodeInput& node_input :
            input_params->node_input()) {
         EXPECT_GE(node_input.output_port(), 0);
       }
     }
     if (params.output_count() > 0) {
-      const GraphTransferInfo::NodeOutputInfo* output_params =
+      const GraphTransferNodeOutputInfo* output_params =
           FindNodeOutputInfo(gt, params.node_id());
       ASSERT_NE(nullptr, output_params);
       EXPECT_EQ(params.output_count(), output_params->max_byte_size_size());
@@ -273,8 +273,7 @@ TEST_F(GraphTransfererTest, LoadAddGraph) {
   const int const_node_count =
       gt_.GetGraphTransferInfo().const_node_info_size();
   ASSERT_EQ(2, const_node_count);
-  const GraphTransferInfo::ConstNodeInfo* params_a =
-      FindConstNodeInfo(gt_, NAME_A);
+  const GraphTransferConstNodeInfo* params_a = FindConstNodeInfo(gt_, NAME_A);
   ASSERT_TRUE(params_a != nullptr);
   EXPECT_EQ(NAME_A, params_a->name());
   ASSERT_EQ(4, params_a->shape_size());
@@ -284,8 +283,7 @@ TEST_F(GraphTransfererTest, LoadAddGraph) {
   EXPECT_EQ(1, params_a->shape(3));
   EXPECT_EQ(4, params_a->data().length());
 
-  const GraphTransferInfo::ConstNodeInfo* params_b =
-      FindConstNodeInfo(gt_, NAME_B);
+  const GraphTransferConstNodeInfo* params_b = FindConstNodeInfo(gt_, NAME_B);
   ASSERT_TRUE(params_b != nullptr);
   ASSERT_EQ(4, params_b->shape_size());
   EXPECT_EQ(1, params_b->shape(0));
@@ -328,7 +326,7 @@ TEST_F(GraphTransfererTest, LoadConvGraph) {
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
   ASSERT_EQ(4, op_node_count);
-  const GraphTransferInfo::NodeInfo* params_conv = FindNodeInfo(gt_, "conv");
+  const GraphTransferNodeInfo* params_conv = FindNodeInfo(gt_, "conv");
   ASSERT_TRUE(params_conv != nullptr);
   const int id = params_conv->node_id();
   EXPECT_GE(id, 0);
@@ -354,8 +352,7 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
   ASSERT_EQ(4, op_node_count);
-  const GraphTransferInfo::NodeInfo* params_max_pool =
-      FindNodeInfo(gt_, "maxpool");
+  const GraphTransferNodeInfo* params_max_pool = FindNodeInfo(gt_, "maxpool");
   ASSERT_TRUE(params_max_pool != nullptr);
   const int id = params_max_pool->node_id();
   EXPECT_GE(id, 0);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 9c2e1e123c..66d24d171d 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 
+#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 #include "tensorflow/core/kernels/hexagon/soc_interface.h"
@@ -54,9 +55,9 @@ static uint8* FindAlignedPointer(uint8* ptr) {
   return data_ptr;
 }
 
-/* static */ GraphTransferInfo::NodeInfo* HexagonControlWrapper::FindNodeInfo(
+/* static */ GraphTransferNodeInfo* HexagonControlWrapper::FindNodeInfo(
     const string& name, GraphTransferInfo* graph_transfer_info) {
-  for (GraphTransferInfo::NodeInfo& node_info :
+  for (GraphTransferNodeInfo& node_info :
        *graph_transfer_info->mutable_node_info()) {
     if (node_info.name() == name) {
       return &node_info;
@@ -138,9 +139,9 @@ bool HexagonControlWrapper::SetupGraph() {
       graph_transferer_.GetMutableGraphTransferInfo();
 
   // Overwrite op type of input nodes for hexagon
-  for (const GraphTransferInfo::GraphInputNodeInfo& graph_input :
+  for (const GraphTransferGraphInputNodeInfo& graph_input :
        graph_transfer_info.graph_input_node_info()) {
-    GraphTransferInfo::NodeInfo* node_info =
+    GraphTransferNodeInfo* node_info =
         FindNodeInfo(graph_input.name(), &graph_transfer_info);
     CHECK_NE(node_info, nullptr);
   }
@@ -148,13 +149,13 @@ bool HexagonControlWrapper::SetupGraph() {
   // Generate a new output node which is connected to graph output node
   // TODO(satok): Support multiple output nodes
   CHECK_EQ(graph_transfer_info.graph_output_node_info_size(), 1);
-  for (const GraphTransferInfo::GraphOutputNodeInfo& graph_output :
+  for (const GraphTransferGraphOutputNodeInfo& graph_output :
        graph_transfer_info.graph_output_node_info()) {
     const int new_output_node_id = graph_transfer_info.node_info_size() +
                                    graph_transfer_info.const_node_info_size() +
                                    2 /* offset for ids */;
     // Register a new output node
-    GraphTransferInfo::NodeInfo& new_output_node_info =
+    GraphTransferNodeInfo& new_output_node_info =
         *graph_transfer_info.add_node_info();
     new_output_node_info.set_name(OUTPUT_OP_NAME);
     new_output_node_info.set_node_id(new_output_node_id);
@@ -169,14 +170,13 @@ bool HexagonControlWrapper::SetupGraph() {
     const string node_name = tid.first.ToString();
     const int port = tid.second;
     // Register node input for the new output node
-    const GraphTransferInfo::NodeInfo* node_info =
+    const GraphTransferNodeInfo* node_info =
         FindNodeInfo(node_name, &graph_transfer_info);
     CHECK_NE(node_info, nullptr);
-    GraphTransferInfo::NodeInputInfo& node_input_info =
+    GraphTransferNodeInputInfo& node_input_info =
         *graph_transfer_info.add_node_input_info();
     node_input_info.set_node_id(new_output_node_id);
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
+    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
     node_input.set_node_id(node_info->node_id());
     node_input.set_output_port(port);
   }
@@ -189,12 +189,12 @@ bool HexagonControlWrapper::SetupGraph() {
 
   int inputs_count = 0;
   int outputs_count = 0;
-  for (const GraphTransferInfo::NodeInputInfo& input_params :
+  for (const GraphTransferNodeInputInfo& input_params :
        graph_transfer_info.node_input_info()) {
     inputs_count += input_params.node_input_size();
   }
 
-  for (const GraphTransferInfo::NodeOutputInfo& output_params :
+  for (const GraphTransferNodeOutputInfo& output_params :
        graph_transfer_info.node_output_info()) {
     outputs_count += output_params.max_byte_size_size();
   }
@@ -204,15 +204,14 @@ bool HexagonControlWrapper::SetupGraph() {
 
   // Construct node input parameters
   std::unordered_map<int, std::tuple<void*, int>> inputs_map;
-  for (const GraphTransferInfo::NodeInputInfo& input_params :
+  for (const GraphTransferNodeInputInfo& input_params :
        graph_transfer_info.node_input_info()) {
     const int count = input_params.node_input_size();
     CHECK(count <= MAX_IN_OUT_COUNT);
     int node_ids[MAX_IN_OUT_COUNT];
     int ports[MAX_IN_OUT_COUNT];
     for (int i = 0; i < count; ++i) {
-      const GraphTransferInfo::NodeInput& node_input =
-          input_params.node_input(i);
+      const GraphTransferNodeInput& node_input = input_params.node_input(i);
       node_ids[i] = node_input.node_id() + NODE_ID_OFFSET;
       ports[i] = node_input.output_port();
     }
@@ -224,7 +223,7 @@ bool HexagonControlWrapper::SetupGraph() {
 
   // Construct node output parameters
   std::unordered_map<int, std::tuple<void*, int>> outputs_map;
-  for (const GraphTransferInfo::NodeOutputInfo& output_params :
+  for (const GraphTransferNodeOutputInfo& output_params :
        graph_transfer_info.node_output_info()) {
     const int count = output_params.max_byte_size_size();
     CHECK(count <= MAX_IN_OUT_COUNT);
@@ -244,7 +243,7 @@ bool HexagonControlWrapper::SetupGraph() {
 
   // Initialize graph
   // 1. Setup const nodes
-  for (const GraphTransferInfo::ConstNodeInfo& params :
+  for (const GraphTransferConstNodeInfo& params :
        graph_transfer_info.const_node_info()) {
     const int node_id = params.node_id();
     // TODO(satok): Stop assuming shape size is 4.
@@ -267,8 +266,7 @@ bool HexagonControlWrapper::SetupGraph() {
   }
 
   // 2. Setup op nodes
-  for (const GraphTransferInfo::NodeInfo& params :
-       graph_transfer_info.node_info()) {
+  for (const GraphTransferNodeInfo& params : graph_transfer_info.node_info()) {
     const int node_id = params.node_id();
     const int op_id = params.soc_op_id();
     CHECK(inputs_map.count(node_id) == 1);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index dca1f94a9b..132cfde2db 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -67,8 +67,8 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   // CAVEAT: Need offset as HVX library reserves some ids
   static constexpr int NODE_ID_OFFSET = 0x10000;
 
-  static GraphTransferInfo::NodeInfo* FindNodeInfo(
-      const string& node_name, GraphTransferInfo* graph_transfer_info);
+  static GraphTransferNodeInfo* FindNodeInfo(
+      const string& name, GraphTransferInfo* graph_transfer_info);
 
   const RemoteFusedGraphExecuteInfo* execute_info_{};
   GraphTransferer graph_transferer_{};
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 3f794dfb1a..5fb6b9247f 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -29,6 +29,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 
 #include <memory>
 
+#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
@@ -209,7 +210,7 @@ BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
     const GraphTransferInfo& graph_transfer_info) {
   RemoteFusedGraphExecuteInfo execute_info;
   execute_info.set_executor_name("build_hexagon_remote_fused_graph_executor");
-  for (const GraphTransferInfo::GraphInputNodeInfo& input :
+  for (const GraphTransferGraphInputNodeInfo& input :
        graph_transfer_info.graph_input_node_info()) {
     execute_info.add_graph_input_node_name(input.name());
     RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& tensor_shape_type =
@@ -221,7 +222,7 @@ BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
     }
   }
 
-  for (const GraphTransferInfo::GraphOutputNodeInfo& output :
+  for (const GraphTransferGraphOutputNodeInfo& output :
        graph_transfer_info.graph_output_node_info()) {
     execute_info.add_graph_output_node_name(output.name());
     RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& tensor_shape_type =
@@ -325,8 +326,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 1. check node_info
   ASSERT_EQ(gfi0.node_info_size(), gfi1.node_info_size());
   for (int i = 0; i < gfi0.node_info_size(); ++i) {
-    const GraphTransferInfo::NodeInfo& ni0 = gfi0.node_info(i);
-    const GraphTransferInfo::NodeInfo& ni1 = gfi1.node_info(i);
+    const GraphTransferNodeInfo& ni0 = gfi0.node_info(i);
+    const GraphTransferNodeInfo& ni1 = gfi1.node_info(i);
     EXPECT_EQ(ni0.DebugString(), ni1.DebugString());
     EXPECT_EQ(ni0.ByteSizeLong(), ni1.ByteSizeLong());
   }
@@ -334,8 +335,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 2. check const_node_info
   ASSERT_EQ(gfi0.const_node_info_size(), gfi1.const_node_info_size());
   for (int i = 0; i < gfi0.const_node_info_size(); ++i) {
-    const GraphTransferInfo::ConstNodeInfo& cni0 = gfi0.const_node_info(i);
-    const GraphTransferInfo::ConstNodeInfo& cni1 = gfi1.const_node_info(i);
+    const GraphTransferConstNodeInfo& cni0 = gfi0.const_node_info(i);
+    const GraphTransferConstNodeInfo& cni1 = gfi1.const_node_info(i);
     ASSERT_EQ(cni0.shape_size(), cni1.shape_size());
     for (int j = 0; j < cni0.shape_size(); ++j) {
       EXPECT_EQ(cni0.shape(j), cni1.shape(j));
@@ -347,8 +348,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 3. check node_input_info
   ASSERT_EQ(gfi0.node_input_info_size(), gfi1.node_input_info_size());
   for (int i = 0; i < gfi0.node_input_info_size(); ++i) {
-    const GraphTransferInfo::NodeInputInfo& nii0 = gfi0.node_input_info(i);
-    const GraphTransferInfo::NodeInputInfo& nii1 = gfi1.node_input_info(i);
+    const GraphTransferNodeInputInfo& nii0 = gfi0.node_input_info(i);
+    const GraphTransferNodeInputInfo& nii1 = gfi1.node_input_info(i);
     EXPECT_EQ(nii0.ByteSizeLong(), nii1.ByteSizeLong());
     EXPECT_EQ(nii0.DebugString(), nii1.DebugString());
   }
@@ -356,8 +357,8 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   // 4. check node_output_info
   ASSERT_EQ(gfi0.node_output_info_size(), gfi1.node_output_info_size());
   for (int i = 0; i < gfi0.node_output_info_size(); ++i) {
-    const GraphTransferInfo::NodeOutputInfo& noi0 = gfi0.node_output_info(i);
-    const GraphTransferInfo::NodeOutputInfo& noi1 = gfi1.node_output_info(i);
+    const GraphTransferNodeOutputInfo& noi0 = gfi0.node_output_info(i);
+    const GraphTransferNodeOutputInfo& noi1 = gfi1.node_output_info(i);
     ASSERT_EQ(noi0.max_byte_size_size(), noi1.max_byte_size_size());
     for (int j = 0; j < noi0.max_byte_size_size(); ++j) {
       EXPECT_EQ(noi0.max_byte_size(j), noi1.max_byte_size(j));
@@ -370,9 +371,9 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   ASSERT_EQ(gfi0.graph_input_node_info_size(),
             gfi1.graph_input_node_info_size());
   for (int i = 0; i < gfi0.graph_input_node_info_size(); ++i) {
-    const GraphTransferInfo::GraphInputNodeInfo& gini0 =
+    const GraphTransferGraphInputNodeInfo& gini0 =
         gfi0.graph_input_node_info(i);
-    const GraphTransferInfo::GraphInputNodeInfo& gini1 =
+    const GraphTransferGraphInputNodeInfo& gini1 =
         gfi0.graph_input_node_info(i);
     EXPECT_EQ(gini0.ByteSizeLong(), gini1.ByteSizeLong());
     EXPECT_EQ(gini0.DebugString(), gini1.DebugString());
@@ -382,9 +383,9 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   ASSERT_EQ(gfi0.graph_output_node_info_size(),
             gfi1.graph_output_node_info_size());
   for (int i = 0; i < gfi0.graph_output_node_info_size(); ++i) {
-    const GraphTransferInfo::GraphOutputNodeInfo& goni0 =
+    const GraphTransferGraphOutputNodeInfo& goni0 =
         gfi0.graph_output_node_info(i);
-    const GraphTransferInfo::GraphOutputNodeInfo& goni1 =
+    const GraphTransferGraphOutputNodeInfo& goni1 =
         gfi0.graph_output_node_info(i);
     EXPECT_EQ(goni0.ByteSizeLong(), goni1.ByteSizeLong());
     EXPECT_EQ(goni0.DebugString(), goni1.DebugString());
-- 
GitLab


From 67e76defd59c4d867a1db4a371cfa9640bec1000 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Tue, 17 Apr 2018 10:46:37 +0900
Subject: [PATCH 0926/1262] fix typo

---
 tensorflow/python/profiler/tfprof_logger_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/profiler/tfprof_logger_test.py b/tensorflow/python/profiler/tfprof_logger_test.py
index 141144f987..caf3869f56 100644
--- a/tensorflow/python/profiler/tfprof_logger_test.py
+++ b/tensorflow/python/profiler/tfprof_logger_test.py
@@ -38,7 +38,7 @@ class TFProfLoggerTest(test.TestCase):
     return math_ops.matmul(a, b)
 
   # pylint: disable=pointless-string-statement
-  """# TODO(xpan): This this out of core so it doesn't depend on contrib.
+  """# TODO(xpan): This out of core so it doesn't depend on contrib.
   def testFillMissingShape(self):
     a, b, y = self._BuildSmallPlaceholderlModel()
     run_options = config_pb2.RunOptions(
-- 
GitLab


From d05d1d6625cdbcaa04ece05862635dbaa32449d1 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Tue, 17 Apr 2018 10:48:52 +0900
Subject: [PATCH 0927/1262] fix typo

---
 .../lite/toco/graph_transformations/resolve_tensorflow_merge.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index 477e7f13da..38e0005890 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -32,7 +32,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
   }
 
   // We need to yield until this Merge node has only 1 input, which will mean
-  // that that is the selected input. Other graph transformations on other nodes
+  // that is the selected input. Other graph transformations on other nodes
   // such as ResolveTensorFlowSwitch, will take care of trimming the
   // non-selected inputs, so that at some point there will be only 1 input left.
   if (merge_op->inputs.size() > 1) {
-- 
GitLab


From b530f98f69ff90dcddde45017904993421c88508 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Tue, 17 Apr 2018 10:52:16 +0900
Subject: [PATCH 0928/1262] fix typo

---
 tensorflow/go/op/wrappers.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 1d5ebf6687..ea1ec6392f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -19595,7 +19595,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 //    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 //
 // The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 // Each element must be in the range `[0, 255]` (It represents the value of a
 // pixel in the output image).  Non-finite values in the input tensor are
 // replaced by this tensor in the output image.  The default value is the color
-- 
GitLab


From d48c55db5fc8ab07d2bf679b4ea7c3c4c84ace76 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 16 Apr 2018 19:10:10 -0700
Subject: [PATCH 0929/1262] BoostedTreesEstimator in contrib: train_in_memory
 works with input_fns returning data.Dataset. Only one batch of data is
 expected, so dataset.batch() is disallowed, and dataset.repeat() will be
 ignored (only the first one would be used)

PiperOrigin-RevId: 193137094
---
 .../python/estimator/boosted_trees.py         |  38 +++-
 .../python/estimator/boosted_trees_test.py    |  80 +++++++--
 .../python/estimator/canned/boosted_trees.py  | 149 +++++++++++-----
 .../estimator/canned/boosted_trees_test.py    | 167 +++++++++++++++++-
 4 files changed, 362 insertions(+), 72 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 00356ce0ca..bd641014e9 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -17,10 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 
 
+def _validate_input_fn_and_repeat_dataset(train_input_fn):
+  """Validates whether the input_fn is valid, and repeat() if tf.Dataset."""
+  def _input_fn():
+    result_input_fn = train_input_fn()
+    if isinstance(result_input_fn, dataset_ops.Dataset):
+      return result_input_fn.repeat()
+    return result_input_fn
+
+  return _input_fn
+
+
 class _BoostedTreesEstimator(estimator.Estimator):
   """An Estimator for Tensorflow Boosted Trees models."""
 
@@ -113,10 +125,13 @@ def boosted_trees_classifier_train_in_memory(
   bucketized_feature_2 = bucketized_column(
     numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
 
-  def input_fn_train():
+  def train_input_fn():
     dataset = create-dataset-from-training-data
-    # Don't use repeat or cache, since it is assumed to be one epoch
-    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
     return dataset
 
   classifier = boosted_trees_classifier_train_in_memory(
@@ -210,7 +225,9 @@ def boosted_trees_classifier_train_in_memory(
   in_memory_classifier = estimator.Estimator(
       model_fn=_model_fn, model_dir=model_dir, config=config)
 
-  in_memory_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
+  in_memory_classifier.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
 
   return in_memory_classifier
   # pylint: enable=protected-access
@@ -241,10 +258,13 @@ def boosted_trees_regressor_train_in_memory(
   bucketized_feature_2 = bucketized_column(
     numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
 
-  def input_fn_train():
+  def train_input_fn():
     dataset = create-dataset-from-training-data
-    # Don't use repeat or cache, since it is assumed to be one epoch
-    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
     return dataset
 
   regressor = boosted_trees_regressor_train_in_memory(
@@ -329,7 +349,9 @@ def boosted_trees_regressor_train_in_memory(
   in_memory_regressor = estimator.Estimator(
       model_fn=_model_fn, model_dir=model_dir, config=config)
 
-  in_memory_regressor.train(input_fn=train_input_fn, hooks=train_hooks)
+  in_memory_regressor.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
 
   return in_memory_regressor
   # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index eee5910687..76cbefe5e9 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
 from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -49,12 +50,24 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features = dict(FEATURES_DICT)
-    if is_classification:
-      labels = CLASSIFICATION_LABELS
-    else:
-      labels = REGRESSION_LABELS
-    return features, labels
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(features_dict),
+         dataset_ops.Dataset.from_tensors(labels)
+        ))
+    return ds
 
   return _input_fn
 
@@ -132,15 +145,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
     est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
-    # Check eval.
+    # Check evaluate and predict.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
     # Validate predictions.
@@ -148,24 +159,59 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
+  def testBinaryClassifierTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
     est = boosted_trees.boosted_trees_regressor_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
-    # Check eval.
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testRegressorTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Check evaluate and predict.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['average_loss'], 2.478283)
-    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
     self.assertAllClose(
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 536bd2bf81..085dace1b3 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
@@ -50,6 +51,32 @@ _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
 _HOLD_FOR_MULTI_DIM_SUPPORT = object()
 
 
+def _get_max_buckets(feature_columns):
+  """Gets the maximum number of buckets from feature_columns.
+
+  Args:
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    max_buckets: the maximum number of buckets among bucketized_columns.
+
+  Raises:
+    ValueError: when unsupported feature_columns are given.
+  """
+  if not feature_columns:
+    raise ValueError('feature_columns must be a non-empty list/set of '
+                     'tf.feature_column.')
+  max_buckets = 1
+  for fc in feature_columns:
+    if isinstance(fc, feature_column_lib._BucketizedColumn):  # pylint:disable=protected-access
+      # N boundaries creates (N+1) buckets.
+      max_buckets = max(max_buckets, len(fc.boundaries) + 1)
+    else:
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  return max_buckets
+
+
 def _get_transformed_features(features, feature_columns):
   """Gets the transformed features from features/feature_columns pair.
 
@@ -59,36 +86,31 @@ def _get_transformed_features(features, feature_columns):
 
   Returns:
     result_features: a list of the transformed features, sorted by the name.
-    num_buckets: the maximum number of buckets across bucketized_columns.
 
   Raises:
     ValueError: when unsupported features/columns are tried.
   """
-  num_buckets = 1
   # pylint:disable=protected-access
   for fc in feature_columns:
-    if isinstance(fc, feature_column_lib._BucketizedColumn):
-      # N boundaries creates (N+1) buckets.
-      num_buckets = max(num_buckets, len(fc.boundaries) + 1)
-    else:
+    if not isinstance(fc, feature_column_lib._BucketizedColumn):
       raise ValueError('For now, only bucketized_column is supported but '
                        'got: {}'.format(fc))
-  transformed = feature_column_lib._transform_features(features,
-                                                       feature_columns)
+  transformed_features = feature_column_lib._transform_features(
+      features, feature_columns)
   # pylint:enable=protected-access
   result_features = []
-  for column in sorted(transformed, key=lambda tc: tc.name):
+  for column in sorted(transformed_features, key=lambda tc: tc.name):
     source_name = column.source_column.name
-    squeezed_tensor = array_ops.squeeze(transformed[column], axis=1)
+    squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
     if len(squeezed_tensor.shape) > 1:
       raise ValueError('For now, only supports features equivalent to rank 1 '
                        'but column `{}` got: {}'.format(
                            source_name, features[source_name].shape))
     result_features.append(squeezed_tensor)
-  return result_features, num_buckets
+  return result_features
 
 
-def _keep_as_local_variable(tensor, name=None):
+def _local_variable(tensor, name=None):
   """Stores a tensor as a local Variable for faster read."""
   return variable_scope.variable(
       initial_value=tensor,
@@ -98,6 +120,48 @@ def _keep_as_local_variable(tensor, name=None):
       name=name)
 
 
+def _cache_transformed_features(features, feature_columns, batch_size):
+  """Transform features and cache, then returns (cached_features, cache_op)."""
+  num_features = len(feature_columns)
+  cached_features = [
+      _local_variable(
+          array_ops.zeros([batch_size], dtype=dtypes.int32),
+          name='cached_feature_{}'.format(i))
+      for i in range(num_features)
+  ]
+  are_features_cached = _local_variable(False, name='are_features_cached')
+
+  def cache_features_and_return():
+    """Caches transoformed features.
+
+    The intention is to hide get_transformed_features() from the graph by
+    caching the result except the first step, since bucketize operation
+    (inside get_transformed_features) is expensive.
+
+    Returns:
+      input_feature_list: a list of input features.
+      cache_flip_op: op to add to graph to make sure cache update is included to
+          the graph.
+    """
+
+    transformed_features = _get_transformed_features(features, feature_columns)
+    cached = [
+        state_ops.assign(cached_features[i], transformed_features[i])
+        for i in range(num_features)
+    ]
+    # TODO(youngheek): Try other combination of dependencies so that the
+    # function returns a single result, not a tuple.
+    with ops.control_dependencies(cached):
+      cache_flip_op = are_features_cached.assign(True)
+    return cached, cache_flip_op
+
+  input_feature_list, cache_flip_op = control_flow_ops.cond(
+      are_features_cached,
+      lambda: (cached_features, control_flow_ops.no_op()),
+      cache_features_and_return)
+  return input_feature_list, cache_flip_op
+
+
 class _CacheTrainingStatesUsingHashTable(object):
   """Caching logits, etc. using MutableHashTable."""
 
@@ -186,13 +250,13 @@ class _CacheTrainingStatesUsingVariables(object):
       logits_dimension: a constant (int) for the dimension of logits.
     """
     self._logits_dimension = logits_dimension
-    self._tree_ids = _keep_as_local_variable(
+    self._tree_ids = _local_variable(
         array_ops.zeros([batch_size], dtype=dtypes.int32),
         name='tree_ids_cache')
-    self._node_ids = _keep_as_local_variable(
+    self._node_ids = _local_variable(
         array_ops.zeros([batch_size], dtype=dtypes.int32),
         name='node_ids_cache')
-    self._logits = _keep_as_local_variable(
+    self._logits = _local_variable(
         array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
         name='logits_cache')
 
@@ -290,33 +354,38 @@ def _bt_model_fn(
         'When train_in_memory is enabled, input_fn should return the entire '
         'dataset as a single batch, and n_batches_per_layer should be set as '
         '1.')
+    if (not config.is_chief or config.num_worker_replicas > 1 or
+        config.num_ps_replicas > 0):
+      raise ValueError('train_in_memory is supported only for '
+                       'non-distributed training.')
   worker_device = control_flow_ops.no_op().device
   # maximum number of splits possible in the whole tree =2^(D-1)-1
   # TODO(youngheek): perhaps storage could be optimized by storing stats with
   # the dimension max_splits_per_layer, instead of max_splits (for the entire
   # tree).
   max_splits = (1 << tree_hparams.max_depth) - 1
+  max_buckets = _get_max_buckets(feature_columns)
+  train_op = []
   with ops.name_scope(name) as name:
     # Prepare.
     global_step = training_util.get_or_create_global_step()
-    input_feature_list, num_buckets = _get_transformed_features(
-        features, feature_columns)
-    if train_in_memory and mode == model_fn.ModeKeys.TRAIN:
-      input_feature_list = [
-          _keep_as_local_variable(feature) for feature in input_feature_list
-      ]
-    num_features = len(input_feature_list)
-
-    cache = None
-    if mode == model_fn.ModeKeys.TRAIN:
-      if train_in_memory and is_single_machine:  # maybe just train_in_memory?
-        batch_size = array_ops.shape(input_feature_list[0])[0]
-        cache = _CacheTrainingStatesUsingVariables(batch_size,
-                                                   head.logits_dimension)
-      elif example_id_column_name:
+    num_features = len(feature_columns)
+    # Extract input features and set up cache for training.
+    training_state_cache = None
+    if mode == model_fn.ModeKeys.TRAIN and train_in_memory:
+      # cache transformed features as well for in-memory training.
+      batch_size = array_ops.shape(labels)[0]
+      input_feature_list, input_cache_op = _cache_transformed_features(
+          features, feature_columns, batch_size)
+      train_op.append(input_cache_op)
+      training_state_cache = _CacheTrainingStatesUsingVariables(
+          batch_size, head.logits_dimension)
+    else:
+      input_feature_list = _get_transformed_features(features, feature_columns)
+      if mode == model_fn.ModeKeys.TRAIN and example_id_column_name:
         example_ids = features[example_id_column_name]
-        cache = _CacheTrainingStatesUsingHashTable(example_ids,
-                                                   head.logits_dimension)
+        training_state_cache = _CacheTrainingStatesUsingHashTable(
+            example_ids, head.logits_dimension)
 
     # Create Ensemble resources.
     tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
@@ -340,11 +409,12 @@ def _bt_model_fn(
         # TODO(soroush): Do partial updates if this becomes a bottleneck.
         ensemble_reload = local_tree_ensemble.deserialize(
             *tree_ensemble.serialize())
-      if cache:
-        cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
+      if training_state_cache:
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            training_state_cache.lookup())
       else:
         # Always start from the beginning when no cache is set up.
-        batch_size = array_ops.shape(input_feature_list[0])[0]
+        batch_size = array_ops.shape(labels)[0]
         cached_tree_ids, cached_node_ids, cached_logits = (
             array_ops.zeros([batch_size], dtype=dtypes.int32),
             array_ops.zeros([batch_size], dtype=dtypes.int32),
@@ -368,9 +438,8 @@ def _bt_model_fn(
     # Create training graph.
     def _train_op_fn(loss):
       """Run one training iteration."""
-      train_op = []
-      if cache:
-        train_op.append(cache.insert(tree_ids, node_ids, logits))
+      if training_state_cache:
+        train_op.append(training_state_cache.insert(tree_ids, node_ids, logits))
       if closed_form_grad_and_hess_fn:
         gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
       else:
@@ -385,7 +454,7 @@ def _bt_model_fn(
                   hessians=hessians,
                   bucketized_features_list=[input_feature_list[f]],
                   max_splits=max_splits,
-                  num_buckets=num_buckets),
+                  num_buckets=max_buckets),
               axis=0) for f in range(num_features)
       ]
 
@@ -422,7 +491,7 @@ def _bt_model_fn(
         summary_accumulator = data_flow_ops.ConditionalAccumulator(
             dtype=dtypes.float32,
             # The stats consist of gradients and hessians (the last dimension).
-            shape=[num_features, max_splits, num_buckets, 2],
+            shape=[num_features, max_splits, max_buckets, 2],
             shared_name='stats_summary_accumulator')
         apply_grad = summary_accumulator.apply_grad(
             array_ops.stack(stats_summary_list, axis=0), stamp_token)
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 56e67a6707..c8c52d3bc6 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator.canned import boosted_trees
@@ -58,13 +59,32 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features = dict(FEATURES_DICT)
-    features[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
-    if is_classification:
-      labels = CLASSIFICATION_LABELS
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification, batch=None, repeat=None):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    if batch:
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensor_slices(features_dict),
+           dataset_ops.Dataset.from_tensor_slices(labels))).batch(batch)
     else:
-      labels = REGRESSION_LABELS
-    return features, labels
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensors(features_dict),
+           dataset_ops.Dataset.from_tensors(labels)))
+    # repeat indefinitely by default, or stop at the given step.
+    ds = ds.repeat(repeat)
+    return ds
 
   return _input_fn
 
@@ -125,9 +145,28 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
+  def testTrainClassifierWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
     predictions = list(est.predict(input_fn=predict_input_fn))
-    # All labels are correct.
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
@@ -166,12 +205,126 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetBatch(self):
+    # The batch_size as the entire data size should yield the same result as
+    # dataset without batching.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=5)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetLargerBatch(self):
+    # The batch_size as the multiple of the entire data size should still yield
+    # the same result.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=15)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetSmallerBatch(self):
+    # Even when using small batches, if (n_batches_per_layer * batch_size) makes
+    # the same entire data size, the result should be the same.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=1)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=5,
+        n_trees=1,
+        max_depth=5)
+    # Train stops after (n_batches_per_layer * n_trees * max_depth) steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=25, finalized_trees=1, attempted_layers=5)
+    # 5 batches = one epoch.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=5)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     predictions = list(est.predict(input_fn=predict_input_fn))
     self.assertAllClose(
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
         [pred['predictions'] for pred in predictions])
 
+  def testTrainRegressorWithDatasetWhenInputIsOverEarlier(self):
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, repeat=3)  # to stop input after 3 steps.
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    # Note that training will stop when input exhausts.
+    # This might not be a typical pattern, but dataset.repeat(3) causes
+    # the input stream to cease after 3 steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=3, finalized_trees=0, attempted_layers=3)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 3.777295)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.353850], [0.254100], [0.106850], [0.712100], [1.012100]],
+        [pred['predictions'] for pred in predictions])
+
 
 class ModelFnTests(test_util.TensorFlowTestCase):
   """Tests bt_model_fn including unexposed internal functionalities."""
-- 
GitLab


From 3bb161433069ea5012f1f5be97fbbd8d0784213d Mon Sep 17 00:00:00 2001
From: Neil Tenenholtz <ntenenz@users.noreply.github.com>
Date: Mon, 16 Apr 2018 22:26:17 -0400
Subject: [PATCH 0930/1262] Remove conditional scope logic now that
 "current_arg_scope" exists in contrib

---
 tensorflow/contrib/layers/python/layers/rev_block_lib.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 02d294c68f..c4fa3392ef 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -504,11 +504,7 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   @_fn_with_custom_grad(grad_fn)
   def fn_with_recompute(*args):
     cached_vs.append(variable_scope.get_variable_scope())
-    # TODO(rsepassi): Rm conditional in TF 1.4
-    if hasattr(contrib_framework_ops, "current_arg_scope"):
-      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
-    else:
-      cached_arg_scope.append({})
+    cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
     return fn(*args)
 
   return fn_with_recompute(*args)
-- 
GitLab


From eb35f19cf7e8c43cfb759bce2fab266ae753f0d0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 19:51:13 -0700
Subject: [PATCH 0931/1262] Supply a dtype to super constructor, without which
 build() seems to crash.

PiperOrigin-RevId: 193139585
---
 tensorflow/python/kernel_tests/rnn_test.py    | 22 +++++++++++
 tensorflow/python/ops/rnn_cell_impl.py        | 38 ++++++++++++++-----
 ...flow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt |  2 +-
 ...orflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt |  2 +-
 .../tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt  |  2 +-
 ...tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt |  2 +-
 6 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 9a0409c796..fe5ad84c10 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -206,6 +206,28 @@ class RNNTest(test.TestCase):
     self.assertAllEqual(4, state[0])
     self.assertAllEqual([[[1]], [[2]], [[3]], [[4]]], state[1])
 
+  def _assert_cell_builds(self, cell_class, dtype, batch_size, in_size,
+                          out_size):
+    cell = cell_class(out_size, dtype=dtype)
+    in_shape = tensor_shape.TensorShape((batch_size, in_size))
+    cell.build(in_shape)
+    state_output = cell.zero_state(batch_size, dtype)
+    cell_output, _ = cell(array_ops.zeros(in_shape, dtype), state_output)
+    self.assertAllEqual([batch_size, out_size], cell_output.shape.as_list())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCellsBuild(self):
+    f32 = dtypes.float32
+    f64 = dtypes.float64
+    self._assert_cell_builds(rnn_cell_impl.BasicRNNCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.BasicRNNCell, f64, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.BasicLSTMCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.BasicLSTMCell, f64, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.GRUCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.GRUCell, f64, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.LSTMCell, f32, 5, 7, 3)
+    self._assert_cell_builds(rnn_cell_impl.LSTMCell, f64, 5, 7, 3)
+
 
 ######### Benchmarking RNN code
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index cbc2dcf419..9251e9802c 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -352,10 +352,17 @@ class BasicRNNCell(LayerRNNCell):
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such
       cases.
+    dtype: Default dtype of the layer (default of `None` means use the type
+      of the first input). Required when `build` is called before `call`.
   """
 
-  def __init__(self, num_units, activation=None, reuse=None, name=None):
-    super(BasicRNNCell, self).__init__(_reuse=reuse, name=name)
+  def __init__(self,
+               num_units,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None):
+    super(BasicRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
     self.input_spec = base_layer.InputSpec(ndim=2)
@@ -413,6 +420,8 @@ class GRUCell(LayerRNNCell):
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such
       cases.
+    dtype: Default dtype of the layer (default of `None` means use the type
+      of the first input). Required when `build` is called before `call`.
   """
 
   def __init__(self,
@@ -421,8 +430,9 @@ class GRUCell(LayerRNNCell):
                reuse=None,
                kernel_initializer=None,
                bias_initializer=None,
-               name=None):
-    super(GRUCell, self).__init__(_reuse=reuse, name=name)
+               name=None,
+               dtype=None):
+    super(GRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
     self.input_spec = base_layer.InputSpec(ndim=2)
@@ -531,8 +541,14 @@ class BasicLSTMCell(LayerRNNCell):
   that follows.
   """
 
-  def __init__(self, num_units, forget_bias=1.0,
-               state_is_tuple=True, activation=None, reuse=None, name=None):
+  def __init__(self,
+               num_units,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None):
     """Initialize the basic LSTM cell.
 
     Args:
@@ -550,11 +566,13 @@ class BasicLSTMCell(LayerRNNCell):
       name: String, the name of the layer. Layers with the same name will
         share weights, but to avoid mistakes we require reuse=True in such
         cases.
+      dtype: Default dtype of the layer (default of `None` means use the type
+        of the first input). Required when `build` is called before `call`.
 
       When restoring from CudnnLSTM-trained checkpoints, must use
       `CudnnCompatibleLSTMCell` instead.
     """
-    super(BasicLSTMCell, self).__init__(_reuse=reuse, name=name)
+    super(BasicLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -668,7 +686,7 @@ class LSTMCell(LayerRNNCell):
                initializer=None, num_proj=None, proj_clip=None,
                num_unit_shards=None, num_proj_shards=None,
                forget_bias=1.0, state_is_tuple=True,
-               activation=None, reuse=None, name=None):
+               activation=None, reuse=None, name=None, dtype=None):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -701,11 +719,13 @@ class LSTMCell(LayerRNNCell):
       name: String, the name of the layer. Layers with the same name will
         share weights, but to avoid mistakes we require reuse=True in such
         cases.
+      dtype: Default dtype of the layer (default of `None` means use the type
+        of the first input). Required when `build` is called before `call`.
 
       When restoring from CudnnLSTM-trained checkpoints, use
       `CudnnCompatibleLSTMCell` instead.
     """
-    super(LSTMCell, self).__init__(_reuse=reuse, name=name)
+    super(LSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index f909cd8756..e1abd43ab5 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 173d2eae63..93e7e40199 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index d7f658aaee..465fc1cd9c 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index b9ab487c77..38a387d55a 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
-- 
GitLab


From 78cab9848daa0ba5c6e983e76c504b3ef2c0903e Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Mon, 16 Apr 2018 20:28:02 -0700
Subject: [PATCH 0932/1262] Make ParallelInterleaveDataset saveable.

PiperOrigin-RevId: 193142302
---
 .../interleave_dataset_op_test.py             |  70 ++
 tensorflow/core/kernels/data/BUILD            |   1 +
 .../data/parallel_interleave_dataset_op.cc    | 700 ++++++++++++++++--
 3 files changed, 700 insertions(+), 71 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 256ad8d94d..2df35f81eb 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -94,6 +94,76 @@ class InterleaveDatasetSerializationTest(
     self.run_core_tests(_build_dataset, None, 20)
 
 
+class ParallelInterleaveDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def setUp(self):
+    self.input_values = np.array([4, 5, 6], dtype=np.int64)
+    self.num_repeats = 2
+    self.num_outputs = np.sum(self.input_values) * 2
+
+  def _build_ds(self, cycle_length, block_length, sloppy=False):
+    return (dataset_ops.Dataset.from_tensor_slices(
+        self.input_values).repeat(self.num_repeats).apply(
+            interleave_ops.parallel_interleave(
+                lambda x: dataset_ops.Dataset.range(10 * x, 11 * x),
+                cycle_length, block_length, sloppy)))
+
+  def testSerializationCore(self):
+    # cycle_length > 1, block_length > 1
+    cycle_length = 2
+    block_length = 3
+    self.run_core_tests(
+        lambda: self._build_ds(cycle_length, block_length),
+        lambda: self._build_ds(cycle_length * 2, block_length * 1),
+        self.num_outputs)
+    # cycle_length = 1
+    cycle_length = 1
+    block_length = 3
+    self.run_core_tests(lambda: self._build_ds(cycle_length, block_length),
+                        None, self.num_outputs)
+    # block_length = 1
+    cycle_length = 2
+    block_length = 1
+    self.run_core_tests(lambda: self._build_ds(cycle_length, block_length),
+                        None, self.num_outputs)
+
+  def testSerializationWithSloppy(self):
+    break_points = self.gen_break_points(self.num_outputs, 10)
+    expected_outputs = np.repeat(
+        np.concatenate([np.arange(10 * x, 11 * x) for x in self.input_values]),
+        self.num_repeats).tolist()
+
+    def run_test(cycle_length, block_length):
+      actual = self.gen_outputs(
+          lambda: self._build_ds(cycle_length, block_length, True),
+          break_points, self.num_outputs)
+      self.assertSequenceEqual(sorted(actual), expected_outputs)
+
+    # cycle_length > 1, block_length > 1
+    run_test(2, 3)
+    # cycle_length = 1
+    run_test(1, 3)
+    # block_length = 1
+    run_test(2, 1)
+
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_dataset():
+      return dataset_ops.Dataset.range(10).map(_map_fn).apply(
+          interleave_ops.parallel_interleave(_interleave_fn, 1))
+
+    self.run_core_tests(_build_dataset, None, 20)
+
+
 class ParallelInterleaveDatasetTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 8c4f0218ee..e856ede44b 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -281,6 +281,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 3f88d6dee8..fa33867ec1 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -35,7 +36,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx),
         graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
   }
@@ -80,24 +81,28 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
-                            func_, std::move(other_arguments), &captured_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(
+                 interleave_func_, std::move(other_arguments), &captured_func));
 
     *output =
-        new Dataset(input, std::move(captured_func), cycle_length, block_length,
-                    sloppy, buffer_output_elements, prefetch_input_elements,
-                    output_types_, output_shapes_);
+        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
+                    cycle_length, block_length, sloppy, buffer_output_elements,
+                    prefetch_input_elements, output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, bool sloppy, int64 buffer_output_elements,
             int64 prefetch_input_elements, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          interleave_func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -128,6 +133,52 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       return "ParallelInterleaveDatasetOp::Dataset";
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      Node* sloppy_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
+      Node* buffer_output_elements_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
+      Node* prefetch_input_elements_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
+                                      &prefetch_input_elements_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(interleave_func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node},
+           {2, cycle_length_node},
+           {3, block_length_node},
+           {4, sloppy_node},
+           {5, buffer_output_elements_node},
+           {6, prefetch_input_elements_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
    private:
     int64 num_threads() const {
       return cycle_length_ + prefetch_input_elements_;
@@ -156,17 +207,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     // that a caller will block waiting for an element to be produced.
     //
     // Pointers to these worker states are kept in 2 disjoint data structures:
-    //  1. `interleave_` is a vector containing pointers to `WorkerState`s that
-    //  we
-    //     are interleaving. Worker threads backing these WorkerStates should
-    //     be regularly producing values.
-    //  2. `staging_` is a deque containing pointers to WorkerStates that we
-    //     will move to `interleave_` when an iterator in `interleave_` is
-    //     exhausted.
+    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
+    //     in `workers_` that we are interleaving. Worker threads backing these
+    //     WorkerStates should be regularly producing values.
+    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
+    //     `workers_` that we will move to `interleave_indices_` when an
+    //     iterator in `interleave_indices_` is exhausted.
     //
     // The client calls `GetNext[Internal]()` to retrieve an output element. The
-    // internal implementation updates the state of `interleave_` and `staging_`
-    // as output iterators (run by the worker threads) are exhausted.
+    // internal implementation updates the state of `interleave_indices_` and
+    // `staging_indices_` as output iterators (run by the worker threads) are
+    // exhausted.
     //
     // `input_impl_` is the input iterator that generates arguments for the
     // flat-map function (`captured_func_`). It is set to an iterator at
@@ -175,18 +226,19 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     // memory.
     //
     // A few invariants are maintained:
-    //  1. No element in interleave_ should be a nullptr unless `staging_` is
-    //     empty and `input_impl_` is empty.
+    //  1. No element in interleave_indices_ should be a -1 unless
+    //     `staging_indices_` is empty and `input_impl_` is empty.
     //  2. Every `worker_` element is pointed to by at most one element of the
-    //     union of `interleave_` and `staging_`.
+    //     union of `interleave_indices_` and `staging_indices_`.
     //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
-    //     an element in `interleave_` or `staging_`.
+    //     an element in `interleave_indices_` or `staging_indices_`.
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            workers_(dataset()->num_threads()) {}
+            workers_(dataset()->num_threads()),
+            worker_thread_states_(dataset()->num_threads()) {}
 
       ~Iterator() override {
         mutex_lock l(mu_);
@@ -211,10 +263,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           // not have an item readily available.
           bool can_produce_elements = false;
           bool must_wait_for_input = true;
-          for (int64 i = 0; i < interleave_.size(); ++i) {
-            int64 index = (next_index_ + i) % interleave_.size();
-            WorkerState* current_worker = interleave_[index];
-            if (!current_worker) continue;  // Empty interleave elements.
+          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
+            int64 index = (next_index_ + i) % interleave_indices_.size();
+            int64 current_worker_index = interleave_indices_[index];
+            if (current_worker_index < 0) {
+              continue;  // Empty interleave elements.
+            }
+            WorkerState* current_worker = &workers_[current_worker_index];
             can_produce_elements |= current_worker->MayHaveElements();
             if (!current_worker->outputs.empty()) {
               // We have an element!
@@ -222,7 +277,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
               if (i == 0) {
                 block_count_++;
                 if (block_count_ == dataset()->block_length_) {
-                  next_index_ = (index + 1) % interleave_.size();
+                  next_index_ = (index + 1) % interleave_indices_.size();
                   block_count_ = 0;
                 }
               } else {
@@ -245,7 +300,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
               break;
             } else if (!current_worker->is_producing) {
               // This iterator has reached end of input.
-              interleave_[index] = nullptr;
+              interleave_indices_[index] = -1;
               if (input_impl_) {
                 // Start prefetching a new iterator.
                 std::vector<Tensor> args;
@@ -255,16 +310,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                   input_impl_.reset();
                 } else {
                   current_worker->SetInputs(s, std::move(args));
-                  staging_.emplace_back(current_worker);
+                  staging_indices_.emplace_back(current_worker_index);
                 }
               }
 
-              if (!staging_.empty()) {
-                // Move a worker from `staging_` to `interleave_`.
-                interleave_[index] = staging_.front();
-                staging_.pop_front();
+              if (!staging_indices_.empty()) {
+                // Move a worker from `staging_indices_` to
+                // `interleave_indices_`.
+                interleave_indices_[index] = staging_indices_.front();
+                staging_indices_.pop_front();
 
-                next_index_ = (index + 1) % interleave_.size();
+                next_index_ = (index + 1) % interleave_indices_.size();
                 block_count_ = 0;
                 // Restart the inner [for] loop
                 can_produce_elements = true;
@@ -285,7 +341,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             if (dataset()->sloppy_) {
               sloppy_cond_var_.wait(l);
             } else {
-              interleave_[next_index_]->cond_var.wait(l);
+              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
             }
           }
         }
@@ -293,6 +349,137 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("next_index"), next_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_count"), block_count_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("workers_size"), workers_.size()));
+        for (int i = 0; i < workers_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
+        }
+        for (int i = 0; i < worker_thread_states_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
+                                               interleave_indices_.size()));
+        for (int i = 0; i < interleave_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("interleave_indices_", i)),
+              interleave_indices_[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
+                                               staging_indices_.size()));
+        for (int i = 0; i < staging_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("staging_indices_", i)),
+              staging_indices_[i]));
+        }
+        if (!worker_threads_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("worker_threads_running"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (!reader->Contains(full_name("input_exhausted"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        int64 temp;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
+        next_index_ = size_t(temp);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
+        block_count_ = size_t(temp);
+
+        // Restore WorkerStates.
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("workers_size"), &temp));
+        if (temp != dataset()->num_threads()) {
+          return errors::Internal("Expected ", dataset()->num_threads(),
+                                  " worker states but found ", temp, ".");
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
+        }
+
+        // Restore `interleave_indices_`.
+        std::set<int64> all_indices;
+        {
+          int64 interleave_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
+                                                &interleave_size));
+          interleave_indices_.reserve(interleave_size);
+          for (int64 i = 0; i < interleave_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("interleave_indices_", i)), &temp));
+            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            interleave_indices_.emplace_back(temp);
+          }
+        }
+
+        // Restore `staging_indices_`.
+        {
+          int64 staging_size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("staging_size"), &staging_size));
+          for (int i = 0; i < staging_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("staging_indices_", i)), &temp));
+            if (all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            staging_indices_.emplace_back(temp);
+          }
+        }
+
+        // Start Worker threads.
+        if (reader->Contains(full_name("worker_threads_running"))) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                std::bind(&Iterator::WorkerThread, this,
+                          new IteratorContext(*ctx), i)));
+          }
+        }
+        return Status::OK();
+      }
+
      private:
       // OutputElem contains the information from a call to GetNext by an output
       // iterator.
@@ -345,6 +532,31 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         }
       };
 
+      // The internal state of a worker thread that is not already captured
+      // in its `WorkerState`.
+      //
+      // This is needed only for checkpointing purposes. We keep this
+      // separate from `WorkerState` and guard its fields using a separate
+      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
+      struct WorkerThreadState {
+        // The output element that has been produced from the input iterator
+        // and is waiting to be added to `WorkerState.outputs`.
+        OutputElem output_elem;
+
+        // Whether the input iterator returned an `end_of_sequence`.
+        bool end_of_sequence = false;
+
+        // Status returned from `MakeIteratorFromInputElement`.
+        Status iterator_creation_status;
+
+        // The arguments to be used to construct `iterator`.
+        std::vector<Tensor> input;
+
+        std::unique_ptr<IteratorBase> iterator;
+
+        WorkerThreadState() : output_elem(Status::OK()) {}
+      };
+
       Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (worker_threads_.empty()) {
@@ -363,19 +575,38 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                 std::bind(&Iterator::WorkerThread, this,
                           new IteratorContext(*ctx), i)));
             if (i < dataset()->cycle_length_) {
-              interleave_.push_back(&workers_[i]);
+              interleave_indices_.push_back(i);
             } else {
-              staging_.push_back(&workers_[i]);
+              staging_indices_.push_back(i);
             }
           }
-          DCHECK(interleave_.size() == dataset()->cycle_length_);
-          DCHECK(staging_.size() == dataset()->prefetch_input_elements_);
+          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
+          DCHECK(staging_indices_.size() ==
+                 dataset()->prefetch_input_elements_);
         }
         return Status::OK();
       }
 
       // Produces elements into the worker's output buffers.
       void WorkerThread(IteratorContext* ctx_ptr, const int64 thread_index) {
+        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
+        //
+        // 1. Any local state that may need to be checkpointed should be kept
+        //    in `worker_thread_states_[thread_index]`.
+        // 2. `WorkerThreadState` should contain state that is needed only for
+        //    checkpointing, i.e., if we were to remove checkpointing support,
+        //    we could keep that state as local variables in this thread.
+        // 3. This thread should only read/write state at `thread_index`
+        //    and should not access other thread states.
+        // 4. When restoring from checkpoint, threads are started only after
+        //    the restore is complete.
+        // 5. Once restored from a checkpoint, the local state is edited only
+        //    by this thread. 3 & 4 allow making assumptions like temporarily
+        //    caching local state in this thread and using it outside a lock
+        //    e.g. `make_new_iterator`.
+        // 6. `ckpt_mu_` should be wisely used to create *consistent*
+        //    checkpoint markers.
+
         // std::function arguments are copy-constructable, so we pass raw
         // pointers, and then immediately wrap them to ensure correct ownership.
         std::unique_ptr<IteratorContext> ctx(ctx_ptr);
@@ -383,38 +614,135 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           mutex_lock l(mu_);
           workers_[thread_index].cond_var.notify_all();
         });
-
+        bool make_new_iterator;
+        {
+          tf_shared_lock l(ckpt_mu_);
+          // Decide whether a new iterator should be built.
+          // 1. If there is an existing iterator, we use it.
+          // 2. If there was an error in iterator creation that could not be
+          //    notified to the client we attempt to send that to the client
+          //    first.
+          make_new_iterator =
+              worker_thread_states_[thread_index].iterator == nullptr &&
+              worker_thread_states_[thread_index].iterator_creation_status.ok();
+        }
+        // Even though `make_new_iterator` has cached values from
+        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
+        // it is safe to *read* `make_new_iterator`outside of a lock without
+        // worrying about concurrent changes to values in
+        // `worker_thread_states_[thread_index]`. See comment at the start of
+        // this function for details.
         while (true) {
-          // 1. Wait for input.
-          std::vector<Tensor> input;
-          {
-            mutex_lock l(mu_);
-            while (!cancelled_ && !workers_[thread_index].is_producing) {
-              workers_[thread_index].cond_var.wait(l);
+          // Whether creation of the iterator succeeded.
+          Status iterator_creation_status;
+          // 1. Build a new iterator or use the existing one.
+          if (make_new_iterator) {
+            // 1a. Get new input tensors or use the exiting ones.
+
+            bool read_new_input;
+
+            {
+              tf_shared_lock l(ckpt_mu_);
+              // worker_thread_states_[thread_index].input will be non-empty
+              // if checkpointing happened at CHECKPOINT_MARKER_A.
+              read_new_input =
+                  worker_thread_states_[thread_index].input.empty();
             }
-            if (cancelled_) return;
-            input.swap(workers_[thread_index].input);
-          }
 
-          // 2. Run the user defined function to produce a new iterator.
-          std::unique_ptr<IteratorBase> iterator;
-          Status s = dataset::MakeIteratorFromInputElement(
-              ctx.get(), input, thread_index, dataset()->captured_func_.get(),
-              prefix(), &iterator);
-          input.clear();  // Release memory as early as possible.
+            if (read_new_input) {
+              mutex_lock l(mu_);
+              while (!cancelled_ && !workers_[thread_index].is_producing) {
+                workers_[thread_index].cond_var.wait(l);
+              }
+              if (cancelled_) return;
+              // Copy the input tensors so that we do not need to block on `mu_`
+              // when building the iterator.
+              // We keep a copy of the input tensors in
+              // `WorkerThreadState.input` till the iterator is in use. This is
+              // used in `RestoreInternal` to re-build the iterator.
+              // TODO(b/78046638): Explore ways to avoid tracking the input
+              // tensors.
+              tf_shared_lock ckpt_l(ckpt_mu_);
+              worker_thread_states_[thread_index].input.swap(
+                  workers_[thread_index].input);
+              // CHECKPOINT_MARKER_A
+              // We have the input tensors but have not built the iterator yet.
+            }
 
-          if (!s.ok()) {
+            // 1b. Run the user defined function to produce a new iterator.
+            {
+              tf_shared_lock l(ckpt_mu_);
+              worker_thread_states_[thread_index].iterator_creation_status =
+                  dataset::MakeIteratorFromInputElement(
+                      ctx.get(), worker_thread_states_[thread_index].input,
+                      thread_index, dataset()->captured_func_.get(), prefix(),
+                      &worker_thread_states_[thread_index].iterator);
+              iterator_creation_status =
+                  worker_thread_states_[thread_index].iterator_creation_status;
+              if (!iterator_creation_status.ok()) {
+                worker_thread_states_[thread_index].input.clear();
+              }
+              // CHECKPOINT_MARKER_B
+              // Either an iterator has been successfully built and placed in
+              // `worker_thread_states_[thread_index].iterator` or it failed and
+              // a non-OK status has been put in
+              // `worker_thread_states_[thread_index].iterator_creation_status`.
+            }
+          } else {
+            tf_shared_lock l(ckpt_mu_);
+            iterator_creation_status =
+                worker_thread_states_[thread_index].iterator_creation_status;
+            // Mark that we have used up the restored iterator.
+            make_new_iterator = true;
+          }
+          // 2. Start producing elements or send error state to client if
+          //    iterator creation failed.
+          if (!iterator_creation_status.ok()) {
             mutex_lock l(mu_);
-            workers_[thread_index].outputs.emplace_back(s);
+            // Wait for space in the prefetch queue.
+            while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                      dataset()->buffer_output_elements_) {
+              workers_[thread_index].cond_var.wait(l);
+            }
+            if (cancelled_) return;
+            tf_shared_lock ckpt_l(ckpt_mu_);
+            workers_[thread_index].outputs.emplace_back(
+                iterator_creation_status);
             workers_[thread_index].is_producing = false;
+            worker_thread_states_[thread_index].iterator_creation_status =
+                Status::OK();
+            // CHECKPOINT_MARKER_C
+            // Non-OK iterator creation status has been notified to the
+            // client.
             workers_[thread_index].cond_var.notify_one();
           } else {
-            // 3. Produce elements
             bool end_of_sequence = false;
             while (!end_of_sequence) {
               // 3.a Produce an element!
-              std::vector<Tensor> output_elem;
-              s = iterator->GetNext(ctx.get(), &output_elem, &end_of_sequence);
+              {
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                if (worker_thread_states_[thread_index]
+                        .output_elem.status.ok() &&
+                    worker_thread_states_[thread_index]
+                        .output_elem.output.empty() &&
+                    !worker_thread_states_[thread_index].end_of_sequence) {
+                  worker_thread_states_[thread_index].output_elem.status =
+                      worker_thread_states_[thread_index].iterator->GetNext(
+                          ctx.get(),
+                          &worker_thread_states_[thread_index]
+                               .output_elem.output,
+                          &worker_thread_states_[thread_index].end_of_sequence);
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                } else {
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                }
+                // CHECKPOINT_MARKER_D
+                // An element has been read or an error or end_of_sequence has
+                // been received from the input iterator and is waiting to be
+                // sent to client.
+              }
 
               // 3.b Make it available to the client.
               {
@@ -427,30 +755,255 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                 }
                 if (cancelled_) return;
 
-                // Output the element.
+                tf_shared_lock ckpt_l(ckpt_mu_);
                 workers_[thread_index].is_producing = !end_of_sequence;
-                if (!end_of_sequence) {
-                  workers_[thread_index].outputs.emplace_back(s);
+
+                // Output the element.
+
+                // Move the temporary state in WorkerThreadState to WorkerState
+                // and mark it as used.
+                if (end_of_sequence) {
+                  worker_thread_states_[thread_index].iterator.reset();
+                  worker_thread_states_[thread_index].input.clear();
+                  worker_thread_states_[thread_index].end_of_sequence = false;
+                } else {
+                  workers_[thread_index].outputs.emplace_back(
+                      worker_thread_states_[thread_index].output_elem.status);
                   workers_[thread_index].outputs.back().output.swap(
-                      output_elem);
+                      worker_thread_states_[thread_index].output_elem.output);
                 }
+                worker_thread_states_[thread_index].output_elem.status =
+                    Status::OK();
                 if (dataset()->sloppy_) {
                   sloppy_cond_var_.notify_one();
                 } else {
                   workers_[thread_index].cond_var.notify_one();
                 }
+                // CHECKPOINT_MARKER_E
+                // Output element or iterator status has been sent to the
+                // client.
               }
             }
           }
         }
       }
 
+      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_", index);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            workers_[index].input.size()));
+        for (int i = 0; i < workers_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              workers_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_outputs_size")),
+            workers_[index].outputs.size()));
+        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+              writer, workers_[index].outputs[i],
+              full_name(strings::StrCat(prefix, "_outputs_", i))));
+        }
+        if (workers_[index].is_producing) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_is_producing")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
+                                   IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        workers_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          workers_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &workers_[index].input.back()));
+        }
+        int64 outputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
+            &outputs_size));
+        for (int i = 0; i < outputs_size; ++i) {
+          workers_[index].outputs.emplace_back(Status::OK());
+          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+              reader, &workers_[index].outputs.back(),
+              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
+        }
+        if (reader->Contains(
+                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
+          workers_[index].is_producing = true;
+        } else {
+          workers_[index].is_producing = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
+                                          int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_thread_", index);
+        if (worker_thread_states_[index].iterator != nullptr) {
+          TF_RETURN_IF_ERROR(
+              SaveParent(writer, worker_thread_states_[index].iterator));
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            worker_thread_states_[index].input.size()));
+        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              worker_thread_states_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_iterator_creation_status"),
+            worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+            writer, worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(prefix, "_output"))));
+        if (worker_thread_states_[index].end_of_sequence) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
+                                         IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_thread_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        worker_thread_states_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          worker_thread_states_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &worker_thread_states_[index].input.back()));
+        }
+        // Restore iterator.
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
+          worker_thread_states_[index].iterator.reset();
+        } else {
+          std::unique_ptr<IteratorBase> iterator;
+          Status s = dataset::MakeIteratorFromInputElement(
+              ctx, worker_thread_states_[index].input, index,
+              dataset()->captured_func_.get(), prefix(), &iterator);
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, iterator));
+          worker_thread_states_[index].iterator.swap(iterator);
+        }
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
+            &worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+            reader, &worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(worker_prefix, "_output"))));
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
+          worker_thread_states_[index].end_of_sequence = true;
+        } else {
+          worker_thread_states_[index].end_of_sequence = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteOutputElemLocked(IteratorStateWriter* writer,
+                                   const OutputElem& output_elem,
+                                   const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_status"), output_elem.status));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
+                                output_elem.output.size()));
+        for (int i = 0; i < output_elem.output.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadOutputElemLocked(IteratorStateReader* reader,
+                                  OutputElem* output_elem, const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            strings::StrCat(prefix, "_output_size"), &output_size));
+        output_elem->output.reserve(output_size);
+        for (int i = 0; i < output_size; ++i) {
+          output_elem->output.emplace_back();
+          TF_RETURN_IF_ERROR(
+              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
+                                 &output_elem->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
       // Mutex & condition variable to guard mutable iterator internals and
       // coordinate among worker threads and client thread[s].
-      mutex mu_;
+      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
       // The main thread waits on this condition variable if running in sloppy
       // mode and no values are available.
       condition_variable sloppy_cond_var_;
+      // Mutex used to wait for a consistent state while checkpointing.
+      // Only Save and Restore require an exclusive lock on this mutex. In
+      // other scenarios we just acquire a shared lock so the pipeline's
+      // performance should not be affected in the absence of checkpointing.
+      // A thread must not wait on any condition variable while holding
+      // `ckpt_mu_` in either shared or exclusive modes.
+      mutex ckpt_mu_;
 
       // The iterator producing elements which are converted to datasets by
       // the dataset()->captured_func_ then interleaved together.
@@ -461,10 +1014,14 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // workers_ elements are in at most one of interleave_ and staging_.
       std::vector<WorkerState> workers_ GUARDED_BY(mu_);
 
-      // The iterators to interleave
-      std::vector<WorkerState*> interleave_ GUARDED_BY(mu_);
-      // Prefetched iterators
-      std::deque<WorkerState*> staging_ GUARDED_BY(mu_);
+      // Stores the temporary state of WorkerThreads which is not stored in
+      // WorkerState. This is used for checkpointing purposes only.
+      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
+
+      // Indices in `workers_` of iterators to interleave.
+      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
+      // Indices in `workers_` of prefetched iterators.
+      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
 
       // The index into output_elements_ for next element to produce.
       size_t next_index_ GUARDED_BY(mu_) = 0;
@@ -479,6 +1036,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
+    const NameAttrList interleave_func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
@@ -492,7 +1050,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
   const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
+  NameAttrList interleave_func_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
-- 
GitLab


From 2fc312a5787ec24b114c3e889c42e8df2450e145 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 17 Apr 2018 12:58:19 +0800
Subject: [PATCH 0933/1262] Fix Warning in reduce_mean/ reduce_max related
 deprecation argument keep_dims

---
 .../contrib/bayesflow/python/ops/monte_carlo_impl.py |  4 ++--
 .../python/training/functions/gbdt_batch_test.py     |  2 +-
 .../contrib/estimator/python/estimator/head.py       |  2 +-
 .../contrib/factorization/python/ops/gmm_ops.py      | 12 ++++++------
 .../python/eval/python/sliced_wasserstein_impl.py    |  2 +-
 .../python/features/python/virtual_batchnorm_impl.py |  6 +++---
 tensorflow/contrib/kfac/python/ops/loss_functions.py |  6 +++---
 .../contrib/labeled_tensor/python/ops/ops_test.py    |  4 ++--
 tensorflow/contrib/metrics/python/ops/metric_ops.py  |  2 +-
 tensorflow/contrib/nn/python/ops/sampling_ops.py     |  2 +-
 .../contrib/slim/python/slim/nets/resnet_v1.py       |  2 +-
 .../contrib/slim/python/slim/nets/resnet_v2.py       |  2 +-
 .../timeseries/python/timeseries/math_utils.py       |  2 +-
 .../examples/tutorials/word2vec/word2vec_basic.py    |  2 +-
 tensorflow/python/grappler/layout_optimizer_test.py  | 10 +++++-----
 .../python/kernel_tests/distributions/util_test.py   |  2 +-
 16 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index d193a8459d..48ff083532 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -329,7 +329,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
     if not callable(f):
       raise ValueError('`f` must be a callable function.')
     if use_reparametrization:
-      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
     else:
       if not callable(log_prob):
         raise ValueError('`log_prob` must be a callable function.')
@@ -349,7 +349,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
       # "Is there a floating point value of x, for which x-x == 0 is false?"
       # http://stackoverflow.com/q/2686644
       fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
-      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)
 
 
 def _sample_mean(values):
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 17dcb49f47..f9c22283b7 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -45,7 +45,7 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keep_dims=True)
+      math_ops.square(predictions - label), 1, keepdims=True)
   return loss
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index ae2fd8b490..3dcf0374c8 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -485,7 +485,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
           reduction=losses.Reduction.NONE)
       # Averages loss over classes.
       unweighted_loss = math_ops.reduce_mean(
-          unweighted_loss, axis=-1, keep_dims=True)
+          unweighted_loss, axis=-1, keepdims=True)
     weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 5d77bc77e1..ccdd679d6a 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -54,10 +54,10 @@ def _covariance(x, diag):
   diagonal matrix just the diagonal is returned.
   """
   num_points = math_ops.to_float(array_ops.shape(x)[0])
-  x -= math_ops.reduce_mean(x, 0, keep_dims=True)
+  x -= math_ops.reduce_mean(x, 0, keepdims=True)
   if diag:
     cov = math_ops.reduce_sum(
-        math_ops.square(x), 0, keep_dims=True) / (num_points - 1)
+        math_ops.square(x), 0, keepdims=True) / (num_points - 1)
   else:
     cov = math_ops.matmul(x, x, transpose_a=True) / (num_points - 1)
   return cov
@@ -313,7 +313,7 @@ class GmmAlgorithm(object):
     # TODO(xavigonzalvo): look into alternatives to log for
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
-        math_ops.log(self._covs + 1e-3), 1, keep_dims=True)
+        math_ops.log(self._covs + 1e-3), 1, keepdims=True)
     diff = shard - self._means
     x2 = math_ops.square(diff)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
@@ -351,7 +351,7 @@ class GmmAlgorithm(object):
       shard_id: id of current shard_id.
     """
     self._prior_probs[shard_id] = math_ops.reduce_logsumexp(
-        self._probs[shard_id], axis=1, keep_dims=True)
+        self._probs[shard_id], axis=1, keepdims=True)
 
   def _define_expectation_operation(self, shard_id):
     # Shape broadcasting.
@@ -375,7 +375,7 @@ class GmmAlgorithm(object):
     """
     # Soft assignment of each data point to each of the two clusters.
     self._points_in_k[shard_id] = math_ops.reduce_sum(
-        self._w[shard_id], 0, keep_dims=True)
+        self._w[shard_id], 0, keepdims=True)
     # Partial means.
     w_mul_x = array_ops.expand_dims(
         math_ops.matmul(
@@ -454,7 +454,7 @@ class GmmAlgorithm(object):
     for shard_id, prior_probs in enumerate(self._prior_probs):
       op.append(prior_probs + math_ops.log(self._w[shard_id]))
     self._scores = array_ops.squeeze(
-        math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
+        math_ops.reduce_logsumexp(op, axis=2, keepdims=True), axis=0)
 
 
 def gmm(inp,
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
index 4b10bc0f8e..4b1105f6bd 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -161,7 +161,7 @@ def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
     proj = random_ops.random_normal(
         [array_ops.shape(a)[1], random_projection_dim])
     proj *= math_ops.rsqrt(
-        math_ops.reduce_sum(math_ops.square(proj), 0, keep_dims=True))
+        math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True))
     # Project both distributions and sort them.
     proj_a = math_ops.matmul(a, proj)
     proj_b = math_ops.matmul(b, proj)
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
index f8b372546b..650eab97a3 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
@@ -64,11 +64,11 @@ def _statistics(x, axes):
   y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
 
   # Compute true mean while keeping the dims for proper broadcasting.
-  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keep_dims=True))
+  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))
 
-  shifted_mean = math_ops.reduce_mean(y - shift, axes, keep_dims=True)
+  shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True)
   mean = shifted_mean + shift
-  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keep_dims=True)
+  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)
 
   mean = array_ops.squeeze(mean, axes)
   mean_squared = array_ops.squeeze(mean_squared, axes)
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index e7d4243fc3..42d525c2c2 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -613,19 +613,19 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
   def multiply_fisher(self, vector):
     probs = self._probs
     return vector * probs - probs * math_ops.reduce_sum(
-        vector * probs, axis=-1, keep_dims=True)
+        vector * probs, axis=-1, keepdims=True)
 
   def multiply_fisher_factor(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=-1, keep_dims=True)
+        sqrt_probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_transpose(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=-1, keep_dims=True)
+        probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_replicated_one_hot(self, index):
     assert len(index) == 1, "Length of index was {}".format(len(index))
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index 0727f4cf88..39e9d65407 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -660,7 +660,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, {('channel', 'hihowareyou')})
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
@@ -668,7 +668,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, ('channel', 'hihowareyou'))
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 2bf281b791..9fe76c1229 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -3235,7 +3235,7 @@ def streaming_mean_cosine_distance(predictions,
   radial_diffs = math_ops.reduce_sum(
       radial_diffs, reduction_indices=[
           dim,
-      ], keep_dims=True)
+      ], keepdims=True)
   mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None,
                                             name or 'mean_cosine_distance')
   mean_distance = math_ops.subtract(1.0, mean_distance)
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 63fc487dca..e65925610c 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -88,7 +88,7 @@ def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
     return math_ops.reduce_logsumexp(
         math_ops.matmul(embeddings, reweighted_inputs, transpose_b=True),
         axis=1,
-        keep_dims=False)
+        keepdims=False)
 
   # Calling this protected form of embedding_lookup allows co-locating
   # the logsumexp computation with the partitioned weights, which yields
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
index 235a595de4..11c4214176 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
@@ -207,7 +207,7 @@ def resnet_v1(inputs,
         net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers.conv2d(
               net,
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
index 61665c9c8b..19e0538dd1 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
@@ -221,7 +221,7 @@ def resnet_v2(inputs,
             net, activation_fn=nn_ops.relu, scope='postnorm')
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers_lib.conv2d(
               net,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 26793c80bf..9b593fecbb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -60,7 +60,7 @@ def clip_covariance(
   # TODO(allenl): Smarter scaling here so that correlations are preserved when
   # fiddling with diagonal elements.
   diagonal = array_ops.matrix_diag_part(covariance_matrix)
-  maximum = math_ops.reduce_max(diagonal, axis=-1, keep_dims=True)
+  maximum = math_ops.reduce_max(diagonal, axis=-1, keepdims=True)
   new_diagonal = gen_math_ops.maximum(
       diagonal, maximum / maximum_variance_ratio)
   return array_ops.matrix_set_diag(
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 14ae7fbf35..b09ee99768 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -224,7 +224,7 @@ with graph.as_default():
     optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
   # Compute the cosine similarity between minibatch examples and all embeddings.
-  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
+  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
   normalized_embeddings = embeddings / norm
   valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                             valid_dataset)
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 5a84b16a23..e3dd4b0bdf 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -476,7 +476,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keepdims=True)
       squeeze = array_ops.squeeze(reduce_sum, axis=[1, 2])
       output = array_ops.identity(squeeze)
 
@@ -506,7 +506,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keepdims=True)
       squeeze = array_ops.squeeze(reduce_sum, axis=[0, 1, 2])
       output = array_ops.identity(squeeze)
 
@@ -623,7 +623,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -653,7 +653,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[2], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[2], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
@@ -682,7 +682,7 @@ class LayoutOptimizerTest(test.TestCase):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keep_dims=True)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keepdims=True)
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index f54f146e0a..d1381c086c 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -785,7 +785,7 @@ class FillTriangularTest(test.TestCase):
 @test_util.with_c_api
 class ReduceWeightedLogSumExp(test.TestCase):
 
-  def _reduce_weighted_logsumexp(self, logx, w, axis, keep_dims=False):
+  def _reduce_weighted_logsumexp(self, logx, w, axis, keepdims=False):
     m = np.max(logx, axis=axis, keepdims=True)
     sum_ = np.sum(w * np.exp(logx - m), axis=axis, keepdims=keep_dims)
     sgn = np.sign(sum_)
-- 
GitLab


From f71253dbfac74b8c11b1a1aa4984a250ed980058 Mon Sep 17 00:00:00 2001
From: "wenhao.hu" <wenhao.hu@leapmind.io>
Date: Tue, 17 Apr 2018 14:01:06 +0900
Subject: [PATCH 0934/1262] fix init_ops.py

---
 tensorflow/python/ops/init_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 09cf6dd238..dd27ce3f80 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -656,7 +656,7 @@ class ConvolutionOrthogonal2D(Initializer):
     a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
     if self.seed:
       self.seed += 1
-    q, r = linalg_ops.qr(a)
+    q, r = gen_linalg_ops.qr(a)
     d = array_ops.diag_part(r)
     # make q uniform
     q *= math_ops.sign(d)
@@ -709,7 +709,7 @@ class ConvolutionOrthogonal2D(Initializer):
       raise ValueError("The dimension of the matrices must be the same.")
     n = p1.shape.as_list()[0]
     kernel2x2 = {}
-    eye = linalg_ops.eye(n, dtype=self.dtype)
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel2x2[0, 0] = math_ops.matmul(p1, p2)
     kernel2x2[0, 1] = math_ops.matmul(p1, (eye - p2))
     kernel2x2[1, 0] = math_ops.matmul((eye - p1), p2)
-- 
GitLab


From 57f64fe469364417cfc6755c754abb54c2e3756b Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 17 Apr 2018 13:03:56 +0800
Subject: [PATCH 0935/1262] revert unwanted typo

---
 tensorflow/python/kernel_tests/distributions/util_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index d1381c086c..f54f146e0a 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -785,7 +785,7 @@ class FillTriangularTest(test.TestCase):
 @test_util.with_c_api
 class ReduceWeightedLogSumExp(test.TestCase):
 
-  def _reduce_weighted_logsumexp(self, logx, w, axis, keepdims=False):
+  def _reduce_weighted_logsumexp(self, logx, w, axis, keep_dims=False):
     m = np.max(logx, axis=axis, keepdims=True)
     sum_ = np.sum(w * np.exp(logx - m), axis=axis, keepdims=keep_dims)
     sgn = np.sign(sum_)
-- 
GitLab


From c2643d12c552799532b933238711d5c433e4df17 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 16 Apr 2018 23:07:24 -0700
Subject: [PATCH 0936/1262] [tf.data] Add an API for randomly sampling from
 multiple datasets.

Fixes #15999.

PiperOrigin-RevId: 193152683
---
 tensorflow/contrib/data/__init__.py           |   2 +
 tensorflow/contrib/data/kernels/BUILD         |  12 +
 .../kernels/directed_interleave_dataset_op.cc | 274 ++++++++++++++++++
 tensorflow/contrib/data/ops/dataset_ops.cc    |  17 ++
 .../interleave_dataset_op_test.py             | 103 +++++++
 tensorflow/contrib/data/python/ops/BUILD      |  10 +
 .../contrib/data/python/ops/interleave_ops.py | 100 +++++++
 7 files changed, 518 insertions(+)
 create mode 100644 tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc

diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 637b1dc46c..077cbba9d2 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -41,6 +41,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@prefetch_to_device
 @@read_batch_features
 @@rejection_resample
+@@sample_from_datasets
 @@scan
 @@shuffle_and_repeat
 @@sliding_window_batch
@@ -69,6 +70,7 @@ from tensorflow.contrib.data.python.ops.get_single_element import get_single_ele
 from tensorflow.contrib.data.python.ops.grouping import bucket_by_sequence_length
 from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
+from tensorflow.contrib.data.python.ops.interleave_ops import sample_from_datasets
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
 from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index 83ada6fb67..c56910c783 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -18,6 +18,17 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "directed_interleave_dataset_op",
+    srcs = ["directed_interleave_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "ignore_errors_dataset_op",
     srcs = ["ignore_errors_dataset_op.cc"],
@@ -52,6 +63,7 @@ cc_library(
 cc_library(
     name = "dataset_kernels",
     deps = [
+        ":directed_interleave_dataset_op",
         ":ignore_errors_dataset_op",
         ":prefetching_kernels",
         ":threadpool_dataset_op",
diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
new file mode 100644
index 0000000000..48d3734162
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
@@ -0,0 +1,274 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/hash/hash.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class DirectedInterleaveDatasetOp : public DatasetOpKernel {
+ public:
+  explicit DirectedInterleaveDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    DatasetBase* selector_input;
+    OP_REQUIRES_OK(ctx,
+                   GetDatasetFromVariantTensor(ctx->input(0), &selector_input));
+
+    OP_REQUIRES(
+        ctx,
+        selector_input->output_dtypes().size() == 1 &&
+            selector_input->output_dtypes()[0] == DT_INT64 &&
+            selector_input->output_shapes().size() == 1 &&
+            selector_input->output_shapes()[0].IsCompatibleWith(
+                PartialTensorShape({})),
+        errors::InvalidArgument(
+            "The selector input must be a dataset of scalar int64 elements."));
+
+    std::vector<DatasetBase*> data_inputs;
+    for (size_t i = 1; i < ctx->num_inputs(); ++i) {
+      DatasetBase* input;
+      OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(i), &input));
+      data_inputs.push_back(input);
+
+      OP_REQUIRES(
+          ctx, data_inputs[0]->output_dtypes() == input->output_dtypes(),
+          errors::InvalidArgument(
+              "All inputs must have the same output_dtypes. First input "
+              "has types ",
+              DataTypeVectorString(data_inputs[0]->output_dtypes()),
+              ", and input ", i - 1, " has types ",
+              DataTypeVectorString(input->output_dtypes())));
+    }
+    *output = new Dataset(ctx, selector_input, std::move(data_inputs));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* selector_input,
+            std::vector<DatasetBase*> data_inputs)
+        : GraphDatasetBase(ctx),
+          selector_input_(selector_input),
+          data_inputs_(std::move(data_inputs)) {
+      selector_input_->Ref();
+
+      output_shapes_ = data_inputs_[0]->output_shapes();
+      data_inputs_[0]->Ref();
+      for (size_t i = 1; i < data_inputs_.size(); ++i) {
+        const DatasetBase* data_input = data_inputs_[i];
+        data_input->Ref();
+        for (size_t j = 0; j < output_shapes_.size(); ++j) {
+          output_shapes_[j] = MostSpecificCompatibleShape(
+              output_shapes_[j], data_input->output_shapes()[j]);
+        }
+      }
+    }
+
+    ~Dataset() override {
+      selector_input_->Unref();
+      for (DatasetBase* data_input : data_inputs_) {
+        data_input->Unref();
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::DirectedInterleave")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return data_inputs_[0]->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("DirectedInterleaveDatasetOp::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* selector_input_node;
+      TF_RETURN_IF_ERROR(
+          b->AddParentDataset(ctx, selector_input_, &selector_input_node));
+      std::vector<Node*> data_input_nodes(data_inputs_.size());
+      for (size_t i = 0; i < data_inputs_.size(); ++i) {
+        TF_RETURN_IF_ERROR(
+            b->AddParentDataset(ctx, data_inputs_[i], &data_input_nodes[i]));
+      }
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, selector_input_node}},
+                                       {{1, data_input_nodes}}, {}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            selector_input_impl_(params.dataset->selector_input_->MakeIterator(
+                params.prefix + ".selector")),
+            num_active_inputs_(params.dataset->data_inputs_.size()) {
+        data_input_impls_.reserve(params.dataset->data_inputs_.size());
+        for (size_t i = 0; i < params.dataset->data_inputs_.size(); ++i) {
+          const DatasetBase* data_input = params.dataset->data_inputs_[i];
+          data_input_impls_.push_back(data_input->MakeIterator(
+              strings::StrCat(params.prefix, "[", i, "]")));
+        }
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!selector_input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        while (true) {
+          std::vector<Tensor> selector_result;
+          *end_of_sequence = false;
+          TF_RETURN_IF_ERROR(selector_input_impl_->GetNext(
+              ctx, &selector_result, end_of_sequence));
+          if (*end_of_sequence) {
+            selector_input_impl_.reset();
+            for (auto& data_input_impl : data_input_impls_) {
+              data_input_impl.reset();
+            }
+            return Status::OK();
+          }
+
+          int64 selected_input = selector_result[0].scalar<int64>()();
+          if (selected_input < 0 || selected_input > data_input_impls_.size()) {
+            return errors::InvalidArgument(
+                "Selector index out of range: ", selected_input,
+                " >= ", data_input_impls_.size());
+          }
+
+          if (data_input_impls_[selected_input]) {
+            bool end_of_selected_input = false;
+            TF_RETURN_IF_ERROR(data_input_impls_[selected_input]->GetNext(
+                ctx, out_tensors, &end_of_selected_input));
+
+            if (!end_of_selected_input) {
+              return Status::OK();
+            }
+
+            data_input_impls_[selected_input].reset();
+            --num_active_inputs_;
+
+            if (num_active_inputs_ == 0) {
+              selector_input_impl_.reset();
+              *end_of_sequence = true;
+              return Status::OK();
+            }
+          }
+
+          LOG(WARNING) << "DirectedInterleave selected an exhausted input: "
+                       << selected_input;
+        }
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (selector_input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, selector_input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("selector_input_impl_empty"), ""));
+        }
+        for (size_t i = 0; i < data_input_impls_.size(); ++i) {
+          const auto& data_input_impl = data_input_impls_[i];
+          if (data_input_impl) {
+            TF_RETURN_IF_ERROR(SaveParent(writer, data_input_impl));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("data_input_impl_empty[", i, "]")),
+                ""));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("selector_input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, selector_input_impl_));
+        } else {
+          selector_input_impl_.reset();
+        }
+        for (size_t i = 0; i < data_input_impls_.size(); ++i) {
+          if (!reader->Contains(full_name(
+                  strings::StrCat("data_input_impl_empty[", i, "]")))) {
+            TF_RETURN_IF_ERROR(
+                RestoreParent(ctx, reader, data_input_impls_[i]));
+          } else {
+            data_input_impls_[i].reset();
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> selector_input_impl_ GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<IteratorBase>> data_input_impls_
+          GUARDED_BY(mu_);
+      int64 num_active_inputs_ GUARDED_BY(mu_);
+    };
+
+    static PartialTensorShape MostSpecificCompatibleShape(
+        const PartialTensorShape& ts1, const PartialTensorShape& ts2) {
+      PartialTensorShape output_tensorshape;
+      if (ts1.dims() != ts2.dims() || ts1.unknown_rank() || ts2.unknown_rank())
+        return output_tensorshape;
+      auto dims1 = ts1.dim_sizes();
+      auto dims2 = ts2.dim_sizes();
+      for (int d = 0; d < ts1.dims(); d++) {
+        if (dims1[d] == dims2[d])
+          output_tensorshape.Concatenate(dims1[d]);
+        else
+          output_tensorshape.Concatenate(-1);
+      }
+      return output_tensorshape;
+    }
+
+    const DatasetBase* const selector_input_;
+    const std::vector<DatasetBase*> data_inputs_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("DirectedInterleaveDataset").Device(DEVICE_CPU),
+                        DirectedInterleaveDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index cf0a8bbccb..137deb6352 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -17,6 +17,23 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("DirectedInterleaveDataset")
+    .Input("selector_input_dataset: variant")
+    .Input("data_input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+
+selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines
+  which of the `N` data inputs should produce the next output element.
+data_input_datasets: `N` datasets with the same type that will be interleaved
+  according to the values of `selector_input_dataset`.
+)doc");
+
 REGISTER_OP("IgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 2df35f81eb..f8556a1b28 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -906,5 +907,107 @@ class ParallelInterleaveDatasetTest(test.TestCase):
         sess.run(self.next_element)
 
 
+class DirectedInterleaveDatasetTest(test.TestCase):
+
+  def testBasic(self):
+    selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
+    input_datasets = [
+        dataset_ops.Dataset.from_tensors(i).repeat(100) for i in range(10)
+    ]
+    dataset = interleave_ops.DirectedInterleaveDataset(selector_dataset,
+                                                       input_datasets)
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(100):
+        for i in range(10):
+          self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def _normalize(self, vec):
+    batched = (len(vec.shape) == 2)
+    return vec / vec.sum(axis=1, keepdims=True) if batched else vec / vec.sum()
+
+  def _chi2(self, expected, actual):
+    actual = np.asarray(actual)
+    expected = np.asarray(expected)
+    diff = actual - expected
+    chi2 = np.sum(diff * diff / expected, axis=0)
+    return chi2
+
+  def testSampleFromDatasets(self):
+    random_seed.set_random_seed(1618)
+    num_samples = 10000
+    rand_probs = self._normalize(np.random.random_sample((10,)))
+    rand_probs2 = self._normalize(np.random.random_sample((15,)))
+
+    for probs in [[.5, .5], [.85, .05, .1], rand_probs, rand_probs2]:
+      probs = np.asarray(probs)
+
+      # Create a dataset that samples each integer in `[0, probs.shape[0])`
+      # with probability given by `probs[i]`.
+      dataset = interleave_ops.sample_from_datasets([
+          dataset_ops.Dataset.from_tensors(i).repeat(None)
+          for i in range(probs.shape[0])
+      ], probs)
+      dataset = dataset.take(num_samples)
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+
+      with self.test_session() as sess:
+        freqs = np.zeros_like(probs)
+        for _ in range(num_samples):
+          freqs[sess.run(next_element)] += 1
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(next_element)
+
+      # Use chi-squared test to assert that the observed distribution
+      # matches the expected distribution. Based on the implementation
+      # in "tensorflow/python/kernel_tests/multinomial_op_test.py".
+      self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
+
+  def testErrors(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"vector of length `len\(datasets\)`"):
+      interleave_ops.sample_from_datasets(
+          [dataset_ops.Dataset.range(10),
+           dataset_ops.Dataset.range(20)],
+          weights=[0.25, 0.25, 0.25, 0.25])
+
+    with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"):
+      interleave_ops.sample_from_datasets(
+          [dataset_ops.Dataset.range(10),
+           dataset_ops.Dataset.range(20)],
+          weights=[1, 1])
+
+    with self.assertRaisesRegexp(TypeError, "must have the same type"):
+      interleave_ops.sample_from_datasets([
+          dataset_ops.Dataset.from_tensors(0),
+          dataset_ops.Dataset.from_tensors(0.0)
+      ])
+
+
+class SampleFromDatasetsSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset(self, probs, num_samples):
+    dataset = interleave_ops.sample_from_datasets(
+        [
+            dataset_ops.Dataset.from_tensors(i).repeat(None)
+            for i in range(len(probs))
+        ],
+        probs,
+        seed=1813)
+    return dataset.take(num_samples)
+
+  def testSerializationCore(self):
+    self.run_core_tests(
+        lambda: self._build_dataset([0.5, 0.5], 100),
+        lambda: self._build_dataset([0.25, 0.25, 0.25, 0.25], 1000), 100)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 0e4590829b..e00f2304cc 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -172,8 +172,18 @@ py_library(
     srcs = ["interleave_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        ":random_ops",
+        "//tensorflow/contrib/stateless",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 91f19da02d..106a1ef388 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -17,7 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib import stateless
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.contrib.data.python.ops import random_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util import deprecation
 
 
@@ -140,3 +151,92 @@ def sloppy_interleave(map_func, cycle_length, block_length=1):
         prefetch_input_elements=None)
 
   return _apply_fn
+
+
+class DirectedInterleaveDataset(dataset_ops.Dataset):
+  """A substitute for `Dataset.interleave()` on a fixed list of datasets."""
+
+  def __init__(self, selector_input, data_inputs):
+    self._selector_input = selector_input
+    self._data_inputs = list(data_inputs)
+
+    for data_input in data_inputs[1:]:
+      if (data_input.output_types != data_inputs[0].output_types or
+          data_input.output_classes != data_inputs[0].output_classes):
+        raise TypeError("All datasets must have the same type.")
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return gen_dataset_ops.directed_interleave_dataset(
+        self._selector_input._as_variant_tensor(),
+        [data_input._as_variant_tensor() for data_input in self._data_inputs],
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._data_inputs[0].output_classes
+
+  @property
+  def output_shapes(self):
+    ret = self._data_inputs[0].output_shapes
+    for data_input in self._data_inputs[1:]:
+      ret = nest.pack_sequence_as(ret, [
+          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
+              nest.flatten(ret), nest.flatten(data_input.output_shapes))
+      ])
+    return ret
+
+  @property
+  def output_types(self):
+    return self._data_inputs[0].output_types
+
+
+def sample_from_datasets(datasets, weights=None, seed=None):
+  """Samples elements at random from the datasets in `datasets`.
+
+  Args:
+    datasets: A list of @{tf.data.Dataset} objects with compatible structure.
+    weights: (Optional.) A list of `len(datasets)` floating-point values,
+      where `weights[i]` represents the probability with which an element
+      should be sampled from `datasets[i]`. Defaults to a uniform distribution
+      across `datasets`.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      @{tf.set_random_seed} for behavior.
+
+  Returns:
+    A dataset that interleaves elements from `datasets` at random, according to
+    `weights` if provided, otherwise with uniform probability.
+
+  Raises:
+    TypeError: If the `datasets` or `weights` arguments have the wrong type.
+    ValueError: If the `weights` argument is specified and does not match the
+      length of the `datasets` element.
+  """
+  num_datasets = len(datasets)
+  if weights is None:
+    weights = array_ops.ones(
+        [num_datasets], dtype=dtypes.float32, name="weights")
+  else:
+    weights = ops.convert_to_tensor(weights, name="weights")
+    if weights.dtype not in (dtypes.float32, dtypes.float64):
+      raise TypeError("`weights` must be convertible to a tensor of "
+                      "`tf.float32` or `tf.float64` elements.")
+    if not weights.shape.is_compatible_with([num_datasets]):
+      raise ValueError("`weights` must be a vector of length `len(datasets)`.")
+
+  # The `stateless_multinomial()` op expects log-probabilities, as opposed to
+  # weights.
+  logits = math_ops.log(weights, name="logits")
+
+  def select_dataset(seed):
+    return array_ops.squeeze(
+        stateless.stateless_multinomial([logits], 1, seed=seed), axis=[0, 1])
+
+  selector_input = random_ops.RandomDataset(seed).batch(2).map(select_dataset)
+
+  return DirectedInterleaveDataset(selector_input, datasets)
-- 
GitLab


From 5ee30ac7c1affb8b214c6fb08aa83fded8d1374d Mon Sep 17 00:00:00 2001
From: "wenhao.hu" <wenhao.hu@leapmind.io>
Date: Tue, 17 Apr 2018 15:49:44 +0900
Subject: [PATCH 0937/1262] fix non-whitelited pylint errors

---
 tensorflow/python/ops/init_ops.py   | 1 -
 tensorflow/python/ops/linalg_ops.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index dd27ce3f80..9ecc639dbc 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -43,7 +43,6 @@ from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 805fbd99ef..a0dfa543f9 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
-- 
GitLab


From 794dfc1752564ee5f5813a72a72fa7f2a9da17a9 Mon Sep 17 00:00:00 2001
From: "wenhao.hu" <wenhao.hu@leapmind.io>
Date: Tue, 17 Apr 2018 16:03:24 +0900
Subject: [PATCH 0938/1262] loose test to 1e-5

---
 tensorflow/python/kernel_tests/norm_op_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index dde28007d4..3f71b326a2 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -76,7 +76,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
         tf_norm = linalg_ops.norm(
             tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
-    self.assertAllClose(np_norm, tf_norm_val)
+    self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
 
   def Test(self):
     is_matrix_norm = (isinstance(axis_, tuple) or
-- 
GitLab


From ed260b6e7f87938eae34de67e4df3ebb36451806 Mon Sep 17 00:00:00 2001
From: Wenhao Hu <fumihwh@gmail.com>
Date: Tue, 17 Apr 2018 17:22:06 +0900
Subject: [PATCH 0939/1262] change the year of copyright

---
 tensorflow/python/ops/linalg_ops_impl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/linalg_ops_impl.py b/tensorflow/python/ops/linalg_ops_impl.py
index 9263b95336..e7c89f6ae3 100644
--- a/tensorflow/python/ops/linalg_ops_impl.py
+++ b/tensorflow/python/ops/linalg_ops_impl.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
-- 
GitLab


From 02dfbe61e3ca0869ca3d4d9a5f02b17306c7b36e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 02:23:10 -0700
Subject: [PATCH 0940/1262] Automated g4 rollback of changelist 192842670

PiperOrigin-RevId: 193168327
---
 .../ci_build/windows/bazel/bazel_test_lib.sh  |  7 ++++++
 .../windows/cpu/pip/build_tf_windows.sh       | 23 +++++++++++++++----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index d654b433e7..582188fc00 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -140,6 +140,13 @@ function run_configure_for_gpu_build {
   echo "" | ./configure
 }
 
+function set_gcs_remote_cache_options {
+  echo "build --experimental_remote_spawn_cache" >> "${TMP_BAZELRC}"
+  echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> "${TMP_BAZELRC}"
+  echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> "${TMP_BAZELRC}"
+  echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}"
+}
+
 function create_python_test_dir() {
   rm -rf "$1"
   mkdir -p "$1"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 5e9ae497e1..632f1ef564 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,20 +42,30 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
+# Recreate an empty bazelrc file under source root
+export TMP_BAZELRC=.tmp.bazelrc
+rm -f "${TMP_BAZELRC}"
+touch "${TMP_BAZELRC}"
+
 skip_test=0
 
 for ARG in "$@"; do
   if [[ "$ARG" == --skip_test ]]; then
     skip_test=1
+  elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then
+    set_gcs_remote_cache_options
   fi
 done
 
-run_configure_for_cpu_build
-
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
 # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-BUILD_OPTS="--define=override_eigen_strong_inline=true"
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
+
+echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
+
+run_configure_for_cpu_build
+
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$skip_test" == 1 ]]; then
   exit 0
@@ -73,10 +83,13 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
   --flaky_test_attempts=3 \
   //${PY_TEST_DIR}/tensorflow/python/... \
   //${PY_TEST_DIR}/tensorflow/contrib/...
+
+# Remove all options in .tmp.bazelrc
+echo "" > "${TMP_BAZELRC}"
-- 
GitLab


From 777f843ad1b57f0674185963bbf4b72c36d0dd4c Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Tue, 17 Apr 2018 21:19:46 +0800
Subject: [PATCH 0941/1262] Fix warnings for initialize_variables

---
 .../contrib/framework/python/framework/tensor_util_test.py      | 2 +-
 tensorflow/python/framework/graph_util_test.py                  | 2 +-
 tensorflow/python/training/saver_test.py                        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index a2834b6489..8fc4f60492 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -48,7 +48,7 @@ class LocalVariabletest(test.TestCase):
       variables = variables_lib.local_variables()
       self.assertEquals(2, len(variables))
       self.assertRaises(errors_impl.OpError, sess.run, variables)
-      variables_lib.initialize_variables(variables).run()
+      variables_lib.variables_initializer(variables).run()
       self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
 
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index b618152b02..2dafb94ba7 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -209,7 +209,7 @@ class DeviceFunctionsTest(test.TestCase):
           defun_node, 2.0, name="output_node")
 
       with session.Session() as sess:
-        init = variables.initialize_variables([variable_node])
+        init = variables.variables_initializer([variable_node])
         sess.run(init)
         output = sess.run(output_node)
         self.assertNear(4.0, output, 0.00001)
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 3867c0d8da..70495291bc 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -2731,7 +2731,7 @@ class ScopedGraphTest(test.TestCase):
       # The rest of the variables.
       rest_variables = list(
           set(variables.global_variables()) - set(var_list.keys()))
-      init_rest_op = variables.initialize_variables(rest_variables)
+      init_rest_op = variables.variables_initializer(rest_variables)
 
     with self.test_session(graph=graph) as sess:
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
-- 
GitLab


From f73d793e7a9234efb14fd8f11322429d122949b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 07:07:16 -0700
Subject: [PATCH 0942/1262] Fix the type info analysis to correctly process
 loops. Simplify the implementation by reusing some of the transformer base
 functions. Allow set_element_type to use literals. Add additional tests.

PiperOrigin-RevId: 193192409
---
 .../pyct/static_analysis/type_info.py         | 36 +++++++++++--------
 .../pyct/static_analysis/type_info_test.py    | 24 ++++++++++---
 .../contrib/autograph/pyct/transformer.py     | 12 +++++++
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index a75ba7a272..2f553e1e23 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -48,6 +48,9 @@ from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
 
+# TODO(mdan): Remove the duplication between this and activity.py.
+# In particular, the symbol definitions we track here could as well be tracked
+# there because they follow the same rules for visibility.
 class Scope(object):
   """Tracks symbol value references.
 
@@ -99,20 +102,16 @@ class TypeInfoResolver(transformer.Base):
   def __init__(self, context):
     super(TypeInfoResolver, self).__init__(context)
     self.scope = Scope(None)
-    self.function_level = 0
 
   def visit_FunctionDef(self, node):
     self.scope = Scope(self.scope)
-    self.function_level += 1
-    self.generic_visit(node)
-    self.function_level -= 1
+    node = self.generic_visit(node)
     self.scope = self.scope.parent
     return node
 
   def _visit_block(self, block):
     self.scope = Scope(self.scope)
-    for i, n in enumerate(block):
-      block[i] = self.generic_visit(n)
+    block = self.visit_block(block)
     self.scope = self.scope.parent
     return block
 
@@ -137,7 +136,7 @@ class TypeInfoResolver(transformer.Base):
 
   def _process_function_arg(self, arg_name):
     str_name = str(arg_name)
-    if self.function_level == 1 and str_name in self.context.arg_types:
+    if len(self.enclosing_entities) == 1 and str_name in self.context.arg_types:
       # Forge a node to hold the type information, so that method calls on
       # it can resolve the type.
       type_holder = arg_name.ast()
@@ -221,19 +220,26 @@ class TypeInfoResolver(transformer.Base):
       # type that it specified.
       if (anno.getanno(node.func, 'live_val') is
           self.context.type_annotation_func):
-        # Expecting the actual type to be the second argument.
+
         if len(node.args) != 2:
           raise ValueError('"%s" must have exactly two parameters'
                            % self.context.type_annotation_func)
-        if not anno.hasanno(node.args[0], anno.Basic.QN):
+        target_arg, type_arg = node.args
+        if not anno.hasanno(target_arg, anno.Basic.QN):
           raise ValueError('the first argument of "%s" must by a symbol'
                            % self.context.type_annotation_func)
-        if not anno.hasanno(node.args[1], 'live_val'):
-          raise ValueError(
-              'the second argument of "%s" must be statically resolvable' %
-              self.context.type_annotation_func)
-        target_symbol = anno.getanno(node.args[0], anno.Basic.QN)
-        element_type = anno.getanno(node.args[1], 'live_val')
+        if isinstance(type_arg, gast.Str):
+          element_type = type_arg.s
+        elif isinstance(type_arg, gast.Num):
+          element_type = type_arg.n
+        else:
+          if not anno.hasanno(type_arg, 'live_val'):
+            raise ValueError(
+                'the second argument of "%s" must be statically resolvable' %
+                self.context.type_annotation_func)
+          element_type = anno.getanno(type_arg, 'live_val')
+
+        target_symbol = anno.getanno(target_arg, anno.Basic.QN)
         # Find the definition of this symbol and annotate it with the given
         # data type. That in turn will cause future uses of the symbol
         # to receive the same type annotation.
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
index 4f53923275..46b7701624 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
@@ -209,10 +209,7 @@ class TypeInfoResolverTest(test.TestCase):
       return a, b, c
 
     node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'Bar': Bar})
-    lhs = node.body[0].body[1].value.elts
-    a = lhs[0]
-    b = lhs[1]
-    c = lhs[2]
+    a, b, c = node.body[0].body[1].value.elts
     self.assertEquals(Foo, anno.getanno(a, 'type'))
     self.assertEquals(Bar, anno.getanno(b, 'type'))
     self.assertEquals(Foo, anno.getanno(c, 'type'))
@@ -220,6 +217,25 @@ class TypeInfoResolverTest(test.TestCase):
     self.assertFalse(anno.hasanno(b, 'live_val'))
     self.assertFalse(anno.hasanno(c, 'live_val'))
 
+  def test_inner_scope(self):
+
+    def test_fn():
+      a = []
+      utils.set_element_type(a, 1)
+      for _ in a:
+        b = []
+        utils.set_element_type(b, 2)
+        return a, b
+
+    node = self._parse_and_analyze(test_fn, {'utils': utils})
+    a, b = node.body[0].body[2].body[2].value.elts
+    self.assertEquals(1, anno.getanno(a, 'element_type'))
+    self.assertEquals(2, anno.getanno(b, 'element_type'))
+    self.assertFalse(anno.hasanno(a, 'type'))
+    self.assertFalse(anno.hasanno(b, 'type'))
+    self.assertFalse(anno.hasanno(a, 'live_val'))
+    self.assertFalse(anno.hasanno(b, 'live_val'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index 3e414d7ba5..e102ab7630 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -87,6 +87,18 @@ class Base(gast.NodeTransformer):
       print(pretty_printer.fmt(node))
     return node
 
+  def visit_block(self, nodes):
+    """Helper equivalent to generic_visit, but for node lists."""
+    results = []
+    for node in nodes:
+      replacement = self.visit(node)
+      if replacement:
+        if isinstance(replacement, (list, tuple)):
+          results.extend(replacement)
+        else:
+          results.append(replacement)
+    return results
+
   def visit(self, node):
     source_code = self.context.source_code
     source_file = self.context.source_file
-- 
GitLab


From fb7675e06d6b5ee1d45dcd4eda64a4caa689e393 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 07:41:09 -0700
Subject: [PATCH 0943/1262] Add uint32/uint64 support to Gather op.

PiperOrigin-RevId: 193195939
---
 tensorflow/core/kernels/gather_op.cc             | 2 ++
 tensorflow/python/kernel_tests/gather_op_test.py | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 08adf4badb..ef332ebee3 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -143,6 +143,8 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
+TF_CALL_uint32(REGISTER_GATHER_CPU);
+TF_CALL_uint64(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index 9a94692569..a2fcd751df 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -149,6 +149,15 @@ class GatherTest(test.TestCase):
       self.assertAllEqual([b"asdf", b"qwer"],
                           array_ops.gather(params, 0, axis=1).eval())
 
+  def testUInt32AndUInt64(self):
+    for unsigned_type in (dtypes.uint32, dtypes.uint64):
+      params = self._buildParams(
+          np.array([[1, 2, 3], [7, 8, 9]]), unsigned_type)
+      with self.test_session():
+        self.assertAllEqual([7, 8, 9],
+                            array_ops.gather(params, 1, axis=0).eval())
+        self.assertAllEqual([1, 7], array_ops.gather(params, 0, axis=1).eval())
+
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
-- 
GitLab


From a2def2dbbe670d37f7bf2bf15a7eed6d7b3a1011 Mon Sep 17 00:00:00 2001
From: Dalmo Cirne <dalmo@clarifai.com>
Date: Tue, 17 Apr 2018 10:51:00 -0400
Subject: [PATCH 0944/1262] Fix unintialized var warning in bfloat16

This contribution initializes result to 0, then inside the #if statement only one byte needs to be set, depending on the endian, the other will already be zero from the initialization. This also fixes the compilation warning.
---
 tensorflow/core/lib/bfloat16/bfloat16.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 126e5a17af..1a822d441d 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -89,15 +89,13 @@ struct bfloat16 {
       : bfloat16(static_cast<float>(val)) {}
 
   B16_DEVICE_FUNC explicit operator float() const {
-    float result;
+    float result = 0;
 
     uint16_t* q = reinterpret_cast<uint16_t*>(&result);
 
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     q[0] = value;
-    q[1] = 0;
 #else
-    q[0] = 0;
     q[1] = value;
 #endif
     return result;
-- 
GitLab


From bcfbeabef0ec1ae36b786a9ad10a2e0236208146 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 09:02:14 -0700
Subject: [PATCH 0945/1262] Fix incorrect rejection of xrange.

PiperOrigin-RevId: 193205016
---
 tensorflow/contrib/autograph/converters/builtin_functions.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/autograph/converters/builtin_functions.py b/tensorflow/contrib/autograph/converters/builtin_functions.py
index cd889cb663..317711a866 100644
--- a/tensorflow/contrib/autograph/converters/builtin_functions.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions.py
@@ -49,7 +49,9 @@ class BuiltinFunctionTransformer(transformer.Base):
   def visit_Call(self, node):
     self.generic_visit(node)
     # TODO(mdan): This won't work if the function was hidden.
-    if isinstance(node.func, gast.Name) and node.func.id in ('len', 'range'):
+    # TODO(mdan): Rely on the live_val and use inspect_utils.is_builtin instead.
+    if (isinstance(node.func, gast.Name) and
+        node.func.id in ('len', 'range', 'xrange')):
       return self._convert_builtin(node)
     # Print needs to be handled separately because it can be read as statement.
     if isinstance(node.func, gast.Name) and node.func.id == 'print':
-- 
GitLab


From 6fe887e5e495cff6eba35ea9c1e08c6044aa90ed Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 16:10:27 +0000
Subject: [PATCH 0946/1262] Fix tf.compat.as_str returns bytes issue in Python
 3

This fix tries to address the issue raised in 18598 where
tf.compat.as_str returns bytes (vs. str) in Python 3.
The issue was that `tf_export` decorator:
```
@tf_export('compat.as_bytes', 'compat.as_str')
```
could not be assigned to `as_bytes` or `as_text`
based on python 2 or 3.
This fix invokes tf_export explicitly based on `_six.PY2`
(for python 2/3) so that `as_str` calls `as_bytes` or `as_text`
conditionally.

This fix fixes 18598.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/util/compat.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 4163fcac79..73fc5d19a2 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -93,8 +93,12 @@ def as_text(bytes_or_text, encoding='utf-8'):
 # Convert an object to a `str` in both Python 2 and 3.
 if _six.PY2:
   as_str = as_bytes
+  tf_export('compat.as_bytes', 'compat.as_str')(as_bytes)
+  tf_export('compat.as_text')(as_text)
 else:
   as_str = as_text
+  tf_export('compat.as_bytes')(as_bytes)
+  tf_export('compat.as_text', 'compat.as_str')(as_text)
 
 
 @tf_export('compat.as_str_any')
-- 
GitLab


From 4a73005267aa7620c62de1ae89efc0e3e80cf3f9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 16:14:50 +0000
Subject: [PATCH 0947/1262] Removed unnneded tf_export

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/util/compat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 73fc5d19a2..074640da14 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -45,7 +45,6 @@ from tensorflow.python.util.tf_export import tf_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
-- 
GitLab


From 8ab6d08d6dfcee52efc96a354d4d1d6d080353ee Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Tue, 17 Apr 2018 09:19:06 -0700
Subject: [PATCH 0948/1262] Estimate IdentityN as Identity.

PiperOrigin-RevId: 193207469
---
 tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 087190ad2a..b35873ce38 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -35,6 +35,7 @@ constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
 constexpr char kIdentity[] = "Identity";
+constexpr char kIdentityN[] = "IdentityN";
 constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
 constexpr char kReshape[] = "Reshape";
@@ -211,6 +212,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kIdentityN, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
-- 
GitLab


From c2babdab821bbd488c88d4cac4e0e4959396602b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 16:15:20 +0000
Subject: [PATCH 0949/1262] Pylint fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/util/compat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 074640da14..738479c946 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -67,7 +67,6 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
-@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
-- 
GitLab


From 388d29628167b74b2261ab7eb79d930f8af45745 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 18 Apr 2018 00:28:09 +0800
Subject: [PATCH 0950/1262] Improve deprecation assignment with
 deprecated_argument_lookup

---
 tensorflow/contrib/losses/python/losses/loss_ops.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 8c3a8afe7a..5af1f21b11 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 
 __all__ = [
     "absolute_difference", "add_loss", "cosine_distance",
@@ -651,11 +652,9 @@ def cosine_distance(predictions,
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   with ops.name_scope(scope, "cosine_distance_loss",
                       [predictions, labels, weights]) as scope:
-- 
GitLab


From 56026690cd4a5587670047dc89aaad7b09853f87 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 10:13:09 -0700
Subject: [PATCH 0951/1262] Change the contract of dynamic_builtin to reject
 all functions it can't process.

PiperOrigin-RevId: 193215246
---
 tensorflow/contrib/autograph/utils/builtins.py      | 9 +--------
 tensorflow/contrib/autograph/utils/builtins_test.py | 5 +++--
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 349b7b6f2a..dfc3c86a3d 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -28,24 +28,17 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util import tf_inspect
 
 
 def dynamic_builtin(f, *args, **kwargs):
   """Converts a builtin function call inline."""
-  # Some built-ins may be objects.
-  if not tf_inspect.isbuiltin(f) and f not in (range,):
-    return f(*args, **kwargs)
-
   if f is len:
     return dynamic_len(*args, **kwargs)
   if six.PY2 and f is xrange:
     return dynamic_range(*args, **kwargs)
   if f is range:
     return dynamic_range(*args, **kwargs)
-
-  raise NotImplementedError(
-      'The "%s" builtin is not yet supported.' % f.__name__)
+  raise ValueError('%s is not supported' % f)
 
 
 def dynamic_len(list_or_tensor):
diff --git a/tensorflow/contrib/autograph/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py
index d9f7913d89..163e698407 100644
--- a/tensorflow/contrib/autograph/utils/builtins_test.py
+++ b/tensorflow/contrib/autograph/utils/builtins_test.py
@@ -76,8 +76,9 @@ class BuiltinsTest(test.TestCase):
     def range(x):  # pylint:disable=redefined-builtin
       return x
 
-    # Functions that just have the names of builtins are ignored.
-    self.assertEqual(builtins.dynamic_builtin(range, 1), 1)
+    # Functions that just have the names of builtins are rejected.
+    with self.assertRaises(ValueError):
+      self.assertEqual(builtins.dynamic_builtin(range, 1), 1)
     if six.PY2:
       self.assertListEqual(
           list(builtins.dynamic_builtin(xrange, 3)), [0, 1, 2])
-- 
GitLab


From 69f392fab1445f18dbd31dcd0e97f1f65eeb68e0 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 17 Apr 2018 10:32:47 -0700
Subject: [PATCH 0952/1262] Avoid ToString() in Eager's TFE_Execute.

Also use InlinedVector instead of std::vector for non-async path

Before:
Benchmark              Time(ns)        CPU(ns)     Iterations
-------------------------------------------------------------
BM_Execute/0               1895           1898         360200  Execute
BM_Execute/1               1193           1942         358322  ExecuteAsync
BM_ExecuteFunction/0       5812           5825         100000  ExecuteFunction
BM_ExecuteFunction/1       5015           5374         100000  ExecuteFunctionAsync

After:
Benchmark              Time(ns)        CPU(ns)     Iterations
-------------------------------------------------------------
BM_Execute/0               1604           1607         428262  Execute
BM_Execute/1               1150           1765         404821  ExecuteAsync
BM_ExecuteFunction/0       5615           5626         100000  ExecuteFunction
BM_ExecuteFunction/1       5111           5476         100000  ExecuteFunctionAsync
PiperOrigin-RevId: 193218331
---
 tensorflow/c/eager/c_api.cc                        | 14 ++++----------
 tensorflow/c/eager/runtime.cc                      |  9 +++------
 tensorflow/core/kernels/string_to_hash_bucket_op.h |  2 +-
 tensorflow/core/platform/default/fingerprint.h     | 10 ++++++----
 tensorflow/core/platform/fingerprint.h             |  8 +++-----
 5 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index c96a38dec3..393851d13c 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -116,9 +116,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
                          opts->async, std::move(device_mgr), r);
 }
 
-void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) {
-  delete ctx;
-}
+void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; }
 
 TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   TF_DeviceList* list = new TF_DeviceList;
@@ -581,7 +579,6 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
   return nullptr;
 }
 
-
 #ifdef TENSORFLOW_EAGER_USE_XLA
 // Synthesizes and returns a wrapper function over `op`, which must be a
 // primitive op (e.g. matmul).
@@ -725,9 +722,7 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
   }
 
   const tensorflow::FunctionDef* fdef;
-  {
-    fdef = op->ctx->context.FindFunctionDef(op->name);
-  }
+  { fdef = op->ctx->context.FindFunctionDef(op->name); }
   std::vector<TF_DataType> const_input_types;
   std::vector<TF_DataType> arg_input_types;
   tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
@@ -940,8 +935,8 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
-    std::vector<tensorflow::TensorHandle*> handle_retvals(*num_retvals,
-                                                          nullptr);
+    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+        *num_retvals);
     status->status = tensorflow::EagerExecute(
         &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(),
         handle_retvals.data(), *num_retvals);
@@ -1091,7 +1086,6 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
 }
 }  // namespace tensorflow
 
-
 TFE_Op::~TFE_Op() {
   for (tensorflow::TensorHandle* h : inputs) {
     h->Unref();
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index abe2793ce8..e6c51ab17a 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -184,8 +184,7 @@ void CombineUnordered(const tensorflow::Fprint128& a,
 
 inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s,
                                             const tensorflow::Fprint128& b) {
-  // TODO(agarwal): avoid ToString().
-  tensorflow::Fprint128 a = tensorflow::Fingerprint128(s.ToString());
+  tensorflow::Fprint128 a = tensorflow::Fingerprint128(s);
   return FingerprintCat128(a, b);
 }
 
@@ -213,10 +212,8 @@ tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) const {
     if (node_def_finalized_) return f;
   }
   for (const auto& p : string_attrs_) {
-    // TODO(agarwal): avoid ToString().
-    CombineUnordered(CacheKeyHelper(p.first, tensorflow::Fingerprint128(
-                                                 p.second.ToString())),
-                     &f);
+    CombineUnordered(
+        CacheKeyHelper(p.first, tensorflow::Fingerprint128(p.second)), &f);
   }
   for (const auto& p : int_attrs_) {
     CombineUnordered(CacheKeyHelper(p.first, static_cast<uint64>(p.second)),
diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.h b/tensorflow/core/kernels/string_to_hash_bucket_op.h
index 2fd22c3f4e..62ef35bbba 100644
--- a/tensorflow/core/kernels/string_to_hash_bucket_op.h
+++ b/tensorflow/core/kernels/string_to_hash_bucket_op.h
@@ -26,7 +26,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-template <uint64 hash(const string&)>
+template <uint64 hash(StringPiece)>
 class StringToHashBucketOp : public OpKernel {
  public:
   explicit StringToHashBucketOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
diff --git a/tensorflow/core/platform/default/fingerprint.h b/tensorflow/core/platform/default/fingerprint.h
index 71f9951e53..f901befc16 100644
--- a/tensorflow/core/platform/default/fingerprint.h
+++ b/tensorflow/core/platform/default/fingerprint.h
@@ -18,14 +18,16 @@ limitations under the License.
 
 #include <farmhash.h>
 
+#include "tensorflow/core/lib/core/stringpiece.h"
+
 namespace tensorflow {
 
-inline uint64 Fingerprint64(const string& s) {
-  return ::util::Fingerprint64(s);
+inline uint64 Fingerprint64(StringPiece s) {
+  return ::util::Fingerprint64(s.data(), s.size());
 }
 
-inline Fprint128 Fingerprint128(const string& s) {
-  const auto fingerprint = ::util::Fingerprint128(s);
+inline Fprint128 Fingerprint128(StringPiece s) {
+  const auto fingerprint = ::util::Fingerprint128(s.data(), s.size());
   return {::util::Uint128Low64(fingerprint),
           ::util::Uint128High64(fingerprint)};
 }
diff --git a/tensorflow/core/platform/fingerprint.h b/tensorflow/core/platform/fingerprint.h
index fd0347a10b..b47dcdedd7 100644
--- a/tensorflow/core/platform/fingerprint.h
+++ b/tensorflow/core/platform/fingerprint.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_
 #define TENSORFLOW_CORE_PLATFORM_FINGERPRINT_H_
 
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -36,15 +37,12 @@ struct Fprint128Hasher {
   }
 };
 
-// TODO(sibyl-Mooth6ku): Change these to accept StringPiece (or make them templated
-// on any kind of byte array?).
-
 // This is a portable fingerprint interface for strings that will never change.
 // However, it is not suitable for cryptography.
-uint64 Fingerprint64(const string& s);
+uint64 Fingerprint64(StringPiece s);
 
 // 128-bit variant of Fingerprint64 above (same properties and caveats apply).
-Fprint128 Fingerprint128(const string& s);
+Fprint128 Fingerprint128(StringPiece s);
 
 namespace internal {
 // Mixes some of the bits that got propagated to the high bits back into the
-- 
GitLab


From 3177c063efcf4721a45d065cde72a1f605d3961a Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Tue, 17 Apr 2018 10:53:07 -0700
Subject: [PATCH 0953/1262] Enable consumption of GIT_TAG_OVERRIDE env var in
 release build script. (#18579)

Enable consumption of GIT_TAG_OVERRIDE env var in release build script.
---
 tensorflow/contrib/cmake/tf_core_framework.cmake | 2 +-
 tensorflow/tools/ci_build/builds/pip.sh          | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index f7cb186c7c..b47c32f1c4 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 82042b93c0..5fa75e1d61 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -123,6 +123,10 @@ done
 
 BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
 
+if [[ -z "$GIT_TAG_OVERRIDE" ]]; then
+  BAZEL_FLAGS+=" --action_env=GIT_TAG_OVERRIDE"
+fi
+
 echo "Using Bazel flags: ${BAZEL_FLAGS}"
 
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
-- 
GitLab


From c06004be0a6c72c4fdf3905d94740035035b8083 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 11:05:57 -0700
Subject: [PATCH 0954/1262] Fixes a comment in
 tf.contrib.seq2seq.monotonic_attention().

PiperOrigin-RevId: 193224285
---
 tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index f0f143ddfc..8a40a7ab53 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -654,7 +654,7 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
     shifted_1mp_choose_i = array_ops.concat(
         [array_ops.ones((batch_size, 1)), 1 - p_choose_i[:, :-1]], 1)
     # Compute attention distribution recursively as
-    # q[i] = (1 - p_choose_i[i])*q[i - 1] + previous_attention[i]
+    # q[i] = (1 - p_choose_i[i - 1])*q[i - 1] + previous_attention[i]
     # attention[i] = p_choose_i[i]*q[i]
     attention = p_choose_i*array_ops.transpose(functional_ops.scan(
         # Need to use reshape to remind TF of the shape between loop iterations
-- 
GitLab


From 1192c1662c5c98f55805450b4619ac2bc9c6908c Mon Sep 17 00:00:00 2001
From: Mingxing Tan <tanmingxing@google.com>
Date: Tue, 17 Apr 2018 11:48:43 -0700
Subject: [PATCH 0955/1262] Replace decode_image with decode_jpeg to avoid
 ValueError in datasets programmers guide.

PiperOrigin-RevId: 193231717
---
 tensorflow/docs_src/programmers_guide/datasets.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
index 9ccdbde627..67be41b1a6 100644
--- a/tensorflow/docs_src/programmers_guide/datasets.md
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -540,7 +540,7 @@ batched into a fixed size.
 # to a fixed shape.
 def _parse_function(filename, label):
   image_string = tf.read_file(filename)
-  image_decoded = tf.image.decode_image(image_string)
+  image_decoded = tf.image.decode_jpeg(image_string)
   image_resized = tf.image.resize_images(image_decoded, [28, 28])
   return image_resized, label
 
-- 
GitLab


From d7b6cb66c0fc346cf55020042931c07208713c60 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 11:53:29 -0700
Subject: [PATCH 0956/1262] Fixes and cleanup to support more complex quantized
 models and adds PropagateFakeQuantNumBits.

PiperOrigin-RevId: 193232630
---
 tensorflow/contrib/lite/toco/BUILD            |   5 +-
 tensorflow/contrib/lite/toco/args.h           |   1 +
 tensorflow/contrib/lite/toco/dump_graphviz.cc |  12 +-
 .../ensure_bias_vectors.cc                    |   2 +-
 .../graph_transformations.h                   |  20 +-
 .../make_initial_dequantize_operator.cc       |   1 +
 .../propagate_fake_quant_num_bits.cc          | 307 ++++++++++++++++++
 .../quantization_util.cc                      |  88 +++++
 .../graph_transformations/quantization_util.h |  25 +-
 .../toco/graph_transformations/quantize.cc    | 139 +++-----
 .../remove_trivial_fake_quant.cc              |  86 +++++
 .../resolve_constant_fake_quant.cc            |  25 +-
 .../contrib/lite/toco/toco_cmdline_flags.cc   |   7 +
 tensorflow/contrib/lite/toco/toco_flags.proto |  11 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  26 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |  73 +++--
 tensorflow/contrib/lite/toco/tooling_util.h   |  18 +-
 17 files changed, 702 insertions(+), 144 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 5b86e4e5ae..398978b145 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -238,6 +238,7 @@ cc_library(
         "graph_transformations/merge_reshape_into_preceding_transpose.cc",
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
+        "graph_transformations/propagate_fake_quant_num_bits.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
         "graph_transformations/quantization_util.cc",
         "graph_transformations/quantization_util.h",
@@ -249,6 +250,7 @@ cc_library(
         "graph_transformations/remove_trivial_binary.cc",
         "graph_transformations/remove_trivial_concatenation.cc",
         "graph_transformations/remove_trivial_concatenation_input.cc",
+        "graph_transformations/remove_trivial_fake_quant.cc",
         "graph_transformations/remove_trivial_passthrough.cc",
         "graph_transformations/remove_trivial_passthrough.h",
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
@@ -303,7 +305,7 @@ cc_library(
         ":runtime",
         ":toco_port",
         ":tooling_util",
-        ":types_proto_cc",
+        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -378,7 +380,6 @@ cc_library(
         ":toco_graphviz_dump_options",
         ":toco_port",
         ":types_proto_cc",
-        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
         "@protobuf_archive//:protobuf_headers",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 7a7059e357..71e7318ac3 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -237,6 +237,7 @@ struct ParsedTocoFlags {
   Arg<string> input_types;
   Arg<bool> debug_disable_recurrent_cell_fusion = Arg<bool>(false);
   Arg<bool> drop_control_dependency = Arg<bool>(false);
+  Arg<bool> propagate_fake_quant_num_bits = Arg<bool>(false);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index c8352741b4..c289ddcd92 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -95,10 +95,8 @@ Color GetColorForArray(const Model& model, const string& array_name) {
       array_name == dump_options.graphviz_last_array) {
     return Color(0x9E, 0x9E, 0x9E);
   }
-  for (const string& output_array : model.flags.output_arrays()) {
-    if (array_name == output_array) {
-      return Color(0x9E, 0x9E, 0x9E);
-    }
+  if (IsOutputArray(model, array_name)) {
+    return Color(0x9E, 0x9E, 0x9E);
   }
   // Remaining arrays are intermediate activation arrays.
   // Lighter tone of the same grey as for input/output arrays:
@@ -119,6 +117,12 @@ void AppendArrayVal(string* string, Array const& array, int index) {
       return;
     }
     AppendF(string, "%d", data[index]);
+  } else if (array.buffer->type == ArrayDataType::kInt16) {
+    const auto& data = array.GetBuffer<ArrayDataType::kInt16>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%d", data[index]);
   } else if (array.buffer->type == ArrayDataType::kInt32) {
     const auto& data = array.GetBuffer<ArrayDataType::kInt32>().data;
     if (index >= data.size()) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
index badefeca88..708ecf6e0a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
@@ -47,7 +47,7 @@ bool EnsureBiasVectors::Run(Model* model, std::size_t op_index) {
       op->type == OperatorType::kDepthwiseConv ||
       op->type == OperatorType::kFullyConnected) {
     if (ProcessLinearOperator(model, op)) {
-      AddMessageF("Added bias vector to %s", LogName(*op));
+      AddMessageF("Added bias vector to %s as %s", LogName(*op), op->inputs[2]);
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index dbf029a853..56b3dec5c4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -135,6 +135,7 @@ DECLARE_GRAPH_TRANSFORMATION(IdentifyDilatedConv)
 DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
+DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits);
 DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
 DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
 DECLARE_GRAPH_TRANSFORMATION(Quantize)
@@ -144,6 +145,7 @@ DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialBinaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialSlice)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedMinMax)
@@ -163,7 +165,6 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMerge)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSqueezeAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
-DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantReshape)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTranspose)
@@ -210,6 +211,23 @@ class RemoveTrivialReshape : public GraphTransformation {
   bool treat_expand_dims_as_trivial_ = false;
 };
 
+class ResolveConstantFakeQuant : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "ResolveConstantFakeQuant"; }
+
+  // True if the num_bits should adjust the final data type.
+  bool propagate_fake_quant_num_bits() const {
+    return propagate_fake_quant_num_bits_;
+  }
+  void set_propagate_fake_quant_num_bits(bool val) {
+    propagate_fake_quant_num_bits_ = val;
+  }
+
+ private:
+  bool propagate_fake_quant_num_bits_ = false;
+};
+
 #undef DECLARE_GRAPH_TRANSFORMATION
 
 }  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
index 183b3d3f2e..45d9f73a1e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
new file mode 100644
index 0000000000..0bce183c18
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
@@ -0,0 +1,307 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void ChangeArrayDataType(GraphTransformation* transformation, Array* array,
+                         ArrayDataType new_data_type,
+                         const MinMax* new_minmax) {
+  // Ensure the array ends up in the new type (if it hasn't yet been quantized).
+  array->final_data_type = new_data_type;
+
+  if (array->minmax && array->quantization_params) {
+    // The array is already quantized and has min/max info.
+    // As we are changing the data type we need to fix up the existing min/max
+    // to the new data type range.
+
+    double old_quantized_min, old_quantized_max;
+    CHECK(GetQuantizedDataTypeNumericalRange(
+        array->data_type, &old_quantized_min, &old_quantized_max))
+        << "Existing data type is not quantized: "
+        << ArrayDataTypeName(array->data_type);
+    double new_quantized_min, new_quantized_max;
+    CHECK(GetQuantizedDataTypeNumericalRange(new_data_type, &new_quantized_min,
+                                             &new_quantized_max))
+        << "New data type is not quantized: "
+        << ArrayDataTypeName(new_data_type);
+
+    // Compute new minmax values.
+    double min = (old_quantized_min - array->quantization_params->zero_point) *
+                 array->quantization_params->scale;
+    double max =
+        (old_quantized_max + 1 - array->quantization_params->zero_point) *
+        array->quantization_params->scale;
+    max = max - 1.0 / (new_quantized_max + 1);
+
+    auto& array_minmax = array->GetOrCreateMinMax();
+    transformation->AddMessageF(
+        "Rescaling min/max from %g,%g (%s) to %g,%g (%s)", array_minmax.min,
+        array_minmax.max, ArrayDataTypeName(array->data_type), min, max,
+        ArrayDataTypeName(new_data_type));
+
+    array_minmax.min = min;
+    array_minmax.max = max;
+    GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+        array_minmax, array->quantization_params.get());
+
+    // Directly change the type as the array was already quantized.
+    array->data_type = new_data_type;
+  } else {
+    // Array has not yet been quantized so we can just set the final data type
+    // and assign the new min/max value (if provided).
+    CHECK(!array->quantization_params);
+
+    if (!array->minmax && new_minmax) {
+      transformation->AddMessageF("Forcing new minmax to %g,%g (%s)",
+                                  new_minmax->min, new_minmax->max,
+                                  ArrayDataTypeName(new_data_type));
+      auto& array_minmax = array->GetOrCreateMinMax();
+      array_minmax.min = new_minmax->min;
+      array_minmax.max = new_minmax->max;
+    }
+  }
+}
+
+// Returns true if the op blocks our backward recursive data type propagation.
+bool DoesOpBlockBackwardPropagation(const Operator& op) {
+  switch (op.type) {
+    case OperatorType::kConcatenation:
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kTensorFlowConcatV2:
+      // Concat shouldn't block propagation, but we do expect that all inputs
+      // have the same range.
+      return false;
+    case OperatorType::kDequantize:
+      // Dequantize ops are inserted between the value we care about and the
+      // FakeQuant so make sure we move across them.
+    case OperatorType::kGather:
+      // Gathers need their parameters changed to the appropriate data type.
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      // Reshapes and transposes don't change values.
+      return false;
+    default:
+      return true;
+  }
+}
+
+// Returns true if the input of an op blocks our backward recursive data type
+// propagation.
+bool DoesOpInputBlockBackwardPropagation(const Operator& op, int input_index) {
+  switch (op.type) {
+    case OperatorType::kGather:
+      // Ignore gather indices.
+      return input_index != 0;
+      break;
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      // Ignore reshape/transpose shapes/dimensions.
+      return input_index != 0;
+    default:
+      return false;
+  }
+}
+
+// Propagates the data type up into the input arrays if they are model inputs
+// that may need their type changed. May act recursively if the inputs are
+// produced by ops that we can move over (such as Dequantize).
+bool RecursivelyBackwardPropagateDataType(GraphTransformation* transformation,
+                                          Model* model, Operator* op,
+                                          ArrayDataType new_data_type,
+                                          const MinMax& new_minmax) {
+  bool did_change = false;
+  for (int input_index = 0; input_index < op->inputs.size(); ++input_index) {
+    const auto& input = op->inputs[input_index];
+    auto& input_array = model->GetArray(input);
+    if (input_array.final_data_type == new_data_type) {
+      // Final data type is already - skip.
+      continue;
+    }
+
+    // Prevent moving into constant param args that we don't want to modify.
+    if (DoesOpInputBlockBackwardPropagation(*op, input_index)) {
+      continue;
+    }
+
+    if (input_array.final_data_type != new_data_type) {
+      transformation->AddMessageF(
+          "Adjusting input final data type of array %s from %s to %s", input,
+          ArrayDataTypeName(input_array.final_data_type),
+          ArrayDataTypeName(new_data_type));
+      did_change = true;
+      ChangeArrayDataType(transformation, &input_array, new_data_type,
+                          &new_minmax);
+
+      // Walk up into all ops producing the inputs to this op.
+      for (auto& producing_op : model->operators) {
+        if (!DoesOpBlockBackwardPropagation(*producing_op)) {
+          for (const auto& output : producing_op->outputs) {
+            if (input == output) {
+              did_change |= RecursivelyBackwardPropagateDataType(
+                  transformation, model, producing_op.get(), new_data_type,
+                  new_minmax);
+            }
+          }
+        }
+      }
+    }
+  }
+  return did_change;
+}
+
+// Returns true if the op blocks our forward recursive data type propagation.
+bool DoesOpBlockForwardPropagation(const Operator& op) {
+  switch (op.type) {
+    case OperatorType::kFakeQuant:
+      // Always stop at another FakeQuant, as it will likely have different
+      // parameters.
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Recurses down the graph setting the data type of all arrays until an operator
+// that blocks propagation (like another FakeQuant) or a final_data_type is
+// already specified.
+bool RecursivelyForwardPropagateDataType(GraphTransformation* transformation,
+                                         Model* model, Operator* op,
+                                         ArrayDataType new_data_type) {
+  bool did_change = false;
+  for (const auto& output : op->outputs) {
+    auto& output_array = model->GetArray(output);
+    if (output_array.final_data_type == new_data_type) {
+      // Final data type is already - skip.
+      continue;
+    }
+
+    if (output_array.final_data_type == ArrayDataType::kNone ||
+        output_array.final_data_type != new_data_type) {
+      transformation->AddMessageF(
+          "Adjusting output final data type of array %s from %s to %s", output,
+          ArrayDataTypeName(output_array.final_data_type),
+          ArrayDataTypeName(new_data_type));
+      did_change = true;
+      ChangeArrayDataType(transformation, &output_array, new_data_type,
+                          nullptr);
+
+      // Walk down into all ops consuming the output of this op.
+      for (auto& consuming_op : model->operators) {
+        if (!DoesOpBlockForwardPropagation(*consuming_op)) {
+          for (const auto& input : consuming_op->inputs) {
+            if (input == output) {
+              did_change |= RecursivelyForwardPropagateDataType(
+                  transformation, model, consuming_op.get(), new_data_type);
+            }
+          }
+        }
+      }
+    }
+  }
+  return did_change;
+}
+
+}  // namespace
+
+// Propagates the num_bits on a FakeQuant operator into the final data types
+// of inputs and outputs. For example, if FakeQuant.num_bits==16 then we know
+// the output must be int16 and assume all inputs up until the preceding op are
+// also 16.
+//
+// This can be thought of as a bidirectional flood-fill of the num_bits implied
+// final_data_type that terminates at other FakeQuant ops (and a few others as
+// determined by DoesOpBlockBackwardPropagation/DoesOpBlockForwardPropagation).
+// Once all FakeQuant ops have been visted the arrays should all have
+// appropriate final_data_types if the source graph was annotated with the
+// proper FakeQuant ops.
+//
+// Annotating a graph requires following a few hard rules:
+// - every input MUST have a FakeQuant immediately following it
+// - every output MUST have a FakeQuant immediately preceding it
+// - important arithmetic ops (such as FullyConnected) SHOULD have a FakeQuant
+//   immediately following it
+// - all trained weights (RHS of FullyConnected ops, params on Gather ops, etc)
+//   MUST have FakeQuants between them and the consuming op
+// Additional FakeQuants may be used if desired, especially in areas that may
+// suffer from large precision changes - such as between a Softmax and a
+// FullyConnected. Only by validating accuracy differences between float
+// inference with the FakeQuant ops simulating quantization and the actually
+// quantized graph can you be sure the appropriate FakeQuant ops are present.
+//
+// You can tell if you're missing some FakeQuants by looking for warnings from
+// quantize.cc about minmax ranges being determined by the contents of constant
+// arrays. This will almost never produce functional models during inference.
+//
+// As this op may change the data types and ranges of input and output arrays
+// downstream tools must also be sure to parse the output model flags to get the
+// post-Transform values that may have changed due to this transformation.
+//
+// This isn't a GraphTransformation in the traditional respect as it affects ops
+// outside of the one under transformation. This is primarily so that we can
+// utilize the graph traversal and repeated pass system underlying the
+// transformation system to exhaustively find all FakeQuant ops. It also gets us
+// nice logging and integration with the graphviz video dumping mode.
+// In general you should not copy this style of transformation and stick to
+// local-only changes as seen in the other transformations.
+bool PropagateFakeQuantNumBits::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if (op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(op);
+
+  ArrayDataType quantized_data_type = ArrayDataType::kNone;
+  if (!InferQuantizedDataTypeFromFakeQuant(*fakequant_op,
+                                           &quantized_data_type)) {
+    AddMessageF("FakeQuant op %s num_bits=%d is out of range, ignoring",
+                LogName(*op), fakequant_op->num_bits);
+    return false;
+  }
+  const auto& final_minmax = *fakequant_op->minmax;
+
+  AddMessageF(
+      "Beginning propagation of fake quant %s num_bits=%d min=%g max=%g to %s",
+      LogName(*op), fakequant_op->num_bits, final_minmax.min, final_minmax.max,
+      ArrayDataTypeName(quantized_data_type));
+
+  bool did_change = false;
+
+  // Propagate the FakeQuant information backward up the graph.
+  // This will possibly adjust input arrays or constant types (like Gather).
+  did_change |= RecursivelyBackwardPropagateDataType(
+      this, model, op, quantized_data_type, final_minmax);
+
+  // Propagate the FakeQuant information forward down the graph.
+  // This will possibly adjust output arrays.
+  did_change |=
+      RecursivelyForwardPropagateDataType(this, model, op, quantized_data_type);
+
+  return did_change;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
index e080df4bed..d74cad9a62 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
@@ -22,6 +22,20 @@ limitations under the License.
 
 namespace toco {
 
+bool InferQuantizedDataTypeFromFakeQuant(
+    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type) {
+  if (op.num_bits <= 8) {
+    *out_quantized_data_type = ArrayDataType::kUint8;
+    return true;
+  } else if (op.num_bits <= 16) {
+    *out_quantized_data_type = ArrayDataType::kInt16;
+    return true;
+  } else {
+    *out_quantized_data_type = ArrayDataType::kNone;
+    return false;
+  }
+}
+
 bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
                                         double* out_min_value,
                                         double* out_max_value) {
@@ -103,6 +117,80 @@ void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
   }
 }
 
+namespace {
+
+template <ArrayDataType A>
+std::unique_ptr<GenericBuffer> QuantizeBuffer(
+    const GenericBuffer& buffer,
+    const QuantizationParams& quantization_params) {
+  const auto inverse_scale = 1. / quantization_params.scale;
+  CHECK(buffer.type == ArrayDataType::kFloat);
+  const auto& float_buffer =
+      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
+  auto* quantized_buffer = new Buffer<A>;
+  quantized_buffer->data.resize(float_buffer.data.size());
+  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
+    const float src_val = float_buffer.data[i];
+    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
+                        // enough to make a few tests fail!
+    if (quantization_params.scale == 0) {
+      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
+                           << "so all its values should be 0.";
+      scaled_val = quantization_params.zero_point;
+    } else {
+      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
+    }
+    quantized_buffer->data[i] =
+        tflite::SafeCast<DataType<A>>(std::round(scaled_val));
+  }
+  return std::unique_ptr<GenericBuffer>(quantized_buffer);
+}
+
+template <ArrayDataType A>
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name,
+                   const QuantizationParams& quantization_params) {
+  auto& array = model->GetArray(name);
+  CHECK(array.data_type == ArrayDataType::kFloat);
+  CHECK(!array.quantization_params);
+  array.GetOrCreateQuantizationParams() = quantization_params;
+  if (array.buffer) {
+    array.buffer = QuantizeBuffer<A>(*array.buffer, quantization_params);
+  }
+  array.data_type = A;
+  array.final_data_type = A;
+  transformation->AddMessageF(
+      "Quantized array %s to %s zero_point=%g, scale=%g", name,
+      ArrayDataTypeName(array.data_type), quantization_params.zero_point,
+      quantization_params.scale);
+}
+
+}  // namespace
+
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params) {
+  ArrayDataType adjusted_data_type = quantized_data_type;
+  auto& array = model->GetArray(name);
+  if (array.final_data_type == ArrayDataType::kInt16) {
+    adjusted_data_type = array.final_data_type;
+  }
+
+  switch (adjusted_data_type) {
+    case ArrayDataType::kUint8:
+      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt16:
+      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt32:
+      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
+                                                  quantization_params);
+    default:
+      LOG(FATAL) << "Unhandled case.";
+  }
+}
+
 bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
                                  const Array& array, double clamp_min,
                                  double clamp_max) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
index 35fb310777..79a2ce7e50 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
@@ -15,11 +15,17 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
 #define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
 
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 
 namespace toco {
 
+// Gets the target quantized data type of an array based on the fake quant op.
+// For example, if the num_bits is 8 the data type will be kUint8.
+bool InferQuantizedDataTypeFromFakeQuant(
+    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type);
+
 // Gets the min/max numerical range for the given quantized data type.
 // For example, kUint8 will return [0,255].
 // Returns true if the ranges were set and false if the type is not quantized.
@@ -32,11 +38,28 @@ bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
 ArrayDataType GetQuantizedDataType(const Array& array,
                                    ArrayDataType default_type);
 
-// Gets the quantization params for the array with the given data type and
+// Returns the quantization params for the array with the given data type and
 // minmax.
 void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
                            QuantizationParams* quantization_params);
 
+// Returns the quantization params for the data type and minmax values.
+template <ArrayDataType A>
+void GetQuantizationParamsFromMinMax(const MinMax& minmax,
+                                     QuantizationParams* quantization_params) {
+  using Integer = DataType<A>;
+  const double rmin = minmax.min;
+  const double rmax = minmax.max;
+  *quantization_params =
+      ::tflite::ChooseQuantizationParams<Integer>(rmin, rmax);
+}
+
+// Quantizes an array by setting its data type and (if constant) quantizing
+// all values in the array.
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params);
+
 // Returns true if the given array, when quantized, contains only values between
 // the provided clamp min/max.
 // Either clamp_min or clamp_max may be +/-infinity to indicate that the value
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index d6cae3cdbf..fa46e6bc38 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -57,72 +57,6 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTranspose || type == OperatorType::kMean;
 }
 
-template <ArrayDataType A>
-std::unique_ptr<GenericBuffer> QuantizeBuffer(
-    const GenericBuffer& buffer,
-    const QuantizationParams& quantization_params) {
-  const auto inverse_scale = 1. / quantization_params.scale;
-  CHECK(buffer.type == ArrayDataType::kFloat);
-  const auto& float_buffer =
-      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
-  auto* quantized_buffer = new Buffer<A>;
-  quantized_buffer->data.resize(float_buffer.data.size());
-  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
-    const float src_val = float_buffer.data[i];
-    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
-                        // enough to make a few tests fail!
-    if (quantization_params.scale == 0) {
-      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
-                           << "so all its values should be 0.";
-      scaled_val = quantization_params.zero_point;
-    } else {
-      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
-    }
-    quantized_buffer->data[i] =
-        tflite::SafeCast<DataType<A>>(std::round(scaled_val));
-  }
-  return std::unique_ptr<GenericBuffer>(quantized_buffer);
-}
-
-template <ArrayDataType A>
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name,
-                   const QuantizationParams& quantization_params) {
-  auto& array = model->GetArray(name);
-  CHECK(array.data_type == ArrayDataType::kFloat);
-  CHECK(!array.quantization_params);
-  array.GetOrCreateQuantizationParams() = quantization_params;
-  if (array.buffer) {
-    array.buffer = QuantizeBuffer<A>(*array.buffer, quantization_params);
-  }
-  array.data_type = A;
-  transformation->AddMessageF("Quantized array %s", name);
-}
-
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name, ArrayDataType quantized_data_type,
-                   const QuantizationParams& quantization_params) {
-  ArrayDataType adjusted_data_type = quantized_data_type;
-  auto& array = model->GetArray(name);
-  if (array.final_data_type == ArrayDataType::kInt16) {
-    adjusted_data_type = array.final_data_type;
-  }
-
-  switch (adjusted_data_type) {
-    case ArrayDataType::kUint8:
-      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
-                                                  quantization_params);
-    case ArrayDataType::kInt16:
-      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
-                                                  quantization_params);
-    case ArrayDataType::kInt32:
-      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
-                                                  quantization_params);
-    default:
-      LOG(FATAL) << "Unhandled case.";
-  }
-}
-
 const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
   auto& array = model->GetArray(array_name);
   // Normally we should have a MinMax recorded on this Array,
@@ -245,6 +179,8 @@ bool ChooseQuantizationForOperatorInput(
     const auto& input_weights = model->GetArray(op.inputs[weights_input_index]);
     if (!input_activations.quantization_params ||
         !input_weights.quantization_params) {
+      transformation->AddMessageF(
+          "Input array %s is a bias vector but has no qparams", input);
       return false;
     }
     const auto input_activations_scale =
@@ -366,6 +302,9 @@ bool ChooseQuantizationForOperatorOutput(
   const auto& output = op.outputs[output_index];
   auto& array = model->GetArray(output);
   if (array.data_type != ArrayDataType::kFloat) {
+    transformation->AddMessageF("Array data type already set to %s, final=%s",
+                                ArrayDataTypeName(array.data_type),
+                                ArrayDataTypeName(array.final_data_type));
     return false;
   }
   *quantized_data_type = model->GetArray(op.inputs[0]).data_type;
@@ -427,29 +366,22 @@ bool ChooseQuantizationForOperatorOutput(
 // Fixes array minmax info to match the quantization parameters.
 // This is required for when quantization parameters change for an array during
 // quantization (such as ChooseQuantizationForOperatorOutput).
-void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
+void FixMinMaxPostQuantization(GraphTransformation* transformation,
+                               ArrayDataType quantized_data_type,
                                const QuantizationParams& quantization_params,
                                MinMax* minmax) {
-  double qmin, qmax;
-  switch (quantized_data_type) {
-    case ArrayDataType::kUint8:
-      qmin = 0;
-      qmax = 255;
-      break;
-    case ArrayDataType::kInt16:
-      qmin = -32768;
-      qmax = 32767;
-      break;
-    default:
-      // No update required.
-      return;
+  double quantized_min, quantized_max;
+  if (!GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
+                                          &quantized_max)) {
+    // Not quantized - no update required.
+    return;
   }
 
   // Compute new minmax values.
-  double min =
-      (qmin - quantization_params.zero_point) * quantization_params.scale;
-  double max =
-      (qmax - quantization_params.zero_point) * quantization_params.scale;
+  double min = (quantized_min - quantization_params.zero_point) *
+               quantization_params.scale;
+  double max = (quantized_max - quantization_params.zero_point) *
+               quantization_params.scale;
 
   // If we are close to the existing minmax values don't bother changing them.
   // This prevents propagating small floating point precision errors.
@@ -457,6 +389,9 @@ void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
   const double width = max - min;
   if (std::abs(min - minmax->min) > kMinMaxThreshold * width ||
       std::abs(max - minmax->max) > kMinMaxThreshold * width) {
+    transformation->AddMessageF(
+        "Adjusting min/max from %g,%g to %g,%g to match quantization params",
+        minmax->min, minmax->max, min, max);
     minmax->min = min;
     minmax->max = max;
   }
@@ -566,10 +501,33 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
             // input instead.
             for (int i = 0; i < model->flags.output_arrays_size(); i++) {
               if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
-                model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+                // TODO(b/78013785): never rename output arrays.
+                if (IsInputArray(*model, dequantize_op->inputs[0])) {
+                  // The op input is an input array and the output is an output
+                  // array and we can't have an array be both. Insert a copy
+                  // op to ensure the two arrays stay separate.
+                  AddMessageF(
+                      "Tried to rename output array %d while removing dequant "
+                      "op %s but array is also an input; inserting copy %s "
+                      "-> %s",
+                      i, LogName(*dequantize_op), model->flags.output_arrays(i),
+                      dequantize_op->inputs[0]);
+                  InsertCopyOperator(model, dequantize_op->inputs[0],
+                                     dequantize_op->outputs[0]);
+                } else {
+                  // Op output is strictly used as an output array, so we can
+                  // just rename the array and directly bypass the op.
+                  AddMessageF(
+                      "Renaming output array %d after removing dequant op %s: "
+                      "%s -> %s",
+                      i, LogName(*dequantize_op), model->flags.output_arrays(i),
+                      dequantize_op->inputs[0]);
+                  model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+                  model->EraseArray(dequantize_op->outputs[0]);
+                }
+                break;
               }
             }
-            model->EraseArray(dequantize_op->outputs[0]);
             model->operators.erase(dequantize_it);
           }
           changed = true;
@@ -615,7 +573,7 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
       CHECK(output_array.minmax)
           << "Output array named " << output << " lacks minmax";
       auto& output_minmax = output_array.GetMinMax();
-      FixMinMaxPostQuantization(quantized_data_type, quantization_params,
+      FixMinMaxPostQuantization(this, quantized_data_type, quantization_params,
                                 &output_minmax);
 
       QuantizeArray(this, model, output, quantized_data_type,
@@ -626,6 +584,7 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
       auto& dequantized_output_array =
           model->GetOrCreateArray(dequantized_output);
       dequantized_output_array.data_type = ArrayDataType::kFloat;
+      dequantized_output_array.final_data_type = output_array.data_type;
       auto& dequantized_output_minmax =
           dequantized_output_array.GetOrCreateMinMax();
       dequantized_output_minmax.min = output_minmax.min;
@@ -642,6 +601,12 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
       dequantize_op->outputs = {dequantized_output};
       for (int i = 0; i < model->flags.output_arrays_size(); i++) {
         if (model->flags.output_arrays(i) == output) {
+          // TODO(b/78013785): never rename output arrays.
+          AddMessageF(
+              "Renaming output array %d after inserting dequant op %s: %s -> "
+              "%s",
+              i, LogName(*dequantize_op), model->flags.output_arrays(i),
+              dequantized_output);
           model->flags.set_output_arrays(i, dequantized_output);
         }
       }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
new file mode 100644
index 0000000000..2c8d04440f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsFakeQuantTrivial(GraphTransformation* transformation, const Model& model,
+                        const FakeQuantOperator& fakequant_op) {
+  CHECK(fakequant_op.type == OperatorType::kFakeQuant);
+
+  if (!fakequant_op.minmax) {
+    // Require ReadFakeQuantMinMax to have run.
+    return false;
+  }
+
+  // FakeQuants are trivial if they are taking input from another identical
+  // FakeQuant op.
+  auto* producing_op = GetOpWithOutput(model, fakequant_op.inputs[0]);
+  if (!producing_op || producing_op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  const auto& producing_fakequant_op =
+      *static_cast<FakeQuantOperator*>(producing_op);
+  if (!producing_fakequant_op.minmax) {
+    // Require ReadFakeQuantMinMax to have run.
+    return false;
+  }
+
+  if (*fakequant_op.minmax == *producing_fakequant_op.minmax &&
+      fakequant_op.num_bits == producing_fakequant_op.num_bits) {
+    transformation->AddMessageF(
+        "%s is trivial because it is preceded by an identical FakeQuant %s",
+        LogName(fakequant_op), LogName(producing_fakequant_op));
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace
+
+// Removes FakeQuant ops that are trivial (have no effect, are redundant, etc).
+bool RemoveTrivialFakeQuant::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  auto* op = op_it->get();
+  if (op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(op);
+
+  if (!IsFakeQuantTrivial(this, *model, *fakequant_op)) {
+    AddMessageF("%s is not trivial", LogName(*fakequant_op));
+    return false;
+  }
+
+  AddMessageF("Removing trivial %s", LogName(*fakequant_op));
+
+  CHECK_EQ(fakequant_op->inputs.size(), 1);
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index 625d90205a..efb7bb2184 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -45,9 +46,29 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   }
 
   const auto& input_array = model->GetArray(fakequant_op->inputs[0]);
+  CHECK(input_array.data_type == ArrayDataType::kFloat);
+
+  // Determine the final data type in the same way as PropagateFakeQuantNumBits.
+  ArrayDataType quantized_data_type = input_array.final_data_type;
+  if (!InferQuantizedDataTypeFromFakeQuant(*fakequant_op,
+                                           &quantized_data_type)) {
+    AddMessageF("Unsupported FakeQuant num_bits=%d", fakequant_op->num_bits);
+    return false;
+  }
+
+  AddMessageF("Resolving constant %s", LogName(*fakequant_op));
+
   auto& output_array = model->GetArray(fakequant_op->outputs[0]);
   CHECK(input_array.data_type == ArrayDataType::kFloat);
   output_array.data_type = ArrayDataType::kFloat;
+
+  // We'll set the final data type to what the fake quant indicates we should
+  // have (and would have been set if this stayed around until
+  // PropagateFakeQuantNumBits).
+  if (propagate_fake_quant_num_bits()) {
+    output_array.final_data_type = quantized_data_type;
+  }
+
   CHECK(!output_array.buffer);
   const auto& input_buffer = input_array.GetBuffer<ArrayDataType::kFloat>();
   output_array.GetOrCreateMinMax() = *fakequant_op->minmax;
@@ -66,7 +87,9 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
     const double dst_val = qparams.scale * (quantized_val - qparams.zero_point);
     output_buffer.data[i] = dst_val;
   }
-  if (CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
+
+  if (IsDiscardableArray(*model, fakequant_op->inputs[0]) &&
+      CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
     model->EraseArray(fakequant_op->inputs[0]);
   }
   model->operators.erase(fakequant_it);
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index cc7803dd86..d1d68b6b47 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -126,6 +126,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.debug_disable_recurrent_cell_fusion.default_value(),
            "If true, disable fusion of known identifiable cell subgraphs into "
            "cells. This includes, for example, specific forms of LSTM cell."),
+      Flag("propagate_fake_quant_num_bits",
+           parsed_flags.propagate_fake_quant_num_bits.bind(),
+           parsed_flags.propagate_fake_quant_num_bits.default_value(),
+           "If true, use FakeQuant* operator num_bits attributes to adjust "
+           "array data_types."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -211,6 +216,8 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone);
   READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone);
+  READ_TOCO_FLAG(debug_disable_recurrent_cell_fusion, FlagRequirement::kNone);
+  READ_TOCO_FLAG(propagate_fake_quant_num_bits, FlagRequirement::kNone);
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 3237147a73..751aca948c 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 14.
+// Next ID to use: 15.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -141,4 +141,13 @@ message TocoFlags {
   // Disables transformations that fuse subgraphs such as known LSTMs (not all
   // LSTMs are identified).
   optional bool debug_disable_recurrent_cell_fusion = 13;
+
+  // Uses the FakeQuantWithMinMaxArgs.num_bits attribute to adjust quantized
+  // array data types throughout the graph. The graph must be properly annotated
+  // with FakeQuant* ops on at least the edges and may contain additional ops on
+  // the interior of the graph to widen/narrow as desired.
+  //
+  // Input and output array data types may change because of this propagation
+  // and users must be sure to query the final data_type values.
+  optional bool propagate_fake_quant_num_bits = 14;
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 5ba093a830..b69852453c 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -66,6 +66,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new RemoveTensorFlowIdentity);
   transformations->Add(new RemoveTrivialConcatenation);
   transformations->Add(new RemoveTrivialConcatenationInput);
+  transformations->Add(new RemoveTrivialFakeQuant);
   transformations->Add(new RemoveTrivialSlice);
   transformations->Add(new RemoveUnusedOp);
   transformations->Add(new EnsureBiasVectors);
@@ -109,7 +110,6 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveMeanAttributes);
   transformations->Add(new ResolveConstantShapeOrRank);
   transformations->Add(new MakeInitialDequantizeOperator);
-  transformations->Add(new ResolveConstantFakeQuant);
   transformations->Add(new UnpartitionEmbeddingLookup);
 }
 
@@ -233,6 +233,12 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   MakeGeneralGraphTransformationsSet(&transformations);
   auto* remove_trivial_reshape = new RemoveTrivialReshape;
   transformations.Add(remove_trivial_reshape);
+  auto* resolve_constant_fake_quant = new ResolveConstantFakeQuant;
+  if (quantize_output) {
+    resolve_constant_fake_quant->set_propagate_fake_quant_num_bits(
+        toco_flags.propagate_fake_quant_num_bits());
+  }
+  transformations.Add(resolve_constant_fake_quant);
   if (SupportsFusedActivationFunction(output_format)) {
     transformations.Add(new FuseActivationFunctions);
   } else {
@@ -264,9 +270,21 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
 
+  // Fix any issues with IO edges. This must happen after any transform that
+  // may modify the structure of the edges.
+  FixEdgeArrays(model);
+
   if (quantize_output) {
+    if (toco_flags.propagate_fake_quant_num_bits()) {
+      RunGraphTransformations(model,
+                              "fake quant propagation graph transformations",
+                              {new PropagateFakeQuantNumBits});
+    }
     RunGraphTransformations(model, "pre-quantization graph transformations",
-                            {new HardcodeMinMax, new DropFakeQuant});
+                            {
+                                new HardcodeMinMax,
+                                new DropFakeQuant,
+                            });
   }
 
   if (quantize_output) {
@@ -303,10 +321,6 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(model);
   }
 
-  // Fix any issues with IO edges. This must happen after any transform that
-  // may modify the structure of the edges.
-  FixEdgeArrays(model);
-
   LogDump(kLogLevelModelChanged, "AFTER TRANSFORMATIONS", *model);
 
   if (output_format != GRAPHVIZ_DOT && output_format != TFLITE) {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 224df9973e..ecac0c28a5 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -93,9 +93,18 @@ string ArrayDataTypeName(ArrayDataType data_type) {
   }
 }
 
-bool IsInputArray(const Model& model, const string& name) {
+bool IsInputArray(const Model& model, const string& array_name) {
   for (const auto& input_array : model.flags.input_arrays()) {
-    if (input_array.name() == name) {
+    if (array_name == input_array.name()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsOutputArray(const Model& model, const string& array_name) {
+  for (const auto& output_array : model.flags.output_arrays()) {
+    if (array_name == output_array) {
       return true;
     }
   }
@@ -106,10 +115,8 @@ bool IsArrayConsumed(const Model& model, const string& name) {
   if (GetOpWithInput(model, name)) {
     return true;
   }
-  for (const string& model_output : model.flags.output_arrays()) {
-    if (model_output == name) {
-      return true;
-    }
+  if (IsOutputArray(model, name)) {
+    return true;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (rnn_state.back_edge_source_array() == name) {
@@ -379,6 +386,7 @@ string HelpfulOperatorTypeName(const Operator& op) {
 bool OperatorSupportsFusedActivation(OperatorType type) {
   switch (type) {
     case OperatorType::kConcatenation:
+    case OperatorType::kFakeQuant:
     case OperatorType::kGather:
     case OperatorType::kSlice:
     case OperatorType::kSqueeze:
@@ -1064,16 +1072,38 @@ void FixEdgeArrays(Model* model) {
   }
 }
 
+namespace {
+void CopyArrayAttribs(const Array& source_array, Array* target_array) {
+  target_array->data_type = source_array.data_type;
+  target_array->final_data_type = source_array.final_data_type;
+  target_array->copy_shape(source_array.shape());
+
+  if (source_array.minmax) {
+    target_array->GetOrCreateMinMax() = source_array.GetMinMax();
+  } else {
+    target_array->minmax.reset();
+  }
+
+  if (source_array.quantization_params) {
+    target_array->GetOrCreateQuantizationParams() =
+        source_array.GetQuantizationParams();
+  } else {
+    target_array->quantization_params.reset();
+  }
+}
+}  // namespace
+
 void InsertCopyOperator(Model* model, const string& source_array_name,
                         const string& target_array_name) {
+  // Reshape to the same size. This should be a no-op.
+  const Array& source_array = model->GetArray(source_array_name);
+  std::vector<int> shape = source_array.shape().dims();
+
   // Drop constant data from the target array as the copy will be done at
   // runtime.
   Array& target_array = model->GetOrCreateArray(target_array_name);
   target_array.buffer.reset();
-
-  // Reshape to the same size. This should be a no-op.
-  const Array& source_array = model->GetArray(source_array_name);
-  std::vector<int> shape = source_array.shape().dims();
+  CopyArrayAttribs(source_array, &target_array);
 
   // Insert copy operator.
   auto* copy_op = new TensorFlowReshapeOperator;
@@ -1089,6 +1119,7 @@ void CloneArray(Model* model, const string& source_array_name,
   CHECK(!model->HasArray(target_array_name));
   const Array& source_array = model->GetArray(source_array_name);
   Array& target_array = model->GetOrCreateArray(target_array_name);
+  CopyArrayAttribs(source_array, &target_array);
 
   if (source_array.minmax) {
     const auto& smm = source_array.GetMinMax();
@@ -1513,14 +1544,9 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
   if (model.IsOptionalArray(array_name)) return false;
   // The model's input and output arrays are externally allocated.
   // They are not transient arrays.
-  if (IsInputArray(model, array_name)) {
+  if (IsInputArray(model, array_name) || IsOutputArray(model, array_name)) {
     return false;
   }
-  for (const string& output_array : model.flags.output_arrays()) {
-    if (array_name == output_array) {
-      return false;
-    }
-  }
   const auto& array = &model.GetArray(array_name);
   // An array with a constant buffer isn't a transient array.
   if (!!array->buffer) {
@@ -1898,15 +1924,8 @@ int AxesCount(AxesOrder axes_order) {
 }
 
 bool IsDiscardableArray(const Model& model, const string& array_name) {
-  for (const auto& input_array : model.flags.input_arrays()) {
-    if (array_name == input_array.name()) {
-      return false;
-    }
-  }
-  for (const string& output_array : model.flags.output_arrays()) {
-    if (array_name == output_array) {
-      return false;
-    }
+  if (IsInputArray(model, array_name) || IsOutputArray(model, array_name)) {
+    return false;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (!rnn_state.discardable()) {
@@ -1960,8 +1979,8 @@ void CheckFinalDataTypesSatisfied(const Model& model) {
       CHECK(array.final_data_type == array.data_type)
           << "Array \"" << array_entry.first
           << "\" has mis-matching actual and final data types ("
-          << static_cast<int>(array.data_type) << ","
-          << static_cast<int>(array.final_data_type) << ").";
+          << ArrayDataTypeName(array.data_type) << ","
+          << ArrayDataTypeName(array.final_data_type) << ").";
     }
   }
 }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index ed0ecd4d0f..4c705f4e5f 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -28,7 +28,6 @@ limitations under the License.
 #if TOCO_SUPPORT_PORTABLE_PROTOS
 #include "third_party/protobuf/src/google/protobuf/text_format.h"
 #endif  // TOCO_SUPPORT_PORTABLE_PROTOS
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
@@ -57,7 +56,11 @@ string LogName(const Operator& op);
 
 string ArrayDataTypeName(ArrayDataType data_type);
 
-bool IsInputArray(const Model& model, const string& name);
+// Returns true if the given array is specified as a model input array.
+bool IsInputArray(const Model& model, const string& array_name);
+// Returns true if the given array is specified as a model output array.
+bool IsOutputArray(const Model& model, const string& array_name);
+
 bool IsArrayConsumed(const Model& model, const string& name);
 int CountTrueOutputs(const Model& model, const Operator& op);
 
@@ -175,17 +178,6 @@ void CloneArray(Model* model, const string& source_array_name,
 
 void ResolveModelFlags(const ModelFlags& model_flags, Model* model);
 
-template <ArrayDataType A>
-void GetQuantizationParamsFromMinMax(const MinMax& minmax,
-                                     QuantizationParams* quantization_params) {
-  using Integer = DataType<A>;
-  const double rmin = minmax.min;
-  const double rmax = minmax.max;
-
-  *quantization_params =
-      ::tflite::ChooseQuantizationParams<Integer>(rmin, rmax);
-}
-
 template <typename T>
 T ConvertOperator(Operator* o, OperatorType type) {
   if (o != nullptr && o->type == type) {
-- 
GitLab


From 96486029beea45177367508528d72587518608cc Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Tue, 17 Apr 2018 12:06:50 -0700
Subject: [PATCH 0957/1262] Moving gradient registration for CudnnRNN op from
 contrib to core.

PiperOrigin-RevId: 193234663
---
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     | 25 ----------
 tensorflow/python/BUILD                       | 11 +++++
 tensorflow/python/ops/cudnn_rnn_grad.py       | 47 +++++++++++++++++++
 tensorflow/python/ops/standard_ops.py         |  4 +-
 4 files changed, 61 insertions(+), 26 deletions(-)
 create mode 100644 tensorflow/python/ops/cudnn_rnn_grad.py

diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index c28c3a18e4..b615824460 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -1640,31 +1640,6 @@ class CudnnRNNRelu(_CudnnRNNNoInputC):
   _NUM_PARAMS_PER_LAYER = CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
-@ops.RegisterGradient("CudnnRNN")
-def _cudnn_rnn_backward(op, *grad):
-  if not op.get_attr("is_training"):
-    raise ValueError(
-        "CudnnRNN must set is_training to True to be used in gradients")
-  return gen_cudnn_rnn_ops.cudnn_rnn_backprop(
-      input=op.inputs[0],
-      input_h=op.inputs[1],
-      input_c=op.inputs[2],
-      params=op.inputs[3],
-      output=op.outputs[0],
-      output_h=op.outputs[1],
-      output_c=op.outputs[2],
-      output_backprop=grad[0],
-      output_h_backprop=grad[1],
-      output_c_backprop=grad[2],
-      reserve_space=op.outputs[3],
-      dropout=op.get_attr("dropout"),
-      seed=op.get_attr("seed"),
-      seed2=op.get_attr("seed2"),
-      rnn_mode=op.get_attr("rnn_mode"),
-      input_mode=op.get_attr("input_mode"),
-      direction=op.get_attr("direction"))
-
-
 ops.RegisterShape("CudnnRNNParamsSize")(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape("CudnnRNNParamsToCanonical")(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape("CudnnRNNCanonicalToParams")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 14ce8a57bd..569d3eb2ce 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1792,6 +1792,16 @@ py_library(
     ],
 )
 
+py_library(
+    name = "cudnn_rnn_grad",
+    srcs = ["ops/cudnn_rnn_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_for_generated_wrappers",
+        "//tensorflow/python:cudnn_rnn_ops_gen",
+    ],
+)
+
 py_library(
     name = "data_flow_grad",
     srcs = ["ops/data_flow_grad.py"],
@@ -2465,6 +2475,7 @@ py_library(
         ":clip_ops",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":cudnn_rnn_grad",
         ":data_flow_grad",
         ":data_flow_ops",
         ":framework_for_generated_wrappers",
diff --git a/tensorflow/python/ops/cudnn_rnn_grad.py b/tensorflow/python/ops/cudnn_rnn_grad.py
new file mode 100644
index 0000000000..97331bb5b5
--- /dev/null
+++ b/tensorflow/python/ops/cudnn_rnn_grad.py
@@ -0,0 +1,47 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradients for CuudnnRNN operators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
+
+
+@ops.RegisterGradient("CudnnRNN")
+def _cudnn_rnn_backward(op, *grads):
+  """Gradients for the CudnnRNN op."""
+  if not op.get_attr("is_training"):
+    raise ValueError(
+        "CudnnRNN must set is_training to True to be used in gradients")
+  return gen_cudnn_rnn_ops.cudnn_rnn_backprop(
+      input=op.inputs[0],
+      input_h=op.inputs[1],
+      input_c=op.inputs[2],
+      params=op.inputs[3],
+      output=op.outputs[0],
+      output_h=op.outputs[1],
+      output_c=op.outputs[2],
+      output_backprop=grads[0],
+      output_h_backprop=grads[1],
+      output_c_backprop=grads[2],
+      reserve_space=op.outputs[3],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
+      rnn_mode=op.get_attr("rnn_mode"),
+      input_mode=op.get_attr("input_mode"),
+      direction=op.get_attr("direction"))
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index e90ff0746a..f71f98aa12 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -22,12 +22,13 @@ from __future__ import print_function
 
 import sys as _sys
 
+# pylint: disable=g-bad-import-order
 # Imports the following modules so that @RegisterGradient get executed.
 from tensorflow.python.ops import array_grad
+from tensorflow.python.ops import cudnn_rnn_grad
 from tensorflow.python.ops import data_flow_grad
 from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
-from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
@@ -96,6 +97,7 @@ from tensorflow.python.ops.tensor_array_ops import *
 from tensorflow.python.ops.variable_scope import *
 from tensorflow.python.ops.variables import *
 # pylint: enable=wildcard-import
+# pylint: enable=g-bad-import-order
 
 #### For use in remove_undocumented below:
 from tensorflow.python.framework import constant_op as _constant_op
-- 
GitLab


From b50142067e776fc86ce2ba3d01d01c7c16da671f Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 17 Apr 2018 12:07:33 -0700
Subject: [PATCH 0958/1262] Automated g4 rollback of changelist 193168327

PiperOrigin-RevId: 193234819
---
 .../ci_build/windows/bazel/bazel_test_lib.sh  |  7 ------
 .../windows/cpu/pip/build_tf_windows.sh       | 23 ++++---------------
 2 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 582188fc00..d654b433e7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -140,13 +140,6 @@ function run_configure_for_gpu_build {
   echo "" | ./configure
 }
 
-function set_gcs_remote_cache_options {
-  echo "build --experimental_remote_spawn_cache" >> "${TMP_BAZELRC}"
-  echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> "${TMP_BAZELRC}"
-  echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> "${TMP_BAZELRC}"
-  echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}"
-}
-
 function create_python_test_dir() {
   rm -rf "$1"
   mkdir -p "$1"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 632f1ef564..5e9ae497e1 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,30 +42,20 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
-# Recreate an empty bazelrc file under source root
-export TMP_BAZELRC=.tmp.bazelrc
-rm -f "${TMP_BAZELRC}"
-touch "${TMP_BAZELRC}"
-
 skip_test=0
 
 for ARG in "$@"; do
   if [[ "$ARG" == --skip_test ]]; then
     skip_test=1
-  elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then
-    set_gcs_remote_cache_options
   fi
 done
 
-# --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
-# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
-
-echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
-
 run_configure_for_cpu_build
 
-bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
+# --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
+# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
+BUILD_OPTS="--define=override_eigen_strong_inline=true"
+bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$skip_test" == 1 ]]; then
   exit 0
@@ -83,13 +73,10 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt -k --test_output=errors \
+bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
   --flaky_test_attempts=3 \
   //${PY_TEST_DIR}/tensorflow/python/... \
   //${PY_TEST_DIR}/tensorflow/contrib/...
-
-# Remove all options in .tmp.bazelrc
-echo "" > "${TMP_BAZELRC}"
-- 
GitLab


From f67aa59c264a0ca84d2ff2e7a551d16136af6e56 Mon Sep 17 00:00:00 2001
From: "Tang, Wenyi" <twytwy12345@live.com>
Date: Wed, 18 Apr 2018 03:17:48 +0800
Subject: [PATCH 0959/1262] Complement cmake script to compile tensorflow with
 mkl and mkldnn on Windows (#16936)

* Add build batch for windows

* Automaticaly find python, cuda, mkl runtimes in PATH

* auto select cmake generator

* Add external library mkldnn. Add options for mkl and mkldnn

* fix syntax error in make.bat

* Fix errorlevel syntex bug in make.bat

* Add /arch:avx2 flags to enable avx2 on windows

* Revert to keep `tensprflow_WIN_CPU_SIMD_OPTIONS` unchanged, add an option `tensorflow_ENABLE_MKL_SUPPORT` to include MKL compilation. Still specify SIMD flags by setting `tensorflow_WIN_CPU_SIMD_OPTIONS` to such as '/arch:AVX2'

* Fix a mistake of CUDA path in make.bat

* resolve conflict in mkl_cpu_allocator.h

* Improve error detection

* Use where /Q to detect cmd environment

* fix "ELSE IF" the syntax error in make.bat

* update README.md, wrap windows based codes by #ifdef _WIN32

* unistd.h is not needed in mkl_cpu_allocator.h any more in master branch

* Remove inline of kMaxLimitStr, which causes compile error in VS2015

* Add static_cast in  to fix compile error

* remove make.bat

* Removed make.bat description part
---
 tensorflow/contrib/cmake/CMakeLists.txt       | 58 +++++++++++++++++--
 tensorflow/contrib/cmake/README.md            | 28 +++++++++
 .../contrib/cmake/external/mkldnn.cmake       | 44 ++++++++++++++
 .../core/common_runtime/mkl_cpu_allocator.h   |  4 ++
 tensorflow/core/graph/mkl_tfconversion_pass.h |  4 ++
 tensorflow/core/kernels/mkl_relu_op.cc        |  8 ++-
 tensorflow/core/util/mkl_util.h               |  4 ++
 7 files changed, 144 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/external/mkldnn.cmake

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index a7944ea74a..95df69465a 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -31,10 +31,14 @@ option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
 option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for contrib packages" OFF)
 option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
-option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
 
+# SIMD, MKL and MKLDNN options
+option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions" OFF)
+option(tensorflow_ENABLE_MKL_SUPPORT "Enable Intel MKL support" OFF)
+option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires MKL enabled" OFF)
+
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
@@ -162,12 +166,21 @@ endif()
 
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
+  include(CheckCXXCompilerFlag)
+  if (tensorflow_ENABLE_MKL_SUPPORT)
+    add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
+    if (NOT tensorflow_ENABLE_MKLDNN_SUPPORT)
+      add_definitions(-DINTEL_MKL_ML)
+    endif()
+  endif()
+  CHECK_CXX_COMPILER_FLAG("-fopenmp" COMPILER_OPT_OPENMP_SUPPORT)
+  if (COMPILER_OPT_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  endif()
   if (WIN32)
-    CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+    CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
     if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
-    else()
-      message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
     endif()
   endif()
 endif()
@@ -298,6 +311,43 @@ if(HAIKU)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
 endif()
 
+if (tensorflow_ENABLE_MKL_SUPPORT)
+  if (WIN32)
+    find_path(MKL_HOME_PLATFORM mkl
+      PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      PATH_SUFFIXES windows)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS
+      ${MKL_HOME_PLATFORM}/mkl/lib/intel64
+      ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt
+      ${MKL_HOME_PLATFORM}/compiler/lib/intel64
+      ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib)
+    set(MKL_REDIST_DLL_DIRS
+      ${MKL_HOME_PLATFORM}/redist/intel64/mkl
+      ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt
+      ${MKL_HOME_PLATFORM}/redist/intel64/compiler)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES
+      mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64)
+  endif()
+  if (UNIX)
+    # Fix me: complete the path on linux
+    find_path(MKL_HOME_PLATFORM mkl
+      HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      PATH_SUFFIXES linux)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS) # incompleted
+    set(MKL_REDIST_SO_DIRS) # incompleted
+  endif()
+  include_directories(${MKL_INCLUDE_DIRS})
+  link_directories(${MKL_LINK_DIRS})
+  if (tensorflow_ENABLE_MKLDNN_SUPPORT)
+    include(mkldnn)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
+    list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
+    include_directories(${mkldnn_INCLUDE_DIRS})
+  endif()
+endif (tensorflow_ENABLE_MKL_SUPPORT)
+
 if (tensorflow_ENABLE_GPU)
   if (NOT WIN32)
     # Default install paths for cuda libraries in Linux
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index fe83bb3204..0b79f718d4 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -128,6 +128,18 @@ Step-by-step Windows build
      D:\local\cuda\bin
      ```
 
+   * When building with MKL support after installing [MKL](https://software.intel.com/en-us/mkl) from INTEL, append its bin directories to your PATH environment variable.
+
+     In case TensorFlow fails to find the MKL dll's during initialization, check your PATH environment variable.
+     It should contain the directory of the MKL dlls. For example:
+
+     ```
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\mkl
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\compiler
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\tbb\vc_mt
+     ```
+
+
    * We assume that `cmake` and `git` are installed and in your `%PATH%`. If
      for example `cmake` is not in your path and it is installed in
      `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
@@ -166,7 +178,15 @@ Step-by-step Windows build
    More? -Dtensorflow_ENABLE_GPU=ON ^
    More? -DCUDNN_HOME="D:\...\cudnn"
    ```
+   To build with MKL support add "^" at the end of the last line above following with:
+
+   ```
+   More? -Dtensorflow_ENABLE_MKL_SUPPORT=ON ^
+   More? -DMKL_HOME="D:\...\compilers_and_libraries"
+   ```
+
    To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
+
    ```
    More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
    ```
@@ -226,6 +246,7 @@ Step-by-step Windows build
      ```
      ctest -C RelWithDebInfo
      ```
+
    * `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python tests on
      serveral major packages. This option is only valid if this and tensorflow_BUILD_PYTHON_TESTS are both set as `ON`.
      After building the python wheel, you need to install the new wheel before running the tests.
@@ -234,6 +255,12 @@ Step-by-step Windows build
      ctest -C RelWithDebInfo
      ```
 
+   * `-Dtensorflow_ENABLE_MKL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL support. If MKL is enabled you need to install the [Intel Math Kernal Library](https://software.intel.com/en-us/mkl).
+     CMake will expect the location of MKL in -MKL_HOME=path_you_install_mkl.
+
+   * `-Dtensorflow_ENABLE_MKLDNN_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL DNN support. MKL DNN is [Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](https://github.com/intel/mkl-dnn). You have to add `-Dtensorflow_ENABLE_MKL_SUPPORT=ON` before including MKL DNN support.
+
+
 4. Invoke MSBuild to build TensorFlow.
 
    To build the C++ example program, which will be created as a `.exe`
@@ -251,6 +278,7 @@ Step-by-step Windows build
    D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
    ```
 
+
 Linux Continuous Integration build
 ==================================
 
diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake
new file mode 100644
index 0000000000..a639fdee36
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/mkldnn.cmake
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(mkldnn_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/include)
+set(mkldnn_URL https://github.com/01org/mkl-dnn.git)
+set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src)
+set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291)
+
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib)
+  else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib)
+  endif()
+else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a)
+endif()
+
+ExternalProject_Add(mkldnn
+    PREFIX mkldnn
+    GIT_REPOSITORY ${mkldnn_URL}
+    GIT_TAG ${mkldnn_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${mkldnn_STATIC_LIBRARIES}
+    INSTALL_COMMAND ""
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DMKLINC:STRING=${MKL_INCLUDE_DIRS}
+)
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index b2ef51d10b..245320c896 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -31,6 +31,10 @@ limitations under the License.
 
 #include "i_malloc.h"
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 namespace tensorflow {
 
 class MklSubAllocator : public SubAllocator {
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.h b/tensorflow/core/graph/mkl_tfconversion_pass.h
index 0562d8b3cd..84e50ee6e0 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.h
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.h
@@ -24,6 +24,10 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/core/graph/graph.h"
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 namespace tensorflow {
 // Interface to invoke the pass for unit test
 //
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 0a0f69522f..1ed43834dd 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -441,7 +441,9 @@ class MklReluOpBase : public OpKernel {
       // Allocate output and MklDnnShape tensors separately for possible
       // in-place operation
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                      {src_index}, dst_index, tf_shape_dst, &dst_tensor));
+                                      {static_cast<const int>(src_index)},
+                                      static_cast<const int>(dst_index),
+                                      tf_shape_dst, &dst_tensor));
       AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst);
 
       // Destination memory descriptor is same as source memory descriptor.
@@ -611,7 +613,9 @@ class MklReluGradOpBase : public OpKernel {
       // Allocate diff_src and MklDnnShape tensors separately for possible
       // in-place operation
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                      {diff_dst_index}, diff_src_index, tf_shape_diff_src,
+                                      {static_cast<const int>(diff_dst_index)},
+                                      static_cast<const int>(diff_src_index),
+                                      tf_shape_diff_src,
                                       &diff_src_tensor));
       AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src);
 
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 9f58e40d94..bc6d2d77a4 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -45,6 +45,10 @@ using mkldnn::primitive;
 using mkldnn::reorder;
 #endif
 
+#ifdef _WIN32
+typedef unsigned int uint;
+#endif
+
 // The file contains a number of utility classes and functions used by MKL
 // enabled kernels
 
-- 
GitLab


From 105d9795ae692ed2486652e5d672825ccbd726e9 Mon Sep 17 00:00:00 2001
From: Noah Eisen <ncteisen@google.com>
Date: Tue, 17 Apr 2018 12:21:10 -0700
Subject: [PATCH 0960/1262] Removes another custom implementation of
 ZeroCopyInputStream, instead uses the now public gRPC implementation.

Also, moves GrpcByteSource to grpc_util, to keep it near the other serialization code.

Lastly, gives a more verbose error if serialization (unparsing) fails (which should not ever happen).

PiperOrigin-RevId: 193236893
---
 tensorflow/core/distributed_runtime/rpc/BUILD |   1 +
 .../core/distributed_runtime/rpc/grpc_state.h |   8 +-
 .../core/distributed_runtime/rpc/grpc_util.cc | 107 +++---------------
 .../core/distributed_runtime/rpc/grpc_util.h  |  58 ++++++----
 .../distributed_runtime/rpc/grpc_util_test.cc |  12 +-
 .../rpc/grpc_worker_service_impl.h            |  28 +----
 6 files changed, 65 insertions(+), 149 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index fa0f8c9b52..e973a22f45 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -189,6 +189,7 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
+        ":grpc_util",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 0b6f9474dd..59dbb7ae04 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -56,7 +56,11 @@ class RPCState : public GrpcClientCQTag {
     }
 
     response_ = response;
-    GrpcMaybeUnparseProto(request, &request_buf_);
+    ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_);
+    if (!s.ok()) {
+      LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: "
+                 << s.error_message();
+    }
     call_ =
         std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
     call_->StartCall();
@@ -73,7 +77,7 @@ class RPCState : public GrpcClientCQTag {
       // to Finish for client-side unary calls, ok should never be false
       s.Update(errors::Internal("unexpected ok value at rpc completion"));
     }
-    if (s.ok() && !GrpcMaybeParseProto(response_buf_, response_)) {
+    if (s.ok() && !GrpcMaybeParseProto(&response_buf_, response_)) {
       s.Update(errors::Internal("could not parse rpc response"));
     }
     if (!s.ok()) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index c80728544b..ece56a2727 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -18,115 +18,42 @@ limitations under the License.
 
 namespace tensorflow {
 
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-grpc::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-void GrpcMaybeUnparseProto(const protobuf::Message& src,
-                           grpc::ByteBuffer* dst) {
-  // TODO(sanjay): For bigger protos, serialize into a ZeroCopyOutputStream.
-  ::grpc::Slice s(src.ByteSizeLong());
-  src.SerializeWithCachedSizesToArray(
-      const_cast<uint8*>(reinterpret_cast<const uint8*>(s.begin())));
-  ::grpc::ByteBuffer buffer(&s, 1);
-  dst->Swap(&buffer);
+::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
+                                     grpc::ByteBuffer* dst) {
+  bool own_buffer;
+  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter, protobuf::Message>(
+      src, dst, &own_buffer);
 }
 
 // GrpcMaybeUnparseProto from a string simply copies the string to the
 // ByteBuffer.
-void GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
+::grpc::Status GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
   ::grpc::Slice s(src.data(), src.size());
   ::grpc::ByteBuffer buffer(&s, 1);
   dst->Swap(&buffer);
+  return ::grpc::Status::OK;
 }
 
-bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, protobuf::Message* dst) {
-  GrpcByteBufferSource stream;
-  if (!stream.Init(src)) return false;
-  return dst->ParseFromZeroCopyStream(&stream);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst) {
+  grpc::ProtoBufferReader reader(src);
+  return dst->ParseFromZeroCopyStream(&reader);
 }
 
 // Overload of GrpcParseProto so we can decode a TensorResponse without
 // extra copying.  This overload is used by the RPCState class in
 // grpc_state.h.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst) {
-  struct ByteSource : public TensorResponse::Source {
-    const ::grpc::ByteBuffer* buffer;
-    GrpcByteBufferSource src;
-    bool ok;
-
-    ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() override {
-      ok = src.Init(*buffer);
-      return &src;
-    }
-  };
-  ByteSource bs;
-  bs.buffer = &src;
-  return dst->ParseFrom(&bs).ok() && bs.ok;
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, TensorResponse* dst) {
+  ::tensorflow::GrpcByteSource byte_source(src);
+  auto s = dst->ParseFrom(&byte_source);
+  return s.ok();
 }
 
 // GrpcMaybeParseProto into a string simply copies bytes into the string.
-bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, string* dst) {
+bool GrpcMaybeParseProto(grpc::ByteBuffer* src, string* dst) {
   dst->clear();
-  dst->reserve(src.Length());
+  dst->reserve(src->Length());
   std::vector<::grpc::Slice> slices;
-  if (!src.Dump(&slices).ok()) {
+  if (!src->Dump(&slices).ok()) {
     return false;
   }
   for (const ::grpc::Slice& s : slices) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index d5e7e9f5b3..4b58781b54 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -29,6 +29,33 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse an
+// efficient byte reader from which to decode a RecvTensorResponse.
+class GrpcByteSource : public TensorResponse::Source {
+ public:
+  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
+  ~GrpcByteSource() override { DeleteStream(); }
+
+  typedef ::grpc::ProtoBufferReader Reader;
+
+  protobuf::io::ZeroCopyInputStream* contents() override {
+    DeleteStream();
+    stream_ = new (&space_) Reader(buffer_);
+    return stream_;
+  }
+
+ private:
+  void DeleteStream() {
+    if (stream_) {
+      stream_->~Reader();
+    }
+  }
+
+  ::grpc::ByteBuffer* buffer_;  // Not owned
+  Reader* stream_ = nullptr;    // Points into space_ if non-nullptr
+  char space_[sizeof(Reader)];
+};
+
 constexpr char kStreamRemovedMessage[] = "Stream removed";
 
 // Identify if the given grpc::Status corresponds to an HTTP stream removed
@@ -79,38 +106,21 @@ typedef std::shared_ptr<::grpc::Channel> SharedGrpcChannelPtr;
 inline string GrpcIdKey() { return "tf-rpc"; }
 
 // Serialize src and store in *dst.
-void GrpcMaybeUnparseProto(const protobuf::Message& src,
-                           ::grpc::ByteBuffer* dst);
+::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
+                                     ::grpc::ByteBuffer* dst);
 
 // Parse contents of src and initialize *dst with them.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, protobuf::Message* dst);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst);
 
 // Specialization for TensorResponse
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst);
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, TensorResponse* dst);
 
 // Copy string src to grpc buffer *dst.
-void GrpcMaybeUnparseProto(const string& src, ::grpc::ByteBuffer* dst);
+::grpc::Status GrpcMaybeUnparseProto(const string& src,
+                                     ::grpc::ByteBuffer* dst);
 
 // Copy grpc buffer src to string *dst.
-bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, string* dst);
-
-// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
-class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
- public:
-  GrpcByteBufferSource();
-  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
-  bool Next(const void** data, int* size) override;
-  void BackUp(int count) override;
-  bool Skip(int count) override;
-  ::grpc::protobuf::int64 ByteCount() const override;
-
- private:
-  std::vector<::grpc::Slice> slices_;
-  int cur_;          // Current slice index.
-  int left_;         // Number of bytes in slices_[cur_] left to yield.
-  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
-  ::grpc::protobuf::int64 byte_count_;
-};
+bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, string* dst);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
index 5356fb36e4..6eaa0b1833 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
@@ -67,7 +67,7 @@ TEST(GrpcProto, Unparse) {
   proto.add_container("hello");
   proto.add_container("world");
   grpc::ByteBuffer buf;
-  GrpcMaybeUnparseProto(proto, &buf);
+  ASSERT_TRUE(GrpcMaybeUnparseProto(proto, &buf).ok());
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -80,7 +80,7 @@ TEST(GrpcProto, UnparseToString) {
   string str;
   CHECK(proto.SerializeToString(&str));
   grpc::ByteBuffer buf;
-  GrpcMaybeUnparseProto(str, &buf);
+  ASSERT_TRUE(GrpcMaybeUnparseProto(str, &buf).ok());
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -103,7 +103,7 @@ TEST(GrpcProto, Parse) {
     CleanupAllRequest proto = MakeProto(c.length);
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed))
+    ASSERT_TRUE(GrpcMaybeParseProto(&src, &parsed))
         << c.length << " " << c.slices;
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
   }
@@ -127,7 +127,7 @@ TEST(GrpcProto, ParseFromString) {
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     string parsed_str;
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed_str))
+    ASSERT_TRUE(GrpcMaybeParseProto(&src, &parsed_str))
         << c.length << " " << c.slices;
     ASSERT_TRUE(parsed.ParseFromString(parsed_str));
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -140,7 +140,7 @@ static void BM_UnparseGrpc(int iters, int size) {
   testing::StartTiming();
   for (int i = 0; i < iters; i++) {
     grpc::ByteBuffer buf;
-    GrpcMaybeUnparseProto(proto, &buf);
+    CHECK(GrpcMaybeUnparseProto(proto, &buf).ok());
   }
   testing::StopTiming();
 }
@@ -167,7 +167,7 @@ static void BM_ParseGrpc(int iters, int size, int num_slices) {
   testing::StartTiming();
 
   for (int i = 0; i < iters; i++) {
-    CHECK(GrpcMaybeParseProto(buf, &proto));
+    CHECK(GrpcMaybeParseProto(&buf, &proto));
   }
 
   testing::StopTiming();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 0abac4f3c7..a54ea93796 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,36 +26,10 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
-namespace tensorflow {
-class GrpcByteSource : public TensorResponse::Source {
- public:
-  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
-  ~GrpcByteSource() override { DeleteStream(); }
-
-  typedef ::grpc::ProtoBufferReader Reader;
-
-  protobuf::io::ZeroCopyInputStream* contents() override {
-    DeleteStream();
-    stream_ = new (&space_) Reader(buffer_);
-    return stream_;
-  }
-
- private:
-  void DeleteStream() {
-    if (stream_) {
-      stream_->~Reader();
-    }
-  }
-
-  ::grpc::ByteBuffer* buffer_;  // Not owned
-  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
-  char space_[sizeof(Reader)];
-};
-}  // namespace tensorflow
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
-- 
GitLab


From 91be39b2bae2d935fb9eb8c9a7cd1d09642784af Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:31:36 -0700
Subject: [PATCH 0961/1262] Relaxes the type constraints for the features in
 ServingInputReceiver, so it will accept anything convertible to a Tensor or
 SparseTensor.

This makes it possible to use with tf.contrib.labeled_tensor.

PiperOrigin-RevId: 193238295
---
 tensorflow/python/estimator/export/export.py  | 14 ++++++++++++-
 .../python/estimator/export/export_test.py    | 20 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 9206a4964b..41c1f5a2e2 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -74,8 +74,20 @@ class ServingInputReceiver(collections.namedtuple(
         raise ValueError('feature keys must be strings: {}.'.format(name))
       if not (isinstance(tensor, ops.Tensor)
               or isinstance(tensor, sparse_tensor.SparseTensor)):
-        raise ValueError(
+        value_error = ValueError(
             'feature {} must be a Tensor or SparseTensor.'.format(name))
+        # NOTE(ericmc): This if-else block is a specific carve-out for
+        # LabeledTensor, which has a `.tensor` attribute and which is
+        # convertible to tf.Tensor via ops.convert_to_tensor.
+        # Allowing all types convertible to tf.Tensor is considered by soergel@
+        # to be too permissive.
+        if hasattr(tensor, 'tensor'):
+          try:
+            ops.convert_to_tensor(tensor)
+          except TypeError:
+            raise value_error
+        else:
+          raise value_error
 
     if receiver_tensors is None:
       raise ValueError('receiver_tensors must be defined.')
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index eb9688bc97..c203be7dac 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -39,6 +39,21 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 
 
+class LabeledTensorMock(object):
+  """Mock class emulating LabeledTensor."""
+
+  def __init__(self):
+    self.tensor = constant_op.constant([1])
+
+
+def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
+  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
+
+
+ops.register_tensor_conversion_function(LabeledTensorMock,
+                                        _convert_labeled_tensor_mock_to_tensor)
+
+
 class ExportTest(test_util.TensorFlowTestCase):
 
   def test_serving_input_receiver_constructor(self):
@@ -135,6 +150,11 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       _ = export.ServingInputReceiver(feature, receiver_tensor)
 
+  def test_feature_labeled_tensor(self):
+    feature = LabeledTensorMock()
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    _ = export.ServingInputReceiver(feature, receiver_tensor)
+
   def test_receiver_wrong_type(self):
     feature = constant_op.constant(5)
     receiver_tensor = "not a tensor"
-- 
GitLab


From 4d2de472999653bb7000be47959b1c5b996d6496 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:31:42 -0700
Subject: [PATCH 0962/1262] Fix the test gensym to prevent creating duplicate
 names in the same test.

PiperOrigin-RevId: 193238314
---
 .../contrib/autograph/converters/converter_test_base.py  | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 23b61cf781..41c2e71702 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -35,14 +35,17 @@ from tensorflow.python.platform import test
 
 
 class FakeNamer(object):
+  """A fake namer that uses a global counter to generate unique names."""
+
+  def __init__(self):
+    self.i = 0
 
   def new_symbol(self, name_root, used):
-    i = 0
     while True:
-      name = '%s%d' % (name_root, i)
+      self.i += 1
+      name = '%s%d' % (name_root, self.i)
       if name not in used:
         return name
-      i += 1
 
   def compiled_function_name(self,
                              original_fqn,
-- 
GitLab


From 2fe299f39785611e29a5fb0d859cd283b3f9587c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:36:20 -0700
Subject: [PATCH 0963/1262] [XLA] Fix arguments to IsSparseArray and
 IsDenseArray

PiperOrigin-RevId: 193238920
---
 .../compiler/xla/service/pattern_matcher.h    |  4 +--
 .../xla/service/pattern_matcher_test.cc       | 32 ++++++++++++++++++-
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 5d49638077..f5a4f2c9df 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -532,7 +532,7 @@ class ShapePattern {
       ShapeType,
       ShapePatternLayoutImpl<Impl, const ::xla::Layout,
                              LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
-  IsDenseArray(const ::xla::Layout* layout) const {
+  IsDenseArray() const {
     return WithLayout(Layout().WithDenseFormat());
   }
 
@@ -540,7 +540,7 @@ class ShapePattern {
       ShapeType,
       ShapePatternLayoutImpl<Impl, const ::xla::Layout,
                              LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
-  IsSparseArray(const ::xla::Layout* layout) const {
+  IsSparseArray() const {
     return WithLayout(Layout().WithSparseFormat());
   }
 
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 5291b1437a..c88157c312 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -67,6 +67,7 @@ TEST(PatternMatcherTest, ScalarShape) {
   EXPECT_TRUE(Match(&scalar_shape, match::Shape(&matched_shape).IsScalar()));
   EXPECT_EQ(matched_shape, &scalar_shape);
   EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsArray()));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsDenseArray()));
   EXPECT_FALSE(Match(&scalar_shape, match::Shape().IsTuple()));
   EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithElementType(F32)));
   EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithRank(0)));
@@ -75,11 +76,13 @@ TEST(PatternMatcherTest, ScalarShape) {
       match::Shape().WithSubshape({0}, match::Shape()).WithElementType(F32)));
 }
 
-TEST(PatternMatcherTest, ArrayShape) {
+TEST(PatternMatcherTest, DenseArrayShape) {
   auto array_shape = ShapeUtil::MakeShape(F32, {2, 3, 4});
   Shape* matched_shape;
   EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
   EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_TRUE(Match(&array_shape, match::Shape().IsDenseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsSparseArray()));
   EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
   EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
   EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
@@ -90,6 +93,33 @@ TEST(PatternMatcherTest, ArrayShape) {
   EXPECT_FALSE(Match(&array_shape,
                      match::Shape().WithLayout(
                          match::Layout(&matched_layout).WithSparseFormat())));
+  EXPECT_TRUE(Match(&array_shape,
+                    match::Shape().WithLayout(
+                        match::Layout(&matched_layout).WithDenseFormat())));
+  EXPECT_EQ(matched_layout, &array_shape.layout());
+}
+
+TEST(PatternMatcherTest, SparseArrayShape) {
+  auto array_shape = ShapeUtil::MakeShapeWithSparseLayout(F32, {2, 3, 4}, 10);
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
+  EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsDenseArray()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().IsSparseArray()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3)));
+  EXPECT_FALSE(
+      Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape())));
+  Layout* matched_layout;
+  EXPECT_FALSE(Match(&array_shape,
+                     match::Shape().WithLayout(
+                         match::Layout(&matched_layout).WithDenseFormat())));
+  EXPECT_TRUE(Match(&array_shape,
+                    match::Shape().WithLayout(
+                        match::Layout(&matched_layout).WithSparseFormat())));
+  EXPECT_EQ(matched_layout, &array_shape.layout());
 }
 
 TEST(PatternMatcherTest, TupleShape) {
-- 
GitLab


From 4e6c516e8895204526446d8c3cf939a159362d59 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 01:17:11 +0000
Subject: [PATCH 0964/1262] Pylint issue fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/manip_ops_test.py       | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index 786df5cc7b..7cc4bf61ba 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -102,8 +102,9 @@ class RollTest(test_util.TensorFlowTestCase):
 
   def testInvalidInputShape(self):
     # The input should be 1-D or higher, checked in shape function.
-    with self.assertRaisesRegexp(ValueError, "Shape must be at least rank 1 but is rank 0"):
-      roll = manip_ops.roll(7, 1, 0)
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at least rank 1 but is rank 0"):
+      manip_ops.roll(7, 1, 0)
 
   def testRollInputMustVectorHigherRaises(self):
     # The input should be 1-D or higher, checked is done in kernel.
@@ -117,8 +118,9 @@ class RollTest(test_util.TensorFlowTestCase):
 
   def testInvalidAxisShape(self):
     # The axis should be a scalar or 1-D, checked in shape function.
-    with self.assertRaisesRegexp(ValueError, "Shape must be at most rank 1 but is rank 2"):
-      roll = manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at most rank 1 but is rank 2"):
+      manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
 
   def testRollAxisMustBeScalarOrVectorRaises(self):
     # The axis should be a scalar or 1-D, checked in kernel.
@@ -132,8 +134,9 @@ class RollTest(test_util.TensorFlowTestCase):
 
   def testInvalidShiftShape(self):
     # The shift should be a scalar or 1-D, checked in shape function.
-    with self.assertRaisesRegexp(ValueError, "Shape must be at most rank 1 but is rank 2"):
-      roll = manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be at most rank 1 but is rank 2"):
+      manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
 
   def testRollShiftMustBeScalarOrVectorRaises(self):
     # The shift should be a scalar or 1-D, checked in kernel.
@@ -148,7 +151,7 @@ class RollTest(test_util.TensorFlowTestCase):
   def testInvalidShiftAndAxisNotEqualShape(self):
     # The shift and axis must be same size, checked in shape function.
     with self.assertRaisesRegexp(ValueError, "both shapes must be equal"):
-      roll = manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
+      manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
 
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
     # The shift and axis must be same size, checked in kernel.
-- 
GitLab


From 59367ba641fd33a78da38a42389d73d9f250dc36 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 19:47:25 +0000
Subject: [PATCH 0965/1262] Remove duplicate import in compat.py

Noticed there are a couple of places in compat.py that
have duplicate import:
```
from tensorflow.python.util.tf_export import tf_export
from tensorflow.python.util.tf_export import tf_export
```

This fix remove duplicate imports.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/util/compat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 738479c946..3358ffe526 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -42,7 +42,6 @@ import six as _six
 
 from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 def as_bytes(bytes_or_text, encoding='utf-8'):
-- 
GitLab


From 05d6e17528c7929884eb4aa2df998fe3197f9335 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 19:48:42 +0000
Subject: [PATCH 0966/1262] Duplicate imports in histogram_ops.py

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/histogram_ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index 4a1ef54fb5..ec38d89a0e 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -32,7 +32,6 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('histogram_fixed_width_bins')
-- 
GitLab


From 33d55d7caff2bd32fa2b1c5cacb7ac251c48e27d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 12:52:34 -0700
Subject: [PATCH 0967/1262] Cache the ag_internal module, to avoid falsely
 rejecting it when in the namespace of a previously converted function.
 Explicitly reject lambda functions, for now, becasue they require special
 treatment.

PiperOrigin-RevId: 193241279
---
 .../contrib/autograph/impl/conversion.py      | 35 +++++++++++++------
 .../contrib/autograph/impl/conversion_test.py | 22 ++++++++++++
 2 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 11bbe7888a..5653e991f6 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -154,7 +154,16 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types):
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, conversion_map)
   elif tf_inspect.isfunction(o):
-    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
+    # TODO(mdan): This is not a reliable mechanism.
+    # The most reliable way is to check the source code, the AST will contain
+    # a Lambda node instead of a FunctionDef
+    if o.__name__ == '<lambda>':
+      raise NotImplementedError(
+          'lambda functions are not yet supported; declare the function'
+          ' using def instead: %s' % o)
+    else:
+      node, name, ns = function_to_graph(o, conversion_map, arg_values,
+                                         arg_types)
   elif tf_inspect.ismethod(o):
     node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
   else:
@@ -222,16 +231,22 @@ def _add_reserved_symbol(namespace, name, entity):
     raise ValueError('The name "%s" is reserved and may not be used.' % name)
 
 
+ag_internal = None
+
+
 def _add_self_references(namespace, api_module):
-  # Craft a module that exposes parts of the external API as well as certain
-  # internal modules.
-  ag_internal = imp.new_module('autograph')
-  ag_internal.converted_call = api_module.converted_call
-  ag_internal.utils = utils
-  # TODO(mdan): Add safeguards against name clashes.
-  # We don't want to create a submodule because we want the operators to be
-  # accessible as ag__.<operator>
-  ag_internal.__dict__.update(operators.__dict__)
+  """Adds namespace references to the module that exposes the api itself."""
+  global ag_internal
+  if ag_internal is None:
+    # Craft a module that exposes parts of the external API as well as certain
+    # internal modules.
+    ag_internal = imp.new_module('autograph')
+    ag_internal.converted_call = api_module.converted_call
+    ag_internal.utils = utils
+    # TODO(mdan): Add safeguards against name clashes.
+    # We don't want to create a submodule because we want the operators to be
+    # accessible as ag__.<operator>
+    ag_internal.__dict__.update(operators.__dict__)
 
   _add_reserved_symbol(namespace, 'ag__', ag_internal)
 
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index f0b597c12f..da3220892f 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -78,6 +78,28 @@ class ConversionTest(test.TestCase):
         conversion_map.dependency_cache[f].body[0].body[0].value.func.id)
     self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
 
+  def test_entity_to_graph_lambda(self):
+    f = lambda a: a
+
+    with self.assertRaises(NotImplementedError):
+      conversion_map = self._simple_conversion_map()
+      conversion.entity_to_graph(f, conversion_map, None, None)
+
+  def test_ag_module_cached(self):
+    def callee():
+      return range(3)
+
+    def caller(a):
+      return a()
+
+    conversion_map = self._simple_conversion_map()
+    _, _, callee_ns = conversion.entity_to_graph(
+        callee, conversion_map, None, None)
+    _, _, caller_ns = conversion.entity_to_graph(
+        caller, conversion_map, None, None)
+
+    self.assertTrue(callee_ns['ag__'] is caller_ns['ag__'])
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 445245b6083952adfb8e27d8dafebf1254e55b1e Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Tue, 17 Apr 2018 13:05:09 -0700
Subject: [PATCH 0968/1262] Update clip_ops_test.py

---
 tensorflow/python/kernel_tests/clip_ops_test.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index c3f44f385e..e08123b041 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -127,12 +127,10 @@ class ClipTest(test.TestCase):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
-      with self.assertRaises(errors_impl.InvalidArgumentError):
-        ans = clip_ops.clip_by_value(x, -clip, clip)
-        tf_ans = ans.eval()
-      with self.assertRaises(errors_impl.InvalidArgumentError):
-        ans = clip_ops.clip_by_value(x, 1.0, clip)
-        tf_ans = ans.eval()
+      with self.assertRaises(ValueError):
+        _ = clip_ops.clip_by_value(x, -clip, clip)
+      with self.assertRaises(ValueError):
+        _ = clip_ops.clip_by_value(x, 1.0, clip)
 
   def testClipByValueNonFinite(self):
     # TODO(b/78016351): Enable test on GPU once the bug is fixed.
-- 
GitLab


From 83418120b7c2659fedddd7c85b65d3c3e6aa94e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:20:42 -0700
Subject: [PATCH 0969/1262] Fixing a bug in strided slice. The op was not
 handling negative indices correctly.

PiperOrigin-RevId: 193245539
---
 .../internal/optimized/optimized_ops.h        | 144 +++++++++++++----
 .../internal/reference/reference_ops.h        | 150 +++++++++++++-----
 .../contrib/lite/kernels/strided_slice.cc     |  22 +--
 .../lite/kernels/strided_slice_test.cc        |  37 ++---
 .../contrib/lite/testing/generate_examples.py |  14 +-
 .../propagate_fixed_sizes.cc                  | 144 +++++++++++++----
 .../resolve_constant_strided_slice.cc         |  93 ++++++++---
 tensorflow/contrib/lite/toco/model.h          |  55 +++++++
 tensorflow/contrib/lite/toco/toco_port.h      |   8 +
 9 files changed, 493 insertions(+), 174 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 7fc6615965..d269056800 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5561,43 +5561,127 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
       output_dims, 0);
 }
 
+// UNOPTIMIZED COPY of StridedSlice from reference_ops.h (see comments there).
+
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi) {
+  TFLITE_DCHECK(!(hi < lo));
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+inline int StartForAxis(int begin_mask, const std::vector<int>& start_indices,
+                        const std::vector<int>& strides,
+                        const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int start = start_indices[axis];
+
+  // begin_mask override
+  if (begin_mask & 1 << axis) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
+}
+
+inline int StopForAxis(int end_mask, const std::vector<int>& stop_indices,
+                       const std::vector<int>& strides,
+                       const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int stop = stop_indices[axis];
+
+  // end_mask override
+  if (end_mask & (1 << axis)) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
+}
+
+inline bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("StridedSlice");
-  const int start_b = (begin_mask & 8) ? 0 : starts[3];
-  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
-  const int start_h = (begin_mask & 4) ? 0 : starts[2];
-  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
-  const int start_w = (begin_mask & 2) ? 0 : starts[1];
-  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
-  const int start_d = (begin_mask & 1) ? 0 : starts[0];
-  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  TFLITE_DCHECK_EQ(stop_indices.size(), 4);
+  TFLITE_DCHECK_EQ(strides.size(), 4);
+  const int start_b =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 3);
+  const int stop_b =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 3);
+  const int start_h =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 2);
+  const int stop_h =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 2);
+  const int start_w =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 1);
+  const int stop_w =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 1);
+  const int start_d =
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 0);
+  const int stop_d =
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 0);
 
   T* out_ptr = output_data;
-  if (strides[0] == 0) {
-    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-          const int len = stop_d - start_d;
-          memcpy(out_ptr,
-                 input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
-                 len * sizeof(T));
-          out_ptr += len;
-        }
-      }
-    }
-  } else {
-    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-          for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
-            *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
-          }
+  for (int in_b = start_b; !LoopCondition(in_b, stop_b, strides[3]);
+       in_b += strides[3]) {
+    for (int in_h = start_h; !LoopCondition(in_h, stop_h, strides[2]);
+         in_h += strides[2]) {
+      for (int in_w = start_w; !LoopCondition(in_w, stop_w, strides[1]);
+           in_w += strides[1]) {
+        for (int in_d = start_d; !LoopCondition(in_d, stop_d, strides[0]);
+             in_d += strides[0]) {
+          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
       }
     }
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 791fb52391..49a93b0c6d 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3026,59 +3026,139 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
       output_dims, 0);
 }
 
-inline bool LoopCondition(int index, int stop, int stride) {
-  return stride > 0 ? index < stop : index > stop;
+// STRIDED SLICE
+// The functions below for StridedSlice are mirrored in a number of places:
+//
+//   propagate_fixed_sizes.cc
+//   propagate_shapes.cc
+//   resolve_constant_strided_slice.cc
+//   optimized_ops.h
+//
+// It is designed for an arbitrary number of dimensions, even though dimensions
+// here are fixed at 4. This is because we expect to eventually support
+// arbitrary dimensionality. Also note that the axis orders are reversed for
+// runtime ops, and so the indices and masks must be as well too.
+//
+// Be warned this code involves some rather subtle logic of python slicing. The
+// best "ground truth" is to compare results to actual python execution.
+
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi) {
+  TFLITE_DCHECK(!(hi < lo));
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+inline int StartForAxis(int begin_mask, const std::vector<int>& start_indices,
+                        const std::vector<int>& strides,
+                        const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int start = start_indices[axis];
+
+  // begin_mask override
+  if (begin_mask & 1 << axis) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
 }
 
-inline int StartIndex(int start, int stride, int dim, bool masked) {
-  return masked ? (stride > 0 ? 0 : dim - 1) : start;
+inline int StopForAxis(int end_mask, const std::vector<int>& stop_indices,
+                       const std::vector<int>& strides,
+                       const Dims<4>& input_shape, int axis) {
+  // Begin with the specified index
+  int stop = stop_indices[axis];
+
+  // end_mask override
+  if (end_mask & (1 << axis)) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.sizes[axis];
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
 }
 
-inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
-                     bool shrink_axis_masked) {
-  return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
-                            : masked ? (stride > 0 ? dim : -1) : stop;
+inline bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
 }
 
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask, int shrink_axis_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& start_indices,
+                         const std::vector<int>& stop_indices,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  TFLITE_DCHECK_EQ(starts.size(), 4);
-  TFLITE_DCHECK_EQ(stops.size(), 4);
+  TFLITE_DCHECK_EQ(start_indices.size(), 4);
+  TFLITE_DCHECK_EQ(stop_indices.size(), 4);
   TFLITE_DCHECK_EQ(strides.size(), 4);
   const int start_b =
-      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 3);
   const int stop_b =
-      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
-                end_mask & 8, shrink_axis_mask & 8);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 3);
   const int start_h =
-      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 2);
   const int stop_h =
-      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
-                end_mask & 4, shrink_axis_mask & 4);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 2);
   const int start_w =
-      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 1);
   const int stop_w =
-      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
-                end_mask & 2, shrink_axis_mask & 2);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 1);
   const int start_d =
-      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
+      StartForAxis(begin_mask, start_indices, strides, input_dims, 0);
   const int stop_d =
-      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
-                end_mask & 1, shrink_axis_mask & 1);
+      StopForAxis(end_mask, stop_indices, strides, input_dims, 0);
 
   T* out_ptr = output_data;
-  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
+  for (int in_b = start_b; !LoopCondition(in_b, stop_b, strides[3]);
        in_b += strides[3]) {
-    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
+    for (int in_h = start_h; !LoopCondition(in_h, stop_h, strides[2]);
          in_h += strides[2]) {
-      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
+      for (int in_w = start_w; !LoopCondition(in_w, stop_w, strides[1]);
            in_w += strides[1]) {
-        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
+        for (int in_d = start_d; !LoopCondition(in_d, stop_d, strides[0]);
              in_d += strides[0]) {
           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
@@ -3087,18 +3167,6 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask,
-                         const std::vector<int>& starts,
-                         const std::vector<int>& stops,
-                         const std::vector<int>& strides, T* output_data,
-                         const Dims<4>& output_dims) {
-  StridedSlice(input_data, input_dims, begin_mask, end_mask,
-               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
-               output_dims);
-}
-
 template <typename T>
 inline void Slice(const T* input_data, const Dims<4>& input_dims,
                   const std::vector<int>& begin, const std::vector<int>& size,
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index e6d5c300dc..40ac436b7d 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -87,6 +87,8 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
                           std::min(std::max(index, -dim), dim - 1), dim));
 }
 
+// TODO(b/77971377) this logic should be removed, as it's a duplication of
+// StartForAxis() & StopForAxis() in kernels/internal/reference/reference_ops.h
 inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) {
   const int dim = op_context->input->dims->data[idx];
   const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
@@ -188,8 +190,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   std::vector<int32_t> strides;
 
   for (int idx = op_context.dims - 1; idx >= 0; --idx) {
-    starts.emplace_back(GetBeginValueAtIndex(&op_context, idx));
-    stops.emplace_back(GetEndValueAtIndex(&op_context, idx));
+    starts.emplace_back(GetTensorData<int32_t>(op_context.begin)[idx]);
+    stops.emplace_back(GetTensorData<int32_t>(op_context.end)[idx]);
     strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]);
   }
 
@@ -202,15 +204,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int begin_mask =
       ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
   int end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims);
-  int shrink_axis_mask =
-      ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims);
-
-#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                          \
-  kernel_type::StridedSlice(                                                   \
-      GetTensorData<data_type>(op_context.input),                              \
-      GetTensorDims(op_context.input), begin_mask, end_mask, shrink_axis_mask, \
-      starts, stops, strides, GetTensorData<data_type>(op_context.output),     \
-      GetTensorDims(op_context.output))
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                    \
+  kernel_type::StridedSlice(GetTensorData<data_type>(op_context.input),  \
+                            GetTensorDims(op_context.input), begin_mask, \
+                            end_mask, starts, stops, strides,            \
+                            GetTensorData<data_type>(op_context.output), \
+                            GetTensorDims(op_context.output))
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 22d7b097cb..cc39179bc7 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -377,29 +377,18 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
-  m.SetEnd({3});
+  m.SetEnd({2});
   m.SetStrides({1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
 }
 
-TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
-  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
-  m.SetInput({1, 2, 3, 4});
-  m.SetBegin({2});
-  m.SetEnd({1});
-  m.SetStrides({1});
-  m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
-}
-
 TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
-  m.SetEnd({3});
+  m.SetEnd({1});
   m.SetStrides({1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -421,7 +410,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
   StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({1, 3});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
@@ -432,7 +421,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
   StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({2, 1});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -443,7 +432,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
   StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
-  m.SetEnd({2, 3});
+  m.SetEnd({1, 1});
   m.SetStrides({1, 1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -454,7 +443,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
@@ -465,7 +454,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 1, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
@@ -476,7 +465,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 1, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -487,7 +476,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 3, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
@@ -498,7 +487,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
@@ -509,7 +498,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({2, 1, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
@@ -520,7 +509,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
   StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 1, 1});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_TRUE(m.GetOutputShape().empty());
@@ -553,7 +542,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
                                                  0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
-  m.SetEnd({2, 3, 2});
+  m.SetEnd({1, 3, 2});
   m.SetStrides({1, 1, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index e045c27427..f72a4e0d8c 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1758,19 +1758,7 @@ def make_strided_slice_tests(zip_path):
           "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
           "constant_indices": [False, True],
       },
-      #
-      {
-          "dtype": [tf.float32],
-          "index_type": [tf.int32],
-          "input_shape": [[12, 2, 2, 5]],
-          "begin": [[0]],
-          "end": [[1]],
-          "strides": [[1]],
-          "begin_mask": [0],
-          "end_mask": [0],
-          "shrink_axis_mask": [1],
-          "constant_indices": [True],
-      },
+      # TODO(b/73170889) Restore test paramaters removed in cl/191608113.
       # 2-D
       {
           "dtype": [tf.float32, tf.int32, tf.int64],
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 9191e69662..b34aca1f09 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1253,6 +1253,83 @@ void ProcessStackOperator(Model* model, StackOperator* op) {
   output_array.copy_shape(*stacked_shape);
 }
 
+// These StridedSlice utility functions are essentially a COPY of those in
+// reference_ops.h. See comments there.
+
+// Use until std::clamp() is available from C++17.
+int Clamp(const int v, const int lo, const int hi) {
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape,
+                 int axis) {
+  // Begin with the specified index
+  int start = op.start_indices[axis];
+
+  // begin_mask override
+  if (op.begin_mask & 1 << axis) {
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
+}
+
+int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape,
+                int axis) {
+  // Begin with the specified index
+  int stop = op.stop_indices[axis];
+
+  // end_mask override
+  if (op.end_mask & (1 << axis)) {
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (op.strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
+}
+
 void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
@@ -1290,43 +1367,46 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
     return;
   }
 
-  int dim_count = input_array.shape().dimensions_count();
-  CHECK(op->start_indices.size() == dim_count)
-      << ": Incorrect number of start indices supplied to StridedSlice op with "
-         "output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " start indices";
-  CHECK(op->stop_indices.size() == dim_count)
-      << ": Incorrect number of stop indices supplied to StridedSlice op with "
-         "output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " stop indices";
-  CHECK(op->strides.size() == dim_count)
-      << ": Incorrect number of strides supplied to StridedSlice op with "
-         " output \""
-      << op->outputs[0] << "\". Op requires " << dim_count << " strides";
+  int num_input_axes = input_array.shape().dimensions_count();
+  CHECK_LE(op->start_indices.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " start indices";
+  CHECK_LE(op->stop_indices.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " stop indices";
+  CHECK_LE(op->strides.size(), num_input_axes)
+      << "StridedSlice op with output \"" << op->outputs[0]
+      << "\", requires no more than " << num_input_axes << " strides";
+  for (int i = 0; i < op->strides.size(); i++) {
+    CHECK_NE(op->strides[i], 0) << "Strides must be non-zero. Axis " << i
+                                << " has stride=" << op->strides[i] << ".";
+  }
+
+  // The TensorFlow documentation is not explicit on how it handles fewer
+  // supplied indices than dimensions, but they are accepted. We emulate TF's
+  // behavior by fully iterating over each "forgotten" dimension.
+  op->PadIndices(num_input_axes);
 
   // Create output shape
   std::vector<int>* dims = output_array.mutable_shape()->mutable_dims();
 
   // Compute output shape
-  for (int i = 0; i < dim_count; ++i) {
-    const int mask = 1 << i;
-    int start = (op->begin_mask & mask) ? 0 : op->start_indices[i];
-    if (start < 0) {
-      // handle negative indices
-      start += input_array.shape().dims(i);
-    }
-    int stop = (op->end_mask & mask) ? input_array.shape().dims(i)
-                                     : op->stop_indices[i];
-    if (stop < 0) {
-      // handle negative indices
-      stop += input_array.shape().dims(i);
-    }
-
-    int dim_size = ceil((stop - start) / static_cast<float>(op->strides[i]));
-    dim_size = dim_size < 0 ? 0 : dim_size;
-    if (op->shrink_axis_mask & mask) {
-      CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when "
-                               "shrinking that axis";
+  for (int axis = 0; axis < num_input_axes; ++axis) {
+    int start_index = StartForAxis(*op, input_array.shape(), axis);
+    int stop_index = StopForAxis(*op, input_array.shape(), axis);
+    int dim_size =
+        ceil(static_cast<float>(stop_index - start_index) / op->strides[axis]);
+
+    CHECK_GT(dim_size, 0)
+        << "Output size for an axis must be greater than 0. Axis " << axis
+        << " computes to size " << dim_size
+        << " for StridedSlice op with output \"" << op->outputs[0] << "\".";
+    if (op->shrink_axis_mask & (1 << axis)) {
+      CHECK_EQ(dim_size, 1)
+          << "Output size for an axis must compute to 1 when shrinking an "
+             "axis. Axis "
+          << axis << " computes to size " << dim_size
+          << " for StridedSlice op with output \"" << op->outputs[0] << "\".";
     } else {
       dims->push_back(dim_size);
     }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index a0cfc3d597..8df3c2f195 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -23,40 +23,88 @@ namespace toco {
 
 namespace {
 
+// These StridedSlice utility functions are essentially a COPY of those in
+// reference_ops.h. See comments there.
+
+// Use until std::clamp() is available from C++17.
+int Clamp(const int v, const int lo, const int hi) {
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
 int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape,
                  int axis) {
-  int start;
+  // Begin with the specified index
+  int start = op.start_indices[axis];
+
+  // begin_mask override
   if (op.begin_mask & 1 << axis) {
-    // If begin mask bit is set, use the first element
-    start = 0;
-  } else {
-    // Otherwise, use the specified element
-    start = op.start_indices[axis];
-    if (start < 0) {
-      // Handle negative indices
-      start += input_shape.dims(axis);
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
     }
   }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
   return start;
 }
 
 int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape,
                 int axis) {
-  int stop;
+  // Begin with the specified index
+  int stop = op.stop_indices[axis];
+
+  // end_mask override
   if (op.end_mask & (1 << axis)) {
-    // If end mask bit set, use the last element
-    stop = input_shape.dims(axis);
-  } else {
-    // Otherwise, use the specified element
-    stop = op.stop_indices[axis];
-    if (stop < 0) {
-      // Handle negative indices
-      stop += input_shape.dims(axis);
+    if (op.strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
     }
   }
+
+  // Handle negative indices
+  int axis_size = input_shape.dims(axis);
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (op.strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
   return stop;
 }
 
+bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
+}
+
 template <ArrayDataType Type>
 void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
                   Array* output_array) {
@@ -73,9 +121,6 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   int num_input_axes = op.start_indices.size();
   CHECK_EQ(num_input_axes, op.stop_indices.size());
   CHECK_EQ(num_input_axes, op.strides.size());
-  for (int i = 0; i < op.strides.size(); i++) {
-    CHECK_GE(op.strides[i], 0) << "Negative strides usupported";
-  }
 
   // Create a buffer for the output array
   std::vector<DataType<Type>>& output_data =
@@ -103,13 +148,15 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
     // Compute next source input coordinates.
     bool carry = true;
     for (int axis = 0; axis < num_input_axes; axis++) {
+      int stride = op.strides[axis];
       // Increment this axis if we carried from the previous one
       if (carry) {
-        src_coord[axis] += op.strides[axis];
+        src_coord[axis] += stride;
       }
 
       // Check if we've overflowed.
-      if (src_coord[axis] >= StopForAxis(op, input_shape, axis)) {
+      int stop = StopForAxis(op, input_shape, axis);
+      if (LoopCondition(src_coord[axis], stop, stride)) {
         // Reset axis and set carry
         src_coord[axis] = StartForAxis(op, input_shape, axis);
         carry = true;
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 1c4c96ae70..705a9d69a6 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -845,6 +846,60 @@ struct StridedSliceOperator : Operator {
   int end_mask;
   int new_axis_mask;
   int shrink_axis_mask;
+
+  StridedSliceOperator(const StridedSliceOperator& other)
+      : Operator(OperatorType::kStridedSlice) {
+    inputs = other.inputs;
+    outputs = other.outputs;
+
+    start_indices = other.start_indices;
+    stop_indices = other.stop_indices;
+    strides = other.strides;
+
+    begin_mask = other.begin_mask;
+    ellipsis_mask = other.ellipsis_mask;
+    end_mask = other.end_mask;
+    new_axis_mask = other.new_axis_mask;
+    shrink_axis_mask = other.shrink_axis_mask;
+  }
+
+  void PadIndices(int dim_count) {
+    // Add indices and mask bits to fully include extra dimensions
+    CHECK_GE(dim_count, start_indices.size());
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    for (int i = start_indices.size(); i < dim_count; i++) {
+      start_indices.push_back(0);
+      stop_indices.push_back(0);
+      strides.push_back(1);
+      begin_mask |= 1 << i;
+      end_mask |= 1 << i;
+    }
+  }
+
+  void ReverseIndices() {
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    std::reverse(start_indices.begin(), start_indices.end());
+    std::reverse(stop_indices.begin(), stop_indices.end());
+    std::reverse(strides.begin(), strides.end());
+
+    begin_mask = toco::port::ReverseBits32(static_cast<uint32>(begin_mask)) >>
+                 (32 - start_indices.size());
+    ellipsis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(ellipsis_mask)) >>
+        (32 - start_indices.size());
+    end_mask = toco::port::ReverseBits32(static_cast<uint32>(end_mask)) >>
+               (32 - start_indices.size());
+    new_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(new_axis_mask)) >>
+        (32 - start_indices.size());
+    shrink_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(shrink_axis_mask)) >>
+        (32 - start_indices.size());
+  }
 };
 
 // Reshaping operator, reshaping its input array to a two-dimensional shape
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 4be3b5a0bf..2d5c231bef 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -75,6 +75,14 @@ Status Exists(const string& filename, const Options& options);
 void CopyToBuffer(const ::Cord& src, char* dest);
 #endif  // PLATFORM_GOOGLE
 void CopyToBuffer(const string& src, char* dest);
+
+inline uint32 ReverseBits32(uint32 n) {
+  n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1);
+  n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2);
+  n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4);
+  return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) |
+          ((n & 0xFF000000) >> 24));
+}
 }  // namespace port
 
 inline bool ParseFromStringOverload(const std::string& in,
-- 
GitLab


From 8744d2954a755a64e115e2c2dc81e9f79e19f17a Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Tue, 17 Apr 2018 13:25:51 -0700
Subject: [PATCH 0970/1262] Remove range_builder

---
 tensorflow/python/kernel_tests/confusion_matrix_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index 116e5e4e5a..79e419867d 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -105,8 +105,7 @@ class ConfusionMatrixTest(test.TestCase):
       d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, s: 1.0})
 
       truth = np.zeros([2, 2], dtype=np_dtype)
-      range_builder = xrange
-      for i in range_builder(len(d)):
+      for i in xrange(len(d)):
         truth[l[i], d[i]] += 1
 
       self.assertEqual(cm_out.dtype, np_dtype)
-- 
GitLab


From 84b6dac70710075e67fcf40ccd29033335d63f83 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Tue, 17 Apr 2018 13:27:46 -0700
Subject: [PATCH 0971/1262] Internal change.

PiperOrigin-RevId: 193246563
---
 tensorflow/contrib/timeseries/examples/BUILD | 33 +++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 32e948a009..355303acf6 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -8,14 +8,22 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+config_setting(
+    name = "empty_condition",
+    values = {"define": "UNUSED=unused"},
+)
+
 py_binary(
     name = "predict",
     srcs = ["predict.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -41,9 +49,12 @@ py_binary(
     data = ["data/changepoints.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -64,9 +75,12 @@ py_binary(
     data = ["data/multivariate_level.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
         "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
     ],
 )
 
@@ -89,11 +103,14 @@ py_binary(
     data = ["data/multivariate_periods.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }) + [
+        "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/timeseries/python/timeseries:estimators",
         "//tensorflow/contrib/timeseries/python/timeseries:model",
-        "//third_party/py/numpy",
     ],
 )
 
-- 
GitLab


From 197572bd517a4bc6f4850dfecf3288818d8d84ee Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:30:45 -0700
Subject: [PATCH 0972/1262] Unpack multiple assignments when processing lists.
 This supports the cases "a, b = [], []" and "[a, b] = [], []". Also expand
 the static analysis to support list unpacking constructs.

PiperOrigin-RevId: 193247024
---
 .../contrib/autograph/converters/lists.py     | 34 +++++++++-----
 .../autograph/converters/lists_test.py        | 46 ++++++++++++++++++-
 .../pyct/static_analysis/type_info.py         |  2 +-
 3 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py
index 6dda554acc..b49521b2c3 100644
--- a/tensorflow/contrib/autograph/converters/lists.py
+++ b/tensorflow/contrib/autograph/converters/lists.py
@@ -82,23 +82,33 @@ class ListTransformer(transformer.Base):
             element=call_node.args[0])
     return node
 
+  def _replace_list_constructors(self, targets, values):
+    for target in targets:
+      if (isinstance(target, (gast.Tuple, gast.List)) and
+          isinstance(values, (gast.Tuple, gast.List))):
+        n_targets = len(target.elts)
+        for i in range(n_targets):
+          target_el, value_el = target.elts[i], values.elts[i]
+          values.elts[i] = self._replace_list_constructors(
+              (target_el,), value_el)
+        return values
+      if isinstance(values, gast.List):
+        if values.elts:
+          return self._pre_populated_list(values)
+        else:
+          return self._empty_list(values)
+    return values
+
   def visit_Assign(self, node):
     node = self.generic_visit(node)
 
     # Only convert lists when they are assigned to a variable, e.g.:
     #   l = []
-    # TODO(mdan): This rule should be improved.
-    if len(node.targets) != 1:
-      return node
-    if not isinstance(node.value, gast.List):
-      return node
-    if not isinstance(node.value.ctx, gast.Load):
-      return node
-
-    if node.value.elts:
-      node.value = self._pre_populated_list(node.value)
-    else:
-      node.value = self._empty_list(node.value)
+    # TODO(mdan): A similar pattern exists in type_info.py
+    # We should add a generic "unpack_assignment" function to the base
+    # transformer, that has the same effect as applying some logic to the SSA
+    # form.
+    node.value = self._replace_list_constructors(node.targets, node.value)
     return node
 
 
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
index 749ba14347..74c6dc64f1 100644
--- a/tensorflow/contrib/autograph/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -45,7 +45,51 @@ class ListTest(converter_test_base.TestCase):
       result.utils = utils
       result.dtypes = dtypes
       with self.test_session() as sess:
-        self.assertEqual(test_fn(), sess.run(result.test_fn().stack()))
+        self.assertAllEqual([1], sess.run(result.test_fn().stack()))
+
+  def test_empty_annotated_lists_unpacked(self):
+
+    def test_fn():
+      l, m = [], []
+      utils.set_element_type(l, dtypes.int32)
+      utils.set_element_type(m, dtypes.int32)
+      l.append(1)
+      m.append(2)
+      return l, m
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        res_l, res_m = result.test_fn()
+        self.assertEqual([1], sess.run(res_l.stack()))
+        self.assertEqual([2], sess.run(res_m.stack()))
+
+  def test_empty_annotated_lists_list_unpacked(self):
+
+    def test_fn():
+      [l, m] = [], []
+      utils.set_element_type(l, dtypes.int32)
+      utils.set_element_type(m, dtypes.int32)
+      l.append(1)
+      m.append(2)
+      return l, m
+
+    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node, tensor_array_ops.TensorArray,
+                       dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        res_l, res_m = result.test_fn()
+        self.assertEqual([1], sess.run(res_l.stack()))
+        self.assertEqual([2], sess.run(res_m.stack()))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 2f553e1e23..763997968c 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -184,7 +184,7 @@ class TypeInfoResolver(transformer.Base):
     # Multiple targets mean multiple assignment.
     for target in targets:
       # Tuple target means unpacking.
-      if isinstance(target, gast.Tuple):
+      if isinstance(target, (gast.Tuple, gast.List)):
         for i, target_item in enumerate(target.elts):
           # Two cases here:
           #   1. Static unpacking, e.g. a, b = c, d
-- 
GitLab


From 1628d18d24400f08b768b545f839e32b44a097c7 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 17 Apr 2018 13:31:54 -0700
Subject: [PATCH 0973/1262] Use is_constructible instead of is_convertible.

Before this, all objects would follow the slow path (since nothing is
convertible to AlphaNum since it has a private copy constructor).

Before:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 67.5895690918
  extras {
    key: "examples_per_sec"
    value {
      double_value: 14795.1823549
    }
  }
}

After:
entry {
  name: "MicroBenchmarks.benchmark_defun_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 61.0044002533
  extras {
    key: "examples_per_sec"
    value {
      double_value: 16392.2601623
    }
  }
}
PiperOrigin-RevId: 193247183
---
 tensorflow/core/lib/core/errors.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index 1a0f4be2ea..51c09032df 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -42,7 +42,7 @@ namespace internal {
 // Eventually absl::strings will have native support for this and we will be
 // able to completely remove PrepareForStrCat().
 template <typename T>
-typename std::enable_if<!std::is_convertible<T, strings::AlphaNum>::value,
+typename std::enable_if<!std::is_constructible<strings::AlphaNum, T>::value,
                         string>::type
 PrepareForStrCat(const T& t) {
   std::stringstream ss;
-- 
GitLab


From fabf01011654be16e3aeb08192caa76c9595cfde Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:36:24 -0700
Subject: [PATCH 0974/1262] Make GroupRec* const in GroupRecCallback by marking
 mu mutable in CollectiveParamResolverLocal::GroupRec.

PiperOrigin-RevId: 193247799
---
 .../collective_param_resolver_local.cc         | 17 +++++++++--------
 .../collective_param_resolver_local.h          | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index b34950b2f4..393d3f824d 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -401,7 +401,7 @@ void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
 }
 
 Status CollectiveParamResolverLocal::InitInstanceSharedParams(
-    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
   VLOG(1) << "InitInstanceSharedParams " << ir;
   ir->shared.instance = cp->instance;
   {
@@ -443,7 +443,7 @@ Status CollectiveParamResolverLocal::InitInstanceSharedParams(
 }
 
 void CollectiveParamResolverLocal::CompleteDefaultRanking(
-    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
     const std::vector<DeviceLocality>& localities) {
   // Establish an instance-specific default rank order for devices
   // based on localities.  This rank order should be a good ring
@@ -485,7 +485,7 @@ void CollectiveParamResolverLocal::CallbackWithStatus(
 }
 
 void CollectiveParamResolverLocal::FindInstanceRec(
-    GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
+    const GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
   InstanceRec* irec = nullptr;
   bool exit_outside_locks = false;
   {
@@ -544,7 +544,8 @@ void CollectiveParamResolverLocal::CompleteParamsAsync(
   VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
           << cp->ToString();
   CompleteGroupLocal(
-      device, cp, [this, device, cp, done](const Status& s, GroupRec* gr) {
+      device, cp,
+      [this, device, cp, done](const Status& s, const GroupRec* gr) {
         if (s.ok()) {
           CompleteInstanceLocal(device, gr, cp, cp->is_source, done);
         } else {
@@ -563,8 +564,8 @@ void CollectiveParamResolverLocal::CompleteInstanceAsync(
 }
 
 void CollectiveParamResolverLocal::CompleteInstanceLocal(
-    const string& device, GroupRec* gr, CollectiveParams* cp, bool is_source,
-    const StatusCallback& done) {
+    const string& device, const GroupRec* gr, CollectiveParams* cp,
+    bool is_source, const StatusCallback& done) {
   VLOG(1) << "CompleteInstanceLocal " << device
           << " instance_key: " << cp->instance.instance_key << " gr " << gr;
 
@@ -589,8 +590,8 @@ void CollectiveParamResolverLocal::CompleteInstanceLocal(
 }
 
 void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
-    const string& device, GroupRec* gr, CollectiveParams* cp, InstanceRec* ir,
-    bool is_source, const StatusCallback& done) {
+    const string& device, const GroupRec* gr, CollectiveParams* cp,
+    InstanceRec* ir, bool is_source, const StatusCallback& done) {
   // Populate the fields common across instance.
   {
     mutex_lock l(ir->out_mu);
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index ff3415b0a9..7b2946e936 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -56,7 +56,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // Used to complete/verify CollGroup.
   struct GroupRec {
     CollGroupParams group;
-    mutex mu;
+    mutable mutex mu;
     Status status GUARDED_BY(mu);
     std::set<string> device_set GUARDED_BY(mu);
     std::vector<string> device_list GUARDED_BY(mu);
@@ -71,7 +71,8 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // calling done.  Callback GroupRec* arg is only valid if status is ok.
   // Ownership of GroupRec stays with this object and does not pass to the
   // callback.
-  typedef std::function<void(const Status& s, GroupRec* gr)> GroupRecCallback;
+  typedef std::function<void(const Status& s, const GroupRec* gr)>
+      GroupRecCallback;
   void CompleteGroupLocal(const string& device, CollectiveParams* cp,
                           const GroupRecCallback& done)
       LOCKS_EXCLUDED(group_mu_);
@@ -135,7 +136,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // with this object and does not pass to the callback.
   typedef std::function<void(const Status& s, InstanceRec* ir)>
       InstanceRecCallback;
-  void FindInstanceRec(GroupRec* gr, CollectiveParams* cp,
+  void FindInstanceRec(const GroupRec* gr, CollectiveParams* cp,
                        const InstanceRecCallback& done)
       LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
 
@@ -144,27 +145,28 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   //
   // Preconditions:
   //  cp is populated with all DeviceLocalities
-  Status InitInstanceSharedParams(GroupRec* gr, const CollectiveParams* cp,
-                                  InstanceRec* ir)
+  Status InitInstanceSharedParams(const GroupRec* gr,
+                                  const CollectiveParams* cp, InstanceRec* ir)
       EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) LOCKS_EXCLUDED(gr->mu);
 
   // Establishes the final order of ir->shared.instance.device_names and
   // ir->shared.instance.task_names by considering localities of all devices.
-  void CompleteDefaultRanking(GroupRec* gr, const CollectiveParams* cp,
+  void CompleteDefaultRanking(const GroupRec* gr, const CollectiveParams* cp,
                               InstanceRec* ir,
                               const std::vector<DeviceLocality>& localities)
       EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu);
 
   // Finish populating *cp.
   // Precondition: *gr has been fully populated by CompleteGroupLocal.
-  void CompleteInstanceLocal(const string& device, GroupRec* gr,
+  void CompleteInstanceLocal(const string& device, const GroupRec* gr,
                              CollectiveParams* cp, bool is_source,
                              const StatusCallback& done)
       LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
 
   // Finish populating *cp from fully initialized *ir.
   // Precondition: *gr and *ir are fully populated.
-  void CompleteInstanceFromInitializedIRec(const string& device, GroupRec* gr,
+  void CompleteInstanceFromInitializedIRec(const string& device,
+                                           const GroupRec* gr,
                                            CollectiveParams* cp,
                                            InstanceRec* ir, bool is_source,
                                            const StatusCallback& done)
-- 
GitLab


From 72df3d60faa8bbf42bb3f5c7ed38887215fad037 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 13:36:46 -0700
Subject: [PATCH 0975/1262] [XLA] Redesign: support xla::XlaComputation in
 compile-only client and service.

PiperOrigin-RevId: 193247845
---
 tensorflow/compiler/xla/client/BUILD          |  1 +
 .../xla/client/compile_only_client.cc         | 18 +++++++++++++
 .../compiler/xla/client/compile_only_client.h | 22 +++++++++++++++
 .../xla/service/compile_only_service.cc       | 27 +++++++++++++++++++
 .../xla/service/compile_only_service.h        | 19 +++++++++++++
 5 files changed, 87 insertions(+)

diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index a299c2afd4..286d06d12f 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -130,6 +130,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:stream_executor_no_cuda",
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
index 59662c95ac..96e38bca01 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.cc
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -39,6 +39,24 @@ CompileOnlyClient::CompileAheadOfTime(
   return compiler_service_->CompileAheadOfTime(service_instances, options);
 }
 
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<CompileOnlyService::AotXlaComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AotXlaComputationInstance& instance : computations) {
+    service_instances.emplace_back();
+    CompileOnlyService::AotXlaComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->proto();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return compiler_service_->CompileAheadOfTime(service_instances, options);
+}
+
 int64 CompileOnlyClient::PointerSizeForTriple(tensorflow::StringPiece triple) {
   llvm::Triple llvm_triple(
       llvm::Triple::normalize(llvm::StringRef(triple.data(), triple.size())));
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index 5900048711..c8725b8517 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,6 +55,27 @@ class CompileOnlyClient : public Client {
       const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
       const AotCompilationOptions& options);
 
+  // A description of an xla computation to compile using CompileAheadOfTime.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct AotXlaComputationInstance {
+    const XlaComputation* computation;
+    // Inform the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of xla computations for ahead-of-time execution.  This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+      const AotCompilationOptions& options);
+
   // Returns the size of a pointer in bytes for a given triple.
   static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
 
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index c83da9eddc..fb70ea5315 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -61,6 +61,33 @@ CompileOnlyService::CompileOnlyService(const ServiceOptions& options,
                                        Compiler* compiler)
     : Service(options, /*execute_backend=*/nullptr), compiler_(compiler) {}
 
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  for (const AotXlaComputationInstance& instance : computations) {
+    TF_RET_CHECK(instance.computation.has_program_shape());
+
+    const DebugOptions& debug_options = options.debug_options();
+    const auto& program_shape = instance.computation.program_shape();
+    ExecutionOptions execution_options;
+    *execution_options.mutable_debug_options() = debug_options;
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(program_shape, instance.argument_layouts,
+                           &execution_options));
+
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModule> hlo_module,
+        HloModule::CreateFromProto(instance.computation, *module_config));
+    TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module));
+    hlo_modules.push_back(std::move(hlo_module));
+  }
+
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules), options);
+}
+
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CompileOnlyService::CompileAheadOfTime(
     const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index 9859941c6c..dd8de42a0f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -53,6 +53,25 @@ class CompileOnlyService : public Service {
       const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
       const AotCompilationOptions& Options);
 
+  // A description of a xla computation to compile using CompileAheadOfTime.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct AotXlaComputationInstance {
+    HloModuleProto computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of xla computations for ahead-of-time execution.  This is
+  // intended for use in static compilation.  See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
+      const AotCompilationOptions& options);
+
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
   // computing constants produces global data that we may wish to transfer.
-- 
GitLab


From 953a2f745cc6cbf26345e906694da054dde30ab5 Mon Sep 17 00:00:00 2001
From: Dalmo Cirne <dalmo@clarifai.com>
Date: Tue, 17 Apr 2018 16:46:06 -0400
Subject: [PATCH 0976/1262] QueueOptions size var data types to size_t

QueueOptions' max_batch_size and max_enqueued_batches are positive quantities, and when compared, in the code, with unsigned member functions, a warning is raised. By changing the data type from int to size_t, not only the meaning of the member variables are more aligned with their intent, but also the comparisons are done between unsigned integers, thus fixing the warnings.
---
 .../core/kernels/batching_util/shared_batch_scheduler.h     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index b77289aded..139475389d 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -135,7 +135,7 @@ class SharedBatchScheduler
     // (inclusive). If there is a need to quantize the batch sizes, i.e. only
     // submit batches whose size is in a small set of allowed sizes, that can be
     // done by adding padding in the process-batch callback.
-    int max_batch_size = 1000;
+    size_t max_batch_size = 1000;
 
     // If a task has been enqueued for this amount of time (in microseconds),
     // and a thread is available, the scheduler will immediately form a batch
@@ -156,7 +156,7 @@ class SharedBatchScheduler
     // If this limit is reached, Schedule() will return an UNAVAILABLE error.
     // See the class documentation above for guidelines on how to tune this
     // parameter.
-    int max_enqueued_batches = 10;
+    size_t max_enqueued_batches = 10;
   };
   Status AddQueue(const QueueOptions& options,
                   std::function<void(std::unique_ptr<Batch<TaskType>>)>
@@ -393,7 +393,7 @@ Status SharedBatchScheduler<TaskType>::AddQueue(
     std::function<void(std::unique_ptr<Batch<TaskType>>)>
         process_batch_callback,
     std::unique_ptr<BatchScheduler<TaskType>>* queue) {
-  if (options.max_batch_size <= 0) {
+  if (options.max_batch_size == 0) {
     return errors::InvalidArgument("max_batch_size must be positive; was ",
                                    options.max_batch_size);
   }
-- 
GitLab


From 4764bf2986e2779d5c80b5aca08d72d5c878818b Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 17 Apr 2018 14:26:16 -0700
Subject: [PATCH 0977/1262] [StreamExecutor] Rename ::perftools::gputools ->
 ::stream_executor, part 1.

Step 1 of re-namespace'ing StreamExecutor into ::stream_executor.

This moves everything inside of stream_executor/..., and leaves a
namespace alias into ::perftools::gputools.  The next steps will clean
up users to use the new namespace.

This is mostly a mechanical change, but it also includes a bunch of
non-mechanical changes that ideally would be split out into separate
patches.  Unfortunately they all sort of need to be shoved in here for
various reasons:

 - forward declarations need to be in the same namespace as the actual
   types, so we need to change all forward declarations of
   StreamExecutor types in this one patch.

 - Uses of these forward declarations need to be changed to the new
   namespace (or otherwise we need to add a namespace alias to the
   relevant header, but this is pretty ugly).

 - Various initialization code needs to live in StreamExecutor's "real"
   namespace, so all this needs to be changed.

PiperOrigin-RevId: 193256128
---
 .../compiler/xla/executable_run_options.cc    |   4 +-
 .../compiler/xla/executable_run_options.h     |  24 +++--
 tensorflow/compiler/xla/types.h               |   8 ++
 .../core/common_runtime/gpu/gpu_event_mgr.h   |   6 +-
 .../core/common_runtime/gpu/gpu_id_utils.h    |   5 +-
 tensorflow/core/common_runtime/gpu/gpu_init.h |   8 +-
 .../core/common_runtime/gpu/gpu_util.cc       |   4 +-
 tensorflow/core/common_runtime/gpu/gpu_util.h |   4 +-
 .../core/common_runtime/gpu_device_context.h  |  37 ++++---
 tensorflow/core/framework/device_base.h       |  15 ++-
 .../default/from_stream_executor_status.h     |   2 -
 tensorflow/core/platform/stream_executor.h    |  11 ++
 tensorflow/core/platform/types.h              |   8 ++
 tensorflow/stream_executor/blas.cc            |   6 +-
 tensorflow/stream_executor/blas.h             |   6 +-
 .../stream_executor/cuda/cuda_activation.cc   |   6 +-
 .../stream_executor/cuda/cuda_activation.h    |   6 +-
 tensorflow/stream_executor/cuda/cuda_blas.cc  |  94 ++++++++--------
 tensorflow/stream_executor/cuda/cuda_blas.h   |   6 +-
 .../stream_executor/cuda/cuda_diagnostics.cc  |   6 +-
 .../stream_executor/cuda/cuda_diagnostics.h   |   6 +-
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  82 +++++++-------
 tensorflow/stream_executor/cuda/cuda_dnn.h    |   6 +-
 .../stream_executor/cuda/cuda_driver.cc       |   6 +-
 tensorflow/stream_executor/cuda/cuda_driver.h |   6 +-
 tensorflow/stream_executor/cuda/cuda_event.cc |   6 +-
 tensorflow/stream_executor/cuda/cuda_event.h  |   6 +-
 tensorflow/stream_executor/cuda/cuda_fft.cc   | 102 +++++++++---------
 tensorflow/stream_executor/cuda/cuda_fft.h    |   6 +-
 .../stream_executor/cuda/cuda_gpu_executor.cc |  18 ++--
 .../stream_executor/cuda/cuda_gpu_executor.h  |   6 +-
 .../stream_executor/cuda/cuda_helpers.h       |   6 +-
 tensorflow/stream_executor/cuda/cuda_kernel.h |   6 +-
 .../stream_executor/cuda/cuda_platform.cc     |  18 ++--
 .../stream_executor/cuda/cuda_platform.h      |   6 +-
 .../stream_executor/cuda/cuda_platform_id.cc  |   6 +-
 .../stream_executor/cuda/cuda_platform_id.h   |   6 +-
 tensorflow/stream_executor/cuda/cuda_rng.cc   |  87 ++++++++-------
 tensorflow/stream_executor/cuda/cuda_rng.h    |   6 +-
 .../stream_executor/cuda/cuda_stream.cc       |   6 +-
 tensorflow/stream_executor/cuda/cuda_stream.h |   6 +-
 tensorflow/stream_executor/cuda/cuda_timer.cc |   6 +-
 tensorflow/stream_executor/cuda/cuda_timer.h  |  10 +-
 .../stream_executor/cuda/cudnn_version.cc     |   6 +-
 .../stream_executor/cuda/cudnn_version.h      |   6 +-
 .../cuda/cudnn_version_test.cc                |   6 +-
 .../stream_executor/device_description.cc     |   7 +-
 .../stream_executor/device_description.h      |   6 +-
 tensorflow/stream_executor/device_memory.h    |  13 ++-
 tensorflow/stream_executor/device_options.h   |   6 +-
 tensorflow/stream_executor/dnn.cc             |   6 +-
 tensorflow/stream_executor/dnn.h              |   6 +-
 tensorflow/stream_executor/dso_loader.cc      |   6 +-
 tensorflow/stream_executor/dso_loader.h       |   6 +-
 tensorflow/stream_executor/event.cc           |   6 +-
 tensorflow/stream_executor/event.h            |   6 +-
 tensorflow/stream_executor/executor_cache.cc  |   6 +-
 tensorflow/stream_executor/executor_cache.h   |   6 +-
 tensorflow/stream_executor/fft.h              |   8 +-
 .../stream_executor/host/host_gpu_executor.cc |   6 +-
 .../stream_executor/host/host_gpu_executor.h  |   6 +-
 .../stream_executor/host/host_platform.cc     |  16 ++-
 .../stream_executor/host/host_platform.h      |   6 +-
 .../stream_executor/host/host_platform_id.cc  |   6 +-
 .../stream_executor/host/host_platform_id.h   |   6 +-
 .../stream_executor/host/host_stream.cc       |   6 +-
 tensorflow/stream_executor/host/host_stream.h |   6 +-
 tensorflow/stream_executor/host/host_timer.cc |   6 +-
 tensorflow/stream_executor/host/host_timer.h  |   6 +-
 tensorflow/stream_executor/host_buffer.h      |   6 +-
 tensorflow/stream_executor/kernel.cc          |   6 +-
 tensorflow/stream_executor/kernel.h           |  12 +--
 .../stream_executor/kernel_cache_config.h     |   6 +-
 tensorflow/stream_executor/kernel_spec.cc     |   7 +-
 tensorflow/stream_executor/kernel_spec.h      |   6 +-
 tensorflow/stream_executor/launch_dim.h       |   8 +-
 tensorflow/stream_executor/lib/array_slice.h  |  13 ++-
 tensorflow/stream_executor/lib/casts.h        |   8 +-
 tensorflow/stream_executor/lib/demangle.cc    |   6 +-
 tensorflow/stream_executor/lib/demangle.h     |   6 +-
 tensorflow/stream_executor/lib/env.h          |   6 +-
 tensorflow/stream_executor/lib/error.h        |   8 +-
 .../stream_executor/lib/human_readable.h      |   6 +-
 tensorflow/stream_executor/lib/initialize.h   |  17 ++-
 .../stream_executor/lib/inlined_vector.h      |   6 +-
 tensorflow/stream_executor/lib/mathutil.h     |   6 +-
 tensorflow/stream_executor/lib/notification.h |   6 +-
 tensorflow/stream_executor/lib/numbers.cc     |   6 +-
 tensorflow/stream_executor/lib/numbers.h      |   6 +-
 tensorflow/stream_executor/lib/path.cc        |   6 +-
 tensorflow/stream_executor/lib/path.h         |   6 +-
 .../stream_executor/lib/process_state.cc      |   6 +-
 .../stream_executor/lib/process_state.h       |   6 +-
 tensorflow/stream_executor/lib/ptr_util.h     |  14 ++-
 tensorflow/stream_executor/lib/stacktrace.h   |   6 +-
 tensorflow/stream_executor/lib/status.h       |  17 ++-
 tensorflow/stream_executor/lib/statusor.h     |   8 +-
 tensorflow/stream_executor/lib/str_util.h     |   6 +-
 tensorflow/stream_executor/lib/strcat.h       |   8 +-
 tensorflow/stream_executor/lib/stringpiece.h  |   6 +-
 tensorflow/stream_executor/lib/stringprintf.h |   6 +-
 .../stream_executor/lib/thread_options.h      |   6 +-
 tensorflow/stream_executor/lib/threadpool.h   |   6 +-
 .../stream_executor/multi_platform_manager.cc |   6 +-
 .../stream_executor/multi_platform_manager.h  |  18 ++--
 tensorflow/stream_executor/platform.cc        |   6 +-
 tensorflow/stream_executor/platform.h         |   8 +-
 .../stream_executor/platform/default/mutex.h  |   6 +-
 tensorflow/stream_executor/platform/port.h    |   8 +-
 tensorflow/stream_executor/plugin.cc          |   6 +-
 tensorflow/stream_executor/plugin.h           |   6 +-
 tensorflow/stream_executor/plugin_registry.cc |   6 +-
 tensorflow/stream_executor/plugin_registry.h  |   6 +-
 tensorflow/stream_executor/rng.cc             |   6 +-
 tensorflow/stream_executor/rng.h              |   6 +-
 .../stream_executor/scratch_allocator.cc      |   6 +-
 .../stream_executor/scratch_allocator.h       |   6 +-
 .../stream_executor/shared_memory_config.h    |   6 +-
 tensorflow/stream_executor/stream.cc          |   6 +-
 tensorflow/stream_executor/stream.h           |   6 +-
 tensorflow/stream_executor/stream_executor.h  |  11 ++
 .../stream_executor_internal.cc               |   6 +-
 .../stream_executor_internal.h                |   6 +-
 .../stream_executor/stream_executor_pimpl.cc  |   6 +-
 .../stream_executor/stream_executor_pimpl.h   |   8 +-
 .../temporary_device_memory.cc                |   6 +-
 .../stream_executor/temporary_device_memory.h |   6 +-
 .../temporary_memory_manager.cc               |   6 +-
 .../temporary_memory_manager.h                |   6 +-
 tensorflow/stream_executor/timer.cc           |   6 +-
 tensorflow/stream_executor/timer.h            |   6 +-
 tensorflow/stream_executor/trace_listener.h   |   6 +-
 132 files changed, 572 insertions(+), 744 deletions(-)

diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 1700c97718..99b8f0558e 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -36,12 +36,12 @@ DeviceMemoryAllocator* ExecutableRunOptions::allocator() const {
 }
 
 ExecutableRunOptions& ExecutableRunOptions::set_stream(
-    perftools::gputools::Stream* stream) {
+    stream_executor::Stream* stream) {
   stream_ = stream;
   return *this;
 }
 
-perftools::gputools::Stream* ExecutableRunOptions::stream() const {
+stream_executor::Stream* ExecutableRunOptions::stream() const {
   return stream_;
 }
 
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 2c1d9ffff1..1a095a82cc 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,29 +16,31 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-// Intentionally forward declared so that ExecutableRunOptions can be linked
+// These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
 // need to be linked).
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
 class Platform;
-}
-}
+}  // namespace stream_executor
 
 namespace tensorflow {
 namespace thread {
 class ThreadPool;
-}
-}
+}  // namespace thread
+}  // namespace tensorflow
 
 namespace Eigen {
 struct ThreadPoolDevice;
-}
+}  // namespace Eigen
 
 namespace xla {
 
+// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
+// is complete, add "using namespace se = stream_executor" here and
+// s/stream_executor/se::/ to match our idiom elsewhere.
+
 class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;
@@ -61,8 +63,8 @@ class ExecutableRunOptions {
   // If set, this is the stream to run the computation on. The platform of the
   // stream must match the platform the executable was built for.  A value of
   // nullptr indicates the option has not been set.
-  ExecutableRunOptions& set_stream(perftools::gputools::Stream* stream);
-  perftools::gputools::Stream* stream() const;
+  ExecutableRunOptions& set_stream(stream_executor::Stream* stream);
+  stream_executor::Stream* stream() const;
 
   // Sets the thread pool on which to run parallel CPU backend
   // computations. Does not take ownership.
@@ -91,7 +93,7 @@ class ExecutableRunOptions {
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   DeviceAssignment* device_assignment_ = nullptr;
-  perftools::gputools::Stream* stream_ = nullptr;
+  stream_executor::Stream* stream_ = nullptr;
   tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 9fa4297523..20f3f1b957 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -46,4 +46,12 @@ using ::Eigen::half;
 
 }  // namespace xla
 
+// Alias namespace ::stream_executor as ::xla::se.
+namespace stream_executor {}
+namespace xla {
+// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
+// removed in ::xla.
+// namespace se = ::stream_executor;
+}  // namespace xla
+
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index d23898e1f2..fd5f50ca4e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -29,13 +29,11 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Event;
 class Stream;
 class StreamExecutor;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 2e90687fe8..5c503d1261 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -23,7 +23,10 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
-namespace gpu = ::perftools::gputools;
+
+// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
+// that's available.
+namespace gpu = ::stream_executor;
 
 // Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
 class GpuIdUtil {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index 927d05d5ba..bfd7a77f83 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -18,11 +18,9 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Platform;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -34,7 +32,7 @@ Status ValidateGPUMachineManager();
 // initializing the GPUs on the machine if needed the first time it is
 // called.  Must only be called when there is a valid GPU environment
 // in the process (e.g., ValidateGPUMachineManager() returns OK).
-perftools::gputools::Platform* GPUMachineManager();
+stream_executor::Platform* GPUMachineManager();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index a0f5877d62..5214ceaae5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -60,7 +60,9 @@ using perftools::gputools::Stream;
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
+// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
+// that's available.
+namespace gpu = ::stream_executor;
 
 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                    const Tensor* dst,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index d99a0b1f61..337dc89895 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -27,7 +27,9 @@ namespace tensorflow {
 class RecvTensorResponse;
 class TensorProto;
 
-namespace gpu = ::perftools::gputools;
+// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
+// that's available.
+namespace gpu = ::stream_executor;
 
 class GPUUtil {
  public:
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 8b1430f021..38a18cd087 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -19,23 +19,22 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/device_base.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
+// TODO(b/77980417): Replace stream_executor:: with se:: once our namespace
+// migration is complete and the alias is available.
 
 class GPUDeviceContext : public DeviceContext {
  public:
   // Does not take ownership of streams.
-  GPUDeviceContext(int stream_id, gpu::Stream* stream,
-                   gpu::Stream* host_to_device_stream,
-                   gpu::Stream* device_to_host_stream,
-                   gpu::Stream* device_to_device_stream)
+  GPUDeviceContext(int stream_id, stream_executor::Stream* stream,
+                   stream_executor::Stream* host_to_device_stream,
+                   stream_executor::Stream* device_to_host_stream,
+                   stream_executor::Stream* device_to_device_stream)
       : stream_id_(stream_id),
         stream_(stream),
         host_to_device_stream_(host_to_device_stream),
@@ -44,10 +43,14 @@ class GPUDeviceContext : public DeviceContext {
 
   ~GPUDeviceContext() override {}
 
-  gpu::Stream* stream() const override { return stream_; }
-  gpu::Stream* host_to_device_stream() const { return host_to_device_stream_; }
-  gpu::Stream* device_to_host_stream() const { return device_to_host_stream_; }
-  gpu::Stream* device_to_device_stream() const {
+  stream_executor::Stream* stream() const override { return stream_; }
+  stream_executor::Stream* host_to_device_stream() const {
+    return host_to_device_stream_;
+  }
+  stream_executor::Stream* device_to_host_stream() const {
+    return device_to_host_stream_;
+  }
+  stream_executor::Stream* device_to_device_stream() const {
     return device_to_device_stream_;
   }
   int stream_id() const { return stream_id_; }
@@ -67,13 +70,13 @@ class GPUDeviceContext : public DeviceContext {
   int stream_id_;
   // The default primary stream to use for this context.
   // All the memory belongs to this stream.
-  gpu::Stream* stream_;
+  stream_executor::Stream* stream_;
   // The stream to use for copy data from host into GPU.
-  gpu::Stream* host_to_device_stream_;
+  stream_executor::Stream* host_to_device_stream_;
   // The stream to use for copy data from GPU to host.
-  gpu::Stream* device_to_host_stream_;
+  stream_executor::Stream* device_to_host_stream_;
   // The stream to use for copy data between GPU.
-  gpu::Stream* device_to_device_stream_;
+  stream_executor::Stream* device_to_device_stream_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 8473b228d3..223b74857d 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -34,11 +34,9 @@ struct SyclDevice;
 #endif
 }  // end namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 class Stream;
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 namespace tensorflow {
 
@@ -69,9 +67,10 @@ class PerOpGpuDevice {
 class DeviceContext : public core::RefCounted {
  public:
   ~DeviceContext() override {}
-  virtual perftools::gputools::Stream* stream() const { return nullptr; }
-  virtual void MaintainLifetimeOnStream(
-      const Tensor* t, perftools::gputools::Stream* stream) const {}
+  virtual stream_executor::Stream* stream() const { return nullptr; }
+  virtual void MaintainLifetimeOnStream(const Tensor* t,
+                                        stream_executor::Stream* stream) const {
+  }
 
   // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
   // "device_tensor" which is on a GPU device "device". "device_tensor"
@@ -133,7 +132,7 @@ class DeviceBase {
   // but also by TPU devices (to provide default device context).
   struct GpuDeviceInfo {
     // Make sure all the defaults are NULL, so we can spot missing assignments.
-    perftools::gputools::Stream* stream = nullptr;
+    stream_executor::Stream* stream = nullptr;
     DeviceContext* default_context = nullptr;
     EventMgr* event_mgr = nullptr;
     int gpu_id = -1;
diff --git a/tensorflow/core/platform/default/from_stream_executor_status.h b/tensorflow/core/platform/default/from_stream_executor_status.h
index 2a2297a657..36a67a3648 100644
--- a/tensorflow/core/platform/default/from_stream_executor_status.h
+++ b/tensorflow/core/platform/default/from_stream_executor_status.h
@@ -23,8 +23,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace gpu = ::perftools::gputools;
-
 // On the open-source platform, stream_executor currently uses
 // tensorflow::Status
 inline Status FromStreamExecutorStatus(
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index f31e556a70..006184ddef 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -37,4 +37,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(jlebar): Remove this once we've completed
+// the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
 #endif  // TENSORFLOW_PLATFORM_STREAM_EXECUTOR_H_
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index 6308e58847..f2471712cc 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -60,4 +60,12 @@ typedef uint64 Fprint;
 
 }  // namespace tensorflow
 
+// Alias namespace ::stream_executor as ::tensorflow::se.
+namespace stream_executor {}
+namespace tensorflow {
+// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
+// removed in ::xla.
+// namespace se = ::stream_executor;
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_PLATFORM_TYPES_H_
diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc
index 31724cf6c9..906d6fb702 100644
--- a/tensorflow/stream_executor/blas.cc
+++ b/tensorflow/stream_executor/blas.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace blas {
 
 string TransposeString(Transpose t) {
@@ -95,5 +94,4 @@ std::ostream& operator<<(std::ostream& os, ComputationType ty) {
 }
 
 }  // namespace blas
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index c5f778a5c7..6e62b85728 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -49,8 +49,7 @@ namespace Eigen {
 struct half;
 }  // namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 class ScratchAllocator;
@@ -2100,7 +2099,6 @@ class BlasSupport {
                   DeviceMemory<std::complex<double>> *b, int ldb) override;
 
 }  // namespace blas
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/cuda/cuda_activation.cc
index 5f4cf9dbd7..cf6b9e2c6e 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/cuda/cuda_activation.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
@@ -40,5 +39,4 @@ ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index c9d43a9766..04ffaef364 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -25,8 +25,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class StreamExecutor;
 
@@ -56,7 +55,6 @@ class ScopedActivateExecutorContext {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 1c550dbb13..007c0f1c86 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -75,15 +75,14 @@ limitations under the License.
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     static const char *kName;                                       \
     template <typename... Args>                                     \
@@ -94,8 +93,8 @@ namespace wrap {
   } __name;                                                         \
   const char *WrapperShim__##__name::kName = #__name;
 
-#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \
-  PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
 
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
   __macro(cublasSnrm2)                    \
@@ -269,28 +268,28 @@ namespace wrap {
   __macro(cublasCdgmm)                    \
   __macro(cublasZdgmm)
 
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetPointerMode)
+STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasGetPointerMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
+CUBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
 
 #if CUDA_VERSION >= 7050
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmEx)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
 #endif
 
 #if CUDA_VERSION >= 8000
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGemmEx)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmEx)
 #endif
 
 #if CUDA_VERSION >= 9000
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGetMathMode)
-PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSetMathMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
 #endif
 
 }  // namespace wrap
@@ -2803,46 +2802,39 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cublas() {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::BlasFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::blas::BlasSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuBLAS "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CUDABlas *blas =
-                    new gpu::cuda::CUDABlas(cuda_executor);
-                if (!blas->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete blas;
-                  return nullptr;
-                }
-                return blas;
-              });
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
+          cuda::kCudaPlatformId, cuda::kCuBlasPlugin, "cuBLAS",
+          [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuBLAS "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CUDABlas *blas = new cuda::CUDABlas(cuda_executor);
+            if (!blas->Init()) {
+              // Note: Init() will log a more specific error.
+              delete blas;
+              return nullptr;
+            }
+            return blas;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuBLAS factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kBlas,
-                                                     gpu::cuda::kCuBlasPlugin);
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kBlas, cuda::kCuBlasPlugin);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(register_cublas,
-                            { perftools::gputools::initialize_cublas(); });
+                            { stream_executor::initialize_cublas(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index deb211c04b..55c414a1f9 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -29,8 +29,7 @@ limitations under the License.
 
 typedef struct cublasContext *cublasHandle_t;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -162,7 +161,6 @@ class CUDABlas : public blas::BlasSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 933c103f52..feb529297e 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -51,8 +51,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 #ifdef __APPLE__
@@ -384,5 +383,4 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index aa68321acc..f2db2eb20a 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // e.g. DriverVersion{346, 3, 4}
@@ -93,7 +92,6 @@ class Diagnostician {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 4a6b2bf5d7..d673e19007 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -59,8 +59,7 @@ NarrowT CheckedNarrowing(const WideT& wide) {
 
 }  // namespace
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 using dnn::BatchDescriptor;
 using dnn::FilterDescriptor;
@@ -159,7 +158,7 @@ static port::ThreadPool* GetCudaThreadpool() {
   return cudnn_threadpool;
 }
 
-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                         \
   struct WrapperShim__##__name {                                   \
     template <typename... Args>                                    \
     cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) { \
@@ -169,7 +168,7 @@ static port::ThreadPool* GetCudaThreadpool() {
     }                                                              \
   } __name;
 
-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)        \
+#define STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)           \
   struct WrapperShim__##__name {                                         \
     template <typename... Args>                                          \
     cudnnStatus_t operator()(CudnnSupport* dnn, Stream* s, Args... args) \
@@ -220,7 +219,7 @@ struct WrapperShim__cudnnSetStream {
   __macro(cudnnSetFilterNdDescriptor)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH
 
 // clang-format off
@@ -242,7 +241,7 @@ CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_WITH_STREAM
 
 // APIs available after R3:
@@ -252,7 +251,7 @@ CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
   __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
   __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
   __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
 #endif
 
@@ -266,7 +265,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 // clang-format on
 
 CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM
 #endif
 
@@ -293,7 +292,7 @@ CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
   __macro(cudnnGetFilterNdDescriptor)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R5(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R5
 
 // clang-format off
@@ -305,7 +304,7 @@ CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM
 #endif
 
@@ -316,7 +315,7 @@ CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
   __macro(cudnnSetRNNDescriptor_v6)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R6(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R6
 
 // clang-format off
@@ -325,7 +324,7 @@ CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
-    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+    STREAM_EXECUTOR_CUDNN_WRAP_WITH_CHECKED_STREAM)
 #undef CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM
 #endif
 
@@ -337,7 +336,7 @@ CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
   __macro(cudnnSetRNNMatrixMathType)
 
 // clang-format on
-CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH_R7(STREAM_EXECUTOR_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R7
 #endif
 
@@ -4727,46 +4726,39 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cudnn() {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::DnnFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuDnnPlugin, "cuDNN",
-              [](gpu::internal::StreamExecutorInterface*
-                     parent) -> gpu::dnn::DnnSupport* {
-                gpu::cuda::CUDAExecutor* cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor*>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuBLAS "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CudnnSupport* dnn =
-                    new gpu::cuda::CudnnSupport(cuda_executor);
-                if (!dnn->Init().ok()) {
-                  // Note: Init() will log a more specific error.
-                  delete dnn;
-                  return nullptr;
-                }
-                return dnn;
-              });
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
+          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
+          [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
+            cuda::CUDAExecutor* cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor*>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuBLAS "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
+            if (!dnn->Init().ok()) {
+              // Note: Init() will log a more specific error.
+              delete dnn;
+              return nullptr;
+            }
+            return dnn;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuDNN factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kDnn,
-                                                     gpu::cuda::kCuDnnPlugin);
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(register_cudnn,
-                            { perftools::gputools::initialize_cudnn(); });
+                            { stream_executor::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 7518b23757..e6d12bfef9 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -810,7 +809,6 @@ class CudnnSupport : public dnn::DnnSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 58e1e58c59..fedf4f53b8 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -53,8 +53,7 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
 // matches the expected one.
 constexpr bool kVerifyCudaContext = false;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 namespace {
@@ -1649,5 +1648,4 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index fa9172b3f0..a9969e247e 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "cuda/include/cuda.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Identifies the memory space where an allocation resides. See
@@ -506,7 +505,6 @@ class CudaContext {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc
index 1b41502300..96dcf17356 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.cc
+++ b/tensorflow/stream_executor/cuda/cuda_event.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 CUDAEvent::CUDAEvent(CUDAExecutor* parent)
@@ -68,5 +67,4 @@ const CUevent& CUDAEvent::cuda_event() {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index 56667e65d3..f62344672e 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // CUDAEvent wraps a CUevent in the platform-independent EventInterface
@@ -58,7 +57,6 @@ class CUDAEvent : public internal::EventInterface {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index a922f14fb4..5b34740f9f 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
@@ -44,7 +43,7 @@ namespace wrap {
 // manner on first use. This dynamic loading technique is used to avoid DSO
 // dependencies on vendor libraries which may or may not be available in the
 // deployed binary environment.
-#define PERFTOOLS_GPUTOOLS_CUFFT_WRAP(__name)                    \
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                       \
   struct WrapperShim__##__name {                                 \
     template <typename... Args>                                  \
     cufftResult operator()(CUDAExecutor *parent, Args... args) { \
@@ -68,7 +67,7 @@ namespace wrap {
                                               __macro(cufftGetSizeMany)        \
                                                   __macro(cufftMakePlanMany)
 
-CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP)
+CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
 
 }  // namespace wrap
 
@@ -514,62 +513,59 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
   return true;
 }
 
-#define PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2, \
-                                           __fft_type3)                      \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<std::complex<__type>> &input,       \
-                      DeviceMemory<std::complex<__type>> *output) {          \
-    return DoFftWithDirectionInternal(                                       \
-        stream, plan, wrap::cufftExec##__fft_type1, input, output);          \
-  }                                                                          \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<__type> &input,                     \
-                      DeviceMemory<std::complex<__type>> *output) {          \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input,  \
-                         output);                                            \
-  }                                                                          \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                       \
-                      const DeviceMemory<std::complex<__type>> &input,       \
-                      DeviceMemory<__type> *output) {                        \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input,  \
-                         output);                                            \
+#define STREAM_EXECUTOR_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2,   \
+                                        __fft_type3)                        \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<std::complex<__type>> &input,      \
+                      DeviceMemory<std::complex<__type>> *output) {         \
+    return DoFftWithDirectionInternal(                                      \
+        stream, plan, wrap::cufftExec##__fft_type1, input, output);         \
+  }                                                                         \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<__type> &input,                    \
+                      DeviceMemory<std::complex<__type>> *output) {         \
+    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input, \
+                         output);                                           \
+  }                                                                         \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
+                      const DeviceMemory<std::complex<__type>> &input,      \
+                      DeviceMemory<__type> *output) {                       \
+    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input, \
+                         output);                                           \
   }
 
-PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
-PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+STREAM_EXECUTOR_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
+STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
 
-#undef PERFTOOLS_GPUTOOLS_CUDA_DEFINE_FFT
+#undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
-
-namespace gpu = ::perftools::gputools;
-
-REGISTER_MODULE_INITIALIZER(register_cufft, {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::FftFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuFftPlugin, "cuFFT",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::fft::FftSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuFFT "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                return new gpu::cuda::CUDAFft(cuda_executor);
-              });
+
+void initialize_cufft() {
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
+          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
+          [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
+                         << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            return new cuda::CUDAFft(cuda_executor);
+          });
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuFFT factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kFft,
-                                                     gpu::cuda::kCuFftPlugin);
-});
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_cufft,
+                            { stream_executor::initialize_cufft(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 04c7dfe501..8171e61418 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -26,8 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -133,7 +132,6 @@ class CUDAFft : public fft::FftSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 5ecaf46b8c..9700daca89 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -66,8 +66,7 @@ limitations under the License.
 extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
@@ -1168,17 +1167,14 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
 }  // namespace cuda
 
-namespace gpu = ::perftools::gputools;
-
 void initialize_cuda_gpu_executor() {
-  *gpu::internal::MakeCUDAExecutorImplementation() = [](
-      const gpu::PluginConfig &config) {
-    return new gpu::cuda::CUDAExecutor{config};
+  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
+    return new cuda::CUDAExecutor{config};
   };
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(
-    cuda_gpu_executor, {perftools::gputools::initialize_cuda_gpu_executor();});
+REGISTER_MODULE_INITIALIZER(cuda_gpu_executor, {
+  stream_executor::initialize_cuda_gpu_executor();
+});
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index dbbbcd476f..f686685474 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -35,8 +35,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // CUDA-platform implementation of the platform-agnostic
@@ -273,7 +272,6 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index 6a6134bf88..d55706c66a 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "cuda/include/cuComplex.h"
 #include "cuda/include/cuda.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 template <typename ElemT>
 class DeviceMemory;
@@ -101,7 +100,6 @@ inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index 88d29fddd0..beaebe8f12 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -40,8 +40,7 @@ limitations under the License.
     "CUDA runtime being included into CUDA GPU executor; should be driver only."
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Wraps a CUfunction to implement the platform-independent KernelInterface.
@@ -124,7 +123,6 @@ inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 3a73846148..7a6ef5a248 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 namespace {
 
@@ -41,16 +40,16 @@ const DeviceOptions GetDeviceOptionsFromEnv() {
       std::getenv("TF_CUDA_PLATFORM_GPU_DEVICE_SCHEDULE");
 
   if (gpu_schedule_string == nullptr) {
-    return perftools::gputools::DeviceOptions::Default();
+    return DeviceOptions::Default();
   }
 
   unsigned device_flags = 0;
   if (strcmp(kScheduleSpinString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleSpin;
+    device_flags = DeviceOptions::kScheduleSpin;
   } else if (strcmp(kScheduleYieldString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleYield;
+    device_flags = DeviceOptions::kScheduleYield;
   } else if (strcmp(kScheduleBlockingSyncString, gpu_schedule_string) == 0) {
-    device_flags = perftools::gputools::DeviceOptions::kScheduleBlockingSync;
+    device_flags = DeviceOptions::kScheduleBlockingSync;
   } else {
     LOG(QFATAL) << "Unknown option for environment variable "
                    "TF_CUDA_PLATFORM_GPU_DEVICE_SCHEDULE "
@@ -59,7 +58,7 @@ const DeviceOptions GetDeviceOptionsFromEnv() {
                 << ", " << kScheduleYieldString << "}";
   }
 
-  return perftools::gputools::DeviceOptions(device_flags);
+  return DeviceOptions(device_flags);
 }
 
 }  // namespace
@@ -202,11 +201,10 @@ static void InitializeCudaPlatform() {
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(cuda_platform,
-                            perftools::gputools::InitializeCudaPlatform());
+                            stream_executor::InitializeCudaPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index dab25602d0..fc0e15d5a6 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Opaque and unique identifier for the CUDA platform plugin.
@@ -104,7 +103,6 @@ class CudaPlatform : public Platform {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_platform_id.cc b/tensorflow/stream_executor/cuda/cuda_platform_id.cc
index dfd11a9abe..a7bb304cc8 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform_id.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform_id.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLATFORM_DEFINE_ID(kCudaPlatformId);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_platform_id.h b/tensorflow/stream_executor/cuda/cuda_platform_id.h
index c677724517..92bcfd8372 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform_id.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform_id.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 // Opaque and unique identifier for the cuda platform.
@@ -30,7 +29,6 @@ namespace cuda {
 extern const Platform::Id kCudaPlatformId;
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 8641b60227..e289e7ced5 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -54,15 +54,14 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
   }
 }
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
 
 namespace wrap {
 
-#define PERFTOOLS_GPUTOOLS_CURAND_WRAP(__name)                      \
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
     curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
@@ -71,15 +70,15 @@ namespace wrap {
     }                                                               \
   } __name;
 
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandCreateGenerator);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandDestroyGenerator);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetStream);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniform);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateUniformDouble);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandSetGeneratorOffset);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormal);
-PERFTOOLS_GPUTOOLS_CURAND_WRAP(curandGenerateNormalDouble);
+STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
+STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniform);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniformDouble);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
+STREAM_EXECUTOR_CURAND_WRAP(curandSetGeneratorOffset);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormal);
+STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
 
 }  // namespace wrap
 
@@ -271,42 +270,40 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
-
-namespace gpu = ::perftools::gputools;
-
-REGISTER_MODULE_INITIALIZER(register_curand, {
-  gpu::port::Status status =
-      gpu::PluginRegistry::Instance()
-          ->RegisterFactory<gpu::PluginRegistry::RngFactory>(
-              gpu::cuda::kCudaPlatformId, gpu::cuda::kCuRandPlugin, "cuRAND",
-              [](gpu::internal::StreamExecutorInterface
-                     *parent) -> gpu::rng::RngSupport * {
-                gpu::cuda::CUDAExecutor *cuda_executor =
-                    dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
-                if (cuda_executor == nullptr) {
-                  LOG(ERROR)
-                      << "Attempting to initialize an instance of the cuRAND "
-                      << "support library with a non-CUDA StreamExecutor";
-                  return nullptr;
-                }
-
-                gpu::cuda::CUDARng *rng = new gpu::cuda::CUDARng(cuda_executor);
-                if (!rng->Init()) {
-                  // Note: Init() will log a more specific error.
-                  delete rng;
-                  return nullptr;
-                }
-                return rng;
-              });
+
+void initialize_curand() {
+  port::Status status =
+      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
+          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
+          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
+            cuda::CUDAExecutor *cuda_executor =
+                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            if (cuda_executor == nullptr) {
+              LOG(ERROR)
+                  << "Attempting to initialize an instance of the cuRAND "
+                  << "support library with a non-CUDA StreamExecutor";
+              return nullptr;
+            }
+
+            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
+            if (!rng->Init()) {
+              // Note: Init() will log a more specific error.
+              delete rng;
+              return nullptr;
+            }
+            return rng;
+          });
 
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuRAND factory: "
                << status.error_message();
   }
 
-  gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
-                                                     gpu::PluginKind::kRng,
-                                                     gpu::cuda::kCuRandPlugin);
-});
+  PluginRegistry::Instance()->SetDefaultFactory(
+      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_curand,
+                            { stream_executor::initialize_curand(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h
index 5bbfd0b37a..57ef398aaa 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.h
+++ b/tensorflow/stream_executor/cuda/cuda_rng.h
@@ -24,8 +24,7 @@ limitations under the License.
 
 typedef struct curandGenerator_st *curandGenerator_t;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -98,7 +97,6 @@ class CUDARng : public rng::RngSupport {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/cuda/cuda_stream.cc
index 3eb37a7d84..b5aa7694f7 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.cc
+++ b/tensorflow/stream_executor/cuda/cuda_stream.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool CUDAStream::Init() {
@@ -59,5 +58,4 @@ CUstream AsCUDAStreamValue(Stream *stream) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
index 7358243dc4..02edff6431 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.h
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -89,7 +88,6 @@ CUDAStream *AsCUDAStream(Stream *stream);
 CUstream AsCUDAStreamValue(Stream *stream);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
index 8532f08725..991a12a23d 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool CUDATimer::Init() {
@@ -92,5 +91,4 @@ bool CUDATimer::Stop(CUDAStream* stream) {
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 2abc55ec94..70554ec931 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 class CUDAExecutor;
@@ -60,13 +59,13 @@ class CUDATimer : public internal::TimerInterface {
   // events.
   float GetElapsedMilliseconds() const;
 
-  // See perftools::gputools::Timer::Microseconds().
+  // See Timer::Microseconds().
   // TODO(leary) make this into an error code interface...
   uint64 Microseconds() const override {
     return GetElapsedMilliseconds() * 1e3;
   }
 
-  // See perftools::GPUTools::Timer::Nanoseconds().
+  // See Timer::Nanoseconds().
   uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
 
  private:
@@ -85,7 +84,6 @@ struct TimerDeleter {
 };
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
index 5591801aae..e8fcc03618 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
@@ -38,5 +37,4 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
 }
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 2ed02e1700..6464e7f8e8 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 
 struct CudnnVersion {
@@ -46,7 +45,6 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version);
 
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 42b3dc8cc6..7d4c6399d0 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/test.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace cuda {
 namespace {
 
@@ -70,5 +69,4 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
 
 }  // namespace
 }  // namespace cuda
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 52f5319a3b..8ca0677f8a 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 static const uint64 kUninitializedUint64 = -1ULL;
 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
@@ -234,6 +233,4 @@ uint64 CalculateRegisterLimitForTargetOccupancy(
   return 0;
 }
 
-
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index fcf0928096..7f99d81ef3 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 class DeviceDescriptionBuilder;
 }  // namespace internal
@@ -388,7 +387,6 @@ uint64 CalculateRegisterLimitForTargetOccupancy(
     const DeviceDescription &device_description, uint64 shared_memory_per_block,
     const ThreadDim &thread_dims, uint64 target_blocks_per_core);
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
diff --git a/tensorflow/stream_executor/device_memory.h b/tensorflow/stream_executor/device_memory.h
index 4c92b7dc78..5a5334e0f5 100644
--- a/tensorflow/stream_executor/device_memory.h
+++ b/tensorflow/stream_executor/device_memory.h
@@ -32,6 +32,16 @@ limitations under the License.
 namespace perftools {
 namespace gputools {
 
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
+namespace stream_executor {
+
 class StreamExecutor;
 
 // void*-analogous device memory allocation. For the typed variation, see
@@ -280,7 +290,6 @@ static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed");
 static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed");
 static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/device_options.h b/tensorflow/stream_executor/device_options.h
index 169325e7d1..2646950f42 100644
--- a/tensorflow/stream_executor/device_options.h
+++ b/tensorflow/stream_executor/device_options.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Indicates a set of options for a device's usage, which generally must be
 // provided at StreamExecutor device-initialization time.
@@ -84,7 +83,6 @@ struct DeviceOptions {
   unsigned flags_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_OPTIONS_H_
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 0a3c4bcf50..6edb572820 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -18,8 +18,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace dnn {
 
 bool DnnSupport::GetConvolveAlgorithms(
@@ -554,5 +553,4 @@ string NormalizeDescriptor::ToShortString() const {
 }
 
 }  // namespace dnn
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 3c47d2c2e8..8e202d115a 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -38,8 +38,7 @@ namespace Eigen {
 struct half;
 }  // namespace Eigen
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class HostBuffer;
 class Stream;
@@ -2285,7 +2284,6 @@ class DnnSupport {
 };
 
 }  // namespace dnn
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 9516883627..114143b3ab 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -37,8 +37,7 @@ limitations under the License.
 #include "cuda/cuda_config.h"
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 string GetCudaVersion() { return TF_CUDA_VERSION; }
@@ -291,5 +290,4 @@ static std::vector<string>* CreatePrimordialRpaths() {
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
index 354c7b50b8..9ee081cb3d 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // Permits StreamExecutor code to dynamically load a pre-determined set of
@@ -114,7 +113,6 @@ class CachedDsoLoader {
 };
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/event.cc b/tensorflow/stream_executor/event.cc
index c423a453e9..50a6edd80b 100644
--- a/tensorflow/stream_executor/event.cc
+++ b/tensorflow/stream_executor/event.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 Event::Event(StreamExecutor* stream_exec)
     : stream_exec_(stream_exec),
@@ -48,5 +47,4 @@ Event::Status Event::PollForStatus() {
   return stream_exec_->PollForEventStatus(this);
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/event.h b/tensorflow/stream_executor/event.h
index a06c26ea51..1f37262c78 100644
--- a/tensorflow/stream_executor/event.h
+++ b/tensorflow/stream_executor/event.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class EventInterface;
@@ -76,7 +75,6 @@ class Event {
   SE_DISALLOW_COPY_AND_ASSIGN(Event);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_EVENT_H_
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index d1a8aae167..0b3ad7ebbc 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
@@ -104,5 +103,4 @@ ExecutorCache::Entry::~Entry() {
   configurations.clear();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/executor_cache.h b/tensorflow/stream_executor/executor_cache.h
index 12f2275f6d..bbeeaed787 100644
--- a/tensorflow/stream_executor/executor_cache.h
+++ b/tensorflow/stream_executor/executor_cache.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Utility class to allow Platform objects to manage cached StreamExecutors.
 // Thread-safe.
@@ -76,7 +75,6 @@ class ExecutorCache {
   SE_DISALLOW_COPY_AND_ASSIGN(ExecutorCache);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_CACHE_H_
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
index 6b1728829a..814efb2e92 100644
--- a/tensorflow/stream_executor/fft.h
+++ b/tensorflow/stream_executor/fft.h
@@ -48,8 +48,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -210,7 +209,7 @@ class FftSupport {
 
 // Macro used to quickly declare overrides for abstract virtuals in the
 // fft::FftSupport base class. Assumes that it's emitted somewhere inside the
-// ::perftools::gputools namespace.
+// ::stream_executor namespace.
 #define TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES                   \
   std::unique_ptr<fft::Plan> Create1dPlan(Stream *stream, uint64 num_x,        \
                                           fft::Type type, bool in_place_fft)   \
@@ -265,7 +264,6 @@ class FftSupport {
              DeviceMemory<double> *output) override;
 
 }  // namespace fft
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_FFT_H_
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 542f521ef7..2c4819651a 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -28,8 +28,7 @@ limitations under the License.
 
 bool FLAGS_stream_executor_cpu_real_clock_rate = false;
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostStream *AsHostStream(Stream *stream) {
@@ -266,5 +265,4 @@ rng::RngSupport *HostExecutor::CreateRng() {
 }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index e2c0e6d6b7..0c3991c151 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // An implementation of StreamExecutor that does no communication or interaction
@@ -210,7 +209,6 @@ class HostExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index 2cb7d36967..00a17a05ed 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -26,10 +26,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace gpu = ::perftools::gputools;
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostPlatform::HostPlatform() : name_("Host") {}
@@ -93,16 +90,15 @@ void HostPlatform::UnregisterTraceListener(TraceListener* listener) {
 }
 
 static void InitializeHostPlatform() {
-  std::unique_ptr<gpu::Platform> platform(new gpu::host::HostPlatform);
-  SE_CHECK_OK(gpu::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  std::unique_ptr<Platform> platform(new host::HostPlatform);
+  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(
-    host_platform, perftools::gputools::host::InitializeHostPlatform());
+REGISTER_MODULE_INITIALIZER(host_platform,
+                            stream_executor::host::InitializeHostPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 // Note that module initialization sequencing is not supported in the
diff --git a/tensorflow/stream_executor/host/host_platform.h b/tensorflow/stream_executor/host/host_platform.h
index 0faec6c8b7..c6f46a2cc4 100644
--- a/tensorflow/stream_executor/host/host_platform.h
+++ b/tensorflow/stream_executor/host/host_platform.h
@@ -33,8 +33,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // Host (CPU) platform plugin, registered as a singleton value via module
@@ -79,7 +78,6 @@ class HostPlatform : public Platform {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_H_
diff --git a/tensorflow/stream_executor/host/host_platform_id.cc b/tensorflow/stream_executor/host/host_platform_id.cc
index 69a203f298..2256bccec3 100644
--- a/tensorflow/stream_executor/host/host_platform_id.cc
+++ b/tensorflow/stream_executor/host/host_platform_id.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/host/host_platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 PLATFORM_DEFINE_ID(kHostPlatformId);
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_platform_id.h b/tensorflow/stream_executor/host/host_platform_id.h
index 61d84ea2e2..18d1f282f1 100644
--- a/tensorflow/stream_executor/host/host_platform_id.h
+++ b/tensorflow/stream_executor/host/host_platform_id.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 // Opaque and unique identifier for the host platform.
@@ -30,7 +29,6 @@ namespace host {
 extern const Platform::Id kHostPlatformId;
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc
index 5961c31516..5a7d3b3dd4 100644
--- a/tensorflow/stream_executor/host/host_stream.cc
+++ b/tensorflow/stream_executor/host/host_stream.cc
@@ -17,8 +17,7 @@ limitations under the License.
 // the HostExecutor implementation.
 #include "tensorflow/stream_executor/host/host_stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 HostStream::HostStream()
@@ -53,5 +52,4 @@ void HostStream::BlockUntilDone() {
 
 }  // namespace host
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h
index 9894d17feb..5d7b8a3782 100644
--- a/tensorflow/stream_executor/host/host_stream.h
+++ b/tensorflow/stream_executor/host/host_stream.h
@@ -24,8 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 class HostStream : public internal::StreamInterface {
@@ -52,7 +51,6 @@ class HostStream : public internal::StreamInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_STREAM_H_
diff --git a/tensorflow/stream_executor/host/host_timer.cc b/tensorflow/stream_executor/host/host_timer.cc
index d84d825c92..e138daf0e1 100644
--- a/tensorflow/stream_executor/host/host_timer.cc
+++ b/tensorflow/stream_executor/host/host_timer.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 using std::chrono::duration_cast;
@@ -46,5 +45,4 @@ void HostTimer::StartNow() { start_time_ = clock::now(); }
 void HostTimer::StopNow() { duration_ = clock::now() - start_time_; }
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/host/host_timer.h b/tensorflow/stream_executor/host/host_timer.h
index 17af7c0521..5954b8023b 100644
--- a/tensorflow/stream_executor/host/host_timer.h
+++ b/tensorflow/stream_executor/host/host_timer.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace host {
 
 class HostTimer : public internal::TimerInterface {
@@ -57,7 +56,6 @@ class HostTimer : public internal::TimerInterface {
 };
 
 }  // namespace host
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_HOST_TIMER_H_
diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h
index 8fa542e9ff..20299da517 100644
--- a/tensorflow/stream_executor/host_buffer.h
+++ b/tensorflow/stream_executor/host_buffer.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/dnn.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // A HostBuffer is a block of memory in host memory containing the data for a
 // dnn::BatchDescriptor using a device-dependent memory layout.
@@ -42,7 +41,6 @@ class HostBuffer {
   const dnn::BatchDescriptor descriptor_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index 636199cfa2..d1aa596b73 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -27,8 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 bool KernelMetadata::registers_per_thread(int *registers_per_thread) const {
   if (has_registers_per_thread_) {
@@ -103,5 +102,4 @@ void KernelBase::set_name(port::StringPiece name) {
   demangled_name_ = port::Demangle(stubless_name.data());
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 5358eac1ae..2216884b87 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -64,7 +64,7 @@ limitations under the License.
 //
 // Users typically won't need to type out the TypedKernel signature in full, it
 // will be typedef'd by automatically generated code; for example, see
-// perftools::gputools::executor_sample::VecReduceAddKernel.
+// stream_executor::executor_sample::VecReduceAddKernel.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
@@ -82,8 +82,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class DeviceMemoryBase;
 template <typename ElemT>
@@ -639,8 +638,8 @@ struct KernelInvocationChecker {
   // NOTE: if you encounter an error here, you can see the mismatch by looking
   // at the end of the last error message, which will be of the form:
   //
-  //    ...::Compatible<const perftools::gputools::DeviceMemory<OneThing> &,
-  //                    perftools::gputools::DeviceMemory<AnotherThing>, true,
+  //    ...::Compatible<const stream_executor::DeviceMemory<OneThing> &,
+  //                    stream_executor::DeviceMemory<AnotherThing>, true,
   //                    0>'
   //    requested here
   //
@@ -711,7 +710,6 @@ struct KernelParamsOk<TypedKernel<Params...>, Args...> {
       std::tuple<Params...>, std::tuple<Args...>>::CheckAllNoStaticAssert();
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_H_
diff --git a/tensorflow/stream_executor/kernel_cache_config.h b/tensorflow/stream_executor/kernel_cache_config.h
index 9d7ab1b79f..e63d6c6a0c 100644
--- a/tensorflow/stream_executor/kernel_cache_config.h
+++ b/tensorflow/stream_executor/kernel_cache_config.h
@@ -18,8 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // This enum represents potential configurations of L1/shared memory when
 // running a particular kernel. These values represent user preference, and
@@ -38,7 +37,6 @@ enum class KernelCacheConfig {
   kPreferEqual,
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_CACHE_CONFIG_H_
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
index 0404c573f0..6a1f0a591f 100644
--- a/tensorflow/stream_executor/kernel_spec.cc
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -15,9 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/kernel_spec.h"
 
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
     : kernelname_(kernelname.ToString()) {}
@@ -247,5 +245,4 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
 
 MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
index 3811bd833e..7cc23bb4e6 100644
--- a/tensorflow/stream_executor/kernel_spec.h
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -56,8 +56,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Describes how to load a kernel on a target platform.
 //
@@ -374,7 +373,6 @@ class MultiKernelLoaderSpec {
   size_t arity_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
diff --git a/tensorflow/stream_executor/launch_dim.h b/tensorflow/stream_executor/launch_dim.h
index b95462667e..68f2f74840 100644
--- a/tensorflow/stream_executor/launch_dim.h
+++ b/tensorflow/stream_executor/launch_dim.h
@@ -21,7 +21,7 @@ limitations under the License.
 // a single PC in a unit called a warp. There is a maximum number of threads
 // that can execute in a shared-context entity called a block. Presently, that
 // number is 1024 -- again, something that should not be relied on from this
-// comment, but checked via perftools::gputools::DeviceDescription.
+// comment, but checked via stream_executor::DeviceDescription.
 //
 // For additional information, see
 // http://docs.nvidia.com/cuda/kepler-tuning-guide/#device-utilization-and-occupancy
@@ -40,8 +40,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Basic type that represents a 3-dimensional index space.
 struct Dim3D {
@@ -74,7 +73,6 @@ struct BlockDim : public Dim3D {
   }
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LAUNCH_DIM_H_
diff --git a/tensorflow/stream_executor/lib/array_slice.h b/tensorflow/stream_executor/lib/array_slice.h
index bef61bb2fc..8e3c4ca047 100644
--- a/tensorflow/stream_executor/lib/array_slice.h
+++ b/tensorflow/stream_executor/lib/array_slice.h
@@ -18,14 +18,23 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::gtl::ArraySlice;
 using tensorflow::gtl::MutableArraySlice;
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
diff --git a/tensorflow/stream_executor/lib/casts.h b/tensorflow/stream_executor/lib/casts.h
index 2261944e25..ec562e804f 100644
--- a/tensorflow/stream_executor/lib/casts.h
+++ b/tensorflow/stream_executor/lib/casts.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
 
 #include <stdlib.h>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // port::bit_cast<Dest,Source> is a template function that implements the
@@ -96,7 +95,6 @@ inline Dest bit_cast(const Source& source) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
diff --git a/tensorflow/stream_executor/lib/demangle.cc b/tensorflow/stream_executor/lib/demangle.cc
index fa2b4fa005..adb6b4f2d1 100644
--- a/tensorflow/stream_executor/lib/demangle.cc
+++ b/tensorflow/stream_executor/lib/demangle.cc
@@ -27,8 +27,7 @@ limitations under the License.
 #include <cxxabi.h>
 #endif
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // The API reference of abi::__cxa_demangle() can be found in
@@ -49,5 +48,4 @@ string Demangle(const char *mangled) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/demangle.h b/tensorflow/stream_executor/lib/demangle.h
index 30be522557..af16fa7d8c 100644
--- a/tensorflow/stream_executor/lib/demangle.h
+++ b/tensorflow/stream_executor/lib/demangle.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Demangle(const char* mangled);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_DEMANGLE_H_
diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h
index c9a22ebd55..776eba0408 100644
--- a/tensorflow/stream_executor/lib/env.h
+++ b/tensorflow/stream_executor/lib/env.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::Env;
@@ -37,7 +36,6 @@ inline Status FileExists(const port::StringPiece& filename) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
diff --git a/tensorflow/stream_executor/lib/error.h b/tensorflow/stream_executor/lib/error.h
index 89df70cb5e..c659f5fc14 100644
--- a/tensorflow/stream_executor/lib/error.h
+++ b/tensorflow/stream_executor/lib/error.h
@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"  // IWYU pragma: export
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 namespace error = tensorflow::error;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_ERROR_H_
diff --git a/tensorflow/stream_executor/lib/human_readable.h b/tensorflow/stream_executor/lib/human_readable.h
index f918c180d9..893865f6da 100644
--- a/tensorflow/stream_executor/lib/human_readable.h
+++ b/tensorflow/stream_executor/lib/human_readable.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class HumanReadableNumBytes {
@@ -67,7 +66,6 @@ class HumanReadableNumBytes {
 };
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_HUMAN_READABLE_H_
diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h
index 9a09318a6c..688b021469 100644
--- a/tensorflow/stream_executor/lib/initialize.h
+++ b/tensorflow/stream_executor/lib/initialize.h
@@ -26,8 +26,7 @@ limitations under the License.
 #undef DECLARE_MODULE_INITIALIZER
 #undef REGISTER_MODULE_INITIALIZER_SEQUENCE
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class Initializer {
@@ -49,20 +48,18 @@ class Initializer {
 };
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-#define REGISTER_INITIALIZER(type, name, body)                               \
-  static void google_init_##type##_##name() { body; }                        \
-  perftools::gputools::port::Initializer google_initializer_##type##_##name( \
+#define REGISTER_INITIALIZER(type, name, body)                             \
+  static void google_init_##type##_##name() { body; }                      \
+  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
       google_init_##type##_##name)
 
 #define REGISTER_MODULE_INITIALIZER(name, body) \
   REGISTER_INITIALIZER(module, name, body)
 
-#define DECLARE_INITIALIZER(type, name)         \
-  extern perftools::gputools::port::Initializer \
-      google_initializer_##type##_##name
+#define DECLARE_INITIALIZER(type, name) \
+  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
 
 #define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
 
diff --git a/tensorflow/stream_executor/lib/inlined_vector.h b/tensorflow/stream_executor/lib/inlined_vector.h
index 55a1e3ad10..40bdddb180 100644
--- a/tensorflow/stream_executor/lib/inlined_vector.h
+++ b/tensorflow/stream_executor/lib/inlined_vector.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::gtl::InlinedVector;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_
diff --git a/tensorflow/stream_executor/lib/mathutil.h b/tensorflow/stream_executor/lib/mathutil.h
index e8310d55dd..c225dc5f3c 100644
--- a/tensorflow/stream_executor/lib/mathutil.h
+++ b/tensorflow/stream_executor/lib/mathutil.h
@@ -25,8 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 class MathUtil {
@@ -97,7 +96,6 @@ IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_MATHUTIL_H_
diff --git a/tensorflow/stream_executor/lib/notification.h b/tensorflow/stream_executor/lib/notification.h
index 9bb3e170dc..472d8c9845 100644
--- a/tensorflow/stream_executor/lib/notification.h
+++ b/tensorflow/stream_executor/lib/notification.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/platform/notification.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::Notification;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NOTIFICATION_H_
diff --git a/tensorflow/stream_executor/lib/numbers.cc b/tensorflow/stream_executor/lib/numbers.cc
index 11a65e198d..b670c42ec8 100644
--- a/tensorflow/stream_executor/lib/numbers.cc
+++ b/tensorflow/stream_executor/lib/numbers.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include <stdlib.h>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 bool safe_strto32(const char* str, int32* value) {
@@ -38,5 +37,4 @@ bool safe_strto32(const string& str, int32* value) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/numbers.h b/tensorflow/stream_executor/lib/numbers.h
index 4a8692b746..2f48281d2d 100644
--- a/tensorflow/stream_executor/lib/numbers.h
+++ b/tensorflow/stream_executor/lib/numbers.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Convert strings to floating point values.
@@ -28,7 +27,6 @@ namespace port {
 bool safe_strto32(const string& str, int32* value);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_NUMBERS_H_
diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc
index f2591f47f7..56e08c316f 100644
--- a/tensorflow/stream_executor/lib/path.cc
+++ b/tensorflow/stream_executor/lib/path.cc
@@ -16,8 +16,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 namespace internal {
 
@@ -58,5 +57,4 @@ string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
 
 }  // namespace internal
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/path.h b/tensorflow/stream_executor/lib/path.h
index 93053dbcb6..325f04ff47 100644
--- a/tensorflow/stream_executor/lib/path.h
+++ b/tensorflow/stream_executor/lib/path.h
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::io::Dirname;
@@ -56,7 +55,6 @@ inline string JoinPath(const T&... args) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
index 3d856187f0..72d71e6211 100644
--- a/tensorflow/stream_executor/lib/process_state.cc
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -25,8 +25,7 @@ limitations under the License.
 #endif
 #include <memory>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Hostname() {
@@ -54,5 +53,4 @@ bool GetCurrentDirectory(string* dir) {
 }
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/lib/process_state.h b/tensorflow/stream_executor/lib/process_state.h
index 205e726d95..248218c759 100644
--- a/tensorflow/stream_executor/lib/process_state.h
+++ b/tensorflow/stream_executor/lib/process_state.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 string Hostname();
 bool GetCurrentDirectory(string* dir);
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PROCESS_STATE_H_
diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h
index 3d5e56faf7..3f89794688 100644
--- a/tensorflow/stream_executor/lib/ptr_util.h
+++ b/tensorflow/stream_executor/lib/ptr_util.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include <memory>
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Trait to select overloads and return types for MakeUnique.
@@ -59,8 +58,17 @@ typename MakeUniqueResult<T>::invalid MakeUnique(Args&&... /* args */) =
     delete;  // NOLINT
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(jlebar): Remove this once we've completed
+// the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
-
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/stacktrace.h b/tensorflow/stream_executor/lib/stacktrace.h
index ba7e5317f0..a15b0f3026 100644
--- a/tensorflow/stream_executor/lib/stacktrace.h
+++ b/tensorflow/stream_executor/lib/stacktrace.h
@@ -19,14 +19,12 @@ limitations under the License.
 #include "tensorflow/core/platform/stacktrace.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::CurrentStackTrace;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STACKTRACE_H_
diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h
index 8c289e1927..407b71b405 100644
--- a/tensorflow/stream_executor/lib/status.h
+++ b/tensorflow/stream_executor/lib/status.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
@@ -23,15 +23,14 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using Status = tensorflow::Status;
 
 #define SE_CHECK_OK(val) TF_CHECK_OK(val)
 #define SE_ASSERT_OK(val) \
-  ASSERT_EQ(::perftools::gputools::port::Status::OK(), (val))
+  ASSERT_EQ(::stream_executor::port::Status::OK(), (val))
 
 // Define some canonical error helpers.
 inline Status UnimplementedError(StringPiece message) {
@@ -45,6 +44,16 @@ inline Status FailedPreconditionError(StringPiece message) {
 }
 
 }  // namespace port
+}  // namespace stream_executor
+
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
 }  // namespace gputools
 }  // namespace perftools
 
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 3b97929b37..dab5909674 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 
 #include "tensorflow/compiler/xla/statusor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 // Use XLA's StatusOr so we don't duplicate code.
@@ -29,7 +28,6 @@ template <typename T>
 using StatusOr = ::xla::StatusOr<T>;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index 5dd3d06aff..a81c666818 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::str_util::Join;
@@ -38,7 +37,6 @@ inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix)
 using tensorflow::str_util::Lowercase;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
diff --git a/tensorflow/stream_executor/lib/strcat.h b/tensorflow/stream_executor/lib/strcat.h
index 424cb75f0e..c959e4df5b 100644
--- a/tensorflow/stream_executor/lib/strcat.h
+++ b/tensorflow/stream_executor/lib/strcat.h
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
 
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::strings::StrCat;
 using tensorflow::strings::StrAppend;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
diff --git a/tensorflow/stream_executor/lib/stringpiece.h b/tensorflow/stream_executor/lib/stringpiece.h
index 97ee0c9206..b80de5df30 100644
--- a/tensorflow/stream_executor/lib/stringpiece.h
+++ b/tensorflow/stream_executor/lib/stringpiece.h
@@ -19,14 +19,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::StringPiece;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_
diff --git a/tensorflow/stream_executor/lib/stringprintf.h b/tensorflow/stream_executor/lib/stringprintf.h
index 504de25a68..2f65ed9c6a 100644
--- a/tensorflow/stream_executor/lib/stringprintf.h
+++ b/tensorflow/stream_executor/lib/stringprintf.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::strings::Printf;
 using tensorflow::strings::Appendf;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPRINTF_H_
diff --git a/tensorflow/stream_executor/lib/thread_options.h b/tensorflow/stream_executor/lib/thread_options.h
index bd7f63714e..079cf757ac 100644
--- a/tensorflow/stream_executor/lib/thread_options.h
+++ b/tensorflow/stream_executor/lib/thread_options.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/core/platform/env.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::ThreadOptions;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_THREAD_OPTIONS_H_
diff --git a/tensorflow/stream_executor/lib/threadpool.h b/tensorflow/stream_executor/lib/threadpool.h
index 35630c5106..220068ade1 100644
--- a/tensorflow/stream_executor/lib/threadpool.h
+++ b/tensorflow/stream_executor/lib/threadpool.h
@@ -21,14 +21,12 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/thread_options.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace port {
 
 using tensorflow::thread::ThreadPool;
 
 }  // namespace port
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_THREADPOOL_H_
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index f9f3737a06..5b51398d8c 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -20,8 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 /* static */ mutex MultiPlatformManager::platforms_mutex_{LINKER_INITIALIZED};
 
@@ -132,8 +131,7 @@ MultiPlatformManager::InitializePlatformWithId(
   GetPlatformByIdMap()->clear();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 REGISTER_MODULE_INITIALIZER(
     multi_platform_manager,
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index 438653ee20..672855d5fb 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -22,8 +22,8 @@ limitations under the License.
 // In your BUILD rule, add a dependency on a platform plugin that you'd like
 // to use, such as:
 //
-//   //perftools/gputools/executor/cuda:cuda_platform
-//   //perftools/gputools/executor/opencl:opencl_platform
+//   //third_party/tensorflow/stream_executor/cuda:cuda_platform
+//   //third_party/tensorflow/stream_executor/opencl:opencl_platform
 //
 // This will register platform plugins that can be discovered via this
 // interface. Sample API usage:
@@ -56,10 +56,10 @@ limitations under the License.
 // And similarly, for standard interfaces (BLAS, RNG, etc.) you can add
 // dependencies on support libraries, e.g.:
 //
-//    //perftools/gputools/executor/cuda:pluton_blas_plugin
-//    //perftools/gputools/executor/cuda:cudnn_plugin
-//    //perftools/gputools/executor/cuda:cublas_plugin
-//    //perftools/gputools/executor/cuda:curand_plugin
+//    //third_party/tensorflow/stream_executor/cuda:pluton_blas_plugin
+//    //third_party/tensorflow/stream_executor/cuda:cudnn_plugin
+//    //third_party/tensorflow/stream_executor/cuda:cublas_plugin
+//    //third_party/tensorflow/stream_executor/cuda:curand_plugin
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
@@ -75,8 +75,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Manages multiple platforms that may be present on the current machine.
 class MultiPlatformManager {
@@ -181,7 +180,6 @@ class MultiPlatformManager {
   SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index 4cdc22bd16..777abced86 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 string PlatformKindString(PlatformKind kind) {
   switch (kind) {
@@ -135,5 +134,4 @@ port::Status Platform::EnablePeerAccess() {
   return port::Status::OK();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index 54f8aa86c2..5cb7047b6f 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -29,8 +29,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class StreamExecutor;
 
@@ -106,7 +105,7 @@ class Platform {
   namespace {                           \
   int plugin_id_value;                  \
   }                                     \
-  const perftools::gputools::Platform::Id ID_VAR_NAME = &plugin_id_value;
+  const ::stream_executor::Platform::Id ID_VAR_NAME = &plugin_id_value;
 
   // Returns a key uniquely identifying this platform.
   virtual Id id() const = 0;
@@ -205,7 +204,6 @@ class Platform {
   SE_DISALLOW_COPY_AND_ASSIGN(Platform);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_H_
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index 62de0cbce0..c9f5a7c609 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -18,8 +18,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 #undef mutex_lock
 #undef tf_shared_lock
@@ -35,7 +34,6 @@ using tensorflow::tf_shared_lock;
 #define tf_shared_lock(x) \
   static_assert(0, "tf_shared_lock_decl_missing_var_name");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 6603df4878..259cf380d6 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
+// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PORT_H_
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 using tensorflow::int8;
 using tensorflow::int16;
@@ -50,8 +49,7 @@ using tensorflow::LINKER_INITIALIZED;
 
 #define SE_FALLTHROUGH_INTENDED TF_FALLTHROUGH_INTENDED
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #define SE_DISALLOW_COPY_AND_ASSIGN TF_DISALLOW_COPY_AND_ASSIGN
 #define SE_MUST_USE_RESULT TF_MUST_USE_RESULT
diff --git a/tensorflow/stream_executor/plugin.cc b/tensorflow/stream_executor/plugin.cc
index 6424658e22..cfbc52ff17 100644
--- a/tensorflow/stream_executor/plugin.cc
+++ b/tensorflow/stream_executor/plugin.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/plugin.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Mostly-arbitrary ID only used as a sentinel "not otherwise initialized"
 // value. This value should never [need to] be specified aside by initialization
@@ -51,5 +50,4 @@ PluginConfig& PluginConfig::SetRng(PluginId rng) {
   return *this;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin.h b/tensorflow/stream_executor/plugin.h
index 0b88b86e2b..0505412e7a 100644
--- a/tensorflow/stream_executor/plugin.h
+++ b/tensorflow/stream_executor/plugin.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // A plugin ID is a unique identifier for each registered plugin type.
 typedef void* PluginId;
@@ -83,7 +82,6 @@ class PluginConfig {
   PluginId blas_, dnn_, fft_, rng_;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_H_
diff --git a/tensorflow/stream_executor/plugin_registry.cc b/tensorflow/stream_executor/plugin_registry.cc
index 54761139ea..7812703efd 100644
--- a/tensorflow/stream_executor/plugin_registry.cc
+++ b/tensorflow/stream_executor/plugin_registry.cc
@@ -19,8 +19,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 const PluginId kNullPlugin = nullptr;
 
@@ -244,5 +243,4 @@ EMIT_PLUGIN_SPECIALIZATIONS(DnnFactory, dnn, "DNN");
 EMIT_PLUGIN_SPECIALIZATIONS(FftFactory, fft, "FFT");
 EMIT_PLUGIN_SPECIALIZATIONS(RngFactory, rng, "RNG");
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/plugin_registry.h b/tensorflow/stream_executor/plugin_registry.h
index 8636a49ce6..49628ecd24 100644
--- a/tensorflow/stream_executor/plugin_registry.h
+++ b/tensorflow/stream_executor/plugin_registry.h
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/plugin.h"
 #include "tensorflow/stream_executor/rng.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class StreamExecutorInterface;
@@ -160,7 +159,6 @@ class PluginRegistry {
   SE_DISALLOW_COPY_AND_ASSIGN(PluginRegistry);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_PLUGIN_REGISTRY_H_
diff --git a/tensorflow/stream_executor/rng.cc b/tensorflow/stream_executor/rng.cc
index 1c05005067..b0efad9108 100644
--- a/tensorflow/stream_executor/rng.cc
+++ b/tensorflow/stream_executor/rng.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/logging.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace rng {
 
 bool RngSupport::CheckSeed(const uint8 *seed, uint64 seed_bytes) {
@@ -47,5 +46,4 @@ const int RngSupport::kMaxSeedBytes;
 #endif
 
 }  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rng.h b/tensorflow/stream_executor/rng.h
index 36d0fdd454..acbf8fce4c 100644
--- a/tensorflow/stream_executor/rng.h
+++ b/tensorflow/stream_executor/rng.h
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 template <typename ElemT>
@@ -89,7 +88,6 @@ class RngSupport {
 };
 
 }  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_RNG_H_
diff --git a/tensorflow/stream_executor/scratch_allocator.cc b/tensorflow/stream_executor/scratch_allocator.cc
index 0c1db414f2..8fc4c4c509 100644
--- a/tensorflow/stream_executor/scratch_allocator.cc
+++ b/tensorflow/stream_executor/scratch_allocator.cc
@@ -18,8 +18,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 ScratchAllocator::~ScratchAllocator() {}
 
@@ -38,5 +37,4 @@ port::StatusOr<DeviceMemory<uint8>> OneTimeScratchAllocator::AllocateBytes(
   return temporary_->device_memory();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/scratch_allocator.h b/tensorflow/stream_executor/scratch_allocator.h
index 94d5ede161..2aed2c4437 100644
--- a/tensorflow/stream_executor/scratch_allocator.h
+++ b/tensorflow/stream_executor/scratch_allocator.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -77,7 +76,6 @@ class OneTimeScratchAllocator : public ScratchAllocator {
   SE_DISALLOW_COPY_AND_ASSIGN(OneTimeScratchAllocator);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_SCRATCH_ALLOCATOR_H_
diff --git a/tensorflow/stream_executor/shared_memory_config.h b/tensorflow/stream_executor/shared_memory_config.h
index de556cb734..7cbeb3bcd9 100644
--- a/tensorflow/stream_executor/shared_memory_config.h
+++ b/tensorflow/stream_executor/shared_memory_config.h
@@ -19,8 +19,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // SharedMemoryConfig enum describes potential widths of shared memory banks for
 // a device or kernel.
@@ -30,7 +29,6 @@ enum class SharedMemoryConfig {
   kEightByte,  // Sets shared memory banks to be eight bytes wide.
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_SHARED_MEMORY_CONFIG_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index fe498507a8..f59d9a13ac 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -28,8 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace {
 // Code to turn parameters to functions on stream into strings that
@@ -5192,5 +5191,4 @@ port::Status Stream::BlockHostUntilDone() {
   return first_error;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 4af426001f..d4a81440e9 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -38,8 +38,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_memory_manager.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace host {
 class HostBlas;
@@ -2098,7 +2097,6 @@ struct Quantization<int32> {
       dnn::QuantizedActivationMode::k32Bit;
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_H_
diff --git a/tensorflow/stream_executor/stream_executor.h b/tensorflow/stream_executor/stream_executor.h
index 2995dccf46..d63d485df5 100644
--- a/tensorflow/stream_executor/stream_executor.h
+++ b/tensorflow/stream_executor/stream_executor.h
@@ -35,4 +35,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"  // IWYU pragma: export
 #include "tensorflow/stream_executor/timer.h"  // IWYU pragma: export
 
+namespace perftools {
+namespace gputools {
+
+// Temporarily pull stream_executor into perftools::gputools while we migrate
+// code to the new namespace.  TODO(b/77980417): Remove this once we've
+// completed the migration.
+using namespace stream_executor;  // NOLINT[build/namespaces]
+
+}  // namespace gputools
+}  // namespace perftools
+
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 273d970b6f..8297228e6f 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -15,8 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // -- CUDA
@@ -38,5 +37,4 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 StreamExecutorFactory MakeHostExecutorImplementation;
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 37ef182e14..2584c92f0c 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -45,8 +45,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/trace_listener.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 class Timer;
@@ -343,7 +342,6 @@ extern StreamExecutorFactory MakeHostExecutorImplementation;
 
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index f55fa68402..2e1adeb31e 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -39,8 +39,7 @@ namespace {
 bool FLAGS_check_device_leaks = false;
 }  // namespace
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace {
 
 string StackTraceIfVLOG10() {
@@ -788,5 +787,4 @@ internal::StreamExecutorInterface *StreamExecutor::implementation() {
   return implementation_->GetUnderlyingExecutor();
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 69d0374d73..39af7115d8 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -37,8 +37,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 // Structure used for device memory leak checking.
 struct AllocRecord {
@@ -95,7 +94,7 @@ class StreamExecutor {
   // Parameters:
   //   spec: The MultiKernelLoaderSpec is usually generated as a compile-time
   //    constant into an appropriate namespace. For example, see
-  //    perftools::gputools::executor_sample::kKernelLoaderSpecs, from which a
+  //    stream_executor::executor_sample::kKernelLoaderSpecs, from which a
   //    MultiKernelLoaderSpec is selected.
   //   kernel: Outparam that the kernel is loaded into. A given Kernel
   //    instantiation should not be loaded into more than once.
@@ -803,7 +802,6 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
   return *this;
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_PIMPL_H_
diff --git a/tensorflow/stream_executor/temporary_device_memory.cc b/tensorflow/stream_executor/temporary_device_memory.cc
index c33166b224..f113ce9be5 100644
--- a/tensorflow/stream_executor/temporary_device_memory.cc
+++ b/tensorflow/stream_executor/temporary_device_memory.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 TemporaryDeviceMemoryBase::~TemporaryDeviceMemoryBase() {
   parent_->temporary_memory_manager()->MarkFinalized(device_memory_,
@@ -64,5 +63,4 @@ TemporaryDeviceMemoryBase::TemporaryDeviceMemoryBase(
   DCHECK(IsAllocated());
 }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/temporary_device_memory.h b/tensorflow/stream_executor/temporary_device_memory.h
index 2255e7ffd7..77be8599a2 100644
--- a/tensorflow/stream_executor/temporary_device_memory.h
+++ b/tensorflow/stream_executor/temporary_device_memory.h
@@ -43,8 +43,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 namespace internal {
@@ -132,7 +131,6 @@ class TemporaryDeviceMemory : public TemporaryDeviceMemoryBase {
   }
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_DEVICE_MEMORY_H_
diff --git a/tensorflow/stream_executor/temporary_memory_manager.cc b/tensorflow/stream_executor/temporary_memory_manager.cc
index 449ab7d3f0..420dbb0933 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.cc
+++ b/tensorflow/stream_executor/temporary_memory_manager.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 void TemporaryMemoryManager::ForceDeallocateAll() {
@@ -124,5 +123,4 @@ TemporaryMemoryManager::AllocateArrayBase(uint64 element_count,
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/temporary_memory_manager.h b/tensorflow/stream_executor/temporary_memory_manager.h
index 2e6fbd9d62..faf13380dc 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.h
+++ b/tensorflow/stream_executor/temporary_memory_manager.h
@@ -31,8 +31,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace internal {
 
 // Record used inside the TemporaryMemoryManager as metadata for a given device
@@ -147,7 +146,6 @@ TemporaryMemoryManager::AllocateArray(uint64 element_count) {
 }
 
 }  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TEMPORARY_MEMORY_MANAGER_H_
diff --git a/tensorflow/stream_executor/timer.cc b/tensorflow/stream_executor/timer.cc
index 41d7e4359d..a29791a104 100644
--- a/tensorflow/stream_executor/timer.cc
+++ b/tensorflow/stream_executor/timer.cc
@@ -21,8 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 Timer::Timer(StreamExecutor *parent)
     : parent_(parent),
@@ -34,5 +33,4 @@ uint64 Timer::Microseconds() const { return implementation_->Microseconds(); }
 
 uint64 Timer::Nanoseconds() const { return implementation_->Nanoseconds(); }
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/timer.h b/tensorflow/stream_executor/timer.h
index 0a37caa0f2..fba7dd8f58 100644
--- a/tensorflow/stream_executor/timer.h
+++ b/tensorflow/stream_executor/timer.h
@@ -20,8 +20,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 namespace internal {
 class TimerInterface;
@@ -69,7 +68,6 @@ class Timer {
   SE_DISALLOW_COPY_AND_ASSIGN(Timer);
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TIMER_H_
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index d1e87c348b..0e874a1d47 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/launch_dim.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 
 class Stream;
 
@@ -69,7 +68,6 @@ class TraceListener {
                                           const port::Status* result) {}
 };
 
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_TRACE_LISTENER_H_
-- 
GitLab


From 495d511bf384e296d7149537bc0900c32e0b76b5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 14:33:53 -0700
Subject: [PATCH 0978/1262] Use easy_install to update pip only on Ubuntu14.04

* We only depends on easy_install/easy_install3 to update pip on Ubuntu14.04
* They are not always available for later system e.g. debian9
* We can use pip/pip3 to update themselves

PiperOrigin-RevId: 193257326
---
 .../tools/ci_build/install/install_pip_packages.sh  | 13 +++++++++----
 .../ci_build/install/install_pip_packages_remote.sh |  6 ++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index d406b83a62..fc137aeeed 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -16,10 +16,15 @@
 
 set -e
 
-# We don't apt-get install so that we can install a newer version of pip. Not
-# needed after we upgrade to Ubuntu 16.04
-easy_install -U pip
-easy_install3 -U pip
+# We don't apt-get install so that we can install a newer version of pip.
+# Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9
+if $(cat /etc/*-release | grep -q 14.04); then
+  easy_install -U pip
+  easy_install3 -U pip
+else
+  pip2 install --upgrade pip
+  pip3 install --upgrade pip
+fi
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
index 39a6d557d1..0beabcf5ef 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages_remote.sh
@@ -20,10 +20,8 @@ if [ ! -f /usr/bin/x86_64-linux-gnu-gcc ]; then
   ln -s /usr/local/bin/clang /usr/bin/x86_64-linux-gnu-gcc
 fi
 
-pip2 install -U pip
-pip3 install -U pip
-pip2  install -U setuptools
-pip3 install -U setuptools
+pip2 install --upgrade setuptools
+pip3 install --upgrade setuptools
 
 # The rest of the pip packages will be installed in
 # `install_pip_packages.sh`
-- 
GitLab


From 75fd390fc14d50683c59a087c1f5541fc1fecaf5 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 21:33:27 +0000
Subject: [PATCH 0979/1262] Remove duplicate imports in several places

Wrote a script to scan throught the python files in the repo,
and found the remaining duplicate imports in some python files like:
```
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.util.deprecation import deprecated
```
This fix removed all of them for duplicate imports.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/init_ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 5ded3f7cc2..8bf6c7f8c1 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -42,7 +42,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
-- 
GitLab


From 0946bbc5cfd1dc9f6c832cbd056e74b9d587f86e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 21:36:05 +0000
Subject: [PATCH 0980/1262] Fix duplicate import in kmeans_test.py

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
index b28835a809..584556992a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
-- 
GitLab


From dc70fe423965be1efdbf6747aa73ff7738c91308 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 21:36:21 +0000
Subject: [PATCH 0981/1262] Clean up remaining issues.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/feature_column/feature_column.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index f9201a4794..9a423ee0ca 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -161,7 +161,6 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
-from tensorflow.python.util.tf_export import tf_export
 
 
 def _internal_input_layer(features,
-- 
GitLab


From 8a2eb27d7bbb552e2375c4fafa1863e017c503be Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 17 Apr 2018 14:47:44 -0700
Subject: [PATCH 0982/1262] Don't consider control flow edges when computing
 switch depth of switch.

PiperOrigin-RevId: 193259710
---
 tensorflow/compiler/tf2xla/functionalize_control_flow.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 16b9142cbf..23629d85ae 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -870,6 +870,9 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       // Merge the inputs of the switch node with one another. This results in
       // predicates and control input residing in the same cluster.
       for (const Edge* e : n->in_edges()) {
+        // Only consider the data inputs to the Switch node.
+        if (e->IsControlEdge()) continue;
+
         Node* src = e->src();
         UnionFind<Cluster>* src_cluster = find_output_cluster(src);
         int src_cluster_depth = switch_depth[src_cluster->Get().representative];
-- 
GitLab


From ee3669301e3a6e2c9945124851c5a6b2ee74fe2b Mon Sep 17 00:00:00 2001
From: Derek Murray <derek.murray@gmail.com>
Date: Tue, 17 Apr 2018 14:59:58 -0700
Subject: [PATCH 0983/1262] [tf.data] Fix a device placement issue in
 `prefetch_to_device()`. (#18607)

* [tf.data] Fix a device placement issue in `prefetch_to_device()`.

Previously, the `iterator_get_device()` op was being infeasibly colocated with
both the iterator and placed on the prefetch target device. Move the
construction of that op outside the `with device():` block to fix this.

Also enable the relevant test to run as a CUDA test.

* Import the cuda_py_test rule.
---
 tensorflow/contrib/data/python/kernel_tests/BUILD     | 7 +++----
 tensorflow/contrib/data/python/ops/prefetching_ops.py | 6 ++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index b475c9fa6b..b15b9663f4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -471,12 +471,11 @@ py_test(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
     srcs = ["prefetching_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 89c04dc89a..e4c9f8b58a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -114,11 +114,13 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
+    iterator_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
-          target_device=gen_dataset_ops.iterator_get_device(
-              self._input_iterator._iterator_resource),
+          target_device=iterator_device,
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
           shared_name=shared_name)
-- 
GitLab


From 8670a5e23717a8740d1360d34147f90fdf0b3b68 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Tue, 17 Apr 2018 15:01:22 -0700
Subject: [PATCH 0984/1262] Internal Change.

PiperOrigin-RevId: 193262066
---
 tensorflow/tools/ci_build/windows/bazel/common_env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 7d4cc7ac30..0e6c0227b7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -44,6 +44,8 @@ export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/${PYTHON_BASE_PATH}:$PATH"
+# Add git into PATH needed for gen_git_source.py
+export PATH="/c/Program Files/Git/cmd:$PATH"
 
 # Make sure we have pip in PATH
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
-- 
GitLab


From ba1ea3ff90ee44c8e82a1fb9ba757d798b55d144 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 16 Apr 2018 11:24:43 -0700
Subject: [PATCH 0985/1262] Porting tests for the `decode_proto` and
 `encode_proto` to OS.

PiperOrigin-RevId: 193070420
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   6 +-
 tensorflow/contrib/proto/BUILD                |  16 +
 .../contrib/proto/python/kernel_tests/BUILD   |  86 +++++
 .../proto/python/kernel_tests/build_defs.bzl  |  89 ++++++
 .../kernel_tests/decode_proto_fail_test.py    |  68 ++++
 .../kernel_tests/decode_proto_op_test.py      | 300 ++++++++++++++++++
 .../kernel_tests/encode_proto_op_test.py      | 180 +++++++++++
 .../python/kernel_tests/minmax.TestCase.pbtxt | 161 ++++++++++
 .../python/kernel_tests/nested.TestCase.pbtxt |  16 +
 .../kernel_tests/optional.TestCase.pbtxt      |  20 ++
 .../promote_unsigned.TestCase.pbtxt           |  21 ++
 .../python/kernel_tests/ragged.TestCase.pbtxt |  32 ++
 .../kernel_tests/shaped_batch.TestCase.pbtxt  |  62 ++++
 .../python/kernel_tests/simple.TestCase.pbtxt |  21 ++
 .../proto/python/kernel_tests/test_case.py    |  35 ++
 .../python/kernel_tests/test_example.proto    | 149 +++++++++
 tensorflow/tools/pip_package/BUILD            |   1 +
 19 files changed, 1263 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_case.py
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 9bef0d8b61..ae68f4aec4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -77,6 +77,7 @@ py_library(
         "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
+        "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index aaddb06fa0..e27ece8fa5 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -64,6 +64,7 @@ from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
+from tensorflow.contrib import proto
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
 from tensorflow.contrib import recurrent
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index ded15b4b66..21f59d2563 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,8 +330,10 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
-GENERATE_PYTHON_OP_LIB("decode_proto_ops")
-GENERATE_PYTHON_OP_LIB("encode_proto_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("encode_proto_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index 046652cbc5..3e9b1a0b8d 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "proto",
     srcs = [
@@ -14,3 +16,17 @@ py_library(
         "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
     ],
 )
+
+py_library(
+    name = "proto_pip",
+    data = [
+        "//tensorflow/contrib/proto/python/kernel_tests:test_messages",
+    ] + if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":proto",
+        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..a380a131f8
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -0,0 +1,86 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Much of the work in this BUILD file actually happens in the corresponding
+# build_defs.bzl, which creates an individual testcase for each example .pbtxt
+# file in this directory.
+#
+load(":build_defs.bzl", "decode_proto_test_suite")
+load(":build_defs.bzl", "encode_proto_test_suite")
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :decode_proto_op_tests.
+decode_proto_test_suite(
+    name = "decode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# This expands to a tf_py_test for each test file.
+# It defines the test_suite :encode_proto_op_tests.
+encode_proto_test_suite(
+    name = "encode_proto_tests",
+    examples = glob(["*.pbtxt"]),
+)
+
+# Below here are tests that are not tied to an example text proto.
+filegroup(
+    name = "test_messages",
+    srcs = glob(["*.pbtxt"]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_py_test(
+    name = "decode_proto_fail_test",
+    size = "small",
+    srcs = ["decode_proto_fail_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/proto:proto",
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
+
+py_library(
+    name = "test_case",
+    srcs = ["test_case.py"],
+    deps = ["//tensorflow/python:client_testlib"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [
+        ":test_case",
+        ":test_example_proto_py",
+    ],
+)
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
new file mode 100644
index 0000000000..f425601691
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/build_defs.bzl
@@ -0,0 +1,89 @@
+"""BUILD rules for generating file-driven proto test cases.
+
+The decode_proto_test_suite() and encode_proto_test_suite() rules take a list
+of text protos and generates a tf_py_test() for each one.
+"""
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "register_extension_info")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+def _test_name(test, path):
+  return "%s_%s_test" % (test, path.split("/")[-1].split(".")[0])
+
+def decode_proto_test_suite(name, examples):
+  """Build the decode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("decode_proto", test_filename),
+        srcs = ["decode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "decode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("decode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+def encode_proto_test_suite(name, examples):
+  """Build the encode_proto py_test for each test filename."""
+  for test_filename in examples:
+    tf_py_test(
+        name = _test_name("encode_proto", test_filename),
+        srcs = ["encode_proto_op_test.py"],
+        size = "small",
+        data = [test_filename] + if_static(
+            [],
+            otherwise = [":libtestexample.so"],
+        ),
+        main = "encode_proto_op_test.py",
+        args = [
+            "--message_text_file=\"%s/%s\"" % (native.package_name(), test_filename),
+        ],
+        additional_deps = [
+            ":py_test_deps",
+            "//third_party/py/numpy",
+            "//tensorflow/contrib/proto:proto",
+            "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+            "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+        ],
+        tags = [
+            "no_pip",  # TODO(b/78026780)
+            "no_windows",  # TODO(b/78028010)
+        ],
+    )
+  native.test_suite(
+      name = name,
+      tests = [":" + _test_name("encode_proto", test_filename)
+               for test_filename in examples],
+  )
+
+register_extension_info(
+    extension_name = "decode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:decode_example_.*",
+    })
+
+register_extension_info(
+    extension_name = "encode_proto_test_suite",
+    label_regex_map = {
+        "deps": "deps:encode_example_.*",
+    })
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
new file mode 100644
index 0000000000..5298342ee7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_fail_test.py
@@ -0,0 +1,68 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class DecodeProtoFailTest(test_case.ProtoOpTestCase):
+  """Test failure cases for DecodeToProto."""
+
+  def _TestCorruptProtobuf(self, sanitize):
+    """Test failure cases for DecodeToProto."""
+
+    # The goal here is to check the error reporting.
+    # Testing against a variety of corrupt protobufs is
+    # done by fuzzing.
+    corrupt_proto = 'This is not a binary protobuf'
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(corrupt_proto, dtype=object)
+    msg_type = 'tensorflow.contrib.proto.TestCase'
+    field_names = ['sizes']
+    field_types = [dtypes.int32]
+
+    with self.test_session() as sess:
+      ctensor, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=msg_type,
+          field_names=field_names,
+          output_types=field_types,
+          sanitize=sanitize)
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   'Unable to parse binary protobuf'
+                                   '|Failed to consume entire buffer'):
+        _ = sess.run([ctensor] + vtensor)
+
+  def testCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=False)
+
+  def testSanitizerCorrupt(self):
+    self._TestCorruptProtobuf(sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
new file mode 100644
index 0000000000..d1c13c82bc
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test.py
@@ -0,0 +1,300 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for decode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+"""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class DecodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def _compareValues(self, fd, vs, evs):
+    """Compare lists/arrays of field values."""
+
+    if len(vs) != len(evs):
+      self.fail('Field %s decoded %d outputs, expected %d' %
+                (fd.name, len(vs), len(evs)))
+    for i, ev in enumerate(evs):
+      # Special case fuzzy match for float32. TensorFlow seems to mess with
+      # MAX_FLT slightly and the test doesn't work otherwise.
+      # TODO(nix): ask on TF list about why MAX_FLT doesn't pass through.
+      if fd.cpp_type == fd.CPPTYPE_FLOAT:
+        # Numpy isclose() is better than assertIsClose() which uses an absolute
+        # value comparison.
+        self.assertTrue(
+            np.isclose(vs[i], ev), 'expected %r, actual %r' % (ev, vs[i]))
+      elif fd.cpp_type == fd.CPPTYPE_STRING:
+        # In Python3 string tensor values will be represented as bytes, so we
+        # reencode the proto values to match that.
+        self.assertEqual(vs[i], ev.encode('ascii'))
+      else:
+        # Doubles and other types pass through unscathed.
+        self.assertEqual(vs[i], ev)
+
+  def _compareRepeatedPrimitiveValue(self, batch_shape, sizes, fields,
+                                     field_dict):
+    """Compare protos of type RepeatedPrimitiveValue.
+
+    Args:
+      batch_shape: the shape of the input tensor of serialized messages.
+      sizes: int matrix of repeat counts returned by decode_proto
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      field_dict: map from field names to decoded numpy tensors of values
+    """
+
+    # Check that expected values match.
+    for field in fields:
+      values = field_dict[field.name]
+      self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
+
+      fd = field.expected.DESCRIPTOR.fields_by_name[field.name]
+
+      # Values has the same shape as the input plus an extra
+      # dimension for repeats.
+      self.assertEqual(list(values.shape)[:-1], batch_shape)
+
+      # Nested messages are represented as TF strings, requiring
+      # some special handling.
+      if field.name == 'message_value':
+        vs = []
+        for buf in values.flat:
+          msg = test_example_pb2.PrimitiveValue()
+          msg.ParseFromString(buf)
+          vs.append(msg)
+        evs = getattr(field.expected, field.name)
+        if len(vs) != len(evs):
+          self.fail('Field %s decoded %d outputs, expected %d' %
+                    (fd.name, len(vs), len(evs)))
+        for v, ev in zip(vs, evs):
+          self.assertEqual(v, ev)
+        continue
+
+      # This can be a little confusing. For testing we are using
+      # RepeatedPrimitiveValue in two ways: it's the proto that we
+      # decode for testing, and it's used in the expected value as a
+      # union type. The two cases are slightly different: this is the
+      # second case.
+      # We may be fetching the uint64_value from the test proto, but
+      # in the expected proto we store it in the int64_value field
+      # because TensorFlow doesn't support unsigned int64.
+      tf_type_to_primitive_value_field = {
+          dtypes.float32:
+              'float_value',
+          dtypes.float64:
+              'double_value',
+          dtypes.int32:
+              'int32_value',
+          dtypes.uint8:
+              'uint8_value',
+          dtypes.int8:
+              'int8_value',
+          dtypes.string:
+              'string_value',
+          dtypes.int64:
+              'int64_value',
+          dtypes.bool:
+              'bool_value',
+          # Unhandled TensorFlow types:
+          # DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+          # DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+      }
+      tf_field_name = tf_type_to_primitive_value_field.get(field.dtype)
+      if tf_field_name is None:
+        self.fail('Unhandled tensorflow type %d' % field.dtype)
+
+      self._compareValues(fd, values.flat,
+                          getattr(field.expected, tf_field_name))
+
+  def _runDecodeProtoTests(self, fields, case_sizes, batch_shape, batch,
+                           message_type, message_format, sanitize,
+                           force_disordered=False):
+    """Run decode tests on a batch of messages.
+
+    Args:
+      fields: list of test_example_pb2.FieldSpec (types and expected values)
+      case_sizes: expected sizes array
+      batch_shape: the shape of the input tensor of serialized messages
+      batch: list of serialized messages
+      message_type: descriptor name for messages
+      message_format: format of messages, 'text' or 'binary'
+      sanitize: whether to sanitize binary protobuf inputs
+      force_disordered: whether to force fields encoded out of order.
+    """
+
+    if force_disordered:
+      # Exercise code path that handles out-of-order fields by prepending extra
+      # fields with tag numbers higher than any real field. Note that this won't
+      # work with sanitization because that forces reserialization using a
+      # trusted decoder and encoder.
+      assert not sanitize
+      extra_fields = test_example_pb2.ExtraFields()
+      extra_fields.string_value = 'IGNORE ME'
+      extra_fields.bool_value = False
+      extra_msg = extra_fields.SerializeToString()
+      batch = [extra_msg + msg for msg in batch]
+
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    batch = np.array(batch, dtype=object)
+    batch = np.reshape(batch, batch_shape)
+
+    field_names = [f.name for f in fields]
+    output_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, vtensor = decode_proto_op.decode_proto(
+          batch,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=output_types,
+          message_format=message_format,
+          sanitize=sanitize)
+
+      vlist = sess.run([sizes] + vtensor)
+      sizes = vlist[0]
+      # Values is a list of tensors, one for each field.
+      value_tensors = vlist[1:]
+
+      # Check that the repeat sizes are correct.
+      self.assertTrue(
+          np.all(np.array(sizes.shape) == batch_shape + [len(field_names)]))
+
+      # Check that the decoded sizes match the expected sizes.
+      self.assertEqual(len(sizes.flat), len(case_sizes))
+      self.assertTrue(
+          np.all(sizes.flat == np.array(
+              case_sizes, dtype=np.int32)))
+
+      field_dict = dict(zip(field_names, value_tensors))
+
+      self._compareRepeatedPrimitiveValue(batch_shape, sizes, fields,
+                                          field_dict)
+
+  def testBinary(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testBinaryDisordered(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=False,
+        force_disordered=True)
+
+  def testPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    packed_batch = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        packed_batch,
+        'tensorflow.contrib.proto.PackedPrimitiveValue',
+        'binary',
+        sanitize=False)
+
+  def testText(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Note: float_format='.17g' is necessary to ensure preservation of
+    # doubles and floats in text format.
+    text_batch = [
+        text_format.MessageToString(
+            primitive, float_format='.17g') for primitive in case.primitive
+    ]
+
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        text_batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'text',
+        sanitize=False)
+
+  def testSanitizerGood(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    batch = [primitive.SerializeToString() for primitive in case.primitive]
+    self._runDecodeProtoTests(
+        case.field,
+        case.sizes,
+        list(case.shape),
+        batch,
+        'tensorflow.contrib.proto.RepeatedPrimitiveValue',
+        'binary',
+        sanitize=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
new file mode 100644
index 0000000000..30e58e6336
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test.py
@@ -0,0 +1,180 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Table-driven test for encode_proto op.
+
+This test is run once with each of the *.TestCase.pbtxt files
+in the test directory.
+
+It tests that encode_proto is a lossless inverse of decode_proto
+(for the specified fields).
+"""
+# Python3 readiness boilerplate
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.proto.python.kernel_tests import test_case
+from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('message_text_file', None,
+                    'A file containing a text serialized TestCase protobuf.')
+
+
+class EncodeProtoOpTest(test_case.ProtoOpTestCase):
+
+  def testBadInputs(self):
+    # Invalid field name
+    with self.test_session():
+      with self.assertRaisesOpError('Unknown field: non_existent_field'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['non_existent_field']).eval()
+
+    # Incorrect types.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Incompatible type for field double_value.'):
+        encode_proto_op.encode_proto(
+            sizes=[[1]],
+            values=[np.array([[0.0]], dtype=np.int32)],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval()
+
+    # Incorrect shapes of sizes.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          r'sizes should be batch_size \+ \[len\(field_names\)\]'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values = array_ops.placeholder(dtypes.float64)
+        encode_proto_op.encode_proto(
+            sizes=sizes,
+            values=[values],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value']).eval(feed_dict={
+                sizes: [[[0, 0]]],
+                values: [[0.0]]
+            })
+
+    # Inconsistent shapes of values.
+    with self.test_session():
+      with self.assertRaisesOpError(
+          'Values must match up to the last dimension'):
+        sizes = array_ops.placeholder(dtypes.int32)
+        values1 = array_ops.placeholder(dtypes.float64)
+        values2 = array_ops.placeholder(dtypes.int32)
+        (encode_proto_op.encode_proto(
+            sizes=[[1, 1]],
+            values=[values1, values2],
+            message_type='tensorflow.contrib.proto.RepeatedPrimitiveValue',
+            field_names=['double_value', 'int32_value']).eval(feed_dict={
+                values1: [[0.0]],
+                values2: [[0], [0]]
+            }))
+
+  def _testRoundtrip(self, in_bufs, message_type, fields):
+
+    field_names = [f.name for f in fields]
+    out_types = [f.dtype for f in fields]
+
+    with self.test_session() as sess:
+      sizes, field_tensors = decode_proto_op.decode_proto(
+          in_bufs,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=out_types)
+
+      out_tensors = encode_proto_op.encode_proto(
+          sizes,
+          field_tensors,
+          message_type=message_type,
+          field_names=field_names)
+
+      out_bufs, = sess.run([out_tensors])
+
+      # Check that the re-encoded tensor has the same shape.
+      self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+      # Compare the input and output.
+      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+        in_obj = test_example_pb2.RepeatedPrimitiveValue()
+        in_obj.ParseFromString(in_buf)
+
+        out_obj = test_example_pb2.RepeatedPrimitiveValue()
+        out_obj.ParseFromString(out_buf)
+
+        # Check that the deserialized objects are identical.
+        self.assertEqual(in_obj, out_obj)
+
+        # Check that the input and output serialized messages are identical.
+        # If we fail here, there is a difference in the serialized
+        # representation but the new serialization still parses. This could
+        # be harmless (a change in map ordering?) or it could be bad (e.g.
+        # loss of packing in the encoding).
+        self.assertEqual(in_buf, out_buf)
+
+  def testRoundtrip(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    in_bufs = [primitive.SerializeToString() for primitive in case.primitive]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.RepeatedPrimitiveValue', case.field)
+
+  def testRoundtripPacked(self):
+    with open(FLAGS.message_text_file, 'r') as fp:
+      case = text_format.Parse(fp.read(), test_example_pb2.TestCase())
+
+    # Now try with the packed serialization.
+    # We test the packed representations by loading the same test cases
+    # using PackedPrimitiveValue instead of RepeatedPrimitiveValue.
+    # To do this we rely on the text format being the same for packed and
+    # unpacked fields, and reparse the test message using the packed version
+    # of the proto.
+    in_bufs = [
+        # Note: float_format='.17g' is necessary to ensure preservation of
+        # doubles and floats in text format.
+        text_format.Parse(
+            text_format.MessageToString(
+                primitive, float_format='.17g'),
+            test_example_pb2.PackedPrimitiveValue()).SerializeToString()
+        for primitive in case.primitive
+    ]
+
+    # np.array silently truncates strings if you don't specify dtype=object.
+    in_bufs = np.reshape(np.array(in_bufs, dtype=object), list(case.shape))
+    return self._testRoundtrip(
+        in_bufs, 'tensorflow.contrib.proto.PackedPrimitiveValue', case.field)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
new file mode 100644
index 0000000000..b170f89c0f
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/minmax.TestCase.pbtxt
@@ -0,0 +1,161 @@
+primitive {
+  double_value: -1.7976931348623158e+308
+  double_value: 2.2250738585072014e-308
+  double_value: 1.7976931348623158e+308
+  float_value: -3.402823466e+38
+  float_value: 1.175494351e-38
+  float_value: 3.402823466e+38
+  int64_value: -9223372036854775808
+  int64_value: 9223372036854775807
+  uint64_value: 0
+  uint64_value: 18446744073709551615
+  int32_value: -2147483648
+  int32_value: 2147483647
+  fixed64_value: 0
+  fixed64_value: 18446744073709551615
+  fixed32_value: 0
+  fixed32_value: 4294967295
+  bool_value: false
+  bool_value: true
+  string_value: ""
+  string_value: "I refer to the infinite."
+  uint32_value: 0
+  uint32_value: 4294967295
+  sfixed32_value: -2147483648
+  sfixed32_value: 2147483647
+  sfixed64_value: -9223372036854775808
+  sfixed64_value: 9223372036854775807
+  sint32_value: -2147483648
+  sint32_value: 2147483647
+  sint64_value: -9223372036854775808
+  sint64_value: 9223372036854775807
+}
+shape: 1
+sizes: 3
+sizes: 3
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+sizes: 2
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: -1.7976931348623158e+308
+    double_value: 2.2250738585072014e-308
+    double_value: 1.7976931348623158e+308
+  }
+}
+field {
+  name: "float_value"
+  dtype: DT_FLOAT
+  expected {
+    float_value: -3.402823466e+38
+    float_value: 1.175494351e-38
+    float_value: 3.402823466e+38
+  }
+}
+field {
+  name: "int64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "uint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1
+  }
+}
+field {
+  name: "int32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "fixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 0
+    int64_value: -1  # unsigned is 18446744073709551615
+  }
+}
+field {
+  name: "fixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: false
+    bool_value: true
+  }
+}
+field {
+  name: "string_value"
+  dtype: DT_STRING
+  expected {
+    string_value: ""
+    string_value: "I refer to the infinite."
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: 0
+    int32_value: -1  # unsigned is 4294967295
+  }
+}
+field {
+  name: "sfixed32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sfixed64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
+field {
+  name: "sint32_value"
+  dtype: DT_INT32
+  expected {
+    int32_value: -2147483648
+    int32_value: 2147483647
+  }
+}
+field {
+  name: "sint64_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: -9223372036854775808
+    int64_value: 9223372036854775807
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
new file mode 100644
index 0000000000..c664e52851
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/nested.TestCase.pbtxt
@@ -0,0 +1,16 @@
+primitive {
+  message_value {
+    double_value: 23.5
+  }
+}
+shape: 1
+sizes: 1
+field {
+  name: "message_value"
+  dtype: DT_STRING
+  expected {
+    message_value {
+      double_value: 23.5
+    }
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
new file mode 100644
index 0000000000..125651d7ea
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/optional.TestCase.pbtxt
@@ -0,0 +1,20 @@
+primitive {
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 0
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 0.0
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
new file mode 100644
index 0000000000..db7555bf2d
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  fixed32_value: 4294967295
+  uint32_value: 4294967295
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "fixed32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
+field {
+  name: "uint32_value"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
new file mode 100644
index 0000000000..61c7ac53f7
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/ragged.TestCase.pbtxt
@@ -0,0 +1,32 @@
+primitive {
+  double_value: 23.5
+  double_value: 123.0
+  bool_value: true
+}
+primitive {
+  double_value: 3.1
+  bool_value: false
+}
+shape: 2
+sizes: 2
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 123.0
+    double_value: 3.1
+    double_value: 0.0
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
new file mode 100644
index 0000000000..f4828076d5
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/shaped_batch.TestCase.pbtxt
@@ -0,0 +1,62 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+primitive {
+  double_value: 44.0
+  bool_value: false
+}
+primitive {
+  double_value: 3.14159
+  bool_value: true
+}
+primitive {
+  double_value: 1.414
+  bool_value: true
+}
+primitive {
+  double_value: -32.2
+  bool_value: false
+}
+primitive {
+  double_value: 0.0001
+  bool_value: true
+}
+shape: 3
+shape: 2
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+    double_value: 44.0
+    double_value: 3.14159
+    double_value: 1.414
+    double_value: -32.2
+    double_value: 0.0001
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+    bool_value: false
+    bool_value: true
+    bool_value: true
+    bool_value: false
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
new file mode 100644
index 0000000000..dc20ac147b
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/simple.TestCase.pbtxt
@@ -0,0 +1,21 @@
+primitive {
+  double_value: 23.5
+  bool_value: true
+}
+shape: 1
+sizes: 1
+sizes: 1
+field {
+  name: "double_value"
+  dtype: DT_DOUBLE
+  expected {
+    double_value: 23.5
+  }
+}
+field {
+  name: "bool_value"
+  dtype: DT_BOOL
+  expected {
+    bool_value: true
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_case.py b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
new file mode 100644
index 0000000000..b95202c5df
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_case.py
@@ -0,0 +1,35 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Test case base for testing proto operations."""
+
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+from tensorflow.python.platform import test
+
+
+class ProtoOpTestCase(test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(ProtoOpTestCase, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..dc495034ff
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -0,0 +1,149 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.proto;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 376644718f..a0bae23a7c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -74,6 +74,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
-- 
GitLab


From d995be2debded727f2b99bb87c0d209604a5bb4b Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Mon, 16 Apr 2018 14:47:31 -0700
Subject: [PATCH 0986/1262] Porting tests for `rpc_op` to OS.

PiperOrigin-RevId: 193102564
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   3 +-
 tensorflow/contrib/rpc/BUILD                  |  16 +
 .../contrib/rpc/python/kernel_tests/BUILD     |  80 +++++
 .../rpc/python/kernel_tests/rpc_op_test.py    |  71 ++++
 .../python/kernel_tests/rpc_op_test_base.py   | 336 ++++++++++++++++++
 .../kernel_tests/rpc_op_test_servicer.py      | 101 ++++++
 .../python/kernel_tests/test_example.proto    | 171 +++++++++
 .../core/platform/default/build_config.bzl    |  86 ++++-
 tensorflow/tools/pip_package/BUILD            |   1 +
 tensorflow/workspace.bzl                      |   4 +
 12 files changed, 867 insertions(+), 4 deletions(-)
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/BUILD
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
 create mode 100644 tensorflow/contrib/rpc/python/kernel_tests/test_example.proto

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index ae68f4aec4..7e47516550 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -87,6 +87,7 @@ py_library(
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/rpc",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
         "//tensorflow/contrib/signal:signal_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e27ece8fa5..36cc5144d0 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -71,6 +71,7 @@ from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
+from tensorflow.contrib import rpc
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
 from tensorflow.contrib import signal
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 21f59d2563..f6aaf41f73 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -347,7 +347,8 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
-GENERATE_PYTHON_OP_LIB("rpc_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
index 597f18c771..dbd311a276 100644
--- a/tensorflow/contrib/rpc/BUILD
+++ b/tensorflow/contrib/rpc/BUILD
@@ -4,6 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
 py_library(
     name = "rpc",
     srcs = [
@@ -11,3 +13,17 @@ py_library(
     ],
     deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
 )
+
+py_library(
+    name = "rpc_pip",
+    data = if_static(
+        [],
+        otherwise = ["//tensorflow/contrib/rpc/python/kernel_tests:libtestexample.so"],
+    ),
+    deps = [
+        ":rpc",
+        "//tensorflow/contrib/rpc/python/kernel_tests:py_test_deps",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_base",
+        "//tensorflow/contrib/rpc/python/kernel_tests:rpc_op_test_servicer",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
new file mode 100644
index 0000000000..2311c15a68
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -0,0 +1,80 @@
+# TODO(b/76425722): Port everything in here to OS (currently excluded).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+# Placeholder for loading internal BUILD rule.
+
+tf_proto_library(
+    name = "test_example_proto",
+    srcs = ["test_example.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/core:protos_all"],
+)
+
+py_library(
+    name = "py_test_deps",
+    deps = [":test_example_proto_py"],
+)
+
+py_library(
+    name = "rpc_op_test_base",
+    srcs = ["rpc_op_test_base.py"],
+    deps = [
+        ":test_example_proto_py",
+        "//tensorflow/contrib/proto",
+        "//tensorflow/contrib/rpc",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "rpc_op_test_servicer",
+    srcs = ["rpc_op_test_servicer.py"],
+    deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_cc_shared_object(
+    name = "libtestexample.so",
+    linkstatic = 1,
+    deps = [
+        ":test_example_proto_cc",
+    ],
+)
+
+tf_py_test(
+    name = "rpc_op_test",
+    size = "small",
+    srcs = ["rpc_op_test.py"],
+    additional_deps = [
+        ":py_test_deps",
+        ":rpc_op_test_base",
+        ":rpc_op_test_servicer",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+    ],
+    data = if_static(
+        [],
+        otherwise = [":libtestexample.so"],
+    ),
+    tags = [
+        "no_pip",  # TODO(b/78026780)
+        "no_windows",  # TODO(b/78028010)
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
new file mode 100644
index 0000000000..e2e0dbc7a2
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for RpcOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ctypes as ct
+import os
+
+import grpc
+from grpc.framework.foundation import logging_pool
+import portpicker
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_servicer
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+from tensorflow.python.platform import test
+
+
+class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
+  _protocol = 'grpc'
+
+  invalid_method_string = 'Method not found'
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(RpcOpTest, self).__init__(methodName)
+    lib = os.path.join(os.path.dirname(__file__), 'libtestexample.so')
+    if os.path.isfile(lib):
+      ct.cdll.LoadLibrary(lib)
+
+  def get_method_name(self, suffix):
+    return '/tensorflow.contrib.rpc.TestCaseService/%s' % suffix
+
+  def setUp(self):
+    super(RpcOpTest, self).setUp()
+
+    service_port = portpicker.pick_unused_port()
+
+    server = grpc.server(logging_pool.pool(max_workers=25))
+    servicer = rpc_op_test_servicer.RpcOpTestServicer()
+    test_example_pb2_grpc.add_TestCaseServiceServicer_to_server(
+        servicer, server)
+    self._address = 'localhost:%d' % service_port
+    server.add_insecure_port(self._address)
+    server.start()
+    self._server = server
+
+  def tearDown(self):
+    # TODO(ebrevdo): Figure out why this sometimes times out.
+    #    self._service.ExitLoop()
+    #    self._service_thread.join()
+    # self._server.stop()
+    super(RpcOpTest, self).tearDown()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
new file mode 100644
index 0000000000..89f3ee1a1c
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -0,0 +1,336 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Base class for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2
+from tensorflow.contrib.rpc.python.ops import rpc_op
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+
+__all__ = ['I_WARNED_YOU', 'RpcOpTestBase']
+
+I_WARNED_YOU = 'I warned you!'
+
+
+class RpcOpTestBase(object):
+  # pylint: disable=missing-docstring,invalid-name
+  """Base class for RpcOp tests."""
+
+  def get_method_name(self, suffix):
+    raise NotImplementedError
+
+  def rpc(self, *args, **kwargs):
+    return rpc_op.rpc(*args, protocol=self._protocol, **kwargs)
+
+  def try_rpc(self, *args, **kwargs):
+    return rpc_op.try_rpc(*args, protocol=self._protocol, **kwargs)
+
+  def testScalarHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, ())
+      response_values = sess.run(response_tensors)
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+
+  def testScalarHostPortTryRpc(self):
+    with self.test_session() as sess:
+      request_tensors = (
+          test_example_pb2.TestCase(shape=[1, 2, 3]).SerializeToString())
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(status_code.shape, ())
+      self.assertEqual(status_message.shape, ())
+      self.assertEqual(response_tensors.shape, ())
+      response_values, status_code_values, status_message_values = (
+          sess.run((response_tensors, status_code, status_message)))
+    response_message = test_example_pb2.TestCase()
+    self.assertTrue(response_message.ParseFromString(response_values))
+    self.assertAllEqual([2, 3, 4], response_message.shape)
+    # For the base Rpc op, don't expect to get error status back.
+    self.assertEqual(errors.OK, status_code_values)
+    self.assertEqual(b'', status_message_values)
+
+  def testEmptyHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = []
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertAllEqual(response_tensors.shape, [0])
+      response_values = sess.run(response_tensors)
+    self.assertAllEqual(response_values.shape, [0])
+
+  def testInvalidAddresses(self):
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method='/InvalidService.IncrementTestShapes',
+                address=self._address,
+                request=''))
+
+      with self.assertRaisesOpError(self.invalid_method_string):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('InvalidMethodName'),
+                address=self._address,
+                request=''))
+
+      # This also covers the case of address=''
+      # and address='localhost:293874293874'
+      with self.assertRaises(errors.UnavailableError):
+        sess.run(
+            self.rpc(
+                method=self.get_method_name('IncrementTestShapes'),
+                address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@',
+                request=''))
+
+      # Test invalid method with the TryRpc op
+      _, status_code_value, status_message_value = sess.run(
+          self.try_rpc(
+              method=self.get_method_name('InvalidMethodName'),
+              address=self._address,
+              request=''))
+      self.assertEqual(errors.UNIMPLEMENTED, status_code_value)
+      self.assertTrue(
+          self.invalid_method_string in status_message_value.decode('ascii'))
+
+  def testAlwaysFailingMethod(self):
+    with self.test_session() as sess:
+      response_tensors = self.rpc(
+          method=self.get_method_name('AlwaysFailWithInvalidArgument'),
+          address=self._address,
+          request='')
+      self.assertEqual(response_tensors.shape, ())
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+  def testSometimesFailingMethodWithManyRequests(self):
+    with self.test_session() as sess:
+      # Fail hard by default.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      with self.assertRaisesOpError(I_WARNED_YOU):
+        sess.run(response_tensors)
+
+      # Don't fail hard, use TryRpc - return the failing status instead.
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesFailWithInvalidArgument'),
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values, status_message_values = sess.run((status_code,
+                                                            status_message))
+      self.assertTrue([
+          x in (errors.OK, errors.INVALID_ARGUMENT) for x in status_code_values
+      ])
+      expected_message_values = np.where(
+          status_code_values == errors.INVALID_ARGUMENT,
+          I_WARNED_YOU.encode('ascii'), b'')
+      self.assertAllEqual(expected_message_values, status_message_values)
+
+  def testVecHostPortRpc(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      self.assertEqual(response_tensors.shape, (20,))
+      response_values = sess.run(response_tensors)
+    self.assertEqual(response_values.shape, (20,))
+    for i in range(20):
+      response_message = test_example_pb2.TestCase()
+      self.assertTrue(response_message.ParseFromString(response_values[i]))
+      self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortManyParallelRpcs(self):
+    with self.test_session() as sess:
+      request_tensors = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      many_response_tensors = [
+          self.rpc(
+              method=self.get_method_name('IncrementTestShapes'),
+              address=self._address,
+              request=request_tensors) for _ in range(10)
+      ]
+      # Launch parallel 10 calls to the RpcOp, each containing
+      # 20 rpc requests.
+      many_response_values = sess.run(many_response_tensors)
+    self.assertEqual(10, len(many_response_values))
+    for response_values in many_response_values:
+      self.assertEqual(response_values.shape, (20,))
+      for i in range(20):
+        response_message = test_example_pb2.TestCase()
+        self.assertTrue(response_message.ParseFromString(response_values[i]))
+        self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
+
+  def testVecHostPortRpcUsingEncodeAndDecodeProto(self):
+    with self.test_session() as sess:
+      request_tensors = encode_proto_op.encode_proto(
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          sizes=[[3]] * 20,
+          values=[
+              [[i, i + 1, i + 2] for i in range(20)],
+          ])
+      response_tensor_strings = self.rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=self._address,
+          request=request_tensors)
+      _, (response_shape,) = decode_proto_op.decode_proto(
+          bytes=response_tensor_strings,
+          message_type='tensorflow.contrib.rpc.TestCase',
+          field_names=['shape'],
+          output_types=[dtypes.int32])
+      response_shape_values = sess.run(response_shape)
+    self.assertAllEqual([[i + 1, i + 2, i + 3]
+                         for i in range(20)], response_shape_values)
+
+  def testVecHostPortRpcCancelsUponSessionTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          request=request_tensors)
+      for timeout_ms in [1, 500, 1000]:
+        options = config_pb2.RunOptions(timeout_in_ms=timeout_ms)
+        with self.assertRaises((errors.UnavailableError,
+                                errors.DeadlineExceededError)):
+          sess.run(response_tensors, options=options)
+
+  def testVecHostPortRpcCancelsUponConfiguredTimeOutWhenSleepingForever(self):
+    with self.test_session() as sess:
+      request_tensors = [''] * 25  # This will launch 25 RPC requests.
+      response_tensors = self.rpc(
+          method=self.get_method_name('SleepForever'),
+          address=self._address,
+          timeout_in_ms=1000,
+          request=request_tensors)
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(response_tensors)
+
+  def testTryRpcPropagatesDeadlineErrorWithSometimesTimingOutRequests(self):
+    with self.test_session() as sess:
+      response_tensors, status_code, status_message = self.try_rpc(
+          method=self.get_method_name('SometimesSleepForever'),
+          timeout_in_ms=1000,
+          address=self._address,
+          request=[''] * 20)
+      self.assertEqual(response_tensors.shape, (20,))
+      self.assertEqual(status_code.shape, (20,))
+      self.assertEqual(status_message.shape, (20,))
+      status_code_values = sess.run(status_code)
+      self.assertTrue([
+          x in (errors.OK, errors.DEADLINE_EXCEEDED) for x in status_code_values
+      ])
+
+  def testTryRpcWithMultipleAddressesSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleMethodsSingleRequest(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      methods = flatten(
+          [[self.get_method_name('IncrementTestShapes'), 'InvalidMethodName']
+           for _ in range(10)])
+      request = test_example_pb2.TestCase(shape=[0, 1, 2]).SerializeToString()
+      response_tensors, status_code, _ = self.try_rpc(
+          method=methods, address=self._address, request=request)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNIMPLEMENTED] for _ in range(10)),
+          status_code_values)
+      for i in range(10):
+        self.assertTrue(response_tensors_values[2 * i])
+        self.assertFalse(response_tensors_values[2 * i + 1])
+
+  def testTryRpcWithMultipleAddressesAndRequests(self):
+    flatten = lambda x: list(itertools.chain.from_iterable(x))
+    with self.test_session() as sess:
+      addresses = flatten([[
+          self._address, 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@'
+      ] for _ in range(10)])
+      requests = [
+          test_example_pb2.TestCase(
+              shape=[i, i + 1, i + 2]).SerializeToString() for i in range(20)
+      ]
+      response_tensors, status_code, _ = self.try_rpc(
+          method=self.get_method_name('IncrementTestShapes'),
+          address=addresses,
+          request=requests)
+      response_tensors_values, status_code_values = sess.run((response_tensors,
+                                                              status_code))
+      self.assertAllEqual(
+          flatten([errors.OK, errors.UNAVAILABLE] for _ in range(10)),
+          status_code_values)
+      for i in range(20):
+        if i % 2 == 1:
+          self.assertFalse(response_tensors_values[i])
+        else:
+          response_message = test_example_pb2.TestCase()
+          self.assertTrue(
+              response_message.ParseFromString(response_tensors_values[i]))
+          self.assertAllEqual([i + 1, i + 2, i + 3], response_message.shape)
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
new file mode 100644
index 0000000000..7cbd636cb1
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_servicer.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Test servicer for RpcOp tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import time
+
+import grpc
+
+from tensorflow.contrib.rpc.python.kernel_tests import rpc_op_test_base
+from tensorflow.contrib.rpc.python.kernel_tests import test_example_pb2_grpc
+
+
+class RpcOpTestServicer(test_example_pb2_grpc.TestCaseServiceServicer):
+  """Test servicer for RpcOp tests."""
+
+  def IncrementTestShapes(self, request, context):
+    """Increment the entries in the shape attribute of request.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    for i in range(len(request.shape)):
+      request.shape[i] += 1
+    return request
+
+  def AlwaysFailWithInvalidArgument(self, request, context):
+    """Always fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    del request
+    context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+    context.set_details(rpc_op_test_base.I_WARNED_YOU)
+
+  def SometimesFailWithInvalidArgument(self, request, context):
+    """Sometimes fails with an InvalidArgument status.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+      context.set_details(rpc_op_test_base.I_WARNED_YOU)
+    return request
+
+  def SleepForever(self, request, context):
+    """Sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    # TODO(ebrevdo): Make this async wait like the stubby version.
+    time.sleep(5)
+
+  def SometimesSleepForever(self, request, context):
+    """Sometimes sleeps forever.
+
+    Args:
+      request: input TestCase.
+      context: the rpc context.
+
+    Returns:
+      output TestCase.
+    """
+    if random.randint(0, 1) == 1:
+      time.sleep(5)
+    return request
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
new file mode 100644
index 0000000000..96f4550f62
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/kernel_tests/test_example.proto
@@ -0,0 +1,171 @@
+// Test description and protos to work with it.
+//
+// Many of the protos in this file are for unit tests that haven't been written yet.
+
+syntax = "proto2";
+
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.contrib.rpc;
+
+// A TestCase holds a proto and a bunch of assertions
+// about how it should decode.
+message TestCase {
+  // A batch of primitives to be serialized and decoded.
+  repeated RepeatedPrimitiveValue primitive = 1;
+  // The shape of the batch.
+  repeated int32 shape = 2;
+  // Expected sizes for each field.
+  repeated int32 sizes = 3;
+  // Expected values for each field.
+  repeated FieldSpec field = 4;
+};
+
+service TestCaseService {
+  // Copy input, and increment each entry in 'shape' by 1.
+  rpc IncrementTestShapes(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever.
+  rpc SleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Sleep forever 50% of the time, return immediately the other 50%.
+  rpc SometimesSleepForever(TestCase) returns (TestCase) {
+  }
+
+  // Always fails with InvalidArgument.
+  rpc AlwaysFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+
+  // Fails with InvalidArgument 50% of the time.
+  rpc SometimesFailWithInvalidArgument(TestCase) returns (TestCase) {
+  }
+};
+
+// FieldSpec describes the expected output for a single field.
+message FieldSpec {
+  optional string name = 1;
+  optional tensorflow.DataType dtype = 2;
+  optional RepeatedPrimitiveValue expected = 3;
+};
+
+message TestValue {
+  optional PrimitiveValue primitive_value = 1;
+  optional EnumValue enum_value = 2;
+  optional MessageValue message_value = 3;
+  optional RepeatedMessageValue repeated_message_value = 4;
+  optional RepeatedPrimitiveValue repeated_primitive_value = 6;
+}
+
+message PrimitiveValue {
+  optional double double_value = 1;
+  optional float float_value = 2;
+  optional int64 int64_value = 3;
+  optional uint64 uint64_value = 4;
+  optional int32 int32_value = 5;
+  optional fixed64 fixed64_value = 6;
+  optional fixed32 fixed32_value = 7;
+  optional bool bool_value = 8;
+  optional string string_value = 9;
+  optional bytes bytes_value = 12;
+  optional uint32 uint32_value = 13;
+  optional sfixed32 sfixed32_value = 15;
+  optional sfixed64 sfixed64_value = 16;
+  optional sint32 sint32_value = 17;
+  optional sint64 sint64_value = 18;
+}
+
+// NOTE: This definition must be kept in sync with PackedPrimitiveValue.
+message RepeatedPrimitiveValue {
+  repeated double double_value = 1;
+  repeated float float_value = 2;
+  repeated int64 int64_value = 3;
+  repeated uint64 uint64_value = 4;
+  repeated int32 int32_value = 5;
+  repeated fixed64 fixed64_value = 6;
+  repeated fixed32 fixed32_value = 7;
+  repeated bool bool_value = 8;
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13;
+  repeated sfixed32 sfixed32_value = 15;
+  repeated sfixed64 sfixed64_value = 16;
+  repeated sint32 sint32_value = 17;
+  repeated sint64 sint64_value = 18;
+  repeated PrimitiveValue message_value = 19;
+}
+
+// A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
+// in the text format, but the binary serializion is different.
+// We test the packed representations by loading the same test cases
+// using this definition instead of RepeatedPrimitiveValue.
+// NOTE: This definition must be kept in sync with RepeatedPrimitiveValue
+// in every way except the packed=true declaration.
+message PackedPrimitiveValue {
+  repeated double double_value = 1 [packed = true];
+  repeated float float_value = 2 [packed = true];
+  repeated int64 int64_value = 3 [packed = true];
+  repeated uint64 uint64_value = 4 [packed = true];
+  repeated int32 int32_value = 5 [packed = true];
+  repeated fixed64 fixed64_value = 6 [packed = true];
+  repeated fixed32 fixed32_value = 7 [packed = true];
+  repeated bool bool_value = 8 [packed = true];
+  repeated string string_value = 9;
+  repeated bytes bytes_value = 12;
+  repeated uint32 uint32_value = 13 [packed = true];
+  repeated sfixed32 sfixed32_value = 15 [packed = true];
+  repeated sfixed64 sfixed64_value = 16 [packed = true];
+  repeated sint32 sint32_value = 17 [packed = true];
+  repeated sint64 sint64_value = 18 [packed = true];
+  repeated PrimitiveValue message_value = 19;
+}
+
+message EnumValue {
+  enum Color {
+    RED = 0;
+    ORANGE = 1;
+    YELLOW = 2;
+    GREEN = 3;
+    BLUE = 4;
+    INDIGO = 5;
+    VIOLET = 6;
+  };
+  optional Color enum_value = 14;
+  repeated Color repeated_enum_value = 15;
+}
+
+
+message InnerMessageValue {
+  optional float float_value = 2;
+  repeated bytes bytes_values = 8;
+}
+
+message MiddleMessageValue {
+  repeated int32 int32_values = 5;
+  optional InnerMessageValue message_value = 11;
+  optional uint32 uint32_value = 13;
+}
+
+message MessageValue {
+  optional double double_value = 1;
+  optional MiddleMessageValue message_value = 11;
+}
+
+message RepeatedMessageValue {
+  message NestedMessageValue {
+    optional float float_value = 2;
+    repeated bytes bytes_values = 8;
+  }
+
+  repeated NestedMessageValue message_values = 11;
+}
+
+// Message containing fields with field numbers higher than any field above. An
+// instance of this message is prepended to each binary message in the test to
+// exercise the code path that handles fields encoded out of order of field
+// number.
+message ExtraFields {
+  optional string string_value = 1776;
+  optional bool bool_value = 1777;
+}
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e01e076bcf..a43f5745c0 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,7 +1,6 @@
 # Platform-specific build configurations.
 
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
-load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
 load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
@@ -110,6 +109,12 @@ def _proto_cc_srcs(srcs, use_grpc_plugin=False):
     ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
   return ret
 
+def _proto_py_outs(srcs, use_grpc_plugin=False):
+  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+  if use_grpc_plugin:
+    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+  return ret
+
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
@@ -212,6 +217,80 @@ def cc_proto_library(
       hdrs=gen_hdrs,
       **kargs)
 
+# Re-defined protocol buffer rule to bring in the change introduced in commit
+# https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
+# which was not part of a stable protobuf release in 04/2018.
+# TODO(jsimsa): Remove this once the protobuf dependency version is updated
+# to include the above commit.
+def py_proto_library(
+        name,
+        srcs=[],
+        deps=[],
+        py_libs=[],
+        py_extra_srcs=[],
+        include=None,
+        default_runtime="@protobuf_archive//:protobuf_python",
+        protoc="@protobuf_archive//:protoc",
+        use_grpc_plugin=False,
+        **kargs):
+  """Bazel rule to create a Python protobuf library from proto source files
+
+  NOTE: the rule is only an internal workaround to generate protos. The
+  interface may change and the rule may be removed when bazel has introduced
+  the native rule.
+
+  Args:
+    name: the name of the py_proto_library.
+    srcs: the .proto files of the py_proto_library.
+    deps: a list of dependency labels; must be py_proto_library.
+    py_libs: a list of other py_library targets depended by the generated
+        py_library.
+    py_extra_srcs: extra source files that will be added to the output
+        py_library. This attribute is used for internal bootstrapping.
+    include: a string indicating the include path of the .proto files.
+    default_runtime: the implicitly default runtime which will be depended on by
+        the generated py_library target.
+    protoc: the label of the protocol compiler to generate the sources.
+    use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+        when processing the proto files.
+    **kargs: other keyword arguments that are passed to cc_library.
+  """
+  outs = _proto_py_outs(srcs, use_grpc_plugin)
+
+  includes = []
+  if include != None:
+    includes = [include]
+
+  grpc_python_plugin = None
+  if use_grpc_plugin:
+    grpc_python_plugin = "//external:grpc_python_plugin"
+    # Note: Generated grpc code depends on Python grpc module. This dependency
+    # is not explicitly listed in py_libs. Instead, host system is assumed to
+    # have grpc installed.
+
+  proto_gen(
+      name=name + "_genproto",
+      srcs=srcs,
+      deps=[s + "_genproto" for s in deps],
+      includes=includes,
+      protoc=protoc,
+      gen_py=1,
+      outs=outs,
+      visibility=["//visibility:public"],
+      plugin=grpc_python_plugin,
+      plugin_language="grpc"
+  )
+
+  if default_runtime and not default_runtime in py_libs + deps:
+    py_libs = py_libs + [default_runtime]
+
+  native.py_library(
+      name=name,
+      srcs=outs+py_extra_srcs,
+      deps=py_libs+deps,
+      imports=includes,
+      **kargs)
+
 def tf_proto_library_cc(name, srcs = [], has_services = None,
                         protodeps = [],
                         visibility = [], testonly = 0,
@@ -256,8 +335,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   )
 
 def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0,
-                        srcs_version="PY2AND3"):
+                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
   py_proto_library(
       name = name + "_py",
       srcs = srcs,
@@ -267,6 +345,7 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
       default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
+      use_grpc_plugin = use_grpc_plugin,
   )
 
 def tf_jspb_proto_library(**kwargs):
@@ -305,6 +384,7 @@ def tf_proto_library(name, srcs = [], has_services = None,
       srcs_version = "PY2AND3",
       testonly = testonly,
       visibility = visibility,
+      use_grpc_plugin = has_services,
   )
 
 def tf_additional_lib_hdrs(exclude = []):
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index a0bae23a7c..2ef105755f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -76,6 +76,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/predictor:predictor_pip",
     "//tensorflow/contrib/proto:proto_pip",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
     "//tensorflow/contrib/signal:test_util",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 018a395063..48728ac131 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -752,6 +752,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "grpc_cpp_plugin",
       actual = "@grpc//:grpc_cpp_plugin",
   )
+  native.bind(
+      name = "grpc_python_plugin",
+      actual = "@grpc//:grpc_python_plugin",
+  )
 
   # gRPC has three empty C++ functions which it wants the user to define
   # at build time. https://github.com/grpc/grpc/issues/13590
-- 
GitLab


From 113f102164e822aa15d1e875287009fef9d8b823 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 16 Apr 2018 12:56:14 -0700
Subject: [PATCH 0987/1262] boosted_trees: Make some regularizer/hyper-params
 as inputs instead of attributes.

PiperOrigin-RevId: 193085059
---
 ...tedTreesCalculateBestGainsPerFeature.pbtxt | 38 +++++++--------
 .../api_def_BoostedTreesPredict.pbtxt         |  6 ---
 .../api_def_BoostedTreesTrainingPredict.pbtxt |  6 ---
 .../api_def_BoostedTreesUpdateEnsemble.pbtxt  |  4 +-
 .../kernels/boosted_trees/prediction_ops.cc   | 16 +++----
 .../core/kernels/boosted_trees/stats_ops.cc   | 44 ++++++++++--------
 .../kernels/boosted_trees/training_ops.cc     | 19 ++++----
 tensorflow/core/ops/boosted_trees_ops.cc      | 36 +++++----------
 .../core/ops/compat/ops_history.v1.pbtxt      | 46 +++++++------------
 .../python/estimator/canned/boosted_trees.py  |  6 +--
 .../boosted_trees/prediction_ops_test.py      | 14 +-----
 11 files changed, 96 insertions(+), 139 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index 62876a293c..7f18c64574 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -11,6 +11,24 @@ END
     name: "stats_summary_list"
     description: <<END
 A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+l1 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+l2 regularization factor on leaf weights, per instance based.
+END
+  }
+  in_arg {
+    name: "tree_complexity"
+    description: <<END
+adjustment to the gain, per leaf based.
 END
   }
   out_arg {
@@ -41,24 +59,6 @@ END
     name: "right_node_contribs_list"
     description: <<END
 A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-END
-  }
-  attr {
-    name: "l1"
-    description: <<END
-l1 regularization factor on leaf weights, per instance based.
-END
-  }
-  attr {
-    name: "l2"
-    description: <<END
-l2 regularization factor on leaf weights, per instance based.
-END
-  }
-  attr {
-    name: "tree_complexity"
-    description: <<END
-adjustment to the gain, per leaf based.
 END
   }
   attr {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
\ No newline at end of file
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
index b23e77a1fa..60ad9b4640 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -25,12 +25,6 @@ END
     description: <<END
 scalar, dimension of the logits, to be used for partial logits
 shape.
-END
-  }
-  attr {
-    name: "max_depth"
-    description: <<END
-scalar, max depth of trees. To be used for parallelization costs.
 END
   }
   summary: "Runs multiple additive regression ensemble predictors on input instances and"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
index 7203d3cb58..f8a3639c9b 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -52,12 +52,6 @@ END
     description: <<END
 scalar, dimension of the logits, to be used for partial logits
 shape.
-END
-  }
-  attr {
-    name: "max_depth"
-    description: <<END
-scalar, max depth of trees. To be used for parallelization costs.
 END
   }
   summary: "Runs multiple additive regression ensemble predictors on input instances and"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
index 00f8953875..3cf486d087 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -51,13 +51,13 @@ of the feature's splits. Will be added to the previous node values to constitute
 the values of the right nodes.
 END
   }
-  attr {
+  in_arg {
     name: "max_depth"
     description: <<END
 Max depth of the tree to build.
 END
   }
-  attr {
+  in_arg {
     name: "learning_rate"
     description: <<END
 shrinkage const for each new tree.
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index b13a450546..1b5ce32b7b 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -50,7 +50,6 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
     OP_REQUIRES(context, logits_dimension_ == 1,
                 errors::InvalidArgument(
                     "Currently only one dimensional outputs are supported."));
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -155,9 +154,10 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           output_partial_logits(i, 0) = partial_all_logit;
         }
       };
-      // Assume we will not go over more than one full tree. 4 is a magic
-      // number.
-      const int64 cost = 4 * max_depth_;
+      // 30 is the magic number. The actual value might be a function of (the
+      // number of layers) * (cpu cycles spent on each layer), but this value
+      // would work for many cases. May be tuned later.
+      const int64 cost = 30;
       thread::ThreadPool* const worker_threads =
           context->device()->tensorflow_cpu_worker_threads()->workers;
       Shard(worker_threads->NumThreads(), worker_threads, batch_size,
@@ -168,7 +168,6 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
  private:
   int32 logits_dimension_;         // the size of the output prediction vector.
   int32 num_bucketized_features_;  // Indicates the number of features.
-  int32 max_depth_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BoostedTreesTrainingPredict").Device(DEVICE_CPU),
@@ -186,7 +185,6 @@ class BoostedTreesPredictOp : public OpKernel {
     OP_REQUIRES(context, logits_dimension_ == 1,
                 errors::InvalidArgument(
                     "Currently only one dimensional outputs are supported."));
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -243,7 +241,10 @@ class BoostedTreesPredictOp : public OpKernel {
         output_logits(i, 0) = tree_logit;
       }
     };
-    const int64 cost = (latest_tree + 1) * max_depth_;
+    // 10 is the magic number. The actual number might depend on (the number of
+    // layers in the trees) and (cpu cycles spent on each layer), but this
+    // value would work for many cases. May be tuned later.
+    const int64 cost = (latest_tree + 1) * 10;
     thread::ThreadPool* const worker_threads =
         context->device()->tensorflow_cpu_worker_threads()->workers;
     Shard(worker_threads->NumThreads(), worker_threads, batch_size,
@@ -254,7 +255,6 @@ class BoostedTreesPredictOp : public OpKernel {
   int32
       logits_dimension_;  // Indicates the size of the output prediction vector.
   int32 num_bucketized_features_;  // Indicates the number of features.
-  int32 max_depth_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BoostedTreesPredict").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 16e65cf284..40f50333d3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -29,10 +29,6 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   explicit BoostedTreesCalculateBestGainsPerFeatureOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("l1", &l1_));
-    OP_REQUIRES_OK(context, context->GetAttr("l2", &l2_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("tree_complexity", &tree_complexity_));
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
   }
@@ -54,6 +50,16 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     for (const auto& tensor : stats_summary_list) {
       stats_summary.emplace_back(tensor.tensor<float, 3>());
     }
+    const Tensor* l1_t;
+    OP_REQUIRES_OK(context, context->input("l1", &l1_t));
+    const auto l1 = l1_t->scalar<float>()();
+    const Tensor* l2_t;
+    OP_REQUIRES_OK(context, context->input("l2", &l2_t));
+    const auto l2 = l2_t->scalar<float>()();
+    const Tensor* tree_complexity_t;
+    OP_REQUIRES_OK(context,
+                   context->input("tree_complexity", &tree_complexity_t));
+    const auto tree_complexity = tree_complexity_t->scalar<float>()();
 
     // Allocate output lists of tensors:
     OpOutputList output_node_ids_list;
@@ -106,7 +112,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
         // Parent gain.
         float parent_gain;
         float unused;
-        CalculateWeightsAndGains(total_grad, total_hess, &unused, &parent_gain);
+        CalculateWeightsAndGains(total_grad, total_hess, l1, l2, &unused,
+                                 &parent_gain);
 
         for (int bucket = 0; bucket < num_buckets; ++bucket) {
           const float cum_grad_bucket = cum_grad[bucket];
@@ -114,13 +121,13 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
           // Left child.
           float contrib_for_left;
           float gain_for_left;
-          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket,
+          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket, l1, l2,
                                    &contrib_for_left, &gain_for_left);
           // Right child.
           float contrib_for_right;
           float gain_for_right;
           CalculateWeightsAndGains(total_grad - cum_grad_bucket,
-                                   total_hess - cum_hess_bucket,
+                                   total_hess - cum_hess_bucket, l1, l2,
                                    &contrib_for_right, &gain_for_right);
 
           if (gain_for_left + gain_for_right > best_gain) {
@@ -173,7 +180,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       for (int i = 0; i < num_nodes; ++i) {
         output_node_ids_vec(i) = output_node_ids[i];
         // Adjust the gains to penalize by tree complexity.
-        output_gains_vec(i) = output_gains[i] - tree_complexity_;
+        output_gains_vec(i) = output_gains[i] - tree_complexity;
         output_thresholds_vec(i) = output_thresholds[i];
         // Logits are 1-dimensional for now.
         // TODO(nponomareva): Consider multi-dimensional logits.
@@ -184,8 +191,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   }
 
  private:
-  void CalculateWeightsAndGains(const float g, const float h, float* weight,
-                                float* gain) {
+  void CalculateWeightsAndGains(const float g, const float h, const float l1,
+                                const float l2, float* weight, float* gain) {
     //
     // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
     // (g+l1*sgn(w))^2/(h+l2).
@@ -196,11 +203,11 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
     // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
     // For g from (-l1, l1), thus there is no solution => set to 0.
-    if (l1_ > 0) {
-      if (g > l1_) {
-        g_with_l1 -= l1_;
-      } else if (g < -l1_) {
-        g_with_l1 += l1_;
+    if (l1 > 0) {
+      if (g > l1) {
+        g_with_l1 -= l1;
+      } else if (g < -l1) {
+        g_with_l1 += l1;
       } else {
         *weight = 0.0;
         *gain = 0.0;
@@ -208,19 +215,16 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       }
     }
     // Apply L2 regularization.
-    if (h + l2_ <= kEps) {
+    if (h + l2 <= kEps) {
       // Avoid division by 0 or infinitesimal.
       *weight = 0;
       *gain = 0;
     } else {
-      *weight = -g_with_l1 / (h + l2_);
+      *weight = -g_with_l1 / (h + l2);
       *gain = -g_with_l1 * (*weight);
     }
   }
 
-  float l1_;
-  float l2_;
-  float tree_complexity_;
   int max_splits_;
   int num_features_;
 };
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 67cac14c52..a14fd4a133 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -43,8 +43,6 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
-    OP_REQUIRES_OK(context, context->GetAttr("learning_rate", &learning_rate_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
 
     int32 pruning_index;
@@ -79,8 +77,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
     const Tensor* feature_ids_t;
     OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+    const auto feature_ids = feature_ids_t->vec<int32>();
 
-    auto feature_ids = feature_ids_t->vec<int32>();
+    const Tensor* max_depth_t;
+    OP_REQUIRES_OK(context, context->input("max_depth", &max_depth_t));
+    const auto max_depth = max_depth_t->scalar<int32>()();
+
+    const Tensor* learning_rate_t;
+    OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    const auto learning_rate = learning_rate_t->scalar<float>()();
 
     // Find best splits for each active node.
     std::map<int32, SplitCandidate> best_splits;
@@ -125,10 +130,10 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       // For now assume that the weights vectors are one dimensional.
       // TODO(nponomareva): change here for multiclass.
       const float left_contrib =
-          learning_rate_ *
+          learning_rate *
           left_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
       const float right_contrib =
-          learning_rate_ *
+          learning_rate *
           right_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
 
       // unused.
@@ -145,7 +150,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
-      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth) {
         // If the tree is finalized, next growing will start from node 0;
         node_id_start = 0;
         node_id_end = 1;
@@ -216,8 +221,6 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
  private:
   int32 num_features_;
-  float learning_rate_;
-  int32 max_depth_;
   PruningMode pruning_mode_;
 };
 
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 8af4903418..4d74e6d63a 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -37,9 +37,9 @@ REGISTER_OP("IsBoostedTreesEnsembleInitialized")
 REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
     .Input("node_id_range: int32")
     .Input("stats_summary_list: num_features * float32")
-    .Attr("l1: float")
-    .Attr("l2: float")
-    .Attr("tree_complexity: float")
+    .Input("l1: float")
+    .Input("l2: float")
+    .Input("tree_complexity: float")
     .Attr("max_splits: int >= 1")
     .Attr("num_features: int >= 1")  // not passed but populated automatically.
     .Output("node_ids_list: num_features * int32")
@@ -51,19 +51,6 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
       // Confirms the rank of the inputs and sets the shape of the outputs.
       int max_splits;
       int num_features;
-      float l1, l2, tree_complexity;
-      TF_RETURN_IF_ERROR(c->GetAttr("l1", &l1));
-      if (l1 < 0) {
-        return errors::InvalidArgument("l1 must be non-negative.");
-      }
-      TF_RETURN_IF_ERROR(c->GetAttr("l2", &l2));
-      if (l2 < 0) {
-        return errors::InvalidArgument("l2 must be non-negative.");
-      }
-      TF_RETURN_IF_ERROR(c->GetAttr("tree_complexity", &tree_complexity));
-      if (tree_complexity < 0) {
-        return errors::InvalidArgument("Tree complexity must be non-negative.");
-      }
       TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
       TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
       shape_inference::ShapeHandle node_id_range_shape;
@@ -83,6 +70,12 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
         TF_RETURN_IF_ERROR(
             c->Merge(summary_shape_base, summary_shape, &unused_shape));
       }
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 1), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 2), 0, &unused_shape));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(num_features + 3), 0, &unused_shape));
       // Sets the output lists.
       std::vector<shape_inference::ShapeHandle> output_shapes_vec(
           num_features, c->MakeShape({-1}));
@@ -185,9 +178,8 @@ REGISTER_OP("BoostedTreesMakeStatsSummary")
 REGISTER_OP("BoostedTreesPredict")
     .Input("tree_ensemble_handle: resource")
     .Input("bucketized_features: num_bucketized_features * int32")
-    .Attr("num_bucketized_features: int >= 1")
+    .Attr("num_bucketized_features: int >= 1")  // Inferred.
     .Attr("logits_dimension: int")
-    .Attr("max_depth: int >= 1")
     .Output("logits: float")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle feature_shape;
@@ -229,7 +221,6 @@ REGISTER_OP("BoostedTreesTrainingPredict")
     .Input("bucketized_features: num_bucketized_features * int32")
     .Attr("num_bucketized_features: int >= 1")
     .Attr("logits_dimension: int")
-    .Attr("max_depth: int >= 1")
     .Output("partial_logits: float")
     .Output("tree_ids: int32")
     .Output("node_ids: int32")
@@ -239,9 +230,6 @@ REGISTER_OP("BoostedTreesTrainingPredict")
       TF_RETURN_IF_ERROR(
           c->GetAttr("num_bucketized_features", &num_bucketized_features));
 
-      int max_depth;
-      TF_RETURN_IF_ERROR(c->GetAttr("max_depth", &max_depth));
-
       shape_inference::ShapeHandle unused_input;
       for (int i = 0; i < num_bucketized_features; ++i) {
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 3), 1, &feature_shape));
@@ -273,8 +261,8 @@ REGISTER_OP("BoostedTreesUpdateEnsemble")
     .Input("thresholds: num_features * int32")
     .Input("left_node_contribs: num_features * float")
     .Input("right_node_contribs: num_features * float")
-    .Attr("max_depth: int >= 1")
-    .Attr("learning_rate: float")
+    .Input("max_depth: int32")
+    .Input("learning_rate: float")
     .Attr("pruning_mode: int >=0")
     .Attr("num_features: int >= 0")  // Inferred.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 2f6f588d2c..c627fee352 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10735,6 +10735,18 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "tree_complexity"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
@@ -10760,18 +10772,6 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
-    name: "l1"
-    type: "float"
-  }
-  attr {
-    name: "l2"
-    type: "float"
-  }
-  attr {
-    name: "tree_complexity"
-    type: "float"
-  }
   attr {
     name: "max_splits"
     type: "int"
@@ -10934,12 +10934,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -10999,12 +10993,6 @@ op {
     name: "logits_dimension"
     type: "int"
   }
-  attr {
-    name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   is_stateful: true
 }
 op {
@@ -11042,15 +11030,13 @@ op {
     type: DT_FLOAT
     number_attr: "num_features"
   }
-  attr {
+  input_arg {
     name: "max_depth"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "learning_rate"
-    type: "float"
+    type: DT_FLOAT
   }
   attr {
     name: "pruning_mode"
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 0ecc8c7089..d099d308f5 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -325,8 +325,7 @@ def _bt_model_fn(
           # so no local copy is needed; using tree_ensemble directly.
           tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
-          logits_dimension=head.logits_dimension,
-          max_depth=tree_hparams.max_depth)
+          logits_dimension=head.logits_dimension)
     else:
       if is_single_machine:
         local_tree_ensemble = tree_ensemble
@@ -361,8 +360,7 @@ def _bt_model_fn(
             cached_tree_ids=cached_tree_ids,
             cached_node_ids=cached_node_ids,
             bucketized_features=input_feature_list,
-            logits_dimension=head.logits_dimension,
-            max_depth=tree_hparams.max_depth)
+            logits_dimension=head.logits_dimension)
       logits = cached_logits + partial_logits
 
     # Create training graph.
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index d132f15e51..54f33f3360 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -49,7 +49,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -116,7 +115,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values],
@@ -189,7 +187,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=4,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -299,7 +296,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=4,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -429,7 +425,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=2,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -562,7 +557,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=3,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -705,7 +699,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=3,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -782,7 +775,6 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # Grow tree ensemble.
       predict_op = boosted_trees_ops.training_predict(
           tree_ensemble_handle,
-          max_depth=1,
           cached_tree_ids=cached_tree_ids,
           cached_node_ids=cached_node_ids,
           bucketized_features=[feature_0_values, feature_1_values],
@@ -905,8 +897,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1,
-          max_depth=2)
+          logits_dimension=1)
 
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
@@ -915,8 +906,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       predict_op = boosted_trees_ops.predict(
           tree_ensemble_handle,
           bucketized_features=[feature_0_values, feature_1_values],
-          logits_dimension=1,
-          max_depth=2)
+          logits_dimension=1)
 
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
-- 
GitLab


From 91129bbb3cbc01c7ecc776048988ae83ba50e3c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 16 Apr 2018 18:03:05 -0700
Subject: [PATCH 0988/1262] Adding min node weight regularization

PiperOrigin-RevId: 193131300
---
 .../python/estimator/boosted_trees.py         | 18 +++-
 ...tedTreesCalculateBestGainsPerFeature.pbtxt |  8 +-
 .../core/kernels/boosted_trees/stats_ops.cc   |  9 ++
 tensorflow/core/ops/boosted_trees_ops.cc      |  1 +
 .../core/ops/compat/ops_history.v1.pbtxt      |  4 +
 .../python/estimator/canned/boosted_trees.py  | 85 ++++++++++---------
 .../estimator/canned/boosted_trees_test.py    |  3 +-
 .../boosted_trees/stats_ops_test.py           | 51 +++++++++++
 ....estimator.-boosted-trees-classifier.pbtxt |  2 +-
 ...w.estimator.-boosted-trees-regressor.pbtxt |  2 +-
 10 files changed, 138 insertions(+), 45 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 314c54ed00..00356ce0ca 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -36,6 +36,7 @@ class _BoostedTreesEstimator(estimator.Estimator):
                l1_regularization=0.,
                l2_regularization=0.,
                tree_complexity=0.,
+               min_node_weight=0.,
                config=None):
     """Initializes a `BoostedTreesEstimator` instance.
 
@@ -65,13 +66,16 @@ class _BoostedTreesEstimator(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
     """
     # pylint:disable=protected-access
     # HParams for the model.
     tree_hparams = canned_boosted_trees._TreeHParams(
         n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+        tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return canned_boosted_trees._bt_model_fn(
@@ -96,6 +100,7 @@ def boosted_trees_classifier_train_in_memory(
     l1_regularization=0.,
     l2_regularization=0.,
     tree_complexity=0.,
+    min_node_weight=0.,
     config=None,
     train_hooks=None):
   """Trains a boosted tree classifier with in memory dataset.
@@ -162,6 +167,9 @@ def boosted_trees_classifier_train_in_memory(
     l2_regularization: regularization multiplier applied to the square weights
       of the tree leafs.
     tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
     config: `RunConfig` object to configure the runtime settings.
     train_hooks: a list of Hook instances to be passed to estimator.train().
 
@@ -184,7 +192,7 @@ def boosted_trees_classifier_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity)
+      tree_complexity, min_node_weight)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
@@ -220,6 +228,7 @@ def boosted_trees_regressor_train_in_memory(
     l1_regularization=0.,
     l2_regularization=0.,
     tree_complexity=0.,
+    min_node_weight=0.,
     config=None,
     train_hooks=None):
   """Trains a boosted tree regressor with in memory dataset.
@@ -279,6 +288,9 @@ def boosted_trees_regressor_train_in_memory(
     l2_regularization: regularization multiplier applied to the square weights
       of the tree leafs.
     tree_complexity: regularization factor to penalize trees with more leaves.
+    min_node_weight: minimum hessian a node must have for a split to be
+        considered. The value will be compared with sum(leaf_hessian)/
+        (batch_size * n_batches_per_layer).
     config: `RunConfig` object to configure the runtime settings.
     train_hooks: a list of Hook instances to be passed to estimator.train().
 
@@ -300,7 +312,7 @@ def boosted_trees_regressor_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity)
+      tree_complexity, min_node_weight)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index 7f18c64574..3f181e91ce 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -29,6 +29,12 @@ END
     name: "tree_complexity"
     description: <<END
 adjustment to the gain, per leaf based.
+END
+  }
+  in_arg {
+    name: "min_node_weight"
+    description: <<END
+mininum avg of hessians in a node before required for the node to be considered for splitting.
 END
   }
   out_arg {
@@ -84,4 +90,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 40f50333d3..6dfcd63ab3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -60,6 +60,10 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->input("tree_complexity", &tree_complexity_t));
     const auto tree_complexity = tree_complexity_t->scalar<float>()();
+    const Tensor* min_node_weight_t;
+    OP_REQUIRES_OK(context,
+                   context->input("min_node_weight", &min_node_weight_t));
+    const auto min_node_weight = min_node_weight_t->scalar<float>()();
 
     // Allocate output lists of tensors:
     OpOutputList output_node_ids_list;
@@ -105,6 +109,11 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
           cum_grad.push_back(total_grad);
           cum_hess.push_back(total_hess);
         }
+        // Check if node has enough of average hessian.
+        if (total_hess < min_node_weight) {
+          // Do not split the node because not enough avg hessian.
+          continue;
+        }
         float best_gain = std::numeric_limits<float>::lowest();
         float best_bucket = 0;
         float best_contrib_for_left = 0.0;
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 4d74e6d63a..88d6eaf819 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -40,6 +40,7 @@ REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
     .Input("l1: float")
     .Input("l2: float")
     .Input("tree_complexity: float")
+    .Input("min_node_weight: float")
     .Attr("max_splits: int >= 1")
     .Attr("num_features: int >= 1")  // not passed but populated automatically.
     .Output("node_ids_list: num_features * int32")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index c627fee352..4a24c44d69 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10747,6 +10747,10 @@ op {
     name: "tree_complexity"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index d099d308f5..536bd2bf81 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -40,9 +40,11 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
-_TreeHParams = collections.namedtuple(
-    'TreeHParams',
-    ['n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity'])
+# TODO(nponomareva): Reveal pruning params here.
+_TreeHParams = collections.namedtuple('TreeHParams', [
+    'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
+    'min_node_weight'
+])
 
 _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
 _HOLD_FOR_MULTI_DIM_SUPPORT = object()
@@ -397,6 +399,7 @@ def _bt_model_fn(
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
                  tree_complexity=tree_hparams.tree_complexity,
+                 min_node_weight=tree_hparams.min_node_weight,
                  max_splits=max_splits))
         grow_op = boosted_trees_ops.update_ensemble(
             # Confirm if local_tree_ensemble or tree_ensemble should be used.
@@ -515,21 +518,21 @@ def _create_regression_head(label_dimension, weight_column=None):
 class BoostedTreesClassifier(estimator.Estimator):
   """A Classifier for Tensorflow Boosted Trees models."""
 
-  def __init__(
-      self,
-      feature_columns,
-      n_batches_per_layer,
-      model_dir=None,
-      n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
-      weight_column=None,
-      label_vocabulary=None,
-      n_trees=100,
-      max_depth=6,
-      learning_rate=0.1,
-      l1_regularization=0.,
-      l2_regularization=0.,
-      tree_complexity=0.,
-      config=None):
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
+               weight_column=None,
+               label_vocabulary=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
     """Initializes a `BoostedTreesClassifier` instance.
 
     Example:
@@ -593,6 +596,9 @@ class BoostedTreesClassifier(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
 
     Raises:
@@ -606,9 +612,9 @@ class BoostedTreesClassifier(estimator.Estimator):
         n_classes, weight_column, label_vocabulary=label_vocabulary)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
@@ -630,20 +636,20 @@ class BoostedTreesClassifier(estimator.Estimator):
 class BoostedTreesRegressor(estimator.Estimator):
   """A Regressor for Tensorflow Boosted Trees models."""
 
-  def __init__(
-      self,
-      feature_columns,
-      n_batches_per_layer,
-      model_dir=None,
-      label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
-      weight_column=None,
-      n_trees=100,
-      max_depth=6,
-      learning_rate=0.1,
-      l1_regularization=0.,
-      l2_regularization=0.,
-      tree_complexity=0.,
-      config=None):
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               model_dir=None,
+               label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               min_node_weight=0.,
+               config=None):
     """Initializes a `BoostedTreesRegressor` instance.
 
     Example:
@@ -700,6 +706,9 @@ class BoostedTreesRegressor(estimator.Estimator):
       l2_regularization: regularization multiplier applied to the square weights
         of the tree leafs.
       tree_complexity: regularization factor to penalize trees with more leaves.
+      min_node_weight: min_node_weight: minimum hessian a node must have for a
+        split to be considered. The value will be compared with
+        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
       config: `RunConfig` object to configure the runtime settings.
 
     Raises:
@@ -712,9 +721,9 @@ class BoostedTreesRegressor(estimator.Estimator):
     head = _create_regression_head(label_dimension, weight_column)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity)
+    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
+                                l1_regularization, l2_regularization,
+                                tree_complexity, min_node_weight)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 7823ef8410..56e67a6707 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -188,7 +188,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         learning_rate=0.1,
         l1=0.,
         l2=0.01,
-        tree_complexity=0.)
+        tree_complexity=0.,
+        min_node_weight=0.)
 
   def _get_expected_ensembles_for_classification(self):
     first_round = """
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index 4d09cf94d4..f0bb84e69a 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -59,6 +59,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=0.0,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -106,6 +107,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=0.1,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -154,6 +156,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=l1,
           l2=0.0,
           tree_complexity=0.0,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
@@ -205,6 +208,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           l1=0.0,
           l2=l2,
           tree_complexity=tree_complexity,
+          min_node_weight=0,
           max_splits=max_splits)
 
       self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
@@ -220,6 +224,53 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
                           sess.run(right_node_contribs_list))
 
+  def testCalculateBestGainsWithMinNodeWEight(self):
+    """Testing Gain calculation without any regularization."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .036], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .68], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .6], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .03], [-.4, .05], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          min_node_weight=1,
+          max_splits=max_splits)
+
+      # We can't split node 1 on feature 1 and node 2 on feature 2 because of
+      # the min node weight.
+      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
+      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllClose([[[0.4852941]], [[-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-0.75]], [[-0.014925]]],
+                          sess.run(right_node_contribs_list))
+
   def testMakeStatsSummarySimple(self):
     """Simple test for MakeStatsSummary."""
     with self.test_session():
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index fd9be8c759..53a903c239 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 6b305be43f..ba17c90de2 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "evaluate"
-- 
GitLab


From cbf1fc8ba96a6e9d6a36a2d09a82ea1ff9af2752 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Mon, 16 Apr 2018 19:10:10 -0700
Subject: [PATCH 0989/1262] BoostedTreesEstimator in contrib: train_in_memory
 works with input_fns returning data.Dataset. Only one batch of data is
 expected, so dataset.batch() is disallowed, and dataset.repeat() will be
 ignored (only the first one would be used)

PiperOrigin-RevId: 193137094
---
 .../python/estimator/boosted_trees.py         |  38 +++-
 .../python/estimator/boosted_trees_test.py    |  80 +++++++--
 .../python/estimator/canned/boosted_trees.py  | 149 +++++++++++-----
 .../estimator/canned/boosted_trees_test.py    | 167 +++++++++++++++++-
 4 files changed, 362 insertions(+), 72 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 00356ce0ca..bd641014e9 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -17,10 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 
 
+def _validate_input_fn_and_repeat_dataset(train_input_fn):
+  """Validates whether the input_fn is valid, and repeat() if tf.Dataset."""
+  def _input_fn():
+    result_input_fn = train_input_fn()
+    if isinstance(result_input_fn, dataset_ops.Dataset):
+      return result_input_fn.repeat()
+    return result_input_fn
+
+  return _input_fn
+
+
 class _BoostedTreesEstimator(estimator.Estimator):
   """An Estimator for Tensorflow Boosted Trees models."""
 
@@ -113,10 +125,13 @@ def boosted_trees_classifier_train_in_memory(
   bucketized_feature_2 = bucketized_column(
     numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
 
-  def input_fn_train():
+  def train_input_fn():
     dataset = create-dataset-from-training-data
-    # Don't use repeat or cache, since it is assumed to be one epoch
-    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
     return dataset
 
   classifier = boosted_trees_classifier_train_in_memory(
@@ -210,7 +225,9 @@ def boosted_trees_classifier_train_in_memory(
   in_memory_classifier = estimator.Estimator(
       model_fn=_model_fn, model_dir=model_dir, config=config)
 
-  in_memory_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
+  in_memory_classifier.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
 
   return in_memory_classifier
   # pylint: enable=protected-access
@@ -241,10 +258,13 @@ def boosted_trees_regressor_train_in_memory(
   bucketized_feature_2 = bucketized_column(
     numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
 
-  def input_fn_train():
+  def train_input_fn():
     dataset = create-dataset-from-training-data
-    # Don't use repeat or cache, since it is assumed to be one epoch
-    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    # This is tf.data.Dataset of a tuple of feature dict and label.
+    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
+    #                     Dataset.from_tensors(label_array)))
+    # The returned Dataset shouldn't be batched.
+    # If Dataset repeats, only the first repetition would be used for training.
     return dataset
 
   regressor = boosted_trees_regressor_train_in_memory(
@@ -329,7 +349,9 @@ def boosted_trees_regressor_train_in_memory(
   in_memory_regressor = estimator.Estimator(
       model_fn=_model_fn, model_dir=model_dir, config=config)
 
-  in_memory_regressor.train(input_fn=train_input_fn, hooks=train_hooks)
+  in_memory_regressor.train(
+      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
+      hooks=train_hooks)
 
   return in_memory_regressor
   # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index eee5910687..76cbefe5e9 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
 from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -49,12 +50,24 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features = dict(FEATURES_DICT)
-    if is_classification:
-      labels = CLASSIFICATION_LABELS
-    else:
-      labels = REGRESSION_LABELS
-    return features, labels
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(features_dict),
+         dataset_ops.Dataset.from_tensors(labels)
+        ))
+    return ds
 
   return _input_fn
 
@@ -132,15 +145,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
     est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
-    # Check eval.
+    # Check evaluate and predict.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
     # Validate predictions.
@@ -148,24 +159,59 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
+  def testBinaryClassifierTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
     est = boosted_trees.boosted_trees_regressor_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
-    # Check eval.
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testRegressorTrainInMemoryWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
+        n_trees=1, max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Check evaluate and predict.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['average_loss'], 2.478283)
-    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
     self.assertAllClose(
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 536bd2bf81..085dace1b3 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
@@ -50,6 +51,32 @@ _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
 _HOLD_FOR_MULTI_DIM_SUPPORT = object()
 
 
+def _get_max_buckets(feature_columns):
+  """Gets the maximum number of buckets from feature_columns.
+
+  Args:
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    max_buckets: the maximum number of buckets among bucketized_columns.
+
+  Raises:
+    ValueError: when unsupported feature_columns are given.
+  """
+  if not feature_columns:
+    raise ValueError('feature_columns must be a non-empty list/set of '
+                     'tf.feature_column.')
+  max_buckets = 1
+  for fc in feature_columns:
+    if isinstance(fc, feature_column_lib._BucketizedColumn):  # pylint:disable=protected-access
+      # N boundaries creates (N+1) buckets.
+      max_buckets = max(max_buckets, len(fc.boundaries) + 1)
+    else:
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  return max_buckets
+
+
 def _get_transformed_features(features, feature_columns):
   """Gets the transformed features from features/feature_columns pair.
 
@@ -59,36 +86,31 @@ def _get_transformed_features(features, feature_columns):
 
   Returns:
     result_features: a list of the transformed features, sorted by the name.
-    num_buckets: the maximum number of buckets across bucketized_columns.
 
   Raises:
     ValueError: when unsupported features/columns are tried.
   """
-  num_buckets = 1
   # pylint:disable=protected-access
   for fc in feature_columns:
-    if isinstance(fc, feature_column_lib._BucketizedColumn):
-      # N boundaries creates (N+1) buckets.
-      num_buckets = max(num_buckets, len(fc.boundaries) + 1)
-    else:
+    if not isinstance(fc, feature_column_lib._BucketizedColumn):
       raise ValueError('For now, only bucketized_column is supported but '
                        'got: {}'.format(fc))
-  transformed = feature_column_lib._transform_features(features,
-                                                       feature_columns)
+  transformed_features = feature_column_lib._transform_features(
+      features, feature_columns)
   # pylint:enable=protected-access
   result_features = []
-  for column in sorted(transformed, key=lambda tc: tc.name):
+  for column in sorted(transformed_features, key=lambda tc: tc.name):
     source_name = column.source_column.name
-    squeezed_tensor = array_ops.squeeze(transformed[column], axis=1)
+    squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
     if len(squeezed_tensor.shape) > 1:
       raise ValueError('For now, only supports features equivalent to rank 1 '
                        'but column `{}` got: {}'.format(
                            source_name, features[source_name].shape))
     result_features.append(squeezed_tensor)
-  return result_features, num_buckets
+  return result_features
 
 
-def _keep_as_local_variable(tensor, name=None):
+def _local_variable(tensor, name=None):
   """Stores a tensor as a local Variable for faster read."""
   return variable_scope.variable(
       initial_value=tensor,
@@ -98,6 +120,48 @@ def _keep_as_local_variable(tensor, name=None):
       name=name)
 
 
+def _cache_transformed_features(features, feature_columns, batch_size):
+  """Transform features and cache, then returns (cached_features, cache_op)."""
+  num_features = len(feature_columns)
+  cached_features = [
+      _local_variable(
+          array_ops.zeros([batch_size], dtype=dtypes.int32),
+          name='cached_feature_{}'.format(i))
+      for i in range(num_features)
+  ]
+  are_features_cached = _local_variable(False, name='are_features_cached')
+
+  def cache_features_and_return():
+    """Caches transoformed features.
+
+    The intention is to hide get_transformed_features() from the graph by
+    caching the result except the first step, since bucketize operation
+    (inside get_transformed_features) is expensive.
+
+    Returns:
+      input_feature_list: a list of input features.
+      cache_flip_op: op to add to graph to make sure cache update is included to
+          the graph.
+    """
+
+    transformed_features = _get_transformed_features(features, feature_columns)
+    cached = [
+        state_ops.assign(cached_features[i], transformed_features[i])
+        for i in range(num_features)
+    ]
+    # TODO(youngheek): Try other combination of dependencies so that the
+    # function returns a single result, not a tuple.
+    with ops.control_dependencies(cached):
+      cache_flip_op = are_features_cached.assign(True)
+    return cached, cache_flip_op
+
+  input_feature_list, cache_flip_op = control_flow_ops.cond(
+      are_features_cached,
+      lambda: (cached_features, control_flow_ops.no_op()),
+      cache_features_and_return)
+  return input_feature_list, cache_flip_op
+
+
 class _CacheTrainingStatesUsingHashTable(object):
   """Caching logits, etc. using MutableHashTable."""
 
@@ -186,13 +250,13 @@ class _CacheTrainingStatesUsingVariables(object):
       logits_dimension: a constant (int) for the dimension of logits.
     """
     self._logits_dimension = logits_dimension
-    self._tree_ids = _keep_as_local_variable(
+    self._tree_ids = _local_variable(
         array_ops.zeros([batch_size], dtype=dtypes.int32),
         name='tree_ids_cache')
-    self._node_ids = _keep_as_local_variable(
+    self._node_ids = _local_variable(
         array_ops.zeros([batch_size], dtype=dtypes.int32),
         name='node_ids_cache')
-    self._logits = _keep_as_local_variable(
+    self._logits = _local_variable(
         array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
         name='logits_cache')
 
@@ -290,33 +354,38 @@ def _bt_model_fn(
         'When train_in_memory is enabled, input_fn should return the entire '
         'dataset as a single batch, and n_batches_per_layer should be set as '
         '1.')
+    if (not config.is_chief or config.num_worker_replicas > 1 or
+        config.num_ps_replicas > 0):
+      raise ValueError('train_in_memory is supported only for '
+                       'non-distributed training.')
   worker_device = control_flow_ops.no_op().device
   # maximum number of splits possible in the whole tree =2^(D-1)-1
   # TODO(youngheek): perhaps storage could be optimized by storing stats with
   # the dimension max_splits_per_layer, instead of max_splits (for the entire
   # tree).
   max_splits = (1 << tree_hparams.max_depth) - 1
+  max_buckets = _get_max_buckets(feature_columns)
+  train_op = []
   with ops.name_scope(name) as name:
     # Prepare.
     global_step = training_util.get_or_create_global_step()
-    input_feature_list, num_buckets = _get_transformed_features(
-        features, feature_columns)
-    if train_in_memory and mode == model_fn.ModeKeys.TRAIN:
-      input_feature_list = [
-          _keep_as_local_variable(feature) for feature in input_feature_list
-      ]
-    num_features = len(input_feature_list)
-
-    cache = None
-    if mode == model_fn.ModeKeys.TRAIN:
-      if train_in_memory and is_single_machine:  # maybe just train_in_memory?
-        batch_size = array_ops.shape(input_feature_list[0])[0]
-        cache = _CacheTrainingStatesUsingVariables(batch_size,
-                                                   head.logits_dimension)
-      elif example_id_column_name:
+    num_features = len(feature_columns)
+    # Extract input features and set up cache for training.
+    training_state_cache = None
+    if mode == model_fn.ModeKeys.TRAIN and train_in_memory:
+      # cache transformed features as well for in-memory training.
+      batch_size = array_ops.shape(labels)[0]
+      input_feature_list, input_cache_op = _cache_transformed_features(
+          features, feature_columns, batch_size)
+      train_op.append(input_cache_op)
+      training_state_cache = _CacheTrainingStatesUsingVariables(
+          batch_size, head.logits_dimension)
+    else:
+      input_feature_list = _get_transformed_features(features, feature_columns)
+      if mode == model_fn.ModeKeys.TRAIN and example_id_column_name:
         example_ids = features[example_id_column_name]
-        cache = _CacheTrainingStatesUsingHashTable(example_ids,
-                                                   head.logits_dimension)
+        training_state_cache = _CacheTrainingStatesUsingHashTable(
+            example_ids, head.logits_dimension)
 
     # Create Ensemble resources.
     tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
@@ -340,11 +409,12 @@ def _bt_model_fn(
         # TODO(soroush): Do partial updates if this becomes a bottleneck.
         ensemble_reload = local_tree_ensemble.deserialize(
             *tree_ensemble.serialize())
-      if cache:
-        cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
+      if training_state_cache:
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            training_state_cache.lookup())
       else:
         # Always start from the beginning when no cache is set up.
-        batch_size = array_ops.shape(input_feature_list[0])[0]
+        batch_size = array_ops.shape(labels)[0]
         cached_tree_ids, cached_node_ids, cached_logits = (
             array_ops.zeros([batch_size], dtype=dtypes.int32),
             array_ops.zeros([batch_size], dtype=dtypes.int32),
@@ -368,9 +438,8 @@ def _bt_model_fn(
     # Create training graph.
     def _train_op_fn(loss):
       """Run one training iteration."""
-      train_op = []
-      if cache:
-        train_op.append(cache.insert(tree_ids, node_ids, logits))
+      if training_state_cache:
+        train_op.append(training_state_cache.insert(tree_ids, node_ids, logits))
       if closed_form_grad_and_hess_fn:
         gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
       else:
@@ -385,7 +454,7 @@ def _bt_model_fn(
                   hessians=hessians,
                   bucketized_features_list=[input_feature_list[f]],
                   max_splits=max_splits,
-                  num_buckets=num_buckets),
+                  num_buckets=max_buckets),
               axis=0) for f in range(num_features)
       ]
 
@@ -422,7 +491,7 @@ def _bt_model_fn(
         summary_accumulator = data_flow_ops.ConditionalAccumulator(
             dtype=dtypes.float32,
             # The stats consist of gradients and hessians (the last dimension).
-            shape=[num_features, max_splits, num_buckets, 2],
+            shape=[num_features, max_splits, max_buckets, 2],
             shared_name='stats_summary_accumulator')
         apply_grad = summary_accumulator.apply_grad(
             array_ops.stack(stats_summary_list, axis=0), stamp_token)
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 56e67a6707..c8c52d3bc6 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator.canned import boosted_trees
@@ -58,13 +59,32 @@ def _make_train_input_fn(is_classification):
   """Makes train input_fn for classification/regression."""
 
   def _input_fn():
-    features = dict(FEATURES_DICT)
-    features[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
-    if is_classification:
-      labels = CLASSIFICATION_LABELS
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    return features_dict, labels
+
+  return _input_fn
+
+
+def _make_train_input_fn_dataset(is_classification, batch=None, repeat=None):
+  """Makes input_fn using Dataset."""
+
+  def _input_fn():
+    features_dict = dict(FEATURES_DICT)
+    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
+    if batch:
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensor_slices(features_dict),
+           dataset_ops.Dataset.from_tensor_slices(labels))).batch(batch)
     else:
-      labels = REGRESSION_LABELS
-    return features, labels
+      ds = dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensors(features_dict),
+           dataset_ops.Dataset.from_tensors(labels)))
+    # repeat indefinitely by default, or stop at the given step.
+    ds = ds.repeat(repeat)
+    return ds
 
   return _input_fn
 
@@ -125,9 +145,28 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
+  def testTrainClassifierWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
     predictions = list(est.predict(input_fn=predict_input_fn))
-    # All labels are correct.
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
@@ -166,12 +205,126 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
     self._assert_checkpoint(
         est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDataset(self):
+    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetBatch(self):
+    # The batch_size as the entire data size should yield the same result as
+    # dataset without batching.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=5)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetLargerBatch(self):
+    # The batch_size as the multiple of the entire data size should still yield
+    # the same result.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=15)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testTrainRegressorWithDatasetSmallerBatch(self):
+    # Even when using small batches, if (n_batches_per_layer * batch_size) makes
+    # the same entire data size, the result should be the same.
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, batch=1)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
 
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=5,
+        n_trees=1,
+        max_depth=5)
+    # Train stops after (n_batches_per_layer * n_trees * max_depth) steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=25, finalized_trees=1, attempted_layers=5)
+    # 5 batches = one epoch.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=5)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     predictions = list(est.predict(input_fn=predict_input_fn))
     self.assertAllClose(
         [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
         [pred['predictions'] for pred in predictions])
 
+  def testTrainRegressorWithDatasetWhenInputIsOverEarlier(self):
+    train_input_fn = _make_train_input_fn_dataset(
+        is_classification=False, repeat=3)  # to stop input after 3 steps.
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+    # Note that training will stop when input exhausts.
+    # This might not be a typical pattern, but dataset.repeat(3) causes
+    # the input stream to cease after 3 steps.
+    est.train(train_input_fn, steps=100)
+    self._assert_checkpoint(
+        est.model_dir, global_step=3, finalized_trees=0, attempted_layers=3)
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 3.777295)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.353850], [0.254100], [0.106850], [0.712100], [1.012100]],
+        [pred['predictions'] for pred in predictions])
+
 
 class ModelFnTests(test_util.TensorFlowTestCase):
   """Tests bt_model_fn including unexposed internal functionalities."""
-- 
GitLab


From 421d1c077053e6e38e4c9cee99641edcd4d9ca1e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 11 Apr 2018 18:20:19 -0700
Subject: [PATCH 0990/1262] In model_to_estimator, only run get_weights when
 there are initialized Keras variables(which assumes there exists a session).
 Otherwise create a session so that we can run get_config(). Actually fix
 #18193.

PiperOrigin-RevId: 192541442
---
 .../python/keras/_impl/keras/estimator.py     | 45 +++++++++-----
 .../keras/_impl/keras/estimator_test.py       | 61 ++++++++++---------
 2 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 8043242b70..b922a6c683 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -26,7 +26,6 @@ from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -38,6 +37,7 @@ from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
@@ -55,6 +55,19 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
+def _any_variable_initalized():
+  """Check if any variable has been initialized in the Keras model.
+
+  Returns:
+    boolean, True if at least one variable has been initalized, else False.
+  """
+  variables = variables_module.global_variables()
+  for v in variables:
+    if getattr(v, '_keras_initialized', False):
+      return True
+  return False
+
+
 def _create_ordered_io(keras_model, estimator_io, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
 
@@ -396,7 +409,8 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
                                      custom_objects)
       # save to checkpoint
       with session.Session(config=estimator._session_config) as sess:
-        model.set_weights(keras_weights)
+        if keras_weights:
+          model.set_weights(keras_weights)
         # Make update ops and initialize all variables.
         if not model.train_function:
           # pylint: disable=protected-access
@@ -466,20 +480,21 @@ def model_to_estimator(keras_model=None,
   estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
 
-  old_session = K._SESSION
-  # Pass the config into keras backend's default session.
-  sess = session.Session(config=estimator._session_config)
-  K.set_session(sess)
-  try:
-    keras_weights = keras_model.get_weights()
-  except errors.FailedPreconditionError as e:
-    if old_session is None:
-      raise e
-    logging.warning(
-        'The Keras backend session has already been '
-        'set. The _session_config passed to model_to_estimator is not used.')
-    K.set_session(old_session)
+  # Check if we need to call get_weights:
+  if _any_variable_initalized():
     keras_weights = keras_model.get_weights()
+    # Warn if config passed to estimator tries to update GPUOptions. If a
+    # session has already been created, the GPUOptions passed to the first
+    # session sticks.
+    if estimator._session_config.HasField('gpu_options'):
+      logging.warning(
+          'The Keras backend session has already been set. '
+          'The _session_config passed to model_to_estimator will not be used.')
+  else:
+    # Pass the config into keras backend's default session.
+    sess = session.Session(config=estimator._session_config)
+    K.set_session(sess)
+    keras_weights = None
 
   if keras_model._is_graph_network:
     # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 27b7ec7dd4..653cdc01e2 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -27,10 +27,12 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
+from tensorflow.python.keras._impl.keras.optimizers import SGD
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -443,8 +445,9 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     model = simple_functional_model()
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
-    est_keras = keras.estimator.model_to_estimator(
-        keras_model=model, config=self._config)
+    with self.test_session():
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=model, config=self._config)
 
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -497,20 +500,22 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
 
   def test_gpu_config(self):
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.categorical_accuracy])
+    with ops.Graph().as_default():
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['mse', keras.metrics.categorical_accuracy])
 
-    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
-    sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
-    self._config._session_config = sess_config
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
-    self.assertEqual(keras.backend.get_session()
-                     ._config.gpu_options.per_process_gpu_memory_fraction,
-                     gpu_options.per_process_gpu_memory_fraction)
+      gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
+      sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
+      self._config._session_config = sess_config
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      self.assertEqual(
+          keras.backend.get_session()
+          ._config.gpu_options.per_process_gpu_memory_fraction,
+          gpu_options.per_process_gpu_memory_fraction)
 
   def test_pretrained_weights(self):
     keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
@@ -518,19 +523,19 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         loss='categorical_crossentropy',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         metrics=['mse', keras.metrics.categorical_accuracy])
-
-    keras_model.train_on_batch(
-        np.random.random((10,) + _INPUT_SIZE), np.random.random((10,
-                                                                 _NUM_CLASS)))
-    weights = keras_model.get_weights()
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.set_weights(weights)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.categorical_accuracy])
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
+    with self.test_session():
+      keras_model.train_on_batch(
+          np.random.random((10,) + _INPUT_SIZE),
+          np.random.random((10, _NUM_CLASS)))
+      weights = keras_model.get_weights()
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.set_weights(weights)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=SGD(lr=0.0001, momentum=0.9),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
 
 
 if __name__ == '__main__':
-- 
GitLab


From ba25b8ba9f88df5db8c11c0bec9b27c8151af7d7 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Mon, 16 Apr 2018 14:52:41 -0700
Subject: [PATCH 0991/1262] Increase softmax gpu unittest numeric stability

PiperOrigin-RevId: 193103363
---
 tensorflow/python/kernel_tests/softmax_op_test.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 981f96b74d..dc4d4dbeab 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -39,6 +39,10 @@ class SoftmaxTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
+    is_fp16 = features.dtype == np.float16
+    if is_fp16:
+      # Do the compute in fp32 and cast the input back to fp32.
+      features = features.astype(np.float32)
     e = np.exp(features - np.reshape(
         np.amax(
             features, axis=dim), one_only_on_dim))
@@ -47,6 +51,8 @@ class SoftmaxTest(test.TestCase):
       res = np.log(softmax)
     else:
       res = softmax
+    if is_fp16:
+      res = res.astype(np.float16)
     return res
 
   def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
@@ -125,8 +131,8 @@ class SoftmaxTest(test.TestCase):
                        "Test only applicable when running on GPUs")
   def testFloatGPU(self):
     if test.is_gpu_available(cuda_only=True):
-      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
-      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 10)]
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
       for row, col in zip(rows, cols):
         logging.info("Testing softmax float dtype in shape [%d, %d]", row, col)
         data = np.random.rand(row, col)
@@ -140,8 +146,8 @@ class SoftmaxTest(test.TestCase):
                        "Test only applicable when running on GPUs")
   def testHalfGPU(self):
     if test.is_gpu_available(cuda_only=True):
-      rows = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
-      cols = [2**x + np.random.randint(0, 1024) for x in range(1, 8)]
+      rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
+      cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
       for row, col in zip(rows, cols):
         logging.info("Testing softmax half dtype in shape [%d, %d]", row, col)
         data = np.random.rand(row, col)
-- 
GitLab


From 5aba07dce5b9e924183efcd05cd82f2fbb70edc8 Mon Sep 17 00:00:00 2001
From: Chris Ying <chrisying@google.com>
Date: Tue, 17 Apr 2018 15:28:12 -0700
Subject: [PATCH 0992/1262] Fix CheckpointSaverHook to properly save every
 save_checkpoints_steps for TPU workloads.

PiperOrigin-RevId: 193266515
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   |  9 ++
 .../training/basic_session_run_hooks.py       | 10 +-
 .../training/basic_session_run_hooks_test.py  | 93 +++++++++++++++++++
 ...sorflow.train.-checkpoint-saver-hook.pbtxt |  2 +-
 4 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7fab19afee..0948997b28 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2054,6 +2054,14 @@ class TPUEstimator(estimator_lib.Estimator):
                   },
                   every_n_secs=30)
           ] + input_hooks
+          chief_hooks = [
+              training.CheckpointSaverHook(
+                  self.model_dir,
+                  save_secs=self._config.save_checkpoints_secs,
+                  save_steps=self._config.save_checkpoints_steps,
+                  steps_per_run=self._config.tpu_config.iterations_per_loop,
+                  scaffold=scaffold)
+          ]
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -2067,6 +2075,7 @@ class TPUEstimator(estimator_lib.Estimator):
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
+              training_chief_hooks=chief_hooks,
               training_hooks=hooks,
               train_op=train_op,
               scaffold=scaffold)
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 094a9e886b..3651291bdf 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -391,7 +391,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
                saver=None,
                checkpoint_basename="model.ckpt",
                scaffold=None,
-               listeners=None):
+               listeners=None,
+               steps_per_run=1):
     """Initializes a `CheckpointSaverHook`.
 
     Args:
@@ -404,6 +405,9 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
       listeners: List of `CheckpointSaverListener` subclass instances.
         Used for callbacks that run immediately before or after this hook saves
         the checkpoint.
+      steps_per_run: `int`, number of steps that occur between each invocation
+        of the hook. Primarily used for TPU workloads which run multiple steps
+        in a while loop in a single Session.run.
 
     Raises:
       ValueError: One of `save_steps` or `save_secs` should be set.
@@ -419,6 +423,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     self._timer = SecondOrStepTimer(every_secs=save_secs,
                                     every_steps=save_steps)
     self._listeners = listeners or []
+    self._steps_per_run = steps_per_run
 
   def begin(self):
     self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
@@ -450,7 +455,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
 
   def after_run(self, run_context, run_values):
     stale_global_step = run_values.results
-    if self._timer.should_trigger_for_step(stale_global_step+1):
+    if self._timer.should_trigger_for_step(
+        stale_global_step + self._steps_per_run):
       # get the real value after train op.
       global_step = run_context.session.run(self._global_step_tensor)
       if self._timer.should_trigger_for_step(global_step):
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index f39a5261a9..25962f6bf7 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -719,6 +719,99 @@ class CheckpointSaverHookTest(test.TestCase):
     fake_summary_writer.FakeSummaryWriter.uninstall()
 
 
+class CheckpointSaverHookMultiStepTest(test.TestCase):
+
+  def setUp(self):
+    self.model_dir = tempfile.mkdtemp()
+    self.graph = ops.Graph()
+    self.steps_per_run = 5
+    with self.graph.as_default():
+      self.scaffold = monitored_session.Scaffold()
+      self.global_step = variables.get_or_create_global_step()
+      self.train_op = training_util._increment_global_step(self.steps_per_run)
+
+  def tearDown(self):
+    shutil.rmtree(self.model_dir, ignore_errors=True)
+
+  def test_save_steps_saves_in_first_step(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+  def test_save_steps_saves_periodically(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        # Saved (step=5)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Not saved (step=10)
+        self.assertEqual(5,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Saved (step=15)
+        self.assertEqual(15,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Not saved (step=20)
+        self.assertEqual(15,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+        mon_sess.run(self.train_op)
+        # Saved (step=25)
+        self.assertEqual(25,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+  def test_save_steps_saves_at_end(self):
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=2*self.steps_per_run,
+          scaffold=self.scaffold,
+          steps_per_run=self.steps_per_run)
+      hook.begin()
+      self.scaffold.finalize()
+      with session_lib.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        mon_sess = monitored_session._HookedSession(sess, [hook])
+        mon_sess.run(self.train_op)
+        mon_sess.run(self.train_op)
+        hook.end(sess)
+        self.assertEqual(10,
+                         checkpoint_utils.load_variable(self.model_dir,
+                                                        self.global_step.name))
+
+
 class ResourceCheckpointSaverHookTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
index c3037baa8c..327799729c 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\', \'steps_per_run\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "after_create_session"
-- 
GitLab


From b5f8c3531924c56cf4866f57ce0ccea1b72b289e Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Tue, 17 Apr 2018 10:53:07 -0700
Subject: [PATCH 0993/1262] Enable consumption of GIT_TAG_OVERRIDE env var in
 release build script. (#18579)

Enable consumption of GIT_TAG_OVERRIDE env var in release build script.
---
 tensorflow/contrib/cmake/tf_core_framework.cmake | 2 +-
 tensorflow/tools/ci_build/builds/pip.sh          | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347f..73cadc58ff 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 82042b93c0..5fa75e1d61 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -123,6 +123,10 @@ done
 
 BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
 
+if [[ -z "$GIT_TAG_OVERRIDE" ]]; then
+  BAZEL_FLAGS+=" --action_env=GIT_TAG_OVERRIDE"
+fi
+
 echo "Using Bazel flags: ${BAZEL_FLAGS}"
 
 PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
-- 
GitLab


From 6e9d3ad2aad1d6fc417882a7f5c7aba22b7df18e Mon Sep 17 00:00:00 2001
From: Derek Murray <derek.murray@gmail.com>
Date: Tue, 17 Apr 2018 14:59:58 -0700
Subject: [PATCH 0994/1262] [tf.data] Fix a device placement issue in
 `prefetch_to_device()`. (#18607)

* [tf.data] Fix a device placement issue in `prefetch_to_device()`.

Previously, the `iterator_get_device()` op was being infeasibly colocated with
both the iterator and placed on the prefetch target device. Move the
construction of that op outside the `with device():` block to fix this.

Also enable the relevant test to run as a CUDA test.

* Import the cuda_py_test rule.
---
 tensorflow/contrib/data/python/kernel_tests/BUILD     | 7 +++----
 tensorflow/contrib/data/python/ops/prefetching_ops.py | 6 ++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 7270d533c6..fa5662ce0b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -473,12 +473,11 @@ py_test(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
     srcs = ["prefetching_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 89c04dc89a..e4c9f8b58a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -114,11 +114,13 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
+    iterator_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
-          target_device=gen_dataset_ops.iterator_get_device(
-              self._input_iterator._iterator_resource),
+          target_device=iterator_device,
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
           shared_name=shared_name)
-- 
GitLab


From 30331b3fc02d9ae259e1241b40b945d242924376 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 17 Apr 2018 15:55:27 -0700
Subject: [PATCH 0995/1262] Make requested review changes

---
 tensorflow/contrib/tensorrt/BUILD                           | 5 +++++
 tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index d382adb986..b7c2a2d527 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -19,6 +19,7 @@ load(
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
 )
+load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
@@ -280,6 +281,10 @@ py_test(
     srcs = ["test/tf_trt_integration_test.py"],
     main = "test/tf_trt_integration_test.py",
     srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "notap",
+    ],
     deps = [
         ":init_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
index b17fdd52b2..7a47328762 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -37,10 +37,11 @@ from tensorflow.python.platform import googletest
 @test_util.with_c_api
 class IntegrationTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration."""
+
   def setUp(self):
     """Setup method."""
     super(IntegrationTest, self).setUp()
-    warnings.simplefilter('always')
+    warnings.simplefilter("always")
     inp_dims = (100, 24, 24, 2)
     self._input = np.random.random_sample(inp_dims)
     self._original_graph = self.get_simple_graph_def()
@@ -151,5 +152,5 @@ class IntegrationTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(result1, result)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From 77586aefab8f5be9677659099ebe5467559c2d37 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 17 Apr 2018 16:18:07 -0700
Subject: [PATCH 0996/1262] Update version strings for 1.8.0rc1.

---
 tensorflow/core/public/version.h              |  2 +-
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       |  4 ++--
 tensorflow/tools/pip_package/setup.py         |  2 +-
 8 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 0ca7d8475f..ba69efb289 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX "-rc1"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 995b8ae666..8c165aad52 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 2938a8f7ee..26cbcc9a9b 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc1.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index c87eacfa93..1b0bbdba7b 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.8.0-rc0</version>
+                 <version>1.8.0-rc1</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.8.0-rc0</version>
+  <version>1.8.0-rc1</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc1.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0-rc1.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 8387289fcf..f19f827e25 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -194,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -299,7 +299,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -485,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -659,14 +659,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -678,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -697,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -716,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index a237d1af54..ff6c2f5e44 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 677e3329b6..d48a6ee550 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.8.0rc0 on Linux:
+for TensorFlow 1.8.0rc1 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index f676f040ad..6da3223d33 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.8.0-rc0'
+_VERSION = '1.8.0-rc1'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 9a6e21726e9978b9ab5442ad63dbc8037ec4a941 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Tue, 17 Apr 2018 16:24:42 -0700
Subject: [PATCH 0997/1262] TpuStrategy -> TPUStrategy

PiperOrigin-RevId: 193275991
---
 tensorflow/contrib/distribute/python/combinations.py | 2 +-
 tensorflow/contrib/distribute/python/tpu_strategy.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 1f66997e6e..946310aa6f 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -266,7 +266,7 @@ one_device_strategy = NamedDistribution(
     "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
     None)
 tpu_strategy = NamedDistribution(
-    "TPU", tpu_strategy.TpuStrategy(), required_tpu=True)
+    "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
     mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 0ac307dd6a..804217b5ce 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -32,10 +32,10 @@ from tensorflow.python.ops import control_flow_ops
 
 
 # TODO(isaprykin):  Consider whether inheriting is really appropriate.
-class TpuStrategy(one_device_strategy.OneDeviceStrategy):
+class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
   def __init__(self, master=None, iterations=None, model_dir=None):
-    super(TpuStrategy, self).__init__('/cpu:0')
+    super(TPUStrategy, self).__init__('/cpu:0')
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
-- 
GitLab


From 35e1198ffcaf1724da7f8cad545edaa4cd02b4ae Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 17 Apr 2018 16:34:22 -0700
Subject: [PATCH 0998/1262] Fix py_test import

---
 tensorflow/contrib/tensorrt/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index b7c2a2d527..f80b4f1b11 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,6 +11,7 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "py_test",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
@@ -19,7 +20,6 @@ load(
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
 )
-load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
-- 
GitLab


From 82618eee9ddda444516590688d349dfd2c05cb22 Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Tue, 17 Apr 2018 16:43:53 -0700
Subject: [PATCH 0999/1262] [INTEL MK] Updating MKL CPU CI build and test.
 (#18513)

* Setting  KMP_BLOCKTIME to 0

* Adding comments
---
 tensorflow/tools/ci_build/linux/cpu/run_mkl.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index dbf376be6f..2a9f295188 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -30,7 +30,10 @@ export PYTHON_BIN_PATH=`which python2`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
+# Setting KMP_BLOCKTIME to 0 lets OpenMP threads to sleep right after parallel execution
+# in an MKL primitive. This reduces the effects of an oversubscription of OpenMP threads
+# caused by executing multiple tests concurrently.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
-    --config=mkl --config=opt --test_output=errors -- \
+    --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
-- 
GitLab


From 9477835866648389f109748c8986453ca3c9a1e2 Mon Sep 17 00:00:00 2001
From: shengfuintel <sheng.fu@intel.com>
Date: Tue, 17 Apr 2018 16:44:10 -0700
Subject: [PATCH 1000/1262] Fixed the bug in mkl_input_conversion_op when
 reorder is not needed (#18498)

---
 .../core/kernels/mkl_input_conversion_op.cc   | 35 +++++++++----------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 68d3e1c9ab..7ab72bbb70 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -310,9 +310,8 @@ class MklInputConversionOp : public OpKernel {
           VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
                      "different, "
                   << "need to convert to same format";
-
-          // Convert input0, and keep input1 unchanged
-          // Create MklDnnShape for output mkl tensor based on input0
+          // TODO: For now, input0 is converted and input1 is unchanged
+          //       we should choose the optimal MKL format to convert to.
           Tensor* tensor_out;
           MklDnnShape mkl_output_mkl_shape;
           mkl_output_mkl_shape.SetMklTensor(true);
@@ -360,7 +359,8 @@ class MklInputConversionOp : public OpKernel {
       // with MKL tensors)
       VLOG(1) << "MklInputConversionOp: Broadcast needed, "
               << "converted MKL inputs to TF format";
-
+      // TODO: Cleanup op_data_type and has_avx512f_ after these two parameters
+      //       are removed from ConvertMklToTf
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
                                            op_data_type, has_avx512f_, kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
@@ -399,19 +399,7 @@ class MklInputConversionOp : public OpKernel {
     }
 
     // Broadcast is needed if the shapes are not the same
-    bool broadcast_needed;
-
-    size_t in0_size = 1;
-    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
-      in0_size *= mkl_shape->TfDimSize(i);
-
-    size_t in1_size = 1;
-    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
-      in1_size *= tf_tensor->shape().dim_size(i);
-
-    broadcast_needed = (in0_size != in1_size);
-
-    if (!broadcast_needed) {
+    if (mkl_shape->GetTfShape().num_elements() == tf_tensor->shape().num_elements() ) {
       // Both shapes are same, convert the TF input to MKL
       VLOG(1) << "MklInputConversionOp: No broadcast needed.";
       VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
@@ -442,10 +430,19 @@ class MklInputConversionOp : public OpKernel {
 
       // Create reorder between tensorflow layout and Mkl layout if necessary
       std::vector<primitive> net;
-      tf_input.CheckReorderToOpMem(
+      bool reordered = tf_input.CheckReorderToOpMem(
                    memory::primitive_desc(output_mkl_md, cpu_engine),
                    tensor_out, &net);
-      stream(stream::kind::eager).submit(net).wait();
+      if(!reordered) {
+        // This is the case that the TF tensor has the same shape and format of
+        // mkl tensor. However, tf_tensor can not be simply forwarded to the output
+        // tensor since mkl data tensor is always one dimensional tensor. 
+        // Tensor::CopyFrom shares the buffer of the other tensor while set its shape
+        // to the other tensor. 
+        tensor_out->CopyFrom(*tf_tensor, tensor_out->shape());
+      }
+      else  
+        stream(stream::kind::eager).submit(net).wait();
 
       // -- The tensor in MKL format passes through --
       ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
-- 
GitLab


From f185600509b46414f05dec70df080c7a3d62c58c Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 16:44:21 -0700
Subject: [PATCH 1001/1262] Fix shape validation error with
 tf.nn.conv3d_transpose (#18465)

* Fix shape validation error with tf.nn.conv3d_transpose

This fix tries to address the issue raised in 18460.
In `tf.nn.conv3d_transpose` when list or np array is passed,
the validate of the output shape with filter shape uses
`output_shape[4]` (channel). This will not work with
`data_format='NCDHW'`.

This fix fixes the issue by replace `output_shape[4]` with `output_shape[axis]`.

This fix also adds a test case. Before this fix, the test case will fail.

This fix fixes 18460.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for output and filter shape check in conv3d_transpose

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Also fix the error message
---
 .../python/kernel_tests/conv3d_transpose_test.py     | 12 ++++++++++++
 tensorflow/python/ops/nn_ops.py                      |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index a8b3af5096..8973a450fa 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -119,6 +119,18 @@ class Conv3DTransposeTest(test.TestCase):
                   target = 3.0
                 self.assertAllClose(target, value[n, d, h, w, k])
 
+  def testConv3DTransposeShapeMismatch(self):
+    # Test case for GitHub issue 18460
+    x_shape = [2, 2, 3, 4, 3]
+    f_shape = [3, 3, 3, 2, 2]
+    y_shape = [2, 2, 6, 8, 6]
+    strides = [1, 1, 2, 2, 2]
+    np.random.seed(1)
+    x_value = np.random.random_sample(x_shape).astype(np.float64)
+    f_value = np.random.random_sample(f_shape).astype(np.float64)
+    nn_ops.conv3d_transpose(
+        x_value, f_value, y_shape, strides, data_format='NCDHW')
+
   def testConv3DTransposeValid(self):
     with self.test_session():
       strides = [1, 2, 2, 2, 1]
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index ea83ba7748..a8d0293d13 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1458,10 +1458,10 @@ def conv3d_transpose(
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape()[3].is_compatible_with(output_shape[4]):
+      if not filter.get_shape()[3].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[4],
+            "{} != {}".format(output_shape[axis],
                               filter.get_shape()[3]))
 
     if padding != "VALID" and padding != "SAME":
-- 
GitLab


From 1ab692972f34353ecdb8dfbcd611ef3927c3f14a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 17 Apr 2018 16:44:53 -0700
Subject: [PATCH 1002/1262] Replace raw_input/input with six.moves.input for
 python 2/3 (#18461)

In python 3 input is the equivalent of raw_input in python 2.
This fix is an enhancement to replace raw_input/input with
six.moves.input, which is compatible with both python 2 and python 3.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/debug/cli/readline_ui.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/debug/cli/readline_ui.py b/tensorflow/python/debug/cli/readline_ui.py
index 151638789f..3296e45d07 100644
--- a/tensorflow/python/debug/cli/readline_ui.py
+++ b/tensorflow/python/debug/cli/readline_ui.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import readline
 
+import six
+
 from tensorflow.python.debug.cli import base_ui
 from tensorflow.python.debug.cli import debugger_cli_common
 
@@ -39,11 +41,7 @@ class ReadlineUI(base_ui.BaseUI):
     readline.set_completer(self._readline_complete)
     readline.parse_and_bind("tab: complete")
 
-    # For Python 2-3 compatibility.
-    try:
-      self._input = raw_input
-    except NameError:
-      self._input = input
+    self._input = six.moves.input
 
   def _readline_complete(self, text, state):
     context, prefix, except_last_word = self._analyze_tab_complete_input(text)
-- 
GitLab


From 87fc941a6a16d21e2164dbab104b04701426c65e Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Wed, 18 Apr 2018 07:45:19 +0800
Subject: [PATCH 1003/1262] Fix the default value doc string of global_step in
 contrib.slim (#18313)

---
 tensorflow/contrib/slim/python/slim/learning.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 6a200de1ea..8a2c74742a 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -389,7 +389,7 @@ def create_train_op(total_loss,
     total_loss: A `Tensor` representing the total loss.
     optimizer: A tf.Optimizer to use for computing the gradients.
     global_step: A `Tensor` representing the global step variable. If left as
-      `_USE_GLOBAL_STEP`, then slim.variables.global_step() is used.
+      `_USE_GLOBAL_STEP`, then tf.contrib.framework.global_step() is used.
     update_ops: An optional list of updates to execute. If `update_ops` is
       `None`, then the update ops are set to the contents of the
       `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but
@@ -578,7 +578,8 @@ def train(train_op,
     is_chief: Specifies whether or not the training is being run by the primary
       replica during replica training.
     global_step: The `Tensor` representing the global step. If left as `None`,
-      then slim.variables.get_or_create_global_step() is used.
+      then training_util.get_or_create_global_step(), that is,
+      tf.contrib.framework.global_step() is used.
     number_of_steps: The max number of gradient steps to take during training,
       as measured by 'global_step': training will stop if global_step is
       greater than 'number_of_steps'. If the value is left as None, training
-- 
GitLab


From da26c0736981df63455abdfc2662d8d6a2213224 Mon Sep 17 00:00:00 2001
From: fo40225 <fo40225@users.noreply.github.com>
Date: Wed, 18 Apr 2018 07:45:40 +0800
Subject: [PATCH 1004/1262] fix build break cmake windows 32bit (#18295)

* fix build break cmake windows 32bit

* Fix lint errors
---
 tensorflow/contrib/cmake/CMakeLists.txt           | 10 +++++++++-
 tensorflow/contrib/cmake/tf_python.cmake          |  3 ++-
 tensorflow/contrib/cmake/tf_shared_lib.cmake      |  3 ++-
 tensorflow/contrib/cmake/tools/create_def_file.py |  8 ++++++--
 tensorflow/core/common_runtime/bfc_allocator.h    |  2 +-
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 95df69465a..10f29deca0 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -128,8 +128,16 @@ endif()
 
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      # 64 bits
+      add_definitions(-DWIN64)
+  elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+      # 32 bits
+      # temporary fix for #18241
+      add_definitions(-DEIGEN_DEFAULT_DENSE_INDEX_TYPE=std::int64_t)
+  endif()
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11)
-  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
+  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
   add_definitions(/bigobj /nologo /EHsc /GF /MP /Gm-)
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 954e215fcc..c4bdb69d82 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -554,12 +554,13 @@ if(WIN32)
         set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.def")
     endif()
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+    math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
             --input "${pywrap_tensorflow_internal_static_dependencies}"
             --output "${pywrap_tensorflow_deffile}"
             --target _pywrap_tensorflow_internal.pyd
+            --bitness "${tensorflow_target_bitness}"
         BYPRODUCTS ${pywrap_tensorflow_deffile} # Required for Ninja
     )
 endif(WIN32)
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 9738bbeb9a..38f40452b5 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -52,12 +52,13 @@ if(WIN32)
     set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/tensorflow.def")
   endif()
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+  math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
   add_custom_command(TARGET tensorflow_static POST_BUILD
       COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
           --input "${tensorflow_static_dependencies}"
           --output "${tensorflow_deffile}"
           --target tensorflow.dll
+          --bitness "${tensorflow_target_bitness}"
   )
 endif(WIN32)
 
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index 53c2285699..cffe069aa3 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -63,7 +63,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"^(TFE_\w*)$|"
                         r"tensorflow::|"
                         r"functor::|"
-                        r"nsync_|"
+                        r"\?nsync_|"
                         r"perftools::gputools")
 
 # We want to identify data members explicitly in the DEF file, so that no one
@@ -87,6 +87,7 @@ def get_args():
                       required=True)
   parser.add_argument("--output", help="output deffile", required=True)
   parser.add_argument("--target", help="name of the target", required=True)
+  parser.add_argument("--bitness", help="build target bitness", required=True)
   args = parser.parse_args()
   return args
 
@@ -125,7 +126,10 @@ def main():
     # Header for the def file.
     def_fp.write("LIBRARY " + args.target + "\n")
     def_fp.write("EXPORTS\n")
-    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+    if args.bitness == "64":
+      def_fp.write("\t??1OpDef@tensorflow@@UEAA@XZ\n")
+    else:
+      def_fp.write("\t??1OpDef@tensorflow@@UAE@XZ\n")
 
     # Each symbols returned by undname matches the same position in candidates.
     # We compare on undname but use the decorated name from candidates.
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index b8e773503c..ba5a3eea3a 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -378,7 +378,7 @@ class BFCAllocator : public VisitableAllocator {
   inline int Log2FloorNonZero(uint64 n) {
 #if defined(__GNUC__)
     return 63 ^ __builtin_clzll(n);
-#elif defined(PLATFORM_WINDOWS)
+#elif defined(PLATFORM_WINDOWS) && (_WIN64)
     unsigned long index;
     _BitScanReverse64(&index, n);
     return index;
-- 
GitLab


From ab16333f04df819fff34714b748010149443106d Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 17 Apr 2018 17:17:04 -0700
Subject: [PATCH 1005/1262] Minor cleanups to the gather expander; NFC

This change is NFC now, but it makes the code more general and this generality
will be used later on.  For instance

  ExpandFirstDimIntoNDims(transposed_gather_indices, {1, shape.dimensions(0)})

does not work if shape is a scalar shape (and this fine because today shape is
never scalar) but

  PrependDegenerateDims(transposed_gather_indices, 1)

works fine if transposed_gather_indices is scalar (and it will be, in a future
change).

PiperOrigin-RevId: 193283404
---
 tensorflow/compiler/xla/service/BUILD         |  17 ++
 .../compiler/xla/service/gather_expander.cc   |  15 +-
 .../xla/service/hlo_creation_utils.cc         |  13 +
 .../compiler/xla/service/hlo_creation_utils.h |  12 +-
 .../xla/service/hlo_creation_utils_test.cc    | 234 ++++++++++++++++++
 5 files changed, 279 insertions(+), 12 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_creation_utils_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9831a09c1f..9009cbf845 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1220,6 +1220,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_creation_utils_test",
+    srcs = ["hlo_creation_utils_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_evaluator",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "batchnorm_expander",
     srcs = ["batchnorm_expander.cc"],
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 221ff7900f..1239f56364 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -86,8 +86,7 @@ static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
   // all of the non-index-vector dimensions.
   const Shape& shape = transposed_gather_indices->shape();
   if (shape.dimensions_size() == 1) {
-    return ExpandFirstDimIntoNDims(transposed_gather_indices,
-                                   {1, shape.dimensions(0)});
+    return PrependDegenerateDims(transposed_gather_indices, 1);
   } else {
     return CollapseFirstNDims(transposed_gather_indices,
                               shape.dimensions_size() - 1);
@@ -112,11 +111,7 @@ static StatusOr<HloInstruction*> AdjustGatherDimsInAccumulator(
     // dynamic-slice.  In that case, there is a leading degenerate gather
     // dimension that we added to make this special case play well with the
     // general while loop which we need to remove now.
-    CHECK_EQ(accumulator->shape().dimensions(0), 1);
-    ArraySlice<int64> reshaped_dim_sizes =
-        AsInt64Slice(accumulator->shape().dimensions());
-    reshaped_dim_sizes.remove_prefix(1);
-    return MakeReshapeHlo(reshaped_dim_sizes, accumulator);
+    return ElideDegenerateDims(accumulator, {0});
   }
 
   return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds);
@@ -195,10 +190,8 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
                       MakeDynamicSliceHlo(operand, gathered_slice_start,
                                           gather.gather_window_bounds()));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * gathered_slice_for_update,
-      ExpandFirstDimIntoNDims(gathered_slice,
-                              {1, gathered_slice->shape().dimensions(0)}));
+  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_for_update,
+                      PrependDegenerateDims(gathered_slice, 1));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * index_vector_into_accumulator,
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index b186767ce7..9a89888480 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -163,6 +163,8 @@ StatusOr<HloInstruction*> MakeConcatHlo(ArraySlice<HloInstruction*> operands,
 }
 
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
+  CHECK_GT(n, 0);
+
   const Shape& operand_shape = operand->shape();
   CHECK_GE(operand_shape.dimensions_size(), n);
   int64 new_shape_leading_bound = 1;
@@ -184,6 +186,17 @@ StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   return MakeReshapeHlo(output_shape, operand);
 }
 
+StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
+                                                int64 n) {
+  CHECK_GT(n, 0);
+  std::vector<int64> new_shape_dims;
+  const Shape& operand_shape = operand->shape();
+  new_shape_dims.reserve(n + operand_shape.dimensions_size());
+  new_shape_dims.insert(new_shape_dims.begin(), n, 1);
+  c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims));
+  return MakeReshapeHlo(new_shape_dims, operand);
+}
+
 StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
     HloInstruction* operand, ArraySlice<int64> expanded_dims) {
   CHECK_GT(operand->shape().dimensions_size(), 0);
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index d99e32a737..c9a7361a6a 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -103,12 +103,22 @@ StatusOr<HloInstruction*> MakeConcatHlo(
 // their operand(s).
 
 // Collapses (via reshape) the first N (logical) dimensions of `operand` into a
-// single leading dimension.  `operand` must have rank > n.
+// single leading dimension.  `operand` must have rank > `n` and `n` must not be
+// 0.
 //
 // For instance if `operand` has shape f32[7,8,9] and n is 2 then the output is
 // the `operand` reshaped to [56,9].
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n);
 
+// Prepends `n` degenerate dimensions (dimensions with bound = 1) to `operand`
+// using a reshape.
+//
+// For instance if operand has shape f32[3,4,5] then this returns the operand
+// reshaped to f32[1,3,4,5].  If the operand is a f32 scalar (i.e. has shape
+// f32[]) then this returns the operand reshaped to f32[1].
+StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
+                                                int64 n);
+
 // Expands (via reshape) the first (logical) dimension of `operand` into a
 // sequence of `expanded_dims` dimensions.  `operand` must at least be of rank 1
 // and the number of elements in its first dimension must be equal to the
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
new file mode 100644
index 0000000000..6b681a5bf6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+using tensorflow::gtl::ArraySlice;
+
+std::unique_ptr<HloModule> CreateModuleWithProgramShape(
+    PrimitiveType primitive_type, ArraySlice<int64> input_shape_dims,
+    ArraySlice<int64> output_shape_dims, HloInstruction** param,
+    HloComputation** entry_computation) {
+  Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims);
+  Shape output_shape = ShapeUtil::MakeShape(primitive_type, output_shape_dims);
+  std::unique_ptr<HloModule> module = MakeUnique<HloModule>("test");
+  *entry_computation = module->AddEntryComputation(
+      CreateComputationWithSignature({&input_shape}, output_shape, "entry")
+          .ValueOrDie());
+  *param = (*entry_computation)->parameter_instruction(0);
+  return module;
+}
+
+TEST(HloCreationUtilsTest, CollapseFirst1Dim) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed,
+                          CollapseFirstNDims(param, 1));
+  entry_computation->set_root_instruction(first_1_dims_collapsed);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({3, 4})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR1<int32>({3, 4}));
+}
+
+TEST(HloCreationUtilsTest, CollapseFirst2Dims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed,
+                          CollapseFirstNDims(param, 2));
+  entry_computation->set_root_instruction(first_2_dims_collapsed);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(
+          *module,
+          {Literal::CreateR3<int32>(
+              {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR2<int32>(
+               {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}}));
+}
+
+TEST(HloCreationUtilsTest, Prepend1DegenerateDim) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended,
+                          PrependDegenerateDims(param, 1));
+  entry_computation->set_root_instruction(with_1_degenerate_dim_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({9, 10})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{9, 10}}));
+}
+
+TEST(HloCreationUtilsTest, Prepend2DegenerateDims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
+                          PrependDegenerateDims(param, 2));
+  entry_computation->set_root_instruction(with_2_degenerate_dims_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({9, 10})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR3<int32>({{{9, 10}}}));
+}
+
+TEST(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{1, 1}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
+                          PrependDegenerateDims(param, 2));
+  entry_computation->set_root_instruction(with_2_degenerate_dims_prepended);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<int32>(9)}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{9}}));
+}
+
+TEST(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded,
+                          ExpandFirstDimIntoNDims(param, {3, 1, 2}));
+  entry_computation->set_root_instruction(first_dim_expanded);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(
+          *module, {Literal::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR3<int32>({{{1, 2}}, {{3, 4}}, {{5, 6}}}));
+}
+
+TEST(HloCreationUtilsTest, PadVectorWithZeros) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{6}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zero_padded_param,
+      PadVectorWithZeros(param, /*zeros_to_prepend=*/3, /*zeros_to_append=*/1));
+  entry_computation->set_root_instruction(zero_padded_param);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR1<int32>({3, 4})}));
+  CHECK_EQ(*result_literal, *Literal::CreateR1<int32>({0, 0, 0, 3, 4, 0}));
+}
+
+TEST(HloCreationUtilsTest, BroadcastZeros_S32) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      S32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zeros,
+      BroadcastZeros(module->entry_computation(), S32, {2, 2}));
+  entry_computation->set_root_instruction(zeros);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<int32>(0)}));
+  CHECK_EQ(*result_literal, *Literal::CreateR2<int32>({{0, 0}, {0, 0}}));
+}
+
+TEST(HloCreationUtilsTest, BroadcastZeros_F32) {
+  HloInstruction* param;
+  HloComputation* entry_computation;
+
+  std::unique_ptr<HloModule> module = CreateModuleWithProgramShape(
+      F32,
+      /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, &param,
+      &entry_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      HloInstruction * zeros,
+      BroadcastZeros(module->entry_computation(), F32, {2, 2}));
+  entry_computation->set_root_instruction(zeros);
+
+  HloEvaluator evaluator;
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result_literal,
+                          evaluator.Evaluate<std::unique_ptr<Literal>>(
+                              *module, {Literal::CreateR0<float>(0.0f)}));
+  CHECK_EQ(*result_literal,
+           *Literal::CreateR2<float>({{0.0f, 0.0f}, {0.0f, 0.0f}}));
+}
+
+}  // namespace
+}  // namespace xla
-- 
GitLab


From 235a9e32c58c68f2ccf1552c1c2842b42f5c2cf0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 17 Apr 2018 18:48:38 -0700
Subject: [PATCH 1006/1262] Comment out part of ring_reducer_test suspected to
 have a race.

PiperOrigin-RevId: 193292788
---
 tensorflow/core/common_runtime/ring_reducer_test.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index e4387a074a..57c36d6582 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -572,9 +572,9 @@ DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
 DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
 
-// Failure tests
-DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
-DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+// // Failure tests
+// DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+// DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
 #endif
 
 #ifdef GOOGLE_CUDA
@@ -597,9 +597,9 @@ DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
 // DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
 
-// Failure tests
-DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
-DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+// // Failure tests
+// DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+// DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
 #endif
 
 }  // namespace
-- 
GitLab


From 41e2cd187b31e9e6d88bc042e21e73f7be0ed729 Mon Sep 17 00:00:00 2001
From: Chris Ying <chrisying@google.com>
Date: Tue, 17 Apr 2018 20:31:30 -0700
Subject: [PATCH 1007/1262] Disable CheckpointSaverHook when both
 save_checkpoints_secs and save_checkpoints_steps are None

PiperOrigin-RevId: 193299688
---
 .../contrib/tpu/python/tpu/tpu_estimator.py    | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 0948997b28..98eb0e240f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2054,14 +2054,16 @@ class TPUEstimator(estimator_lib.Estimator):
                   },
                   every_n_secs=30)
           ] + input_hooks
-          chief_hooks = [
-              training.CheckpointSaverHook(
-                  self.model_dir,
-                  save_secs=self._config.save_checkpoints_secs,
-                  save_steps=self._config.save_checkpoints_steps,
-                  steps_per_run=self._config.tpu_config.iterations_per_loop,
-                  scaffold=scaffold)
-          ]
+          chief_hooks = []
+          if (self._config.save_checkpoints_secs or
+              self._config.save_checkpoints_steps):
+            chief_hooks.append(
+                training.CheckpointSaverHook(
+                    self.model_dir,
+                    save_secs=self._config.save_checkpoints_secs,
+                    save_steps=self._config.save_checkpoints_steps,
+                    steps_per_run=self._config.tpu_config.iterations_per_loop,
+                    scaffold=scaffold))
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
-- 
GitLab


From d77a621a571d8ab0d69f2682586674e6dff4ec4e Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 17 Apr 2018 21:04:35 -0700
Subject: [PATCH 1008/1262] [XLA] Convert XLA to use xla::se as a namespace
 alias for ::stream_executor.

PiperOrigin-RevId: 193301997
---
 tensorflow/compiler/xla/BUILD                 |  3 +
 .../compiler/xla/client/client_library.cc     | 18 ++---
 .../compiler/xla/client/client_library.h      | 20 +++---
 .../compiler/xla/client/local_client.cc       |  2 -
 tensorflow/compiler/xla/client/local_client.h |  2 +-
 tensorflow/compiler/xla/device_util.h         |  2 +-
 .../compiler/xla/executable_run_options.h     |  7 +-
 tensorflow/compiler/xla/rpc/grpc_service.cc   |  2 +-
 tensorflow/compiler/xla/rpc/grpc_service.h    |  2 +-
 .../xla/service/allocation_tracker.cc         |  6 +-
 .../compiler/xla/service/allocation_tracker.h |  8 +--
 tensorflow/compiler/xla/service/backend.cc    | 19 ++---
 tensorflow/compiler/xla/service/backend.h     | 34 ++++-----
 .../xla/service/compile_only_service.cc       |  4 +-
 .../xla/service/compile_only_service.h        |  2 +-
 tensorflow/compiler/xla/service/compiler.cc   | 13 ++--
 tensorflow/compiler/xla/service/compiler.h    | 25 +++----
 .../xla/service/computation_placer.cc         | 16 ++---
 .../compiler/xla/service/computation_placer.h |  9 ++-
 .../compiler/xla/service/cpu/cpu_compiler.cc  | 14 ++--
 .../compiler/xla/service/cpu/cpu_compiler.h   | 12 ++--
 .../xla/service/cpu/cpu_executable.cc         |  9 +--
 .../compiler/xla/service/cpu/cpu_executable.h | 12 ++--
 .../xla/service/cpu/cpu_transfer_manager.cc   | 13 ++--
 .../xla/service/cpu/cpu_transfer_manager.h    | 25 ++++---
 .../service/cpu/parallel_cpu_executable.cc    |  4 +-
 .../xla/service/cpu/parallel_cpu_executable.h |  9 ++-
 .../xla/service/device_memory_allocator.cc    | 25 +++----
 .../xla/service/device_memory_allocator.h     | 28 ++++----
 tensorflow/compiler/xla/service/executable.cc |  6 +-
 tensorflow/compiler/xla/service/executable.h  |  2 +-
 .../xla/service/generic_transfer_manager.cc   | 13 ++--
 .../xla/service/generic_transfer_manager.h    | 35 ++++------
 .../xla/service/gpu/buffer_allocations.cc     |  2 -
 .../xla/service/gpu/buffer_allocations.h      | 21 +++---
 .../xla/service/gpu/conditional_thunk.cc      |  5 +-
 .../xla/service/gpu/conditional_thunk.h       |  2 +-
 .../xla/service/gpu/convolution_thunk.cc      |  2 -
 .../xla/service/gpu/convolution_thunk.h       | 24 +++----
 .../compiler/xla/service/gpu/copy_thunk.cc    | 12 ++--
 .../compiler/xla/service/gpu/copy_thunk.h     |  6 +-
 .../xla/service/gpu/cudnn_batchnorm_thunk.cc  |  1 -
 .../xla/service/gpu/cudnn_batchnorm_thunk.h   |  6 +-
 .../gpu/cudnn_convolution_algorithm_picker.cc |  2 -
 .../gpu/cudnn_convolution_algorithm_picker.h  |  7 +-
 .../service/gpu/cudnn_convolution_runner.cc   | 28 +++-----
 .../service/gpu/cudnn_convolution_runner.h    | 26 +++----
 .../compiler/xla/service/gpu/fft_thunk.cc     |  2 -
 .../compiler/xla/service/gpu/fft_thunk.h      | 17 +++--
 .../compiler/xla/service/gpu/for_thunk.cc     |  3 +-
 .../compiler/xla/service/gpu/for_thunk.h      |  3 +-
 .../compiler/xla/service/gpu/gemm_thunk.cc    |  2 -
 .../compiler/xla/service/gpu/gemm_thunk.h     |  9 +--
 .../compiler/xla/service/gpu/gpu_compiler.cc  |  8 +--
 .../compiler/xla/service/gpu/gpu_compiler.h   | 10 ++-
 .../xla/service/gpu/gpu_executable.cc         |  4 +-
 .../xla/service/gpu/gpu_transfer_manager.cc   |  6 +-
 .../xla/service/gpu/gpu_transfer_manager.h    | 11 ++-
 .../xla/service/gpu/infeed_manager.cc         |  2 -
 .../compiler/xla/service/gpu/infeed_manager.h | 17 ++---
 .../compiler/xla/service/gpu/infeed_thunk.cc  |  6 +-
 .../compiler/xla/service/gpu/infeed_thunk.h   |  2 +-
 .../xla/service/gpu/ir_emitter_context.h      |  6 +-
 .../compiler/xla/service/gpu/kernel_thunk.cc  |  2 -
 .../compiler/xla/service/gpu/kernel_thunk.h   | 11 ++-
 .../compiler/xla/service/gpu/memset_thunk.cc  |  2 -
 .../compiler/xla/service/gpu/memset_thunk.h   |  4 +-
 .../xla/service/gpu/partition_assignment.cc   |  2 -
 .../xla/service/gpu/partition_assignment.h    |  3 +-
 .../xla/service/gpu/sequential_thunk.cc       |  3 +-
 .../xla/service/gpu/sequential_thunk.h        |  3 +-
 tensorflow/compiler/xla/service/gpu/thunk.h   |  6 +-
 .../compiler/xla/service/gpu/tuple_thunk.cc   |  2 -
 .../compiler/xla/service/gpu/tuple_thunk.h    |  3 +-
 .../compiler/xla/service/gpu/while_thunk.cc   |  4 +-
 .../compiler/xla/service/gpu/while_thunk.h    |  2 +-
 .../xla/service/hlo_execution_profile.h       |  2 +-
 tensorflow/compiler/xla/service/hlo_runner.cc |  2 -
 tensorflow/compiler/xla/service/hlo_runner.h  |  5 +-
 .../xla/service/interpreter/compiler.cc       | 14 ++--
 .../xla/service/interpreter/compiler.h        | 11 ++-
 .../xla/service/interpreter/executable.cc     |  2 -
 .../xla/service/interpreter/executor.cc       |  6 +-
 .../xla/service/interpreter/executor.h        |  6 +-
 .../interpreter_transfer_manager.cc           |  7 +-
 .../xla/service/interpreter/platform.cc       | 18 ++---
 .../xla/service/interpreter/platform.h        |  6 +-
 .../xla/service/interpreter/platform_id.cc    |  6 +-
 .../xla/service/interpreter/platform_id.h     |  6 +-
 .../compiler/xla/service/llvm_compiler.cc     |  2 +-
 .../compiler/xla/service/llvm_compiler.h      |  7 +-
 .../compiler/xla/service/local_service.cc     |  4 +-
 .../compiler/xla/service/platform_util.cc     |  2 -
 .../compiler/xla/service/platform_util.h      | 16 ++---
 tensorflow/compiler/xla/service/service.cc    | 45 +++++-------
 tensorflow/compiler/xla/service/service.h     | 27 ++++----
 .../service/service_executable_run_options.h  |  7 +-
 .../compiler/xla/service/shaped_buffer.cc     |  4 +-
 .../compiler/xla/service/shaped_buffer.h      | 25 +++----
 .../compiler/xla/service/transfer_manager.cc  | 19 ++---
 .../compiler/xla/service/transfer_manager.h   | 69 +++++++++----------
 .../xla/tests/bitcast_convert_test.cc         |  2 +-
 .../xla/tests/client_library_test_base.cc     |  5 +-
 .../xla/tests/client_library_test_base.h      |  5 +-
 .../xla/tests/compute_constant_test.cc        |  8 +--
 tensorflow/compiler/xla/tests/convert_test.cc |  2 +-
 .../compiler/xla/tests/dynamic_ops_test.cc    |  2 -
 tensorflow/compiler/xla/tests/fusion_test.cc  |  2 -
 .../compiler/xla/tests/hlo_test_base.cc       |  2 -
 tensorflow/compiler/xla/tests/hlo_test_base.h |  3 +-
 .../compiler/xla/tests/llvm_compiler_test.cc  |  4 +-
 .../xla/tests/local_client_execute_test.cc    |  2 -
 .../xla/tests/local_client_test_base.cc       | 14 ++--
 .../xla/tests/local_client_test_base.h        | 18 +++--
 tensorflow/compiler/xla/tests/map_test.cc     |  2 +-
 tensorflow/compiler/xla/tests/test_utils.cc   |  4 +-
 tensorflow/compiler/xla/tests/test_utils.h    |  3 +-
 .../xla/tests/vector_ops_simple_test.cc       |  2 +-
 tensorflow/compiler/xla/tests/while_test.cc   |  2 -
 .../xla/tests/xla_hlo_profile_test.cc         |  2 +-
 tensorflow/compiler/xla/types.h               |  4 +-
 121 files changed, 443 insertions(+), 663 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 751777222f..88f37433a5 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -443,6 +443,9 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":types",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index b1663bc815..803a9e4009 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -23,22 +23,19 @@ limitations under the License.
 
 namespace xla {
 
-LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
+LocalClientOptions::LocalClientOptions(se::Platform* platform,
                                        int number_of_replicas,
                                        int intra_op_parallelism_threads)
     : platform_(platform),
       number_of_replicas_(number_of_replicas),
       intra_op_parallelism_threads_(intra_op_parallelism_threads) {}
 
-LocalClientOptions& LocalClientOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* LocalClientOptions::platform() const {
-  return platform_;
-}
+se::Platform* LocalClientOptions::platform() const { return platform_; }
 
 LocalClientOptions& LocalClientOptions::set_number_of_replicas(
     int number_of_replicas) {
@@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default;
 ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   return GetOrCreateLocalClient(default_options);
@@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
     const LocalClientOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   int replica_count = options.number_of_replicas();
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default;
 }
 
 /* static */ LocalService* ClientLibrary::GetXlaService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
   auto it = client_library.local_instances_.find(platform->id());
@@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default;
 }
 
 /* static */ StatusOr<CompileOnlyClient*>
-ClientLibrary::GetOrCreateCompileOnlyClient(
-    perftools::gputools::Platform* platform) {
+ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
 
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index a6f30d82e4..3ad558fa53 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -43,13 +43,13 @@ namespace xla {
 // Options to configure the local client when it is created.
 class LocalClientOptions {
  public:
-  LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
+  LocalClientOptions(se::Platform* platform = nullptr,
                      int number_of_replicas = 1,
                      int intra_op_parallelism_threads = -1);
 
   // Set the platform backing the service, or nullptr for the default platform.
-  LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  LocalClientOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
   // programs.
@@ -61,7 +61,7 @@ class LocalClientOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
   int number_of_replicas_;
   int intra_op_parallelism_threads_;
 };
@@ -74,7 +74,7 @@ class ClientLibrary {
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
       const LocalClientOptions& options);
 
@@ -84,14 +84,14 @@ class ClientLibrary {
 
   // Returns the service from the service thread. Only used in unit tests to
   // access user computations from client.
-  static LocalService* GetXlaService(perftools::gputools::Platform* platform);
+  static LocalService* GetXlaService(se::Platform* platform);
 
   // Singleton constructor-or-accessor for compile-only clients. Arguments:
   //
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
   static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
 
   // Clears the local instance and compile only instance caches. The client
   // pointers returned by the previous GetOrCreateLocalClient() or
@@ -120,12 +120,10 @@ class ClientLibrary {
   };
 
   tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
-  std::unordered_map<perftools::gputools::Platform::Id,
-                     std::unique_ptr<LocalInstance>>
+  std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
       local_instances_ GUARDED_BY(service_mutex_);
 
-  std::unordered_map<perftools::gputools::Platform::Id,
-                     std::unique_ptr<CompileOnlyInstance>>
+  std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
       compile_only_instances_ GUARDED_BY(service_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 30594243dc..d951c44cb9 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace se = ::perftools::gputools;
-
 using xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 98ee7c62c9..42812b936f 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -167,7 +167,7 @@ class LocalClient : public Client {
   StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
 
   // Returns the platform that the underlying service targets.
-  perftools::gputools::Platform* platform() const;
+  se::Platform* platform() const;
 
   // Returns the number of devices on the system of the service platform
   // type. Not all devices may be supported by the service (see
diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h
index 23a622b1ad..1a51fdee68 100644
--- a/tensorflow/compiler/xla/device_util.h
+++ b/tensorflow/compiler/xla/device_util.h
@@ -29,7 +29,7 @@ namespace xla {
 
 // Returns a string that represents the device in terms of platform and ordinal;
 // e.g. the first CUDA device will be "cuda:0"
-string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) {
+string DeviceIdentifier(se::StreamExecutor* stream_exec) {
   return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
                                      stream_exec->device_ordinal());
 }
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 1a095a82cc..a306ae16ba 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
+// Pulls in the ::stream_executor -> ::xla::se namespace alias.
+#include "tensorflow/compiler/xla/types.h"
+
 // These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -37,10 +40,6 @@ struct ThreadPoolDevice;
 
 namespace xla {
 
-// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
-// is complete, add "using namespace se = stream_executor" here and
-// s/stream_executor/se::/ to match our idiom elsewhere.
-
 class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index 414829d6e7..0b100bd108 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   std::unique_ptr<GRPCService> grpc_service(new GRPCService());
   TF_ASSIGN_OR_RETURN(grpc_service->service_,
                       ::xla::Service::NewService(platform));
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index 7c9e484517..fad74375bd 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service {
   // that the service should target. If platform is null then the default
   // platform is used.
   static StatusOr<std::unique_ptr<GRPCService>> NewService(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
 
   ::grpc::Status Computation(::grpc::ServerContext* context,
                              const ComputationRequest* arg,
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 4f819a743c..359582a78c 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -204,7 +204,7 @@ StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
 }
 
 void AllocationTracker::AddAllocationOrIncrementRefCount(
-    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+    se::DeviceMemoryBase device_memory, int device_ordinal) {
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   if (it == allocation_map.end()) {
@@ -215,8 +215,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
   }
 }
 
-Status AllocationTracker::DecrementRefCount(
-    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
+                                            int device_ordinal) {
   AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
   auto it = allocation_map.find(device_memory.opaque());
   TF_RET_CHECK(it != allocation_map.end());
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 038aee8541..60e93358ef 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -77,7 +77,7 @@ class AllocationTracker {
   // Data structure encapsulating single memory allocation on the device.
   struct Allocation {
     // The pointer to this allocation.
-    perftools::gputools::DeviceMemoryBase device_memory;
+    se::DeviceMemoryBase device_memory;
 
     // The device that the memory is allocated on.
     int device_ordinal;
@@ -103,13 +103,13 @@ class AllocationTracker {
 
   // Adds the given device address to the allocation tracker, or if it already
   // exists, then increment it's reference count.
-  void AddAllocationOrIncrementRefCount(
-      perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
+  void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
+                                        int device_ordinal)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Decrements the reference count of the given device memory. Then, if it is
   // zero, deallocate the memory.
-  Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
+  Status DecrementRefCount(se::DeviceMemoryBase device_memory,
                            int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // A map from device memory opaque value to allocation. One such map is
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 05f2d06278..a582dbffd6 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -36,19 +36,14 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
-BackendOptions& BackendOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+BackendOptions& BackendOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* BackendOptions::platform() const {
-  return platform_;
-}
+se::Platform* BackendOptions::platform() const { return platform_; }
 
 BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
     int num_threads) {
@@ -77,7 +72,7 @@ struct Backend::EigenThreadPoolWrapper {
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
     const BackendOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
@@ -121,7 +116,7 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
 }
 
 Backend::Backend(
-    perftools::gputools::Platform* platform, Compiler* compiler,
+    se::Platform* platform, Compiler* compiler,
     tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
     TransferManager* transfer_manager, ComputationPlacer* computation_placer,
     int intra_op_parallelism_threads)
@@ -178,7 +173,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
   return intra_op_thread_pool_wrapper_->pool.get();
 }
 
-StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
+StatusOr<se::StreamExecutor*> Backend::stream_executor(
     int device_ordinal) const {
   if (device_ordinal < 0 ||
       device_ordinal > stream_executors_.back()->device_ordinal()) {
@@ -201,9 +196,9 @@ StatusOr<bool> Backend::devices_equivalent(int device_ordinal_a,
   // bit crude but works for GPUs which is the important case where we compile
   // an executable for one GPU and want to know if it will run (well) on
   // another.
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a,
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a,
                       stream_executor(device_ordinal_a));
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b,
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b,
                       stream_executor(device_ordinal_b));
   return (executor_a->GetDeviceDescription().name() ==
           executor_b->GetDeviceDescription().name());
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index b5ca483b72..d32a0a400d 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -44,8 +44,8 @@ namespace xla {
 class BackendOptions {
  public:
   // Set the platform backing the backend, or nullptr for the default platform.
-  BackendOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  BackendOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Sets the thread pool size for parallel execution of an individual operator.
   // The default value of -1 will result in initializing the thread pool with
@@ -54,7 +54,7 @@ class BackendOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
+  se::Platform* platform_ = nullptr;
   int intra_op_parallelism_threads_ = -1;
 };
 
@@ -66,7 +66,7 @@ class BackendOptions {
 //    StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
 class Backend {
  public:
-  using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
+  using StreamPtr = Pool<se::Stream>::SmartPtr;
 
   // Creates a new backend.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
@@ -79,7 +79,7 @@ class Backend {
   ~Backend();
 
   // Accessors for the various objects.
-  perftools::gputools::Platform* platform() const { return platform_; }
+  se::Platform* platform() const { return platform_; }
   Compiler* compiler() const { return compiler_; }
   DeviceMemoryAllocator* memory_allocator() const {
     return memory_allocator_.get();
@@ -96,19 +96,17 @@ class Backend {
 
   // Returns stream executors of all supported devices for this backend. The
   // executors are ordered by the device ordinal.
-  const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
-      const {
+  const std::vector<se::StreamExecutor*>& stream_executors() const {
     return stream_executors_;
   }
 
   // Returns the stream executor for the given device ordinal.
-  StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
-      int device_ordinal) const;
+  StatusOr<se::StreamExecutor*> stream_executor(int device_ordinal) const;
 
   // Returns the stream executor for the default device ordinal. This stream
   // executor can only be used when the number of computations is 1 (replication
   // can be > 1).
-  perftools::gputools::StreamExecutor* default_stream_executor() const {
+  se::StreamExecutor* default_stream_executor() const {
     CHECK(!stream_executors_.empty());
     return stream_executors_[0];
   }
@@ -117,8 +115,7 @@ class Backend {
   // internal pool, or by constructing/initializating it, and returns the result
   // to the caller.
   StatusOr<StreamPtr> BorrowStream(int device_ordinal);
-  StatusOr<StreamPtr> BorrowStream(
-      perftools::gputools::StreamExecutor* executor);
+  StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);
 
   // Returns a function to borrow a stream, as `BorrowStream` above does.
   // Purely for convenience, the caller could rather make this anonymous
@@ -157,29 +154,26 @@ class Backend {
 
  private:
   struct EigenThreadPoolWrapper;
-  Backend(perftools::gputools::Platform* platform, Compiler* compiler,
-          tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-              stream_executors,
+  Backend(se::Platform* platform, Compiler* compiler,
+          tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
           TransferManager* transfer_manager,
           ComputationPlacer* computation_placer,
           int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
 
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
   Compiler* compiler_;
   TransferManager* transfer_manager_;
   ComputationPlacer* computation_placer_;
 
   // Vector of stream executors. stream_executors_[0] is the default executor.
-  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
+  std::vector<se::StreamExecutor*> stream_executors_;
 
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<perftools::gputools::StreamExecutor*,
-           Pool<perftools::gputools::Stream>>
-      stream_pools_ GUARDED_BY(mu_);
+  std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index fb70ea5315..c9f78a0f9f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -37,7 +37,7 @@ limitations under the License.
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
-CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+CompileOnlyService::NewService(se::Platform* platform) {
   ServiceOptions default_options;
   default_options.set_platform(platform);
   return NewService(default_options);
@@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
 
 /* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
 CompileOnlyService::NewService(const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index dd8de42a0f..c10609e67f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -34,7 +34,7 @@ class CompileOnlyService : public Service {
   // platform that the service should target. If platform is null then the
   // default platform is used.
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
-      perftools::gputools::Platform* platform);
+      se::Platform* platform);
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
       const ServiceOptions& options);
 
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 0392d4af48..8b01a6c4b5 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -23,26 +23,21 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
     tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      Compiler::CompilerFactory>*
+/* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
 Compiler::GetPlatformCompilerFactories() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, CompilerFactory>;
+  static auto* r = new std::map<se::Platform::Id, CompilerFactory>;
   return r;
 }
 
 /* static */
-std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
+std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
 Compiler::GetPlatformCompilers() {
-  static auto* r = new std::map<perftools::gputools::Platform::Id,
-                                std::unique_ptr<Compiler>>;
+  static auto* r = new std::map<se::Platform::Id, std::unique_ptr<Compiler>>;
   return r;
 }
 
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index b4b53ae2ed..5c14591d93 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -70,7 +70,7 @@ class AotCompilationOptions {
   virtual ~AotCompilationOptions() = default;
 
   // Returns the ID of the platform to which these options apply.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Optional allocator that may be used for allocating temp space on the device
   // during compilation.
@@ -109,7 +109,7 @@ class Compiler {
   virtual ~Compiler() {}
 
   // Returns the ID of the platform that this compiler targets.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Runs Hlo passes to optimize the given Hlo module, returns the optimized
   // module.
@@ -120,8 +120,7 @@ class Compiler {
   // algorithm over those buffers, to see which variant is fastest.  Any space
   // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
@@ -137,8 +136,7 @@ class Compiler {
   //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
@@ -151,8 +149,7 @@ class Compiler {
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
@@ -171,14 +168,12 @@ class Compiler {
   // be a singleton, so no ownership is transferred.
   //
   // Precondition: a platform kind must not be registered more than once.
-  static void RegisterCompilerFactory(
-      perftools::gputools::Platform::Id platform_id,
-      CompilerFactory compiler_factory);
+  static void RegisterCompilerFactory(se::Platform::Id platform_id,
+                                      CompilerFactory compiler_factory);
 
   // Returns the compiler singleton pointer if it is available for the given
   // platform, or an error status if it is not.
-  static StatusOr<Compiler*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+  static StatusOr<Compiler*> GetForPlatform(const se::Platform* platform);
 
   // Returns a function that computes the size in bytes of the logical
   // buffer that contains a shape.
@@ -198,12 +193,12 @@ class Compiler {
   static tensorflow::mutex platform_compiler_mutex_;
 
   // Map from platform kind to compiler factory.
-  static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
+  static std::map<se::Platform::Id, CompilerFactory>*
   GetPlatformCompilerFactories();
 
   // Map from platform kind to compiler instance, if we made one already (based
   // on the factories above).
-  static std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
+  static std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
   GetPlatformCompilers();
 };
 
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index 657fba6b62..7c1bacff92 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@@ -132,11 +130,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
     ComputationPlacer::platform_computation_placer_mutex_(
         tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      ComputationPlacer::State>*
+/* static */ std::map<se::Platform::Id, ComputationPlacer::State>*
 ComputationPlacer::GetPlatformComputationPlacers() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
+  static auto* r = new std::map<se::Platform::Id, ComputationPlacer::State>;
   return r;
 }
 
@@ -147,10 +143,10 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
-                                                    &CreateComputationPlacer);
-  xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
-                                                    &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      stream_executor::host::kHostPlatformId, &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 737ccabaa7..737d00e93e 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -80,13 +80,13 @@ class ComputationPlacer {
 
   // Registers a computation placer creation function for a particular platform.
   static void RegisterComputationPlacer(
-      perftools::gputools::Platform::Id platform_id,
+      se::Platform::Id platform_id,
       ComputationPlacerCreationFunction creation_function);
 
   // Returns the computation placer singleton pointer if it is available for the
   // given platform, or an error status if it is not.
   static StatusOr<ComputationPlacer*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+      const se::Platform* platform);
 
  private:
   // The mutex that guards the platform-to-computation placer map.
@@ -101,10 +101,9 @@ class ComputationPlacer {
   };
 
   // Map from platform kind to computation placer singleton.
-  static std::map<perftools::gputools::Platform::Id, State>*
-  GetPlatformComputationPlacers();
+  static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
 
-  perftools::gputools::Platform::Id platform_id_;
+  se::Platform::Id platform_id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
 };
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index e43777c5e5..e8472fd36b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -100,8 +100,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -440,8 +438,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
 }  // namespace
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* /*stream_exec*/,
+    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
     DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
@@ -454,8 +451,7 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 }
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
-    std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* stream_exec,
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* /*device_allocator*/) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
@@ -938,9 +934,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
 }  // namespace xla
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
-    return xla::MakeUnique<xla::cpu::CpuCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      stream_executor::host::kHostPlatformId,
+      []() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 3498139ab9..151af38438 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
                            RelocationModel relocation_model);
   ~CpuAotCompilationOptions() override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   // The triple used for compilation, similar to clang's -target flag.
   const string& triple() const { return triple_; }
@@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler {
   // Bring in
   // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
   //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //     std::vector<std::vector<se::StreamExecutor*>>
   //        stream_execs)
   using LLVMCompiler::Compile;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
                      const AotCompilationOptions& options) override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index c053703c35..aee62a4935 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -45,8 +45,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/host/host_stream.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable(
 
 Status CpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
+    std::vector<se::DeviceMemoryBase>* buffers) {
   CHECK_EQ(buffers->size(), assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -247,8 +245,7 @@ static Status DeallocateTempBuffers(
 
 StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        allocated_buffers,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
     std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
   auto result_buffer = MakeUnique<ShapedBuffer>(
@@ -322,7 +319,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
         "supported on CPU.");
   }
 
-  auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
+  auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
   se::Stream* stream = run_options->stream();
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index d3502b3a03..c3c2820c26 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -90,17 +90,16 @@ class CpuExecutable : public Executable {
   // assignment. Each vector element corresponds to a particular Index. If
   // a vector element already contains a non-null DeviceMemoryBase, then no
   // buffer is assigned for this element.
-  Status AllocateBuffers(
-      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
+  Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
+                         int device_ordinal,
+                         std::vector<se::DeviceMemoryBase>* buffers);
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
   Status ExecuteComputeFunction(
       const ExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
   // Create a ShapedBuffer for holding the result of the computation. The
@@ -111,8 +110,7 @@ class CpuExecutable : public Executable {
   // the returned ShapedBuffer).
   StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          allocated_buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
 
   // Returns the points-to set of the root instruction of the entry
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index f5e61aef53..9b39e7f576 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -34,8 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 namespace {
@@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
 }
 
 StatusOr<Shape> CpuTransferManager::TransferTupleBuffersFromOutfeed(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data) {
   return TransferBuffersFromOutfeedInternal(executor, buffer_data,
                                             /*is_tuple=*/true);
 }
 
 StatusOr<Shape> CpuTransferManager::TransferArrayBufferFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, void* destination,
-    int64 size_bytes) {
+    se::StreamExecutor* executor, void* destination, int64 size_bytes) {
   return TransferBuffersFromOutfeedInternal(
       executor, {{destination, size_bytes}}, /*is_tuple=*/false);
 }
 
 StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
     bool is_tuple) {
   std::vector<std::unique_ptr<CpuOutfeedBuffer>> buffers;
@@ -306,8 +303,8 @@ static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
 }
 
 static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId,
-                                                &CreateCpuTransferManager);
+  xla::TransferManager::RegisterTransferManager(
+      stream_executor::host::kHostPlatformId, &CreateCpuTransferManager);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
index 6c7524d947..3ecb0d2364 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
@@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager {
   CpuTransferManager();
   ~CpuTransferManager() override {}
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
-  Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
 
  private:
   // Transfers infeed data to device. InfeedBuffer->Done() must be
   // called to clean up the memory allocated for InfeedBuffer.
   StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source);
+      se::StreamExecutor* executor, int64 size, const void* source);
 
   // Helper that transfers a tuple of element buffers from the device's outfeed.
   StatusOr<Shape> TransferTupleBuffersFromOutfeed(
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data);
 
   // Helper that transfers an array buffer from the device's outfeed.
-  StatusOr<Shape> TransferArrayBufferFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, void* destination,
-      int64 size_bytes);
+  StatusOr<Shape> TransferArrayBufferFromOutfeed(se::StreamExecutor* executor,
+                                                 void* destination,
+                                                 int64 size_bytes);
 
   // On success, returns the shape that was transferred from the outfeed -- if
   // is_tuple is true, the returned shape will be a tuple of the returned shapes
   // for the given buffers.
   StatusOr<Shape> TransferBuffersFromOutfeedInternal(
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
       bool is_tuple);
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 07a9f0efcb..2d0f1d0be5 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -49,8 +49,6 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace cpu {
 
@@ -325,7 +323,7 @@ const void** Executor::GetOperandBuffers(HloInstruction* instruction) {
 
 Status ParallelCpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
+    std::vector<se::DeviceMemoryBase>* buffers) {
   CHECK_EQ(buffers->size(), assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 87c0a3df45..d87ba57a1e 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -89,17 +89,16 @@ class ParallelCpuExecutable : public Executable {
   // assignment. Each vector element corresponds to a particular Index. If
   // a vector element already contains a non-null DeviceMemoryBase, then no
   // buffer is assigned for this element.
-  Status AllocateBuffers(
-      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
+  Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
+                         int device_ordinal,
+                         std::vector<se::DeviceMemoryBase>* buffers);
 
   // Calls the generated functions in 'function_names_', performing the
   // computation with the given arguments using the supplied buffers.
   Status ExecuteComputeFunctions(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
   // Returns the points-to set of the root instruction of the entry
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 78e7aa48ac..35db4fd2a2 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -24,19 +24,16 @@ limitations under the License.
 namespace xla {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    const perftools::gputools::Platform* platform,
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        stream_executors)
+    const se::Platform* platform,
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors)
     : DeviceMemoryAllocator(platform),
       stream_executors_(stream_executors.begin(), stream_executors.end()) {}
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
-                                        bool retry_on_failure) {
-  TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
+StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                       GetStreamExecutor(device_ordinal));
-  perftools::gputools::DeviceMemoryBase result =
-      stream_executor->AllocateArray<uint8>(size);
+  se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
   if (size > 0 && result == nullptr) {
     return ResourceExhausted(
         "Failed to allocate request for %s (%lluB) on device ordinal %d",
@@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
 }
 
 tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
-    int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
+    int device_ordinal, se::DeviceMemoryBase* mem) {
   if (!mem->is_null()) {
-    TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
+    TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                         GetStreamExecutor(device_ordinal));
     // We make a local copy of 'mem' so the original is not zeroed out by the
     // Deallocate() call below. This gives us a better chance of
     // catching double-free bugs, since Deallocate silently succeeds for null
     // values.
-    perftools::gputools::DeviceMemoryBase mem_copy(*mem);
+    se::DeviceMemoryBase mem_copy(*mem);
     stream_executor->Deallocate(&mem_copy);
   }
   return tensorflow::Status::OK();
 }
 
-StatusOr<perftools::gputools::StreamExecutor*>
-StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
+StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
+    int device_ordinal) {
   if (device_ordinal < 0) {
     return InvalidArgument("device ordinal value (%d) must be non-negative",
                            device_ordinal);
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 39dfad84c1..240acf8973 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
+  explicit DeviceMemoryAllocator(const se::Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -43,20 +43,20 @@ class DeviceMemoryAllocator {
   // has only performance impact.
   // Allocate() should return a null pointer for a size-0 allocation.
   // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
+  virtual StatusOr<se::DeviceMemoryBase> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
-  virtual tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
+  virtual tensorflow::Status Deallocate(int device_ordinal,
+                                        se::DeviceMemoryBase* mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  const perftools::gputools::Platform* platform() const { return platform_; }
+  const se::Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
-  const perftools::gputools::Platform* platform_;
+  const se::Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -64,25 +64,23 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      const perftools::gputools::Platform* platform,
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          stream_executors);
+      const se::Platform* platform,
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
+  StatusOr<se::DeviceMemoryBase> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure = true) override;
-  tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
+  tensorflow::Status Deallocate(int device_ordinal,
+                                se::DeviceMemoryBase* mem) override;
 
   bool AllowsAsynchronousDeallocation() const override;
 
  private:
-  StatusOr<perftools::gputools::StreamExecutor*> GetStreamExecutor(
-      int device_ordinal);
+  StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
 
   // A vector indexed by device ordinal of StreamExecutors for each device of
   // the allocator's platform type. If an element is nullptr, then the device
   // with the respective device ordinal is not supported by XLA.
-  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
+  std::vector<se::StreamExecutor*> stream_executors_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 471d2fd6ce..caa46686be 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -61,10 +61,10 @@ Executable::ExecuteOnStreams(
 StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
-  perftools::gputools::Stream* stream = run_options->stream();
-  std::unique_ptr<perftools::gputools::Timer> timer;
+  se::Stream* stream = run_options->stream();
+  std::unique_ptr<se::Timer> timer;
   if (profile != nullptr) {
-    timer.reset(new perftools::gputools::Timer(stream->parent()));
+    timer.reset(new se::Timer(stream->parent()));
     stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
   }
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index a157235f8a..6f4cd99767 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -90,7 +90,7 @@ class Executable {
   // has completed.
   virtual Status PopulateExecutionProfile(
       HloExecutionProfile* hlo_execution_profile,
-      perftools::gputools::StreamExecutor* executor) {
+      se::StreamExecutor* executor) {
     return Status::OK();
   }
 
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index a99e2b7794..ddb687314e 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id,
@@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
 }
 
 Status GenericTransferManager::WriteSingleTupleIndexTable(
-    perftools::gputools::StreamExecutor* executor,
+    se::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
-    const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
+    const Shape& shape, se::DeviceMemoryBase* region) {
   TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape));
 
   std::vector<const void*> element_pointers;
@@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed(
 }
 
 Status GenericTransferManager::TransferBufferToInfeed(
-    perftools::gputools::StreamExecutor* executor, int64 size,
-    const void* source) {
+    se::StreamExecutor* executor, int64 size, const void* source) {
   return Unimplemented("Generic transfer to Infeed");
 }
 
 Status GenericTransferManager::TransferLiteralFromOutfeed(
-    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+    se::StreamExecutor* executor, const Shape& literal_shape,
     Literal* literal) {
   return Unimplemented(
       "Outfeed is not supported on this platform (b/30467474)");
 }
 
 Status GenericTransferManager::ResetDevices(
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*>
     /*executors*/) {
   return Unimplemented(
       "Device reset is not yet supported on this platform (b/30481585)");
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 63a7c820cf..0579099de4 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -36,46 +36,41 @@ namespace xla {
 // infeed.
 class GenericTransferManager : public TransferManager {
  public:
-  GenericTransferManager(perftools::gputools::Platform::Id platform_id,
-                         size_t pointer_size);
+  GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size);
   ~GenericTransferManager() override {}
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const ShapedBuffer& device_buffer) override;
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;
 
-  Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToDevice(se::StreamExecutor* executor,
                                  const Literal& literal,
                                  const ShapedBuffer& device_buffer) override;
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) override;
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
 
   Status ResetDevices(
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executors) override;
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
 
   int64 GetByteSizeRequirement(const Shape& shape) const override;
 
  protected:
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
 
   Status WriteSingleTupleIndexTable(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape,
-      perftools::gputools::DeviceMemoryBase* region) override;
+      se::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
+      const Shape& shape, se::DeviceMemoryBase* region) override;
 
  private:
   // The platform this transfer manager targets.
-  const perftools::gputools::Platform::Id platform_id_;
+  const se::Platform::Id platform_id_;
 
   // The size in bytes of pointers on this platform.
   const size_t pointer_size_;
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 2029c303d4..837f05244f 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index ea7f0eb374..c2fc35be4c 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -41,7 +41,7 @@ class BufferAllocations {
     // user-specified result buffers) to the given buffer index. The builder
     // will skip allocating buffers for registered buffer indices.
     void RegisterBuffer(BufferAllocation::Index index,
-                        perftools::gputools::DeviceMemoryBase address);
+                        se::DeviceMemoryBase address);
 
     // Builds a BufferAllocations object from the given buffer assignment.
     // `memory_allocator` is what this function uses to allocate device memory.
@@ -52,8 +52,7 @@ class BufferAllocations {
         DeviceMemoryAllocator* memory_allocator);
 
    private:
-    std::map<BufferAllocation::Index, perftools::gputools::DeviceMemoryBase>
-        registered_buffers_;
+    std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
   };
 
   BufferAllocations(const BufferAllocations&) = delete;
@@ -65,22 +64,20 @@ class BufferAllocations {
   // Returns the device address of buffer `buffer_index`. `buffer_index` must be
   // a valid index, i.e., in [0, buffer_count). This function returns null if
   // `buffer_index` is not assigned to a buffer address.
-  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+  se::DeviceMemoryBase GetDeviceAddress(
       BufferAllocation::Index buffer_index) const;
 
   // Same as above, but also adjusts the returned address for the offset and
   // size contained in the given slice.
-  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+  se::DeviceMemoryBase GetDeviceAddress(
       const BufferAllocation::Slice& buffer_slice) const;
 
-  perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
-    return temp_buffer_base_;
-  }
+  se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; }
 
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   tensorflow::Status TearDown(
-      const std::set<perftools::gputools::DeviceMemoryBase>& live_addresses,
+      const std::set<se::DeviceMemoryBase>& live_addresses,
       const BufferAssignment& buffer_assignment);
 
  private:
@@ -92,15 +89,15 @@ class BufferAllocations {
 
   // Sets the device address of buffer `buffer_index`.
   void SetBuffer(BufferAllocation::Index buffer_index,
-                 perftools::gputools::DeviceMemoryBase buffer);
+                 se::DeviceMemoryBase buffer);
 
   // An array of device pointers that stores the address of each buffer
   // indexed by Index. Each element can point to a temporary buffer, an
   // input buffer, or nullptr if no buffer is needed for that Index.
-  std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
+  std::vector<se::DeviceMemoryBase> buffers_;
 
   // The base address of the memory block that contains all temporary buffers.
-  perftools::gputools::DeviceMemoryBase temp_buffer_base_;
+  se::DeviceMemoryBase temp_buffer_base_;
 
   int device_ordinal_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
index 790ca535b1..dce8de2e30 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
 }
 
 Status ConditionalThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   // Copy the predicate value from device.
   bool predicate;
-  perftools::gputools::DeviceMemoryBase predicate_address =
+  se::DeviceMemoryBase predicate_address =
       buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
   stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
index 7725c46a3b..e40872688f 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk {
 
   Status Initialize(const GpuExecutable& executable) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice predicate_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 461747b699..64d3b84b8c 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 900d9cb624..6d845025b1 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk {
 
   // Does the convolution for the thunk on "stream".
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   class ScratchAllocator;
 
-  Status Convolve(
-      const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
-      perftools::gputools::DeviceMemory<float> input_data,
-      const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
-      perftools::gputools::DeviceMemory<float> filter_data,
-      const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
-      perftools::gputools::DeviceMemory<float> output_data,
-      const perftools::gputools::dnn::ConvolutionDescriptor&
-          convolution_descriptor,
-      const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
-      perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
-      perftools::gputools::dnn::ProfileResult* profile_result);
+  Status Convolve(const se::dnn::BatchDescriptor& input_descriptor,
+                  se::DeviceMemory<float> input_data,
+                  const se::dnn::FilterDescriptor& filter_descriptor,
+                  se::DeviceMemory<float> filter_data,
+                  const se::dnn::BatchDescriptor& output_descriptor,
+                  se::DeviceMemory<float> output_data,
+                  const se::dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const se::dnn::AlgorithmConfig& algorithm_config,
+                  se::Stream* stream, ScratchAllocator* scratch_allocator,
+                  se::dnn::ProfileResult* profile_result);
 
   const CudnnConvKind convolution_kind_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
index f4498663b1..bf912fbd14 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc
@@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
       mem_size_(mem_size) {}
 
 tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase destination_data =
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
   stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
   return tensorflow::Status::OK();
@@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
       mem_size_(mem_size) {}
 
 tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase destination_data =
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  se::DeviceMemoryBase destination_data =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
-  perftools::gputools::DeviceMemoryBase source_data =
+  se::DeviceMemoryBase source_data =
       buffer_allocations.GetDeviceAddress(source_buffer_);
   stream->ThenMemcpy(&destination_data, source_data, mem_size_);
   return tensorflow::Status::OK();
diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
index e2783fd255..2e7eb5f344 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h
@@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk {
   HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const void* source_address_;
@@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk {
   DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice source_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
index 58d9c8caff..68099fd638 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
@@ -28,7 +28,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-namespace se = ::perftools::gputools;
 namespace dnn = se::dnn;
 
 static std::pair<dnn::BatchDescriptor /*input_desc*/,
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
index c5fbb6d8a3..874f85a863 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
@@ -60,7 +60,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk {
       const CudnnBatchNormForwardInferenceThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
@@ -90,7 +90,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk {
       const CudnnBatchNormForwardTrainingThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
@@ -123,7 +123,7 @@ class CudnnBatchNormBackwardThunk : public Thunk {
       delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   BufferAllocation::Slice operand_;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index d6b457a91b..1790c50d4d 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -24,8 +24,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace se = perftools::gputools;
-
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index 516210ec2e..bc5d1ce94a 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -33,9 +33,8 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
   // If the `allocator` parameter is not null, we will use it to allocate temp
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
-  CudnnConvolutionAlgorithmPicker(
-      perftools::gputools::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* allocator)
+  CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
+                                  DeviceMemoryAllocator* allocator)
       : stream_exec_(stream_exec), allocator_(allocator) {}
 
   tensorflow::StringPiece name() const override {
@@ -52,7 +51,7 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
       const Shape& output_shape, const Window& window,
       const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // never null
+  se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
index e4ae839e1d..10b4c3de89 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -22,8 +22,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-namespace se = ::perftools::gputools;
-
 using se::DeviceMemory;
 using se::DeviceMemoryBase;
 using se::Stream;
@@ -215,14 +213,12 @@ string CudnnConvKindToString(CudnnConvKind kind) {
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::DeviceMemoryBase scratch_buf, const Window& window,
     const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result) {
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result) {
   ScratchBufAllocator scratch_allocator(scratch_buf);
   return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
                              input_buf, filter_buf, output_buf,
@@ -232,14 +228,12 @@ Status RunCudnnConvolution(
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::ScratchAllocator* scratch_allocator,
-    const Window& window, const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result) {
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result) {
   PrimitiveType output_primitive_type = output_shape.element_type();
   CHECK(output_primitive_type == F32 || output_primitive_type == F16)
       << ShapeUtil::HumanString(output_shape);
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
index 3dbfa2730d..944e4ac686 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
@@ -72,25 +72,21 @@ string CudnnConvKindToString(CudnnConvKind kind);
 // that size, if you like.
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::DeviceMemoryBase scratch_buf, const Window& window,
     const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result = nullptr);
 
 Status RunCudnnConvolution(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
-    const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
-    perftools::gputools::DeviceMemoryBase filter_buf,
-    perftools::gputools::DeviceMemoryBase output_buf,
-    perftools::gputools::ScratchAllocator* scratch_allocator,
-    const Window& window, const ConvolutionDimensionNumbers& dnums,
-    perftools::gputools::dnn::AlgorithmConfig algorithm,
-    perftools::gputools::Stream* stream,
-    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+    const Shape& output_shape, se::DeviceMemoryBase input_buf,
+    se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
+    se::dnn::ProfileResult* profile_result = nullptr);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
index 66931bdc8b..cc747addbd 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
index 52fb8c376d..24b1dca998 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -34,24 +34,24 @@ namespace gpu {
 // released on destruction.
 //
 // Not thread-safe in that AllocateBytes, destructor are not locked.
-class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
+class FftScratchAllocator : public se::ScratchAllocator {
  public:
   FftScratchAllocator(int device_ordinal,
                       DeviceMemoryAllocator* memory_allocator);
 
   ~FftScratchAllocator() override;
 
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override;
 
   int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
 
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override;
 
  private:
   const int device_ordinal_;
   DeviceMemoryAllocator* memory_allocator_;
-  std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
+  std::vector<se::DeviceMemoryBase> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
@@ -74,16 +74,15 @@ class FftThunk : public Thunk {
 
   // Does the FFT for the thunk on "stream".
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
-  const perftools::gputools::fft::Type fft_type_;
+  const se::fft::Type fft_type_;
   const std::vector<int64> fft_length_;
 
   float scale_factor_;
 
-  std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
+  std::unique_ptr<se::fft::Plan> fft_plan_;
 
   const BufferAllocation::Slice input_buffer_;
   const BufferAllocation::Slice output_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
index 283d21ca22..6e6966df39 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
@@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
 }
 
 tensorflow::Status ForThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (int64 i = 0; i < loop_limit_; ++i) {
     // Invoke loop body thunk sequence.
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h
index 832494d17e..c78d1c5068 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h
@@ -38,8 +38,7 @@ class ForThunk : public Thunk {
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const int64 loop_limit_;
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 38668ff455..0ec12f52d8 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -22,8 +22,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index df3edcefef..a18f425bc3 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -50,14 +50,12 @@ class GemmThunk : public Thunk {
 
   // Does the gemm operation for the thunk on "stream", which must be non-null.
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
   // Returns true if we'll perform autotuning if run on the given stream.  If
   // so, we want the GPU to be quiescent during autotuning, so as not to
   // introduce noise in our results.
-  bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream* stream) override {
+  bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
     return autotune_results_.count(
                stream->parent()->GetDeviceDescription().name()) != 0;
   }
@@ -79,8 +77,7 @@ class GemmThunk : public Thunk {
   // results.  The map's value is the best algorithm we've found for this thunk
   // on this device, or an error if none of the algorithms worked and we should
   // use the regular gemm without an algorithm.
-  std::unordered_map<string,
-                     StatusOr<::perftools::gputools::blas::AlgorithmType>>
+  std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
       autotune_results_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 07be2a0cf9..30bfc9351a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -91,8 +91,6 @@ limitations under the License.
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
@@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const {
 }  // namespace xla
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() {
-    return xla::MakeUnique<xla::gpu::GpuCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      stream_executor::cuda::kCudaPlatformId,
+      []() { return xla::MakeUnique<xla::gpu::GpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index c352d4d846..f3b02ae5d8 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler {
   // Bring in
   // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
   //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //     std::vector<std::vector<se::StreamExecutor*>>
   //        stream_execs)
   using LLVMCompiler::Compile;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
                      AotCompilationOptions const& options) override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
     // Capture just the pointer size, not the entire GpuCompiler object.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 28f9344795..5676d4de8e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -34,8 +34,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 namespace {
@@ -324,7 +322,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
             this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
         CHECK(!slice.allocation()->is_entry_computation_parameter());
 
-        perftools::gputools::DeviceMemoryBase src_base =
+        se::DeviceMemoryBase src_base =
             buffer_allocations->GetDeviceAddress(slice.index());
         CHECK(!src_base.is_null() || src_base.size() == 0);
         *device_memory = src_base;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index af9897769f..f13727ca9b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -33,8 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 // TODO(b/30467474) Once GPU infeed implementation settles, consider
@@ -153,8 +151,8 @@ static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
 }
 
 static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
-                                                &CreateGpuTransferManager);
+  xla::TransferManager::RegisterTransferManager(
+      stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
index 9aa369c668..d040a99975 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
@@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager {
   GpuTransferManager();
   ~GpuTransferManager() override {}
 
-  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
+  Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
+                                const void* source) override;
 
  private:
   // Initiates the infeed data transfers. InfeedBuffer->Done() must be
   // called to clean up the memory allocated for InfeedBuffer.
   StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source);
+      se::StreamExecutor* executor, int64 size, const void* source);
 
   // Enqueues infeed data buffers with the infeed manager after their
   // transfer completes.
-  Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
+  Status EnqueueBuffersToInfeed(se::StreamExecutor* executor,
                                 std::vector<gpu::InfeedBuffer*> buffers);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
index ee5b447c9c..3ddc1c0789 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
@@ -19,8 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
index 73d5a5ce35..d5f2216d46 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
@@ -46,7 +46,7 @@ namespace gpu {
 // the client. The client manages the memory of the buffer.
 class InfeedBuffer {
  public:
-  InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length)
+  InfeedBuffer(se::StreamExecutor* executor, int64 length)
       : executor_(executor), length_(length) {
     device_memory_ = executor_->AllocateArray<uint8>(length);
     CHECK(!device_memory_.is_null());
@@ -60,14 +60,12 @@ class InfeedBuffer {
   // client to manage memory for the infeed buffers.
   void Done() { delete this; }
 
-  perftools::gputools::DeviceMemoryBase* device_memory() {
-    return &device_memory_;
-  }
+  se::DeviceMemoryBase* device_memory() { return &device_memory_; }
 
  private:
-  perftools::gputools::StreamExecutor* executor_;  // Not owned.
+  se::StreamExecutor* executor_;  // Not owned.
   const int64 length_;
-  perftools::gputools::DeviceMemoryBase device_memory_;
+  se::DeviceMemoryBase device_memory_;
 };
 
 // Client-side class used to enqueue infeed buffers.
@@ -100,8 +98,7 @@ class InfeedManager {
   // new stream on the first invocation. On subsequent invocations, if
   // the cached executor is not the same as the requested executor,
   // returns null.
-  perftools::gputools::Stream* GetStream(
-      perftools::gputools::StreamExecutor* executor);
+  se::Stream* GetStream(se::StreamExecutor* executor);
 
  private:
   // TODO(b/30467474): Revisit if this mutex becomes a point of
@@ -121,10 +118,10 @@ class InfeedManager {
   tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;
 
   // Cached host to device stream for queuing infeed data.
-  std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
+  std::unique_ptr<se::Stream> host_to_device_stream_;
 
   // Executor that the host_to_device_stream belongs to. Not owned.
-  perftools::gputools::StreamExecutor* host_to_device_executor_;
+  se::StreamExecutor* host_to_device_executor_;
 };
 
 // Singleton creator-or-accessor: Returns the GPU infeed manager.
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index 2ac95ceb69..ea34d5b30c 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk(
       destination_buffer_(destination_buffer) {}
 
 Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                                    perftools::gputools::Stream* stream) {
+                                    se::Stream* stream) {
   VLOG(2) << "Infeeding to GPU ";
 
-  perftools::gputools::DeviceMemoryBase destination_address =
+  se::DeviceMemoryBase destination_address =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
 
   InfeedManager* infeed_manager = GetOrCreateInfeedManager();
@@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     std::vector<void*> tuple_element_addresses;
     for (BufferAllocation::Slice tuple_element_buffer :
          tuple_element_buffers_) {
-      perftools::gputools::DeviceMemoryBase tuple_element_address =
+      se::DeviceMemoryBase tuple_element_address =
           buffer_allocations.GetDeviceAddress(tuple_element_buffer);
 
       InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
index 86918705fa..93713cb12d 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
@@ -44,7 +44,7 @@ class InfeedThunk : public Thunk {
   InfeedThunk& operator=(const InfeedThunk&) = delete;
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
index 3790ed313b..a78b4ff830 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h
@@ -32,7 +32,7 @@ class IrEmitterContext {
  public:
   IrEmitterContext(const HloModule* hlo_module,
                    const BufferAssignment* buffer_assignment,
-                   const perftools::gputools::DeviceDescription* device_desc,
+                   const se::DeviceDescription* device_desc,
                    llvm::Module* llvm_module)
       : hlo_module_(hlo_module),
         buffer_assignment_(buffer_assignment),
@@ -47,7 +47,7 @@ class IrEmitterContext {
   const BufferAssignment& buffer_assignment() const {
     return *buffer_assignment_;
   }
-  const perftools::gputools::DeviceDescription& device_description() const {
+  const se::DeviceDescription& device_description() const {
     return *device_desc_;
   }
   llvm::Module* llvm_module() { return llvm_module_; }
@@ -56,7 +56,7 @@ class IrEmitterContext {
  private:
   const HloModule* hlo_module_;
   const BufferAssignment* buffer_assignment_;
-  const perftools::gputools::DeviceDescription* device_desc_;
+  const se::DeviceDescription* device_desc_;
   llvm::Module* llvm_module_;
   NameUniquer name_uniquer_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index c24dc1457f..d376ef7a24 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index df8971b083..b556befe66 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -61,8 +61,7 @@ class KernelThunk : public Thunk {
 
   // Executes the kernel for the thunk on "stream", which must be non-null.
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   // Buffers passed to the kernel as arguments.
@@ -82,13 +81,11 @@ class KernelThunk : public Thunk {
   // Describes how to load this kernel. ExecuteOnStream reuses this loader
   // specification for all executions.
   mutable tensorflow::mutex mutex_;
-  std::unique_ptr<perftools::gputools::MultiKernelLoaderSpec> loader_spec_
-      GUARDED_BY(mutex_);
+  std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);
 
   // Loaded kernels for each `StreamExecutor`
-  std::unordered_map<perftools::gputools::StreamExecutor*,
-                     perftools::gputools::KernelBase>
-      kernel_cache_ GUARDED_BY(mutex_);
+  std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
+      GUARDED_BY(mutex_);
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
index 18e673542c..d4100a898b 100644
--- a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc
@@ -19,8 +19,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-namespace se = ::perftools::gputools;
-
 Status MemzeroThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);
diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
index b4bb74d1dd..51c332d287 100644
--- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h
@@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk {
       : Thunk(Kind::kMemzero, hlo), dest_(dest) {}
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice dest_;
@@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk {
       : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}
 
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   uint32 value_;
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 5283d51cd1..d3fd0544fb 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 42d2d2af2e..c125474edb 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out,
                          const LaunchDimensions& launch_dims);
 
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc,
+    const Shape& shape, const se::DeviceDescription& device_desc,
     int unroll_factor = 1);
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
index d8a43091d4..c8510808f1 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc
@@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize(
 }
 
 tensorflow::Status SequentialThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
   for (const auto& thunk : thunks_) {
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
   }
diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
index 32c5b748ab..df17b8d67b 100644
--- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h
@@ -40,8 +40,7 @@ class SequentialThunk : public Thunk {
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   // The list of sub-thunks.
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 9eea958d12..a0c785ed91 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -85,8 +85,7 @@ class Thunk {
   // This value is not required to be constant for a given Thunk.  For example,
   // a Thunk that performs autotuning may return true for its first run and
   // false thereafter.
-  virtual bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream* /*stream*/) {
+  virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
     return false;
   }
 
@@ -104,8 +103,7 @@ class Thunk {
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
   virtual tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) = 0;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;
 
  private:
   Kind kind_;
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
index bd65e72393..ecb54857cc 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
@@ -17,8 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/util.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace gpu {
 
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
index 3b1a496328..8b459c29a1 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h
@@ -46,8 +46,7 @@ class TupleThunk : public Thunk {
   TupleThunk& operator=(const TupleThunk&) = delete;
 
   tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+      const BufferAllocations& buffer_allocations, se::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index c21559af6d..a9f3d619a3 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) {
 }
 
 Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                                   perftools::gputools::Stream* stream) {
-  perftools::gputools::DeviceMemoryBase condition_result_data =
+                                   se::Stream* stream) {
+  se::DeviceMemoryBase condition_result_data =
       buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);
 
   while (true) {
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h
index 4c9f45de9e..e589ca78a7 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h
@@ -47,7 +47,7 @@ class WhileThunk : public Thunk {
 
   Status Initialize(const GpuExecutable& executable) override;
   Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
-                         perftools::gputools::Stream* stream) override;
+                         se::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice condition_result_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 6fb91b9bef..be989846ef 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -88,7 +88,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
 // down how much time each HLO took.
 class HloExecutionProfile {
  public:
-  using DeviceDescription = perftools::gputools::DeviceDescription;
+  using DeviceDescription = se::DeviceDescription;
 
   HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
                       const HloProfileIndexMap* hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 2e834a79d9..171477299e 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -30,8 +30,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index f54fb44766..53f7c6fe4a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -80,7 +80,7 @@ class HloRunner {
     bool run_hlo_passes = false;
   };
 
-  explicit HloRunner(::perftools::gputools::Platform* platform);
+  explicit HloRunner(se::Platform* platform);
 
   ~HloRunner();
 
@@ -149,8 +149,7 @@ class HloRunner {
   // will be used to configure the replication parameters. Replicated executions
   // should pass the device_assignment parameter.
   ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
-      int64 device, ::perftools::gputools::Stream* stream,
-      DeviceAssignment* device_assignment);
+      int64 device, se::Stream* stream, DeviceAssignment* device_assignment);
 
   std::unique_ptr<Backend> backend_;
 };
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 5b9bf5faf3..76b3ecad26 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -41,9 +41,6 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
-
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
@@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime(
 }
 
 se::Platform::Id InterpreterCompiler::PlatformId() const {
-  return sep::kXlaInterpreterPlatformId;
+  return se::interpreter::kXlaInterpreterPlatformId;
 }
 
 HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
@@ -109,11 +106,12 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
-    return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
-  });
+  xla::Compiler::RegisterCompilerFactory(
+      se::interpreter::kXlaInterpreterPlatformId, []() {
+        return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
+      });
   xla::ComputationPlacer::RegisterComputationPlacer(
-      sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
+      se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index c8660c04d8..e90ae3e818 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler {
   ~InterpreterCompiler() override {}
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
-      std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
-      std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec,
+      std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
@@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler {
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
-  perftools::gputools::Platform::Id PlatformId() const override;
+  se::Platform::Id PlatformId() const override;
 
  private:
   Status RunHloOptimization(HloModule* hlo_module);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 883063d0f0..acfa79ea75 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -38,8 +38,6 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
-namespace se = ::perftools::gputools;
-
 InterpreterExecutable::InterpreterExecutable(
     std::unique_ptr<const HloModule> hlo_module)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 3caf9e7b82..97e9fa2c8e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -19,8 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 host::HostStream *AsExecutorStream(Stream *stream) {
@@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
 }
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index 77426b0820..9b109022fb 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -44,8 +44,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/timer.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
@@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
 };
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
index 3cf8506d1c..d27cd7502f 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
@@ -21,12 +21,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 
-namespace sei = ::perftools::gputools::interpreter;
-
 namespace xla {
 
 InterpreterTransferManager::InterpreterTransferManager()
-    : GenericTransferManager(sei::kXlaInterpreterPlatformId,
+    : GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId,
                              /*pointer_size=*/sizeof(void*)) {}
 
 }  // namespace xla
@@ -38,7 +36,8 @@ CreateInterpreterTransferManager() {
 
 static bool InitModule() {
   xla::TransferManager::RegisterTransferManager(
-      sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
+      stream_executor::interpreter::kXlaInterpreterPlatformId,
+      &CreateInterpreterTransferManager);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index 015e00e1e8..ce2f4d378c 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -28,11 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
-
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
@@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
 }
 
 static void InitializeXlaInterpreterPlatform() {
-  std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
-  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  std::unique_ptr<Platform> platform(new XlaInterpreterPlatform);
+  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
-REGISTER_MODULE_INITIALIZER(interpreter_platform,
-                            sep::InitializeXlaInterpreterPlatform());
+REGISTER_MODULE_INITIALIZER(
+    interpreter_platform,
+    stream_executor::interpreter::InitializeXlaInterpreterPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index 2f71b29be4..d68c5aa20d 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -23,8 +23,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/trace_listener.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 class XlaInterpreterPlatform : public Platform {
@@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform {
 };
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
index b7fb365b70..3272396ce5 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
@@ -14,12 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h
index 292f958449..a6cc10bcc1 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h
@@ -18,14 +18,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform.h"
 
-namespace perftools {
-namespace gputools {
+namespace stream_executor {
 namespace interpreter {
 
 extern const Platform::Id kXlaInterpreterPlatformId;
 
 }  // namespace interpreter
-}  // namespace gputools
-}  // namespace perftools
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 911b243fe2..b17c9d5045 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -23,7 +23,7 @@ limitations under the License.
 namespace xla {
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+    std::vector<std::vector<se::StreamExecutor*>> stream_execs,
     DeviceMemoryAllocator* device_allocator) {
   // Tensorflow tries to enable the following behaviors in all its threads:
   //
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index d74e81bb7f..f1c623508c 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler {
   // Bring in
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       se::StreamExecutor* stream_exec,
   //       DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       se::StreamExecutor* stream_exec,
   //       DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs,
+      std::vector<std::vector<se::StreamExecutor*>> stream_execs,
       DeviceMemoryAllocator* device_allocator) override;
 
  protected:
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 499f280211..0fa4061738 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -43,13 +43,11 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
     const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index aa974ee61a..7c63c0acc7 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 using tensorflow::str_util::Lowercase;
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index 69188820a7..571451ba43 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -34,29 +34,27 @@ class PlatformUtil {
   //
   // Note that, even if a platform is present with zero devices, if we *do* have
   // compilation support for it, it will be returned in this sequence.
-  static StatusOr<std::vector<perftools::gputools::Platform*>>
-  GetSupportedPlatforms();
+  static StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms();
 
   // Convenience function which returns the default supported platform for
   // tests. If exactly one supported platform is present, then this platform is
   // the default platform. If exactly two platforms are present and one of them
   // is the interpreter platform, then the other platform is the default
   // platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+  static StatusOr<se::Platform*> GetDefaultPlatform();
 
   // Convenience function which returns the sole supported platform. If
   // exactly one supported platform is present, then this platform is the
   // default platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
+  static StatusOr<se::Platform*> GetSolePlatform();
 
   // Returns the platform according to the given name. Returns error if there is
   // no such platform.
-  static StatusOr<perftools::gputools::Platform*> GetPlatform(
-      const string& platform_name);
+  static StatusOr<se::Platform*> GetPlatform(const string& platform_name);
 
   // Returns exactly one platform that does not have given name. Returns error
   // if there is no such platform, or there are multiple such platforms.
-  static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
+  static StatusOr<se::Platform*> GetPlatformExceptFor(
       const string& platform_name);
 
   // Returns a vector of StreamExecutors for the given platform. The vector is
@@ -64,8 +62,8 @@ class PlatformUtil {
   // element is nullptr, then the device is present by not supported by XLA.
   //
   // If the platform has no visible devices, a not-found error is returned.
-  static StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
-  GetStreamExecutors(perftools::gputools::Platform* platform);
+  static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
+      se::Platform* platform);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 52500e4e79..2df59c3556 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -54,8 +54,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
 using ::xla::source_map_util::InvalidParameterArgument;
@@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result,
 
 }  // namespace
 
-ServiceOptions& ServiceOptions::set_platform(
-    perftools::gputools::Platform* platform) {
+ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
   return *this;
 }
 
-perftools::gputools::Platform* ServiceOptions::platform() const {
-  return platform_;
-}
+se::Platform* ServiceOptions::platform() const { return platform_; }
 
 ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
   number_of_replicas_ = number_of_replicas;
@@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 }
 
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   ServiceOptions default_options;
   default_options.set_platform(platform);
   return NewService(default_options);
@@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
     const ServiceOptions& options) {
-  perftools::gputools::Platform* platform = options.platform();
+  se::Platform* platform = options.platform();
   std::unique_ptr<Backend> execute_backend;
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
@@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
 StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
 Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        stream_executors) {
+    tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors) {
   CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
   std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
   replicated_arguments.resize(options_.number_of_replicas());
@@ -349,8 +343,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<VersionedComputationHandle> versioned_handles,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
     DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
@@ -412,8 +405,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     const std::vector<const HloModuleProto*>& module_protos,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
     DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
@@ -493,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+    se::StreamExecutor* executor, ExecutionProfile* profile,
     DeviceMemoryAllocator* device_allocator) {
   std::shared_ptr<Executable> executable =
       compilation_cache_.LookUp(versioned_handle, *module_config);
@@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult(
   // Streams where the computation are launched, so we can wait on the streams
   // to complete.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
-  std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
+  std::vector<std::unique_ptr<se::Timer>> timers;
 
   // Global data handles for the computation results, one for each computation.
   std::vector<GlobalDataHandle> result_handles;
@@ -565,8 +557,7 @@ Service::ExecuteParallelAndRegisterResult(
       streams.push_back(std::move(stream));
 
       if (replica == 0 && profile != nullptr) {
-        timers.emplace_back(
-            new perftools::gputools::Timer(streams.back()->parent()));
+        timers.emplace_back(new se::Timer(streams.back()->parent()));
         streams.back()
             ->InitTimer(timers.back().get())
             .ThenStartTimer(timers.back().get());
@@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
   return computation->SetReturnValue(arg->operand());
 }
 
-StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
-Service::GetExecutors(const ExecutionOptions& execution_options,
-                      int64 requests_size, int64 request_index) const {
+StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
+    const ExecutionOptions& execution_options, int64 requests_size,
+    int64 request_index) const {
   if (execution_options.device_handles().empty()) {
     return FailedPrecondition(
         "device handles must be given to execute parallel computations");
@@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options,
         "handles.",
         requests_size, request_index, execution_options.device_handles_size());
   }
-  std::vector<perftools::gputools::StreamExecutor*> executors;
+  std::vector<se::StreamExecutor*> executors;
   for (const auto& device_handle : execution_options.device_handles()) {
     TF_ASSIGN_OR_RETURN(auto replicas,
                         Replicas(*execute_backend_, device_handle));
@@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
 
   std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
-  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<std::vector<se::StreamExecutor*>> all_executors;
   std::vector<VersionedComputationHandle> versioned_handles;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
   std::vector<string> computation_names;
@@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel(
   VLOG(1) << "running execute-graph-parallel request";
 
   std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
-  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<std::vector<se::StreamExecutor*>> all_executors;
   std::vector<const HloModuleProto*> module_protos;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
   std::vector<string> computation_names;
@@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const {
   return device_handle;
 }
 
-StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
+StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
     const Backend& backend, const DeviceHandle& device_handle) const {
-  std::vector<perftools::gputools::StreamExecutor*> replicas;
+  std::vector<se::StreamExecutor*> replicas;
   for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
     // From the computation placer, find out the device ids of the replicas for
     // the given device handle.
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index e399f1ac19..476bd0597d 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -53,8 +53,8 @@ namespace xla {
 class ServiceOptions {
  public:
   // Set the platform backing the service, or nullptr for the default platform.
-  ServiceOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
+  ServiceOptions& set_platform(se::Platform* platform);
+  se::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
   // programs.
@@ -66,7 +66,7 @@ class ServiceOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
+  se::Platform* platform_ = nullptr;
   int number_of_replicas_ = 1;
   int intra_op_parallelism_threads_ = -1;
 };
@@ -79,7 +79,7 @@ class Service : public ServiceInterface {
  public:
   // Factory method for creating a new Service.
   static StatusOr<std::unique_ptr<Service>> NewService(
-      perftools::gputools::Platform* platform = nullptr);
+      se::Platform* platform = nullptr);
   static StatusOr<std::unique_ptr<Service>> NewService(
       const ServiceOptions& options);
 
@@ -286,7 +286,7 @@ class Service : public ServiceInterface {
                               ExecuteResponse* result);
 
   // Prepare the executors for executing parallel.
-  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
+  StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
       const ExecutionOptions& execution_options, int64 requests_size,
       int64 request_index) const;
 
@@ -310,8 +310,7 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
   ResolveAndValidateArguments(
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          stream_executors);
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
@@ -329,7 +328,7 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Builds an Executable for the given HLO module proto.
@@ -338,7 +337,7 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const HloModuleProto& module_proto,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor,
+      se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
@@ -346,14 +345,12 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
       std::vector<VersionedComputationHandle> versioned_handles,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-      Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
       const std::vector<const HloModuleProto*>& module_protos,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-      Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
@@ -362,7 +359,7 @@ class Service : public ServiceInterface {
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+      se::StreamExecutor* executor, ExecutionProfile* profile,
       DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Runs the given executable with the given arguments and register the result
@@ -411,7 +408,7 @@ class Service : public ServiceInterface {
   // Returns the stream executors assigned to the replicas represented by the
   // given device handle. Each device_handle is a virtual replicated device that
   // represents a set of physical devices for the replicas.
-  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
+  StatusOr<std::vector<se::StreamExecutor*>> Replicas(
       const Backend& backend, const DeviceHandle& device_handle) const;
 
   Status MaybeDumpHloModule(const HloModule& module) const;
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 6c1f8feac7..7f3910cdb0 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -28,7 +28,7 @@ namespace xla {
 class ServiceExecutableRunOptions {
  public:
   using StreamBorrower =
-      std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
+      std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;
 
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@@ -45,14 +45,13 @@ class ServiceExecutableRunOptions {
   ExecutableRunOptions* mutable_run_options() { return &run_options_; }
 
   // Delegate to `ExecutableRunOptions` member.
-  perftools::gputools::Stream* stream() const { return run_options_.stream(); }
+  se::Stream* stream() const { return run_options_.stream(); }
   DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
   int device_ordinal() const { return run_options_.device_ordinal(); }
 
   // Borrows a stream and returns a smart pointer which returns the stream on
   // destruction.
-  StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
-      int device_ordinal) const {
+  StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
     return borrow_stream_
                ? borrow_stream_(device_ordinal)
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 6e9986165f..10a2aa2b30 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 using ::tensorflow::strings::Appendf;
@@ -146,7 +144,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
 
 std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
   auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
-  buffers_ = ShapeTree<perftools::gputools::DeviceMemoryBase>();
+  buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index b816df8385..62ba8f2734 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -41,8 +41,7 @@ class ShapedBuffer {
   // determines the number of device allocations (DeviceMemoryBase) held by the
   // ShapedBuffer.
   ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-               const perftools::gputools::Platform* platform,
-               int device_ordinal);
+               const se::Platform* platform, int device_ordinal);
 
   // Returns the shape of the on-host representation of the data held by this
   // ShapedBuffer.
@@ -52,35 +51,29 @@ class ShapedBuffer {
   // ShapedBuffer.
   const Shape& on_device_shape() const { return on_device_shape_; }
 
-  const perftools::gputools::Platform* platform() const { return platform_; }
+  const se::Platform* platform() const { return platform_; }
   int device_ordinal() const { return device_ordinal_; }
 
   // Return the root buffer of the shape (shape index {}).
-  const perftools::gputools::DeviceMemoryBase& root_buffer() const {
+  const se::DeviceMemoryBase& root_buffer() const {
     return buffer(/*index=*/{});
   }
 
   // Returns the buffer at the given shape index where index is defined as in
   // ShapeUtil::GetSubshape.
-  const perftools::gputools::DeviceMemoryBase& buffer(
-      const ShapeIndex& index) const {
+  const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const {
     return buffers_.element(index);
   }
 
   // Sets the device memory buffer at the given index.
-  void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
-                  const ShapeIndex& index) {
+  void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) {
     *buffers_.mutable_element(index) = buffer;
   }
 
   // Returns the underlying ShapeTree containing all the device addresses in the
   // ShapedBuffer.
-  const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
-    return buffers_;
-  }
-  ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
-    return buffers_;
-  }
+  const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
+  ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }
 
   // Set all device memory pointers in the object to null.
   void clear();
@@ -101,13 +94,13 @@ class ShapedBuffer {
   Shape on_device_shape_;
 
   // The platform the memory is allocated on.
-  const perftools::gputools::Platform* platform_;
+  const se::Platform* platform_;
 
   // The device the memory is allocated on.
   int device_ordinal_;
 
   // The tree of device buffers. Its shape is on_device_shape().
-  ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
+  ShapeTree<se::DeviceMemoryBase> buffers_;
 };
 
 std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 2f36e2b16e..be8231b73c 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -25,24 +25,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 /* static */ tensorflow::mutex
     TransferManager::platform_transfer_manager_mutex_(
         tensorflow::LINKER_INITIALIZED);
 
-/* static */ std::map<perftools::gputools::Platform::Id,
-                      TransferManager::State>*
+/* static */ std::map<se::Platform::Id, TransferManager::State>*
 TransferManager::GetPlatformTransferManagers() {
-  static auto* r =
-      new std::map<perftools::gputools::Platform::Id, TransferManager::State>;
+  static auto* r = new std::map<se::Platform::Id, TransferManager::State>;
   return r;
 }
 
 Status TransferManager::TransferArrayToDevice(
-    perftools::gputools::StreamExecutor* executor, const Literal& literal,
-    const perftools::gputools::DeviceMemoryBase& dest) {
+    se::StreamExecutor* executor, const Literal& literal,
+    const se::DeviceMemoryBase& dest) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
   TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
       << "On-device representation of "
@@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice(
 }
 
 StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
-    perftools::gputools::StreamExecutor* executor, const Shape& shape,
-    const perftools::gputools::DeviceMemoryBase& source) {
+    se::StreamExecutor* executor, const Shape& shape,
+    const se::DeviceMemoryBase& source) {
   TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
       << "Shape " << ShapeUtil::HumanString(shape)
       << " has a differently shaped representation on-device: "
@@ -112,8 +108,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
 }
 
 Status TransferManager::WriteTupleIndexTables(
-    perftools::gputools::StreamExecutor* executor,
-    const ShapedBuffer& device_buffer) {
+    se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
   VLOG(2) << "Writing tuple index tables for " << device_buffer;
 
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 9f2b5c4aec..410d2af7af 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -42,7 +42,7 @@ class TransferManager {
   virtual ~TransferManager() {}
 
   // Returns the ID of the platform that this transfer manager acts on.
-  virtual perftools::gputools::Platform::Id PlatformId() const = 0;
+  virtual se::Platform::Id PlatformId() const = 0;
 
   // Returns the shape of the on-device representation for the given shape on
   // the host. This is intended for use with ShapedBuffer where buffers are
@@ -58,48 +58,45 @@ class TransferManager {
   // DeviceShape(literal_shape) must be compatible, but need not have the same
   // layout.
   virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const ShapedBuffer& device_buffer) = 0;
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) = 0;
 
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
   // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
   // but need not have the same layout
-  virtual Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      const ShapedBuffer& device_buffer) = 0;
+  virtual Status TransferLiteralToDevice(se::StreamExecutor* executor,
+                                         const Literal& literal,
+                                         const ShapedBuffer& device_buffer) = 0;
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
   // transfer an array at a known address.
-  Status TransferArrayToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      const perftools::gputools::DeviceMemoryBase& dest);
+  Status TransferArrayToDevice(se::StreamExecutor* executor,
+                               const Literal& literal,
+                               const se::DeviceMemoryBase& dest);
   StatusOr<std::unique_ptr<Literal>> TransferArrayFromDevice(
-      perftools::gputools::StreamExecutor* executor, const Shape& shape,
-      const perftools::gputools::DeviceMemoryBase& source);
+      se::StreamExecutor* executor, const Shape& shape,
+      const se::DeviceMemoryBase& source);
 
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralToInfeed(
-      perftools::gputools::StreamExecutor* executor,
-      const Literal& literal) = 0;
+  virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor,
+                                         const Literal& literal) = 0;
 
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralFromOutfeed(
-      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
-      Literal* literal) = 0;
+  virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                            const Shape& literal_shape,
+                                            Literal* literal) = 0;
 
   // Resets the devices associated with this transfer manager.
   virtual Status ResetDevices(
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executor) = 0;
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executor) = 0;
 
   // Given an allocated ShapedBuffer, constructs the tuple index table(s) in
   // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the
   // ShapedBuffer is array-shaped this method does nothing.
-  Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor,
+  Status WriteTupleIndexTables(se::StreamExecutor* executor,
                                const ShapedBuffer& device_buffer);
 
   // Determines the byte size requirement for the given shape on the underlying
@@ -127,13 +124,13 @@ class TransferManager {
   // Precondition: a platform kind must not be registered more than once.
   typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
   static void RegisterTransferManager(
-      perftools::gputools::Platform::Id platform_id,
+      se::Platform::Id platform_id,
       TransferManagerCreationFunction transfer_manager);
 
   // Returns the transfer manager singleton pointer if it is available for the
   // given platform, or an error status if it is not.
   static StatusOr<TransferManager*> GetForPlatform(
-      const perftools::gputools::Platform* platform);
+      const se::Platform* platform);
 
  protected:
   // Transfer a memory block of the given size from 'source' buffer to the
@@ -143,35 +140,32 @@ class TransferManager {
   //
   // source is the source data that must be in the target-dependent layout that
   // the Infeed HLO used in the computation expects.
-  virtual Status TransferBufferToInfeed(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source) = 0;
+  virtual Status TransferBufferToInfeed(se::StreamExecutor* executor,
+                                        int64 size, const void* source) = 0;
 
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
   // size is the size to transfer to destination in bytes.
-  virtual Status TransferBufferFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source, int64 size,
-      void* destination);
+  virtual Status TransferBufferFromDevice(se::StreamExecutor* executor,
+                                          const se::DeviceMemoryBase& source,
+                                          int64 size, void* destination);
 
   // Transfer a memory block of the given size from 'source' buffer to the given
   // destination of the device.
   //
   // size is the size to transfer from source in bytes.
-  virtual Status TransferBufferToDevice(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source, perftools::gputools::DeviceMemoryBase* destination);
+  virtual Status TransferBufferToDevice(se::StreamExecutor* executor,
+                                        int64 size, const void* source,
+                                        se::DeviceMemoryBase* destination);
 
   // Writes the given device-memory pointers in 'elements' to the given region
   // to construct a tuple index table in the platform-specific tuple
   // representation.
   virtual Status WriteSingleTupleIndexTable(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0;
+      se::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
+      const Shape& shape, se::DeviceMemoryBase* region) = 0;
 
  private:
   // The mutex that guards the platform-to-transfer manager map.
@@ -186,8 +180,7 @@ class TransferManager {
   };
 
   // Map from platform kind to transfer manager singleton.
-  static std::map<perftools::gputools::Platform::Id, State>*
-  GetPlatformTransferManagers();
+  static std::map<se::Platform::Id, State>* GetPlatformTransferManagers();
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index 777ac167a3..bff60f25ec 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -34,7 +34,7 @@ namespace {
 
 class BitcastConvertTest : public ClientLibraryTestBase {
  public:
-  explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr)
+  explicit BitcastConvertTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 312d8f284d..69389dae3f 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -32,8 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
@@ -59,8 +57,7 @@ se::Platform* GetReferencePlatform() {
 }  // namespace
 
 ClientLibraryTestBase::ClientLibraryTestBase(
-    perftools::gputools::Platform* platform,
-    const LocalClientOptions& client_options)
+    se::Platform* platform, const LocalClientOptions& client_options)
     : client_(GetOrCreateLocalClientOrDie(client_options)),
       execution_options_(CreateDefaultExecutionOptions()) {
   CHECK_EQ(platform, client_options.platform());
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index b3212dd228..481d7c5c25 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -64,11 +64,10 @@ std::vector<TestCase> ExpandUseBfloat16(
 // A client library test establishes an in-process XLA client connection.
 class ClientLibraryTestBase : public ::testing::Test {
  protected:
-  explicit ClientLibraryTestBase(
-      perftools::gputools::Platform* platform = nullptr);
+  explicit ClientLibraryTestBase(se::Platform* platform = nullptr);
 
   // Creates a new ClientLibraryTestBase with custom client options.
-  ClientLibraryTestBase(perftools::gputools::Platform* platform,
+  ClientLibraryTestBase(se::Platform* platform,
                         const LocalClientOptions& client_options);
 
   // Returns the name of the test currently being run.
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index c15d808f1d..7ea82a791f 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -47,16 +47,14 @@ ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly};
 
 class ComputeConstantTest : public ::testing::Test {
  public:
-  explicit ComputeConstantTest(
-      perftools::gputools::Platform* platform = nullptr)
+  explicit ComputeConstantTest(se::Platform* platform = nullptr)
       : platform_(platform) {}
 
   string TestName() const {
     return ::testing::UnitTest::GetInstance()->current_test_info()->name();
   }
 
-  Client* ClientOrDie(::perftools::gputools::Platform* platform,
-                      ClientType client_type) {
+  Client* ClientOrDie(se::Platform* platform, ClientType client_type) {
     if (client_type == ClientType::kLocal) {
       StatusOr<Client*> result =
           ClientLibrary::GetOrCreateLocalClient(platform);
@@ -107,7 +105,7 @@ class ComputeConstantTest : public ::testing::Test {
     return result.ok() ? result.ValueOrDie() : false;
   }
 
-  perftools::gputools::Platform* platform_;
+  se::Platform* platform_;
 };
 
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 0842a8918b..e67a30d76c 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -36,7 +36,7 @@ namespace {
 
 class ConvertTest : public ClientLibraryTestBase {
  public:
-  explicit ConvertTest(perftools::gputools::Platform* platform = nullptr)
+  explicit ConvertTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 5f00c34002..464b8cbebb 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index a292eab1d1..ed16963b40 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -50,8 +50,6 @@ limitations under the License.
 
 using tensorflow::gtl::ArraySlice;
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 21f71fc91b..c5afe0c3e0 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 
 namespace {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 3e8e2360bb..28d7ab09cb 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -76,8 +76,7 @@ class HloTestBase : public ::testing::Test {
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
-  HloTestBase(::perftools::gputools::Platform* test_platform,
-              ::perftools::gputools::Platform* reference_platform);
+  HloTestBase(se::Platform* test_platform, se::Platform* reference_platform);
 
   ~HloTestBase() override {}
 
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 7e92439c49..2f46ee0be2 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -43,7 +43,7 @@ class LLVMCompilerTest : public ::testing::Test {
   ~LLVMCompilerTest() override {}
 
  protected:
-  using Platform = ::perftools::gputools::Platform;
+  using Platform = se::Platform;
 
   explicit LLVMCompilerTest(string platform_name)
       : platform_name_(std::move(platform_name)) {}
@@ -95,7 +95,7 @@ class LLVMCompilerTest : public ::testing::Test {
     modules.push_back(hlo_module->Clone());
     modules.push_back(std::move(hlo_module));
 
-    std::vector<std::vector<perftools::gputools::StreamExecutor *>> executors;
+    std::vector<std::vector<se::StreamExecutor *>> executors;
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 2462ea39f9..373dd3c5df 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -43,8 +43,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 96b976d25d..29fd985acf 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -35,8 +35,9 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-StatusOr<perftools::gputools::DeviceMemoryBase> TestAllocator::Allocate(
-    int device_ordinal, uint64 size, bool retry_on_failure) {
+StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
+                                                       uint64 size,
+                                                       bool retry_on_failure) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
@@ -46,8 +47,8 @@ StatusOr<perftools::gputools::DeviceMemoryBase> TestAllocator::Allocate(
   return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size);
 }
 
-tensorflow::Status TestAllocator::Deallocate(
-    int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
+tensorflow::Status TestAllocator::Deallocate(int device_ordinal,
+                                             se::DeviceMemoryBase* mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
@@ -88,7 +89,7 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const {
 }
 
 /* static */ TestAllocator* LocalClientTestBase::GetOrCreateAllocator(
-    perftools::gputools::Platform* platform) {
+    se::Platform* platform) {
   static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
   tensorflow::mutex_lock lock(mu);
 
@@ -115,8 +116,7 @@ struct LocalClientTestBase::EigenThreadPoolWrapper {
   std::unique_ptr<Eigen::ThreadPoolDevice> device;
 };
 
-LocalClientTestBase::LocalClientTestBase(
-    perftools::gputools::Platform* platform)
+LocalClientTestBase::LocalClientTestBase(se::Platform* platform)
     : local_client_(
           ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()),
       thread_pool_wrapper_(new EigenThreadPoolWrapper()) {
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index f0c73f04f6..7555d5e893 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -41,15 +41,15 @@ namespace xla {
 
 class TestAllocator : public StreamExecutorMemoryAllocator {
  public:
-  explicit TestAllocator(perftools::gputools::Platform* platform)
+  explicit TestAllocator(se::Platform* platform)
       : StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
   }
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure) override;
-  tensorflow::Status Deallocate(
-      int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+  tensorflow::Status Deallocate(int device_ordinal,
+                                se::DeviceMemoryBase* mem) override;
 
   // Return the number of allocations that have been performed.
   int64 allocation_count() const;
@@ -75,12 +75,10 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
 class LocalClientTestBase : public ::testing::Test {
  protected:
   struct EigenThreadPoolWrapper;
-  explicit LocalClientTestBase(
-      perftools::gputools::Platform* platform = nullptr);
+  explicit LocalClientTestBase(se::Platform* platform = nullptr);
   virtual ~LocalClientTestBase();
 
-  static TestAllocator* GetOrCreateAllocator(
-      perftools::gputools::Platform* platform);
+  static TestAllocator* GetOrCreateAllocator(se::Platform* platform);
 
   // Copy the given literal onto the default device and return a
   // ScopedShapedBuffer. Convenience wrapper around
@@ -128,7 +126,7 @@ class LocalClientTestBase : public ::testing::Test {
   // of the process. So make the allocator static.
   static TestAllocator* allocator_;
 
-  perftools::gputools::StreamExecutor* stream_executor_;
+  se::StreamExecutor* stream_executor_;
   TransferManager* transfer_manager_;
 
   LocalClient* local_client_;
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index efe6cc6787..8fabcaca1b 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -41,7 +41,7 @@ namespace {
 
 class MapTest : public ClientLibraryTestBase {
  public:
-  explicit MapTest(perftools::gputools::Platform* platform = nullptr)
+  explicit MapTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index cda1989fad..997a1d8273 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -339,8 +339,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
   return std::move(arguments);
 }
 
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module, bool allow_mixed_precision) {
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
+                       bool allow_mixed_precision) {
   return HloVerifier(allow_mixed_precision).Run(module).status();
 }
 
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index b5ab779574..30c147910c 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -68,8 +68,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 
 // Check that a given module satisfies various constraints before trying to
 // execute it.
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module,
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
                        bool allow_mixed_precision = false);
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index b52c718814..697d78fe6e 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -39,7 +39,7 @@ namespace {
 
 class VecOpsSimpleTest : public ClientLibraryTestBase {
  public:
-  explicit VecOpsSimpleTest(perftools::gputools::Platform* platform = nullptr)
+  explicit VecOpsSimpleTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 89ce2ce797..1e18b56799 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -37,8 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
 
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index ff3418a128..efb00d56c5 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 
 namespace xla {
 namespace {
-namespace se = ::perftools::gputools;
+
 namespace gtl = ::tensorflow::gtl;
 
 class HloProfileTest : public ClientLibraryTestBase {};
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 20f3f1b957..b645acb700 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -49,9 +49,7 @@ using ::Eigen::half;
 // Alias namespace ::stream_executor as ::xla::se.
 namespace stream_executor {}
 namespace xla {
-// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
-// removed in ::xla.
-// namespace se = ::stream_executor;
+namespace se = ::stream_executor;
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
-- 
GitLab


From 288bd10decc86b95ba043e14682bf217181b88ce Mon Sep 17 00:00:00 2001
From: "Karol M. Langner" <langner@users.noreply.github.com>
Date: Tue, 17 Apr 2018 23:04:13 -0700
Subject: [PATCH 1009/1262] Remove over-indentation

---
 tensorflow/docs_src/tutorials/layers.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 611d191506..37cd2bb139 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -613,9 +613,9 @@ following to `main()`:
 
 ```python
 # Set up logging for predictions
-  tensors_to_log = {"probabilities": "softmax_tensor"}
-  logging_hook = tf.train.LoggingTensorHook(
-      tensors=tensors_to_log, every_n_iter=50)
+tensors_to_log = {"probabilities": "softmax_tensor"}
+logging_hook = tf.train.LoggingTensorHook(
+    tensors=tensors_to_log, every_n_iter=50)
 ```
 
 We store a dict of the tensors we want to log in `tensors_to_log`. Each key is a
-- 
GitLab


From c72e6858b48d9104b718d4320454d47fde8fff4e Mon Sep 17 00:00:00 2001
From: Noah Eisen <ncteisen@google.com>
Date: Tue, 17 Apr 2018 23:06:57 -0700
Subject: [PATCH 1010/1262] No public changes.

PiperOrigin-RevId: 193309262
---
 tensorflow/core/distributed_runtime/rpc/grpc_util.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index ece56a2727..e211c33732 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -21,8 +21,8 @@ namespace tensorflow {
 ::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
                                      grpc::ByteBuffer* dst) {
   bool own_buffer;
-  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter, protobuf::Message>(
-      src, dst, &own_buffer);
+  return ::grpc::GenericSerialize<::grpc::ProtoBufferWriter,
+                                  protobuf::Message>(src, dst, &own_buffer);
 }
 
 // GrpcMaybeUnparseProto from a string simply copies the string to the
@@ -35,7 +35,7 @@ namespace tensorflow {
 }
 
 bool GrpcMaybeParseProto(::grpc::ByteBuffer* src, protobuf::Message* dst) {
-  grpc::ProtoBufferReader reader(src);
+  ::grpc::ProtoBufferReader reader(src);
   return dst->ParseFromZeroCopyStream(&reader);
 }
 
-- 
GitLab


From 2995582488e5de81aa9545e91ec975f5c280b9e2 Mon Sep 17 00:00:00 2001
From: MyungSung Kwak <yesmung@gmail.com>
Date: Wed, 18 Apr 2018 15:59:48 +0900
Subject: [PATCH 1011/1262] Fix wrong api name in apis.md

typed_output_tensor is the correct api name.
It is implemented in the interpreter class.

Signed-off-by: MyungSung Kwak <yesmung@gmail.com>
---
 tensorflow/contrib/lite/g3doc/apis.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index fe208e47d1..50cc146a87 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -29,7 +29,7 @@ interpreter->AllocateTensors();
 float* input = interpreter->typed_input_tensor<float>(0);
 // Fill `input`.
 interpreter->Invoke();
-float* output = interpreter->type_output_tensor<float>(0);
+float* output = interpreter->typed_output_tensor<float>(0);
 ```
 ### Data Alignment
 
-- 
GitLab


From 069756ce00faf1d1d34ccfdd45163d9a9af6c61b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 00:18:50 -0700
Subject: [PATCH 1012/1262] Enable the n=1 special case in the
 DeserializeSparse op.

The optimized case was previously dead because of two off-by-one errors (mea culpa).

PiperOrigin-RevId: 193314065
---
 tensorflow/core/kernels/serialize_sparse_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index 64e0a68c2c..9e041d98f7 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -340,7 +340,7 @@ class DeserializeSparseOp : public OpKernel {
             "but has a zero dimension ",
             serialized_sparse.shape().DebugString()));
 
-    if (num_sparse_tensors == 0 && serialized_sparse.shape().dims() == 1) {
+    if (num_sparse_tensors == 1 && serialized_sparse.shape().dims() == 0) {
       // Special case with a single sparse tensor. We can avoid data
       // motion in the Concat and Reshape.
       const auto& serialized_sparse_t = serialized_sparse.vec<T>();
-- 
GitLab


From 7b6941702271cc36ee1429c9fa71e4bcaaebb310 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 18 Apr 2018 16:35:45 +0800
Subject: [PATCH 1013/1262] Fix useless duplicate lines in *.py files

---
 tensorflow/compiler/xla/python/xla_client_test.py                | 1 -
 .../contrib/distributions/python/kernel_tests/shape_test.py      | 1 -
 tensorflow/contrib/eager/python/saver_test.py                    | 1 -
 tensorflow/contrib/kfac/python/ops/loss_functions_lib.py         | 1 -
 tensorflow/contrib/layers/python/layers/utils_test.py            | 1 -
 5 files changed, 5 deletions(-)

diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index d97264ea64..433ea56877 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -1160,7 +1160,6 @@ class EmbeddedComputationsTest(LocalComputationTest):
       self._ExecuteAndCompareClose(
           c, expected=np.sum(input_array, axis=tuple(dims)))
 
-    _ReduceAndTest(0)
     _ReduceAndTest(0)
     _ReduceAndTest(0, 1)
     _ReduceAndTest(0, 2)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
index c8d795c3f6..243b5a0348 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
@@ -584,7 +584,6 @@ class DistributionShapeTest(test.TestCase):
 
   def testDistributionShapeGetDimsStatic(self):
     with self.test_session():
-      shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       x = 1
       self.assertAllEqual((_empty_shape, _empty_shape, _empty_shape),
diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index 1a7f7b85e6..4032e755f6 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -102,7 +102,6 @@ class SaverTest(test.TestCase):
       # Can still restore it.
       saver.restore(ckpt_prefix)
       self.assertEqual(v1.read_value().numpy(), 1.0)
-      self.assertEqual(v1.read_value().numpy(), 1.0)
       # However, cannot restore it with default name.
       with self.assertRaisesOpError('not found in checkpoint'):
         saver = _saver.Saver([v1, v2]).restore(ckpt_prefix)
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
index 705a871d48..4279cb2792 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
@@ -33,7 +33,6 @@ _allowed_symbols = [
     "CategoricalLogitsNegativeLogProbLoss",
     "OnehotCategoricalLogitsNegativeLogProbLoss",
     "MultiBernoulliNegativeLogProbLoss",
-    "MultiBernoulliNegativeLogProbLoss",
     "insert_slice_in_zeros",
 ]
 
diff --git a/tensorflow/contrib/layers/python/layers/utils_test.py b/tensorflow/contrib/layers/python/layers/utils_test.py
index 3409860add..645dc1291e 100644
--- a/tensorflow/contrib/layers/python/layers/utils_test.py
+++ b/tensorflow/contrib/layers/python/layers/utils_test.py
@@ -294,7 +294,6 @@ class NPositiveIntegersTest(test.TestCase):
     self.assertEqual(utils.n_positive_integers(2, 2), (2, 2))
     self.assertEqual(utils.n_positive_integers(2, (2, 3)), (2, 3))
     self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
-    self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
     self.assertEqual(
         utils.n_positive_integers(3, tensor_shape.TensorShape([2, 3, 1])),
         (2, 3, 1))
-- 
GitLab


From 019d6479c35e095154206df10b693d288b44612f Mon Sep 17 00:00:00 2001
From: Joe Yearsley <josephelliotyearsley@gmail.com>
Date: Wed, 18 Apr 2018 11:40:17 +0100
Subject: [PATCH 1014/1262] Update debugger.md

Should be using normal softmax not sparse.
---
 tensorflow/docs_src/programmers_guide/debugger.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index f5a0eb0a20..f7817b06d4 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -400,7 +400,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
+diff = tf.losses.softmax_cross_entropy(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
-- 
GitLab


From 779664494d43b18a812361197dcbea2f25912c02 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 18 Apr 2018 20:12:14 +0800
Subject: [PATCH 1015/1262] Add shape check to TextLineDataset op

---
 tensorflow/core/ops/dataset_ops.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 7f4d63b024..f3b51d097c 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -383,10 +383,12 @@ REGISTER_OP("TextLineDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): validate
-                                                // that `filenames` is
-                                                // a scalar or a
-                                                // vector.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SqlDataset")
     .Input("driver_name: string")
-- 
GitLab


From c8e118877cb9e6d201a64f5627de72877bcb8da6 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 18 Apr 2018 08:10:05 -0700
Subject: [PATCH 1016/1262] Fix bug in importing while loops within a while
 loop.

PiperOrigin-RevId: 193358699
---
 .../python/framework/meta_graph_test.py       | 25 +++++++++++++++++++
 tensorflow/python/ops/control_flow_ops.py     | 10 +++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 5d5fb037fc..e5b157648e 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -522,6 +522,31 @@ class ScopedMetaGraphTest(test.TestCase):
         actual_grad_value = sess.run(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
+  def testImportWhileLoopInWhileLoop(self):
+    # Create a simple while loop.
+    with ops.Graph().as_default():
+      var = variables.Variable(0.0)
+      _, output = control_flow_ops.while_loop(lambda i, x: i < 5,
+                                              lambda i, x: (i + 1, x * 2.0),
+                                              [0, var])
+      output_name = output.name
+
+      # Generate a MetaGraphDef containing the while loop with an export scope.
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph()
+
+    # Restore the MetaGraphDef in a while loop in a new graph.
+    with ops.Graph().as_default():
+
+      def body(i, _):
+        meta_graph.import_scoped_meta_graph(meta_graph_def)
+        return i + 1, ops.get_default_graph().get_tensor_by_name(output_name)
+
+      _, x = control_flow_ops.while_loop(lambda i, x: i < 2, body, [0, 0.0],
+                                         name="")
+      with session.Session() as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(x)
+
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index fb53d9ffea..c43bbd4a1e 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -2379,7 +2379,15 @@ class WhileContext(ControlFlowContext):
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
     result = val
-    if val.name not in self._values:
+    new_value = val.name not in self._values
+    # Don't treat ops in this context as new values. Usually all known values
+    # are in self._values, except when we're importing a while loop inside this
+    # WhileContext. Since there's a cycle in this case, `val` may be part of the
+    # imported while loop but not yet processed by this context and added to
+    # self._values in _AddOpInternal. We only want to process external input
+    # tensors to the while loop here.
+    new_value &= val.op._control_flow_context is not self  # pylint: disable=protected-access
+    if new_value:
       self._values.add(val.name)
 
       # If we are in a grad context and val is from its forward context,
-- 
GitLab


From 39047daafcf12864606a2c7e349eacee7f3771b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 08:40:40 -0700
Subject: [PATCH 1017/1262] Allow default min/max ranges for int16 data types.

PiperOrigin-RevId: 193362891
---
 tensorflow/contrib/lite/toco/BUILD            |  1 +
 tensorflow/contrib/lite/toco/args.h           |  2 +
 .../graph_transformations.h                   | 18 ++++
 .../propagate_default_min_max.cc              | 86 +++++++++++++++++++
 .../contrib/lite/toco/toco_cmdline_flags.cc   | 16 +++-
 tensorflow/contrib/lite/toco/toco_flags.proto |  8 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  | 35 ++++++--
 tensorflow/contrib/lite/toco/tooling_util.cc  | 22 -----
 tensorflow/contrib/lite/toco/tooling_util.h   |  2 -
 9 files changed, 155 insertions(+), 35 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 398978b145..f696f4b845 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -238,6 +238,7 @@ cc_library(
         "graph_transformations/merge_reshape_into_preceding_transpose.cc",
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
+        "graph_transformations/propagate_default_min_max.cc",
         "graph_transformations/propagate_fake_quant_num_bits.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
         "graph_transformations/quantization_util.cc",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 71e7318ac3..c9662d05ce 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -227,6 +227,8 @@ struct ParsedTocoFlags {
   // TODO(aselle): command_line_flags  doesn't support doubles
   Arg<float> default_ranges_min = Arg<float>(0.);
   Arg<float> default_ranges_max = Arg<float>(0.);
+  Arg<float> default_int16_ranges_min = Arg<float>(0.);
+  Arg<float> default_int16_ranges_max = Arg<float>(0.);
   Arg<string> inference_type;
   Arg<string> inference_input_type;
   Arg<bool> drop_fake_quant = Arg<bool>(false);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 56b3dec5c4..8075d0205d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -190,6 +190,24 @@ DECLARE_GRAPH_TRANSFORMATION(Dequantize)
 DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
 DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights)
 
+class PropagateDefaultMinMax : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "PropagateDefaultMinMax"; }
+
+  bool has_any_ranges_defined() const { return !type_ranges_.empty(); }
+  void DefineTypeRange(ArrayDataType data_type, double min, double max) {
+    MinMax minmax;
+    minmax.min = min;
+    minmax.max = max;
+    type_ranges_.emplace_back(data_type, minmax);
+  }
+
+ private:
+  bool SetArrayMinMax(const string& array_name, Array* array);
+  std::vector<std::pair<ArrayDataType, MinMax>> type_ranges_;
+};
+
 class ResolveReshapeAttributes : public GraphTransformation {
  public:
   bool Run(Model* model, std::size_t op_index) override;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
new file mode 100644
index 0000000000..50b90e7c2b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Propagates default min/max values to any operator input/output array that
+// is missing them.
+//
+// When provided a set of min/max values for uint8 arrays this will rescale
+// the values for other data types as required and preserving the floating point
+// range within the new type.
+bool PropagateDefaultMinMax::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* op = it->get();
+
+  bool did_change = false;
+
+  for (const auto& input : op->inputs) {
+    auto& input_array = model->GetArray(input);
+    if (!input_array.minmax && !input_array.buffer) {
+      did_change |= SetArrayMinMax(input, &input_array);
+    }
+  }
+
+  for (const auto& output : op->outputs) {
+    auto& output_array = model->GetArray(output);
+    if (!output_array.minmax && !output_array.buffer) {
+      did_change |= SetArrayMinMax(output, &output_array);
+    }
+  }
+
+  return did_change;
+}
+
+// Sets the min/max on the given array, adjusting the reference_minmax for the
+// final data type of the array if it is already specified.
+bool PropagateDefaultMinMax::SetArrayMinMax(const string& array_name,
+                                            Array* array) {
+  CHECK(!array->minmax);
+
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(*array, ArrayDataType::kUint8);
+  for (const auto& type_range : type_ranges_) {
+    if (type_range.first == quantized_data_type) {
+      array->GetOrCreateMinMax() = type_range.second;
+      break;
+    }
+  }
+  if (!array->minmax) {
+    AddMessageF(
+        "No defaults specified for quantized data type %s of array %s, "
+        "skipping",
+        ArrayDataTypeName(quantized_data_type), array_name);
+    return false;
+  }
+
+  AddMessageF("Adding default minmax %g,%g to array %s when quantized as %s",
+              array->GetMinMax().min, array->GetMinMax().max, array_name,
+              ArrayDataTypeName(quantized_data_type));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index d1d68b6b47..74f98c8452 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -61,11 +61,21 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
-           "of min/max ranges used for quantization."),
+           "of min/max ranges used for quantization of uint8 arrays."),
       Flag("default_ranges_max", parsed_flags.default_ranges_max.bind(),
            parsed_flags.default_ranges_max.default_value(),
            "If defined, will be used as the default value for the max bound "
-           "of min/max ranges used for quantization."),
+           "of min/max ranges used for quantization of uint8 arrays."),
+      Flag("default_int16_ranges_min",
+           parsed_flags.default_int16_ranges_min.bind(),
+           parsed_flags.default_int16_ranges_min.default_value(),
+           "If defined, will be used as the default value for the min bound "
+           "of min/max ranges used for quantization of int16 arrays."),
+      Flag("default_int16_ranges_max",
+           parsed_flags.default_int16_ranges_max.bind(),
+           parsed_flags.default_int16_ranges_max.default_value(),
+           "If defined, will be used as the default value for the max bound "
+           "of min/max ranges used for quantization of int16 arrays."),
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
@@ -212,6 +222,8 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_max, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_int16_ranges_min, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_int16_ranges_max, FlagRequirement::kNone);
   READ_TOCO_FLAG(drop_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone);
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 751aca948c..869c512d93 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 15.
+// Next ID to use: 17.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -103,8 +103,14 @@ message TocoFlags {
   // for experimentation purposes only and should not be used in production:
   // they make it easy to quantize models, but the resulting quantized model
   // will be inaccurate.
+  //
+  // These values only apply to arrays quantized with the kUint8 data type.
   optional float default_ranges_min = 5;
   optional float default_ranges_max = 6;
+  // Equivalent versions of default_ranges_min/_max for arrays quantized with
+  // the kInt16 data type.
+  optional float default_int16_ranges_min = 15;
+  optional float default_int16_ranges_max = 16;
 
   // Ignore and discard FakeQuant nodes. For instance, that can be used to
   // generate plain float code without fake-quantization from a quantized
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index b69852453c..89cb2f85f8 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <set>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
 #include "tensorflow/contrib/lite/toco/dump_graphviz.h"
@@ -270,10 +271,6 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
 
-  // Fix any issues with IO edges. This must happen after any transform that
-  // may modify the structure of the edges.
-  FixEdgeArrays(model);
-
   if (quantize_output) {
     if (toco_flags.propagate_fake_quant_num_bits()) {
       RunGraphTransformations(model,
@@ -287,16 +284,38 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
                             });
   }
 
+  // Fix any issues with IO edges. This must happen after any transform that
+  // may modify the structure of the edges.
+  FixEdgeArrays(model);
+
   if (quantize_output) {
+    // If the user specified default min/max ranges we need to set all arrays
+    // that didn't either have a min/max specified or get one set via
+    // HardcodeMinMax or PropagateFakeQuantNumBits. This may require running
+    // HardcodeMinMax to move changes through the graph as we make changes.
+    auto propagate_default_min_max =
+        absl::make_unique<PropagateDefaultMinMax>();
     if (toco_flags.has_default_ranges_min() &&
         toco_flags.has_default_ranges_max()) {
-      UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(),
-                                  toco_flags.default_ranges_max());
-      // The new MinMax info may need to be propagated a bit.
+      propagate_default_min_max->DefineTypeRange(
+          ArrayDataType::kUint8, toco_flags.default_ranges_min(),
+          toco_flags.default_ranges_max());
+    }
+    if (toco_flags.has_default_int16_ranges_min() &&
+        toco_flags.has_default_int16_ranges_max()) {
+      propagate_default_min_max->DefineTypeRange(
+          ArrayDataType::kInt16, toco_flags.default_int16_ranges_min(),
+          toco_flags.default_int16_ranges_max());
+    }
+    if (propagate_default_min_max->has_any_ranges_defined()) {
       RunGraphTransformations(
           model, "default min-max range propagation graph transformations",
-          {new HardcodeMinMax});
+          {
+              propagate_default_min_max.release(),
+              new HardcodeMinMax,
+          });
     }
+
     CheckIsReadyForQuantization(*model);
     RunGraphTransformations(model, "quantization graph transformations",
                             {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index ecac0c28a5..cf2cbeedc7 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1474,28 +1474,6 @@ void CheckIsReadyForQuantization(const Model& model) {
   }
 }
 
-void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
-                                 double default_ranges_max) {
-  for (const auto& op : model->operators) {
-    for (const auto& input : op->inputs) {
-      auto& input_array = model->GetArray(input);
-      if (!input_array.minmax && !input_array.buffer) {
-        auto& minmax = input_array.GetOrCreateMinMax();
-        minmax.min = default_ranges_min;
-        minmax.max = default_ranges_max;
-      }
-    }
-    for (const auto& output : op->outputs) {
-      auto& output_array = model->GetArray(output);
-      if (!output_array.minmax && !output_array.buffer) {
-        auto& minmax = output_array.GetOrCreateMinMax();
-        minmax.min = default_ranges_min;
-        minmax.max = default_ranges_max;
-      }
-    }
-  }
-}
-
 int ElementSize(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kBool:
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 4c705f4e5f..5cc15fa57b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -188,8 +188,6 @@ T ConvertOperator(Operator* o, OperatorType type) {
 }
 
 void CheckIsReadyForQuantization(const Model& model);
-void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
-                                 double default_ranges_max);
 
 bool ReshapeIsEquivalentToTranspose(const Model& model,
                                     const TensorFlowReshapeOperator* op,
-- 
GitLab


From 5d8f98cdf0e4919e8558d661517c49960090a575 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Wed, 18 Apr 2018 23:49:52 +0800
Subject: [PATCH 1018/1262] Fix incorrect format in community/documentation.md

---
 .../docs_src/community/documentation.md       | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index d5bc7a5a7a..8639656d07 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -402,24 +402,24 @@ types and default values.
 
 For example:
 
-    ```c++
-    REGISTER_OP("PngDecode")
-      .Input("contents: string")
-      .Attr("channels: int = 0")
-      .Output("image: uint8")
-      .Doc(R"doc(
-    Decodes the contents of a PNG file into a uint8 tensor.
-
-    contents: PNG file contents.
-    channels: Number of color channels, or 0 to autodetect based on the input.
-      Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
-      If the input has a different number of channels, it will be transformed
-      accordingly.
-    image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
-      If `channels` is 0, the last dimension is determined
-      from the png contents.
-    )doc");
-    ```
+```c++
+REGISTER_OP("PngDecode")
+  .Input("contents: string")
+  .Attr("channels: int = 0")
+  .Output("image: uint8")
+  .Doc(R"doc(
+Decodes the contents of a PNG file into a uint8 tensor.
+
+contents: PNG file contents.
+channels: Number of color channels, or 0 to autodetect based on the input.
+  Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.
+  If the input has a different number of channels, it will be transformed
+  accordingly.
+image:= A 3-D uint8 tensor of shape `[height, width, channels]`.
+  If `channels` is 0, the last dimension is determined
+  from the png contents.
+)doc");
+```
 
 Results in this piece of Markdown:
 
@@ -429,12 +429,12 @@ Results in this piece of Markdown:
 
     #### Args:
 
-    *  <b>contents</b>: A string Tensor. PNG file contents.
-    *  <b>channels</b>: An optional int. Defaults to 0.
+    *  **contents**: A string Tensor. PNG file contents.
+    *  **channels**: An optional int. Defaults to 0.
        Number of color channels, or 0 to autodetect based on the input.
        Must be 0 for autodetect, 1 for grayscale, 3 for RGB, or 4 for RGBA.  If the
        input has a different number of channels, it will be transformed accordingly.
-    *  <b>name</b>: A name for the operation (optional).
+    *  **name**: A name for the operation (optional).
 
     #### Returns:
     A 3-D uint8 tensor of shape `[height, width, channels]`.  If `channels` is
@@ -442,7 +442,7 @@ Results in this piece of Markdown:
 
 Much of the argument description is added automatically. In particular, the doc
 generator automatically adds the name and type of all inputs, attrs, and
-outputs. In the above example, `<b>contents</b>: A string Tensor.` was added
+outputs. In the above example, `contents: A string Tensor.` was added
 automatically. You should write your additional text to flow naturally after
 that description.
 
@@ -664,10 +664,10 @@ This generates the following Args section in
 
     #### Args:
 
-    * <b>`contents`</b>: A `Tensor` of type `string`. 0-D.  The PNG-encoded
+    * **`contents`**: A `Tensor` of type `string`. 0-D.  The PNG-encoded
       image.
-    * <b>`channels`</b>: An optional `int`. Defaults to `0`. Number of color
+    * **`channels`**: An optional `int`. Defaults to `0`. Number of color
       channels for the decoded image.
-    * <b>`dtype`</b>: An optional `tf.DType` from: `tf.uint8,
+    * **`dtype`**: An optional `tf.DType` from: `tf.uint8,
       tf.uint16`. Defaults to `tf.uint 8`.
-    * <b>`name`</b>: A name for the operation (optional).
+    * **`name`**: A name for the operation (optional).
-- 
GitLab


From 5dd4bf753b8f708db69a7ab455a25fb0bb9821a5 Mon Sep 17 00:00:00 2001
From: Martin Wicke <577277+martinwicke@users.noreply.github.com>
Date: Tue, 17 Apr 2018 11:54:48 -0700
Subject: [PATCH 1019/1262] Merge pull request #18601 from
 yongtang/18598-tf.compat.as_str

Fix tf.compat.as_str returns bytes issue in Python 3
---
 tensorflow/python/util/compat.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 4163fcac79..738479c946 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -45,7 +45,6 @@ from tensorflow.python.util.tf_export import tf_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -68,7 +67,6 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
-@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,8 +91,12 @@ def as_text(bytes_or_text, encoding='utf-8'):
 # Convert an object to a `str` in both Python 2 and 3.
 if _six.PY2:
   as_str = as_bytes
+  tf_export('compat.as_bytes', 'compat.as_str')(as_bytes)
+  tf_export('compat.as_text')(as_text)
 else:
   as_str = as_text
+  tf_export('compat.as_bytes')(as_bytes)
+  tf_export('compat.as_text', 'compat.as_str')(as_text)
 
 
 @tf_export('compat.as_str_any')
-- 
GitLab


From 9187be7adff07be82856add498aa3ff4b5f95998 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Thu, 19 Apr 2018 00:05:05 +0800
Subject: [PATCH 1020/1262] add checks for compression_type and buffer_size
 also

---
 tensorflow/core/ops/dataset_ops.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index f3b51d097c..34f2c612ec 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -388,6 +388,10 @@ REGISTER_OP("TextLineDataset")
       // `filenames` must be a scalar or a vector.
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
       return shape_inference::ScalarShape(c);
+      // `compression_type` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // `buffer_size` could only be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
     });
 
 REGISTER_OP("SqlDataset")
-- 
GitLab


From fc1485183013b5e71cdc1b566e01083cbde8305f Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 18 Apr 2018 09:03:21 -0700
Subject: [PATCH 1021/1262] Avoid generating degenerate dimensions during
 gather expansions

This gets rid of two cases that used to introduce degenerate dimensions
(dimensions with bound = 1) into the while loop state:

 - Previously we'd explicitly reshape gathers using scalar indices to have a
   minor degenerate dimension.  With this CL we no longer do that - instead we
   push this into the code that looks up the index vector from the gather
   indices tensor.
 - Previously we'd have the accumulator (the tensor we're
   dynamic-update-slice-ing into) contain all of the degenerate window dims that
   the gather op would later elide (after the while loop).  With this CL we
   eagerly elide these dimensions as we slice out individual windows from the
   operand.

PiperOrigin-RevId: 193365863
---
 .../compiler/xla/service/gather_expander.cc   | 143 +++++++++---------
 .../xla/service/gather_expander_test.cc       |  57 +++++++
 2 files changed, 130 insertions(+), 70 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 1239f56364..2d3e4b1fcd 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -28,9 +28,15 @@ using tensorflow::gtl::ArraySlice;
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
     HloInstruction* gather_indices, int64 index_vector_dim) {
   const Shape& gather_indices_shape = gather_indices->shape();
+
+  if (gather_indices_shape.dimensions_size() == index_vector_dim) {
+    return gather_indices;
+  }
+
   if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) {
     return gather_indices;
   }
+
   std::vector<int64> permutation;
   permutation.reserve(gather_indices_shape.dimensions_size());
   for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
@@ -42,54 +48,35 @@ static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
   return MakeTransposeHlo(gather_indices, permutation);
 }
 
-// If the gather_indices holds scalar indices (i.e. gather_indices has rank N
-// and index_vector_dim is N) then reshape it to have a trailing degenerate
-// dimension.  This makes the code for slicing out the index vector more
-// uniform.
-static StatusOr<HloInstruction*> DeScalarizeGatherIndices(
-    HloInstruction* gather_indices, int64 index_vector_dim) {
-  const Shape& gather_indices_shape = gather_indices->shape();
-  if (index_vector_dim != gather_indices_shape.dimensions_size()) {
-    return gather_indices;
-  }
-
-  DCHECK_EQ(index_vector_dim, gather_indices_shape.dimensions_size());
-
-  std::vector<int64> result_shape_dims;
-  c_copy(gather_indices_shape.dimensions(),
-         std::back_inserter(result_shape_dims));
-  result_shape_dims.push_back(1);
-
-  return MakeReshapeHlo(result_shape_dims, gather_indices);
-}
-
 // Canonicalizes the gather_indices tensors so that we only have deal with some
 // specific cases in the while loop that does the heavy lifting.
 //
 // See the "High Level Algorithm" section for a broader picture.
 static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
     HloInstruction* gather_indices, int64 index_vector_dim) {
-  // If gather_indices holds scalar indices, normalize it to hold index vectors
-  // of size 1.
+  // Transpose the non-index-vector dimensions to the front.
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * descalarized_gather_indices,
-      DeScalarizeGatherIndices(gather_indices, index_vector_dim));
+      HloInstruction * transposed_gather_indices,
+      TransposeIndexVectorDimToLast(gather_indices, index_vector_dim));
+  bool indices_are_scalar =
+      index_vector_dim == gather_indices->shape().dimensions_size();
 
-  // Transpose the non-index-vector dimensions to the front.
-  TF_ASSIGN_OR_RETURN(HloInstruction * transposed_gather_indices,
-                      TransposeIndexVectorDimToLast(descalarized_gather_indices,
-                                                    index_vector_dim));
+  // The number of dimensions in gather_indices that are index dimensions.
+  const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1;
 
   // If there is only one index (i.e. gather_indices has rank 1 and this gather
   // is really just a dynamic slice) add a leading degenerate dimension for
   // uniformity.  Otherwise create a "collapsed" leading dimension that subsumes
   // all of the non-index-vector dimensions.
   const Shape& shape = transposed_gather_indices->shape();
-  if (shape.dimensions_size() == 1) {
+  if (shape.dimensions_size() == index_dims_in_gather_indices) {
     return PrependDegenerateDims(transposed_gather_indices, 1);
   } else {
-    return CollapseFirstNDims(transposed_gather_indices,
-                              shape.dimensions_size() - 1);
+    // Collapse all but the dimensions (0 or 1) in gather_indices containing the
+    // index vectors.
+    return CollapseFirstNDims(
+        transposed_gather_indices,
+        shape.dimensions_size() - index_dims_in_gather_indices);
   }
 }
 
@@ -156,48 +143,73 @@ static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
 static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
     const HloInstruction& gather, HloInstruction* induction_var,
     const std::vector<HloInstruction*>& incoming_loop_state) {
+  const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers();
   CHECK_EQ(incoming_loop_state.size(), 3);
   HloInstruction* const operand = incoming_loop_state[0];
   HloInstruction* const gather_indices = incoming_loop_state[1];
   HloInstruction* const output_accumulator = incoming_loop_state[2];
 
-  int64 index_vector_size = gather_indices->shape().dimensions(1);
+  bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1;
+  CHECK_EQ(has_scalar_indices,
+           dim_numbers.index_vector_dim() ==
+               gather.operand(1)->shape().dimensions_size());
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * induction_var_as_vector,
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
                        /*result_shape_bounds=*/{1}));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * index_into_gather_indices,
-      PadVectorWithZeros(induction_var_as_vector,
-                         /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
-
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * index_vector_2d,
-      MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
-                          {1, index_vector_size}));
+  HloInstruction* index_vector;
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * index_vector,
-                      ElideDegenerateDims(index_vector_2d, {0}));
+  if (has_scalar_indices) {
+    // In this case gather_indices has rank 1 and induction_var_as_vector (of
+    // shape {1}) is an index into this rank 1 tensor.
+    TF_ASSIGN_OR_RETURN(
+        index_vector,
+        MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1}));
+  } else {
+    // In this case gather_indices has rank 2 and induction_var_as_vector (of
+    // shape {1}) is an index into just the first dimension of this rank 2
+    // tensor.
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_into_gather_indices,
+        PadVectorWithZeros(induction_var_as_vector,
+                           /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
+
+    int64 index_vector_size = gather_indices->shape().dimensions(1);
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_vector_2d,
+        MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
+                            {1, index_vector_size}));
+
+    TF_ASSIGN_OR_RETURN(index_vector,
+                        ElideDegenerateDims(index_vector_2d, {0}));
+  }
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_start,
-                      ExpandIndexVectorIntoOperandSpace(
-                          index_vector, gather.gather_dimension_numbers(),
-                          operand->shape().dimensions_size()));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_start,
+      ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers,
+                                        operand->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice,
                       MakeDynamicSliceHlo(operand, gathered_slice_start,
                                           gather.gather_window_bounds()));
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_for_update,
-                      PrependDegenerateDims(gathered_slice, 1));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_with_dims_elided,
+      ElideDegenerateDims(gathered_slice,
+                          AsInt64Slice(dim_numbers.elided_window_dims())));
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * gathered_slice_for_update,
+      PrependDegenerateDims(gathered_slice_with_dims_elided, 1));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * index_vector_into_accumulator,
       PadVectorWithZeros(
           induction_var_as_vector, /*zeros_to_prepend=*/0,
-          /*zeros_to_append=*/gathered_slice->shape().dimensions_size()));
+          /*zeros_to_append=*/
+          gathered_slice_with_dims_elided->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * updated_accumulator,
@@ -213,26 +225,20 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
 
 static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
-    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count) {
+    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count,
+    const GatherDimensionNumbers& dim_numbers) {
   std::vector<int64> accumulator_state_shape_dims;
   accumulator_state_shape_dims.reserve(1 + window_bounds.size());
   accumulator_state_shape_dims.push_back(gather_loop_trip_count);
-  c_copy(window_bounds, std::back_inserter(accumulator_state_shape_dims));
+  for (int64 i = 0; i < window_bounds.size(); i++) {
+    if (!c_binary_search(dim_numbers.elided_window_dims(), i)) {
+      accumulator_state_shape_dims.push_back(window_bounds[i]);
+    }
+  }
   return BroadcastZeros(computation, element_type,
                         accumulator_state_shape_dims);
 }
 
-static StatusOr<HloInstruction*> ElideWindowDimsFromAccumulator(
-    HloInstruction* accumulator, const GatherDimensionNumbers& dim_numbers) {
-  std::vector<int64> dims_to_elide;
-  dims_to_elide.reserve(dim_numbers.elided_window_dims_size());
-  for (int64 elided_window_dim : dim_numbers.elided_window_dims()) {
-    dims_to_elide.push_back(elided_window_dim + 1);
-  }
-
-  return ElideDegenerateDims(accumulator, dims_to_elide);
-}
-
 // `accumulator` is almost the tensor the gather operation would have produced,
 // except that it has the dimensions in the wrong order -- the gather dimensions
 // are the major dimensions and the window dimensions are the minor dimensions.
@@ -331,7 +337,8 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
       HloInstruction * accumulator_init,
       CreateGatherLoopAccumulatorInitValue(
           computation, output_shape.element_type(),
-          gather_instr->gather_window_bounds(), gather_loop_trip_count));
+          gather_instr->gather_window_bounds(), gather_loop_trip_count,
+          gather_instr->gather_dimension_numbers()));
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
@@ -346,14 +353,10 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
                       gather_loop_result_or_error);
 
   HloInstruction* accumulator_result = gather_loop_result.back();
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_with_window_dims_elided,
-      ElideWindowDimsFromAccumulator(accumulator_result, dim_numbers));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * accumulator_with_output_gather_dims_decanonicalized,
-      AdjustGatherDimsInAccumulator(gather_indices->shape(),
-                                    accumulator_with_window_dims_elided,
+      AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result,
                                     dim_numbers.index_vector_dim()));
 
   return PermuteGatherAndWindowDims(
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index ba41ee8428..1c72ca0665 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -47,5 +47,62 @@ ENTRY main {
                            "indices are not supported."));
 }
 
+TEST(GatherExpanderTest, AvoidDegenerateDims) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[3,2] gather(operand, indices),
+      output_window_dims={0},
+      elided_window_dims={1},
+      gather_dims_to_operand_dims={1},
+      index_vector_dim=1,
+      window_bounds={3, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  HloInstruction* while_instr = nullptr;
+  for (auto* instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      ASSERT_EQ(while_instr, nullptr)
+          << "Expected exactly one while instruction in the entry computation "
+             "after gather expansion";
+      while_instr = instr;
+    }
+  }
+
+  ASSERT_NE(while_instr, nullptr)
+      << "Expected exactly one while instruction in the entry computation "
+         "after gather expansion";
+
+  // We want to avoid create while loop with shapes that have degenerate
+  // dimensions for TF gather.  In this case we expect the loop state to be of
+  // the shape (sNN[], s32[3,3]{1,0}, s32[2]{0}, s32[2,3]{1,0}).  The leading
+  // sNN is an implementation detail from WhileUtil::MakeCountedLoop so we don't
+  // check it here (though in theory the form of the while loop state is itself
+  // an implementation detail from WhileUtil::MakeCountedLoop).
+
+  const Shape& while_shape = while_instr->shape();
+  ASSERT_TRUE(ShapeUtil::IsTuple(while_shape));
+  ASSERT_EQ(ShapeUtil::TupleElementCount(while_shape), 4);
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {3, 3}),
+      ShapeUtil::GetTupleElementShape(while_shape, 1)));
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {2}),
+      ShapeUtil::GetTupleElementShape(while_shape, 2)));
+
+  EXPECT_TRUE(ShapeUtil::SameDimensions(
+      ShapeUtil::MakeShape(S32, {2, 3}),
+      ShapeUtil::GetTupleElementShape(while_shape, 3)));
+}
 }  // namespace
 }  // namespace xla
-- 
GitLab


From bdbf1554dddf2da6609a0eb7799ee0f3ca2d94b9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 16:20:01 +0000
Subject: [PATCH 1022/1262] Fix build failure in `bazel test -s --config=opt
 --cache_test_results=no //tensorflow/python/kernel_tests:init_ops_test`

With the most recent master the following test fails:
```
bazel test -s --config=opt --cache_test_results=no //tensorflow/python/kernel_tests:init_ops_test
...
...
...
    eye = linalg_ops.eye(n, dtype=self.dtype)
NameError: global name 'linalg_ops' is not defined
```

This fix fixes the test failure.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/init_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 820e56eb9b..f93bf0a17f 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -848,7 +848,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
     """
     n = projection_matrix.shape.as_list()[0]
     kernel = {}
-    eye = linalg_ops.eye(n, dtype=self.dtype)
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel[0] = projection_matrix
     kernel[1] = eye - projection_matrix
     return kernel
@@ -976,7 +976,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
     if p1_shape != p2.shape.as_list() or p1_shape != p3.shape.as_list():
       raise ValueError("The dimension of the matrices must be the same.")
     n = p1_shape[0]
-    eye = linalg_ops.eye(n, dtype=self.dtype)
+    eye = linalg_ops_impl.eye(n, dtype=self.dtype)
     kernel2x2x2 = {}
     def matmul(p1, p2, p3):
       return math_ops.matmul(math_ops.matmul(p1, p2), p3)
-- 
GitLab


From b234c288c1e3ec8f98ba99df738aa64b81659925 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 09:30:17 -0700
Subject: [PATCH 1023/1262] Updating some more tests in
 constant_folding_test.cc so that the tests evaluate the original and
 optimized graphs and check if their outputs are the same.

PiperOrigin-RevId: 193369280
---
 .../optimizers/constant_folding_test.cc       | 88 ++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 36625b68b7..4b41dae480 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -520,6 +520,25 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
+
+  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
+  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_known", x_known_t},
+                     {"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_known", x_known_t},
+                                {"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -572,6 +591,20 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
+  const std::vector<string> fetch = {"addn1"};
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -1056,6 +1089,20 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
+                                           "i2c", "i3a", "i3b"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
@@ -1888,6 +1935,14 @@ TEST_F(ConstantFoldingTest, Packing) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  const std::vector<string> fetch_nodes = {"i1", "i2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+
   // Make sure that the representation of the folded constant is space
   // efficient: in particular, the whole message should be smaller than 8k
   // (the size needed to naively encode 1000 floats folded twice).
@@ -1923,6 +1978,13 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
+  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
@@ -1963,6 +2025,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
@@ -1982,6 +2049,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch_nodes = {"o1", "o2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2036,6 +2108,10 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
     }
   }
   EXPECT_EQ(7, found);
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -2513,7 +2589,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   value_tensor.AsProtoTensorContent(value.mutable_tensor());
 
   GraphDef& graph = item.graph;
-  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
+  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
   AddNode("enter1", "Enter", {"x"},
           {{"T", type},
@@ -2539,6 +2615,10 @@ TEST_F(ConstantFoldingTest, Enter) {
   item.fetch.push_back("id3");
   item.fetch.push_back("id4");
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(item.fetch.size(), tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2566,6 +2646,12 @@ TEST_F(ConstantFoldingTest, Enter) {
       EXPECT_EQ("enter3", node.input(0));
     }
   }
+
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(item.fetch.size(), tensors.size());
+
+  for (int i = 0; i < item.fetch.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, TensorArraySize) {
-- 
GitLab


From 857ee499e35d94a61ca4c90a6f6a20bc9dee80c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 09:40:21 -0700
Subject: [PATCH 1024/1262] Simplify the break canonicalization transformer to
 use more of the base transformer helpers. Add support for the loop's else
 block.

PiperOrigin-RevId: 193370640
---
 .../autograph/converters/break_statements.py  | 92 +++++++++++------
 .../converters/break_statements_test.py       | 99 +++++++++++++------
 .../contrib/autograph/pyct/transformer.py     |  4 +
 3 files changed, 132 insertions(+), 63 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 5dfb7a59d5..91de82f0a7 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -24,72 +24,102 @@ from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-class BreakCanonicalizationTransformer(transformer.Base):
+# Tags for local state.
+BREAK_USED = 'break_used'
+CONTROL_VAR_NAME = 'control_var_name'
+
+
+class BreakStatementTransformer(transformer.Base):
   """Canonicalizes break statements into additional conditionals."""
 
-  def __init__(self, context):
-    super(BreakCanonicalizationTransformer, self).__init__(context)
-    # This is a stack structure, to correctly process nested loops.
-    # Each item is a list [break_used, break_variable_name]
-    self.break_uses = []
+  def _track_body(self, nodes, break_var):
+    self.enter_local_scope()
+    self.set_local(CONTROL_VAR_NAME, break_var)
+    nodes = self.visit_block(nodes)
+    break_used = self.get_local(BREAK_USED, False)
+    self.exit_local_scope()
+    return nodes, break_used
 
   def visit_Break(self, node):
-    self.break_uses[-1][0] = True
+    self.set_local(BREAK_USED, True)
+    var_name = self.get_local(CONTROL_VAR_NAME)
+    # TODO(mdan): This will fail when expanded inside a top-level else block.
     template = """
       var_name = True
       continue
     """
-    return templates.replace(template, var_name=self.break_uses[-1][1])
+    return templates.replace(template, var_name=var_name)
+
+  def _guard_if_present(self, block, var_name):
+    """Prevents the block from executing if var_name is set."""
+    if not block:
+      return block
+    template = """
+        if not var_name:
+          block
+      """
+    node = templates.replace(
+        template,
+        var_name=var_name,
+        block=block)
+    return node
 
   def visit_While(self, node):
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    break_var = self.context.namer.new_symbol('break_requested',
-                                              scope.referenced)
+    break_var = self.context.namer.new_symbol('break__', scope.referenced)
 
-    self.break_uses.append([False, break_var])
-    node = self.generic_visit(node)
-    if self.break_uses[-1][0]:
+    node.test = self.visit(node.test)
+    node.body, break_used = self._track_body(node.body, break_var)
+    # A break in the else clause applies to the containing scope.
+    node.orelse = self.visit_block(node.orelse)
+
+    if break_used:
       template = """
         var_name = False
-        while original_test and not var_name:
-          original_body
+        while test and not var_name:
+          body
         else:
-          original_orelse
+          orelse
       """
+      # Python's else clause only triggers if the loop exited cleanly (e.g.
+      # break did not trigger).
       node = templates.replace(
           template,
           var_name=break_var,
-          original_test=node.test,
-          original_body=node.body,
-          original_orelse=node.orelse)
-    self.break_uses.pop()
+          test=node.test,
+          body=node.body,
+          orelse=self._guard_if_present(node.orelse, break_var))
 
     return node
 
   def visit_For(self, node):
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    break_var = self.context.namer.new_symbol('break_requested',
-                                              scope.referenced)
+    break_var = self.context.namer.new_symbol('break__', scope.referenced)
+
+    node.target = self.visit(node.target)
+    node.iter = self.visit(node.iter)
+    node.body, break_used = self._track_body(node.body, break_var)
+    # A break in the else clause applies to the containing scope.
+    node.orelse = self.visit_block(node.orelse)
 
-    self.break_uses.append([False, break_var])
-    node = self.generic_visit(node)
-    if self.break_uses[-1][0]:
+    if break_used:
+      node.orelse = self._guard_if_present(node.orelse, break_var)
       template = """
         var_name = False
-        original_for
+        for_stmt
       """
+      # Python's else clause only triggers if the loop exited cleanly (e.g.
+      # break did not trigger).
       node = templates.replace(
           template,
           var_name=break_var,
-          original_for=node)
+          for_stmt=node)
       extra_cond = templates.replace_as_expression(
           'not var_name', var_name=break_var)
-      new_for_node = node[1]
-      anno.setanno(new_for_node, 'extra_cond', extra_cond)
-    self.break_uses.pop()
+      anno.setanno(node[1], 'extra_cond', extra_cond)
 
     return node
 
 
 def transform(node, context):
-  return BreakCanonicalizationTransformer(context).visit(node)
+  return BreakStatementTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/break_statements_test.py b/tensorflow/contrib/autograph/converters/break_statements_test.py
index dd4914a022..1af59e9b52 100644
--- a/tensorflow/contrib/autograph/converters/break_statements_test.py
+++ b/tensorflow/contrib/autograph/converters/break_statements_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.platform import test
 
 class BreakCanonicalizationTest(converter_test_base.TestCase):
 
-  def test_basic_break(self):
+  def test_basic_while(self):
 
     def test_fn(x):
       v = []
@@ -40,13 +40,11 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
     node = break_statements.transform(node, self.ctx)
 
     with self.compiled(node) as result:
-      self.assertEqual(test_fn(0), result.test_fn(0))
-      self.assertEqual(test_fn(1), result.test_fn(1))
-      self.assertEqual(test_fn(2), result.test_fn(2))
-      self.assertEqual(test_fn(3), result.test_fn(3))
-      self.assertEqual(test_fn(4), result.test_fn(4))
+      self.assertEqual([], result.test_fn(0))
+      self.assertEqual([], result.test_fn(1))
+      self.assertEqual([3], result.test_fn(4))
 
-  def test_basic_break_for_loop(self):
+  def test_basic_for(self):
 
     def test_fn(a):
       v = []
@@ -57,30 +55,18 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    # The break is incompletely canonicalized for for loops. Everything is
-    # in place except for the condition verification.
-    def test_equiv_fn(a):
-      v = []
-      for x in a:
-        x -= 1
-        if x % 2 == 0:
-          continue
-        v.append(x)
-      return v
-
     node = self.parse_and_analyze(test_fn, {})
     node = break_statements.transform(node, self.ctx)
 
     with self.compiled(node) as result:
-      # The break is incompletely canonicalized. Everything is in place, but
-      # the loop does not break.
-      self.assertEqual(test_equiv_fn([]), result.test_fn([]))
-      self.assertEqual(test_equiv_fn([1]), result.test_fn([1]))
-      self.assertEqual(test_equiv_fn([2]), result.test_fn([2]))
-      self.assertEqual(
-          test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4]))
+      # The break is incompletely canonicalized. The loop will not interrupt,
+      # but the section following the break will be skipped.
+      self.assertEqual([], result.test_fn([]))
+      self.assertEqual([3, 3], result.test_fn([4, 4]))
+      self.assertEqual([3], result.test_fn([4, 5]))
+      self.assertEqual([3], result.test_fn([5, 4]))
 
-  def test_continue_deeply_nested(self):
+  def test_deeply_nested(self):
 
     def test_fn(x):
       v = []
@@ -93,7 +79,7 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
             u.append(x)
           else:
             w.append(x)
-            continue
+            break
         v.append(x)
       return v, u, w
 
@@ -101,11 +87,60 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
     node = break_statements.transform(node, self.ctx)
 
     with self.compiled(node) as result:
-      self.assertEqual(test_fn(0), result.test_fn(0))
-      self.assertEqual(test_fn(1), result.test_fn(1))
-      self.assertEqual(test_fn(2), result.test_fn(2))
-      self.assertEqual(test_fn(3), result.test_fn(3))
-      self.assertEqual(test_fn(4), result.test_fn(4))
+      self.assertEqual(([], [], []), result.test_fn(0))
+      self.assertEqual(([2, 1], [2], [0]), result.test_fn(3))
+      self.assertEqual(([10, 9, 8, 7], [10, 8], [6]), result.test_fn(11))
+
+  def test_nested_loops(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      while x > 0:
+        x -= 1
+        y = x
+        while y > 0:
+          y -= 1
+          if y % 2 == 0:
+            break
+          u.append(y)
+        if x == 0:
+          break
+        v.append(x)
+      return v, u
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], []), result.test_fn(0))
+      self.assertEqual(([1], []), result.test_fn(2))
+      self.assertEqual(([2, 1], [1]), result.test_fn(3))
+      self.assertEqual(([4, 3, 2, 1], [3, 1]), result.test_fn(5))
+
+  def test_loop_else(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      while x > 0:
+        x -= 1
+        y = x
+        while y > 1:
+          break
+        else:
+          u.append(y)
+          break
+        v.append(x)
+      return v, u
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(([], []), result.test_fn(0))
+      self.assertEqual(([], [1]), result.test_fn(2))
+      self.assertEqual(([2], [1]), result.test_fn(3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index e102ab7630..4db6cc0adf 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -69,6 +69,10 @@ class Base(gast.NodeTransformer):
   def enclosing_entities(self):
     return tuple(self._enclosing_entities)
 
+  @property
+  def locel_scope_level(self):
+    return len(self._local_scope_state)
+
   def enter_local_scope(self):
     self._local_scope_state.append({})
 
-- 
GitLab


From 910b77c46ce58a36964e30a1590d8037013d0782 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 10:27:48 -0700
Subject: [PATCH 1025/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193378087
---
 tensorflow/core/ops/ops.pbtxt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 1659adc9fe..a36608ded3 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4021,6 +4021,10 @@ op {
     name: "tree_complexity"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "node_ids_list"
     type: DT_INT32
-- 
GitLab


From ce7a92a62a6bbf0765e68a3340fe3efb07ac1e2b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 10:28:47 -0700
Subject: [PATCH 1026/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 193378249

---
 tensorflow/go/op/wrappers.go | 3049 ++++++++++++++++++++++++++++++++--
 1 file changed, 2918 insertions(+), 131 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 1d5ebf6687..1d4b1399ed 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -43,7 +43,7 @@ type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
 
 // FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
 //
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// value: The bitwidth of the quantization; between 2 and 16, inclusive.
 // If not specified, defaults to 8
 func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
 	return func(m optionalAttr) {
@@ -124,7 +124,7 @@ func FakeQuantWithMinMaxVarsPerChannelNarrowRange(value bool) FakeQuantWithMinMa
 // `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 // when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 // then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
 // This operation has a gradient and thus allows for training `min` and `max`
 // values.
@@ -305,7 +305,7 @@ func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr
 // `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 // when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 // then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
 // Quantization is called fake since the output is still in floating point.
 func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
@@ -401,6 +401,9 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 //      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
 //      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 //
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
 // Arguments:
 //	indices: Index tensor.
 //	updates: Updates to scatter into output.
@@ -1845,6 +1848,93 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
+// UniqueWithCountsV2Attr is an optional argument to UniqueWithCountsV2.
+type UniqueWithCountsV2Attr func(optionalAttr)
+
+// UniqueWithCountsV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsV2OutIdx(value tf.DataType) UniqueWithCountsV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements along an axis of a tensor.
+//
+// This operation either returns a tensor `y` containing unique elements
+// along the `axis` of a tensor. The returned unique elements is sorted
+// in the same order as they occur along `axis` in `x`.
+// This operation also returns a tensor `idx` and a tensor `count`
+// that are the same size as the number of the elements in `x` along the
+// `axis` dimension. The `idx` contains the index in the unique output `y`
+// and the `count` contains the count in the unique output `y`.
+// In other words, for an `1-D` tensor `x` with `axis = None:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 0`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx, count = unique_with_counts(x, axis=0)
+// y ==> [[1, 0, 0],
+//        [2, 0, 0]]
+// idx ==> [0, 0, 1]
+// count ==> [2, 1]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 1`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx, count = unique_with_counts(x, axis=1)
+// y ==> [[1, 0],
+//        [1, 0],
+//        [2, 0]]
+// idx ==> [0, 1, 1]
+// count ==> [1, 2]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int32` (default: None). The axis of the Tensor to
+// find the unique elements.
+//
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.A 1-D Tensor. The count of each value of x in the output y.
+func UniqueWithCountsV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueWithCountsV2Attr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCountsV2",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
 type UniqueWithCountsAttr func(optionalAttr)
 
@@ -1910,12 +2000,15 @@ func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
+// Finds unique elements along an axis of a tensor.
 //
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
+// This operation either returns a tensor `y` containing unique elements
+// along the `axis` of a tensor. The returned unique elements is sorted
+// in the same order as they occur along `axis` in `x`.
+// This operation also returns a tensor `idx` that is the same size as
+// the number of the elements in `x` along the `axis` dimension. It
+// contains the index in the unique output `y`.
+// In other words, for an `1-D` tensor `x` with `axis = None:
 //
 // `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
@@ -1928,9 +2021,34 @@ func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 // idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
 // ```
 //
+// For an `2-D` tensor `x` with `axis = 0`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx = unique(x, axis=0)
+// y ==> [[1, 0, 0],
+//        [2, 0, 0]]
+// idx ==> [0, 0, 1]
+// ```
+//
+// For an `2-D` tensor `x` with `axis = 1`:
+//
+// ```
+// # tensor 'x' is [[1, 0, 0],
+// #                [1, 0, 0],
+// #                [2, 0, 0]]
+// y, idx = unique(x, axis=1)
+// y ==> [[1, 0],
+//        [1, 0],
+//        [2, 0]]
+// idx ==> [0, 1, 1]
+// ```
+//
 // Arguments:
 //	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+//	axis: A `Tensor` of type `int32` (default: None). The axis of the Tensor to
 // find the unique elements.
 //
 // Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
@@ -2217,6 +2335,35 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// Clips tensor values to a specified min and max.
+//
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
+//
+// Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
+//
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ClipByValue",
+		Input: []tf.Input{
+			t, clip_value_min, clip_value_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a sequence of numbers.
 //
 // This operation creates a sequence of numbers that begins at `start` and
@@ -2277,7 +2424,7 @@ func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, seg
 
 // Computes the mean along sparse segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
@@ -2332,7 +2479,7 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 // Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // For example:
@@ -2507,7 +2654,7 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 
 // Computes the sum along sparse segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
@@ -2572,6 +2719,44 @@ func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the minimum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the minimum such that:
+//
+// \\(output_i = \min_j data_j\\) where min is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the minimum is empty for a given segment ID `i`, it outputs the largest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::max()`.
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentMin",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes rectified linear 6: `min(max(features, 0), 6)`.
 func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
@@ -2589,7 +2774,7 @@ func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 
 // Computes the sum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -2920,6 +3105,32 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 	return op.Output(0)
 }
 
+// Creates a dataset that passes a sliding window over `input_dataset`.
+//
+// Arguments:
+//
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	stride: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be in `[1, window_size)`.
+//
+//
+func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SlideDataset",
+		Input: []tf.Input{
+			input_dataset, window_size, stride,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
 // N is the size of the segment being reduced.
@@ -2927,7 +3138,7 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 // Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Arguments:
@@ -3233,20 +3444,21 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
+// Computes the maximum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
 //
 // \\(output_i = \max_j data_j\\) where max is over `j` such
 // that `segment_ids[j] == i`.
 //
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 // <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
@@ -3656,7 +3868,7 @@ func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output)
 // Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Arguments:
@@ -3758,9 +3970,8 @@ type ResizeBicubicAttr func(optionalAttr)
 
 // ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
@@ -4171,6 +4382,26 @@ func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Checks whether a tree ensemble has been initialized.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resouce.
+//
+// Returns output boolean on whether it is initialized or not.
+func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsBoostedTreesEnsembleInitialized",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Cast x of type SrcT to y of DstT.
 func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
@@ -4845,6 +5076,23 @@ func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backpr
 	return op.Output(0)
 }
 
+// Provides the time since epoch in seconds.
+//
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Timestamp",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // BatchMatMulAttr is an optional argument to BatchMatMul.
 type BatchMatMulAttr func(optionalAttr)
 
@@ -5315,6 +5563,51 @@ func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax
 	return op.Output(0)
 }
 
+// MutexV2Attr is an optional argument to MutexV2.
+type MutexV2Attr func(optionalAttr)
+
+// MutexV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this variable is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutexV2Container(value string) MutexV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutexV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this variable is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func MutexV2SharedName(value string) MutexV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a Mutex resource that can be locked by `MutexLock`.
+//
+// Returns The mutex resource.
+func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -7069,6 +7362,44 @@ func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastSend",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Makes a copy of `x`.
+//
+// Arguments:
+//	x: The source tensor of type `T`.
+//
+// Returns     y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+//       is not an alias of `x`.
+func DeepCopy(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeepCopy",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
 // If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
@@ -7342,6 +7673,46 @@ func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Updates the tree ensemble by either adding a layer to the last tree being grown
+//
+// or by starting a new tree.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesUpdateEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
 type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
@@ -7419,7 +7790,7 @@ func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 //
 // N is the size of the segment being reduced.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Arguments:
@@ -7652,24 +8023,65 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// Divides sparse updates into the variable referenced by `resource`.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a normal distribution.
+// This operation computes
 //
-// The generated values will have mean 0 and standard deviation 1.
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterDiv",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
 //	shape: The shape of the output tensor.
@@ -7695,6 +8107,47 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMin",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Reshapes a quantized tensor as per the Reshape op.
 //
 // ```
@@ -7904,9 +8357,8 @@ type ResizeBilinearAttr func(optionalAttr)
 
 // ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
@@ -7959,6 +8411,26 @@ func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a TensorList which, when stacked, has the value of `tensor`.
+//
+// Each tensor in the result list corresponds to one row of the input tensor.
+//
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListFromTensor",
+		Input: []tf.Input{
+			tensor, element_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
 type GenerateVocabRemappingAttr func(optionalAttr)
 
@@ -8066,6 +8538,30 @@ func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// Creates and returns an empty tensor list.
+//
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
+//
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "EmptyTensorList",
+		Input: []tf.Input{
+			element_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPoolGradAttr is an optional argument to AvgPoolGrad.
 type AvgPoolGradAttr func(optionalAttr)
 
@@ -8547,6 +9043,49 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
+// RegexReplaceAttr is an optional argument to RegexReplace.
+type RegexReplaceAttr func(optionalAttr)
+
+// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+//
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces the match of pattern in input with rewrite.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expresion.
+//
+// Returns The text after applying pattern and rewrite.
+func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RegexReplace",
+		Input: []tf.Input{
+			input, pattern, rewrite,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes numerical negative value element-wise.
 //
 // I.e., \\(y = -x\\).
@@ -8745,7 +9284,7 @@ func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr
 // `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
 // when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
 // then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
 // This operation has a gradient and thus allows for training `min` and `max`
 // values.
@@ -9039,9 +9578,70 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 	return scope.AddOperation(opspec)
 }
 
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
+//
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
+//
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
+// Arguments:
+//	mutex: The mutex resource to lock.
+//
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexLock",
+		Input: []tf.Input{
+			mutex,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the mean along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -9453,9 +10053,8 @@ type ResizeAreaAttr func(optionalAttr)
 
 // ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
@@ -9467,6 +10066,11 @@ func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 //
 // Input images can be of different types but output images are always float.
 //
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
 // Each output pixel is computed by first transforming the pixel's footprint into
 // the input tensor and then averaging the pixels that intersect the footprint. An
 // input pixel's contribution to the average is weighted by the fraction of its
@@ -10471,6 +11075,50 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
+// Mutually reduces multiple tensors of identical type and shape.
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	opspec := tf.OpSpec{
+		Type: "CollectiveReduce",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Returns x / y element-wise for integer types.
 //
 // Truncation designates that negative numbers will round fractional quantities
@@ -10571,7 +11219,7 @@ func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_
 
 // Computes the maximum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -10620,6 +11268,21 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Decode web-safe base64-encoded strings.
 //
 // Input may or may not have padding at the end. See EncodeBase64 for padding.
@@ -11452,6 +12115,35 @@ func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0), op.Output(1)
 }
 
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
+//
+// Arguments:
+//
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
 type MatrixSolveLsAttr func(optionalAttr)
 
@@ -11484,14 +12176,14 @@ func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 // If `fast` is `True`, then the solution is computed by solving the normal
 // equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
 // \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
 // \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
 // minimum-norm solution to the under-determined linear system, i.e.
 // \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
 // subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
 // when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
 // sufficiently large.
 //
 // If `fast` is `False` an algorithm based on the numerically robust complete
@@ -11739,6 +12431,47 @@ func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtyp
 	return op.Output(0)
 }
 
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Inverse 2D fast Fourier transform.
 //
 // Computes the inverse 2-dimensional discrete Fourier transform over the
@@ -12337,9 +13070,8 @@ type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
 // FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
@@ -13065,6 +13797,117 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu
 	return scope.AddOperation(opspec)
 }
 
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
+
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNN.
+//
+// Compute the backprop of both data and weights in a RNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: a 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: a 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: the same shape has input_h.
+// output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackprop",
+		Input: []tf.Input{
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
 // FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
 type FractionalMaxPoolGradAttr func(optionalAttr)
 
@@ -13163,6 +14006,107 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 	return scope.AddOperation(opspec)
 }
 
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
+
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Converts CudnnRNN params from canonical form to usable form.
+//
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
+//
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNCanonicalToParams",
+		Input: []tf.Input{
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
 type SparseReduceMaxSparseAttr func(optionalAttr)
 
@@ -13357,6 +14301,47 @@ func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	return op.Output(0)
 }
 
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Computes sigmoid of `x` element-wise.
 //
 // Specifically, `y = 1 / (1 + exp(-x))`.
@@ -13374,6 +14359,30 @@ func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+//     Updates specified rows with values in `v`.
+//
+//     Computes `x[i, :] = v; return x`.
+//
+// Arguments:
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceUpdate",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FusedBatchNormAttr is an optional argument to FusedBatchNorm.
 type FusedBatchNormAttr func(optionalAttr)
 
@@ -13584,6 +14593,43 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
+// BoostedTreesEnsembleResourceHandleOpAttr is an optional argument to BoostedTreesEnsembleResourceHandleOp.
+type BoostedTreesEnsembleResourceHandleOpAttr func(optionalAttr)
+
+// BoostedTreesEnsembleResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpContainer(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BoostedTreesEnsembleResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesEnsembleResourceHandleOpSharedName(value string) BoostedTreesEnsembleResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a BoostedTreesEnsembleResource
+func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTreesEnsembleResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesEnsembleResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates tensors along one dimension.
 //
 // Arguments:
@@ -13717,6 +14763,30 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
+// Returns the last element of the input list as well as a list with all but that element.
+//
+// Fails if the list is empty.
+//
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListPopBack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Returns element-wise integer closest to x.
 //
 // If the result is midway between two representable values,
@@ -14471,6 +15541,26 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
+// Serializes the tree ensemble to a proto.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.Serialized proto of the ensemble.
+func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesSerializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // StageSizeAttr is an optional argument to StageSize.
 type StageSizeAttr func(optionalAttr)
 
@@ -14612,7 +15702,7 @@ func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_
 
 // Computes the sum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -14668,6 +15758,99 @@ func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
+
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
+//
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "S": S}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsSize",
+		Input: []tf.Input{
+			num_layers, num_units, input_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes gradients for SparseSegmentMean.
 //
 // Returns tensor "output" with same shape as grad, except for dimension 0 whose
@@ -14696,6 +15879,7 @@ func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segm
 //
 // Note that this routine only supports wildcard characters in the
 // basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
 // Arguments:
 //	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
@@ -15116,18 +16300,58 @@ func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
 // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
 // Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
@@ -16505,11 +17729,8 @@ func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Out
 
 // Subtracts a value from the current value of a variable.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
 //	resource: handle to the resource in which to store the variable.
@@ -16594,9 +17815,8 @@ type QuantizedResizeBilinearAttr func(optionalAttr)
 
 // QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
@@ -16638,7 +17858,7 @@ func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min
 
 // Computes the minimum along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -16677,8 +17897,8 @@ type SdcaOptimizerAttr func(optionalAttr)
 
 // SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
 func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
 		m["adaptative"] = value
@@ -17248,7 +18468,7 @@ func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataTy
 // Duplicate entries are handled correctly: if multiple `indices` reference
 // the same location, their contributions add.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 // <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
@@ -17526,6 +18746,43 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
+// Computes the product along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentProd",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // RandomUniformIntAttr is an optional argument to RandomUniformInt.
 type RandomUniformIntAttr func(optionalAttr)
 
@@ -18552,6 +19809,57 @@ func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, inp
 	return op.Output(0), op.Output(1)
 }
 
+// Rolls the elements of a tensor along an axis.
+//
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
+//
+// For example:
+//
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+//
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
+//
+// Arguments:
+//
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
+//
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Roll",
+		Input: []tf.Input{
+			input, shift, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MapPeekAttr is an optional argument to MapPeek.
 type MapPeekAttr func(optionalAttr)
 
@@ -18690,6 +19998,68 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
+// Calculates gains for each feature and returns the best possible split information for the feature.
+//
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+//
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+//
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+//
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Input: []tf.Input{
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
+}
+
 // EncodePngAttr is an optional argument to EncodePng.
 type EncodePngAttr func(optionalAttr)
 
@@ -18804,9 +20174,8 @@ type ResizeBilinearGradAttr func(optionalAttr)
 
 // ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
 func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
@@ -19469,6 +20838,47 @@ func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMax",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Outputs a `Summary` protocol buffer with scalar values.
 //
 // The input `tags` and `values` must have the same shape.  The generated summary
@@ -19853,6 +21263,88 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+//
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListPushBack",
+		Input: []tf.Input{
+			input_handle, tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of tensors in the input tensor list.
+//
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListLength",
+		Input: []tf.Input{
+			input_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// The shape of the elements of the given list, as a tensor.
+//
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
+	opspec := tf.OpSpec{
+		Type: "TensorListElementShape",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the item in the list with the given index.
+//
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
+//
+//
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListGetItem",
+		Input: []tf.Input{
+			input_handle, index,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the matrix exponential of one or more square matrices:
 //
 // exp(A) = \sum_{n=0}^\infty A^n/n!
@@ -19888,6 +21380,46 @@ func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the matrix logarithm of one or more square matrices:
+//
+//
+// log(exp(A)) = A
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixLogarithm",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
 type QueueDequeueUpToV2Attr func(optionalAttr)
 
@@ -20067,7 +21599,8 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 // SelfAdjointEig.
 //
 // The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
 //
 // Arguments:
 //	input: Shape is `[..., M, M]`.
@@ -20125,7 +21658,8 @@ func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 // Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
 // Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
 //
 // ```python
 // # a is a tensor.
@@ -20308,7 +21842,7 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 
 // Computes the product along segments of a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 // segments.
 //
 // Computes a tensor such that
@@ -21182,29 +22716,53 @@ func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Out
 	return op.Output(0), op.Output(1)
 }
 
-// Restore a Reader to its initial clean state.
+//     Adds v into specified rows of x.
+//
+//     Computes y = x; y[i, :] += v; return y.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "InplaceAdd",
 		Input: []tf.Input{
-			reader_handle,
+			x, i, v,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// Restore a Reader to its initial clean state.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var, m, and v tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
@@ -21290,9 +22848,8 @@ type ResizeBicubicGradAttr func(optionalAttr)
 
 // ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
 func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
@@ -21334,9 +22891,8 @@ type ResizeNearestNeighborAttr func(optionalAttr)
 
 // ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
 func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
@@ -21377,9 +22933,8 @@ type ResizeNearestNeighborGradAttr func(optionalAttr)
 
 // ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
 func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
@@ -21778,6 +23333,58 @@ func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Gets the next output from the given iterator.
+//
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNextSync",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
+}
+
 // SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
 type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
@@ -22326,6 +23933,83 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// The op serializes protobuf messages provided in the input tensors.
+//
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
+//
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
+//
+// Arguments:
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
+//
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeProto",
+		Input: []tf.Input{
+			sizes, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a TensorArray for storing the gradients of values in the given handle.
 //
 // If the given TensorArray gradient already exists, returns a reference to it.
@@ -22386,6 +24070,132 @@ func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0), op.Output(1)
 }
 
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
+
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
+//
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
+//
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["message_format"] = value
+	}
+}
+
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+//
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
+	}
+}
+
+// The op extracts fields from a serialized protocol buffers message into tensors.
+//
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `format` attribute.
+//
+// Arguments:
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names.
+//	output_types: List of TF types to use for the respective field in field_names.
+//
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeProtoV2",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	sizes = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
+		return
+	}
+	return sizes, values
+}
+
 // Creates a dataset that splits a SparseTensor into elements row-wise.
 func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
@@ -22440,11 +24250,8 @@ func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset t
 
 // Adds a value to the current value of a variable.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
 //
 // Arguments:
 //	resource: handle to the resource in which to store the variable.
@@ -23107,6 +24914,35 @@ func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Op
 	return scope.AddOperation(opspec)
 }
 
+// Makes the summary of accumulated stats for the batch.
+//
+// The summary stats contains gradients and hessians accumulated into the corresponding node and bucket for each example.
+//
+// Arguments:
+//	node_ids: int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
+//	gradients: float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+//	hessians: float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+//	bucketized_features_list: int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+//	max_splits: int; the maximum number of splits possible in the whole tree.
+//	num_buckets: int; equals to the maximum possible value of bucketized feature.
+//
+// Returns output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
+func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf.Output, hessians tf.Output, bucketized_features_list []tf.Output, max_splits int64, num_buckets int64) (stats_summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"max_splits": max_splits, "num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesMakeStatsSummary",
+		Input: []tf.Input{
+			node_ids, gradients, hessians, tf.OutputList(bucketized_features_list),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Adjust the contrast of one or more images.
 //
 // `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
@@ -23331,6 +25167,10 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 // <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 // </div>
 //
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
 // Arguments:
 //	params: The tensor from which to gather values. Must be at least rank
 // `axis + 1`.
@@ -23827,6 +25667,28 @@ func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// List of the given size with empty elements.
+//
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListReserve",
+		Input: []tf.Input{
+			element_shape, num_elements,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
 type PriorityQueueV2Attr func(optionalAttr)
 
@@ -24366,6 +26228,125 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // OrderedMapStageAttr is an optional argument to OrderedMapStage.
 type OrderedMapStageAttr func(optionalAttr)
 
@@ -24604,30 +26585,140 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 	return op.Output(0)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-//
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
-//
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
-//
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
-// ```
-//
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
+
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// input: a 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: a 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: a 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: a 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: the same shape has input_h.
+// output_c: the same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: an opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNN",
+		Input: []tf.Input{
+			input, input_h, input_c, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+//
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
 // a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
@@ -24697,6 +26788,47 @@ func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, val
 	return op.Output(0)
 }
 
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
+
+// EmptyInit sets the optional init attribute to value.
+//
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
+	return func(m optionalAttr) {
+		m["init"] = value
+	}
+}
+
+// Creates a tensor with the given shape.
+//
+// This operation creates a tensor of `shape` and `dtype`.
+//
+// Arguments:
+//	shape: 1-D. Represents the shape of the output tensor.
+//
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Empty",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
 type TensorArrayConcatV3Attr func(optionalAttr)
 
@@ -24814,6 +26946,27 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 	return op.Output(0)
 }
 
+// Sets the index-th position of the list to contain the given tensor.
+//
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
+//
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSetItem",
+		Input: []tf.Input{
+			input_handle, index, item,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns a diagonal tensor with a given diagonal values.
 //
 // Given a `diagonal`, this operation returns a tensor with the `diagonal` and
@@ -25358,6 +27511,27 @@ func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, val
 	return op.Output(0)
 }
 
+// Creates a tree ensemble model and returns a handle to it.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble resource to be created.
+//	stamp_token: Token to use as the initial value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the tree ensemble.
+//
+// Returns the created operation.
+func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesCreateEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Applies sparse addition to `input` using individual values or slices
 //
 // from `updates` according to indices `indices`.  The updates are non-aliasing:
@@ -26238,6 +28412,120 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 	return output
 }
 
+// CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical.
+type CudnnRNNParamsToCanonicalAttr func(optionalAttr)
+
+// CudnnRNNParamsToCanonicalRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsToCanonicalRnnMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsToCanonicalInputMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsToCanonicalDirection(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalDropout(value float32) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed2(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Retrieves CudnnRNN params in canonical form.
+//
+// Retrieves a set of weights from the opaque params buffer that can be saved and
+// restored in a way compatible with future runs.
+//
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_params": num_params}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsToCanonical",
+		Input: []tf.Input{
+			num_layers, num_units, input_size, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	return weights, biases
+}
+
 // UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
 type UniformCandidateSamplerAttr func(optionalAttr)
 
@@ -26480,6 +28768,128 @@ func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	return op.Output(0)
 }
 
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TryRpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // EnterAttr is an optional argument to Enter.
 type EnterAttr func(optionalAttr)
 
@@ -26915,6 +29325,64 @@ func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.
 	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
+// Deserializes a serialized tree ensemble config and replaces current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesDeserializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the update to cached logits. It is designed to be used during training.
+// It traverses the trees starting from cached tree id and cached node id and
+// calculates the updates to be pushed to the cache.
+//
+// Arguments:
+//
+//	cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting
+// tree of prediction.
+//	cached_node_ids: Rank 1 Tensor containing cached node id which is the starting
+// node of prediction.
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Rank 2 Tensor containing logits update (with respect to cached
+// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids.
+func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesTrainingPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Elementwise computes the bitwise AND of `x` and `y`.
 //
 // The result will have those bits set, that are set in both `x` and `y`. The
@@ -26951,6 +29419,44 @@ func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
+	return func(m optionalAttr) {
+		m["num_elements"] = value
+	}
+}
+
+// Stacks all tensors in the list.
+//
+// Requires that all tensors have the same shape.
+//
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListStack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Elementwise computes the bitwise right-shift of `x` and `y`.
 //
 // Performs a logical shift for unsigned integer types, and an arithmetic shift
@@ -27000,6 +29506,175 @@ func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Outpu
 	return op.Output(0)
 }
 
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
+
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["max_enqueued_batches"] = value
+	}
+}
+
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["allowed_batch_sizes"] = value
+	}
+}
+
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
+//
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
+//
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
+//
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
+//
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Batch",
+		Input: []tf.Input{
+			tf.OutputList(in_tensors),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
+}
+
+// UnbatchAttr is an optional argument to Unbatch.
+type UnbatchAttr func(optionalAttr)
+
+// UnbatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchContainer(value string) UnbatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchSharedName(value string) UnbatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Reverses the operation of Batch for a single output Tensor.
+//
+// An instance of Unbatch either receives an empty batched_tensor, in which case it
+// asynchronously waits until the values become available from a concurrently
+// running instance of Unbatch with the same container and shared_name, or receives
+// a non-empty batched_tensor in which case it finalizes all other concurrently
+// running instances and outputs its own element from the batch.
+//
+// batched_tensor: The possibly transformed output of Batch. The size of the first
+//  dimension should remain unchanged by the transformations for the operation to
+//  work.
+// batch_index: The matching batch_index obtained from Batch.
+// id: The id scalar emitted by Batch.
+// unbatched_tensor: The Tensor corresponding to this execution.
+// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+//  batched input tensor associated with a given invocation of the op.
+// container: Container to control resource sharing.
+// shared_name: Instances of Unbatch with the same container and shared_name are
+//  assumed to possibly belong to the same batch. If left empty, the op name will
+//  be used as the shared name.
+func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"timeout_micros": timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Unbatch",
+		Input: []tf.Input{
+			batched_tensor, batch_index, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
 type AvgPool3DGradAttr func(optionalAttr)
 
@@ -27212,6 +29887,60 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
+
+// UnbatchGradContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradContainer(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Gradient of Unbatch.
+//
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
+//
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnbatchGrad",
+		Input: []tf.Input{
+			original_input, batch_index, grad, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // DecodeWavAttr is an optional argument to DecodeWav.
 type DecodeWavAttr func(optionalAttr)
 
@@ -27317,6 +30046,60 @@ func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf
 	return op.Output(0)
 }
 
+//     Subtracts `v` into specified rows of `x`.
+//
+//     Computes y = x; y[i, :] -= v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceSub",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts a flat index or array of flat indices into a tuple of
+//
+// coordinate arrays.
+//
+// @compatibility(numpy)
+// Equivalent to np.unravel_index
+// @end_compatibility
+//
+// Arguments:
+//	indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
+// flattened version of an array of dimensions dims.
+//	dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
+// indices.
+//
+// Returns An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+// same shape as the indices array.
+func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnravelIndex",
+		Input: []tf.Input{
+			indices, dims,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
 // The lower regularized incomplete Gamma function is defined as:
@@ -27799,6 +30582,10 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 //
 //     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 //
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
 // Some examples below.
 //
 // Simple indexing into a matrix:
-- 
GitLab


From 7ffbedee2d78fd9dc8e6d072858b0fada0d98a3e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:17:06 +0000
Subject: [PATCH 1027/1262] Add uint16 support for py_func

In tf most of the numeric data types are supported though uint16 support
is not:
```
$ python
>>> import tensorflow as tf
>>> def sum_func(x, y):
...   return x + y
...
>>> x = tf.constant(1, dtype=tf.uint16)
>>> y = tf.constant(2, dtype=tf.uint16)
>>> z = tf.py_func(sum_func, [x, y], tf.uint16)
>>> tf.Session().run(z)
...
...
tensorflow.python.framework.errors_impl.UnimplementedError: Unsupported numpy type 4
	 [[Node: PyFunc = PyFunc[Tin=[DT_UINT16, DT_UINT16], Tout=[DT_UINT16], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](Const, Const_1)]]
...
```

The reason is that there is no conversion between numpy uint16 and tf.uint16.

This fix adds the support so that py_func could process tf.uint16 data types.

This fix also adds test cases for different data types with py_func to
increase the test coverage.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/lib/core/py_func.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 22317a348c..8c6bb7955a 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -126,6 +126,9 @@ Status NumericNpDTypeToTfDType(const int np, DataType* tf) {
     case NPY_INT8:
       *tf = DT_INT8;
       break;
+    case NPY_UINT16:
+      *tf = DT_UINT16;
+      break;
     case NPY_INT16:
       *tf = DT_INT16;
       break;
-- 
GitLab


From 493f297d4a95add8242dfd1321ff8eb1d551db16 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:21:36 +0000
Subject: [PATCH 1028/1262] Add test cases for real data types with py_func.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 5b508b7c0e..bea997098d 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -52,6 +52,16 @@ class PyFuncTest(test.TestCase):
   """Encapsulates tests for py_func and eager_py_func."""
 
   # ----- Tests for py_func -----
+  def testRealDataTypes(self):
+    def sum_func(x, y):
+      return x + y
+    for dtype in [np.float16, np.float32, np.float64, np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64]:
+      with self.test_session():
+        x = constant_op.constant(1, dtype=dtype)
+        y = constant_op.constant(2, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
+        self.assertEqual(z, dtype(3))
+
   def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
-- 
GitLab


From b1165a83ec6ed3beb5076b67631f5c3739b6a068 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:22:53 +0000
Subject: [PATCH 1029/1262] Fix line too long issue with pylint

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index bea997098d..9d8761fdb9 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -55,7 +55,8 @@ class PyFuncTest(test.TestCase):
   def testRealDataTypes(self):
     def sum_func(x, y):
       return x + y
-    for dtype in [np.float16, np.float32, np.float64, np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64]:
+    for dtype in [np.float16, np.float32, np.float64,
+                  np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64]:
       with self.test_session():
         x = constant_op.constant(1, dtype=dtype)
         y = constant_op.constant(2, dtype=dtype)
-- 
GitLab


From 090794d6b71ff20c4d365015f604aba9b8acf8d6 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:24:30 +0000
Subject: [PATCH 1030/1262] Add test cases for complex (complex64/complex128)
 type with py_func

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 9d8761fdb9..5280b80c6c 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -62,6 +62,15 @@ class PyFuncTest(test.TestCase):
         y = constant_op.constant(2, dtype=dtype)
         z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
         self.assertEqual(z, dtype(3))
+  def testComplexDataTypes(self):
+    def sum_func(x, y):
+      return x + y
+    for dtype in [np.complex64, np.complex128]:
+      with self.test_session():
+        x = constant_op.constant(1 + 1j, dtype=dtype)
+        y = constant_op.constant(2 + 2j, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
+        self.assertEqual(z, dtype(3 + 3j))
 
   def testSingleType(self):
     with self.test_session():
-- 
GitLab


From 6e1b1d244451bea06de7253ba80166d90e483ea6 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:25:32 +0000
Subject: [PATCH 1031/1262] Pylint fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 5280b80c6c..e0eeee1b5b 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -62,6 +62,7 @@ class PyFuncTest(test.TestCase):
         y = constant_op.constant(2, dtype=dtype)
         z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
         self.assertEqual(z, dtype(3))
+
   def testComplexDataTypes(self):
     def sum_func(x, y):
       return x + y
-- 
GitLab


From 6919f6e311b9b8b53675824567adf5fd22de40ac Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:27:51 +0000
Subject: [PATCH 1032/1262] Update complex test case

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index e0eeee1b5b..fd71f51151 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -64,14 +64,14 @@ class PyFuncTest(test.TestCase):
         self.assertEqual(z, dtype(3))
 
   def testComplexDataTypes(self):
-    def sum_func(x, y):
-      return x + y
+    def sub_func(x, y):
+      return x - y
     for dtype in [np.complex64, np.complex128]:
       with self.test_session():
         x = constant_op.constant(1 + 1j, dtype=dtype)
-        y = constant_op.constant(2 + 2j, dtype=dtype)
-        z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
-        self.assertEqual(z, dtype(3 + 3j))
+        y = constant_op.constant(2 - 2j, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(sub_func, [x, y], dtype))
+        self.assertEqual(z, dtype(-1 + 3j))
 
   def testSingleType(self):
     with self.test_session():
-- 
GitLab


From 1936fb5e018952d77c5b6e90ec75575b1a6918d5 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:28:56 +0000
Subject: [PATCH 1033/1262] Add test case for py_func with bool types

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index fd71f51151..54ab5ab1f0 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -73,6 +73,16 @@ class PyFuncTest(test.TestCase):
         z = self.evaluate(script_ops.py_func(sub_func, [x, y], dtype))
         self.assertEqual(z, dtype(-1 + 3j))
 
+  def testBoolDataTypes(self):
+    def and_func(x, y):
+      return x and y
+    for dtype in [np.bool]:
+      with self.test_session():
+        x = constant_op.constant(True, dtype=dtype)
+        y = constant_op.constant(False, dtype=dtype)
+        z = self.evaluate(script_ops.py_func(and_func, [x, y], dtype))
+        self.assertEqual(z, dtype(False))
+
   def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
-- 
GitLab


From 4eec00cd4b8b8a3a46322dd044095829c11f1224 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:32:37 +0000
Subject: [PATCH 1034/1262] Remove unneeded for loop for bool data types in
 tests

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 54ab5ab1f0..7a178617dd 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -76,12 +76,12 @@ class PyFuncTest(test.TestCase):
   def testBoolDataTypes(self):
     def and_func(x, y):
       return x and y
-    for dtype in [np.bool]:
-      with self.test_session():
-        x = constant_op.constant(True, dtype=dtype)
-        y = constant_op.constant(False, dtype=dtype)
-        z = self.evaluate(script_ops.py_func(and_func, [x, y], dtype))
-        self.assertEqual(z, dtype(False))
+    dtype = dtypes.bool
+    with self.test_session():
+      x = constant_op.constant(True, dtype=dtype)
+      y = constant_op.constant(False, dtype=dtype)
+      z = self.evaluate(script_ops.py_func(and_func, [x, y], dtype))
+      self.assertEqual(z, False)
 
   def testSingleType(self):
     with self.test_session():
-- 
GitLab


From 8ade898582f79af900853e5b3336af08846ddd62 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 11 Apr 2018 22:20:09 +0000
Subject: [PATCH 1035/1262] Replace raw_input/input with six.moves.input

This fix is an enhancement to replace raw_input/input
in python 2 and 3 with six.moves.input.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/debug/wrappers/grpc_wrapper.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index fb9494f576..eafe8c22fe 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import signal
 import sys
 import traceback
+import six
 
 # Google-internal import(s).
 from tensorflow.python.debug.lib import common
@@ -140,11 +141,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
 
 def _signal_handler(unused_signal, unused_frame):
-  try:
-    input_func = raw_input
-  except NameError:
-    # Python 3 does not have raw_input.
-    input_func = input
+  input_func = six.moves.input
 
   while True:
     response = input_func("\nSIGINT received. Quit program? (Y/n): ").strip()
-- 
GitLab


From 394da026da99a69e2adc6a45b25fd3e153af3814 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:42:02 +0000
Subject: [PATCH 1036/1262] Pylint fix for the import

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/debug/wrappers/grpc_wrapper.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index eafe8c22fe..94acdfd11b 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import signal
 import sys
 import traceback
+
 import six
 
 # Google-internal import(s).
-- 
GitLab


From 946584497b34f443c158f82374b86bc404e44458 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:43:24 +0000
Subject: [PATCH 1037/1262] Remove unneeded assignment

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/debug/wrappers/grpc_wrapper.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index 94acdfd11b..00015606c9 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -142,10 +142,8 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
 
 def _signal_handler(unused_signal, unused_frame):
-  input_func = six.moves.input
-
   while True:
-    response = input_func("\nSIGINT received. Quit program? (Y/n): ").strip()
+    response = six.moves.input("\nSIGINT received. Quit program? (Y/n): ").strip()
     if response in ("", "Y", "y"):
       sys.exit(0)
     elif response in ("N", "n"):
-- 
GitLab


From 48589205460a876a9ac783bd9b7fc3af99f8defb Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 10:58:56 -0700
Subject: [PATCH 1038/1262] Fix issue where git_tag_override would fail if "-"
 in tag name.

---
 tensorflow/tools/git/gen_git_source.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index db2580755b..7f0f325119 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,14 +164,18 @@ def get_git_version(git_base_path, git_tag_override):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
-    if git_tag_override:
+    if git_tag_override and val:
       split_val = val.split("-")
-      if len(split_val) != 3:
+      if len(split_val) < 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
              "but got '%s'") % val)
-      split_val[0] = git_tag_override
-      val = bytes("-".join(split_val))
+      # There might be "-" in the tag name. But we can be sure that the final
+      # two "-" are those inserted by the git describe command.
+      commits_ahead_of_tag = split_val[-2]
+      abbrev_commit = split_val[-1]
+      val = bytes(
+          "-".join([git_tag_override, commits_ahead_of_tag, abbrev_commit]))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
-- 
GitLab


From 9680377f7385cf5a3a73dc4d8b68d14a99afabe9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:44:30 +0000
Subject: [PATCH 1039/1262] Fix `Line too long (82/80) (line-too-long)` issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/debug/wrappers/grpc_wrapper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index 00015606c9..1f9c8fa5a9 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -143,7 +143,8 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
 def _signal_handler(unused_signal, unused_frame):
   while True:
-    response = six.moves.input("\nSIGINT received. Quit program? (Y/n): ").strip()
+    response = six.moves.input(
+        "\nSIGINT received. Quit program? (Y/n): ").strip()
     if response in ("", "Y", "y"):
       sys.exit(0)
     elif response in ("N", "n"):
-- 
GitLab


From 5994156438a8d863dab04161589b34a3d0eb01d6 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 11:26:40 -0700
Subject: [PATCH 1040/1262] Fix gen_git_version script not being able to find
 git binary.

This error is happening on our Window's release builds. Making sure
we add git binary to the PATH for Bazel.
---
 tensorflow/tools/ci_build/windows/bazel/common_env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 7d4cc7ac30..0e6c0227b7 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -44,6 +44,8 @@ export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/${PYTHON_BASE_PATH}:$PATH"
+# Add git into PATH needed for gen_git_source.py
+export PATH="/c/Program Files/Git/cmd:$PATH"
 
 # Make sure we have pip in PATH
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
-- 
GitLab


From 31f925c7783fb8fa58278b31585dcf7bdb4cfd8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 11:46:07 -0700
Subject: [PATCH 1041/1262] Change operands of subtraction expression to have
 well-defined behaviour.

At present, signed arithmetic overflows (i.e. has undefined behaviour) in general, e.g. when computing 0 - INT_MIN or INT_MAX - INT_MIN. The fact that we want the result in the unsigned type does not help us here.

The fix is to convert the operands to the corresponding unsigned type first and then perform the operation in unsigned arithmetic, which is well-defined and has the correct subtraction behaviour.

PiperOrigin-RevId: 193391813
---
 tensorflow/core/lib/random/random_distributions.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index ad16dbf01f..4cf3a999f6 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -164,7 +164,8 @@ class UniformDistribution<Generator, int32> {
   typedef int32 ResultElementType;
 
   // Must have lo < hi
-  UniformDistribution(int32 lo, int32 hi) : lo_(lo), range_(hi - lo) {}
+  UniformDistribution(int32 lo, int32 hi)
+      : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
@@ -198,7 +199,8 @@ class UniformDistribution<Generator, int64> {
   typedef int64 ResultElementType;
 
   // Must have lo < hi
-  UniformDistribution(int64 lo, int64 hi) : lo_(lo), range_(hi - lo) {}
+  UniformDistribution(int64 lo, int64 hi)
+      : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
-- 
GitLab


From 60444df318439654324ff797d66734c9920e48a2 Mon Sep 17 00:00:00 2001
From: Chris Kennelly <ckennelly@google.com>
Date: Wed, 18 Apr 2018 11:50:46 -0700
Subject: [PATCH 1042/1262] Expose an API for invoking sized delete.

Sized delete avoids a costly lookup to map the pointer to the allocated size
when this information is commonly available (Allocator::Deallocate).  As this
code also provides an alignment, we only use these paths when aligned new is
available.

PiperOrigin-RevId: 193392688
---
 tensorflow/core/framework/allocator.cc | 27 ++++++++++++++++++++++++++
 tensorflow/core/framework/allocator.h  | 11 ++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 1a7e5219cd..29b67ebdfa 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -48,6 +48,10 @@ constexpr size_t Allocator::kAllocatorAlignment;
 
 Allocator::~Allocator() {}
 
+void Allocator::DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) {
+  DeallocateRaw(ptr);
+}
+
 void RunResourceCtor(ResourceHandle* p, size_t n) {
   for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
 }
@@ -103,7 +107,12 @@ class CPUAllocator : public Allocator {
                    << "% of system memory.";
     }
 
+#ifdef __cpp_aligned_new
+    void* p =
+        ::operator new(num_bytes, static_cast<std::align_val_t>(alignment));
+#else
     void* p = port::AlignedMalloc(num_bytes, alignment);
+#endif
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
       mutex_lock l(mu_);
@@ -132,7 +141,25 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr);
+#else
     port::AlignedFree(ptr);
+#endif
+  }
+
+  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
+#ifdef __cpp_aligned_new
+    if (cpu_allocator_collect_stats) {
+      const std::size_t alloc_size =
+          port::MallocExtension_GetAllocatedSize(ptr);
+      mutex_lock l(mu_);
+      stats_.bytes_in_use -= alloc_size;
+    }
+    ::operator delete(ptr, num_bytes, static_cast<std::align_val_t>(alignment));
+#else
+    DeallocateRaw(ptr);
+#endif
   }
 
   void GetStats(AllocatorStats* stats) override {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2c87156dca..0dda38fbb7 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -101,6 +101,11 @@ class Allocator {
   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
   virtual void DeallocateRaw(void* ptr) = 0;
 
+  // Deallocate a block of memory pointer to by "ptr" with size "num_bytes"
+  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw with
+  // "num_bytes" and "alignment"
+  virtual void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes);
+
   // Convenience functions to do typed allocation.  C++ constructors
   // and destructors are invoked for complex types if necessary,
   // depending on the concrete Allocator implementation. May return
@@ -132,7 +137,7 @@ class Allocator {
   void Deallocate(T* ptr, size_t num_elements) {
     if (ptr) {
       RunDtor<T>(ptr, num_elements);
-      DeallocateRaw(ptr);
+      DeallocateRaw(ptr, kAllocatorAlignment, sizeof(T) * num_elements);
     }
   }
 
@@ -304,6 +309,10 @@ class AllocatorWrapper : public Allocator {
 
   void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }
 
+  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
+    wrapped_->DeallocateRaw(ptr, alignment, num_bytes);
+  }
+
   bool TracksAllocationSizes() override {
     return wrapped_->TracksAllocationSizes();
   }
-- 
GitLab


From 03d18ae232c3cff4c56d1efec7bf29f9b16c4f68 Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Wed, 18 Apr 2018 12:03:32 -0700
Subject: [PATCH 1043/1262] Add support for initializable iterator in
 distribution strategies. Use that in estimator.

PiperOrigin-RevId: 193394603
---
 tensorflow/contrib/distribute/README.md       |  2 -
 .../distribute/python/minimize_loss_test.py   | 12 ++-
 .../distribute/python/mirrored_strategy.py    |  3 +-
 .../python/mirrored_strategy_multigpu_test.py |  3 +-
 .../distribute/python/one_device_strategy.py  |  7 +-
 .../distribute/python/optimizer_v2_test.py    |  3 +-
 .../distribute/python/prefetching_ops_v2.py   | 83 ++++++++++++++++---
 .../python/prefetching_ops_v2_test.py         | 22 +++++
 .../contrib/distribute/python/step_fn.py      |  3 +-
 .../contrib/distribute/python/values.py       | 22 ++---
 .../contrib/distribute/python/values_test.py  | 27 ++++++
 tensorflow/python/estimator/estimator.py      | 13 +--
 tensorflow/python/training/distribute.py      | 38 +++++----
 13 files changed, 174 insertions(+), 64 deletions(-)

diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 5d22d9aa2b..44a4481021 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -131,8 +131,6 @@ adjusting your learning rate or batch size according to the number of GPUs.
 We are working on addressing this limitation by splitting each batch across GPUs
 instead.
 * PartitionedVariables are not supported yet.
-* Input pipelines with Datasets that capture stateful objects and rely on
-`make_initializable_iterator` are not supported yet.
 
 ## What's next?
 
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 4219d54cbd..d7fbf7f379 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -67,7 +67,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       if is_tpu:
         dataset = dataset.batch(2)
 
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         # TODO(isaprykin): Make iterator get_next() return a list of sub-
@@ -127,7 +128,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
@@ -185,7 +187,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       # on each device.
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
         distribution._prefetch_on_device = False
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(
@@ -260,7 +263,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
       labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
       dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index eb0edb3a11..d5e22e8100 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -141,9 +141,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     return result
 
   def distribute_dataset(self, dataset):
-    per_device_dataset = values.PerDeviceDataset(
+    return values.PerDeviceDataset(
         dataset, self._devices, self._prefetch_on_device)
-    return per_device_dataset.make_one_shot_iterator()
 
   def _broadcast(self, tensor, destinations):
     # TODO(josh11b): In eager mode, use one thread per device, or async mode.
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 9e9f06da8e..59cd6703b9 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -248,7 +248,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     dist = mirrored_strategy.MirroredStrategy(
         ["/device:GPU:0", "/device:CPU:0"])
     features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
-    features = dist.distribute_dataset(features).get_next()
+    features = dist.distribute_dataset(
+        features).make_one_shot_iterator().get_next()
 
     with dist.scope():
       result = dist.call_for_each_tower(
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 39c49442b9..2002266dd5 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 import six
 
 from tensorflow.contrib.distribute.python import values
-from tensorflow.contrib.eager.python import datasets
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -63,10 +61,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
       return next_creator(*args, **kwargs)
 
   def distribute_dataset(self, dataset):
-    if context.executing_eagerly():
-      return datasets.Iterator(dataset)
-    else:
-      return dataset.make_one_shot_iterator()
+    return dataset
 
   def _broadcast(self, tensor, destinations):
     return tensor
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index a0912b625f..6e4d050073 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -42,7 +42,8 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
       model_fn, dataset, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = distribution.distribute_dataset(dataset)
+      iterator = distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
index dfcbb8568f..7b3670b45a 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -26,6 +26,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -34,26 +35,55 @@ from tensorflow.python.util import nest
 
 # pylint: disable=protected-access
 class _PrefetchToDeviceIterator(object):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device."""
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
 
-  def __init__(self, input_dataset, devices, buffer_size):
+  Args:
+    input_dataset: The input dataset.
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    devices: Devices on which to prefetch.
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server). Only used if one_shot
+        is False.
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               devices,
+               buffer_size,
+               shared_name=None):
     self._input_dataset = input_dataset
     self._get_next_call_count = 0
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
     self._devices = devices
-    input_iterator = input_dataset.make_one_shot_iterator()
-    input_iterator_handle = input_iterator.string_handle()
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
       """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, input_iterator.output_types, input_iterator.output_shapes,
-          input_iterator.output_classes)
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     target_device = gen_dataset_ops.iterator_get_device(
-        input_iterator._iterator_resource)
+        self._input_iterator._iterator_resource)
     self._buffering_resources = []
     for device in nest.flatten(self._devices):
       with ops.device(device):
@@ -61,9 +91,19 @@ class _PrefetchToDeviceIterator(object):
             f=_prefetch_fn,
             target_device=target_device,
             string_arg=input_iterator_handle,
-            buffer_size=buffer_size)
+            buffer_size=buffer_size,
+            shared_name=shared_name)
         self._buffering_resources.append(buffer_resource_handle)
 
+    if not self._one_shot:
+      reset_ops = []
+      for buffer_resource in self._buffering_resources:
+        reset_ops.append(
+            prefetching_ops.function_buffering_resource_reset(buffer_resource))
+      with ops.control_dependencies(reset_ops):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
+
   def get_next(self, name=None):
     """See @{tf.data.Iterator.get_next}."""
     self._get_next_call_count += 1
@@ -92,6 +132,12 @@ class _PrefetchToDeviceIterator(object):
 
     return nest.pack_sequence_as(self._devices, flat_result)
 
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
   @property
   def output_classes(self):
     return self._input_dataset.output_classes
@@ -115,13 +161,24 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset):
     self._buffer_size = buffer_size if buffer_size is not None else 1
 
   def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(self._input_dataset, self._devices,
-                                     self._buffer_size)
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=True,
+        devices=self._devices,
+        buffer_size=self._buffer_size)
 
   def make_initializable_iterator(self, shared_name=None):
-    raise NotImplementedError("`prefetch_to_devices()` is not currently "
-                              "compatible with initializable iterators. Use "
-                              "`make_one_shot_iterator()` instead.")
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        devices=self._devices,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
 
   def _as_variant_tensor(self):
     # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
index 8ed16f4607..a68dbce6c7 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -64,5 +64,27 @@ class PrefetchingOpsV2Test(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchToTwoDevicesWithReinit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(5):
+        sess.run(next_element)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      sess.run(iterator.initializer)
+      for _ in range(5):
+        sess.run(next_element)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 82514c64be..68b8f4d626 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -54,7 +54,8 @@ class StandardInputStep(Step):
 
   def __init__(self, input_dataset, distribution):
     Step.__init__(self, distribution)
-    self._distributed_input = distribution.distribute_dataset(input_dataset)
+    self._distributed_input = distribution.distribute_dataset(
+        input_dataset).make_one_shot_iterator()
 
   def inputs(self):
     return self._distributed_input.get_next()
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 87bf059038..18fedd2775 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -28,7 +28,6 @@ import six
 
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.distribute.python import prefetching_ops_v2
-from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -510,6 +509,10 @@ class PerDeviceDataIterator(object):
     self._devices = devices
     self._prefetch_on_device = prefetch_on_device
 
+  @property
+  def initializer(self):
+    return self._iterator.initializer
+
   def get_next(self, name=None):
     """Scatter the input across devices."""
     if self._prefetch_on_device:
@@ -545,7 +548,8 @@ class PerDeviceDataset(object):
         "Prefetching is only supported in graph mode currently")
 
     if self._prefetch_on_device:
-      self._dataset = dataset
+      self._dataset = dataset.apply(
+          prefetching_ops_v2.prefetch_to_devices(self._devices))
     else:
       # TODO(priyag): If dropping remainder is not appropriate, find another
       # approach to distributing the dataset when not possible to divide evenly.
@@ -555,15 +559,13 @@ class PerDeviceDataset(object):
 
   def make_one_shot_iterator(self):
     """Get a one time use iterator for the distributed PerDeviceDataset."""
-    if self._prefetch_on_device:
-      on_device_dataset = self._dataset.apply(
-          prefetching_ops_v2.prefetch_to_devices(self._devices))
-      dataset_iterator = on_device_dataset.make_one_shot_iterator()
-    elif context.executing_eagerly():
-      dataset_iterator = datasets.Iterator(self._dataset)
-    else:
-      dataset_iterator = self._dataset.make_one_shot_iterator()
+    dataset_iterator = self._dataset.make_one_shot_iterator()
+    return PerDeviceDataIterator(
+        dataset_iterator, self._devices, self._prefetch_on_device)
 
+  def make_initializable_iterator(self):
+    """Get an initializable iterator for the distributed PerDeviceDataset."""
+    dataset_iterator = self._dataset.make_initializable_iterator()
     return PerDeviceDataIterator(
         dataset_iterator, self._devices, self._prefetch_on_device)
 
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 5c0d4b7d6c..e96ce54741 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import device_util
 from tensorflow.python.training import saver as saver_lib
@@ -408,6 +409,32 @@ class PerDeviceDatasetTest(test.TestCase):
     expected_values = [[i, i+1] for i in range(0, 10, 2)]
     self._test_iterator(devices, dataset, expected_values)
 
+  def testInitializableIterator(self):
+    with context.graph_mode():
+      devices = ["/device:CPU:0"]
+      # Using random input since that is only allowed with initializable
+      # iterator.
+      dataset = dataset_ops.Dataset.from_tensor_slices(
+          random_ops.random_uniform((10,)))
+
+      per_device_dataset = values.PerDeviceDataset(
+          dataset, devices, prefetch_on_device=False)
+      iterator = per_device_dataset.make_initializable_iterator()
+
+      self.evaluate(iterator.initializer)
+      next_element = iterator.get_next()
+      for _ in range(10):
+        self.evaluate(next_element)
+
+      # Should fail after the input is finished.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
+
+      # After re-initializing the iterator, should be able to iterate again.
+      self.evaluate(iterator.initializer)
+      for _ in range(10):
+        self.evaluate(next_element)
+
 
 @test_util.with_c_api
 class MirroredVariableTest(test.TestCase):
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 4d3eff71ad..dde463aaf4 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -700,15 +700,10 @@ class Estimator(object):
     input_hooks = []
     if isinstance(result, dataset_ops.Dataset):
       if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
-        # TODO(josh11b): This is currently using a one-shot iterator, we
-        # will update this to an initializeable iterator once the
-        # necessory support for creating an initializable iterator is
-        # available.
-        result = self._distribution.distribute_dataset(result).get_next()
-      else:
-        iterator = result.make_initializable_iterator()
-        input_hooks.append(_DatasetInitializerHook(iterator))
-        result = iterator.get_next()
+        result = self._distribution.distribute_dataset(result)
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index c6b2dcdf98..d855c4f551 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -391,7 +391,8 @@ class DistributionStrategy(object):
 
     ```
     with my_distribution.scope():
-      iterator = my_distribution.distribute_dataset(dataset)
+      iterator = my_distribution.distribute_dataset(
+          dataset).make_one_shot_iterator()
       tower_train_ops = my_distribution.call_for_each_tower(
           tower_fn, iterator.get_next())
       train_op = tf.group(my_distribution.unwrap(tower_train_ops))
@@ -404,8 +405,14 @@ class DistributionStrategy(object):
     `tower_fn` can use the `get_tower_context()` API to get enhanced
     behavior in this case.
 
-    Note that in the future we will add support for initializable
-    Dataset iterators, at which point this example code will change.
+    You can also create an initializable iterator instead of one shot iterator.
+    In that case, you will need to ensure that you initialize the iterator
+    before calling get_next.
+    ```
+    iterator = my_distribution.distribute_dataset(
+        dataset).make_initializable_iterator())
+    session.run(iterator.initializer)
+    ```
 
   * If you want to write a distributed algorithm, you may use any of
     the `DistributionStrategy` APIs inside a
@@ -486,8 +493,8 @@ class DistributionStrategy(object):
     a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
-  * `d.distribute_dataset(dataset)`: in cross-tower context, produces an
-    iterator with locality T
+  * `d.distribute_dataset(dataset).make_one_shot_iterator()`: in cross-tower
+    context, produces an iterator with locality T
   * `d.broadcast(t)`: in cross-tower context, produces a value with locality M
   * `d.broadcast(t, v)`: in cross-tower context, produces a value with
     locality V(`v`)
@@ -510,7 +517,7 @@ class DistributionStrategy(object):
 
   The standard pattern for updating variables is to:
 
-  1. Wrap your input dataset in `d.distribute_dataset()`.
+  1. Wrap your input dataset in `d.distribute_dataset()` and create an iterator.
   2. Define each tower `d.call_for_each_tower()` up to the point of
      getting a list of gradient, variable pairs.
   3. Call `d.reduce("sum", t, v)` or `d.batch_reduce()` to sum the
@@ -665,16 +672,19 @@ class DistributionStrategy(object):
     _require_distribution_strategy_scope(self)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
-  # TODO(josh11b): Currently this returns an iterator, but should return
-  # something implementing (a subset of) the Dataset API.
+  # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
+  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
+  # Extend to implement more functionality of datasets.
   def distribute_dataset(self, dataset):
-    """Return an iterator into `dataset` split across all towers.
+    """Return a `dataset` split across all towers.
 
-    Suitable for providing input to for `call_for_each_tower()`, as in:
+    Suitable for providing input to for `call_for_each_tower()` by creating an
+    iterator:
 
     ```
     with distribution_strategy.scope():
-      iterator = distribution_strategy.distribute_dataset(dataset)
+      distributed_dataset = distribution_strategy.distribute_dataset(dataset)
+      iterator = distributed_dataset.make_one_shot_iterator()
       tower_results = distribution_strategy.call_for_each_tower(
           tower_fn, iterator.get_next())
     ```
@@ -683,7 +693,7 @@ class DistributionStrategy(object):
       dataset: A `tf.data.Dataset`.
 
     Returns:
-      A Dataset iterator that will produce separate splits for each tower.
+      A `PerDeviceDataset` that will produce data for each tower.
     """
     raise NotImplementedError("must be implemented in descendants")
 
@@ -1126,9 +1136,7 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     return ops.colocate_with(colocate_with_variable)
 
   def distribute_dataset(self, dataset):
-    # TODO(josh11b): Support for this when executing eagerly is currently only
-    # in contrib.
-    return dataset.make_one_shot_iterator()
+    return dataset
 
   def _broadcast(self, tensor, destinations):
     if destinations is None:
-- 
GitLab


From f0aabfa0139cb83c857e6142286d025515fbf9a1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 12:10:51 -0700
Subject: [PATCH 1044/1262] Make toco generate uint8 weights that are safe for
 fast int8 kernels.

PiperOrigin-RevId: 193395910
---
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 tensorflow/contrib/lite/toco/args.h           |   1 +
 ...int8_weights_safe_for_fast_int8_kernels.cc | 209 ++++++++++++++++++
 .../graph_transformations.h                   |  13 ++
 .../contrib/lite/toco/toco_cmdline_flags.cc   |   9 +
 tensorflow/contrib/lite/toco/toco_flags.proto |   7 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   5 +
 7 files changed, 244 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index f696f4b845..3f73ef620e 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -219,6 +219,7 @@ cc_library(
         "graph_transformations/drop_fake_quant.cc",
         "graph_transformations/drop_im2col_arrays.cc",
         "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc",
         "graph_transformations/experimental_shuffle_fc_weights.cc",
         "graph_transformations/fuse_activation_functions.cc",
         "graph_transformations/fuse_binary_into_following_affine.cc",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index c9662d05ce..fe30b88344 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -240,6 +240,7 @@ struct ParsedTocoFlags {
   Arg<bool> debug_disable_recurrent_cell_fusion = Arg<bool>(false);
   Arg<bool> drop_control_dependency = Arg<bool>(false);
   Arg<bool> propagate_fake_quant_num_bits = Arg<bool>(false);
+  Arg<bool> allow_nudging_weights_to_use_fast_gemm_kernel = Arg<bool>(false);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
new file mode 100644
index 0000000000..394fa349e2
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// === Summary ===
+//
+// TLDR: Some of our 8-bit arithmetic operations require uint8 weight values
+// to avoid the value 0, thus ranging only in [1, 255]. This enables faster
+// runtime arithmetic kernels on ARM NEON. This is not relevant on most
+// other hardware architectures, and will cease to be relevant on ARM NEON
+// in the future. These topics are elaborated below ("Context").
+//
+// Having just one isolated uint8 value equal to 0 is fine. The bad case is when
+// two uint8 values are both zero and are less than 16 bytes apart.
+//
+// By default, toco generates a fatal error when that happens. The user may opt
+// in to more lax behavior by passing
+//   --allow_nudging_weights_to_use_fast_gemm_kernel.
+// This causes toco to nudge such bad 0 values into the value 1, thus avoiding
+// the problem in exchange for compromising on accuracy.
+//
+// The present graph transformation implements both the default fatal-erroring
+// behavior, and, when allow_nudging_weights is set, also the lax nudging
+// behavior.
+//
+//
+// === Context ===
+//
+// Since March 2017, we have been using a trick to perform faster
+// 8bit matrix multiplications, to our knowledge first implemented in gemmlowp
+// here:
+//   https://github.com/google/gemmlowp/commit/25b2989415b99e797e1ab977837111b2e231f81f
+//
+// This trick is explained in Appendix B of our paper,
+//   https://arxiv.org/abs/1712.05877
+//
+// Here is the relevant paragraph:
+//
+//      For efficient NEON implementation of the matrix multiplication’s
+//      core accumulation, we use the following trick.
+//      In the multiply-add operation in (10), we first change the
+//      operands’ type from uint8 to int8 (which can be done by
+//      subtracting 128 from the quantized values and zero-points).
+//      Thus the core multiply-add becomes
+//
+//            int32 += int8 * int8. (B.1)
+//
+//      As mentioned in section 3, with a minor tweak of the quantized
+//      training process, we can ensure that the weights, once
+//      quantized as int8 values, never take the value −128. Hence,
+//      the product in (B.1) is never −128 ∗ −128, and is therefore
+//      always less than 2^14 in absolute value. Hence, (B.1)
+//      can accumulate two products on a local int16 accumulator
+//      before that needs to be accumulated into the true int32 accumulator.
+//      This allows the use of an 8-way SIMD multiplication
+//      (SMULL on int8 operands), followed by an 8-way
+//      SIMD multiply-add (SMLAL on int8 operands), followed
+//      by a pairwise-add-and-accumulate into the int32 accumulators
+//      (SADALP).
+//
+// As that paragraph notes, quantized training should be suitably modified to
+// ensure that quantized uint8 weights value only range in [1, 255]. So the
+// problem that we are dealing with is only about the existing 8-bit quantized
+// models that haven't been trained specifically to get 8-bit weights only in
+// [1, 255].
+//
+// This spreadsheet shows the speed benefit of this trick across many existing
+// ARM-architecture CPUs:
+//
+//    https://docs.google.com/spreadsheets/d/1-0LjdMvW0XtH1bYknC0bQINoFaxjTuL9eplZZcitykI/edit?usp=sharing
+//
+// Compare Row 18 (fast int8 trick) to Row 20 (regular uint8 kernel).
+//
+// The introduction of the 'dotprod' extension to ARM NEON, specifically the
+// SDOT instruction, renders this eventually moot. See the experimental
+// kernels contributed by ARM here,
+//
+//     https://github.com/google/gemmlowp/pull/116
+//
+// However, as of April 2018, there don't seem to be any commercially available
+// CPU supporting these instructions (yet); we are waiting for
+// Cortex-A{75,55}-r1 to become available; the "-r1" is key here. Even if such
+// CPUs become available soon, it will presumably take years for them to
+// overtake the large volume of existing CPUs not supporting these new
+// instructions, especially in current and future low-end devices. All in all,
+// we can foresee these 'fast int8 kernels' to remain important to have into
+// the 2020s.
+//
+bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
+                                                   std::size_t op_index) {
+  const auto& op = *model->operators[op_index];
+  int weights_index = 0;
+  switch (op.type) {
+    case OperatorType::kConv:
+      weights_index = 1;
+      break;
+    case OperatorType::kLstmCell:
+      weights_index = 2;
+      break;
+    case OperatorType::kFullyConnected: {
+      weights_index = 1;
+      const auto& fc_op = static_cast<const toco::FullyConnectedOperator&>(op);
+      CHECK(!fc_op.experimental_shuffled_weights)
+          << "This graph transformation expects to run before FC weights get "
+             "shuffled.";
+      break;
+    }
+    default:
+      // Other operator types are unaffected by this graph transformation,
+      // because their runtime implementations don't use the fast int8 trick.
+      // In particular that's the case of DepthwiseConv at the moment.
+      // We have to update this logic when that changes, e.g. if in the future
+      // some DepthwiseConv kernel wants to use the trick.
+      //
+      // The reason why that's not so likely, hence why it's fairly safe to
+      // stay conservative in the list of operators that we handle here, is that
+      // the fast int8 kernel trick is only applicable to ops that either are
+      // implemented as a GEMM, or use symmetric ranges for both weights and
+      // activations. The reason why GEMM is special (can use the trick even
+      // without symmetric ranges) is that it is so arithmetic-intense that
+      // it can use techniques reducing its implementation to the symmetric
+      // ranges case, with limited relative overhead (O(N^2) overhead vs
+      // O(N^3) GEMM cost). See https://arxiv.org/pdf/1712.05877, section
+      // 2.3 Efficient handling of zero-points.
+      //
+      // That's why at the moment we only handle operators that use a GEMM
+      // (Conv, fully-connected --- note that LSTM merely wraps a
+      // fully-connected operator).
+      return false;
+  }
+
+  const string& name = op.inputs[weights_index];
+  auto& array = model->GetArray(name);
+  if (!array.buffer) {
+    return false;
+  }
+  if (array.data_type != ArrayDataType::kUint8) {
+    return false;
+  }
+  auto& buffer_data = array.GetMutableBuffer<ArrayDataType::kUint8>().data;
+
+  int count_bad = 0;
+  int index_of_previous_bad_value = 0;
+  bool changed = false;
+
+  for (int i = 0; i < buffer_data.size(); i++) {
+    if (buffer_data[i] == 0) {
+      count_bad++;
+      if (count_bad > 1) {
+        const int distance = i - index_of_previous_bad_value;
+        // Semi-arbitrary threshold. The idea is that trouble only occurs
+        // when two bad values are very close to each other so that they
+        // are jointly used within registers inside some GEMM kernel.
+        // The details of that depend on the kernel. Our current fast ARM64
+        // kernel, for instance, only has an issue when the distance between
+        // consecutive bad values is exactly 8. We do not want to track such
+        // kernel details too closely here, so we pick a threshold that's
+        // a bit larger than that, to give us room to change kernels in the
+        // future without worrying.
+        static constexpr int kMinDistanceBetweenBadValues = 16;
+        if (distance < kMinDistanceBetweenBadValues) {
+          if (allow_nudging_weights()) {
+            buffer_data[i] = 1;
+            changed = true;
+            continue;
+          }
+          LOG(FATAL) << "Bad value for " << name << " at index " << i
+                     << ", previous bad value at index "
+                     << index_of_previous_bad_value << ", distance=" << distance
+                     << ", kMinDistanceBetweenBadValues="
+                     << kMinDistanceBetweenBadValues << ". Consider passing "
+                     << "--allow_nudging_weights_to_use_fast_gemm_kernel "
+                     << "if you don't care about accuracy.";
+        }
+      }
+      index_of_previous_bad_value = i;
+    }
+  }
+
+  if (changed) {
+    AddMessageF("Tweaked weights values for %s", LogName(op));
+  }
+
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 8075d0205d..72ffd51db4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -246,6 +246,19 @@ class ResolveConstantFakeQuant : public GraphTransformation {
   bool propagate_fake_quant_num_bits_ = false;
 };
 
+class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override {
+    return "EnsureUint8WeightsSafeForFastInt8Kernels";
+  }
+  bool allow_nudging_weights() const { return allow_nudging_weights_; }
+  void set_allow_nudging_weights(bool val) { allow_nudging_weights_ = val; }
+
+ private:
+  bool allow_nudging_weights_ = false;
+};
+
 #undef DECLARE_GRAPH_TRANSFORMATION
 
 }  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index 74f98c8452..1611c4d0c0 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -141,6 +141,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.propagate_fake_quant_num_bits.default_value(),
            "If true, use FakeQuant* operator num_bits attributes to adjust "
            "array data_types."),
+      Flag("allow_nudging_weights_to_use_fast_gemm_kernel",
+           parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel.bind(),
+           parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel
+               .default_value(),
+           "Some fast uint8 GEMM kernels require uint8 weights to avoid the "
+           "value 0. This flag allows nudging them to 1 to allow proceeding, "
+           "with moderate inaccuracy."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -230,6 +237,8 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone);
   READ_TOCO_FLAG(debug_disable_recurrent_cell_fusion, FlagRequirement::kNone);
   READ_TOCO_FLAG(propagate_fake_quant_num_bits, FlagRequirement::kNone);
+  READ_TOCO_FLAG(allow_nudging_weights_to_use_fast_gemm_kernel,
+                 FlagRequirement::kNone);
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 869c512d93..a04017a6bf 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 17.
+// Next ID to use: 18.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -156,4 +156,9 @@ message TocoFlags {
   // Input and output array data types may change because of this propagation
   // and users must be sure to query the final data_type values.
   optional bool propagate_fake_quant_num_bits = 14;
+
+  // Some fast uint8 GEMM kernels require uint8 weights to avoid the value 0.
+  // This flag allows nudging them to 1 to allow proceeding, with moderate
+  // inaccuracy.
+  optional bool allow_nudging_weights_to_use_fast_gemm_kernel = 17;
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 89cb2f85f8..7252ec2ea4 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -317,12 +317,17 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     }
 
     CheckIsReadyForQuantization(*model);
+    auto* ensure_safe_for_int8_kernels =
+        new EnsureUint8WeightsSafeForFastInt8Kernels;
+    ensure_safe_for_int8_kernels->set_allow_nudging_weights(
+        toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
     RunGraphTransformations(model, "quantization graph transformations",
                             {
                                 new RemoveTrivialQuantizedActivationFunc,
                                 new RemoveTrivialQuantizedMinMax,
                                 new Quantize,
                                 new RemoveFinalDequantizeOp,
+                                ensure_safe_for_int8_kernels,
                             });
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
-- 
GitLab


From 87d37a689cff06ae1c1539abb747d152170c91b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 12:13:24 -0700
Subject: [PATCH 1045/1262] Automated g4 rollback of changelist 193369280

PiperOrigin-RevId: 193396206
---
 .../optimizers/constant_folding_test.cc       | 88 +------------------
 1 file changed, 1 insertion(+), 87 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 4b41dae480..36625b68b7 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -520,25 +520,6 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
-
-  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
-  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto x_partially_unknown_t =
-      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
-  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
-  auto expected_tensors =
-      EvaluateNodes(item.graph, fetch,
-                    {{"x_known", x_known_t},
-                     {"x_partially_unknown", x_partially_unknown_t},
-                     {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(fetch.size(), expected_tensors.size());
-  auto tensors = EvaluateNodes(output, fetch,
-                               {{"x_known", x_known_t},
-                                {"x_partially_unknown", x_partially_unknown_t},
-                                {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(fetch.size(), tensors.size());
-  for (int i = 0; i < tensors.size(); i++)
-    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -591,20 +572,6 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
-  const std::vector<string> fetch = {"addn1"};
-  auto x_partially_unknown_t =
-      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto expected_tensors =
-      EvaluateNodes(item.graph, fetch,
-                    {{"x_partially_unknown", x_partially_unknown_t},
-                     {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(1, expected_tensors.size());
-  auto tensors = EvaluateNodes(output, fetch,
-                               {{"x_partially_unknown", x_partially_unknown_t},
-                                {"x_unknown", x_unknown_t}});
-  EXPECT_EQ(1, tensors.size());
-  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -1089,20 +1056,6 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
-
-  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
-  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
-  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
-  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
-                                           "i2c", "i3a", "i3b"};
-  auto tensors_expected = EvaluateNodes(
-      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-  auto tensors = EvaluateNodes(output, fetch_nodes,
-                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
@@ -1935,14 +1888,6 @@ TEST_F(ConstantFoldingTest, Packing) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  const std::vector<string> fetch_nodes = {"i1", "i2"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-  auto tensors = EvaluateNodes(output, fetch_nodes);
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
-
   // Make sure that the representation of the folded constant is space
   // efficient: in particular, the whole message should be smaller than 8k
   // (the size needed to naively encode 1000 floats folded twice).
@@ -1978,13 +1923,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
-  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
-  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
-  auto tensors_expected =
-      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
@@ -2025,11 +1963,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     }
   }
   EXPECT_EQ(6, found);
-
-  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
@@ -2049,11 +1982,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  std::vector<string> fetch_nodes = {"o1", "o2"};
-  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
-
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2108,10 +2036,6 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
     }
   }
   EXPECT_EQ(7, found);
-  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
-  EXPECT_EQ(fetch_nodes.size(), tensors.size());
-  for (int i = 0; i < fetch_nodes.size(); i++)
-    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -2589,7 +2513,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   value_tensor.AsProtoTensorContent(value.mutable_tensor());
 
   GraphDef& graph = item.graph;
-  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
+  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
   AddNode("enter1", "Enter", {"x"},
           {{"T", type},
@@ -2615,10 +2539,6 @@ TEST_F(ConstantFoldingTest, Enter) {
   item.fetch.push_back("id3");
   item.fetch.push_back("id4");
 
-  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
-  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
-  EXPECT_EQ(item.fetch.size(), tensors_expected.size());
-
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2646,12 +2566,6 @@ TEST_F(ConstantFoldingTest, Enter) {
       EXPECT_EQ("enter3", node.input(0));
     }
   }
-
-  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
-  EXPECT_EQ(item.fetch.size(), tensors.size());
-
-  for (int i = 0; i < item.fetch.size(); i++)
-    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, TensorArraySize) {
-- 
GitLab


From fe732eea0138167f105720ce83cc0e3034a19d07 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 18 Apr 2018 12:26:03 -0700
Subject: [PATCH 1046/1262] Minor test improvement

PiperOrigin-RevId: 193398068
---
 tensorflow/contrib/data/python/kernel_tests/bucketing_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 6002cc73c8..55a56b83a8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -61,7 +61,7 @@ class GroupByWindowTest(test.TestCase):
 
       self.assertEqual(len(components), sum(counts))
       num_full_batches = len([c for c in counts if c == 4])
-      self.assertGreaterEqual(num_full_batches, 23)
+      self.assertGreaterEqual(num_full_batches, 24)
       self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
 
   def testImmediateOutput(self):
-- 
GitLab


From 011740b18b8309bb3126f95b736931d850a83861 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 12:32:53 -0700
Subject: [PATCH 1047/1262] Create specialized functions in optimized graph for
 each function instantiation context.

PiperOrigin-RevId: 193399263
---
 .../common_runtime/graph_execution_state.cc   |  44 +--
 .../common_runtime/graph_execution_state.h    |   5 +-
 tensorflow/core/grappler/optimizers/BUILD     |   2 +
 .../grappler/optimizers/function_optimizer.cc | 186 +++++++++---
 .../grappler/optimizers/function_optimizer.h  |   9 +
 .../optimizers/function_optimizer_test.cc     | 269 ++++++++++--------
 tensorflow/core/grappler/utils/functions.cc   |  94 ++++--
 tensorflow/core/grappler/utils/functions.h    |  19 ++
 .../core/grappler/utils/functions_test.cc     |  39 ++-
 9 files changed, 468 insertions(+), 199 deletions(-)

diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 6a3e6906a3..642d91e328 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -398,7 +398,8 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
 }
 
 Status GraphExecutionState::OptimizeGraph(
-    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph) {
+    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph,
+    std::unique_ptr<FunctionLibraryDefinition>* optimized_flib) {
 #ifndef IS_MOBILE_PLATFORM
   if (session_options_->config.graph_options().place_pruned_graph()) {
     return errors::InvalidArgument("Can't optimize a pruned graph");
@@ -493,9 +494,17 @@ Status GraphExecutionState::OptimizeGraph(
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
         item, rewrite_options, cpu_device, &cluster, &new_graph));
+
+    // Merge optimized graph function library with an original library.
+    // Optimized graph might have new functions specialized for it's
+    // instantiation context (see Grappler function optimizer).
+    optimized_graph->reset(new Graph(OpRegistry::Global()));
+    optimized_flib->reset(new FunctionLibraryDefinition(OpRegistry::Global(),
+                                                        new_graph.library()));
+    TF_RETURN_IF_ERROR((*optimized_flib)->AddLibrary(*flib_def_));
+
     GraphConstructorOptions opts;
     opts.allow_internal_ops = true;
-    optimized_graph->reset(new Graph(OpRegistry::Global()));
     TF_RETURN_IF_ERROR(
         ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get()));
     // The graph conversion sets the requested device names but not the assigned
@@ -524,18 +533,25 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
         "Attempted to prune a graph that has not been fully initialized.");
   }
 
-  std::unique_ptr<Graph> ng;
-  Status s = OptimizeGraph(options, &ng);
+  // Grappler optimization might change the structure of a graph itself, and
+  // also it can add/prune functions to/from the library.
+  std::unique_ptr<Graph> optimized_graph;
+  std::unique_ptr<FunctionLibraryDefinition> optimized_flib;
+
+  Status s = OptimizeGraph(options, &optimized_graph, &optimized_flib);
   if (!s.ok()) {
-    // Simply copy the original graph if we couldn't optimize it.
-    ng.reset(new Graph(flib_def_.get()));
-    CopyGraph(*graph_, ng.get());
+    // Simply copy the original graph and the function library if we couldn't
+    // optimize it.
+    optimized_graph.reset(new Graph(flib_def_.get()));
+    CopyGraph(*graph_, optimized_graph.get());
+    optimized_flib.reset(new FunctionLibraryDefinition(*flib_def_));
   }
 
   subgraph::RewriteGraphMetadata rewrite_metadata;
   if (session_options_ == nullptr ||
       !session_options_->config.graph_options().place_pruned_graph()) {
-    TF_RETURN_IF_ERROR(PruneGraph(options, ng.get(), &rewrite_metadata));
+    TF_RETURN_IF_ERROR(
+        PruneGraph(options, optimized_graph.get(), &rewrite_metadata));
   } else {
     // This GraphExecutionState represents a graph that was
     // pruned when this was constructed, so we copy the metadata from
@@ -549,15 +565,11 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   CHECK_EQ(options.callable_options.fetch_size(),
            rewrite_metadata.fetch_types.size());
 
-  // Make a fresh copy of the function library for the client graph.
-  std::unique_ptr<FunctionLibraryDefinition> flib(
-      new FunctionLibraryDefinition(*flib_def_));
-
   // TODO(andydavis): Clarify optimization pass requirements around CostModel.
   GraphOptimizationPassOptions optimization_options;
   optimization_options.session_options = session_options_;
-  optimization_options.graph = &ng;
-  optimization_options.flib_def = flib.get();
+  optimization_options.graph = &optimized_graph;
+  optimization_options.flib_def = optimized_flib.get();
   optimization_options.device_set = device_set_;
 
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
@@ -567,9 +579,9 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   // since the local CostModel used to record its stats is sized by
   // the largest node id.
   std::unique_ptr<ClientGraph> dense_copy(
-      new ClientGraph(std::move(flib), rewrite_metadata.feed_types,
+      new ClientGraph(std::move(optimized_flib), rewrite_metadata.feed_types,
                       rewrite_metadata.fetch_types));
-  CopyGraph(*ng, &dense_copy->graph);
+  CopyGraph(*optimized_graph, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 2154ef5bd3..d44a24c87b 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -182,8 +182,9 @@ class GraphExecutionState {
   Status PruneGraph(const BuildGraphOptions& options, Graph* graph,
                     subgraph::RewriteGraphMetadata* out_rewrite_metadata);
 
-  Status OptimizeGraph(const BuildGraphOptions& options,
-                       std::unique_ptr<Graph>* optimized_graph);
+  Status OptimizeGraph(
+      const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph,
+      std::unique_ptr<FunctionLibraryDefinition>* optimized_flib);
 
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3070eb1799..63492e1a7f 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -143,6 +143,8 @@ cc_library(
         ":graph_optimizer",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 6d67ead355..d008a9719f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -29,65 +29,141 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class FunctionInliningContext {
+// Mark functions that were created as a result of function specialization.
+constexpr char kGrapplerSpecializedFuncAttr[] = "_GrapplerSpecializedFunc";
+
+constexpr char kNoInlineAttr[] = "_noinline";
+
+bool AttrIsTrue(const FunctionDef& func, const string& attr) {
+  return func.attr().count(attr) != 0 && func.attr().at(attr).b();
+}
+
+bool MarkedSpecialized(const FunctionDef& func) {
+  return AttrIsTrue(func, kGrapplerSpecializedFuncAttr);
+}
+
+bool MarkedNoInline(const FunctionDef& func) {
+  return AttrIsTrue(func, kNoInlineAttr);
+}
+
+// Find unique name for the specialized function. Collision can happen if
+// specialized function is instantiated for the nodes with the same name (e.g.
+// inside function body of two different functions).
+string UniqueSpecializedFunctionName(const FunctionDef& func,
+                                     const NodeDef& func_node,
+                                     const FunctionLibraryDefinition& flib) {
+  using str_util::StringReplace;
+  using strings::StrCat;
+
+  string specialized_name = StrCat(func.signature().name(), "_specialized_for_",
+                                   StringReplace(func_node.name(), "/", "_",
+                                                 /*replace_all*/ true));
+  string unique_name = specialized_name;
+
+  int idx = 0;
+  while (flib.Find(unique_name)) {
+    unique_name = strings::StrCat(specialized_name, "_", ++idx);
+  }
+  return unique_name;
+}
+
+class FunctionOptimizerContext {
  public:
-  explicit FunctionInliningContext(const GrapplerItem& item,
-                                   RewriterConfig::Toggle opt_level)
+  explicit FunctionOptimizerContext(const GrapplerItem& item,
+                                    RewriterConfig::Toggle opt_level)
       : opt_level_(opt_level),
-        functions_(InliningCandidates(item)),
         function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
-                                                    item.graph.library())) {}
+                                                    item.graph.library())) {
+    InitializeInlinedFunctions(item);
+  }
+
+  const FunctionLibraryDefinition& function_library() const {
+    return function_library_;
+  }
 
-  const FunctionLibraryDefinition& FunctionLibrary() const {
+  FunctionLibraryDefinition& mutable_function_library() {
     return function_library_;
   }
 
-  bool HasInlinedFunctions() const { return !functions_.empty(); }
+  bool IsInlinedFunction(const string& name) const {
+    return inlined_functions_.count(name) > 0;
+  }
 
   // Find inlining candidate by name. Return nullptr if not found.
   const FunctionDef* FindInlinedFunction(const string& name) const {
-    auto it = functions_.find(name);
-    if (it != functions_.end()) {
-      return it->second;
-    } else {
-      return nullptr;
-    }
+    return gtl::FindWithDefault(inlined_functions_, name, nullptr);
   }
 
  private:
-  std::unordered_map<string, const FunctionDef*> InliningCandidates(
-      const GrapplerItem& item) const {
-    std::unordered_map<string, const FunctionDef*> functions;
+  void InitializeInlinedFunctions(const GrapplerItem& item) {
+    bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+
     for (const FunctionDef& func : item.graph.library().function()) {
-      // Don't inline functions marked as noinline
-      if (func.attr().count("_noinline") != 0 &&
-          func.attr().at("_noinline").b() &&
-          opt_level_ != RewriterConfig::AGGRESSIVE) {
-        continue;
-      }
       // Can't create IdentityN nodes with no input or output: skip these
       // functions for now.
       if (func.signature().input_arg_size() == 0 ||
           func.signature().output_arg_size() == 0) {
         continue;
       }
-      functions[func.signature().name()] = &func;
+      bool marked_noinline = MarkedNoInline(func);
+      bool marked_specialized = MarkedSpecialized(func);
+
+      if (!marked_specialized && (!marked_noinline || aggressive)) {
+        inlined_functions_[func.signature().name()] = &func;
+      }
     }
-    return functions;
   }
 
   RewriterConfig::Toggle opt_level_;
-  std::unordered_map<string, const FunctionDef*> functions_;
   FunctionLibraryDefinition function_library_;
+  // Functions that can be inlined into optimized graph.
+  std::unordered_map<string, const FunctionDef*> inlined_functions_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
+Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
+                          FunctionOptimizerContext* ctx,
+                          GraphDef* optimized_graph) {
+  const std::unordered_map<string, AttrValue> func_attr(
+      func_node.attr().begin(), func_node.attr().end());
+
+  const auto& flib = ctx->function_library();
+
+  // Make a GrapplerFunctionItem and immediately convert it back to FunctionDef.
+  GrapplerFunctionItem item;
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  // TODO(ezhulenev): Push down const inputs and known input shapes.
+  FunctionDef specialized;
+  TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized));
+
+  // Find a name for specialized function.
+  const string specialized_func_name =
+      UniqueSpecializedFunctionName(func, func_node, flib);
+
+  specialized.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized.mutable_attr();
+  (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
+
+  // Add specialized function to the library.
+  TF_RETURN_IF_ERROR(
+      ctx->mutable_function_library().AddFunctionDef(specialized));
+
+  // Add a function call node for the specialized function.
+  NodeDef* specialized_func_node = optimized_graph->add_node();
+  *specialized_func_node = func_node;
+  specialized_func_node->set_op(specialized_func_name);
+
+  return Status::OK();
+}
+
 // Copy input/output argument type to the type_list. Return error if argument
 // type is not explicitly defined, and not specified in function attributes.
 Status CopyArgType(const NodeDef& func_node,
@@ -148,14 +224,14 @@ Status HookInlinedFunctionOutputs(
 }
 
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
-                      const FunctionInliningContext& ctx,
+                      const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
   GrapplerFunctionItem item;
   Status item_status =
-      MakeGrapplerFunctionItem(func, func_attr, ctx.FunctionLibrary(), &item);
+      MakeGrapplerFunctionItem(func, func_attr, ctx.function_library(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -378,39 +454,61 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  FunctionInliningContext function_inlining_ctx(item, opt_level_);
-
   // Nothing to do here.
-  if (!function_inlining_ctx.HasInlinedFunctions()) {
+  if (item.graph.library().function_size() == 0) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
+  FunctionOptimizerContext ctx(item, opt_level_);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
+  bool inline_gradients = options_.enable_symbolic_gradient_inlining;
+  bool inline_func = options_.enable_function_inlining;
+  bool specialize_func = options_.enable_function_specialization;
+
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op() == "SymbolicGradient") {
-      TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
-      continue;
+    const string func_name = node.op();
+
+    if (func_name == "SymbolicGradient" && inline_gradients) {
+      // Inline symbolic gradients only if the corresponding function is inlined
+      const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
+      string f_name = f_attr != nullptr ? f_attr->func().name() : "";
+      if (ctx.IsInlinedFunction(f_name)) {
+        TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
+        continue;
+      }
     }
 
-    const FunctionDef* func =
-        function_inlining_ctx.FindInlinedFunction(node.op());
+    const FunctionDef* func = ctx.function_library().Find(func_name);
     if (func != nullptr) {
-      TF_RETURN_IF_ERROR(
-          InlineFunction(node, *func, function_inlining_ctx, optimized_graph));
-    } else {
-      *optimized_graph->add_node() = node;
+      if (inline_func && ctx.IsInlinedFunction(func_name)) {
+        // Inline function body into the optimized graph}
+        TF_RETURN_IF_ERROR(InlineFunction(node, *func, ctx, optimized_graph));
+        continue;
+      }
+
+      if (specialize_func && IsParametrized(*func)) {
+        // TODO(ezhulenev): Specialize function call if input is a Const or has
+        // a known shape. Const input tensors can be pushed into the function
+        // body and removed from function inputs.
+
+        // Specialize function body for its instantiation attributes and inputs.
+        TF_RETURN_IF_ERROR(
+            SpecializeFunction(node, *func, &ctx, optimized_graph));
+        continue;
+      }
     }
-  }
 
-  // TODO(bsteiner): specialize the implementation of functions that can't be
-  // inlined based on the context in which they're instantiated.
+    // If we reached this point, node was not handled by any of the stages
+    // (inline, specialize), simply add a copy to the graph.
+    *optimized_graph->add_node() = node;
+  }
 
   // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() = item.graph.library();
+  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index b124efe01d..c555fadf83 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -38,7 +38,16 @@ class FunctionOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  friend class FunctionOptimizerTest;
+
+  struct FunctionOptimizerOptions {
+    bool enable_function_inlining = true;
+    bool enable_function_specialization = true;
+    bool enable_symbolic_gradient_inlining = true;
+  };
+
   RewriterConfig::Toggle opt_level_;
+  FunctionOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 099fe7caf2..fb006d4868 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -24,92 +24,97 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
+namespace {
 constexpr char kDevice[] = "/device:CPU:0";
+}  // namespace
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  Tensor MakeScalarTensor(float value) {
-    Tensor tensor(DT_FLOAT, {});
-    tensor.scalar<float>()() = value;
-    return tensor;
+  void DisableAll(FunctionOptimizer* optimizer) {
+    optimizer->options_.enable_function_inlining = false;
+    optimizer->options_.enable_function_specialization = false;
+    optimizer->options_.enable_symbolic_gradient_inlining = false;
   }
 
-  Tensor MakeScalarTensor(int value) {
-    Tensor tensor(DT_INT32, {});
-    tensor.scalar<int>()() = value;
-    return tensor;
+  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
+    DisableAll(optimizer);
+    optimizer->options_.enable_function_inlining = true;
+  }
+
+  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
+    DisableAll(optimizer);
+    optimizer->options_.enable_function_specialization = true;
   }
 };
 
-TEST_F(FunctionOptimizerTest, SimpleFunction) {
+TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   // Build a graph to compute y = XTimesTwo(x)
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           test::function::XTimesTwo(),
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y/inlined_inputs") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("x", node.input(0));
     } else if (node.name() == "y/x") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/inlined_inputs:0", node.input(0));
     } else if (node.name() == "y/two") {
       count++;
       EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^y/inlined_inputs", node.input(0));
     } else if (node.name() == "y/scale") {
       count++;
       EXPECT_EQ("Cast", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
     } else if (node.name() == "y/y") {
       count++;
       EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
       EXPECT_EQ("y/scale", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y", node.input(0));
     }
   }
   EXPECT_EQ(7, count);
 
-  Tensor pi = MakeScalarTensor(3.14f);
+  Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
@@ -118,7 +123,11 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   // Create and instantiate a version of the XTimesTwo function that only
   // accepts floats a inputs.
   const Tensor kTwo = test::AsScalar<float>(2.0f);
@@ -137,19 +146,16 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
           {{"y"}, "Mul", {"x", "two"}, {{"T", DT_FLOAT}}},
       });
 
-  constexpr char device[] = "/device:CPU:0";
   GrapplerItem item;
   item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "XTimesTwo", {"x"}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           x_times_two,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -159,13 +165,13 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
     if (node.name() == "y/inlined_inputs") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("x", node.input(0));
     } else if (node.name() == "y/x") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/inlined_inputs:0", node.input(0));
     } else if (node.name() == "y/two") {
@@ -173,31 +179,31 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^y/inlined_inputs", node.input(0));
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
     } else if (node.name() == "y/y") {
       count++;
       EXPECT_EQ("Mul", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
       EXPECT_EQ("y/two", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y", node.input(0));
     }
   }
   EXPECT_EQ(6, count);
 
-  Tensor pi = MakeScalarTensor(3.14f);
+  Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
@@ -206,7 +212,11 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithOutputMapping) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   FunctionDef func = FunctionDefHelper::Create(
       // Name
       "Exp_func",
@@ -223,65 +233,61 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
       {{"out", "Exp:y:0"}});
 
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "Exp_func", {"x"}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "Exp_func", {"x"}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           func,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y/inlined_inputs") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("x", node.input(0));
     } else if (node.name() == "y/in") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/inlined_inputs:0", node.input(0));
     } else if (node.name() == "y/Linear_func") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/in", node.input(0));
     } else if (node.name() == "y/Exp") {
       count++;
       EXPECT_EQ("Exp", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/Linear_func", node.input(0));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y/Exp", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(device, node.device());
+      EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("y", node.input(0));
     }
   }
   EXPECT_EQ(6, count);
 
-  Tensor pi = MakeScalarTensor(3.14f);
+  Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
@@ -290,7 +296,11 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithInputForwarding) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   FunctionDef func = FunctionDefHelper::Create(
       // Name
       "ForwardInputs",
@@ -306,42 +316,30 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
       {{"out0", "in0"}, {"arg2", "arg2"}, {"arg3", "arg3"}});
 
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("x2", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("x3", "Placeholder", {}, {{"dtype", DT_INT32}},
-                            device),
-       test::function::NDef("x4", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            device),
-       test::function::NDef("y", "ForwardInputs",
-                            {"x0", "x1", "x2", "x3", "x4"}, {}, device),
-       test::function::NDef("z0", "Identity", {"y:0"}, {{"T", DT_FLOAT}},
-                            device),
-       test::function::NDef("z1", "Identity", {"y:1"}, {{"T", DT_FLOAT}},
-                            device),
-       test::function::NDef("z2", "Identity", {"y:2"}, {{"T", DT_INT32}},
-                            device)},
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x2", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x3", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       NDef("x4", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "ForwardInputs", {"x0", "x1", "x2", "x3", "x4"}, {}, kDevice),
+       NDef("z0", "Identity", {"y:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z1", "Identity", {"y:1"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z2", "Identity", {"y:2"}, {{"T", DT_INT32}}, kDevice)},
       // FunctionLib
       {
           func,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   item.fetch = {"z0", "z1", "z2"};
-  item.feed.emplace_back("x0", MakeScalarTensor(3.14f));
-  item.feed.emplace_back("x1", MakeScalarTensor(2.7f));
-  item.feed.emplace_back("x2", MakeScalarTensor(1.0f));
-  item.feed.emplace_back("x4", MakeScalarTensor(-1.0f));
-  item.feed.emplace_back("x3", MakeScalarTensor(1234));
+  item.feed.emplace_back("x0", test::AsScalar<float>(3.14f));
+  item.feed.emplace_back("x1", test::AsScalar<float>(2.7f));
+  item.feed.emplace_back("x2", test::AsScalar<float>(1.0f));
+  item.feed.emplace_back("x4", test::AsScalar<float>(-1.0f));
+  item.feed.emplace_back("x3", test::AsScalar<int>(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
@@ -350,7 +348,12 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
   test::ExpectTensorEqual<int>(tensors_expected[2], tensors[2]);
 }
 
-TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  EnableOnlyFunctionInlining(&optimizer);
+
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -366,25 +369,26 @@ TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
        {{"o"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}}});
 
   GrapplerItem item;
-  constexpr char device[] = "/device:CPU:0";
   item.graph = test::function::GDef(
-      {test::function::NDef("y", "GenerateTwo", {}, {}, device),
-       test::function::NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, device)},
+      {NDef("y", "GenerateTwo", {}, {}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {
           func,
       });
 
-  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   // For now we won't inline the function.
   EXPECT_EQ(item.graph.DebugString(), output.DebugString());
 }
 
-TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
+TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
   // Define square via function library:
   //   MySquare(x) = MyMul(x, x)
 
@@ -402,17 +406,13 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
 
   GrapplerItem item;
   item.graph = test::function::GDef(
-      {test::function::NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}},
-                            kDevice),
-       test::function::NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}},
-                            kDevice),
-       test::function::NDef("outputs", "Identity", {"square:0"},
-                            {{"T", DT_FLOAT}}, kDevice)},
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("outputs", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice)},
       // FunctionLib
       {mul_func, square_func});
 
   GraphDef output;
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   int count = 0;
@@ -469,7 +469,7 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
   EXPECT_EQ(9, count);
 
   item.fetch = {"outputs"};
-  item.feed.emplace_back("a", MakeScalarTensor(2.0f));
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
   GrapplerItem optimized(item, std::move(output));
@@ -478,7 +478,9 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, SymbolicGradients) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_TestFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
   FunctionDef func = FunctionDefHelper::Define(
@@ -508,10 +510,8 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   std::vector<Tensor> expected =
       EvaluateNodes(item.graph, {"out1", "out2"}, {});
@@ -520,7 +520,9 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   test::ExpectTensorEqual<float>(expected[1], optimized[1]);
 }
 
-TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
   FunctionDef func = FunctionDefHelper::Create(
@@ -550,10 +552,8 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
   EXPECT_EQ(13, output.node_size());
   EXPECT_EQ("Const", output.node(0).name());
@@ -583,7 +583,9 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
-TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+
   FunctionDef func = FunctionDefHelper::Define(
       "TestFunc", {"x:float", "y:float"}, {"l:float"}, {},
       {
@@ -613,7 +615,6 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   // The optimizer should succeed but the graphs should be the same.
@@ -621,6 +622,52 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   CompareGraphs(item.graph, output);
 }
 
-}  // namespace
+TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+  EnableOnlyFunctionSpecialization(&optimizer);
+
+  // Mark XTimesTwo as noinline
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  (*x_times_two.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {x_times_two};
+
+  // Build a graph to compute y = XTimesTwo(x)
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that specialized function was added to the library
+  EXPECT_EQ(2, output.library().function_size());
+  EXPECT_EQ("XTimesTwo_specialized_for_y",
+            output.library().function(1).signature().name());
+
+  // And 'y' node is calling specialized function
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y" && count++) {
+      EXPECT_EQ("XTimesTwo_specialized_for_y", node.op());
+    }
+  }
+  EXPECT_EQ(1, count);
+
+  // And that graph evaluation yields the same result
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index e8d423a759..638fe1999a 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -33,23 +33,22 @@ namespace grappler {
 
 namespace {
 
-Status OutputNameRange(const FunctionLibraryDefinition& flib,
-                       const NodeDef& node,
-                       tensorflow::NameRangeMap* outputs_range_map) {
-  const OpRegistrationData* registration;
-  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
-  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(node, registration->op_def,
-                                                   nullptr, outputs_range_map));
+Status RegisterFunctionBodyOutputs(const OpRegistrationData& registration,
+                                   const NodeDef& node,
+                                   GrapplerFunctionConnectivity* connectivity) {
+  tensorflow::NameRangeMap outputs_range_map;
+  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(
+      node, registration.op_def, nullptr, &outputs_range_map));
+  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
   return Status::OK();
 }
 
 Status RegisterFunctionBodyOutputs(const FunctionLibraryDefinition& flib,
                                    const NodeDef& node,
                                    GrapplerFunctionConnectivity* connectivity) {
-  tensorflow::NameRangeMap outputs_range_map;
-  TF_RETURN_IF_ERROR(OutputNameRange(flib, node, &outputs_range_map));
-  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
-  return Status::OK();
+  const OpRegistrationData* registration;
+  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
+  return RegisterFunctionBodyOutputs(*registration, node, connectivity);
 }
 
 // Replace the placeholder attribute values with the values specified in
@@ -306,26 +305,35 @@ GrapplerFunctionItem::GrapplerFunctionItem(
     const string& func_name, const AttrValueMap& func_attr,
     const std::vector<InputArgExpansion>& input_arg_expansions,
     const std::vector<OutputArgExpansion>& output_arg_expansions,
+    const std::vector<string>& keep_nodes, bool is_stateful,
     GraphDef&& function_body)
     : func_attr_(func_attr),
       input_arg_expansions_(input_arg_expansions),
-      output_arg_expansions_(output_arg_expansions) {
+      output_arg_expansions_(output_arg_expansions),
+      is_stateful_(is_stateful) {
   id = func_name;
-  // Fill the feed nodes with input placeholders
+  keep_ops = keep_nodes;
+  // Swap the graph body.
+  graph.Swap(&function_body);
+  // Fill the feed nodes with input placeholders.
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
       feed.emplace_back(placeholder, Tensor());
       input_arg_placeholders_.insert(placeholder);
     }
   }
-  // Fill the fetch nodes with outputs
+  // Fill the fetch nodes with outputs.
   for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
     for (const string& output_tensor : output_arg.output_tensors) {
       fetch.push_back(output_tensor);
     }
   }
-  // Swap the graph body
-  graph.Swap(&function_body);
+  // Stateful and Send (it's not stateful) nodes must be preserved in the graph.
+  for (const NodeDef& node : graph.node()) {
+    if (IsSend(node)) {
+      keep_ops.push_back(node.name());
+    }
+  }
 }
 
 const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
@@ -365,6 +373,8 @@ const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
 
 GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; }
 
+bool GrapplerFunctionItem::is_stateful() const { return is_stateful_; }
+
 GrapplerFunctionItem& GrapplerFunctionItem::SwapFunctionBody(GraphDef&& other) {
   graph.Swap(&other);
   return *this;
@@ -380,6 +390,33 @@ std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
   return output_tensors;
 }
 
+bool HasParametrizedType(const FunctionDef& func) {
+  const auto is_type_parametrized = [](const OpDef::ArgDef& arg) {
+    return !arg.type_attr().empty() || !arg.number_attr().empty() ||
+           !arg.type_list_attr().empty();
+  };
+
+  const auto& input = func.signature().input_arg();
+  const auto& output = func.signature().output_arg();
+  return std::any_of(input.begin(), input.end(), is_type_parametrized) ||
+         std::any_of(output.begin(), output.end(), is_type_parametrized);
+}
+
+bool HasParametrizedBody(const FunctionDef& func) {
+  const auto is_parametrized = [&](const NodeDef& node) {
+    for (const auto& attr : node.attr()) {
+      if (!attr.second.placeholder().empty()) return true;
+    }
+    return false;
+  };
+  return std::any_of(func.node_def().begin(), func.node_def().end(),
+                     is_parametrized);
+}
+
+bool IsParametrized(const FunctionDef& func) {
+  return HasParametrizedType(func) || HasParametrizedBody(func);
+}
+
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const AttrValueMap& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
@@ -408,6 +445,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   std::vector<InputArgExpansion> inputs;
   std::vector<OutputArgExpansion> outputs;
+  std::vector<string> keep_nodes;
 
   // Function body shares the library with the graph that instantiated it.
   GraphDef function_body;
@@ -444,6 +482,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
     InputArgExpansion input_expansion{/*input_name=*/input.name(),
                                       /*data_type=*/input_data_type,
+                                      /*is_ref*/ input.is_ref(),
                                       /*placeholders=*/{input.name()}};
     connectivity.RegisterInputArgExpansion(input_expansion);
     inputs.push_back(input_expansion);
@@ -454,12 +493,21 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     NodeDef* new_node = function_body.add_node();
     *new_node = func_def_node;
 
+    const OpRegistrationData* registration;
+    TF_RETURN_IF_ERROR(flib.LookUp(func_def_node.op(), &registration));
+
     // Resolve all placeholder values using function instantiation attributes.
     TF_RETURN_IF_ERROR(ResolveFunctionBodyNodeAttrPlaceholders(
         func_instantiation_attr, new_node));
+
     // Register node output range in a function connectivity.
-    TF_RETURN_IF_ERROR(
-        RegisterFunctionBodyOutputs(flib, func_def_node, &connectivity));
+    TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
+                                                   &connectivity));
+
+    // Stateful and Send nodes must be preserved in a function body
+    if (registration->op_def.is_stateful() || IsSend(func_def_node)) {
+      keep_nodes.push_back(func_def_node.name());
+    }
   }
 
   // Rewrite inputs to use GraphDef format
@@ -483,19 +531,22 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
     OutputArgExpansion output{/*output_name=*/out.name(),
                               /*data_type=*/output_data_type,
+                              /*is_ref=*/out.is_ref(),
                               /*output_tensors=*/output_tensors};
     outputs.push_back(output);
   }
 
+  bool is_stateful = signature.is_stateful();
+
   *item = GrapplerFunctionItem(
       /*func_name=*/signature.name(),
       /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
-      inputs, outputs, std::move(function_body));
+      inputs, outputs, keep_nodes, is_stateful, std::move(function_body));
   return Status::OK();
 }
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
-// in the GrapplerFunctionConnectivity
+// in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity) {
@@ -513,6 +564,7 @@ Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
                                   const FunctionLibraryDefinition& flib,
                                   FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
+  func->mutable_signature()->set_is_stateful(item.is_stateful());
 
   // Build a GrapplerFunctionConnectivity from inputs and new function body.
   GrapplerFunctionConnectivity connectivity;
@@ -524,6 +576,7 @@ Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
     OpDef::ArgDef arg_def;
     arg_def.set_name(input_arg.input_name);
     arg_def.set_type(input_arg.data_type);
+    arg_def.set_is_ref(input_arg.is_ref);
     *func->mutable_signature()->add_input_arg() = arg_def;
   }
 
@@ -532,6 +585,7 @@ Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
     OpDef::ArgDef arg_def;
     arg_def.set_name(output_arg.output_name);
     arg_def.set_type(output_arg.data_type);
+    arg_def.set_is_ref(output_arg.is_ref);
     *func->mutable_signature()->add_output_arg() = arg_def;
 
     CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 2ac3917a66..ab369bcad7 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -41,6 +41,7 @@ struct InputArgExpansion {
   // different data types
   string input_name;                 // name of the function input argument
   DataType data_type;                // input data type
+  bool is_ref;                       // if true, inputs are required to be refs
   std::vector<string> placeholders;  // names of placeholder nodes in the
                                      // function body
 };
@@ -55,6 +56,7 @@ struct OutputArgExpansion {
   // different data types
   string output_name;                  // name of the function output argument
   DataType data_type;                  // output data type
+  bool is_ref;                         // if true, outputs are refs
   std::vector<string> output_tensors;  // names of output tensor from the
                                        // function body nodes
 };
@@ -136,6 +138,7 @@ class GrapplerFunctionItem : public GrapplerItem {
       const string& func_name, const AttrValueMap& func_attr,
       const std::vector<InputArgExpansion>& input_arg_expansions,
       const std::vector<OutputArgExpansion>& output_arg_expansions,
+      const std::vector<string>& keep_nodes, bool is_stateful,
       GraphDef&& function_body);
 
   bool IsInputPlaceholder(const string& node_name) const;
@@ -152,6 +155,8 @@ class GrapplerFunctionItem : public GrapplerItem {
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
 
+  bool is_stateful() const;
+
   GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
 
  private:
@@ -162,11 +167,25 @@ class GrapplerFunctionItem : public GrapplerItem {
   std::vector<OutputArgExpansion> output_arg_expansions_;
 
   std::set<string> input_arg_placeholders_;
+
+  bool is_stateful_;
 };
 
 // Return all output tensors referenced by item output args.
 std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
 
+// Check if function input/output types are fully defined only at instantiation
+// time (parametrized by it's instantiation node).
+bool HasParametrizedType(const FunctionDef& func);
+
+// Check if a function body is parametrized by it's instantiation node. Function
+// body is parametrized, if it has at least one node with a 'placeholder'
+// attribute.
+bool HasParametrizedBody(const FunctionDef& func);
+
+// Check if function has parametrized type or body.
+bool IsParametrized(const FunctionDef& func);
+
 // Make a GrapplerFunctionItem from the function definition and attributes.
 // Return error if the given function def cannot be converted.
 Status MakeGrapplerFunctionItem(
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index a9a708bf67..54d235a8a4 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -30,12 +30,37 @@ namespace {
 
 class FunctionsTest : public ::testing::Test {};
 
+TEST_F(FunctionsTest, IsParametrized) {
+  // Function is defined for multiple input types.
+  FunctionDef parametrized_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  // Function is defined just for float inputs.
+  FunctionDef non_parametrized_func = FunctionDefHelper::Create(
+      "MyMul", {"x:float", "y:float"}, {"z:float"}, {},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  EXPECT_TRUE(HasParametrizedType(parametrized_func));
+  EXPECT_TRUE(HasParametrizedBody(parametrized_func));
+  EXPECT_TRUE(IsParametrized(parametrized_func));
+
+  EXPECT_FALSE(HasParametrizedType(non_parametrized_func));
+  EXPECT_FALSE(HasParametrizedBody(non_parametrized_func));
+  EXPECT_FALSE(IsParametrized(non_parametrized_func));
+}
+
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
   connectivity.RegisterInputArgExpansion(
-      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
 
   connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
   connectivity.RegisterFunctionBodyOutputs("Func",
@@ -98,9 +123,10 @@ TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
   connectivity.RegisterInputArgExpansion(
-      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
 
   connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
   connectivity.RegisterFunctionBodyOutputs("Func",
@@ -136,9 +162,10 @@ TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
 TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandNodeInputs) {
   GrapplerFunctionConnectivity connectivity;
 
-  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
   connectivity.RegisterInputArgExpansion(
-      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+      {"inputA", DT_FLOAT, /*is_ref=*/false, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, /*is_ref=*/false, {"inputB_0", "inputB_1"}});
 
   NodeDef node;
   node.add_input("inputA:0");
-- 
GitLab


From ac4717707dc3c9d1441ffe85d6563e868f9677e3 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 19:38:30 +0000
Subject: [PATCH 1048/1262] Fix issue for float16 data type with reuse in
 CudnnLSTM

This fix tries to address the issue raised in 18699 where
for float16 data type, the reuse in CudnnLSTM throws a ValueError.

This fix fixes the issue by passing the data type. This fix
fixes 18699.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 00d9544602..1b8614899f 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -358,7 +358,7 @@ class _CudnnRNN(base_layer.Layer):
             "CUDA/CuDNN generations.")
       # Initialize opaque params with a tensor.
       self.kernel = vs.get_variable(
-          "opaque_kernel", initializer=opaque_params_t, validate_shape=False)
+          "opaque_kernel", dtype=self._plain_dtype, initializer=opaque_params_t, validate_shape=False)
     # Create saveable in the outer scope of the cudnn subgraph, such that
     # alternative subgraph with platform-independent rnn cells can load the
     # checkpoints directly.
-- 
GitLab


From d84768bcbf4530a77acf0853c6f8ffc72caffc19 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 19:49:26 +0000
Subject: [PATCH 1049/1262] Update comment in the test

`checked is done in kernel.` -> `checked in kernel.`

for review feedback.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/manip_ops_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index 7cc4bf61ba..f31426713c 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -107,7 +107,7 @@ class RollTest(test_util.TensorFlowTestCase):
       manip_ops.roll(7, 1, 0)
 
   def testRollInputMustVectorHigherRaises(self):
-    # The input should be 1-D or higher, checked is done in kernel.
+    # The input should be 1-D or higher, checked in kernel.
     tensor = array_ops.placeholder(dtype=dtypes.int32)
     shift = 1
     axis = 0
-- 
GitLab


From 8b1c3049028d1c25d7f4acc3af794918d64aafdf Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Wed, 18 Apr 2018 12:51:56 -0700
Subject: [PATCH 1050/1262] Moving all state (variables) required for
 _EmbeddingColumn and _SharedEmbeddingColumn into a base.Layer

PiperOrigin-RevId: 193401873
---
 .../python/feature_column/feature_column.py   | 337 +++++++++---------
 .../feature_column/feature_column_test.py     | 280 +++++++--------
 2 files changed, 293 insertions(+), 324 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index f9201a4794..0ad8131599 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -135,6 +135,7 @@ import numpy as np
 import six
 
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -462,6 +463,16 @@ def linear_model(features,
     return predictions
 
 
+def _add_to_collections(var, weight_collections):
+  # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
+  # so that we don't have to do this check.
+  if isinstance(var, variables.PartitionedVariable):
+    for constituent_var in list(var):
+      ops.add_to_collections(weight_collections, constituent_var)
+  else:
+    ops.add_to_collections(weight_collections, var)
+
+
 class _FCLinearWrapper(base.Layer):
   """Wraps a _FeatureColumn in a layer for use in a linear model.
 
@@ -482,12 +493,8 @@ class _FCLinearWrapper(base.Layer):
     self._units = units
     self._sparse_combiner = sparse_combiner
     self._weight_collections = weight_collections
-    self._state = {}
 
   def build(self, _):
-    self._state = self._feature_column._create_state(  # pylint: disable=protected-access
-        self._weight_collections, self.add_variable)
-
     if isinstance(self._feature_column, _CategoricalColumn):
       weight = self.add_variable(
           name='weights',
@@ -501,7 +508,7 @@ class _FCLinearWrapper(base.Layer):
           shape=[num_elements, self._units],
           initializer=init_ops.zeros_initializer(),
           trainable=self.trainable)
-    ops.add_to_collections(self._weight_collections, weight)
+    _add_to_collections(weight, self._weight_collections)
     self._weight_var = weight
     self.built = True
 
@@ -513,8 +520,7 @@ class _FCLinearWrapper(base.Layer):
         sparse_combiner=self._sparse_combiner,
         weight_collections=self._weight_collections,
         trainable=self.trainable,
-        weight_var=self._weight_var,
-        state=self._state)
+        weight_var=self._weight_var)
     return weighted_sum
 
 
@@ -538,7 +544,7 @@ class _BiasLayer(base.Layer):
         shape=[self._units],
         initializer=init_ops.zeros_initializer(),
         trainable=self.trainable)
-    ops.add_to_collections(self._weight_collections, self._bias_variable)
+    _add_to_collections(self._bias_variable, self._weight_collections)
     self.built = True
 
   def call(self, _):
@@ -806,11 +812,22 @@ def embedding_column(
     initializer = init_ops.truncated_normal_initializer(
         mean=0.0, stddev=1 / math.sqrt(dimension))
 
+  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
+
+  def _creator(weight_collections, scope):
+    embedding_column_layer = _EmbeddingColumnLayer(
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        name='embedding_column_layer')
+    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
+
   return _EmbeddingColumn(
       categorical_column=categorical_column,
       dimension=dimension,
       combiner=combiner,
-      initializer=initializer,
+      layer_creator=_creator,
       ckpt_to_load_from=ckpt_to_load_from,
       tensor_name_in_ckpt=tensor_name_in_ckpt,
       max_norm=max_norm,
@@ -933,6 +950,7 @@ def shared_embedding_columns(
   sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
 
   c0 = sorted_columns[0]
+  num_buckets = c0._num_buckets  # pylint: disable=protected-access
   if not isinstance(c0, _CategoricalColumn):
     raise ValueError(
         'All categorical_columns must be subclasses of _CategoricalColumn. '
@@ -948,23 +966,45 @@ def shared_embedding_columns(
           'the same type, or be weighted_categorical_column of the same type. '
           'Given column: {} of type: {} does not match given column: {} of '
           'type: {}'.format(c0, type(c0), c, type(c)))
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
 
   if not shared_embedding_collection_name:
     shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
     shared_embedding_collection_name += '_shared_embedding'
 
+  # Create the state (_SharedEmbeddingColumnLayer) here.
+  embedding_shape = num_buckets, dimension
+
+  shared_embedding_column_layer = _EmbeddingColumnLayer(
+      embedding_shape=embedding_shape,
+      initializer=initializer,
+      weight_collections=[],
+      trainable=trainable,
+      name=shared_embedding_collection_name)
+
   result = []
   for column in categorical_columns:
-    result.append(_SharedEmbeddingColumn(
-        categorical_column=column,
-        dimension=dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable))
+    result.append(
+        _SharedEmbeddingColumn(
+            categorical_column=column,
+            initializer=initializer,
+            dimension=dimension,
+            combiner=combiner,
+            var_scope_name=shared_embedding_collection_name,
+            ckpt_to_load_from=ckpt_to_load_from,
+            tensor_name_in_ckpt=tensor_name_in_ckpt,
+            max_norm=max_norm,
+            trainable=trainable))
+
+  for single_result in result:
+    single_result._set_layer(shared_embedding_column_layer)  # pylint: disable=protected-access
+    single_result._set_all_columns(result)  # pylint: disable=protected-access
+
   return result
 
 
@@ -1721,6 +1761,57 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
       hash_key=hash_key)
 
 
+# TODO(rohanj): Clearly define semantics of this layer.
+class _EmbeddingColumnLayer(base.Layer):
+  """A layer that stores all the state required for a embedding column."""
+
+  def __init__(self,
+               embedding_shape,
+               initializer,
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    """Constructor.
+
+    Args:
+      embedding_shape: Shape of the embedding variable used for lookup.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. If not specified, defaults to
+        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+        `1/sqrt(dimension)`.
+      weight_collections: A list of collection names to which the Variable will
+        be added. Note that, variables will also be added to collections
+        `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+      trainable: If `True` also add the variable to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      name: Name of the layer
+      **kwargs: keyword named properties.
+    """
+    super(_EmbeddingColumnLayer, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._embedding_shape = embedding_shape
+    self._initializer = initializer
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._embedding_weight_var = self.add_variable(
+        name='embedding_weights',
+        shape=self._embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self._initializer,
+        trainable=self.trainable)
+    # self.add_variable already appends to GLOBAL_VARIABLES collection.
+    if self._weight_collections and not context.executing_eagerly():
+      for weight_collection in self._weight_collections:
+        if weight_collection != ops.GraphKeys.GLOBAL_VARIABLES:
+          _add_to_collections(self._embedding_weight_var, [weight_collection])
+    self.built = True
+
+  def call(self, _):
+    return self._embedding_weight_var
+
+
 class _FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -1794,18 +1885,13 @@ class _FeatureColumn(object):
     """
     pass
 
-  def _create_state(self, weight_collections=None, creator=None):
-    """Returns an object that captures the state of the column.
+  def _reset_config(self):
+    """Resets the configuration in the column.
 
-    Args:
-      weight_collections: Collections to add the variable to
-      creator: Variable creator method called, if provided.
-
-    Returns:
-      An object that encapsulates the state of the column. Can return None.
+    Some feature columns e.g. embedding or shared embedding columns might
+    have some state that is needed to be reset sometimes. Use this method
+    in that scenario.
     """
-    del weight_collections, creator  # Unused
-    return None
 
 
 class _DenseColumn(_FeatureColumn):
@@ -1826,11 +1912,7 @@ class _DenseColumn(_FeatureColumn):
     pass
 
   @abc.abstractmethod
-  def _get_dense_tensor(self,
-                        inputs,
-                        weight_collections=None,
-                        trainable=None,
-                        state=None):
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     """Returns a `Tensor`.
 
     The output of this function will be used by model-builder-functions. For
@@ -1848,9 +1930,6 @@ class _DenseColumn(_FeatureColumn):
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
-      state: An object encapsulating the state of the column. Columns that
-        create state using the _create_state method would have that state
-        passed in to this method.
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -1864,8 +1943,7 @@ def _create_weighted_sum(column,
                          sparse_combiner,
                          weight_collections,
                          trainable,
-                         weight_var=None,
-                         state=None):
+                         weight_var=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
@@ -1883,8 +1961,7 @@ def _create_weighted_sum(column,
         units=units,
         weight_collections=weight_collections,
         trainable=trainable,
-        weight_var=weight_var,
-        state=state)
+        weight_var=weight_var)
 
 
 def _create_dense_column_weighted_sum(column,
@@ -1892,20 +1969,12 @@ def _create_dense_column_weighted_sum(column,
                                       units,
                                       weight_collections,
                                       trainable,
-                                      weight_var=None,
-                                      state=None):
+                                      weight_var=None):
   """Create a weighted sum of a dense column for linear_model."""
-  if state is not None:
-    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-        builder,
-        weight_collections=weight_collections,
-        trainable=trainable,
-        state=state)
-  else:
-    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-        builder,
-        weight_collections=weight_collections,
-        trainable=trainable)
+  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
@@ -2368,10 +2437,10 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
 
 class _EmbeddingColumn(
     _DenseColumn, _SequenceDenseColumn,
-    collections.namedtuple('_EmbeddingColumn', (
-        'categorical_column', 'dimension', 'combiner', 'initializer',
-        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
-    ))):
+    collections.namedtuple(
+        '_EmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
+         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2393,33 +2462,10 @@ class _EmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _create_state(self, weight_collections=None, creator=None):
-    variables_map = {}
-    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    if creator is not None:
-      embedding_weights = creator(
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer,
-          trainable=self.trainable)
-      ops.add_to_collections(weight_collections, embedding_weights)
-    else:
-      embedding_weights = variable_scope.get_variable(
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer,
-          trainable=self.trainable,
-          collections=weight_collections)
-    variables_map['embedding_weights'] = embedding_weights
-    return variables_map
-
   def _get_dense_tensor_internal(self,
                                  inputs,
                                  weight_collections=None,
-                                 trainable=None,
-                                 state=None):
+                                 trainable=None):
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -2427,9 +2473,9 @@ class _EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    if state is None:
-      state = self._create_state(weight_collections)
-    embedding_weights = state['embedding_weights']
+    embedding_weights = self.layer_creator(
+        weight_collections=weight_collections,
+        scope=variable_scope.get_variable_scope())
 
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
@@ -2448,11 +2494,7 @@ class _EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
-  def _get_dense_tensor(self,
-                        inputs,
-                        weight_collections=None,
-                        trainable=None,
-                        state=None):
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
           'In embedding_column: {}. '
@@ -2467,8 +2509,7 @@ class _EmbeddingColumn(
     return self._get_dense_tensor_internal(
         inputs=inputs,
         weight_collections=weight_collections,
-        trainable=trainable,
-        state=state)
+        trainable=trainable)
 
   def _get_sequence_dense_tensor(
       self, inputs, weight_collections=None, trainable=None):
@@ -2492,13 +2533,20 @@ class _EmbeddingColumn(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
+def _get_graph_for_variable(var):
+  if isinstance(var, variables.PartitionedVariable):
+    return list(var)[0].graph
+  else:
+    return var.graph
+
+
 class _SharedEmbeddingColumn(
     _DenseColumn,
-    collections.namedtuple('_SharedEmbeddingColumn', (
-        'categorical_column', 'dimension', 'combiner', 'initializer',
-        'shared_embedding_collection_name', 'ckpt_to_load_from',
-        'tensor_name_in_ckpt', 'max_norm', 'trainable'
-    ))):
+    collections.namedtuple(
+        '_SharedEmbeddingColumn',
+        ('categorical_column', 'dimension', 'combiner', 'initializer',
+         'var_scope_name', 'ckpt_to_load_from', 'tensor_name_in_ckpt',
+         'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
   @property
@@ -2509,7 +2557,7 @@ class _SharedEmbeddingColumn(
 
   @property
   def _var_scope_name(self):
-    return self.shared_embedding_collection_name
+    return self.var_scope_name
 
   @property
   def _parse_example_spec(self):
@@ -2518,45 +2566,29 @@ class _SharedEmbeddingColumn(
   def _transform_feature(self, inputs):
     return inputs.get(self.categorical_column)
 
+  def _set_layer(self, layer):
+    self._layer = layer
+
+  def _set_all_columns(self, all_columns):
+    self._all_columns = all_columns
+
+  def _reset_config(self):
+    config = self._layer.get_config()
+    config['embedding_shape'] = (
+        self.categorical_column._num_buckets,  # pylint: disable=protected-access
+        self.dimension)
+    config['initializer'] = self.initializer
+    self._layer = self._layer.__class__.from_config(config)
+    for column in self._all_columns:
+      column._set_layer(self._layer)  # pylint: disable=protected-access
+
   @property
   def _variable_shape(self):
     if not hasattr(self, '_shape'):
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _create_state(self, weight_collections=None, creator=None):
-    variables_map = {}
-    shared_embedding_collection = ops.get_collection(
-        self.shared_embedding_collection_name)
-    if not shared_embedding_collection:
-      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-      if creator is not None:
-        embedding_weights = creator(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable)
-        ops.add_to_collections(weight_collections, embedding_weights)
-      else:
-        embedding_weights = variable_scope.get_variable(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable,
-            collections=weight_collections)
-      ops.add_to_collection(self.shared_embedding_collection_name,
-                            embedding_weights)
-      variables_map['embedding_weights'] = embedding_weights
-
-    return variables_map
-
-  def _get_dense_tensor(self,
-                        inputs,
-                        weight_collections=None,
-                        trainable=None,
-                        state=None):
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     # This method is called from a variable_scope with name _var_scope_name,
     # which is shared among all shared embeddings. Open a name_scope here, so
     # that the ops for different columns have distinct names.
@@ -2567,38 +2599,17 @@ class _SharedEmbeddingColumn(
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
-      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-      shared_embedding_collection = ops.get_collection(
-          self.shared_embedding_collection_name)
-      if shared_embedding_collection:
-        if len(shared_embedding_collection) > 1:
-          raise ValueError(
-              'Collection {} can only contain one variable. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(shared_embedding_collection))
-        embedding_weights = shared_embedding_collection[0]
-        if embedding_weights.get_shape() != embedding_shape:
-          raise ValueError(
-              'Shared embedding collection {} contains variable {} of '
-              'unexpected shape {}. Expected shape is {}. '
-              'Suggested fix A: Choose a unique name for this collection. '
-              'Suggested fix B: Do not add any variables to this collection. '
-              'The feature_column library already adds a variable under the '
-              'hood.'.format(
-                  self.shared_embedding_collection_name, embedding_weights.name,
-                  embedding_weights.get_shape(), embedding_shape))
-      else:
-        embedding_weights = variable_scope.get_variable(
-            name='embedding_weights',
-            shape=embedding_shape,
-            dtype=dtypes.float32,
-            initializer=self.initializer,
-            trainable=self.trainable and trainable,
-            collections=weight_collections)
-        ops.add_to_collection(
-            self.shared_embedding_collection_name, embedding_weights)
+      embedding_weights = self._layer(
+          None, scope=variable_scope.get_variable_scope())
+      # If we're in graph mode and this is called with a different graph,
+      # then we should reset.
+      if not context.executing_eagerly() and (
+          ops.get_default_graph() !=
+          _get_graph_for_variable(embedding_weights)):
+        self._reset_config()
+        embedding_weights = self._layer(
+            None, scope=variable_scope.get_variable_scope())
+
       if self.ckpt_to_load_from is not None:
         to_restore = embedding_weights
         if isinstance(to_restore, variables.PartitionedVariable):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 62718db0e5..46404abadc 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -2885,6 +2885,114 @@ class FunctionalInputLayerTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  def test_multiple_layers_with_same_embedding_column(self):
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+
+    with ops.Graph().as_default():
+      features = {
+          'sparse_feature': [['a'], ['x']],
+      }
+      all_cols = [some_embedding_column]
+      fc.input_layer(features, all_cols)
+      fc.input_layer(features, all_cols)
+      # Make sure that 2 variables get created in this case.
+      self.assertEqual(2, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      expected_var_names = [
+          'input_layer/sparse_feature_embedding/embedding_weights:0',
+          'input_layer_1/sparse_feature_embedding/embedding_weights:0'
+      ]
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_multiple_layers_with_same_shared_embedding_column(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      all_cols = [embedding_column_a, embedding_column_b]
+      fc.input_layer(features, all_cols)
+      fc.input_layer(features, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+    all_cols = [embedding_column_a, embedding_column_b]
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc.input_layer(features, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+
+    with ops.Graph().as_default():
+      features1 = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+
+      fc.input_layer(features1, all_cols)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
   def test_with_numpy_input_fn(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
@@ -4504,7 +4612,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('mean', embedding_column.combiner)
-    self.assertIsNotNone(embedding_column.initializer)
     self.assertIsNone(embedding_column.ckpt_to_load_from)
     self.assertIsNone(embedding_column.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column.max_norm)
@@ -4529,7 +4636,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
-    self.assertEqual('my_initializer', embedding_column.initializer())
     self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column.max_norm)
@@ -4560,7 +4666,6 @@ class EmbeddingColumnTest(test.TestCase):
 
       self.assertEqual(embedding_dimension, embedding_column.dimension)
       self.assertEqual('my_combiner', embedding_column.combiner)
-      self.assertEqual('my_initializer', embedding_column.initializer())
       self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column.max_norm)
@@ -4675,72 +4780,6 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
-  def test_get_dense_tensor_with_state(self):
-    # Inputs.
-    vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 5))
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 5.),  # id 1
-        (7., 11.)  # id 2
-    )
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return embedding_values
-
-    # Expected lookup result, using combiner='mean'.
-    expected_lookups = (
-        # example 0, ids [2], embedding = [7, 11]
-        (7., 11.),
-        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
-        (2., 3.5),
-        # example 2, ids [], embedding = [0, 0]
-        (0., 0.),
-        # example 3, ids [1], embedding = [3, 5]
-        (3., 5.),
-    )
-
-    # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column,
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    # Create embedding_weights variable.
-    weight_collections = [
-        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
-    ]
-    state = embedding_column._create_state(weight_collections)
-
-    # Provide sparse input and get dense result.
-    embedding_lookup = embedding_column._get_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input
-        }), state=state)
-
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
-
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -4795,8 +4834,8 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
@@ -4823,8 +4862,9 @@ class EmbeddingColumnTest(test.TestCase):
         }), weight_collections=('my_vars',))
 
     # Assert expected embedding variable and lookups.
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     my_vars = ops.get_collection('my_vars')
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in my_vars]))
@@ -5243,14 +5283,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column_b.dimension)
     self.assertEqual('mean', embedding_column_a.combiner)
     self.assertEqual('mean', embedding_column_b.combiner)
-    self.assertIsNotNone(embedding_column_a.initializer)
-    self.assertIsNotNone(embedding_column_b.initializer)
     self.assertIsNone(embedding_column_a.ckpt_to_load_from)
     self.assertIsNone(embedding_column_b.ckpt_to_load_from)
     self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_a.shared_embedding_collection_name)
+                     embedding_column_a.var_scope_name)
     self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_b.shared_embedding_collection_name)
+                     embedding_column_b.var_scope_name)
     self.assertIsNone(embedding_column_a.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_b.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_a.max_norm)
@@ -5296,12 +5334,10 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column_b.dimension)
     self.assertEqual('my_combiner', embedding_column_a.combiner)
     self.assertEqual('my_combiner', embedding_column_b.combiner)
-    self.assertEqual('my_initializer', embedding_column_a.initializer())
-    self.assertEqual('my_initializer', embedding_column_b.initializer())
     self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_a.shared_embedding_collection_name)
+                     embedding_column_a.var_scope_name)
     self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_b.shared_embedding_collection_name)
+                     embedding_column_b.var_scope_name)
     self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
     self.assertEqual('my_ckpt', embedding_column_b.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
@@ -5351,9 +5387,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
 
       self.assertEqual(embedding_dimension, embedding_column_a.dimension)
       self.assertEqual('my_combiner', embedding_column_a.combiner)
-      self.assertEqual('my_initializer', embedding_column_a.initializer())
       self.assertEqual('shared_embedding_collection_name',
-                       embedding_column_a.shared_embedding_collection_name)
+                       embedding_column_a.var_scope_name)
       self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column_a.max_norm)
@@ -5537,80 +5572,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
       self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
-  def test_get_dense_tensor_with_state(self):
-    # Inputs.
-    vocabulary_size = 3
-    # -1 values are ignored.
-    input_a = np.array([
-        [2, -1, -1],  # example 0, ids [2]
-        [0, 1, -1]
-    ])  # example 1, ids [0, 1]
-    input_b = np.array([
-        [0, -1, -1],  # example 0, ids [0]
-        [-1, -1, -1]
-    ])  # example 1, ids []
-    input_features = {'aaa': input_a, 'bbb': input_b}
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 5.),  # id 1
-        (7., 11.)  # id 2
-    )
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return embedding_values
-
-    # Expected lookup result, using combiner='mean'.
-    expected_lookups_a = (
-        # example 0:
-        (7., 11.),  # ids [2], embedding = [7, 11]
-        # example 1:
-        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
-    )
-    expected_lookups_b = (
-        # example 0:
-        (1., 2.),  # ids [0], embedding = [1, 2]
-        # example 1:
-        (0., 0.),  # ids [], embedding = [0, 0]
-    )
-
-    # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
-        [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    # Create state.
-    weight_collections = [
-        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
-    ]
-    state = embedding_column_a._create_state(weight_collections)
-
-    # Provide sparse input and get dense result.
-    embedding_lookup_a = embedding_column_a._get_dense_tensor(
-        _LazyBuilder(input_features), state=state)
-    embedding_lookup_b = embedding_column_b._get_dense_tensor(
-        _LazyBuilder(input_features), state=state)
-
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    embedding_var = global_vars[0]
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
-
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -5912,10 +5873,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           tuple([v.name for v in trainable_vars]))
     else:
       self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
-    shared_embedding_vars = ops.get_collection('aaa_bbb_shared_embedding')
-    self.assertItemsEqual(
-        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-        tuple([v.name for v in shared_embedding_vars]))
+    shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
       self.assertAllEqual(expected_lookups, input_layer.eval())
-- 
GitLab


From 3836be5716b19708df75229ae9f8712f669205ae Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 19:41:46 +0000
Subject: [PATCH 1051/1262] Fix pylint `Line too long (102/80)`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 1b8614899f..d58198faf3 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -358,7 +358,8 @@ class _CudnnRNN(base_layer.Layer):
             "CUDA/CuDNN generations.")
       # Initialize opaque params with a tensor.
       self.kernel = vs.get_variable(
-          "opaque_kernel", dtype=self._plain_dtype, initializer=opaque_params_t, validate_shape=False)
+          "opaque_kernel", dtype=self._plain_dtype,
+          initializer=opaque_params_t, validate_shape=False)
     # Create saveable in the outer scope of the cudnn subgraph, such that
     # alternative subgraph with platform-independent rnn cells can load the
     # checkpoints directly.
-- 
GitLab


From f4c6a318eb9eb01440c313a4fc423ac267fdb74e Mon Sep 17 00:00:00 2001
From: Stanley Bileschi <bileschi@google.com>
Date: Wed, 18 Apr 2018 13:12:04 -0700
Subject: [PATCH 1052/1262] Improves error messaging for bad (empty) CSV files.

PiperOrigin-RevId: 193404804
---
 tensorflow/contrib/data/python/ops/readers.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 4ec8ae1c79..bbb808fbd7 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -156,12 +156,21 @@ def _infer_column_names(filenames, field_delim, use_quote_delim):
       "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE
   }
   with file_io.FileIO(filenames[0], "r") as f:
-    column_names = next(csv.reader(f, **csv_kwargs))
+    try:
+      column_names = next(csv.reader(f, **csv_kwargs))
+    except StopIteration:
+      raise ValueError(("Received StopIteration when reading the header line "
+                        "of %s.  Empty file?") % filenames[0])
 
   for name in filenames[1:]:
     with file_io.FileIO(name, "r") as f:
-      if next(csv.reader(f, **csv_kwargs)) != column_names:
-        raise ValueError("Files have different column names in the header row.")
+      try:
+        if next(csv.reader(f, **csv_kwargs)) != column_names:
+          raise ValueError(
+              "Files have different column names in the header row.")
+      except StopIteration:
+        raise ValueError(("Received StopIteration when reading the header line "
+                          "of %s.  Empty file?") % filenames[0])
   return column_names
 
 
-- 
GitLab


From f28342c2caab42987e6761abeca84ba3147cddba Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 13:13:02 -0700
Subject: [PATCH 1053/1262] Prevent access to deallocated hash map upon exit().

PiperOrigin-RevId: 193404950
---
 tensorflow/core/lib/strings/numbers.cc | 33 ++++++++++++++++----------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 8f34baa7de..c296daa95d 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -33,19 +33,26 @@ namespace tensorflow {
 
 namespace {
 
+template <typename T>
+const std::unordered_map<string, T>* GetSpecialNumsSingleton() {
+  static const std::unordered_map<string, T>* special_nums =
+      CHECK_NOTNULL((new const std::unordered_map<string, T>{
+          {"inf", std::numeric_limits<T>::infinity()},
+          {"+inf", std::numeric_limits<T>::infinity()},
+          {"-inf", -std::numeric_limits<T>::infinity()},
+          {"infinity", std::numeric_limits<T>::infinity()},
+          {"+infinity", std::numeric_limits<T>::infinity()},
+          {"-infinity", -std::numeric_limits<T>::infinity()},
+          {"nan", std::numeric_limits<T>::quiet_NaN()},
+          {"+nan", std::numeric_limits<T>::quiet_NaN()},
+          {"-nan", -std::numeric_limits<T>::quiet_NaN()},
+      }));
+  return special_nums;
+}
+
 template <typename T>
 T locale_independent_strtonum(const char* str, const char** endptr) {
-  static const std::unordered_map<string, T> special_nums = {
-      {"inf", std::numeric_limits<T>::infinity()},
-      {"+inf", std::numeric_limits<T>::infinity()},
-      {"-inf", -std::numeric_limits<T>::infinity()},
-      {"infinity", std::numeric_limits<T>::infinity()},
-      {"+infinity", std::numeric_limits<T>::infinity()},
-      {"-infinity", -std::numeric_limits<T>::infinity()},
-      {"nan", std::numeric_limits<T>::quiet_NaN()},
-      {"+nan", std::numeric_limits<T>::quiet_NaN()},
-      {"-nan", -std::numeric_limits<T>::quiet_NaN()},
-  };
+  auto special_nums = GetSpecialNumsSingleton<T>();
   std::stringstream s(str);
 
   // Check if str is one of the special numbers.
@@ -57,8 +64,8 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
         std::tolower(special_num_str[i], std::locale::classic());
   }
 
-  auto entry = special_nums.find(special_num_str);
-  if (entry != special_nums.end()) {
+  auto entry = special_nums->find(special_num_str);
+  if (entry != special_nums->end()) {
     *endptr = str + (s.eof() ? static_cast<std::iostream::pos_type>(strlen(str))
                              : s.tellg());
     return entry->second;
-- 
GitLab


From 6e87115add98695862343539f383bf82e5cacf32 Mon Sep 17 00:00:00 2001
From: Sami Kama <samikama@users.noreply.github.com>
Date: Wed, 18 Apr 2018 13:30:10 -0700
Subject: [PATCH 1054/1262] Configurable Custom graph optimizers for grappler
 (#18479)

* Adding a new field for configurable custom optimizers

* Pass configuration structure to custom optimizers

* Reviewer requests

* Style fixes

* Fix tests
---
 .../optimizers/custom_graph_optimizer.h       |  4 +++-
 .../custom_graph_optimizer_registry_test.cc   |  5 ++++-
 .../grappler/optimizers/meta_optimizer.cc     | 22 ++++++++++++++++++-
 .../optimizers/meta_optimizer_test.cc         |  5 ++++-
 .../core/protobuf/rewriter_config.proto       | 11 ++++++++++
 5 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
index a80d46f416..4d7f8c98d0 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -26,7 +27,8 @@ namespace grappler {
 class CustomGraphOptimizer : public GraphOptimizer {
  public:
   virtual ~CustomGraphOptimizer() {}
-  virtual Status Init() = 0;
+  virtual Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer*
+                          config = nullptr) = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
index 629f5e83c1..bdb1ae8532 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry_test.cc
@@ -32,7 +32,10 @@ static const char* kTestOptimizerName = "Test";
 
 class TestGraphOptimizer : public CustomGraphOptimizer {
  public:
-  Status Init() override { return Status::OK(); }
+  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
+                  nullptr) override {
+    return Status::OK();
+  }
   string name() const override { return kTestOptimizerName; }
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 558b8a77e8..f4bc865657 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -164,6 +164,26 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       TF_RETURN_IF_ERROR(opt->Init());
       optimizers.push_back(std::move(opt));
     }
+
+    // Append custom configurable optimizers.
+    std::vector<tensorflow::RewriterConfig_CustomGraphOptimizer>
+        custom_configurable_optimizers;
+    for (const auto& optimizer : cfg_.custom_optimizers()) {
+      if (available_optimizers.find(optimizer.name()) !=
+          available_optimizers.end()) {
+        optimizers.push_back(NewOptimizer(optimizer.name()));
+      } else {
+        custom_configurable_optimizers.push_back(optimizer);
+      }
+    }
+    // Now initialize and configure the custom optimizers.
+    for (const auto& optimizer : custom_configurable_optimizers) {
+      std::unique_ptr<CustomGraphOptimizer> opt =
+          CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer.name());
+      if (opt == nullptr) continue;
+      TF_RETURN_IF_ERROR(opt->Init(&optimizer));
+      optimizers.push_back(std::move(opt));
+    }
   }
 
   if (optimizers.empty()) {
@@ -253,7 +273,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.auto_parallel().enable() ||
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          cfg.debug_stripper() == RewriterConfig::ON ||
-         !cfg.optimizers().empty();
+         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index d9a386b9be..9fcf07651b 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -36,7 +36,10 @@ class TestOptimizer : public CustomGraphOptimizer {
   TestOptimizer() {}
   string name() const override { return "test_optimizer"; }
 
-  Status Init() override { return Status::OK(); }
+  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
+                  nullptr) override {
+    return Status::OK();
+  }
 
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override {
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 9b6202e7b4..029b27cd04 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -6,6 +6,8 @@ option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
+import "tensorflow/core/framework/attr_value.proto";
+
 message AutoParallelOptions {
   bool enable = 1;
   int32 num_replicas = 2;
@@ -119,4 +121,13 @@ message RewriterConfig {
   // Custom registered optimizers will be run after the base optimizers, in
   // the order that they are specified.
   repeated string optimizers = 100;
+
+  // Message to describe custom graph optimizer and its parameters
+  message CustomGraphOptimizer {
+    string name = 1;
+    map<string, AttrValue> parameter_map = 2;
+  }
+
+  // list of CustomGraphOptimizers to apply.
+  repeated CustomGraphOptimizer custom_optimizers = 200;
 }
-- 
GitLab


From 6fa949afca5f1549f87554475d053c608f0da379 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 13:29:43 -0700
Subject: [PATCH 1055/1262] Catch OSError in gen_git_source.py subprocess call.

OSError occurs if git cannot be found. This is a initial fix for some
Windows build errors.

PiperOrigin-RevId: 193407250
---
 tensorflow/tools/git/gen_git_source.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b80..78d511969e 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -125,7 +125,7 @@ def configure(src_base_path, gen_path, debug=False):
       try:
         # In python 3.5, symlink function exists even on Windows. But requires
         # Windows Admin privileges, otherwise an OSError will be thrown.
-        if hasattr(os, 'symlink'):
+        if hasattr(os, "symlink"):
           os.symlink(src, os.path.join(gen_path, target))
         else:
           shutil.copy2(src, os.path.join(gen_path, target))
@@ -162,7 +162,7 @@ def get_git_version(git_base_path):
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
     return val if val else unknown_label
-  except subprocess.CalledProcessError:
+  except (subprocess.CalledProcessError, OSError):
     return unknown_label
 
 
-- 
GitLab


From 075fbb59d767ae2868c369799d553a953ffb4dad Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 13:49:18 -0700
Subject: [PATCH 1056/1262] Add string_strip to remove leading and trailing
 whitespaces (#18418)

* Add string_strip to remove leading and trailing whitespaces

This fix tries to address the issue raised in 18384 to add
an op tf.string_strip so that the leading and trailing whitespaces
could be removed.

This fix fixes 18384.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add StringStrip op to string_ops.cc

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update Bazel BUILD file

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Sanitize with clang-format

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test cases for string_strip

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update API defs with tensorflow/core/api_def/update_api_def.sh

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix python test error

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update goldens API with

```
bazel-bin/tensorflow/tools/api/tests/api_compatibility_test
           --update_goldens True
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../base_api/api_def_StringStrip.pbtxt        | 16 ++++++
 tensorflow/core/kernels/BUILD                 |  7 +++
 tensorflow/core/kernels/string_strip_op.cc    | 53 ++++++++++++++++++
 tensorflow/core/ops/string_ops.cc             |  5 ++
 tensorflow/python/kernel_tests/BUILD          | 14 +++++
 .../kernel_tests/string_strip_op_test.py      | 56 +++++++++++++++++++
 tensorflow/tools/api/golden/tensorflow.pbtxt  |  4 ++
 7 files changed, 155 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
 create mode 100644 tensorflow/core/kernels/string_strip_op.cc
 create mode 100644 tensorflow/python/kernel_tests/string_strip_op_test.py

diff --git a/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
new file mode 100644
index 0000000000..12fbdfdf3f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "StringStrip"
+  in_arg {
+    name: "input"
+    description: <<END
+A string `Tensor` of any shape.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A string `Tensor` of the same shape as the input.
+END
+  }
+  summary: "Strip leading and trailing whitespaces from the Tensor."
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 47cb344091..835b8bbb47 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4234,6 +4234,7 @@ cc_library(
         ":regex_replace_op",
         ":string_join_op",
         ":string_split_op",
+        ":string_strip_op",
         ":string_to_hash_bucket_op",
         ":substr_op",
     ],
@@ -4278,6 +4279,12 @@ tf_kernel_library(
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "string_strip_op",
+    prefix = "string_strip_op",
+    deps = STRING_DEPS,
+)
+
 tf_kernel_library(
     name = "substr_op",
     prefix = "substr_op",
diff --git a/tensorflow/core/kernels/string_strip_op.cc b/tensorflow/core/kernels/string_strip_op.cc
new file mode 100644
index 0000000000..ae700f4294
--- /dev/null
+++ b/tensorflow/core/kernels/string_strip_op.cc
@@ -0,0 +1,53 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+class StringStripOp : public OpKernel {
+ public:
+  explicit StringStripOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+
+    for (int64 i = 0; i < input.size(); ++i) {
+      StringPiece entry(input(i));
+      str_util::RemoveWhitespaceContext(&entry);
+      output(i) = entry.ToString();
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringStrip").Device(DEVICE_CPU), StringStripOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 05f216a83e..469f193cf4 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -123,6 +123,11 @@ REGISTER_OP("StringSplit")
       return Status::OK();
     });
 
+REGISTER_OP("StringStrip")
+    .Input("input: string")
+    .Output("output: string")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("EncodeBase64")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 210b571449..a8ff9f73ea 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -917,6 +917,20 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "string_strip_op_test",
+    size = "small",
+    srcs = ["string_strip_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "substr_op_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py
new file mode 100644
index 0000000000..30fd477ff4
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_strip_op_test.py
@@ -0,0 +1,56 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_strip_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringStripOpTest(test.TestCase):
+  """ Test cases for tf.string_strip."""
+
+  def test_string_strip(self):
+    strings = ["pigs on the wing", "animals"]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
+
+  def test_string_strip_2d(self):
+    strings = [["pigs on the wing", "animals"],
+               [" hello ", "\n\tworld \r \n"]]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
+                                   [b"hello", b"world"]])
+
+  def test_string_strip_with_empty_strings(self):
+    strings = [" hello ", "", "world ", " \t \r \n "]
+
+    with self.test_session() as sess:
+      output = string_ops.string_strip(strings)
+      output = sess.run(output)
+      self.assertAllEqual(output, [b"hello", b"", b"world", b""])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index c66249999f..0b12bc060e 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -1980,6 +1980,10 @@ tf_module {
     name: "string_split"
     argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
   }
+  member_method {
+    name: "string_strip"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "string_to_hash_bucket"
     argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From 9fe297ffa8133309fe548df3a0208d0ff9305a66 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 18 Apr 2018 13:46:37 -0700
Subject: [PATCH 1057/1262] Internal-only change.

PiperOrigin-RevId: 193409980
---
 tensorflow/compiler/tests/BUILD                  |  2 +-
 .../contrib/data/python/kernel_tests/BUILD       | 15 ++++++++++++---
 tensorflow/contrib/distributions/BUILD           |  1 +
 tensorflow/contrib/estimator/BUILD               |  4 ++--
 tensorflow/contrib/legacy_seq2seq/BUILD          |  5 ++++-
 tensorflow/contrib/linalg/BUILD                  | 10 ++++++++--
 tensorflow/contrib/lookup/BUILD                  |  2 +-
 tensorflow/contrib/optimizer_v2/BUILD            |  1 +
 tensorflow/core/BUILD                            | 16 +++++++++++++++-
 tensorflow/examples/tutorials/mnist/BUILD        |  1 +
 tensorflow/python/BUILD                          |  4 ++--
 tensorflow/python/estimator/BUILD                |  1 +
 tensorflow/python/keras/BUILD                    |  2 +-
 tensorflow/python/kernel_tests/BUILD             |  7 +++++--
 tensorflow/python/kernel_tests/linalg/BUILD      |  1 +
 tensorflow/tools/docs/BUILD                      |  3 ++-
 16 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b9e42ca677..46b86c53aa 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -340,7 +340,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "ftrl_test",
-    size = "small",
+    size = "medium",
     srcs = ["ftrl_test.py"],
     deps = [
         ":xla_test",
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index b475c9fa6b..c554607960 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -122,7 +122,10 @@ py_test(
     size = "small",
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
     deps = [
         ":dataset_serialization_test",
         "//tensorflow/python:array_ops",
@@ -211,7 +214,10 @@ py_test(
     size = "medium",
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
     deps = [
         ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:error_ops",
@@ -306,7 +312,10 @@ py_test(
     srcs = ["resample_test.py"],
     shard_count = 2,
     srcs_version = "PY2AND3",
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
     deps = [
         "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 20e432b88d..2d99e8172d 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -877,6 +877,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["optonly"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 9f4cd44afb..9e88bc7de1 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -210,7 +210,7 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/head_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -250,7 +250,7 @@ py_library(
 
 py_test(
     name = "linear_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/linear_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index 8c2c4fd29c..4ce91a140f 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -58,5 +58,8 @@ cuda_py_tests(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = ["noasan"],  # times out b/63678675
+    tags = [
+        "noasan",  # times out b/63678675
+        "optonly",  # times out (flaky)
+    ],
 )
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 2c5fa7af89..2e92ad6eb3 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -59,7 +59,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
 
 cuda_py_test(
@@ -78,5 +81,8 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 8,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index f616207d46..e3928a82a2 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -28,7 +28,7 @@ py_library(
 
 tf_py_test(
     name = "lookup_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["lookup_ops_test.py"],
     additional_deps = [
         ":lookup_py",
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 86e5f4a437..85cfce346c 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -203,4 +203,5 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
     ],
+    tags = ["optonly"],
 )
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 01bda8e09b..21f929894c 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2759,7 +2759,6 @@ tf_cc_tests(
         "lib/monitoring/sampler_test.cc",
         "lib/random/distribution_sampler_test.cc",
         "lib/random/philox_random_test.cc",
-        "lib/random/random_distributions_test.cc",
         "lib/random/random_test.cc",
         "lib/random/simple_philox_test.cc",
         "lib/strings/base64_test.cc",
@@ -2789,6 +2788,21 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "lib_random_random_distributions_test",
+    srcs = ["lib/random/random_distributions_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "platform_env_test",
     size = "small",
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index aa1b2ec2db..d7bc6a5a7d 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -51,6 +51,7 @@ py_binary(
         "fully_connected_feed.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["optonly"],
     deps = [
         ":input_data",
         ":mnist",
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 569d3eb2ce..c2bedab4f9 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2802,7 +2802,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "image_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["ops/image_ops_test.py"],
     additional_deps = [
         ":array_ops",
@@ -4333,7 +4333,7 @@ py_test(
 
 tf_py_test(
     name = "input_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/input_test.py"],
     additional_deps = [
         ":array_ops",
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 7bf4447491..c6bb9b9be7 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -205,6 +205,7 @@ py_test(
         "no_pip",
         "noasan",  # test flakily times out in asan mode.
         "notsan",  # b/67510291
+        "optonly",  # flakily times out in fastbuild
     ],
     deps = [
         ":baseline",
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 024a8cd3d1..ca7686b1d1 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -835,7 +835,7 @@ py_test(
 
 py_test(
     name = "saving_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/engine/saving_test.py"],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 11adb1ccfc..a02783e7e7 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -592,7 +592,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "matrix_solve_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["matrix_solve_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1603,7 +1603,10 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 9555e51099..4e3f24890b 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -123,6 +123,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
+    tags = ["optonly"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 0c1fd0cf9d..58b5ef8345 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -103,10 +103,11 @@ py_test(
     data = ["//tensorflow:docs_src"],
     srcs_version = "PY2AND3",
     tags = [
-        # No reason to run sanitizers for this test.
+        # No reason to run sanitizers or fastbuild for this test.
         "noasan",
         "nomsan",
         "notsan",
+        "optonly",
     ],
     deps = [
         ":generate_lib",
-- 
GitLab


From f17311fa8d2df24e56deaab743cdf1ec5e12c692 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 18 Apr 2018 13:47:17 -0700
Subject: [PATCH 1058/1262] Use the new gather HLO in the bridge when lowering
 TF gather ops; NFC

After gather expansion this should boil down to a while loop very similar to
what we emit from the bridge today.

PiperOrigin-RevId: 193410095
---
 .../compiler/tf2xla/kernels/gather_op.cc      | 189 ++++++++----------
 .../tf2xla/kernels/gather_op_helpers.h        |   4 +-
 2 files changed, 90 insertions(+), 103 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 7945c05af4..0b79cb0916 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -29,52 +29,54 @@ namespace tensorflow {
 Status XlaGather(const xla::ComputationDataHandle& input,
                  const TensorShape& input_shape,
                  const xla::ComputationDataHandle& indices,
-                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
-                 DataType dtype, DataType index_type,
+                 const TensorShape& indices_shape, int64 axis,
+                 bool indices_are_nd, DataType dtype, DataType index_type,
                  xla::ComputationBuilder* builder,
                  xla::ComputationDataHandle* gather_output) {
+  // There is no deep reason why we need this precondition, but this is the only
+  // combination that is used and tested today.
+  CHECK(!indices_are_nd || axis == 0);
+
+  // num_index_dims is the number of components in each index in the indices
+  // tensor.
+  //
+  // num_indices is the total number of (n dimensional or scalar) indices in the
+  // indices tensor.
+  //
   // If the indices are N-dimensional, then the minor dimension of indices
   // should be of size N and correspond to the N indices.
-  int64 num_index_dims = 1;
+  int64 num_index_dims;
+  int64 num_indices = 1;
   if (indices_are_nd) {
     CHECK_GE(indices_shape.dims(), 1);
     num_index_dims = indices_shape.dim_size(indices_shape.dims() - 1);
-    indices_shape.RemoveLastDims(1);
+    for (int64 i = 0, e = indices_shape.dims() - 1; i < e; i++) {
+      num_indices *= indices_shape.dim_size(i);
+    }
+  } else {
+    num_index_dims = 1;
+    for (int64 i = 0, e = indices_shape.dims(); i < e; i++) {
+      num_indices *= indices_shape.dim_size(i);
+    }
   }
 
-  // Although the indices Tensor is flattened into rank 1 during the lookup,
-  // and each scalar entry is used as an index into the first dimension of the
-  // input, the output is returned with shape:
-  // input.shape[:axis] + indices.shape + input.shape[axis+1:]
-
-  const int64 num_indices = indices_shape.num_elements();
-  TensorShape input_shape_pre_axis(input_shape);
-  input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
-  TensorShape input_shape_post_axis(input_shape);
-  input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
-  // Each slice of the input tensor has shape:
-  // [<input_shape_pre_axis>, 1, ..., 1, <input shape_post_axis>]
-  TensorShape slice_shape(input_shape);
-  for (int64 i = 0; i < num_index_dims; ++i) {
-    slice_shape.set_dim(axis + i, 1);
-  }
+  // Degenerate case: empty indices.
+  if (num_indices == 0) {
+    TensorShape input_shape_pre_axis{input_shape};
+    input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
+    TensorShape input_shape_post_axis{input_shape};
+    input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
 
-  TensorShape loop_out_shape;
-  loop_out_shape.AppendShape(input_shape_pre_axis);
-  loop_out_shape.AddDim(num_indices);
-  loop_out_shape.AppendShape(input_shape_post_axis);
-  TensorShape loop_out_slice_shape;
-  loop_out_slice_shape.AppendShape(input_shape_pre_axis);
-  loop_out_slice_shape.AddDim(1);
-  loop_out_slice_shape.AppendShape(input_shape_post_axis);
+    TensorShape indices_shape_no_index_vectors{indices_shape};
+    if (indices_are_nd) {
+      indices_shape_no_index_vectors.RemoveLastDims(1);
+    }
 
-  TensorShape out_shape;
-  out_shape.AppendShape(input_shape_pre_axis);
-  out_shape.AppendShape(indices_shape);
-  out_shape.AppendShape(input_shape_post_axis);
+    TensorShape out_shape;
+    out_shape.AppendShape(input_shape_pre_axis);
+    out_shape.AppendShape(indices_shape_no_index_vectors);
+    out_shape.AppendShape(input_shape_post_axis);
 
-  // Degenerate case: empty indices.
-  if (num_indices == 0) {
     *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
                                         out_shape.dim_sizes());
     return Status::OK();
@@ -88,76 +90,61 @@ Status XlaGather(const xla::ComputationDataHandle& input,
     }
   }
 
-  // Flatten the major dimensions of indices into a single dimension for ease of
-  // iteration. If there is an axis dimension, we must leave it alone.
-  std::vector<int64> flat_indices_shape = {num_indices};
-  if (indices_are_nd) {
-    flat_indices_shape.push_back(num_index_dims);
-  }
-
-  // Specify the shape of the loop-carried Tensor tuple.
-
-  // Construct the initial values of the loop-carried Tensors.
-  auto flat_indices = builder->Reshape(indices, flat_indices_shape);
-  auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
-                                     loop_out_shape.dim_sizes());
-  auto init = {input, flat_indices, init_out};
-
-  // Construct the while loop body's function. The implementation of gather is:
-  // for i in range(num_indices):
-  //   index = dynamic-slice(indices, i)
-  //   xi = dynamic-slice(input, index)
-  //   output = dynamic-update-slice(output, xi, i)
-  auto body_fn = [&](xla::ComputationDataHandle i,
-                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
-                     xla::ComputationBuilder* bodyb) {
-    auto input = loop_vars[0];
-    auto indices = loop_vars[1];
-    auto output = loop_vars[2];
-
-    auto zero_index = XlaHelpers::Zero(bodyb, index_type);
-
-    // Slice the i-th index from the indices array.
-    xla::ComputationDataHandle index;
-    auto indices_offset = bodyb->Reshape(i, {1});
-    if (indices_are_nd) {
-      // Slice out the entire nd index, if applicable.
-      indices_offset = bodyb->Pad(indices_offset, zero_index,
-                                  xla::MakeEdgePaddingConfig({{0, 1}}));
-      index = bodyb->DynamicSlice(indices, indices_offset, {1, num_index_dims});
-      index = bodyb->Collapse(index, {0, 1});
+  // Example of a 1-D gather with axis=1, pulling two [3,1] tensors out of a
+  // tensor of shape [3,3].
+  //
+  //  operand = s32[3,3] parameter(0)
+  //  indices = s32[2] parameter(1)
+  //  gather = s32[3,2] gather(operand, indices),
+  //       output_window_dims={0},
+  //       elided_window_dims={1},
+  //       gather_dims_to_operand_dims={1},
+  //       index_vector_dim=1,
+  //       window_bounds={3, 1}
+  //
+  //
+  // Example of an N-D gather pulling out slices of shape [1,1,2] out of a
+  // tensor of shape [3,3,2].
+  //
+  //  operand = s32[3,3,2] parameter(0)
+  //  indices = s32[2,2] parameter(1)
+  //  gather = s32[2,2] gather(operand, indices),
+  //       output_window_dims={1},
+  //       elided_window_dims={0,1},
+  //       gather_dims_to_operand_dims={0,1},
+  //       index_vector_dim=0,
+  //       window_bounds={1,1,2}
+
+  xla::GatherDimensionNumbers dim_numbers;
+  std::vector<int64> window_bounds;
+  window_bounds.reserve(input_shape.dims());
+  for (int64 i = 0; i < input_shape.dims(); i++) {
+    int64 window_bound;
+    if (axis <= i && i < (axis + num_index_dims)) {
+      dim_numbers.add_elided_window_dims(i);
+      window_bound = 1;
     } else {
-      index = bodyb->DynamicSlice(indices, indices_offset, {1});
+      window_bound = input_shape.dim_size(i);
+    }
+
+    window_bounds.push_back(window_bound);
+
+    if (i < axis) {
+      dim_numbers.add_output_window_dims(i);
+    } else if (i >= (axis + num_index_dims)) {
+      int64 indices_rank =
+          indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims();
+      dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims);
     }
+  }
+
+  dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1)
+                                                  : indices_shape.dims());
+  for (int64 i = axis; i < axis + num_index_dims; i++) {
+    dim_numbers.add_gather_dims_to_operand_dims(i);
+  }
 
-    // Slice the corresponding data from the input array.
-    auto start_indices = bodyb->Pad(
-        index, zero_index,
-        xla::MakeEdgePaddingConfig(
-            {{input_shape_pre_axis.dims(), input_shape_post_axis.dims()}}));
-    auto slice_i = bodyb->Reshape(
-        bodyb->DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
-        loop_out_slice_shape.dim_sizes());
-
-    // Construct the index into the output Tensor 0, ..., <index>, 0, ...
-    std::vector<xla::ComputationDataHandle> out_index_vals(
-        loop_out_shape.dims(), bodyb->Reshape(zero_index, {1}));
-    out_index_vals[input_shape_pre_axis.dims()] = bodyb->Reshape(i, {1});
-    auto out_index = bodyb->ConcatInDim(out_index_vals, 0);
-
-    // Update the output Tensor
-    auto updated_output = bodyb->DynamicUpdateSlice(output, slice_i, out_index);
-
-    return std::vector<xla::ComputationDataHandle>{input, indices,
-                                                   updated_output};
-  };
-
-  // Construct the While loop, extract and reshape the output.
-  xla::PrimitiveType ptype;
-  TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(index_type, &ptype));
-  TF_ASSIGN_OR_RETURN(auto outputs, XlaForEachIndex(num_indices, ptype, body_fn,
-                                                    init, "gather", builder));
-  *gather_output = builder->Reshape(outputs[2], out_shape.dim_sizes());
+  *gather_output = builder->Gather(input, indices, dim_numbers, window_bounds);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index bd8b92c22d..f9376f0eab 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -36,8 +36,8 @@ namespace tensorflow {
 Status XlaGather(const xla::ComputationDataHandle& input,
                  const TensorShape& input_shape,
                  const xla::ComputationDataHandle& indices,
-                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
-                 DataType dtype, DataType index_type,
+                 const TensorShape& indices_shape, int64 axis,
+                 bool indices_are_nd, DataType dtype, DataType index_type,
                  xla::ComputationBuilder* builder,
                  xla::ComputationDataHandle* gather_output);
 
-- 
GitLab


From 603aad77e69ea856b39566769361c022b6af933a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 13:54:43 -0700
Subject: [PATCH 1059/1262] [XLA] Redesign: add arguments, result, and
 execution_platform to HloProto, because the SessionModule has those.

PiperOrigin-RevId: 193411310
---
 tensorflow/compiler/xla/service/hlo.proto | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 8fd7f8945c..0c3eb7dcb4 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -296,3 +296,20 @@ message HloProto {
   HloOrderingProto hlo_ordering = 2;
   BufferAssignmentProto buffer_assignment = 3;
 }
+
+// Encapsulates HloProto together with the arguments, result, and
+// execution_platform. This message is used for purposes such as
+// analysis/replay/file-storage.
+message HloSession {
+  // The hlo graph.
+  HloProto hlo = 1;
+
+  // The arguments passed to the graph.
+  repeated LiteralProto arguments = 2;
+
+  // The result of the graph.
+  LiteralProto result = 3;
+
+  // The name of the platform used to run the graph.
+  string execution_platform = 4;
+}
-- 
GitLab


From f3d2fdf088ea6674f0c0b034af04b99fc1a830dc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 13:54:52 -0700
Subject: [PATCH 1060/1262] Replace six.get_unbound_function with a simpler
 version that doesn't crash for methods of tf.keras.Model.

PiperOrigin-RevId: 193411332
---
 tensorflow/contrib/autograph/pyct/inspect_utils.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index 63361cc4f2..a0f56a6c1f 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -63,14 +63,23 @@ def getnamespace(f):
   return namespace
 
 
+def _get_unbound_function(m):
+  # TODO(mdan): Figure out why six.get_unbound_function fails in some cases.
+  # The failure case is for tf.keras.Model.
+  if hasattr(m, 'im_func'):
+    return m.im_func
+  return m
+
+
 def getdefiningclass(m, owner_class):
   """Resolves the class (e.g. one of the superclasses) that defined a method."""
-  m = six.get_unbound_function(m)
+  # Normalize bound functions to their respective unbound versions.
+  m = _get_unbound_function(m)
   last_defining = owner_class
   for superclass in tf_inspect.getmro(owner_class):
     if hasattr(superclass, m.__name__):
       superclass_m = getattr(superclass, m.__name__)
-      if six.get_unbound_function(superclass_m) == m:
+      if _get_unbound_function(superclass_m) == m:
         last_defining = superclass
   return last_defining
 
-- 
GitLab


From 18fd1275a0c0e39a5cecea950a1fef3d8472e911 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:07:10 -0700
Subject: [PATCH 1061/1262] If the summary is empty, return empty list for
 quantiles.

PiperOrigin-RevId: 193413363
---
 .../lib/quantiles/weighted_quantiles_stream_test.cc | 13 +++++++++++++
 .../lib/quantiles/weighted_quantiles_summary.h      |  9 ++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
index 4481c0d0e4..67ac9bf387 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc
@@ -138,6 +138,12 @@ void GenerateOneValue(int32 worker_id, int64 max_elements, double *total_weight,
   stream->Finalize();
 }
 
+void GenerateOneZeroWeightedValue(int32 worker_id, int64 max_elements,
+                                  double *total_weight, Stream *stream) {
+  stream->PushEntry(10, 0);
+  stream->Finalize();
+}
+
 TEST(WeightedQuantilesStreamTest, OneValue) {
   const double eps = 0.01;
   const int64 max_elements = 1 << 16;
@@ -145,6 +151,13 @@ TEST(WeightedQuantilesStreamTest, OneValue) {
                           {10.0, 10.0, 10.0, 10.0, 10.0}, 1e-2);
 }
 
+TEST(WeightedQuantilesStreamTest, OneZeroWeightValue) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateOneZeroWeightedValue, {},
+                          1e-2);
+}
+
 TEST(WeightedQuantilesStreamTest, FixedUniform) {
   const double eps = 0.01;
   const int64 max_elements = 1 << 16;
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index aec232f3cb..7576856dc3 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -235,6 +235,11 @@ class WeightedQuantilesSummary {
   // The resulting boundaries are guaranteed to both contain at least
   // num_boundaries unique elements and maintain approximation bounds.
   std::vector<ValueType> GenerateBoundaries(int64 num_boundaries) const {
+    std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
+
     // Generate soft compressed summary.
     WeightedQuantilesSummary<ValueType, WeightType, CompareFn>
         compressed_summary;
@@ -246,7 +251,6 @@ class WeightedQuantilesSummary {
     compressed_summary.Compress(num_boundaries, compression_eps);
 
     // Return boundaries.
-    std::vector<ValueType> output;
     output.reserve(compressed_summary.entries_.size());
     for (const auto& entry : compressed_summary.entries_) {
       output.push_back(entry.value);
@@ -260,6 +264,9 @@ class WeightedQuantilesSummary {
   // full rank queries O(nlogn).
   std::vector<ValueType> GenerateQuantiles(int64 num_quantiles) const {
     std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
     num_quantiles = std::max(num_quantiles, 2LL);
     output.reserve(num_quantiles + 1);
 
-- 
GitLab


From d61b579f10d2a56b0f8616aa1fe18e7827e3afec Mon Sep 17 00:00:00 2001
From: Chris Kennelly <ckennelly@google.com>
Date: Wed, 18 Apr 2018 14:07:23 -0700
Subject: [PATCH 1062/1262] Automated g4 rollback of changelist 193392688

PiperOrigin-RevId: 193413401
---
 tensorflow/core/framework/allocator.cc | 27 --------------------------
 tensorflow/core/framework/allocator.h  | 11 +----------
 2 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 29b67ebdfa..1a7e5219cd 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -48,10 +48,6 @@ constexpr size_t Allocator::kAllocatorAlignment;
 
 Allocator::~Allocator() {}
 
-void Allocator::DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) {
-  DeallocateRaw(ptr);
-}
-
 void RunResourceCtor(ResourceHandle* p, size_t n) {
   for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
 }
@@ -107,12 +103,7 @@ class CPUAllocator : public Allocator {
                    << "% of system memory.";
     }
 
-#ifdef __cpp_aligned_new
-    void* p =
-        ::operator new(num_bytes, static_cast<std::align_val_t>(alignment));
-#else
     void* p = port::AlignedMalloc(num_bytes, alignment);
-#endif
     if (cpu_allocator_collect_stats) {
       const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
       mutex_lock l(mu_);
@@ -141,25 +132,7 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
-#ifdef __cpp_aligned_new
-    ::operator delete(ptr);
-#else
     port::AlignedFree(ptr);
-#endif
-  }
-
-  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
-#ifdef __cpp_aligned_new
-    if (cpu_allocator_collect_stats) {
-      const std::size_t alloc_size =
-          port::MallocExtension_GetAllocatedSize(ptr);
-      mutex_lock l(mu_);
-      stats_.bytes_in_use -= alloc_size;
-    }
-    ::operator delete(ptr, num_bytes, static_cast<std::align_val_t>(alignment));
-#else
-    DeallocateRaw(ptr);
-#endif
   }
 
   void GetStats(AllocatorStats* stats) override {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 0dda38fbb7..2c87156dca 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -101,11 +101,6 @@ class Allocator {
   // REQUIRES: "ptr" was previously returned by a call to AllocateRaw
   virtual void DeallocateRaw(void* ptr) = 0;
 
-  // Deallocate a block of memory pointer to by "ptr" with size "num_bytes"
-  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw with
-  // "num_bytes" and "alignment"
-  virtual void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes);
-
   // Convenience functions to do typed allocation.  C++ constructors
   // and destructors are invoked for complex types if necessary,
   // depending on the concrete Allocator implementation. May return
@@ -137,7 +132,7 @@ class Allocator {
   void Deallocate(T* ptr, size_t num_elements) {
     if (ptr) {
       RunDtor<T>(ptr, num_elements);
-      DeallocateRaw(ptr, kAllocatorAlignment, sizeof(T) * num_elements);
+      DeallocateRaw(ptr);
     }
   }
 
@@ -309,10 +304,6 @@ class AllocatorWrapper : public Allocator {
 
   void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }
 
-  void DeallocateRaw(void* ptr, size_t alignment, size_t num_bytes) override {
-    wrapped_->DeallocateRaw(ptr, alignment, num_bytes);
-  }
-
   bool TracksAllocationSizes() override {
     return wrapped_->TracksAllocationSizes();
   }
-- 
GitLab


From 497dc60720669434a9e6cf7ff19be9ca6d526010 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:30:01 -0700
Subject: [PATCH 1063/1262] Allow turning off checkpointing for
 ShardedMutableDenseHashTable. Keep the checkpointing=True as the default.

PiperOrigin-RevId: 193417350
---
 .../python/ops/sharded_mutable_dense_hashtable.py               | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index ec726bbed4..5015fb0848 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -49,6 +49,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                default_value,
                empty_key,
                num_shards=1,
+               checkpoint=True,
                name='ShardedMutableHashTable'):
     with ops.name_scope(name, 'sharded_mutable_hash_table') as scope:
       super(ShardedMutableDenseHashTable, self).__init__(key_dtype,
@@ -61,6 +62,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                 value_dtype=value_dtype,
                 default_value=default_value,
                 empty_key=empty_key,
+                checkpoint=checkpoint,
                 name='%s-%d-of-%d' % (name, i + 1, num_shards)))
       self._table_shards = table_shards
       # TODO(andreasst): add a value_shape() method to LookupInterface
-- 
GitLab


From b75e1204d3aaab20d7a937edd6b2f05ff5785827 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:34:50 -0700
Subject: [PATCH 1064/1262] Increase shard count of :init_ops_test.

PiperOrigin-RevId: 193418147
---
 tensorflow/python/kernel_tests/BUILD | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index a02783e7e7..3aedd70f8c 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1603,10 +1603,8 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "noasan",
-        "optonly",
-    ],
+    shard_count = 4,
+    tags = ["noasan"],
 )
 
 cuda_py_test(
-- 
GitLab


From 325ba9ece698d04082b173ba300a10623d27de96 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 14:38:07 -0700
Subject: [PATCH 1065/1262] Adds an implementation of the precision at recall
 metric.

PiperOrigin-RevId: 193418737
---
 tensorflow/contrib/metrics/__init__.py        |   2 +
 .../contrib/metrics/python/ops/metric_ops.py  | 115 +++++++++++++++
 .../metrics/python/ops/metric_ops_test.py     | 132 ++++++++++++++++++
 3 files changed, 249 insertions(+)

diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index de02dc8f45..5effea3596 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -71,6 +71,7 @@ See the @{$python/contrib.metrics} guide.
 @@count
 @@precision_recall_at_equal_thresholds
 @@recall_at_precision
+@@precision_at_recall
 
 """
 from __future__ import absolute_import
@@ -87,6 +88,7 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics
 from tensorflow.contrib.metrics.python.ops.metric_ops import auc_with_confidence_intervals
 from tensorflow.contrib.metrics.python.ops.metric_ops import cohen_kappa
 from tensorflow.contrib.metrics.python.ops.metric_ops import count
+from tensorflow.contrib.metrics.python.ops.metric_ops import precision_at_recall
 from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision
 from tensorflow.contrib.metrics.python.ops.metric_ops import sparse_recall_at_top_k
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 9c8ae48094..5364e3075d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2588,6 +2588,121 @@ def recall_at_precision(labels,
     return recall, update_op
 
 
+def precision_at_recall(labels,
+                        predictions,
+                        target_recall,
+                        weights=None,
+                        num_thresholds=200,
+                        metrics_collections=None,
+                        updates_collections=None,
+                        name=None):
+  """Computes the precision at a given recall.
+
+  This function creates variables to track the true positives, false positives,
+  true negatives, and false negatives at a set of thresholds. Among those
+  thresholds where recall is at least `target_recall`, precision is computed
+  at the threshold where recall is closest to `target_recall`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  precision at `target_recall`. `update_op` increments the counts of true
+  positives, false positives, true negatives, and false negatives with the
+  weight of each case found in the `predictions` and `labels`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  For additional information about precision and recall, see
+  http://en.wikipedia.org/wiki/Precision_and_recall
+
+  Args:
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: A floating point `Tensor` of arbitrary shape and whose values
+      are in the range `[0, 1]`.
+    target_recall: A scalar value in range `[0, 1]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
+    num_thresholds: The number of thresholds to use for matching the given
+      recall.
+    metrics_collections: An optional list of collections to which `precision`
+      should be added.
+    updates_collections: An optional list of collections to which `update_op`
+      should be added.
+    name: An optional variable_scope name.
+
+  Returns:
+    precision: A scalar `Tensor` representing the precision at the given
+      `target_recall` value.
+    update_op: An operation that increments the variables for tracking the
+      true positives, false positives, true negatives, and false negatives and
+      whose value matches `precision`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      `target_recall` is not between 0 and 1, or if either `metrics_collections`
+      or `updates_collections` are not a list or tuple.
+    RuntimeError: If eager execution is enabled.
+  """
+  if context.executing_eagerly():
+    raise RuntimeError('tf.metrics.precision_at_recall is not '
+                       'supported when eager execution is enabled.')
+
+  if target_recall < 0 or target_recall > 1:
+    raise ValueError('`target_recall` must be in the range [0, 1].')
+
+  with variable_scope.variable_scope(name, 'precision_at_recall',
+                                     (predictions, labels, weights)):
+    kepsilon = 1e-7  # Used to avoid division by zero.
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
+    thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+    values, update_ops = _streaming_confusion_matrix_at_thresholds(
+        predictions, labels, thresholds, weights)
+
+    def compute_precision_at_recall(tp, fp, fn, name):
+      """Computes the precision at a given recall.
+
+      Args:
+        tp: True positives.
+        fp: False positives.
+        fn: False negatives.
+        name: A name for the operation.
+
+      Returns:
+        The precision at the desired recall.
+      """
+      recalls = math_ops.div(tp, tp + fn + kepsilon)
+
+      # Because recall is monotone decreasing as a function of the threshold,
+      # the smallest recall exceeding target_recall occurs at the largest
+      # threshold where recall >= target_recall.
+      admissible_recalls = math_ops.cast(
+          math_ops.greater_equal(recalls, target_recall), dtypes.int64)
+      tf_index = math_ops.reduce_sum(admissible_recalls) - 1
+
+      # Now we have the threshold at which to compute precision:
+      return math_ops.div(tp[tf_index] + kepsilon,
+                          tp[tf_index] + fp[tf_index] + kepsilon,
+                          name)
+
+    precision_value = compute_precision_at_recall(
+        values['tp'], values['fp'], values['fn'], 'value')
+    update_op = compute_precision_at_recall(
+        update_ops['tp'], update_ops['fp'], update_ops['fn'], 'update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, precision_value)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return precision_value, update_op
+
+
 def streaming_sparse_average_precision_at_k(predictions,
                                             labels,
                                             k,
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 33eb655fb6..76420db8bd 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -3380,6 +3380,138 @@ class RecallAtPrecisionTest(test.TestCase):
       self.assertAlmostEqual(target_recall, recall.eval())
 
 
+class PrecisionAtRecallTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7)
+    _assert_metric_variables(self,
+                             ('precision_at_recall/true_positives:0',
+                              'precision_at_recall/false_negatives:0',
+                              'precision_at_recall/false_positives:0',
+                              'precision_at_recall/true_negatives:0'))
+
+  def testMetricsCollection(self):
+    my_collection_name = '__metrics__'
+    mean, _ = metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7,
+        metrics_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [mean])
+
+  def testUpdatesCollection(self):
+    my_collection_name = '__updates__'
+    _, update_op = metrics.precision_at_recall(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        target_recall=0.7,
+        updates_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
+
+  def testValueTensorIsIdempotent(self):
+    predictions = random_ops.random_uniform(
+        (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
+    labels = random_ops.random_uniform(
+        (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=1)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+
+      # Run several updates.
+      for _ in range(10):
+        sess.run(update_op)
+
+      # Then verify idempotency.
+      initial_precision = precision.eval()
+      for _ in range(10):
+        self.assertAlmostEqual(initial_precision, precision.eval(), places=5)
+
+  def testAllCorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(inputs)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(1, sess.run(update_op))
+      self.assertEqual(1, precision.eval())
+
+  def testAllIncorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = 1.0 - predictions
+    label_prior = math_ops.reduce_mean(labels)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.2)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(sess.run(label_prior), sess.run(update_op))
+      self.assertEqual(sess.run(label_prior), precision.eval())
+
+  def testSomeCorrectHighRecall(self):
+    predictions_values = [0.1, 0.2, 0.5, 0.3, 0.0, 0.1, 0.45, 0.5, 0.8, 0.9]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.8)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.assertAlmostEqual(0.8, precision.eval())
+
+  def testSomeCorrectLowRecall(self):
+    predictions_values = [0.1, 0.2, 0.7, 0.3, 0.0, 0.1, 0.45, 0.5, 0.6, 0.9]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    precision, update_op = metrics.precision_at_recall(
+        labels, predictions, target_recall=0.4)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(2.0/3, sess.run(update_op))
+      self.assertAlmostEqual(2.0/3, precision.eval())
+
+  def testWeighted_multipleLabelDtypes(self):
+    for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
+      predictions_values = [
+          0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.22, 0.25, 0.31, 0.35]
+      labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+      weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+      predictions = constant_op.constant(
+          predictions_values, dtype=dtypes_lib.float32)
+      labels = math_ops.cast(labels_values, dtype=label_dtype)
+      weights = constant_op.constant(weights_values)
+      precision, update_op = metrics.precision_at_recall(
+          labels, predictions, target_recall=0.8, weights=weights)
+
+      with self.test_session() as sess:
+        sess.run(variables.local_variables_initializer())
+        self.assertAlmostEqual(34.0/43, sess.run(update_op))
+        self.assertAlmostEqual(34.0/43, precision.eval())
+
+
 class StreamingFNRThresholdsTest(test.TestCase):
 
   def setUp(self):
-- 
GitLab


From 324215184bc727c273d0482d870eb53216626022 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 18 Apr 2018 01:38:31 +0000
Subject: [PATCH 1066/1262] Update dtypes for the test cases

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/py_func_test.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 7a178617dd..b9f44d728a 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -55,23 +55,24 @@ class PyFuncTest(test.TestCase):
   def testRealDataTypes(self):
     def sum_func(x, y):
       return x + y
-    for dtype in [np.float16, np.float32, np.float64,
-                  np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64]:
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.uint8, dtypes.int8, dtypes.uint16, dtypes.int16,
+                  dtypes.int32, dtypes.int64]:
       with self.test_session():
         x = constant_op.constant(1, dtype=dtype)
         y = constant_op.constant(2, dtype=dtype)
         z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype))
-        self.assertEqual(z, dtype(3))
+        self.assertEqual(z, 3)
 
   def testComplexDataTypes(self):
     def sub_func(x, y):
       return x - y
-    for dtype in [np.complex64, np.complex128]:
+    for dtype in [dtypes.complex64, dtypes.complex128]:
       with self.test_session():
         x = constant_op.constant(1 + 1j, dtype=dtype)
         y = constant_op.constant(2 - 2j, dtype=dtype)
         z = self.evaluate(script_ops.py_func(sub_func, [x, y], dtype))
-        self.assertEqual(z, dtype(-1 + 3j))
+        self.assertEqual(z, -1 + 3j)
 
   def testBoolDataTypes(self):
     def and_func(x, y):
-- 
GitLab


From d964834a922e77198fd387aac6c6cc5970a31e7d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:02:26 -0700
Subject: [PATCH 1067/1262] Merged commit includes the following changes:
 193422827  by yifeif:

    Fix buildifier error.

--
193421691  by skyewm:

    Make GraphModeFunctions work with _USE_C_SHAPES=True.

    Tensor._handle_data is going away. This change adds special hooks for
    propagating the resource handle shape information through
    EagerTensors.

--
193421473  by A. Unique TensorFlower:

    Register dynamic_stitch for DT_VARIANT type.

--
193421175  by nolivia:

    disabling flaky tsan test

--
193420117  by nolivia:

    disabling flaky test in tensorflow that has no apparent culprit

--

PiperOrigin-RevId: 193422827
---
 tensorflow/c/eager/BUILD                      |  2 +
 tensorflow/c/eager/c_api.cc                   | 57 +++++++++++++++++++
 tensorflow/c/eager/c_api.h                    | 14 +++++
 .../contrib/rpc/python/kernel_tests/BUILD     |  1 +
 tensorflow/core/kernels/dynamic_stitch_op.cc  |  1 +
 tensorflow/python/eager/function.py           | 18 +++++-
 tensorflow/python/eager/function_test.py      |  3 +
 tensorflow/python/framework/test_util.py      | 24 ++++++++
 tensorflow/python/kernel_tests/BUILD          |  5 +-
 .../python/ops/resource_variable_ops.py       | 24 +++++++-
 tensorflow/python/pywrap_tfe.i                |  2 +
 11 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index a2d96357ac..3e14c10727 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -41,6 +41,8 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
+            # TODO(b/74620627): move this here
+            "//tensorflow/python:cpp_shape_inference_proto_cc",
         ],
     }) + select({
         "//tensorflow:with_xla_support": [
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 393851d13c..369342b142 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/python/framework/cpp_shape_inference.pb.h"
 
 using tensorflow::int64;
 using tensorflow::string;
@@ -1015,6 +1016,62 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
   ctx->context.RunMetadataProto()->Clear();
 }
 
+void TFE_GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                       TF_Buffer* output_proto,
+                                       TF_Status* status) {
+  tensorflow::Node* node = &output.oper->node;
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  handle_data.set_is_set(true);
+  {
+    tensorflow::mutex_lock l(graph->mu);
+    tensorflow::shape_inference::InferenceContext* ic =
+        graph->refiner.GetContext(node);
+    CHECK(ic != nullptr);
+    CHECK_LT(output.index, ic->num_outputs());
+    const auto* shapes_and_types =
+        ic->output_handle_shapes_and_types(output.index);
+    if (shapes_and_types == nullptr) {
+      output_proto->data = nullptr;
+      output_proto->length = 0;
+      output_proto->data_deallocator = nullptr;
+      return;
+    }
+
+    for (const auto& p : *shapes_and_types) {
+      auto* out_shape_and_type = handle_data.add_shape_and_type();
+      ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape());
+      out_shape_and_type->set_dtype(p.dtype);
+    }
+  }
+  status->status = MessageToBuffer(handle_data, output_proto);
+}
+
+void TFE_SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                       const void* proto, size_t proto_len,
+                                       TF_Status* status) {
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  if (!handle_data.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Couldn't deserialize HandleData proto");
+    return;
+  }
+  DCHECK(handle_data.is_set());
+
+  tensorflow::mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(&output.oper->node);
+
+  std::vector<tensorflow::shape_inference::ShapeAndType> shapes_and_types;
+  for (const auto& shape_and_type_proto : handle_data.shape_and_type()) {
+    tensorflow::shape_inference::ShapeHandle shape;
+    status->status =
+        ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
+    if (status->status.ok()) return;
+    shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
+  }
+  ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
+}
+
 namespace {
 TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
                 TF_Status* status) {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 3926c22ce1..15ac0f376c 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -329,6 +329,20 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx,
                                                         TF_Buffer* buf,
                                                         TF_Status* status);
 
+// Returns the serialized CppShapeInferenceResult::HandleData proto for
+// `output` if its a resource tensor, or otherwise returns an empty buffer.
+TF_CAPI_EXPORT extern void TFE_GetResourceHandleShapeAndType(
+    TF_Graph* graph, TF_Output output, TF_Buffer* output_proto,
+    TF_Status* status);
+
+// Sets `output` based on `proto`, which should be a serialized
+// CppShapeInferenceResult::HandleData proto.
+TF_CAPI_EXPORT extern void TFE_SetResourceHandleShapeAndType(TF_Graph* graph,
+                                                             TF_Output output,
+                                                             const void* proto,
+                                                             size_t proto_len,
+                                                             TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
index 2311c15a68..f3e6731213 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD
@@ -28,6 +28,7 @@ py_library(
 py_library(
     name = "rpc_op_test_base",
     srcs = ["rpc_op_test_base.py"],
+    tags = ["notsan"],
     deps = [
         ":test_example_proto_py",
         "//tensorflow/contrib/proto",
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index f018499f6c..b01db91720 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -326,6 +326,7 @@ struct ParallelDynamicStitchOpCPU : DynamicStitchOpImplCPU<T, true> {
                           ParallelDynamicStitchOpCPU<type>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH);
+TF_CALL_variant(REGISTER_DYNAMIC_STITCH);
 #undef REGISTER_DYNAMIC_STITCH
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 5168ad3b18..0f1170bb42 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -38,6 +38,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -69,9 +70,22 @@ def capture_value(tensor_map, value, dtype, name):
     captured_value = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
     if captured_value.dtype == dtypes_module.resource:
-      handle_data = value._handle_data  # pylint: disable=protected-access
-      captured_value._handle_data = handle_data  # pylint: disable=protected-access
+      if ops._USE_C_SHAPES:  # pylint: disable=protected-access
+        if isinstance(value, ops.EagerTensor):
+          handle_data = value._handle_data  # pylint: disable=protected-access
+        else:
+          handle_data = resource_variable_ops.get_resource_handle_data(value)
+      else:
+        handle_data = value._handle_data  # pylint: disable=protected-access
       if handle_data is not None and handle_data.is_set:
+        # pylint: disable=protected-access
+        if ops._USE_C_SHAPES:
+          pywrap_tensorflow.TFE_SetResourceHandleShapeAndType(
+              captured_value.graph._c_graph, captured_value._as_tf_output(),
+              handle_data.SerializeToString())
+        else:
+          captured_value._handle_data = handle_data
+        # pylint: enable=protected-access
         # Ensure that shapes and dtypes are propagated.
         shapes, types = zip(*[(pair.shape, pair.dtype)
                               for pair in handle_data.shape_and_type])
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 65dde75e60..1828c987f4 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -41,6 +42,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.with_c_shapes
 class FunctionTest(test.TestCase):
 
   def testBasic(self):
@@ -615,6 +617,7 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual([[[[4.0]]]], y.numpy())
 
 
+@test_util.with_c_shapes
 class AutomaticControlDependenciesTest(test.TestCase):
 
   def testBasic(self):
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 70e70abc06..f954b9d6c7 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -464,6 +464,30 @@ def with_c_api(cls):
   return cls
 
 
+def with_c_shapes(cls):
+  """Adds methods that call original methods but with C API shapes enabled.
+
+  Note this enables C shapes in new methods after running the test class's
+  setup method.
+
+  Args:
+    cls: class to decorate
+
+  Returns:
+    cls with new test methods added
+  """
+  # If C shapes are already enabled, don't do anything. Some tests break if the
+  # same test is run twice, so this allows us to turn on the C shapes by default
+  # without breaking these tests.
+  if ops._USE_C_SHAPES:
+    return cls
+
+  for name, value in cls.__dict__.copy().items():
+    if callable(value) and name.startswith("test"):
+      setattr(cls, name + "WithCShapes", enable_c_shapes(value))
+  return cls
+
+
 def assert_no_new_pyobjects_executing_eagerly(f):
   """Decorator for asserting that no new Python objects persist after a test.
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3aedd70f8c..9440f2a4f9 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1604,7 +1604,10 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 4,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "notap",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 49dd7f9948..4d26b2f46e 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -24,6 +24,8 @@ from tensorflow.core.framework import variable_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -41,6 +43,19 @@ from tensorflow.python.training import checkpointable
 from tensorflow.python.util import compat
 
 
+def get_resource_handle_data(graph_op):
+  assert ops._USE_C_SHAPES  # pylint: disable=protected-access
+  assert type(graph_op) == ops.Tensor  # pylint: disable=unidiomatic-typecheck
+
+  with c_api_util.tf_buffer() as buf:
+    pywrap_tensorflow.TFE_GetResourceHandleShapeAndType(
+        graph_op.graph._c_graph, graph_op._as_tf_output(), buf)  # pylint: disable=protected-access
+    data = pywrap_tensorflow.TF_GetBuffer(buf)
+
+  return cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString(
+      compat.as_bytes(data))
+
+
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
   """Creates a variable handle with information to do shape inference."""
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
@@ -73,9 +88,12 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
     # pylint: disable=protected-access
-    if h._handle_data is None:
-      ops.set_shape_and_handle_data_for_outputs(h.op)
-    handle._handle_data = h._handle_data
+    if ops._USE_C_SHAPES:
+      handle._handle_data = get_resource_handle_data(h)
+    else:
+      if h._handle_data is None:
+        ops.set_shape_and_handle_data_for_outputs(h.op)
+      handle._handle_data = h._handle_data
     # pylint: enable=protected-access
 
   # Clean up our reference cycles to avoid making the garbage collector run.
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 5ee55301df..0982a67dee 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -59,6 +59,8 @@ limitations under the License.
 %rename("%s") TFE_ContextOptionsSetAsync;
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
+%rename("%s") TFE_GetResourceHandleShapeAndType;
+%rename("%s") TFE_SetResourceHandleShapeAndType;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
-- 
GitLab


From a655d1670c264652efc42c1b12565232e22b8b84 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:02:44 -0700
Subject: [PATCH 1068/1262] Fix a bug in GcsFileSystem that inconsistent read
 error may not be revealed if the requested read size is larger than the block
 size.

PiperOrigin-RevId: 193422905
---
 .../core/platform/cloud/gcs_file_system.cc    |  2 +-
 .../platform/cloud/gcs_file_system_test.cc    | 41 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 6ed1d5dad2..f0003fa784 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -840,7 +840,7 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
 
   throttle_.RecordResponse(bytes_read);
 
-  if (bytes_read < block_size()) {
+  if (bytes_read < n) {
     // Check stat cache to see if we encountered an interrupted read.
     FileStatistics stat;
     if (stat_cache_->Lookup(filename, &stat)) {
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index e9eca04fef..ca4b7722b6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -360,6 +360,47 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
             fs.NewRandomAccessFile("gs://bucket/", &file).code());
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_InconsistentRead) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "random_access.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"6\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-5\n"
+           "Timeouts: 5 1 20\n",
+           "012")});
+
+  // Set stat_cache_max_age to 1000s so that StatCache could work.
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   1e3 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  // Stat the file first so that the file stats are cached.
+  FileStatistics stat;
+  TF_ASSERT_OK(fs.Stat("gs://bucket/random_access.txt", &stat));
+
+  std::unique_ptr<RandomAccessFile> file;
+  TF_ASSERT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
+
+  char scratch[6];
+  StringPiece result;
+
+  EXPECT_EQ(errors::Code::INTERNAL,
+            file->Read(0, sizeof(scratch), &result, scratch).code());
+}
+
 TEST(GcsFileSystemTest, NewWritableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
-- 
GitLab


From 5c1e253344c0a9d90b27eeef6dd5fcf76b74bba5 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 18 Apr 2018 15:04:21 -0700
Subject: [PATCH 1069/1262] Fix loss computation bug in Model training/eval
 methods with eager execution enabled. Fixes #18642.

PiperOrigin-RevId: 193423288
---
 .../_impl/keras/engine/training_eager.py      |  2 +-
 .../_impl/keras/engine/training_eager_test.py | 25 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 4cdb5f108a..695669d9ee 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -150,7 +150,7 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
       with backend.name_scope(model.output_names[i] + '_loss'):
         output_loss = weighted_masked_fn(
-            outs[i], targets[i], weights, mask=mask)
+            targets[i], outs[i], weights, mask=mask)
       loss_metrics.append(backend.mean(output_loss))
 
       loss_weight = model.loss_weights_list[i]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index 6cdb6b0753..ed0f91ee1e 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
@@ -625,6 +626,30 @@ class LossWeightingTest(test.TestCase):
       model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
 
 
+class CorrectnessTest(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_loss_correctness(self):
+    # Test that training loss is the same in eager and graph
+    # (by comparing it to a reference value in a deterministic case)
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(3,
+                                 activation='relu',
+                                 input_dim=4,
+                                 kernel_initializer='ones'))
+    model.add(keras.layers.Dense(2,
+                                 activation='softmax',
+                                 kernel_initializer='ones'))
+    model.compile(loss='sparse_categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    x = np.ones((100, 4))
+    np.random.seed(123)
+    y = np.random.randint(0, 1, size=(100, 1))
+    history = model.fit(x, y, epochs=1, batch_size=10)
+    self.assertEqual(
+        np.around(history.history['loss'][-1], decimals=4), 0.6173)
+
+
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
-- 
GitLab


From e662c3fcfcd03fd091b032a5a33971428f4cdb89 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:25:42 -0700
Subject: [PATCH 1070/1262] A very simple AST pattern matcher. Only supports
 wildcards, and it's minimally tested. When using, you may want to add your
 use case to the tests.

PiperOrigin-RevId: 193426859
---
 tensorflow/contrib/autograph/pyct/ast_util.py | 79 ++++++++++++++++++-
 .../contrib/autograph/pyct/ast_util_test.py   | 28 ++++++-
 2 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 4a70bab440..c4f82d1170 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -23,10 +23,11 @@ import ast
 import gast
 
 from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
 
 
 class CleanCopier(gast.NodeVisitor):
-  """Copy AST nodes.
+  """Copies AST nodes.
 
   The copied nodes will ignore almost all fields that are prefixed by '__'.
   Exceptions make some annotations.
@@ -106,3 +107,79 @@ def keywords_to_dict(keywords):
     keys.append(gast.Str(kw.arg))
     values.append(kw.value)
   return gast.Dict(keys=keys, values=values)
+
+
+class PatternMatcher(gast.NodeVisitor):
+  """Matches a node against a pattern represented by a node.
+
+  The pattern may contain wildcards represented by the symbol '_'.
+  """
+
+  def __init__(self, pattern):
+    self.pattern = pattern
+    self.pattern_stack = []
+    self.matches = True
+
+  def compare_and_visit(self, node, pattern):
+    self.pattern_stack.append(self.pattern)
+    self.pattern = pattern
+    self.generic_visit(node)
+    self.pattern = self.pattern_stack.pop()
+
+  def no_match(self):
+    self.matches = False
+    return False
+
+  def is_wildcard(self, p):
+    if isinstance(p, (list, tuple)) and len(p) == 1:
+      p, = p
+    if isinstance(p, gast.Name) and p.id == '_':
+      return True
+    if p == '_':
+      return True
+    return False
+
+  def generic_visit(self, node):
+    if not self.matches:
+      return
+
+    pattern = self.pattern
+    for f in node._fields:
+      if f.startswith('__'):
+        continue
+
+      if not hasattr(node, f):
+        if hasattr(pattern, f) and getattr(pattern, f):
+          return self.no_match()
+        else:
+          continue
+      if not hasattr(pattern, f):
+        return self.no_match()
+
+      v = getattr(node, f)
+      p = getattr(pattern, f)
+
+      if self.is_wildcard(p):
+        continue
+      if isinstance(v, (list, tuple)):
+        if not isinstance(p, (list, tuple)) or len(v) != len(p):
+          return self.no_match()
+        for v_item, p_item in zip(v, p):
+          self.compare_and_visit(v_item, p_item)
+      elif isinstance(v, (gast.AST, ast.AST)):
+        if not isinstance(v, type(p)) and not isinstance(p, type(v)):
+          return self.no_match()
+        self.compare_and_visit(v, p)
+      else:
+        # Assume everything else is a value type.
+        if v != p:
+          return self.no_match()
+
+
+def matches(node, pattern):
+  if isinstance(pattern, str):
+    pattern = parser.parse_expression(pattern)
+  matcher = PatternMatcher(pattern)
+  matcher.visit(node)
+  return matcher.matches
+
diff --git a/tensorflow/contrib/autograph/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py
index 8faf92c705..3afa04a506 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util_test.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py
@@ -85,7 +85,33 @@ class AstUtilTest(test.TestCase):
     output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),)
     result, _ = compiler.ast_to_object(output)
     self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'})
-    print(d)
+
+  def assertMatch(self, target_str, pattern_str):
+    node = parser.parse_expression(target_str)
+    pattern = parser.parse_expression(pattern_str)
+    self.assertTrue(ast_util.matches(node, pattern))
+
+  def assertNoMatch(self, target_str, pattern_str):
+    node = parser.parse_expression(target_str)
+    pattern = parser.parse_expression(pattern_str)
+    self.assertFalse(ast_util.matches(node, pattern))
+
+  def test_matches_symbols(self):
+    self.assertMatch('foo', '_')
+    self.assertNoMatch('foo()', '_')
+    self.assertMatch('foo + bar', 'foo + _')
+    self.assertNoMatch('bar + bar', 'foo + _')
+    self.assertNoMatch('foo - bar', 'foo + _')
+
+  def test_matches_function_args(self):
+    self.assertMatch('super(Foo, self).__init__(arg1, arg2)',
+                     'super(_).__init__(_)')
+    self.assertMatch('super().__init__()', 'super(_).__init__(_)')
+    self.assertNoMatch('super(Foo, self).bar(arg1, arg2)',
+                       'super(_).__init__(_)')
+    self.assertMatch('super(Foo, self).__init__()', 'super(Foo, _).__init__(_)')
+    self.assertNoMatch('super(Foo, self).__init__()',
+                       'super(Bar, _).__init__(_)')
 
 
 if __name__ == '__main__':
-- 
GitLab


From 80f60ea37ed77b3dbe1d983f101a5efba2fd4f2e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 15:27:05 -0700
Subject: [PATCH 1071/1262] Never use the LegacySession when a Master
 explicitly calls CreateWorkerSession.

Previously, if the session handle was unrecognized by the worker, it
would default to using the LegacySession. This prevents us from
noticing that a server has been restarted.

To address the problem in a backwards-compatible way, we add a bit to
each session-handle-carrying worker request, indicating whether the
master believes that CreateWorkerSession has been called. If this bit
is set and the handle is unrecognized, the worker will raise an
AbortedError, which can be caught by high-level frameworks such as
`tf.estimator`.

Note that CreateWorkerSession is not yet used by default, and a
follow-up change will add that.

PiperOrigin-RevId: 193427057
---
 .../cluster_function_library_runtime.cc       |  2 +
 .../cluster_function_library_runtime.h        |  7 +-
 .../cluster_function_library_runtime_test.cc  |  2 +-
 .../distributed_runtime/master_session.cc     |  3 +
 .../distributed_runtime/message_wrappers.cc   | 23 +++++++
 .../distributed_runtime/message_wrappers.h    | 10 +++
 .../core/distributed_runtime/session_mgr.cc   | 24 ++++---
 .../core/distributed_runtime/session_mgr.h    |  8 ++-
 .../distributed_runtime/session_mgr_test.cc   | 34 +++++++---
 tensorflow/core/distributed_runtime/worker.cc | 67 ++++++++++++++-----
 .../distributed_runtime/worker_session.cc     |  3 +-
 tensorflow/core/protobuf/worker.proto         | 11 ++-
 12 files changed, 153 insertions(+), 41 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 000a03da5d..6edc2ec5ed 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -145,6 +145,7 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
 
   RegisterGraphRequest req;
   req.set_session_handle(worker_session_->session_name);
+  req.set_create_worker_session_called(create_worker_session_called_);
   *req.mutable_graph_def() = gdef;
   req.mutable_graph_options()
       ->mutable_optimizer_options()
@@ -182,6 +183,7 @@ void ClusterFunctionLibraryRuntime::Run(
 
   RunGraphRequest* req = new RunGraphRequest;
   req->set_session_handle(worker_session_->session_name);
+  req->set_create_worker_session_called(create_worker_session_called_);
   req->set_graph_handle(function_data->graph_handle);
   // Borrowed from master_session.cc
   const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index d3ca350e36..1ea0a3ad51 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -27,8 +27,10 @@ struct WorkerSession;
 // functions across processes by making RPCs.
 class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
  public:
-  ClusterFunctionLibraryRuntime(WorkerSession* worker_session)
-      : worker_session_(worker_session) {}
+  ClusterFunctionLibraryRuntime(WorkerSession* worker_session,
+                                bool create_worker_session_called)
+      : worker_session_(worker_session),
+        create_worker_session_called_(create_worker_session_called) {}
 
   ~ClusterFunctionLibraryRuntime() override;
 
@@ -51,6 +53,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
   mutable mutex mu_;
   WorkerSession* const worker_session_ = nullptr;  // not owned.
+  const bool create_worker_session_called_;
 
   struct FunctionData {
     const string graph_handle;
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 1810996ab8..6f96d7cb06 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -44,7 +44,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
         std::unique_ptr<GraphMgr>()));
 
     cluster_flr_.reset(
-        new ClusterFunctionLibraryRuntime(worker_session_.get()));
+        new ClusterFunctionLibraryRuntime(worker_session_.get(), true));
   }
 
   Status ConstructFunctionGraphHelper(
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index e0a5bb4c53..08020f0266 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -431,6 +431,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     const Part& part = partitions_[i];
     Call* c = &calls[i];
     c->req.set_session_handle(session_handle_);
+    c->req.set_create_worker_session_called(!should_deregister_);
     c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]);
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
     *c->req.mutable_debug_options() =
@@ -587,6 +588,7 @@ Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
       c->req->set_is_last_partial_run(is_last_partial_run);
     }
     c->req->set_session_handle(session_handle_);
+    c->req->set_create_worker_session_called(!should_deregister_);
     c->req->set_graph_handle(part.graph_handle);
     c->req->set_step_id(step_id);
     *c->req->mutable_exec_opts() = exec_opts;
@@ -1003,6 +1005,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
     if (!part.graph_handle.empty()) {
       Call* c = new Call;
       c->req.set_session_handle(session_handle_);
+      c->req.set_create_worker_session_called(!should_deregister_);
       c->req.set_graph_handle(part.graph_handle);
       // NOTE(mrry): We must capture `worker_cache_` since `this`
       // could be deleted before the callback is called.
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 18668b44d3..40bf564cab 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -282,10 +282,18 @@ const string& InMemoryRunGraphRequest::session_handle() const {
   return session_handle_;
 }
 
+bool InMemoryRunGraphRequest::create_worker_session_called() const {
+  return create_worker_session_called_;
+}
+
 void InMemoryRunGraphRequest::set_session_handle(const string& handle) {
   session_handle_ = handle;
 }
 
+void InMemoryRunGraphRequest::set_create_worker_session_called(bool called) {
+  create_worker_session_called_ = called;
+}
+
 const string& InMemoryRunGraphRequest::graph_handle() const {
   return graph_handle_;
 }
@@ -378,6 +386,8 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   if (!proto_version_) {
     proto_version_.reset(new RunGraphRequest);
     proto_version_->set_session_handle(session_handle());
+    proto_version_->set_create_worker_session_called(
+        create_worker_session_called());
     proto_version_->set_graph_handle(graph_handle());
     proto_version_->set_step_id(step_id());
     *proto_version_->mutable_exec_opts() = exec_opts();
@@ -403,6 +413,15 @@ void MutableProtoRunGraphRequest::set_session_handle(const string& handle) {
   request_.set_session_handle(handle);
 }
 
+bool MutableProtoRunGraphRequest::create_worker_session_called() const {
+  return request_.create_worker_session_called();
+}
+
+void MutableProtoRunGraphRequest::set_create_worker_session_called(
+    bool called) {
+  request_.set_create_worker_session_called(called);
+}
+
 const string& MutableProtoRunGraphRequest::graph_handle() const {
   return request_.graph_handle();
 }
@@ -514,6 +533,10 @@ const string& ProtoRunGraphRequest::session_handle() const {
   return request_->session_handle();
 }
 
+bool ProtoRunGraphRequest::create_worker_session_called() const {
+  return request_->create_worker_session_called();
+}
+
 const string& ProtoRunGraphRequest::graph_handle() const {
   return request_->graph_handle();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 1f7cdb98a4..92c5668e3a 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -246,6 +246,9 @@ class RunGraphRequestWrapper {
   // namespace is used.
   virtual const string& session_handle() const = 0;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  virtual bool create_worker_session_called() const = 0;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   virtual const string& graph_handle() const = 0;
@@ -293,6 +296,7 @@ class RunGraphRequestWrapper {
 class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
  public:
   virtual void set_session_handle(const string& handle) = 0;
+  virtual void set_create_worker_session_called(bool called) = 0;
   virtual void set_graph_handle(const string& handle) = 0;
   virtual void set_step_id(int64 step_id) = 0;
   virtual ExecutorOpts* mutable_exec_opts() = 0;
@@ -317,6 +321,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
   const string& graph_handle() const override;
+  bool create_worker_session_called() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
   size_t num_sends() const override;
@@ -331,6 +336,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
+  void set_create_worker_session_called(bool called) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -347,6 +353,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
  private:
   string session_handle_;
+  bool create_worker_session_called_;
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
@@ -370,6 +377,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
+  bool create_worker_session_called() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -385,6 +393,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
+  void set_create_worker_session_called(bool called) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -409,6 +418,7 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
 
   // RunGraphRequestWrapper methods.
   const string& session_handle() const override;
+  bool create_worker_session_called() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 51b9547f53..e51d63cf2b 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -98,20 +98,26 @@ Status SessionMgr::DeleteSession(const string& session) {
   return Status::OK();
 }
 
-std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSessionUnlocked(
-    const string& session) {
-  auto it = sessions_.find(session);
-  if (it == sessions_.end()) {
-    return legacy_session_;
+Status SessionMgr::WorkerSessionForSessionLocked(
+    const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
+  if (session_handle.empty()) {
+    *out_session = legacy_session_;
   } else {
-    return it->second;
+    auto it = sessions_.find(session_handle);
+    if (it == sessions_.end()) {
+      return errors::Aborted("Session handle is not found: ", session_handle,
+                             ". Possibly this worker just restarted.");
+    } else {
+      *out_session = it->second;
+    }
   }
+  return Status::OK();
 }
 
-std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSession(
-    const string& session) {
+Status SessionMgr::WorkerSessionForSession(
+    const string& session_handle, std::shared_ptr<WorkerSession>* out_session) {
   mutex_lock l(mu_);
-  return WorkerSessionForSessionUnlocked(session);
+  return WorkerSessionForSessionLocked(session_handle, out_session);
 }
 
 std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 4c9702d522..0a10fe240f 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -50,7 +50,8 @@ class SessionMgr {
                        bool isolate_session_state);
 
   // Locates the worker session for a given session handle
-  std::shared_ptr<WorkerSession> WorkerSessionForSession(const string& session);
+  Status WorkerSessionForSession(const string& session_handle,
+                                 std::shared_ptr<WorkerSession>* out_session);
   std::shared_ptr<WorkerSession> LegacySession();
 
   Status DeleteSession(const string& session);
@@ -86,8 +87,9 @@ class SessionMgr {
 
   const WorkerCacheFactory worker_cache_factory_;
 
-  std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
-      const string& session) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  Status WorkerSessionForSessionLocked(
+      const string& session_handle, std::shared_ptr<WorkerSession>* out_session)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   // A map from session identifier to internal session structure.
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 4d028f7f4a..858e636e08 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -46,8 +46,8 @@ class SessionMgrTest : public ::testing::Test {
       : device_(FakeDevice::MakeCPU(
             "/job:mnist/replica:0/task:0/device:fakecpu:0")),
         mgr_(&env_, "/job:mnist/replica:0/task:0",
-             std::unique_ptr<WorkerCacheInterface>(), factory_),
-        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {
+             std::unique_ptr<WorkerCacheInterface>(), factory_) {
+    TF_CHECK_OK(mgr_.WorkerSessionForSession("", &legacy_session_));
     env_.local_devices = {device_.get()};
   }
 
@@ -69,7 +69,8 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
 
   string session_handle = "test_session_handle";
   TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
-  auto session = mgr_.WorkerSessionForSession(session_handle);
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
   EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
@@ -81,22 +82,26 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   server_def.set_task_index(3);
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
-  auto session_1 = mgr_.WorkerSessionForSession("handle_1");
+  std::shared_ptr<WorkerSession> session_1;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_1", &session_1));
   std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_1.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
-  auto session_2 = mgr_.WorkerSessionForSession("handle_2");
+  std::shared_ptr<WorkerSession> session_2;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_2", &session_2));
   std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_2.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
-  auto session_3 = mgr_.WorkerSessionForSession("handle_3");
+  std::shared_ptr<WorkerSession> session_3;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_3", &session_3));
   std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_3.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
-  auto session_4 = mgr_.WorkerSessionForSession("handle_4");
+  std::shared_ptr<WorkerSession> session_4;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_4", &session_4));
   std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_4.size());
 
@@ -109,12 +114,23 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
 TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
   string session_handle = "";
-  auto session = mgr_.WorkerSessionForSession(session_handle);
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
   EXPECT_EQ(mgr_.LegacySession(), session);
 
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, UnknownSessionHandle) {
+  ServerDef server_def;
+  string session_handle = "unknown_session_handle";
+  std::shared_ptr<WorkerSession> session;
+  Status s = mgr_.WorkerSessionForSession(session_handle, &session);
+  EXPECT_TRUE(errors::IsAborted(s));
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(), "Session handle is not found"));
+}
+
 TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
   ServerDef server_def;
   server_def.set_job_name("worker");
@@ -124,7 +140,7 @@ TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
 }
 
 TEST_F(SessionMgrTest, DeleteLegacySession) {
-  TF_EXPECT_OK(mgr_.DeleteSession("legacy_session"));
+  TF_EXPECT_OK(mgr_.DeleteSession(""));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 598652fb98..6b2536c3c0 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -59,21 +59,37 @@ void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
 void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
                                 RegisterGraphResponse* response,
                                 StatusCallback done) {
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
-  Status s = session->graph_mgr->Register(
-      request->session_handle(), request->graph_def(), request->graph_options(),
-      request->debug_options(), session->cluster_flr.get(),
-      response->mutable_graph_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (s.ok()) {
+    s = session->graph_mgr->Register(
+        request->session_handle(), request->graph_def(),
+        request->graph_options(), request->debug_options(),
+        session->cluster_flr.get(), response->mutable_graph_handle());
+  }
   done(s);
 }
 
 void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
                                   DeregisterGraphResponse* response,
                                   StatusCallback done) {
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
-  Status s = session->graph_mgr->Deregister(request->graph_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (s.ok()) {
+    s = session->graph_mgr->Deregister(request->graph_handle());
+  }
 
   done(s);
 }
@@ -135,11 +151,21 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
                         StatusCallback done) {
   const int64 step_id = request->step_id();
   TRACEPRINTF("RunGraph: %lld", step_id);
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+  std::shared_ptr<WorkerSession> session;
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
-  Status s = PrepareRunGraph(request, &in, out);
+  s = PrepareRunGraph(request, &in, out);
   if (!s.ok()) {
     delete out;
     done(s);
@@ -209,12 +235,23 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   const int64 step_id = request->step_id();
   const string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
-  auto session =
-      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+  std::shared_ptr<WorkerSession> session;
+
+  Status s;
+  if (request->create_worker_session_called()) {
+    s = env_->session_mgr->WorkerSessionForSession(request->session_handle(),
+                                                   &session);
+  } else {
+    session = env_->session_mgr->LegacySession();
+  }
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
 
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
-  Status s = PrepareRunGraph(request, &in, out);
+  s = PrepareRunGraph(request, &in, out);
   auto finish = [done, out, opts](const Status& s) {
     opts->ClearCancelCallback();
     delete out;
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index cb7059b36e..18886babd5 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -97,6 +97,7 @@ WorkerSession::WorkerSession(const string& session_name,
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
       device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)),
-      cluster_flr(new ClusterFunctionLibraryRuntime(this)) {}
+      cluster_flr(
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 3e7289bd91..1819a35248 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -103,6 +103,9 @@ message RegisterGraphRequest {
   // Subgraphs are scoped within one session.
   string session_handle = 1;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 6;
+
   // "graph_def" has the subgraph of nodes for this worker, with each node
   // having its device_name filled in.
   GraphDef graph_def = 2;
@@ -144,6 +147,9 @@ message DeregisterGraphRequest {
   // empty, a single global namespace is used.
   string session_handle = 2;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 3;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -200,6 +206,9 @@ message RunGraphRequest {
   // search for the graph_handle.
   string session_handle = 8;
 
+  // Set to true if `CreateWorkerSession` was called for `session_handle`.
+  bool create_worker_session_called = 10;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -234,7 +243,7 @@ message RunGraphRequest {
   // truncate long metadata messages.
   bool store_errors_in_response_body = 9;
 
-  // Next: 10
+  // Next: 11
 }
 
 message RunGraphResponse {
-- 
GitLab


From 8c66f2223078dca765e7817f26f66e61fe819715 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Wed, 18 Apr 2018 15:30:30 -0700
Subject: [PATCH 1072/1262] Automated g4 rollback of changelist 192180356

PiperOrigin-RevId: 193427566
---
 .../xla/service/algebraic_simplifier.cc       |  1 -
 .../compiler/xla/service/dfs_hlo_visitor.h    |  1 -
 .../service/dfs_hlo_visitor_with_default.h    |  3 ---
 .../xla/service/hlo_constant_folding.cc       |  3 +--
 .../compiler/xla/service/hlo_cost_analysis.cc |  5 ----
 .../compiler/xla/service/hlo_cost_analysis.h  |  1 -
 .../compiler/xla/service/hlo_graph_dumper.cc  |  1 -
 .../compiler/xla/service/hlo_instruction.cc   | 19 ++-----------
 .../compiler/xla/service/hlo_instruction.h    |  4 ---
 tensorflow/compiler/xla/service/hlo_opcode.h  |  1 -
 .../compiler/xla/service/hlo_verifier.cc      | 27 ++++---------------
 .../compiler/xla/service/hlo_verifier.h       |  1 -
 .../xla/service/instruction_fusion.cc         |  7 ++---
 .../compiler/xla/service/pattern_matcher.h    |  1 -
 .../compiler/xla/tools/parser/hlo_parser.cc   |  9 -------
 .../xla/tools/parser/hlo_parser_test.cc       | 12 ---------
 16 files changed, 10 insertions(+), 86 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 8d26938c6e..8e785de68c 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1412,7 +1412,6 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   return Status::OK();
 }
 
-// TODO(b/74536353): do this simplification for BroadcastDimOne as well.
 StatusOr<bool> AlgebraicSimplifierVisitor::
     TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
         HloInstruction* reshape_or_broadcast) {
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 3f7089d6ca..56723e7650 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -199,7 +199,6 @@ class DfsHloVisitorBase {
   virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
   virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0;
-  virtual Status HandleBroadcastDimOne(HloInstructionPtr hlo) = 0;
   virtual Status HandleReshape(HloInstructionPtr hlo) = 0;
   virtual Status HandleTranspose(HloInstructionPtr hlo) = 0;
   virtual Status HandleParameter(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index e6680ee9b8..240faebe62 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -158,9 +158,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleBroadcast(HloInstructionPtr broadcast) override {
     return DefaultAction(broadcast);
   }
-  Status HandleBroadcastDimOne(HloInstructionPtr broadcastDimOne) override {
-    return DefaultAction(broadcastDimOne);
-  }
   Status HandlePad(HloInstructionPtr pad) override {
     return DefaultAction(pad);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 7aa38c6b79..35ecd4428d 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -69,8 +69,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
-      if (instruction->opcode() == HloOpcode::kBroadcast ||
-          instruction->opcode() == HloOpcode::kBroadcastDimOne) {
+      if (instruction->opcode() == HloOpcode::kBroadcast) {
         continue;
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index ea4dd62fdb..44e4f75f75 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -336,11 +336,6 @@ Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleBroadcastDimOne(
-    const HloInstruction* broadcastDimOne) {
-  return Status::OK();
-}
-
 Status HloCostAnalysis::HandlePad(const HloInstruction*) {
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index a9f6845747..d17678d20f 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -95,7 +95,6 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleSelectAndScatter(const HloInstruction* instruction) override;
   Status HandleBitcast(const HloInstruction* bitcast) override;
   Status HandleBroadcast(const HloInstruction* broadcast) override;
-  Status HandleBroadcastDimOne(const HloInstruction* broadcastDimOne) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index c35783c456..25702dc65e 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -956,7 +956,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
-    case HloOpcode::kBroadcastDimOne:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
       // these are essentially free.
       if (instr->IsFused() &&
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 56cb241087..a445380817 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -700,15 +700,6 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
-/* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateBroadcastDimOne(const Shape& shape,
-                                      HloInstruction* operand) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBroadcastDimOne, shape));
-  instruction->AppendOperand(operand);
-  return instruction;
-}
-
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -1311,10 +1302,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateBroadcast(shape, new_operands[0], dimensions_);
       break;
-    case HloOpcode::kBroadcastDimOne:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateBroadcastDimOne(shape, new_operands[0]);
-      break;
     case HloOpcode::kCall:
       clone = CreateCall(shape, new_operands, to_apply());
       break;
@@ -1863,8 +1850,6 @@ bool HloInstruction::IdenticalSlowPath(
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
-    case HloOpcode::kBroadcastDimOne:
-    case HloOpcode::kDynamicUpdateSlice:
       return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
       return eq_shapes(shape(), other.shape()) &&
@@ -1883,6 +1868,8 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDynamicSlice:
       return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
+    case HloOpcode::kDynamicUpdateSlice:
+      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -2692,8 +2679,6 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleBitcast(this);
     case HloOpcode::kBroadcast:
       return visitor->HandleBroadcast(this);
-    case HloOpcode::kBroadcastDimOne:
-      return visitor->HandleBroadcastDimOne(this);
     case HloOpcode::kPad:
       return visitor->HandlePad(this);
     case HloOpcode::kReshape:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 49aa075029..5a7394f7a6 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -401,10 +401,6 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
-  // Creates a broadcast-size-one-dimensions instruction.
-  static std::unique_ptr<HloInstruction> CreateBroadcastDimOne(
-      const Shape& shape, HloInstruction* operand);
-
   // Creates a sequence of instructions that performs an explicit broadcast of
   // the operand to the target shape.
   //
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index dddc72480f..af24604c39 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -54,7 +54,6 @@ namespace xla {
   V(kBitcast, "bitcast")                                     \
   V(kBitcastConvert, "bitcast-convert")                      \
   V(kBroadcast, "broadcast")                                 \
-  V(kBroadcastDimOne, "broadcast-dim-one")                   \
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 63ec5964eb..8c875698eb 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -174,34 +174,17 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
   TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
   TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
                broadcast->dimensions().size());
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
-    int64 output_dimension = broadcast->dimensions()[i];
+  for (int64 operand_dimension = 0;
+       operand_dimension < ShapeUtil::Rank(operand_shape);
+       ++operand_dimension) {
+    int64 output_dimension = broadcast->dimensions()[operand_dimension];
     TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(i))
+                 operand_shape.dimensions(operand_dimension))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return tensorflow::Status::OK();
 }
 
-Status ShapeVerifier::HandleBroadcastDimOne(HloInstruction* broadcastDimOne) {
-  const Shape& operand_shape = broadcastDimOne->operand(0)->shape();
-  int64 operand_rank = ShapeUtil::Rank(operand_shape);
-  const Shape& output_shape = broadcastDimOne->shape();
-  // Check for mixed precision.
-  TF_RETURN_IF_ERROR(CheckShape(broadcastDimOne, output_shape));
-  TF_RET_CHECK(operand_rank == ShapeUtil::Rank(output_shape));
-  for (int64 i = 0; i < operand_rank; ++i) {
-    int64 operand_dimension = operand_shape.dimensions(i);
-    int64 output_dimension = output_shape.dimensions(i);
-    TF_RET_CHECK(operand_dimension == 1 ||
-                 operand_dimension == output_dimension)
-        << "Dimension " << i << " of broadcastDimOne "
-        << broadcastDimOne->ToString() << " is " << operand_dimension
-        << ", expected 1 or " << output_dimension;
-  }
-  return tensorflow::Status::OK();
-}
-
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
   // Check for mixed precision.
   TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index a4dff977ba..1dd7ec3c51 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -54,7 +54,6 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleBroadcast(HloInstruction* broadcast) override;
-  Status HandleBroadcastDimOne(HloInstruction* broadcastDimOne) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
   Status HandleParameter(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 3f4dbf897d..d69ad80bdb 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -37,7 +37,6 @@ namespace xla {
     case HloOpcode::kBitcast:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
-    case HloOpcode::kBroadcastDimOne:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -143,8 +142,7 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
       });
   return std::count_if(hlo->operands().begin(), hlo->operands().end(),
                        [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast ||
-                             operand->opcode() == HloOpcode::kBroadcastDimOne) {
+                         if (operand->opcode() == HloOpcode::kBroadcast) {
                            return false;
                          }
                          if (operand->opcode() == HloOpcode::kConstant &&
@@ -249,8 +247,7 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     auto reachability = computation->ComputeReachability();
 
     auto cheap_to_duplicate = [this](HloInstruction* producer) {
-      if (producer->opcode() == HloOpcode::kBroadcast ||
-          producer->opcode() == HloOpcode::kBroadcastDimOne) {
+      if (producer->opcode() == HloOpcode::kBroadcast) {
         return true;
       }
       if (producer->opcode() == HloOpcode::kConstant &&
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index f5a4f2c9df..586f6ef7a9 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -879,7 +879,6 @@ XLA_UNOP_PATTERN(Abs)
 XLA_UNOP_PATTERN(RoundNearestAfz)
 XLA_UNOP_PATTERN(Bitcast)
 XLA_UNOP_PATTERN(Broadcast)
-XLA_UNOP_PATTERN(BroadcastDimOne)
 XLA_UNOP_PATTERN(Ceil)
 XLA_UNOP_PATTERN(Copy)
 XLA_UNOP_PATTERN(Cos)
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index b2f122982a..e60a5a4919 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -724,15 +724,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, operands[0], *broadcast_dimensions));
       break;
     }
-    case HloOpcode::kBroadcastDimOne: {
-      if (!ParseOperands(&operands, /*expected_size=*/1) ||
-          !ParseAttributes(attrs)) {
-        return false;
-      }
-      instruction = builder->AddInstruction(
-          HloInstruction::CreateBroadcastDimOne(shape, operands[0]));
-      break;
-    }
     case HloOpcode::kConcatenate: {
       optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 57684b5834..adc8b1d620 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -57,18 +57,6 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
-)"
-},
-// broadcast size-one dimensions
-{
-"BroadcastDimOne",
-R"(HloModule broadcast_dim_one_module
-
-ENTRY %broadcast-dim-one () -> f32[2,2] {
-  %constant = f32[1,2]{1,0} constant(f32[1,2] { { 1.1, 2.2 } })
-  ROOT %broadcast-dim-one = f32[2,2]{1,0} broadcast-dim-one(f32[1,2]{1,0} %constant)
-}
-
 )"
 },
 // pred constant
-- 
GitLab


From 529c56d88f27337d6be263b6f61a2a7a1994bb2d Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Wed, 18 Apr 2018 15:33:39 -0700
Subject: [PATCH 1073/1262] Add --test_output=errors as default

---
 tensorflow/tools/ci_build/ci_parameterized_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 9d23b508aa..797e0a6db5 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -237,7 +237,7 @@ function get_cuda_capability_version() {
 CTYPE=${TF_BUILD_CONTAINER_TYPE}
 
 # Determine if the machine is a Mac
-OPT_FLAG=""
+OPT_FLAG="--test_output=errors"
 if [[ "$(uname -s)" == "Darwin" ]]; then
   DO_DOCKER=0
 
-- 
GitLab


From 427a458ae638b8488280019498e6ea5e238eb925 Mon Sep 17 00:00:00 2001
From: Dustin Tran <trandustin@google.com>
Date: Wed, 18 Apr 2018 15:38:38 -0700
Subject: [PATCH 1074/1262] Have TensorFlow Distributions share name scopes
 across method calls. END_PUBLIC

*** Reason for rollback ***

Roll forward, allowing distributions to have same names across objects.

*** Original change description ***

BEGIN_PUBLIC
Automated g4 rollback of changelist 190728742

PiperOrigin-RevId: 193428925
---
 .../python/kernel_tests/distribution_test.py  | 39 ++++++++++++++-----
 .../kernel_tests/mvn_full_covariance_test.py  |  2 +-
 .../python/ops/autoregressive.py              |  2 +-
 .../distributions/python/ops/binomial.py      |  2 +-
 .../distributions/python/ops/cauchy.py        |  2 +-
 .../contrib/distributions/python/ops/chi2.py  |  4 +-
 .../distributions/python/ops/deterministic.py |  2 +-
 .../distributions/python/ops/geometric.py     |  2 +-
 .../distributions/python/ops/gumbel.py        |  2 +-
 .../distributions/python/ops/half_normal.py   |  2 +-
 .../distributions/python/ops/independent.py   |  2 +-
 .../distributions/python/ops/inverse_gamma.py |  4 +-
 .../distributions/python/ops/kumaraswamy.py   |  9 +++--
 .../distributions/python/ops/logistic.py      |  2 +-
 .../distributions/python/ops/mixture.py       |  2 +-
 .../python/ops/mixture_same_family.py         |  2 +-
 .../distributions/python/ops/mvn_diag.py      |  4 +-
 .../python/ops/mvn_diag_plus_low_rank.py      |  2 +-
 .../python/ops/mvn_full_covariance.py         |  2 +-
 .../python/ops/mvn_linear_operator.py         |  2 +-
 .../distributions/python/ops/mvn_tril.py      |  2 +-
 .../python/ops/negative_binomial.py           |  2 +-
 .../python/ops/onehot_categorical.py          |  2 +-
 .../distributions/python/ops/poisson.py       |  2 +-
 .../python/ops/poisson_lognormal.py           |  2 +-
 .../python/ops/quantized_distribution.py      |  2 +-
 .../python/ops/relaxed_bernoulli.py           |  2 +-
 .../python/ops/relaxed_onehot_categorical.py  |  2 +-
 .../distributions/python/ops/sinh_arcsinh.py  |  3 +-
 .../python/ops/vector_diffeomixture.py        |  2 +-
 .../python/ops/vector_exponential_diag.py     |  2 +-
 .../ops/vector_exponential_linear_operator.py |  2 +-
 .../python/ops/vector_sinh_arcsinh_diag.py    |  2 +-
 .../python/ops/vector_student_t.py            |  2 +-
 .../distributions/python/ops/wishart.py       | 10 ++---
 .../python/ops/distributions/bernoulli.py     |  2 +-
 tensorflow/python/ops/distributions/beta.py   |  6 +--
 .../python/ops/distributions/categorical.py   |  2 +-
 .../python/ops/distributions/dirichlet.py     |  2 +-
 .../distributions/dirichlet_multinomial.py    |  2 +-
 .../python/ops/distributions/distribution.py  |  6 ++-
 .../python/ops/distributions/exponential.py   |  4 +-
 tensorflow/python/ops/distributions/gamma.py  |  4 +-
 .../python/ops/distributions/laplace.py       |  4 +-
 .../python/ops/distributions/multinomial.py   |  2 +-
 tensorflow/python/ops/distributions/normal.py |  4 +-
 .../python/ops/distributions/student_t.py     |  4 +-
 .../distributions/transformed_distribution.py |  2 +-
 .../python/ops/distributions/uniform.py       |  2 +-
 49 files changed, 100 insertions(+), 75 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
index 68e0d9cb82..f42feae25d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
@@ -190,11 +190,30 @@ class DistributionTest(test.TestCase):
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertTrue(y.get_shape().ndims is None)
 
+  def testNameScopeWorksCorrectly(self):
+    x = tfd.Normal(loc=0., scale=1., name="x")
+    x_duplicate = tfd.Normal(loc=0., scale=1., name="x")
+    with ops.name_scope("y") as name:
+      y = tfd.Bernoulli(logits=0., name=name)
+    x_sample = x.sample(name="custom_sample")
+    x_sample_duplicate = x.sample(name="custom_sample")
+    x_log_prob = x.log_prob(0., name="custom_log_prob")
+    x_duplicate_sample = x_duplicate.sample(name="custom_sample")
+
+    self.assertEqual(x.name, "x/")
+    self.assertEqual(x_duplicate.name, "x_1/")
+    self.assertEqual(y.name, "y/")
+    self.assertTrue(x_sample.name.startswith("x/custom_sample"))
+    self.assertTrue(x_sample_duplicate.name.startswith("x/custom_sample_1"))
+    self.assertTrue(x_log_prob.name.startswith("x/custom_log_prob"))
+    self.assertTrue(x_duplicate_sample.name.startswith(
+        "x_1/custom_sample"))
+
   def testStrWorksCorrectlyScalar(self):
     normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
     self.assertEqual(
         ("tf.distributions.Normal("
-         "\"Normal\", "
+         "\"Normal/\", "
          "batch_shape=(), "
          "event_shape=(), "
          "dtype=float16)"),  # Got the dtype right.
@@ -203,7 +222,7 @@ class DistributionTest(test.TestCase):
     chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
     self.assertEqual(
         ("tf.distributions.Chi2("
-         "\"silly\", "  # What a silly name that is!
+         "\"silly/\", "  # What a silly name that is!
          "batch_shape=(2,), "
          "event_shape=(), "
          "dtype=float32)"),
@@ -211,7 +230,7 @@ class DistributionTest(test.TestCase):
 
     exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
     self.assertEqual(
-        ("tf.distributions.Exponential(\"Exponential\", "
+        ("tf.distributions.Exponential(\"Exponential/\", "
          # No batch shape.
          "event_shape=(), "
          "dtype=float32)"),
@@ -222,7 +241,7 @@ class DistributionTest(test.TestCase):
         loc=np.zeros([2, 2]), name="MVN")
     self.assertEqual(
         ("tf.distributions.MultivariateNormalDiag("
-         "\"MVN\", "
+         "\"MVN/\", "
          "batch_shape=(2,), "
          "event_shape=(2,), "
          "dtype=float64)"),
@@ -233,7 +252,7 @@ class DistributionTest(test.TestCase):
         name="MVN2")
     self.assertEqual(
         ("tf.distributions.MultivariateNormalDiag("
-         "\"MVN2\", "
+         "\"MVN2/\", "
          "batch_shape=(?,), "  # Partially known.
          "event_shape=(3,), "
          "dtype=float32)"),
@@ -243,7 +262,7 @@ class DistributionTest(test.TestCase):
     normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
     self.assertEqual(
         ("<tf.distributions.Normal"
-         " 'Normal'"
+         " 'Normal/'"
          " batch_shape=()"
          " event_shape=()"
          " dtype=float16>"),  # Got the dtype right.
@@ -252,7 +271,7 @@ class DistributionTest(test.TestCase):
     chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly")
     self.assertEqual(
         ("<tf.distributions.Chi2"
-         " 'silly'"  # What a silly name that is!
+         " 'silly/'"  # What a silly name that is!
          " batch_shape=(2,)"
          " event_shape=()"
          " dtype=float32>"),
@@ -261,7 +280,7 @@ class DistributionTest(test.TestCase):
     exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32))
     self.assertEqual(
         ("<tf.distributions.Exponential"
-         " 'Exponential'"
+         " 'Exponential/'"
          " batch_shape=<unknown>"
          " event_shape=()"
          " dtype=float32>"),
@@ -272,7 +291,7 @@ class DistributionTest(test.TestCase):
         loc=np.zeros([2, 2]), name="MVN")
     self.assertEqual(
         ("<tf.distributions.MultivariateNormalDiag"
-         " 'MVN'"
+         " 'MVN/'"
          " batch_shape=(2,)"
          " event_shape=(2,)"
          " dtype=float64>"),
@@ -283,7 +302,7 @@ class DistributionTest(test.TestCase):
         name="MVN2")
     self.assertEqual(
         ("<tf.distributions.MultivariateNormalDiag"
-         " 'MVN2'"
+         " 'MVN2/'"
          " batch_shape=(?,)"  # Partially known.
          " event_shape=(3,)"
          " dtype=float32>"),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
index 1a02fbefb8..7435bcbc68 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -52,7 +52,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       mu = [1., 2.]
       sigma = [[1., 0.], [0., 1.]]
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, name="Billy")
-      self.assertEqual(mvn.name, "Billy")
+      self.assertEqual(mvn.name, "Billy/")
 
   def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
index 69f3d57ff0..88ed012784 100644
--- a/tensorflow/contrib/distributions/python/ops/autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -145,7 +145,7 @@ class Autoregressive(distribution_lib.Distribution):
       ValueError: if `num_steps < 1`.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._distribution_fn = distribution_fn
       self._sample0 = sample0
       self._distribution0 = (distribution_fn() if sample0 is None
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index 6a1bb39ab2..12d1603178 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -164,7 +164,7 @@ class Binomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = self._maybe_assert_valid_total_count(
           ops.convert_to_tensor(total_count, name="total_count"),
           validate_args)
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
index 6f5d724a2a..daacfe657f 100644
--- a/tensorflow/contrib/distributions/python/ops/cauchy.py
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -121,7 +121,7 @@ class Cauchy(distribution.Distribution):
       TypeError: if `loc` and `scale` have different `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)]
                                     if validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index e610f469e5..c77c5fd208 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -88,7 +88,7 @@ class Chi2(gamma.Gamma):
     # not true in the parent class "gamma."  therefore, passing
     # allow_nan_stats=True
     # through to the parent class results in unnecessary asserts.
-    with ops.name_scope(name, values=[df]):
+    with ops.name_scope(name, values=[df]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(df),
       ] if validate_args else []):
@@ -120,7 +120,7 @@ class Chi2WithAbsDf(Chi2):
                allow_nan_stats=True,
                name="Chi2WithAbsDf"):
     parameters = locals()
-    with ops.name_scope(name, values=[df]):
+    with ops.name_scope(name, values=[df]) as name:
       super(Chi2WithAbsDf, self).__init__(
           df=math_ops.floor(
               math_ops.abs(df, name="abs_df"),
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 8049522e9f..a42350430e 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -87,7 +87,7 @@ class _BaseDeterministic(distribution.Distribution):
       ValueError:  If `loc` is a scalar.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, atol, rtol]):
+    with ops.name_scope(name, values=[loc, atol, rtol]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       if is_vector and validate_args:
         msg = "Argument loc must be at least rank 1."
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index 8f190e48a7..53dd42f4c8 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -86,7 +86,7 @@ class Geometric(distribution.Distribution):
     """
 
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index 8d05ad6b80..2c261073ee 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -125,7 +125,7 @@ class _Gumbel(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
index fc0751a6e0..d0df2befd6 100644
--- a/tensorflow/contrib/distributions/python/ops/half_normal.py
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -106,7 +106,7 @@ class HalfNormal(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._scale = array_ops.identity(scale, name="scale")
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index b1bacb91b0..fbde55ef31 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -119,7 +119,7 @@ class Independent(distribution_lib.Distribution):
     parameters = locals()
     name = name or "Independent" + distribution.name
     self._distribution = distribution
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       if reinterpreted_batch_ndims is None:
         reinterpreted_batch_ndims = self._get_default_reinterpreted_batch_ndims(
             distribution)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 51ac61dcf6..502bd4f493 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -126,7 +126,7 @@ class InverseGamma(distribution.Distribution):
       TypeError: if `concentration` and `rate` are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
           check_ops.assert_positive(rate),
@@ -281,7 +281,7 @@ class InverseGammaWithSoftplusConcentrationRate(InverseGamma):
                allow_nan_stats=True,
                name="InverseGammaWithSoftplusConcentrationRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       super(InverseGammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
                                     name="softplus_concentration"),
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
index 192dede6ff..66682b2ff5 100644
--- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -151,10 +151,11 @@ class Kumaraswamy(transformed_distribution.TransformedDistribution):
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
-    concentration1 = ops.convert_to_tensor(
-        concentration1, name="concentration1")
-    concentration0 = ops.convert_to_tensor(
-        concentration0, name="concentration0")
+    with ops.name_scope(name, values=[concentration1, concentration0]) as name:
+      concentration1 = ops.convert_to_tensor(
+          concentration1, name="concentration1")
+      concentration0 = ops.convert_to_tensor(
+          concentration0, name="concentration0")
     super(Kumaraswamy, self).__init__(
         distribution=uniform.Uniform(
             low=array_ops.zeros([], dtype=concentration1.dtype),
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 68e6bca5a5..c83b5bc2e3 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -120,7 +120,7 @@ class Logistic(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index cef6a143fc..2ef294af2e 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -145,7 +145,7 @@ class Mixture(distribution.Distribution):
           "none of the components provide a static number of ndims")
 
     # Ensure that all batch and event ndims are consistent.
-    with ops.name_scope(name, values=[cat.logits]):
+    with ops.name_scope(name, values=[cat.logits]) as name:
       num_components = cat.event_size
       static_num_components = tensor_util.constant_value(num_components)
       if static_num_components is None:
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index b93bdc5ab4..0b1301e551 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -131,7 +131,7 @@ class MixtureSameFamily(distribution.Distribution):
         `components_distribution` rightmost batch shape.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._mixture_distribution = mixture_distribution
       self._components_distribution = components_distribution
       self._runtime_assertions = []
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index e862552880..e3236c2db9 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -194,7 +194,7 @@ class MultivariateNormalDiag(
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
         # No need to validate_args while making diag_scale.  The returned
@@ -225,7 +225,7 @@ class MultivariateNormalDiagWithSoftplusScale(MultivariateNormalDiag):
                allow_nan_stats=True,
                name="MultivariateNormalDiagWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[scale_diag]):
+    with ops.name_scope(name, values=[scale_diag]) as name:
       super(MultivariateNormalDiagWithSoftplusScale, self).__init__(
           loc=loc,
           scale_diag=nn.softplus(scale_diag),
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 413e88f03a..2f6a6f198c 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -218,7 +218,7 @@ class MultivariateNormalDiagPlusLowRank(
     parameters = locals()
     def _convert_to_tensor(x, name):
       return None if x is None else ops.convert_to_tensor(x, name=name)
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier, scale_perturb_factor,
           scale_perturb_diag]):
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index 4bea99fbb7..86fcd4db54 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -159,7 +159,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
     parameters = locals()
 
     # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[loc, covariance_matrix]):
         if covariance_matrix is None:
           scale_tril = None
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index a739979289..44c92312c7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -176,7 +176,7 @@ class MultivariateNormalLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    with ops.name_scope(name, values=[loc] + scale.graph_parents) as name:
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index 6c7dc4ca7a..d6f8b731cb 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -184,7 +184,7 @@ class MultivariateNormalTriL(
       return None if x is None else ops.convert_to_tensor(x, name=name)
     if loc is None and scale_tril is None:
       raise ValueError("Must specify one or both of `loc`, `scale_tril`.")
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[loc, scale_tril]):
         loc = _convert_to_tensor(loc, name="loc")
         scale_tril = _convert_to_tensor(scale_tril, name="scale_tril")
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index 3a58df80da..eeaf9c0a5e 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -91,7 +91,7 @@ class NegativeBinomial(distribution.Distribution):
     """
 
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits, probs, validate_args=validate_args, name=name)
       with ops.control_dependencies(
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index e3e40b2e9c..305b138fdc 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -116,7 +116,7 @@ class OneHotCategorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
           multidimensional=True)
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index 02e97c0a2f..a84aad6fc9 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -94,7 +94,7 @@ class Poisson(distribution.Distribution):
       TypeError: if `log_rate` is not a float-type.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       if (rate is None) == (log_rate is None):
         raise ValueError("Must specify exactly one of `rate` and `log_rate`.")
       elif log_rate is None:
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 3314181898..19c99dcee9 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -256,7 +256,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
         `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       if loc is not None:
         loc = ops.convert_to_tensor(loc, name="loc")
       if scale is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index 8aebb79b91..1ef7651d03 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -217,7 +217,7 @@ class QuantizedDistribution(distributions.Distribution):
     values = (
         list(distribution.parameters.values()) +
         [low, high])
-    with ops.name_scope(name, values=values):
+    with ops.name_scope(name, values=values) as name:
       self._dist = distribution
 
       if low is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index e454a53c62..84c8d29072 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -166,7 +166,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
       ValueError: If both `probs` and `logits` are passed, or if neither.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs, temperature]):
+    with ops.name_scope(name, values=[logits, probs, temperature]) as name:
       with ops.control_dependencies([check_ops.assert_positive(temperature)]
                                     if validate_args else []):
         self._temperature = array_ops.identity(temperature, name="temperature")
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index 02cf3c7992..325f41e37c 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -163,7 +163,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs, temperature]):
+    with ops.name_scope(name, values=[logits, probs, temperature]) as name:
 
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           name=name, logits=logits, probs=probs, validate_args=validate_args,
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index cde6d85500..03828fa612 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -134,7 +134,8 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
     """
     parameters = locals()
 
-    with ops.name_scope(name, values=[loc, scale, skewness, tailweight]):
+    with ops.name_scope(name,
+                        values=[loc, scale, skewness, tailweight]) as name:
       loc = ops.convert_to_tensor(loc, name="loc")
       dtype = loc.dtype
       scale = ops.convert_to_tensor(scale, name="scale", dtype=dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index da271a852d..af6ff8162b 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -396,7 +396,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       ValueError: if `not distribution.is_scalar_event`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[mix_loc, temperature]):
+    with ops.name_scope(name, values=[mix_loc, temperature]) as name:
       if not scale or len(scale) < 2:
         raise ValueError("Must specify list (or list-like object) of scale "
                          "LinearOperators, one for each component with "
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index 526fe2d39a..e265b5d0f7 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -176,7 +176,7 @@ class VectorExponentialDiag(
       ValueError: if at most `scale_identity_multiplier` is specified.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
         # No need to validate_args while making diag_scale.  The returned
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index 9d5fd9ac41..89136d6760 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -181,7 +181,7 @@ class VectorExponentialLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    with ops.name_scope(name, values=[loc] + scale.graph_parents) as name:
       # Since expand_dims doesn't preserve constant-ness, we obtain the
       # non-dynamic value if possible.
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 05919be124..1438ede265 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -169,7 +169,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
         name,
         values=[
             loc, scale_diag, scale_identity_multiplier, skewness, tailweight
-        ]):
+        ]) as name:
       loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
       tailweight = 1. if tailweight is None else tailweight
       has_default_skewness = skewness is None
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 887981d64e..7e78ded9df 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -178,7 +178,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
     parameters = locals()
     graph_parents = [df, loc, scale_identity_multiplier, scale_diag,
                      scale_tril, scale_perturb_factor, scale_perturb_diag]
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=graph_parents):
         # The shape of the _VectorStudentT distribution is governed by the
         # relationship between df.batch_shape and affine.batch_shape. In
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 5a8c94dabf..91453fed5d 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -109,7 +109,7 @@ class _WishartLinearOperator(distribution.Distribution):
     """
     parameters = locals()
     self._cholesky_input_output_matrices = cholesky_input_output_matrices
-    with ops.name_scope(name) as ns:
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[df, scale_operator]):
         if not scale_operator.dtype.is_floating:
           raise TypeError(
@@ -163,7 +163,7 @@ class _WishartLinearOperator(distribution.Distribution):
         parameters=parameters,
         graph_parents=([self._df, self._dimension] +
                        self._scale_operator.graph_parents),
-        name=ns)
+        name=name)
 
   @property
   def df(self):
@@ -531,7 +531,7 @@ class WishartCholesky(_WishartLinearOperator):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
@@ -647,7 +647,7 @@ class WishartFull(_WishartLinearOperator):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name) as ns:
+    with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
@@ -666,5 +666,5 @@ class WishartFull(_WishartLinearOperator):
         cholesky_input_output_matrices=cholesky_input_output_matrices,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
-        name=ns)
+        name=name)
     self._parameters = parameters
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 68aaf3815e..2c9f0e9a32 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -72,7 +72,7 @@ class Bernoulli(distribution.Distribution):
       ValueError: If p and logits are passed, or if neither are passed.
     """
     parameters = locals()
-    with ops.name_scope(name):
+    with ops.name_scope(name) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 469bcadb8e..8beab99bf8 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -151,7 +151,7 @@ class Beta(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration1, concentration0]):
+    with ops.name_scope(name, values=[concentration1, concentration0]) as name:
       self._concentration1 = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration1, name="concentration1"),
           validate_args)
@@ -323,7 +323,7 @@ class BetaWithSoftplusConcentration(Beta):
                name="BetaWithSoftplusConcentration"):
     parameters = locals()
     with ops.name_scope(name, values=[concentration1,
-                                      concentration0]) as ns:
+                                      concentration0]) as name:
       super(BetaWithSoftplusConcentration, self).__init__(
           concentration1=nn.softplus(concentration1,
                                      name="softplus_concentration1"),
@@ -331,7 +331,7 @@ class BetaWithSoftplusConcentration(Beta):
                                      name="softplus_concentration0"),
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
-          name=ns)
+          name=name)
     self._parameters = parameters
 
 
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 9161e3fa9f..66fa9e110c 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -183,7 +183,7 @@ class Categorical(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[logits, probs]):
+    with ops.name_scope(name, values=[logits, probs]) as name:
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 25afeec936..eafcd5c78f 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -155,7 +155,7 @@ class Dirichlet(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration]):
+    with ops.name_scope(name, values=[concentration]) as name:
       self._concentration = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration, name="concentration"),
           validate_args)
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 03a98c56ba..fe0ed7e07d 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -192,7 +192,7 @@ class DirichletMultinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, concentration]):
+    with ops.name_scope(name, values=[total_count, concentration]) as name:
       # Broadcasting works because:
       # * The broadcasting convention is to prepend dimensions of size [1], and
       #   we use the last dimension for the distribution, whereas
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 7c43bf54fc..3815abf72d 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -434,13 +434,17 @@ class Distribution(_BaseDistribution):
     for i, t in enumerate(graph_parents):
       if t is None or not tensor_util.is_tensor(t):
         raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
+    if not name or name[-1] != "/":  # `name` is not a name scope
+      non_unique_name = name or type(self).__name__
+      with ops.name_scope(non_unique_name) as name:
+        pass
     self._dtype = dtype
     self._reparameterization_type = reparameterization_type
     self._allow_nan_stats = allow_nan_stats
     self._validate_args = validate_args
     self._parameters = parameters or {}
     self._graph_parents = graph_parents
-    self._name = name or type(self).__name__
+    self._name = name
 
   @classmethod
   def param_shapes(cls, sample_shape, name="DistributionParamShapes"):
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 6345a76d48..cf0e729e1a 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -95,7 +95,7 @@ class Exponential(gamma.Gamma):
     # true in the parent class "Gamma."  Therefore, passing
     # allow_nan_stats=True
     # through to the parent class results in unnecessary asserts.
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       self._rate = ops.convert_to_tensor(rate, name="rate")
     super(Exponential, self).__init__(
         concentration=array_ops.ones([], dtype=self._rate.dtype),
@@ -144,7 +144,7 @@ class ExponentialWithSoftplusRate(Exponential):
                allow_nan_stats=True,
                name="ExponentialWithSoftplusRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[rate]):
+    with ops.name_scope(name, values=[rate]) as name:
       super(ExponentialWithSoftplusRate, self).__init__(
           rate=nn.softplus(rate, name="softplus_rate"),
           validate_args=validate_args,
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index adb1f4f9a8..d39f7c56d3 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -127,7 +127,7 @@ class Gamma(distribution.Distribution):
       TypeError: if `concentration` and `rate` are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       with ops.control_dependencies([
           check_ops.assert_positive(concentration),
           check_ops.assert_positive(rate),
@@ -262,7 +262,7 @@ class GammaWithSoftplusConcentrationRate(Gamma):
                allow_nan_stats=True,
                name="GammaWithSoftplusConcentrationRate"):
     parameters = locals()
-    with ops.name_scope(name, values=[concentration, rate]):
+    with ops.name_scope(name, values=[concentration, rate]) as name:
       super(GammaWithSoftplusConcentrationRate, self).__init__(
           concentration=nn.softplus(concentration,
                                     name="softplus_concentration"),
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index e98ac855c5..3ccfc618d1 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -101,7 +101,7 @@ class Laplace(distribution.Distribution):
       TypeError: if `loc` and `scale` are of different dtype.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -218,7 +218,7 @@ class LaplaceWithSoftplusScale(Laplace):
                allow_nan_stats=True,
                name="LaplaceWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       super(LaplaceWithSoftplusScale, self).__init__(
           loc=loc,
           scale=nn.softplus(scale, name="softplus_scale"),
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 4ae67a009b..ab77f5c1f8 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -183,7 +183,7 @@ class Multinomial(distribution.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[total_count, logits, probs]):
+    with ops.name_scope(name, values=[total_count, logits, probs]) as name:
       self._total_count = ops.convert_to_tensor(total_count, name="total_count")
       if validate_args:
         self._total_count = (
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 32e8a49c81..20d4420e91 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -132,7 +132,7 @@ class Normal(distribution.Distribution):
       TypeError: if `loc` and `scale` have different `dtype`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[loc, scale]):
+    with ops.name_scope(name, values=[loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(scale)] if
                                     validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
@@ -244,7 +244,7 @@ class NormalWithSoftplusScale(Normal):
                allow_nan_stats=True,
                name="NormalWithSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[scale]):
+    with ops.name_scope(name, values=[scale]) as name:
       super(NormalWithSoftplusScale, self).__init__(
           loc=loc,
           scale=nn.softplus(scale, name="softplus_scale"),
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index 9d9e65b4e8..961b07a7bd 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -158,7 +158,7 @@ class StudentT(distribution.Distribution):
       TypeError: if loc and scale are different dtypes.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[df, loc, scale]):
+    with ops.name_scope(name, values=[df, loc, scale]) as name:
       with ops.control_dependencies([check_ops.assert_positive(df)]
                                     if validate_args else []):
         self._df = array_ops.identity(df, name="df")
@@ -350,7 +350,7 @@ class StudentTWithAbsDfSoftplusScale(StudentT):
                allow_nan_stats=True,
                name="StudentTWithAbsDfSoftplusScale"):
     parameters = locals()
-    with ops.name_scope(name, values=[df, scale]):
+    with ops.name_scope(name, values=[df, scale]) as name:
       super(StudentTWithAbsDfSoftplusScale, self).__init__(
           df=math_ops.floor(math_ops.abs(df)),
           loc=loc,
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1ad63a8cf6..6aa6ec40d9 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -257,7 +257,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     parameters = locals()
     name = name or (("" if bijector is None else bijector.name) +
                     distribution.name)
-    with ops.name_scope(name, values=[event_shape, batch_shape]):
+    with ops.name_scope(name, values=[event_shape, batch_shape]) as name:
       # For convenience we define some handy constants.
       self._zero = constant_op.constant(0, dtype=dtypes.int32, name="zero")
       self._empty = constant_op.constant([], dtype=dtypes.int32, name="empty")
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index 0891bffdd5..087797c653 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -103,7 +103,7 @@ class Uniform(distribution.Distribution):
       InvalidArgumentError: if `low >= high` and `validate_args=False`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[low, high]):
+    with ops.name_scope(name, values=[low, high]) as name:
       with ops.control_dependencies([
           check_ops.assert_less(
               low, high, message="uniform not defined when low >= high.")
-- 
GitLab


From 40e16d6301ee0c1334ce514350668a16d7debd9a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 15:47:12 -0700
Subject: [PATCH 1075/1262] Remove duplicate code.

PiperOrigin-RevId: 193430279
---
 tensorflow/contrib/autograph/impl/naming.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/impl/naming.py b/tensorflow/contrib/autograph/impl/naming.py
index 1facaa0ca0..b1d3f76be7 100644
--- a/tensorflow/contrib/autograph/impl/naming.py
+++ b/tensorflow/contrib/autograph/impl/naming.py
@@ -62,8 +62,6 @@ class Namer(object):
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
 
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
     if live_entity is not None:
       self.renamed_calls[live_entity] = new_name
-- 
GitLab


From 695da2d928b5927c0a4f73e352a597a19886f2cb Mon Sep 17 00:00:00 2001
From: HyoukJoong Lee <hyouklee@google.com>
Date: Wed, 18 Apr 2018 15:57:53 -0700
Subject: [PATCH 1076/1262] Disable failing test RGBToHSVTest.testBatch

PiperOrigin-RevId: 193431888
---
 tensorflow/compiler/tests/image_ops_test.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 12791ef8ac..5b19e993ec 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -37,6 +37,10 @@ from tensorflow.python.platform import test
 class RGBToHSVTest(XLATestCase):
 
   def testBatch(self):
+    # TODO(b/78230407): Reenable the test on GPU.
+    if self.device == "XLA_GPU":
+      return
+
     # Build an arbitrary RGB image
     np.random.seed(7)
     batch_size = 5
-- 
GitLab


From e9d47fbff0d644a75c6f3dcdcb852685ef515b64 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Wed, 18 Apr 2018 16:01:55 -0700
Subject: [PATCH 1077/1262] Adds dataset transformation function
 `set_stats_aggregator(..)`, which sets the given `stats_aggregator` for
 aggregating the input dataset stats.

PiperOrigin-RevId: 193432590
---
 .../kernel_tests/stats_dataset_ops_test.py    |  67 ++++-----
 .../contrib/data/python/ops/stats_ops.py      |  61 +++++---
 tensorflow/core/BUILD                         |   1 +
 .../api_def_IteratorSetStatsAggregator.pbtxt  |   4 -
 .../api_def_SetStatsAggregatorDataset.pbtxt   |   3 +
 .../api_def_IteratorSetStatsAggregator.pbtxt  |   4 -
 .../api_def_SetStatsAggregatorDataset.pbtxt   |   4 +
 .../data => framework}/stats_aggregator.h     |   6 +-
 tensorflow/core/kernels/data/BUILD            |  32 ++---
 tensorflow/core/kernels/data/iterator_ops.cc  |  32 +----
 .../data/stats_aggregator_dataset_op.cc       | 135 ++++++++++++++++++
 .../core/kernels/data/stats_aggregator_ops.cc |   2 +-
 .../core/kernels/data/stats_dataset_ops.cc    |   2 +-
 .../core/ops/compat/ops_history.v1.pbtxt      |  12 --
 tensorflow/core/ops/dataset_ops.cc            |  13 +-
 tensorflow/core/ops/ops.pbtxt                 |  12 --
 16 files changed, 240 insertions(+), 150 deletions(-)
 delete mode 100644 tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
 delete mode 100644 tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
 rename tensorflow/core/{kernels/data => framework}/stats_aggregator.h (94%)
 create mode 100644 tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 07bdf92044..7acbc676ce 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -50,17 +50,17 @@ class StatsDatasetTest(test.TestCase):
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
   def testBytesProduced(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
-            stats_ops.bytes_produced_stats("bytes_produced"))
+            stats_ops.bytes_produced_stats("bytes_produced")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       expected_sum = 0.0
       for i in range(100):
         self.assertAllEqual(
@@ -76,16 +76,16 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
   def testLatencyStats(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -95,16 +95,15 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
 
   def testReinitialize(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(stats_aggregator_subscriber)
       for j in range(5):
         sess.run(iterator.initializer)
         for i in range(100):
@@ -130,17 +129,17 @@ class StatsDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def testMultipleTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency_2"))
+            stats_ops.latency_stats("record_latency_2")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -154,17 +153,17 @@ class StatsDatasetTest(test.TestCase):
           sess.run(summary_t), "record_latency_2", 100.0)
 
   def testRepeatedTags(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency"))
+            stats_ops.latency_stats("record_latency")).apply(
+                stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
     next_element = iterator.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      sess.run(iterator.initializer)
       for i in range(100):
         self.assertEqual(i, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -174,19 +173,17 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
   def testMultipleIteratorsSameAggregator(self):
+    stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
     iterator_0 = dataset.make_initializable_iterator()
     iterator_1 = dataset.make_initializable_iterator()
-    stats_aggregator = stats_ops.StatsAggregator()
-    stats_aggregator_subscribers = [stats_aggregator.subscribe(iterator_0),
-                                    stats_aggregator.subscribe(iterator_1)]
     next_element = iterator_0.get_next() + iterator_1.get_next()
     summary_t = stats_aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer,
-                stats_aggregator_subscribers])
+      sess.run([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
         self.assertEqual(i * 2, sess.run(next_element))
         self._assertSummaryHasCount(
@@ -195,20 +192,6 @@ class StatsDatasetTest(test.TestCase):
         sess.run(next_element)
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
-  def testMultipleStatsAggregatorsSameIteratorFail(self):
-    dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency"))
-    iterator = dataset.make_initializable_iterator()
-    stats_aggregator_0 = stats_ops.StatsAggregator()
-    stats_aggregator_1 = stats_ops.StatsAggregator()
-
-    with self.test_session() as sess:
-      sess.run(stats_aggregator_0.subscribe(iterator))
-      # TODO(mrry): Consider making this allowable (and also allowing
-      # aggregators to unsubscribe).
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(stats_aggregator_1.subscribe(iterator))
-
 
 class StatsDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -253,5 +236,9 @@ class StatsDatasetSerializationTest(
         None, num_outputs)
 
 
+# TODO(shivaniagrawal): Can not checkpoint input_pipeline with the
+# transformation `stats_ops.set_stats_aggregator`, since we don't support
+# serializing StatsAggregator yet.
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index b5cf0fcfe9..d391720396 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -85,25 +84,53 @@ class StatsAggregator(object):
     """
     return gen_dataset_ops.stats_aggregator_summary(self._resource)
 
-  def subscribe(self, iterator):
-    """Returns a @{tf.Operation} to associate this aggregator with `iterator`.
 
-    Note: Each @{tf.data.Iterator} can be associated with at most one
-    `StatsAggregator`. After running the operation that this function
-    returns, all statistics recorded in the iteration of `iterator`
-    will be stored in `stats_aggregator`.
+class _SetStatsAggregatorDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and sets given stats_aggregator."""
 
-    Args:
-      iterator: A @{tf.data.Iterator} object.
+  def __init__(self, input_dataset, stats_aggregator):
+    super(_SetStatsAggregatorDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._stats_aggregator = stats_aggregator
 
-    Returns:
-      A @{tf.Operation} that, when run, associates this aggregator with
-      `iterator`.
-    """
-    if not isinstance(iterator, iterator_ops.Iterator):
-      raise TypeError("`iterator` must be a `tf.data.Iterator` object.")
-    return gen_dataset_ops.iterator_set_stats_aggregator(
-        iterator._iterator_resource, self._resource)  # pylint: disable=protected-access
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.set_stats_aggregator_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._stats_aggregator._resource,  # pylint: disable=protected-access
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+# TODO(shivaniagrawal): Expose these methods in `tf.contrib.data`.
+def set_stats_aggregator(stats_aggregator):
+  """Set the given stats_aggregator for aggregating the input dataset stats.
+
+  Args:
+    stats_aggregator: A `StatsAggregator` object.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _SetStatsAggregatorDataset(dataset, stats_aggregator)
+
+  return _apply_fn
 
 
 def bytes_produced_stats(tag):
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 21f929894c..54e7ab31d7 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -547,6 +547,7 @@ tf_cuda_library(
         "framework/selective_registration.h",
         "framework/session_state.h",
         "framework/shape_inference.h",
+        "framework/stats_aggregator.h",
         "framework/tensor.h",
         "framework/tensor_shape.h",
         "framework/tensor_slice.h",
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
deleted file mode 100644
index c6f2212cd4..0000000000
--- a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IteratorSetStatsAggregator"
-  summary: "Associates the given iterator with the given statistics aggregator."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000..77123e143b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt
deleted file mode 100644
index db51ae3873..0000000000
--- a/tensorflow/core/api_def/python_api/api_def_IteratorSetStatsAggregator.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IteratorSetStatsAggregator"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000..3a8c1036ca
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/data/stats_aggregator.h b/tensorflow/core/framework/stats_aggregator.h
similarity index 94%
rename from tensorflow/core/kernels/data/stats_aggregator.h
rename to tensorflow/core/framework/stats_aggregator.h
index 076a56b0bf..a449f324e6 100644
--- a/tensorflow/core/kernels/data/stats_aggregator.h
+++ b/tensorflow/core/framework/stats_aggregator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
 
 #include <memory>
 #include <string>
@@ -81,4 +81,4 @@ class StatsAggregatorResource : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index e856ede44b..221724e25d 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -13,20 +13,10 @@ load(
     "tf_cc_test",
 )
 
-cc_library(
-    name = "stats_aggregator",
-    hdrs = ["stats_aggregator.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
-)
-
 tf_kernel_library(
     name = "stats_aggregator_ops",
     srcs = ["stats_aggregator_ops.cc"],
     deps = [
-        ":stats_aggregator",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -38,14 +28,7 @@ cc_library(
     name = "dataset",
     srcs = [],
     hdrs = ["dataset.h"],
-    deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    deps = ["//tensorflow/core:framework"],
 )
 
 cc_library(
@@ -360,7 +343,6 @@ tf_kernel_library(
     srcs = ["stats_dataset_ops.cc"],
     deps = [
         ":dataset",
-        ":stats_aggregator",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -368,6 +350,16 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "stats_aggregator_dataset_op",
+    srcs = ["stats_aggregator_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "random_dataset_op",
     srcs = ["random_dataset_op.cc"],
@@ -510,7 +502,6 @@ tf_kernel_library(
     srcs = ["iterator_ops.cc"],
     deps = [
         ":dataset",
-        ":stats_aggregator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -564,6 +555,7 @@ tf_kernel_library(
         ":slide_dataset_op",
         ":sparse_tensor_slice_dataset_op",
         ":sql_dataset_ops",
+        ":stats_aggregator_dataset_op",
         ":stats_aggregator_ops",
         ":stats_dataset_ops",
         ":take_dataset_op",
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 780f927a4f..4e4997d7b3 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -203,10 +203,6 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-  void set_stats_aggregator(std::shared_ptr<StatsAggregator> stats_aggregator) {
-    mutex_lock l(mu_);
-    stats_aggregator_ = std::move(stats_aggregator);
-  }
 
   std::shared_ptr<StatsAggregator> stats_aggregator() {
     tf_shared_lock l(mu_);
@@ -1075,30 +1071,6 @@ class DeserializeIteratorOp : public OpKernel {
   }
 };
 
-class IteratorSetStatsAggregatorOp : public OpKernel {
- public:
-  explicit IteratorSetStatsAggregatorOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    IteratorResource* iterator_resource;
-    OP_REQUIRES_OK(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
-    core::ScopedUnref unref_iterator(iterator_resource);
-
-    StatsAggregatorResource* stats_aggregator_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
-                                       &stats_aggregator_resource));
-    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
-    // TODO(mrry): Consider allowing multiple StatsAggregator ops to
-    // subscribe to updates, and/or unsubscribing.
-    OP_REQUIRES(ctx, !iterator_resource->stats_aggregator(),
-                errors::FailedPrecondition(
-                    "Iterator already associated with a StatsAggregator"));
-    iterator_resource->set_stats_aggregator(
-        stats_aggregator_resource->stats_aggregator());
-  }
-};
 
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
@@ -1119,8 +1091,6 @@ REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("DeserializeIterator").Device(DEVICE_CPU),
                         DeserializeIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorSetStatsAggregator").Device(DEVICE_CPU),
-                        IteratorSetStatsAggregatorOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
new file mode 100644
index 0000000000..eb96b8a872
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+namespace {
+
+class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SetStatsAggregatorDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    StatsAggregatorResource* stats_aggregator_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &stats_aggregator_resource));
+    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+
+    *output = new Dataset(ctx, input, stats_aggregator_resource);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
+                     StatsAggregatorResource* stats_aggregator_resource)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          stats_aggregator_resource_(stats_aggregator_resource) {
+      input_->Ref();
+      stats_aggregator_resource_->Ref();
+    }
+
+    ~Dataset() override {
+      input_->Unref();
+      stats_aggregator_resource_->Unref();
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return "SetStatsAggregatorDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(
+          "Cannot currently serialize the `stats_aggregator` for a "
+          "SetStatsAggregatorDataset.");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        StatsAggregatorResource* stats_aggregator_resource =
+            dataset()->stats_aggregator_resource_;
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        params.stats_aggregator_getter = [stats_aggregator_resource]() {
+          return stats_aggregator_resource->stats_aggregator();
+        };
+        params.lib = ctx->lib();
+        params.function_library = ctx->function_library();
+        params.allocator_getter = ctx->allocator_getter();
+        IteratorContext set_stats_aggregator_ctx(params);
+        return input_impl_->GetNext(&set_stats_aggregator_ctx, out_tensors,
+                                    end_of_sequence);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    StatsAggregatorResource* stats_aggregator_resource_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("SetStatsAggregatorDataset").Device(DEVICE_CPU),
+                        SetStatsAggregatorDatasetOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
index 17103627e0..dd37311580 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 
 #include <memory>
 
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 4dc1343e21..633cd85451 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 5bd37efac8..031932d79f 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -25657,18 +25657,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
 op {
   name: "IteratorToStringHandle"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index b25abbcc67..57f871af32 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -151,6 +151,14 @@ REGISTER_OP("LatencyStatsDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("SetStatsAggregatorDataset")
+    .Input("input_dataset: variant")
+    .Input("stats_aggregator: resource")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -506,11 +514,6 @@ REGISTER_OP("StatsAggregatorHandle")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''");
 
-REGISTER_OP("IteratorSetStatsAggregator")
-    .Input("iterator_handle: resource")
-    .Input("stats_aggregator_handle: resource")
-    .SetShapeFn(shape_inference::NoOutputs);
-
 REGISTER_OP("StatsAggregatorSummary")
     .Input("iterator: resource")
     .Output("summary: string")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index a36608ded3..4ae1c3d7e0 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -12364,18 +12364,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
 op {
   name: "IteratorToStringHandle"
   input_arg {
-- 
GitLab


From fddfa9f8dcd1a922ade5362c0538ca39e99472a7 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Wed, 18 Apr 2018 16:35:44 -0700
Subject: [PATCH 1078/1262] Change distribution.distribute_dataset to accept an
 input_fn instead of a dataset.

PiperOrigin-RevId: 193437651
---
 .../distribute/python/minimize_loss_test.py   | 31 +++++++++--------
 .../distribute/python/mirrored_strategy.py    |  5 +--
 .../python/mirrored_strategy_multigpu_test.py |  4 +--
 .../distribute/python/one_device_strategy.py  |  4 +--
 .../distribute/python/optimizer_v2_test.py    |  4 +--
 .../distribute/python/single_loss_example.py  | 33 ++++++++++++-------
 .../contrib/distribute/python/step_fn.py      | 14 ++++----
 tensorflow/python/BUILD                       |  1 +
 tensorflow/python/estimator/estimator.py      | 21 +++++-------
 tensorflow/python/training/distribute.py      | 21 +++++++++---
 10 files changed, 79 insertions(+), 59 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index d7fbf7f379..6c73250ded 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -54,21 +54,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
   def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                        is_tpu):
     with distribution.scope():
-      model_fn, dataset, layer = minimize_loss_example(
-          optimizer_fn,
-          use_bias=True,
-          use_callable_loss=use_callable_loss)
+      model_fn, dataset_fn, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
+      def tpu_dataset_fn():
+        return dataset_fn().batch(2)
       # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
       # `DistributionStrategy.create_monitor` so that each DistributionStrategy
       # could influence its training loop. That method would return an instance
       # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
       # tpu.shutdown_system().
-      if is_tpu:
-        dataset = dataset.batch(2)
-
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          tpu_dataset_fn if is_tpu else dataset_fn).make_one_shot_iterator()
 
       def run_step():
         # TODO(isaprykin): Make iterator get_next() return a list of sub-
@@ -122,14 +119,14 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     # `distribution.scope`.
     with variable_scope.variable_creator_scope(
         appending_creator), distribution.scope():
-      model_fn, dataset, layer = minimize_loss_example(
+      model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn,
           use_bias=True,
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
@@ -176,7 +173,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     """Verifies that moving mean updates are reduced across towers."""
     with distribution.scope():
       num_towers = len(distribution.worker_devices)
-      model_fn, dataset, batchnorm = batchnorm_example(
+      model_fn, dataset_fn, batchnorm = batchnorm_example(
           optimizer_fn,
           batch_per_epoch=num_towers,
           momentum=momentum,
@@ -188,7 +185,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
         distribution._prefetch_on_device = False
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(
@@ -260,11 +257,13 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         else:
           return optimizer.minimize(loss_fn())
 
-      features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
-      labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
-      dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
+      def dataset_fn():
+        features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
+        labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
+        return dataset_ops.Dataset.zip((features, labels)).repeat()
+
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return distribution.group(
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index d5e22e8100..6efd578a77 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -140,9 +140,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       g.add_to_collections(collections, result)
     return result
 
-  def distribute_dataset(self, dataset):
+  def distribute_dataset(self, dataset_fn):
     return values.PerDeviceDataset(
-        dataset, self._devices, self._prefetch_on_device)
+        self._call_dataset_fn(dataset_fn), self._devices,
+        self._prefetch_on_device)
 
   def _broadcast(self, tensor, destinations):
     # TODO(josh11b): In eager mode, use one thread per device, or async mode.
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 59cd6703b9..6c5c055070 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -247,9 +247,9 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     dist = mirrored_strategy.MirroredStrategy(
         ["/device:GPU:0", "/device:CPU:0"])
-    features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
     features = dist.distribute_dataset(
-        features).make_one_shot_iterator().get_next()
+        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
+    ).make_one_shot_iterator().get_next()
 
     with dist.scope():
       result = dist.call_for_each_tower(
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 2002266dd5..646d2a5c3b 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -60,8 +60,8 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     with ops.colocate_with(colocate_with):
       return next_creator(*args, **kwargs)
 
-  def distribute_dataset(self, dataset):
-    return dataset
+  def distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
 
   def _broadcast(self, tensor, destinations):
     return tensor
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index 6e4d050073..abd3a65ac4 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -39,11 +39,11 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
   def testTrainNetwork(self, distribution, optimizer_fn,
                        use_callable_loss=True):
     with distribution.scope():
-      model_fn, dataset, layer = minimize_loss_example(
+      model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
       iterator = distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index cef5fd2f89..9e8f919c8a 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -29,7 +29,10 @@ from tensorflow.python.ops import math_ops
 
 def single_loss_example(optimizer_fn, distribution, use_bias=False):
   """Build a very simple network to use in tests and examples."""
-  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
+  def dataset_fn():
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
   optimizer = optimizer_fn()
   layer = core.Dense(1, use_bias=use_bias)
 
@@ -37,8 +40,8 @@ def single_loss_example(optimizer_fn, distribution, use_bias=False):
     y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
     return y * y
 
-  single_loss_step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer,
-                                                    distribution)
+  single_loss_step = step_fn.StandardSingleLossStep(dataset_fn, loss_fn,
+                                                    optimizer, distribution)
 
   # Layer is returned for inspecting the kernels in tests.
   return single_loss_step, layer
@@ -49,7 +52,10 @@ def minimize_loss_example(optimizer_fn,
                           use_callable_loss=True,
                           create_optimizer_inside_model_fn=False):
   """Example of non-distribution-aware legacy code."""
-  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
+  def dataset_fn():
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
   if not create_optimizer_inside_model_fn:
@@ -71,7 +77,7 @@ def minimize_loss_example(optimizer_fn,
     else:
       return optimizer.minimize(loss_fn())
 
-  return model_fn, dataset, layer
+  return model_fn, dataset_fn, layer
 
 
 def batchnorm_example(optimizer_fn,
@@ -79,12 +85,15 @@ def batchnorm_example(optimizer_fn,
                       momentum=0.9,
                       renorm=False):
   """Example of non-distribution-aware legacy code with batch normalization."""
-  # input shape is [16, 8], input values are increasing in both dimensions.
-  dataset = dataset_ops.Dataset.from_tensor_slices(
-      [[[float(x * 8 + y + z * 100)
-         for y in range(8)]
-        for x in range(16)]
-       for z in range(batch_per_epoch)]).repeat()
+
+  def dataset_fn():
+    # input shape is [16, 8], input values are increasing in both dimensions.
+    return dataset_ops.Dataset.from_tensor_slices(
+        [[[float(x * 8 + y + z * 100)
+           for y in range(8)]
+          for x in range(16)]
+         for z in range(batch_per_epoch)]).repeat()
+
   optimizer = optimizer_fn()
   batchnorm = normalization.BatchNormalization(
       renorm=renorm, momentum=momentum, fused=False)
@@ -99,4 +108,4 @@ def batchnorm_example(optimizer_fn,
     # Callable loss.
     return optimizer.minimize(loss_fn)
 
-  return model_fn, dataset, batchnorm
+  return model_fn, dataset_fn, batchnorm
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 68b8f4d626..d1910622b3 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -49,13 +49,14 @@ class StandardInputStep(Step):
   """Step with a standard implementation of input handling.
 
   Args:
-    input_dataset: a tf.data Dataset that provides input.
+    dataset_fn: a function that returns a tf.data Dataset that produces the
+      input for the model.
   """
 
-  def __init__(self, input_dataset, distribution):
+  def __init__(self, dataset_fn, distribution):
     Step.__init__(self, distribution)
     self._distributed_input = distribution.distribute_dataset(
-        input_dataset).make_one_shot_iterator()
+        dataset_fn).make_one_shot_iterator()
 
   def inputs(self):
     return self._distributed_input.get_next()
@@ -77,14 +78,15 @@ class StandardSingleLossStep(StandardInputStep):
   ```
 
   Args:
-    input_dataset: a tf.data Dataset that provides input.
+    dataset_fn: a function that returns a tf.data Dataset that produces the
+      input for the model.
     loss_fn: a function that returns loss.
     optimizer: an optimizer that implements an update rule.
     distribution: a `DistributionStrategy` object.
   """
 
-  def __init__(self, input_dataset, loss_fn, optimizer, distribution):
-    StandardInputStep.__init__(self, input_dataset, distribution)
+  def __init__(self, dataset_fn, loss_fn, optimizer, distribution):
+    StandardInputStep.__init__(self, dataset_fn, distribution)
     self._loss_fn = loss_fn
     self._optimizer = optimizer
     self._is_run_concurrently = False
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c2bedab4f9..698e2a28bf 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3048,6 +3048,7 @@ py_library(
         ":state_ops",
         ":util",
         ":variable_scope",
+        "//tensorflow/python/data",
         "//tensorflow/python/ops/losses",
     ],
 )
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index dde463aaf4..a42b6cfee8 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -688,22 +688,19 @@ class Estimator(object):
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
     """Extracts the `features` and labels from return values of `input_fn`."""
-    result = self._call_input_fn(input_fn, mode)
-    # TODO(anjalisridhar): What about the default DistributionStrategy? Perhaps
-    # using any input is alright in that case. There is also a
-    # has_dataset_or_queue_runner function that we may want to extend and use.
-    if (self._distribution is not None and
-        not isinstance(result, dataset_ops.Dataset) and
-        mode == model_fn_lib.ModeKeys.TRAIN):
-      raise ValueError('input_fn() must return a tf.data.Dataset when using a '
-                       'DistributionStrategy.')
     input_hooks = []
-    if isinstance(result, dataset_ops.Dataset):
-      if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
-        result = self._distribution.distribute_dataset(result)
+    if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
+      result = self._distribution.distribute_dataset(
+          lambda: self._call_input_fn(input_fn, mode))
       iterator = result.make_initializable_iterator()
       input_hooks.append(_DatasetInitializerHook(iterator))
       result = iterator.get_next()
+    else:
+      result = self._call_input_fn(input_fn, mode)
+      if isinstance(result, dataset_ops.Dataset):
+        iterator = result.make_initializable_iterator()
+        input_hooks.append(_DatasetInitializerHook(iterator))
+        result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index d855c4f551..21ec5292ad 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import threading
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -672,25 +673,35 @@ class DistributionStrategy(object):
     _require_distribution_strategy_scope(self)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
+  def _call_dataset_fn(self, dataset_fn):
+    result = dataset_fn()
+    if not isinstance(result, dataset_ops.Dataset):
+      raise ValueError(
+          "dataset_fn() must return a tf.data.Dataset when using a "
+          "DistributionStrategy.")
+    return result
+
   # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
   # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
   # Extend to implement more functionality of datasets.
-  def distribute_dataset(self, dataset):
+  def distribute_dataset(self, dataset_fn):
     """Return a `dataset` split across all towers.
 
     Suitable for providing input to for `call_for_each_tower()` by creating an
     iterator:
 
     ```
+    def dataset_fn():
+      return tf.data.Dataset.from_tensors([[1.]]).repeat()
     with distribution_strategy.scope():
-      distributed_dataset = distribution_strategy.distribute_dataset(dataset)
+      distributed_dataset = distribution_strategy.distribute_dataset(dataset_fn)
       iterator = distributed_dataset.make_one_shot_iterator()
       tower_results = distribution_strategy.call_for_each_tower(
           tower_fn, iterator.get_next())
     ```
 
     Args:
-      dataset: A `tf.data.Dataset`.
+      dataset_fn: A function that returns a `tf.data.Dataset`.
 
     Returns:
       A `PerDeviceDataset` that will produce data for each tower.
@@ -1135,8 +1146,8 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     _require_distribution_strategy_scope(self)
     return ops.colocate_with(colocate_with_variable)
 
-  def distribute_dataset(self, dataset):
-    return dataset
+  def distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
 
   def _broadcast(self, tensor, destinations):
     if destinations is None:
-- 
GitLab


From 5ec3b021fd7e509a1597880ff093802de1f63d42 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 18 Apr 2018 16:48:17 -0700
Subject: [PATCH 1079/1262] Add tf.train.Checkpoint for reading and writing
 object-based checkpoints.

Previously exposed as tf.contrib.eager.Checkpoint / tfe.Checkpoint.

Spiffies up the documentation a bit, but otherwise just adds the export decorator.

Compatible in both directions with tf.train.Saver (object-based checkpoints can be fed to tf.train.Saver, and name-based checkpoints can be fed to tf.train.Checkpoint).

PiperOrigin-RevId: 193439442
---
 .../python/training/checkpointable_utils.py   | 189 ++++++++++++++++--
 tensorflow/python/training/saver.py           |   4 +-
 tensorflow/python/training/training.py        |   1 +
 .../golden/tensorflow.train.-checkpoint.pbtxt |  23 +++
 .../tools/api/golden/tensorflow.train.pbtxt   |   4 +
 5 files changed, 201 insertions(+), 20 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt

diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 2c4677a278..4769e15120 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -38,6 +38,7 @@ from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
 _ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
@@ -822,30 +823,92 @@ class CheckpointableSaver(object):
     return load_status
 
 
+@tf_export("train.Checkpoint")
 class Checkpoint(checkpointable_lib.Checkpointable):
-  """A utility class which groups `Checkpointable` objects.
+  """Groups checkpointable objects, saving and restoring them.
 
-  Accepts arbitrary keyword arguments to its constructor and saves those values
-  with a checkpoint. Maintains a `save_counter` for numbering checkpoints.
+  `Checkpoint`'s constructor accepts keyword arguments whose values are types
+  that contain checkpointable state, such as `tf.train.Optimizer`
+  implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
+  `tf.keras.Model` implementations. It saves these values with a checkpoint, and
+  maintains a `save_counter` for numbering checkpoints.
 
-  Example usage:
+  Example usage when graph building:
 
   ```python
   import tensorflow as tf
-  import tensorflow.contrib.eager as tfe
   import os
 
   checkpoint_directory = "/tmp/training_checkpoints"
   checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
 
-  root = tfe.Checkpoint(optimizer=optimizer, model=model)
-  root.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  train_op = optimizer.minimize( ... )
+  status.assert_consumed()  # Optional sanity checks.
+  with tf.Session() as session:
+    # Use the Session to restore variables, or initialize them if
+    # tf.train.latest_checkpoint returned None.
+    status.initialize_or_restore(session)
+    for _ in range(num_training_steps):
+      session.run(train_op)
+    checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  Example usage with eager execution enabled:
+
+  ```python
+  import tensorflow as tf
+  import os
+
+  tf.enable_eager_execution()
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
   for _ in range(num_training_steps):
-    optimizer.minimize( ... )
-  root.save(file_prefix=checkpoint_prefix)
+    optimizer.minimize( ... )  # Variables will be restored on creation.
+  status.assert_consumed()  # Optional sanity checks.
+  checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  `Checkpoint.save` and `Checkpoint.restore` write and read object-based
+  checkpoints, in contrast to `tf.train.Saver` which writes and reads
+  `variable.name` based checkpoints. Object-based checkpointing saves a graph of
+  dependencies between Python objects (`Layer`s, `Optimizer`s, `Variable`s,
+  etc.) with named edges, and this graph is used to match variables when
+  restoring a checkpoint. It can be more robust to changes in the Python
+  program, and helps to support restore-on-create for variables when executing
+  eagerly. Prefer `tf.train.Checkpoint` over `tf.train.Saver` for new code.
+
+  `Checkpoint` objects have dependencies on the objects passed as keyword
+  arguments to their constructors, and each dependency is given a name that is
+  identical to the name of the keyword argument for which it was created.
+  TensorFlow classes like `Layer`s and `Optimizer`s will automatically add
+  dependencies on their variables (e.g. "kernel" and "bias" for
+  `tf.keras.layers.Dense`). Inheriting from `tf.keras.Model` makes managing
+  dependencies easy in user-defined classes, since `Model` hooks into attribute
+  assignment. For example:
+
+  ```python
+  class Regress(tf.keras.Model):
+
+    def __init__(self):
+      super(Regress, self).__init__()
+      self.input_transform = tf.keras.layers.Dense(10)
+      # ...
+
+    def call(self, inputs):
+      x = self.input_transform(inputs)
+      # ...
   ```
 
-  For more manual control over saving, use `tfe.CheckpointableSaver` directly.
+  This `Model` has a dependency named "input_transform" on its `Dense` layer,
+  which in turn depends on its variables. As a result, saving an instance of
+  `Regress` using `tf.train.Checkpoint` will also save all the variables created
+  by the `Dense` layer.
 
   Attributes:
     save_counter: Incremented when `save()` is called. Used to number
@@ -857,17 +920,19 @@ class Checkpoint(checkpointable_lib.Checkpointable):
 
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Attribute values must derive from
-        `CheckpointableBase`.
+        saved with the checkpoint. Values must be checkpointable objects.
     Raises:
-      ValueError: If objects in `kwargs` are not Checkpointable.
+      ValueError: If objects in `kwargs` are not checkpointable.
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
       if not isinstance(v, checkpointable_lib.CheckpointableBase):
         raise ValueError(
-            ("`Checkpoint` was expecting an object derived from "
-             "`CheckpointableBase`, got %s.") % (v,))
+            ("`Checkpoint` was expecting a checkpointable object (an object "
+             "derived from `CheckpointableBase`), got %s. If you believe this "
+             "object should be checkpointable (i.e. it is part of the "
+             "TensorFlow Python API and manages state), please open an issue.")
+            % (v,))
       setattr(self, k, v)
     self._save_counter = None  # Created lazily for restore-on-create.
     self._saver = CheckpointableSaver(weakref.ref(self))
@@ -893,7 +958,23 @@ class Checkpoint(checkpointable_lib.Checkpointable):
     return self._save_counter
 
   def save(self, file_prefix, session=None):
-    """Save a checkpoint. Wraps `tfe.CheckpointableSaver.save`."""
+    """Save a training checkpoint.
+
+    The saved checkpoint includes variables created by this object and any
+    checkpointable objects it depends on at the time `Checkpoint.save()` is
+    called.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `Checkpoint.save_counter`.
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint.
+    """
     in_graph_mode = not context.executing_eagerly()
     if in_graph_mode:
       if session is None:
@@ -913,7 +994,81 @@ class Checkpoint(checkpointable_lib.Checkpointable):
         session=session)
 
   def restore(self, save_path):
-    """Restore a checkpoint. Wraps `tfe.CheckpointableSaver.restore`."""
+    """Restore a training checkpoint.
+
+    Restores this `Checkpoint` and any objects it depends on.
+
+    When executing eagerly, either assigns values immediately if variables to
+    restore have been created already, or defers restoration until the variables
+    are created. Dependencies added after this call will be matched if they have
+    a corresponding object in the checkpoint (the restore request will queue in
+    any checkpointable object waiting for the expected dependency to be added).
+
+    When graph building, restoration ops are added to the graph but not run
+    immediately.
+
+    To ensure that loading is complete and no more assignments will take place,
+    use the `assert_consumed()` method of the status object returned by
+    `restore`:
+
+    ```python
+    checkpoint = tf.train.Checkpoint( ... )
+    checkpoint.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised if any Python objects in the dependency graph
+    were not found in the checkpoint, or if any checkpointed values do not have
+    a matching Python object.
+
+    When graph building, `assert_consumed()` indicates that all of the restore
+    ops that will be created for this checkpoint have been created. They can be
+    run via the `run_restore_ops()` method of the status object:
+
+    ```python
+    checkpoint.restore(path).assert_consumed().run_restore_ops()
+    ```
+
+    If the checkpoint has not been consumed completely, then the list of restore
+    ops will grow as more objects are added to the dependency graph.
+
+    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    method. There is no deferred loading, and names are used to match
+    variables. No restore ops are created/run until `run_restore_ops()` or
+    `initialize_or_restore()` are called on the returned status object, even
+    when executing eagerly. Re-encode name-based checkpoints using
+    `tf.train.Checkpoint.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of a checkpoint restoration and run initialization/restore ops.
+
+      The returned status object has the following methods:
+      - `assert_consumed()`:
+          Raises an exception if any variables/objects are unmatched: either
+          checkpointed values which don't have a matching Python object or
+          Python objects in the dependency graph with no values in the
+          checkpoint. This method returns the status object, and so may be
+          chained with `initialize_or_restore` or `run_restore_ops`.
+      - `initialize_or_restore(session=None)`:
+          When graph building, runs variable initializers if `save_path` is
+          `None`, but otherwise runs restore operations. If no `session` is
+          explicitly specified, the default session is used. No effect for
+          object-based checkpoints when executing eagerly (variables are
+          initialized or restored eagerly).
+      - `run_restore_ops(session=None)`:
+          When graph building, runs restore operations. If no `session` is
+          explicitly specified, the default session is used. No effect for
+          object-based checkpoints when executing eagerly (restore operations
+          are run eagerly). May only be called when `save_path` is not `None`.
+    """
     status = self._saver.restore(save_path=save_path)
     # Create the save counter now so it gets initialized with other variables
     # when graph building. Creating it earlier would lead to double
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 79d278cf90..a74d629a8f 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1824,12 +1824,10 @@ class Saver(object):
       # This is an object-based checkpoint. We'll print a warning and then do
       # the restore.
       logging.warning(
-          # TODO(allenl): Modify instructions for using the object-based saver
-          # once that's in core.
           "Restoring an object-based checkpoint using a name-based saver. This "
           "may be somewhat fragile, and will re-build the Saver. Instead, "
           "consider loading object-based checkpoints using "
-          "tf.contrib.eager.Checkpoint().")
+          "tf.train.Checkpoint().")
       self._restore_from_object_based_checkpoint(
           sess=sess, save_path=save_path,
           object_graph_string=object_graph_string)
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index b759b156d7..d7e5078be7 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -156,6 +156,7 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
+from tensorflow.python.training.checkpointable_utils import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
new file mode 100644
index 0000000000..17f393d27c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.train.Checkpoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpointable_utils.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "save_counter"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "restore"
+    argspec: "args=[\'self\', \'save_path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index bec72e1e60..9fb18e77af 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "BytesList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "Checkpoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CheckpointSaverHook"
     mtype: "<type \'type\'>"
-- 
GitLab


From f089ef66f6e357e4a814ad4757e46bf88cf11bb6 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 17:04:46 -0700
Subject: [PATCH 1080/1262] Add a ten-second timeout to the DeleteWorkerSession
 call.

Previously, `MasterSession::Close()` did not block on the cleanup RPCs
to the individual workers, leading to deployments where the remote
workers might be shut down (e.g. by an external mechanism) before the
session was closed. In order to switch over to using
DeleteWorkerSession for all sessions, and preserve backwards
compatibility, we need to permit this behavior. Therefore, this CL
adds a 10-second timeout on the requests to workers, and logs an error
if the request does not succeed in that time period.

PiperOrigin-RevId: 193441618
---
 .../core/distributed_runtime/master_session.cc |  9 +++++++--
 .../rpc/grpc_remote_worker.cc                  |  6 ++++--
 tensorflow/core/distributed_runtime/worker.cc  |  3 ++-
 tensorflow/core/distributed_runtime/worker.h   |  3 ++-
 .../distributed_runtime/worker_interface.h     | 18 ++++++++++++++++--
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 08020f0266..7868200fb4 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1273,6 +1273,8 @@ Status MasterSession::DeleteWorkerSessions() {
     // The worker referenced by name. (Not owned.)
     WorkerInterface* worker = nullptr;
 
+    CallOptions call_opts;
+
     // Request and responses used for a given worker.
     DeleteWorkerSessionRequest request;
     DeleteWorkerSessionResponse response;
@@ -1296,6 +1298,9 @@ Status MasterSession::DeleteWorkerSessions() {
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
+    // Since the worker may have gone away, set a timeout to avoid blocking the
+    // session-close operation.
+    workers[i].call_opts.SetTimeout(10000);
   }
 
   for (size_t i = 0; i < worker_names.size(); ++i) {
@@ -1303,8 +1308,8 @@ Status MasterSession::DeleteWorkerSessions() {
       workers[i].status = s;
       done.DecrementCount();
     };
-    workers[i].worker->DeleteWorkerSessionAsync(&workers[i].request,
-                                                &workers[i].response, cb);
+    workers[i].worker->DeleteWorkerSessionAsync(
+        &workers[i].call_opts, &workers[i].request, &workers[i].response, cb);
   }
 
   done.Wait();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index b3b05408b1..895bbd97b7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -72,10 +72,12 @@ class GrpcRemoteWorker : public WorkerInterface {
     IssueRequest(request, response, createworkersession_, std::move(done));
   }
 
-  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+  void DeleteWorkerSessionAsync(CallOptions* call_opts,
+                                const DeleteWorkerSessionRequest* request,
                                 DeleteWorkerSessionResponse* response,
                                 StatusCallback done) override {
-    IssueRequest(request, response, deleteworkersession_, std::move(done));
+    IssueRequest(request, response, deleteworkersession_, std::move(done),
+                 call_opts);
   }
 
   void RegisterGraphAsync(const RegisterGraphRequest* request,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 6b2536c3c0..e9073ef9f6 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -49,7 +49,8 @@ void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request,
   done(s);
 }
 
-void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+void Worker::DeleteWorkerSessionAsync(CallOptions* opts,
+                                      const DeleteWorkerSessionRequest* request,
                                       DeleteWorkerSessionResponse* response,
                                       StatusCallback done) {
   Status s = env_->session_mgr->DeleteSession(request->session_handle());
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index 62fa5f3cf5..19aeeb752c 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -52,7 +52,8 @@ class Worker : public WorkerInterface {
                                 CreateWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
-  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+  void DeleteWorkerSessionAsync(CallOptions* opts,
+                                const DeleteWorkerSessionRequest* request,
                                 DeleteWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index 4c58bf41a4..a1597ee798 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -45,7 +45,7 @@ class WorkerInterface {
       CreateWorkerSessionResponse* response, StatusCallback done) = 0;
 
   virtual void DeleteWorkerSessionAsync(
-      const DeleteWorkerSessionRequest* request,
+      CallOptions* opts, const DeleteWorkerSessionRequest* request,
       DeleteWorkerSessionResponse* response, StatusCallback done) = 0;
 
   virtual void RegisterGraphAsync(const RegisterGraphRequest* request,
@@ -124,7 +124,8 @@ class WorkerInterface {
 
   Status DeleteWorkerSession(const DeleteWorkerSessionRequest* request,
                              DeleteWorkerSessionResponse* response) {
-    return CallAndWait(&ME::DeleteWorkerSessionAsync, request, response);
+    return CallAndWaitWithOptions(&ME::DeleteWorkerSessionAsync, request,
+                                  response);
   }
 
   Status RegisterGraph(const RegisterGraphRequest* request,
@@ -183,6 +184,19 @@ class WorkerInterface {
     n.WaitForNotification();
     return ret;
   }
+
+  template <typename Method, typename Req, typename Resp>
+  Status CallAndWaitWithOptions(Method func, const Req* req, Resp* resp) {
+    CallOptions call_opts;
+    Status ret;
+    Notification n;
+    (this->*func)(&call_opts, req, resp, [&ret, &n](const Status& s) {
+      ret = s;
+      n.Notify();
+    });
+    n.WaitForNotification();
+    return ret;
+  }
 };
 
 }  // namespace tensorflow
-- 
GitLab


From b23415e3f3c34c3911e4e05758a41a81e5882453 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:05:04 -0700
Subject: [PATCH 1081/1262] Replace space in "Fraction of Zero Values" with _
 because using space is illegal and will be auto replaced.

PiperOrigin-RevId: 193441676
---
 tensorflow/contrib/slim/python/slim/summaries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/slim/python/slim/summaries.py b/tensorflow/contrib/slim/python/slim/summaries.py
index 358359d6eb..a7dc3f6723 100644
--- a/tensorflow/contrib/slim/python/slim/summaries.py
+++ b/tensorflow/contrib/slim/python/slim/summaries.py
@@ -144,7 +144,7 @@ def add_zero_fraction_summary(tensor, name=None, prefix=None,
     A scalar `Tensor` of type `string` whose contents are the serialized
     `Summary` protocol buffer.
   """
-  name = _get_summary_name(tensor, name, prefix, 'Fraction of Zero Values')
+  name = _get_summary_name(tensor, name, prefix, 'Fraction_of_Zero_Values')
   tensor = nn.zero_fraction(tensor)
   return add_scalar_summary(tensor, name, print_summary=print_summary)
 
-- 
GitLab


From 8cfbbafc17c8baaad47f2a12508c3bee9c8fcda4 Mon Sep 17 00:00:00 2001
From: fo40225 <fo40225@users.noreply.github.com>
Date: Thu, 12 Apr 2018 09:41:48 +0800
Subject: [PATCH 1082/1262] fix tf.GIT_VERSION always 'unknown' on windows
 cmake build (#16730)

---
 .../contrib/cmake/tf_core_framework.cmake     |  2 +-
 tensorflow/tools/git/gen_git_source.py        | 37 +++++++++++++------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 73cadc58ff..973c191c47 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    ARGS --raw_generate ${VERSION_INFO_CC} --git_tag_override=${GIT_TAG_OVERRIDE}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 7f0f325119..2151a75e84 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,18 +164,14 @@ def get_git_version(git_base_path, git_tag_override):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
-    if git_tag_override and val:
+    if git_tag_override:
       split_val = val.split("-")
-      if len(split_val) < 3:
+      if len(split_val) != 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
              "but got '%s'") % val)
-      # There might be "-" in the tag name. But we can be sure that the final
-      # two "-" are those inserted by the git describe command.
-      commits_ahead_of_tag = split_val[-2]
-      abbrev_commit = split_val[-1]
-      val = bytes(
-          "-".join([git_tag_override, commits_ahead_of_tag, abbrev_commit]))
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -193,7 +189,15 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -257,7 +261,7 @@ def generate(arglist, git_tag_override=None):
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file, git_tag_override=None):
+def raw_generate(output_file, source_dir, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -265,12 +269,13 @@ def raw_generate(output_file, git_tag_override=None):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
     git_tag_override: Override the value for the git tag. This is useful for
       releases where we want to build the release before the git tag is
       created.
   """
 
-  git_version = get_git_version(".", git_tag_override)
+  git_version = get_git_version(source_dir, git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -308,6 +313,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -317,7 +327,10 @@ if args.configure is not None:
 elif args.generate is not None:
   generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate, args.git_tag_override)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
-- 
GitLab


From d961d8ffae1500aca0c6191e4b1e37a2a44bf527 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:09:06 -0700
Subject: [PATCH 1083/1262] Fix reference name.

PiperOrigin-RevId: 193442269
---
 tensorflow/contrib/autograph/pyct/static_analysis/type_info.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index 763997968c..c00946f9c4 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -199,8 +199,7 @@ class TypeInfoResolver(transformer.Base):
         target_symbol = anno.getanno(target, anno.Basic.QN)
         self.scope.setval(target_symbol, source)
       else:
-        raise ValueError(
-            'assignment target has unknown type: %s' % target_item)
+        raise ValueError('assignment target has unknown type: %s' % target)
 
   def visit_With(self, node):
     for wi in node.items:
-- 
GitLab


From 8d48dabb309dfc4ad1e06286b6e77c7258802e56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:18:52 -0700
Subject: [PATCH 1084/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193443417
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 28 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 28 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 031932d79f..d741e2ad46 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -55051,6 +55051,34 @@ op {
     }
   }
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Shape"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 4ae1c3d7e0..beda05fdf2 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -25720,6 +25720,34 @@ op {
     }
   }
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Shape"
   input_arg {
-- 
GitLab


From 558b3d35f080163b4f8cf8b4997d9e2cc0c4fd6e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 18 Apr 2018 17:42:42 -0700
Subject: [PATCH 1085/1262] Fix merge.

---
 tensorflow/tools/git/gen_git_source.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 2151a75e84..6ec162e4a9 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,14 +164,17 @@ def get_git_version(git_base_path, git_tag_override):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
-    if git_tag_override:
+    if git_tag_override and val:
       split_val = val.split("-")
-      if len(split_val) != 3:
+      if len(split_val) < 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
              "but got '%s'") % val)
-      split_val[0] = git_tag_override
-      val = bytes("-".join(split_val))
+      # There might be "-" in the tag name. But we can be sure that the final
+      # two "-" are those inserted by the git describe command.
+      abbrev_commit = split_val[-1]
+      val = bytes(
+          "-".join([git_tag_override, "0", abbrev_commit]))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -189,15 +192,7 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {
-#ifdef _MSC_VER
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-  return "MSVC " TOSTRING(_MSC_FULL_VER);
-#else
-  return __VERSION__;
-#endif
-}
+const char* tf_compiler_version() {return __VERSION__;}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -333,4 +328,4 @@ elif args.raw_generate is not None:
   raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
-                     "must be used")
+                     "must be used")
\ No newline at end of file
-- 
GitLab


From dc0f44a98284e1bd8f9d44ef7a8122b27f9f0f15 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 17:46:46 -0700
Subject: [PATCH 1086/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 193446519

---
 tensorflow/go/op/wrappers.go | 130 +++++++++++++++--------------------
 1 file changed, 57 insertions(+), 73 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 1d4b1399ed..a5b293ce75 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -7564,22 +7564,6 @@ func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, li
 	return scope.AddOperation(opspec)
 }
 
-// Associates the given iterator with the given statistics aggregator.
-//
-// Returns the created operation.
-func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IteratorSetStatsAggregator",
-		Input: []tf.Input{
-			iterator_handle, stats_aggregator_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
 type DataFormatVecPermuteAttr func(optionalAttr)
 
@@ -24288,6 +24272,63 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
+
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Convert JSON-encoded Example records to binary protocol buffer strings.
 //
 // This op translates a tensor containing Example records, encoded using
@@ -28128,63 +28169,6 @@ func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.Data
 	return values
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
-
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
 type MapIncompleteSizeAttr func(optionalAttr)
 
-- 
GitLab


From d4976f754009d084514f4308d3bfc7dc3a106e29 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 18 Apr 2018 17:48:49 -0700
Subject: [PATCH 1087/1262] Enable for all gpus.

PiperOrigin-RevId: 193446717
---
 tensorflow/core/grappler/optimizers/layout_optimizer.cc | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 8fb30d116d..db83580c1c 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2132,14 +2132,7 @@ int GetNumGPUs(const Cluster& cluster) {
   int num_gpus = 0;
   for (const auto& device : devices) {
     if (device.second.type() == "GPU") {
-      if (device.second.environment().find("architecture") !=
-          device.second.environment().end()) {
-        const string arch = device.second.environment().at("architecture");
-        // TODO(yaozhang): Enable for Volta GPUs (compute capability version 7).
-        if (arch < "7") {
-          num_gpus++;
-        }
-      }
+      num_gpus++;
     }
   }
   return num_gpus;
-- 
GitLab


From f1fb08bbb70047af0c86cc440ccc0581e64fd85f Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 18 Apr 2018 18:04:44 -0700
Subject: [PATCH 1088/1262] Various lint fixes to TensorFlow detected after
 GitHub merge.

PiperOrigin-RevId: 193448139
---
 .../contrib/data/python/ops/resampling.py     |  1 -
 .../contrib/layers/python/layers/layers.py    | 10 ++++++-
 .../kernel_tests/attention_wrapper_test.py    | 12 ++++-----
 .../core/kernels/mkl_input_conversion_op.cc   | 12 ++++++---
 tensorflow/java/src/gen/cc/source_writer.h    |  2 +-
 tensorflow/python/ops/control_flow_ops.py     | 10 +++----
 tensorflow/python/ops/data_flow_ops.py        | 27 ++++++++++---------
 tensorflow/python/training/session_manager.py |  2 --
 tensorflow/tools/pip_package/setup.py         | 10 ++++---
 9 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index b465397437..a182dddd38 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -110,7 +110,6 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
         .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
     return filtered_ds.map(lambda class_value, _, data: (class_value, data))
 
-
   return _apply_fn
 
 
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 10d7f6d076..25c3b1e7ea 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1404,6 +1404,7 @@ def convolution3d_transpose(
 @add_arg_scope
 def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   """Converts a dense tensor into a sparse tensor.
+
   An example use would be to convert dense labels to sparse ones
   so that they can be fed to the ctc_loss.
 
@@ -2191,11 +2192,16 @@ def images_to_sequence(inputs,
                        outputs_collections=None,
                        scope=None):
   """Convert a batch of images into a batch of sequences.
+
   Args:
     inputs: a (num_images, height, width, depth) tensor
     data_format: A string. `NHWC` (default) and `NCHW` are supported.
     outputs_collections: The collections to which the outputs are added.
     scope: Optional scope for name_scope.
+
+  Raises:
+     ValueError: If `data_format` is not either NCHW or NHWC.
+
   Returns:
     (width, num_images*height, depth) sequence tensor
   """
@@ -2701,6 +2707,7 @@ def sequence_to_images(inputs,
                        outputs_collections=None,
                        scope=None):
   """Convert a batch of sequences into a batch of images.
+
   Args:
     inputs: (num_steps, num_batches, depth) sequence tensor
     height: the height of the images
@@ -2708,6 +2715,7 @@ def sequence_to_images(inputs,
       Currently supports `'channels_first'` and `'channels_last'`.
     outputs_collections: The collections to which the outputs are added.
     scope: Optional scope for name_scope.
+
   Returns:
     A tensor representing the output of the operation.
   """
@@ -2717,7 +2725,7 @@ def sequence_to_images(inputs,
     if num_batches is None:
       num_batches = -1
     else:
-      num_batches = num_batches // height
+      num_batches //= height
     reshaped = array_ops.reshape(inputs,
                                  [width, num_batches, height, depth])
     if output_data_format == 'channels_first':
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index d508cf3f9d..0232103c41 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -355,11 +355,11 @@ class AttentionWrapperTest(test.TestCase):
 
   def testLuongScaledDType(self):
     # Test case for GitHub issue 18099
-    for dtype in [np.float16, np.float32, np.float64]:
+    for dt in [np.float16, np.float32, np.float64]:
       num_units = 128
-      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_outputs = array_ops.placeholder(dt, shape=[64, None, 256])
       encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
-      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_inputs = array_ops.placeholder(dt, shape=[64, None, 128])
       decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
       batch_size = 64
       attention_mechanism = wrapper.LuongAttention(
@@ -367,7 +367,7 @@ class AttentionWrapperTest(test.TestCase):
           memory=encoder_outputs,
           memory_sequence_length=encoder_sequence_length,
           scale=True,
-          dtype=dtype,
+          dtype=dt,
       )
       cell = rnn_cell.LSTMCell(num_units)
       cell = wrapper.AttentionWrapper(cell, attention_mechanism)
@@ -378,12 +378,12 @@ class AttentionWrapperTest(test.TestCase):
           cell=cell,
           helper=helper,
           initial_state=cell.zero_state(
-              dtype=dtype, batch_size=batch_size))
+              dtype=dt, batch_size=batch_size))
 
       final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
       self.assertTrue(
           isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
-      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertEqual(final_outputs.rnn_output.dtype, dt)
       self.assertTrue(
           isinstance(final_state, wrapper.AttentionWrapperState))
       self.assertTrue(
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 68d3e1c9ab..dcf6bb9f74 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -291,7 +291,8 @@ class MklInputConversionOp : public OpKernel {
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
       // It is safer to compare the original TensorFlow shapes than to compare
-      // Mkl shapes since element wise ops are forwarded to Eigen implementation.
+      // Mkl shapes since element wise ops are forwarded to Eigen
+      // implementation.
       TensorShape tf_shape0 = input_shape_0.GetTfShape();
       TensorShape tf_shape1 = input_shape_1.GetTfShape();
       if (tf_shape0 == tf_shape1) {
@@ -362,9 +363,11 @@ class MklInputConversionOp : public OpKernel {
               << "converted MKL inputs to TF format";
 
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, kInputIndex_0);
+                                           op_data_type, has_avx512f_,
+                                           kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, kInputIndex_1);
+                                           op_data_type, has_avx512f_,
+                                           kInputIndex_1);
       SetDummyMklShapeOutput(context, kInputIndex_0);
       SetDummyMklShapeOutput(context, kInputIndex_1);
       return;
@@ -464,7 +467,8 @@ class MklInputConversionOp : public OpKernel {
     }
 
     VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(kInputIndex_0)->shape().DebugString() << " and "
+            << context->mutable_output(kInputIndex_0)->shape().DebugString()
+            << " and "
             << context->mutable_output(kInputIndex_1)->shape().DebugString();
 
     VLOG(1) << "MklInputConversion completed successfully.";
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index 637072c0df..f011acd30a 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -61,7 +61,7 @@ class SourceWriter {
   // The data might potentially contain newline characters, therefore it will
   // be scanned to ensure that each line is indented and prefixed properly,
   // making it a bit slower than Append().
-  SourceWriter& Write(const StringPiece& text);
+  SourceWriter& Write(const StringPiece& str);
 
   // Writes a source code snippet read from a file.
   //
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index c43bbd4a1e..a1bfe450c8 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -609,13 +609,13 @@ def _EnforceShapeInvariant(merge_var, next_var):
   """Check if the shapes of the loops variables are invariants.
 
   Args:
-    merge_vars: The list of tensors representing the initial values of the
+    merge_var: The list of tensors representing the initial values of the
       loop variables.
-    next_vars: The list of tensors representing the values of the loop
+    next_var: The list of tensors representing the values of the loop
       variables after one loop iteration.
 
   Raises:
-    ValueError: If any tensor in `merge_vars` has a more specific shape than
+    ValueError: If any tensor in `merge_var` has a more specific shape than
       its correspnding tensor in `next_var`.
   """
   if isinstance(merge_var, ops.Tensor):
@@ -833,7 +833,7 @@ class GradLoopState(object):
     if outer_grad_state:
       outer_forward_ctxt = outer_grad_state.forward_context
     else:
-      if not hasattr(forward_ctxt, 'outer_context'):
+      if not hasattr(forward_ctxt, "outer_context"):
         raise ValueError("Failed to call gradients on a while loop without"
                          "properly serializing graph via MetaGraphDef")
       outer_forward_ctxt = forward_ctxt.outer_context
@@ -2973,7 +2973,7 @@ class WhileContext(ControlFlowContext):
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
         flat_sequence=exit_vars_with_tensor_arrays)
-    return (packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars)
+    return packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars
 
   def _FixControlInputsAndContext(self, enters):
     graph = ops.get_default_graph()
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index cb725199a8..62c5adc385 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -571,7 +571,7 @@ class QueueBase(object):
           name=name)
 
   def is_closed(self, name=None):
-    """ Returns true if queue is closed.
+    """Returns true if queue is closed.
 
     This operation returns true if the queue is closed and false if the queue
     is open.
@@ -1563,7 +1563,7 @@ class BaseStagingArea(object):
     of the staging area.
 
     Args:
-      vals: A tensor, a list or tuple of tensors, or a dictionary..
+      vals: A tensor, a list or tuple of tensors, or a dictionary.
 
     Returns:
       A (tensors, indices) tuple where `tensors` is a list of `Tensor` objects
@@ -1582,7 +1582,7 @@ class BaseStagingArea(object):
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals, indices, n = zip(*[(vals[k], i, k)
+      vals, indices, _ = zip(*[(vals[k], i, k)
                                for i, k in enumerate(self._names)
                                if k in vals])
     else:
@@ -1612,7 +1612,7 @@ class BaseStagingArea(object):
     for val, i in zip(vals, indices):
       dtype, shape = self._dtypes[i], self._shapes[i]
       # Check dtype
-      if not val.dtype == dtype:
+      if val.dtype != dtype:
         raise ValueError("Datatypes do not match. '%s' != '%s'" %
                          (str(val.dtype), str(dtype)))
 
@@ -1626,7 +1626,7 @@ class BaseStagingArea(object):
 
   def _create_device_transfers(self, tensors):
     """Encode inter-device transfers if the current device
-    is not the same as the Staging Area's device
+    is not the same as the Staging Area's device.
     """
 
     if not isinstance(tensors, (tuple, list)):
@@ -1739,11 +1739,6 @@ class StagingArea(BaseStagingArea):
     Args:
       dtypes:  A list of types.  The length of dtypes must equal the number
         of tensors in each element.
-      capacity: (Optional.) Maximum number of elements.
-        An integer. If zero, the Staging Area is unbounded
-      memory_limit: (Optional.) Maximum number of bytes of all tensors
-        in the Staging Area.
-        An integer. If zero, the Staging Area is unbounded
       shapes: (Optional.) Constraints on the shapes of tensors in an element.
         A list of shape tuples or None. This list is the same length
         as dtypes.  If the shape of any tensors in the element are constrained,
@@ -1754,6 +1749,11 @@ class StagingArea(BaseStagingArea):
       shared_name: (Optional.) A name to be used for the shared object. By
         passing the same name to two different python objects they will share
         the underlying staging area. Must be a string.
+      capacity: (Optional.) Maximum number of elements.
+        An integer. If zero, the Staging Area is unbounded
+      memory_limit: (Optional.) Maximum number of bytes of all tensors
+        in the Staging Area.
+        An integer. If zero, the Staging Area is unbounded
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -1782,7 +1782,7 @@ class StagingArea(BaseStagingArea):
     """
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
-      
+
       if not isinstance(values, (list, tuple, dict)):
         values = [values]
 
@@ -1911,7 +1911,8 @@ class StagingArea(BaseStagingArea):
 
 
 class MapStagingArea(BaseStagingArea):
-  """A `MapStagingArea` is a TensorFlow data structure that stores tensors across multiple steps, and exposes operations that can put and get tensors.
+  """A `MapStagingArea` is a TensorFlow data structure that stores tensors
+  across multiple steps, and exposes operations that can put and get tensors.
 
   Each `MapStagingArea` element is a (key, value) pair.
   Only int64 keys are supported, other types should be
@@ -2375,7 +2376,7 @@ class RecordInput(object):
       return records
     else:
       with ops.name_scope(self._name):
-        batch_list = [[] for i in six.moves.range(self._batches)]
+        batch_list = [[] for _ in six.moves.range(self._batches)]
         records = array_ops.split(records, self._batch_size, 0)
         records = [array_ops.reshape(record, []) for record in records]
         for index, protobuf in zip(six.moves.range(len(records)), records):
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index a00ceb9021..3cb3877cc2 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -263,8 +263,6 @@ class SessionManager(object):
 
     Raises:
       RuntimeError: If the model cannot be initialized or recovered.
-
-    Raises:
       ValueError: If both checkpoint_dir and checkpoint_filename_with_path are
         set.
     """
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 6511a50b3b..211f93296b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -22,7 +22,9 @@ import os
 import re
 import sys
 
-from setuptools import find_packages, setup, Command
+from setuptools import Command
+from setuptools import find_packages
+from setuptools import setup
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
@@ -97,7 +99,9 @@ TEST_PACKAGES = [
     'scipy >= 0.15.1',
 ]
 
+
 class BinaryDistribution(Distribution):
+
   def has_ext_modules(self):
     return True
 
@@ -179,9 +183,9 @@ class InstallHeaders(Command):
 
 def find_files(pattern, root):
   """Return all the files matching pattern below root dir."""
-  for path, _, files in os.walk(root):
+  for dirpath, _, files in os.walk(root):
     for filename in fnmatch.filter(files, pattern):
-      yield os.path.join(path, filename)
+      yield os.path.join(dirpath, filename)
 
 
 matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
-- 
GitLab


From a699d69c621fde118d4c89ba94658a9d7f91faac Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 18 Apr 2018 18:49:02 -0700
Subject: [PATCH 1089/1262] [TF TensorLists] Add TensorListConcatLists

TensorListConcat concatenates two TensorLists' entries (supports non-scalar
Tensors containing TensorLists).

PiperOrigin-RevId: 193451787
---
 .../api_def_TensorListConcatLists.pbtxt       |  3 +
 tensorflow/core/kernels/list_kernels.cc       | 93 +++++++++++++++++++
 tensorflow/core/ops/list_ops.cc               | 41 ++++++++
 .../python/kernel_tests/list_ops_test.py      | 60 ++++++++++++
 tensorflow/python/ops/list_ops.py             |  4 +
 5 files changed, 201 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt
new file mode 100644
index 0000000000..3fa6265e10
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcatLists.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListConcatLists"
+}
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index d1e481d7cc..84fa63fc00 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -475,6 +475,99 @@ REGISTER_KERNEL_BUILDER(
 
 #endif  // GOOGLE_CUDA
 
+class TensorListConcatLists : public OpKernel {
+ public:
+  explicit TensorListConcatLists(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorShape& tl_a_shape = c->input(0).shape();
+    const TensorShape& tl_b_shape = c->input(1).shape();
+    OP_REQUIRES(
+        c, tl_a_shape == tl_b_shape,
+        errors::InvalidArgument("Incompatible input TensorList tensor shapes: ",
+                                tl_a_shape.DebugString(), " vs. ",
+                                tl_b_shape.DebugString()));
+    AllocatorAttributes attr;
+    std::unique_ptr<Tensor> tl_alias = c->forward_input(
+        0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tl_a_shape,
+        DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr);
+
+    // tl_a may be aliased by tl_alias.
+    const Tensor& tl_a = c->input(0);
+    const Tensor& tl_b = c->input(1);
+
+    Tensor* output;
+    if (tl_alias) {
+      c->set_output(0, *tl_alias);
+      output = tl_alias.get();
+    } else {
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(c, c->allocate_output(0, tl_a_shape, &output, attr));
+    }
+
+    auto output_t = output->flat<Variant>();
+    auto tl_a_t = tl_a.flat<Variant>();
+    auto tl_b_t = tl_b.flat<Variant>();
+
+    for (int64 b = 0; b < tl_a.NumElements(); ++b) {
+      const TensorList* l_a = tl_a_t(b).get<TensorList>();
+      const TensorList* l_b = tl_b_t(b).get<TensorList>();
+      OP_REQUIRES(
+          c, l_a != nullptr,
+          errors::InvalidArgument("input_a is not a TensorList at index ", b,
+                                  ".  Saw: '", tl_a_t(b).DebugString(), "'"));
+      OP_REQUIRES(
+          c, l_b != nullptr,
+          errors::InvalidArgument("input_b is not a TensorList at index ", b,
+                                  ".  Saw: '", tl_b_t(b).DebugString(), "'"));
+      OP_REQUIRES(c, l_a->element_dtype == element_dtype_,
+                  errors::InvalidArgument(
+                      "input_a[", b, "].dtype != element_dtype.  Saw: ",
+                      DataTypeString(l_a->element_dtype), " vs. ",
+                      DataTypeString(element_dtype_)));
+      OP_REQUIRES(c, l_b->element_dtype == element_dtype_,
+                  errors::InvalidArgument(
+                      "input_b[", b, "].dtype != element_dtype.  Saw: ",
+                      DataTypeString(l_b->element_dtype), " vs. ",
+                      DataTypeString(element_dtype_)));
+      OP_REQUIRES(c, l_a->element_shape.IsIdenticalTo(l_b->element_shape),
+                  errors::InvalidArgument(
+                      "input_a and input_b TensorList element shapes are not "
+                      "identical at index ",
+                      b, ".  Saw ", l_a->element_shape.DebugString(), " vs. ",
+                      l_b->element_shape.DebugString()));
+      if (tl_alias) {
+        TensorList* out = output_t(b).get<TensorList>();
+        DCHECK(out != nullptr) << "Expected output to alias input_a, but it "
+                                  "doesn't contain a TensorList at index "
+                               << b;
+        std::copy(l_b->tensors.begin(), l_b->tensors.end(),
+                  std::back_inserter(out->tensors));
+      } else {
+        TensorList out = *l_a;
+        std::copy(l_b->tensors.begin(), l_b->tensors.end(),
+                  std::back_inserter(out.tensors));
+        output_t(b) = std::move(out);
+      }
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_CPU),
+                        TensorListConcatLists);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
+                        TensorListConcatLists);
+
+#endif  // GOOGLE_CUDA
+
 #define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
   REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 7af70110b7..b9f94ba1c5 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -295,5 +295,46 @@ REGISTER_OP("TensorListSetItem")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListConcatLists")
+    .Input("input_a: variant")
+    .Input("input_b: variant")
+    .Attr("element_dtype: type")
+    .Output("output: variant")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      auto input_a = c->input(0);
+      auto input_b = c->input(1);
+      TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input_a));
+      c->set_output(0, input_a);
+
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+
+      auto* handle_data_a = c->input_handle_shapes_and_types(0);
+      auto* handle_data_b = c->input_handle_shapes_and_types(1);
+      if (handle_data_a == nullptr && handle_data_b == nullptr) {
+        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        return Status::OK();
+      }
+      shape_inference::ShapeAndType list_shape_type_a =
+          (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
+      const shape_inference::ShapeAndType& list_shape_type_b =
+          (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
+      if (list_shape_type_a.dtype != t) {
+        return errors::InvalidArgument("input_a.type != element_dtype: ",
+                                       DataTypeString(list_shape_type_a.dtype),
+                                       " vs. ", DataTypeString(t));
+      }
+      if (list_shape_type_b.dtype != t) {
+        return errors::InvalidArgument("input_b.type != element_dtype: ",
+                                       DataTypeString(list_shape_type_b.dtype),
+                                       " vs. ", DataTypeString(t));
+      }
+      TF_RETURN_IF_ERROR(c->Merge(list_shape_type_a.shape,
+                                  list_shape_type_b.shape,
+                                  &list_shape_type_a.shape));
+      c->set_output_handle_shapes_and_types(0, {list_shape_type_a});
+      return Status::OK();
+    });
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 2084599760..098f9724a2 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -318,6 +318,66 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                 [[1.0, 2.0]] * 4)
     self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConcat(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l_batch_0 = array_ops.stack([l0, l1])
+    l_batch_1 = array_ops.stack([l1, l0])
+
+    l_concat_01 = list_ops.tensor_list_concat_lists(
+        l_batch_0, l_batch_1, element_dtype=dtypes.float32)
+    l_concat_10 = list_ops.tensor_list_concat_lists(
+        l_batch_1, l_batch_0, element_dtype=dtypes.float32)
+    l_concat_00 = list_ops.tensor_list_concat_lists(
+        l_batch_0, l_batch_0, element_dtype=dtypes.float32)
+    l_concat_11 = list_ops.tensor_list_concat_lists(
+        l_batch_1, l_batch_1, element_dtype=dtypes.float32)
+
+    expected_00 = [[1.0, 2.0, 1.0, 2.0], [-1.0, -1.0]]
+    expected_01 = [[1.0, 2.0, -1.0], [-1.0, 1.0, 2.0]]
+    expected_10 = [[-1.0, 1.0, 2.0], [1.0, 2.0, -1.0]]
+    expected_11 = [[-1.0, -1.0], [1.0, 2.0, 1.0, 2.0]]
+
+    for i, (concat, expected) in enumerate(zip(
+        [l_concat_00, l_concat_01, l_concat_10, l_concat_11],
+        [expected_00, expected_01, expected_10, expected_11])):
+      splitted = array_ops.unstack(concat)
+      splitted_stacked_ret = self.evaluate(
+          (list_ops.tensor_list_stack(splitted[0], dtypes.float32),
+           list_ops.tensor_list_stack(splitted[1], dtypes.float32)))
+      print("Test concat %d: %s, %s, %s, %s"
+            % (i, expected[0], splitted_stacked_ret[0],
+               expected[1], splitted_stacked_ret[1]))
+      self.assertAllClose(expected[0], splitted_stacked_ret[0])
+      self.assertAllClose(expected[1], splitted_stacked_ret[1])
+
+    # Concatenating mismatched shapes fails.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(
+              l_batch_0,
+              list_ops.empty_tensor_list(scalar_shape(), dtypes.float32),
+              element_dtype=dtypes.float32))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "element shapes are not identical at index 0"):
+      l_batch_of_vec_tls = array_ops.stack(
+          [list_ops.tensor_list_from_tensor([[1.0]], element_shape=[1])] * 2)
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_vec_tls,
+                                            element_dtype=dtypes.float32))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"input_b\[0\].dtype != element_dtype."):
+      l_batch_of_int_tls = array_ops.stack(
+          [list_ops.tensor_list_from_tensor([1], element_shape=scalar_shape())]
+          * 2)
+      self.evaluate(
+          list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls,
+                                            element_dtype=dtypes.float32))
+
   @test_util.run_in_graph_and_eager_modes()
   def testPushBackBatch(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index bdf0774bbf..d9ede87530 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -29,6 +29,10 @@ from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
 
 
+ops.NotDifferentiable("TensorListConcat")
+ops.NotDifferentiable("TensorListPushBackBatch")
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
-- 
GitLab


From cb35f8b702e6bf917b1d915346e959e76d1b1c1e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 18:49:37 -0700
Subject: [PATCH 1090/1262] Teach transpose folding about sharding.

PiperOrigin-RevId: 193451839
---
 tensorflow/compiler/xla/service/transpose_folding.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 83185ac49e..3efd38ce0d 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -159,6 +159,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   auto new_conv = HloInstruction::CreateConvolve(
       convolution.shape(), new_lhs, new_rhs, convolution.window(), new_dnums);
+  convolution.SetupDerivedInstruction(new_conv.get());
   TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
       &convolution, std::move(new_conv)));
 
-- 
GitLab


From 6c85471ee06bf10f5034e2a8fb1fd6ab84dd7fbc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 19:18:33 -0700
Subject: [PATCH 1091/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193454093
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 19 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 19 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index d741e2ad46..42a67bc4c8 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -68628,6 +68628,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index beda05fdf2..980e560601 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -31979,6 +31979,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
-- 
GitLab


From 542edb6dd64bd18d63ef1fd64c55a645c406f170 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 19:49:12 -0700
Subject: [PATCH 1092/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 193456151

---
 tensorflow/go/op/wrappers.go | 212 +++++++++++++++++------------------
 1 file changed, 106 insertions(+), 106 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a5b293ce75..f270eadc32 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -13107,6 +13107,112 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd
 	return op.Output(0)
 }
 
+// Returns a list of tensors with the same shapes and contents as the input
+//
+// tensors.
+//
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IdentityN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
+}
+
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Adds `bias` to `value`.
 //
 // This is a deprecated version of BiasAdd and will be soon removed.
@@ -23167,112 +23273,6 @@ func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
-//
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityN",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
-}
-
 // Computes the gradient of the sigmoid of `x` wrt its input.
 //
 // Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-- 
GitLab


From b7479a808477b61be0269048bf0cfad26070f832 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 20:02:52 -0700
Subject: [PATCH 1093/1262] Teach the reshape mover pass about sharding.

PiperOrigin-RevId: 193457083
---
 tensorflow/compiler/xla/service/reshape_mover.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 49ec38eb62..0f26a025bf 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -155,15 +155,20 @@ HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
     case HloOpcode::kConstant: {
       if (first_reshape_operand->opcode() == HloOpcode::kReshape) {
         VLOG(5) << "Adding reshape to kConstant operand";
-        return computation->AddInstruction(
+        HloInstruction* reshape = computation->AddInstruction(
             HloInstruction::CreateReshape(new_shape, operand));
+        operand->SetupDerivedInstruction(reshape);
+        return reshape;
       } else {
         CHECK(first_reshape_operand->opcode() == HloOpcode::kTranspose);
         VLOG(5) << "Adding transpose to kConstant operand";
         std::vector<int64> inverse_permutation =
             InversePermutation(first_reshape_operand->dimensions());
-        return computation->AddInstruction(HloInstruction::CreateTranspose(
-            new_shape, operand, inverse_permutation));
+        HloInstruction* transpose =
+            computation->AddInstruction(HloInstruction::CreateTranspose(
+                new_shape, operand, inverse_permutation));
+        operand->SetupDerivedInstruction(transpose);
+        return transpose;
       }
     }
     case HloOpcode::kRng: {
-- 
GitLab


From 81cabadc78811a216381fbf30715b1313684e32f Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 18 Apr 2018 20:04:56 -0700
Subject: [PATCH 1094/1262] Use the host implementation of vec permute op if
 the input on the host. Note that the op still needs to be placed on the GPU
 so that it stays within the same partiion with the neighboring ops, and as a
 result, no unnecessary send and recv are created.

PiperOrigin-RevId: 193457328
---
 .../grappler/optimizers/layout_optimizer.cc     | 17 ++++++++++-------
 .../optimizers/layout_optimizer_test.cc         |  2 +-
 tensorflow/core/kernels/data_format_ops.cc      |  9 ++++++++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index db83580c1c..87ab460862 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -909,7 +909,7 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(3, w);
   }
 
-  string MaybeGetHostDevice(const string& input_name) const {
+  bool IsInputOnHost(const string& input_name) const {
     string device = node_->device();
     DeviceNameUtils::ParsedName parsed_name;
     if (DeviceNameUtils::ParseFullName(device, &parsed_name)) {
@@ -918,13 +918,11 @@ class NodeProcessor : public GraphProcessor {
         int port;
         ParseNodeName(input_name, &port);
         if (IsHostMemory(*input, port)) {
-          parsed_name.type = "CPU";
-          parsed_name.id = 0;
-          device = DeviceNameUtils::ParsedNameToString(parsed_name);
+          return true;
         }
       }
     }
-    return device;
+    return false;
   }
 
   NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
@@ -934,9 +932,14 @@ class NodeProcessor : public GraphProcessor {
     added_node->set_name(name);
     added_node->set_op(op);
     node_map_->AddNode(added_node->name(), added_node);
+    added_node->set_device(node_->device());
     // The inputs of a DataFormat op could be in host memory for ops such as
-    // Reshape.
-    added_node->set_device(MaybeGetHostDevice(input_name));
+    // Reshape. In such cases, run the kernel on the host too.
+    if (IsInputOnHost(input_name)) {
+      AttrValue attr_kernel;
+      attr_kernel.set_s("host");
+      added_node->mutable_attr()->insert({"_kernel", attr_kernel});
+    }
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     added_node->mutable_attr()->insert({"T", attr_data_type});
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index e405c4c58c..fc87f69b8c 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -1174,7 +1174,7 @@ TEST_F(LayoutOptimizerTest, DevicePlacement) {
   NodeMap node_map(&output);
   auto vec_permute =
       node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
-  EXPECT_TRUE(str_util::EndsWith(vec_permute->device(), "CPU:0"));
+  EXPECT_EQ(vec_permute->attr().at("_kernel").s(), "host");
 }
 }  // namespace
 }  // namespace grappler
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index 4485152e96..23319e6d0c 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -195,7 +195,14 @@ TF_CALL_int64(REGISTER_GPU_KERNEL);
 #define REGISTER_GPU_KERNEL(T)                                                \
   REGISTER_KERNEL_BUILDER(                                                    \
       Name("DataFormatVecPermute").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-      DataFormatVecPermuteOp<GPUDevice, T>);
+      DataFormatVecPermuteOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("DataFormatVecPermute")                        \
+                              .Device(DEVICE_GPU)                             \
+                              .HostMemory("x")                                \
+                              .HostMemory("y")                                \
+                              .Label("host")                                  \
+                              .TypeConstraint<T>("T"),                        \
+                          DataFormatVecPermuteOp<CPUDevice, T>);
 TF_CALL_int32(REGISTER_GPU_KERNEL);
 TF_CALL_int64(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
-- 
GitLab


From fd10bfb61ef6b1885c8fa2459522fa98305df703 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 18 Apr 2018 20:15:27 -0700
Subject: [PATCH 1095/1262] Expose
 tf.contrib.training.{prepend_from_queue_and_padded_batch_dataset}

Also its helper method "enqueue_in_queue_dataset".

PiperOrigin-RevId: 193458095
---
 tensorflow/contrib/training/BUILD       | 1 +
 tensorflow/contrib/training/__init__.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 4d2bfd3e43..5de55b5f7f 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data",
         "//tensorflow/python/estimator:inputs_queues",
         "//third_party/py/numpy",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index da2de3e421..edd71fb250 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -57,6 +57,8 @@ from tensorflow.contrib.training.python.training.hparam import *
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
+from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset
+from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset
 from tensorflow.contrib.training.python.training.training import add_gradients_summaries
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn
@@ -75,6 +77,7 @@ _allowed_symbols = [
     'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
     'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
     'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op',
-    'multiply_gradients', 'train']
+    'multiply_gradients', 'enqueue_in_queue_dataset',
+    'prepend_from_queue_and_padded_batch_dataset', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From 1d003ee5f82d4d044323a3f162e6cfcf6d645346 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 18 Apr 2018 21:15:41 -0700
Subject: [PATCH 1096/1262] Initial addition of CLZ HLO

* Adds the HLO op and lowering on CPU/GPU/evaluator;
* This does not update the operation semantics;

PiperOrigin-RevId: 193461989
---
 .../xla/client/computation_builder.cc         |  5 ++++
 .../compiler/xla/client/computation_builder.h |  3 ++
 .../xla/client/xla_client/xla_builder.cc      |  4 +++
 .../xla/client/xla_client/xla_builder.h       |  3 ++
 .../compiler/xla/service/dfs_hlo_visitor.h    |  3 ++
 .../xla/service/elemental_ir_emitter.cc       |  7 +++++
 .../compiler/xla/service/hlo_evaluator.cc     | 28 +++++++++++++++++++
 .../compiler/xla/service/hlo_graph_dumper.cc  |  1 +
 .../compiler/xla/service/hlo_instruction.cc   |  6 ++++
 tensorflow/compiler/xla/service/hlo_opcode.h  |  1 +
 .../xla/service/instruction_fusion.cc         |  1 +
 .../compiler/xla/service/shape_inference.cc   |  3 ++
 .../compiler/xla/service/user_computation.cc  |  2 ++
 .../xla/tests/array_elementwise_ops_test.cc   |  9 ++++++
 .../compiler/xla/tools/parser/hlo_parser.cc   |  1 +
 tensorflow/compiler/xla/xla_data.proto        |  3 ++
 16 files changed, 80 insertions(+)

diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 4d3b0ee0d6..83c7cb1744 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -1046,6 +1046,11 @@ ComputationDataHandle ComputationBuilder::Neg(
   return UnaryOp(UNOP_NEGATE, operand);
 }
 
+ComputationDataHandle ComputationBuilder::Clz(
+    const ComputationDataHandle& operand) {
+  return UnaryOp(UNOP_CLZ, operand);
+}
+
 ComputationDataHandle ComputationBuilder::Clamp(
     const ComputationDataHandle& min, const ComputationDataHandle& operand,
     const ComputationDataHandle& max) {
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 019c6f3afb..9431c2c459 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -657,6 +657,9 @@ class ComputationBuilder {
   // Enqueues a negate instruction onto the computation.
   ComputationDataHandle Neg(const ComputationDataHandle& operand);
 
+  // Enqueues a count-leading-zeros instruction onto the computation.
+  ComputationDataHandle Clz(const ComputationDataHandle& operand);
+
   // Enqueues a transpose instruction onto the computation.
   ComputationDataHandle Transpose(
       const ComputationDataHandle& operand,
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 7ccdc2ded2..1899983e44 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1193,6 +1193,10 @@ XlaOp XlaBuilder::Sign(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kSign, operand);
 }
 
+XlaOp XlaBuilder::Clz(const XlaOp& operand) {
+  return UnaryOp(HloOpcode::kClz, operand);
+}
+
 XlaOp XlaBuilder::Cos(const XlaOp& operand) {
   return UnaryOp(HloOpcode::kCos, operand);
 }
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 1f7c731064..5977ee4f4b 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -571,6 +571,9 @@ class XlaBuilder {
   // Enqueues a sign instruction onto the computation.
   XlaOp Sign(const XlaOp& operand);
 
+  // Enqueues a count leading zeros instruction onto the computation.
+  XlaOp Clz(const XlaOp& operand);
+
   // Enqueues a cosine instruction onto the computation.
   XlaOp Cos(const XlaOp& operand);
 
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 56723e7650..0528b07602 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -147,6 +147,9 @@ class DfsHloVisitorBase {
   virtual Status HandleLog(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
+  virtual Status HandleClz(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleCos(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index b6a0903b0e..56e35e2604 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -293,6 +293,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
         return operand_value;
       }
     }
+    case HloOpcode::kClz: {
+      auto is_zero_undef = ir_builder_->getFalse();
+      return llvm_ir::EmitCallToIntrinsic(
+          llvm::Intrinsic::ctlz, {operand_value, is_zero_undef},
+          {operand_value->getType()}, ir_builder_);
+    }
     case HloOpcode::kSign: {
       bool is_signed =
           primitive_util::IsSignedIntegralType(op->shape().element_type());
@@ -1334,6 +1340,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 52bc2c0448..c5e3014834 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1853,6 +1853,34 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  // Enable CLZ only for int32 and uint32.
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          (std::is_floating_point<NativeT>::value ||
+           std::is_integral<NativeT>::value || is_complex_t<NativeT>::value) &&
+          !(std::is_same<NativeT, uint32>::value ||
+            std::is_same<NativeT, int32>::value)>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    return InvalidArgument("Unsupported type for Clz");
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, uint32>::value ||
+                std::is_same<NativeT, int32>::value>::type* = nullptr>
+  Status HandleClz(HloInstruction* clz) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz],
+                        ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) {
+                          return 31 - tensorflow::Log2Floor(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleClz(HloInstruction* clz) override {
+    return HandleClz<ElementwiseT>(clz);
+  }
+
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleSin(HloInstruction* sin) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 25702dc65e..516e14b464 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -909,6 +909,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kConvert:
     case HloOpcode::kCos:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a445380817..6303bcc59f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -254,6 +254,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kCeil:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
+    case HloOpcode::kClz:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
     case HloOpcode::kImag:
@@ -1248,6 +1249,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -1728,6 +1730,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kAdd:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
@@ -2659,6 +2662,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleFloor(this);
     case HloOpcode::kCeil:
       return visitor->HandleCeil(this);
+    case HloOpcode::kClz:
+      return visitor->HandleClz(this);
     case HloOpcode::kLog:
       return visitor->HandleLog(this);
     case HloOpcode::kTanh:
@@ -3000,6 +3005,7 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index af24604c39..ca763076a1 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -57,6 +57,7 @@ namespace xla {
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
+  V(kClz, "count-leading-zeros")                             \
   V(kComplex, "complex")                                     \
   V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)       \
   V(kConditional, "conditional")                             \
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index d69ad80bdb..b9ccfeddb5 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -39,6 +39,7 @@ namespace xla {
     case HloOpcode::kBroadcast:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
+    case HloOpcode::kClz:
     case HloOpcode::kComplex:
     case HloOpcode::kConcatenate:
     case HloOpcode::kConstant:
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 77e12d3602..48b2922e77 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -52,6 +52,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) {
       return UNOP_ABS;
     case HloOpcode::kCeil:
       return UNOP_CEIL;
+    case HloOpcode::kClz:
+      return UNOP_CLZ;
     case HloOpcode::kCos:
       return UNOP_COS;
     case HloOpcode::kExp:
@@ -360,6 +362,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
             arg, primitive_util::ComplexComponentType(arg.element_type()));
       }
       return arg;
+    case UNOP_CLZ:
     case UNOP_NEGATE:
     case UNOP_ROUND_NEAREST_AFZ:
     case UNOP_SIGN:
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 532f7fd5bf..0f16a592b6 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -49,6 +49,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kAbs;
     case UNOP_CEIL:
       return HloOpcode::kCeil;
+    case UNOP_CLZ:
+      return HloOpcode::kClz;
     case UNOP_COS:
       return HloOpcode::kCos;
     case UNOP_EXP:
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 03c91745b9..4b4dc6dd9d 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2217,6 +2217,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) {
+  XlaBuilder builder(TestName());
+  auto a = builder.ConstantR1<uint32>(
+      {0, 1, 0x10, 0x10000, 0x700000, 0x12345678, 0xF2345678});
+  builder.Clz(a);
+
+  ComputeAndCompareR1<uint32>(&builder, {32, 31, 27, 15, 9, 3, 0}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index e60a5a4919..95d3fd28b3 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -470,6 +470,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
+    case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index f18d53c608..d23f9e5918 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -801,6 +801,9 @@ enum UnaryOperation {
 
   // Elementwise, extract real component of complex x.
   UNOP_IMAG = 16;
+
+  // Elementwise, computes clz(x).
+  UNOP_CLZ = 17;
 }
 
 message UnaryOpRequest {
-- 
GitLab


From ee1676d4dbded64e192aecfa693ab605e24c9929 Mon Sep 17 00:00:00 2001
From: Yuanzhong Xu <yuanzx@google.com>
Date: Wed, 18 Apr 2018 22:07:12 -0700
Subject: [PATCH 1097/1262] [XLA] Fix BF16 propagation bug for while condition.

PiperOrigin-RevId: 193465140
---
 .../xla/service/bfloat16_propagation.cc       |  1 -
 .../xla/service/bfloat16_propagation_test.cc  | 58 ++++++++++++++++++-
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index c26d2feef5..43ebe92c5e 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -392,7 +392,6 @@ void BFloat16Propagation::AdjustCalledComputationRoot(HloInstruction* hlo) {
       adjust_computation(hlo->fused_instructions_computation(), hlo->shape());
       break;
     case HloOpcode::kWhile:
-      adjust_computation(hlo->while_condition(), hlo->shape());
       adjust_computation(hlo->while_body(), hlo->shape());
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 88f8301416..183db1652e 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -426,8 +426,62 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) {
   EXPECT_TRUE(OutputsBF16(xpose));
 }
 
-// Tests that BF16 is propagated properly through while computations.
-TEST_F(BFloat16PropagationTest, PropagateThroughWhile) {
+// Tests that BF16 is propagated properly through a while computation with
+// non-tuple input/output.
+TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  auto builder_cond = HloComputation::Builder("cond");
+  auto cond_param = builder_cond.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "cond_param"));
+  auto cond_dot = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, cond_param, cond_param));
+  auto cond_root = builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_dot, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond = module->AddEmbeddedComputation(builder_cond.Build());
+
+  auto builder_body = HloComputation::Builder("body");
+  auto body_param = builder_body.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "body_param"));
+  auto body_dot = builder_body.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, body_param, body_param));
+  auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+  auto while_hlo = builder.AddInstruction(
+      HloInstruction::CreateWhile(shape, cond, body, add));
+
+  auto dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, while_hlo, while_hlo));
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(cond_root->shape(), ShapeUtil::MakeShape(PRED, {})));
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(body_dot));
+  EXPECT_TRUE(OutputsBF16(body_param));
+  EXPECT_TRUE(OutputsBF16(cond_param));
+  EXPECT_FALSE(OutputsBF16(dot));
+}
+
+// Tests that BF16 is propagated properly through while computations with
+// tuple-shaped input/output.
+TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
   auto module = CreateNewModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
-- 
GitLab


From 2a6c5998a239f41926ca295ac20bb595862fd5ff Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 18 Apr 2018 22:59:01 -0700
Subject: [PATCH 1098/1262] [tf.data] Add native implementation for
 `tf.contrib.data.unbatch()`.

The implementation has two main improvements:
1. Avoid relatively expensive (~15us) function invocation for each incoming batch.
2. Use std::move() where possible to avoid copying strings/variants into the unbatched
   elements.

PiperOrigin-RevId: 193467856
---
 .../kernel_tests/batch_dataset_op_test.py     | 228 +++++++++++++++++-
 .../contrib/data/python/ops/batching.py       | 135 ++++++++---
 tensorflow/contrib/tpu/python/tpu/datasets.py |   2 +-
 .../base_api/api_def_UnbatchDataset.pbtxt     |   4 +
 .../python_api/api_def_UnbatchDataset.pbtxt   |   4 +
 tensorflow/core/framework/tensor.h            |   5 +
 tensorflow/core/kernels/batch_util.cc         |  73 +++++-
 tensorflow/core/kernels/batch_util.h          |   6 +
 tensorflow/core/kernels/data/BUILD            |  14 ++
 .../core/kernels/data/unbatch_dataset_op.cc   | 204 ++++++++++++++++
 tensorflow/core/ops/dataset_ops.cc            |   7 +
 11 files changed, 635 insertions(+), 47 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
 create mode 100644 tensorflow/core/kernels/data/unbatch_dataset_op.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 413d873797..e1ec60d7c9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -18,15 +18,18 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import time
 
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -34,6 +37,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 class BatchDatasetTest(test.TestCase):
@@ -151,6 +155,69 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
 
+  def testUnbatchDatasetWithStrings(self):
+    data = tuple([math_ops.range(10) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
+    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.apply(batching.unbatch())
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchDatasetWithSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors(st)
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        st_row = sess.run(next_element)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchDatasetWithDenseAndSparseTensor(self):
+    st = sparse_tensor.SparseTensorValue(
+        indices=[[i, i] for i in range(10)],
+        values=list(range(10)),
+        dense_shape=[10, 10])
+    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
+    data = data.apply(batching.unbatch())
+    data = data.batch(5)
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        dense_elem, st_row = sess.run(next_element)
+        self.assertEqual(i, dense_elem)
+        self.assertEqual([i], st_row.indices)
+        self.assertEqual([i], st_row.values)
+        self.assertEqual([10], st_row.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -191,6 +258,53 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
 
+  def testUnbatchEmpty(self):
+    data = dataset_ops.Dataset.from_tensors(
+        (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
+         constant_op.constant([], shape=[0, 4, 0])))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testUnbatchStaticShapeMismatch(self):
+    data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
+                                             np.arange(9)))
+    with self.assertRaises(ValueError):
+      data.apply(batching.unbatch())
+
+  def testUnbatchDynamicShapeMismatch(self):
+    ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
+    ph2 = array_ops.placeholder(dtypes.int32, shape=None)
+    data = dataset_ops.Dataset.from_tensors((ph1, ph2))
+    data = data.apply(batching.unbatch())
+    iterator = data.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Mismatch in the 0th dimension.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: np.arange(8).astype(np.int32)
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(next_element))
+
+      # No 0th dimension (i.e. scalar value) for one component.
+      sess.run(
+          iterator.initializer,
+          feed_dict={
+              ph1: np.arange(7).astype(np.int32),
+              ph2: 7
+          })
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(next_element))
+
   def testBatchAndDropRemainder(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -545,6 +659,28 @@ class BatchDatasetSerializationTest(
     self.run_core_tests(self._build_dataset_nested_sparse, None, 1)
 
 
+class UnbatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def build_dataset(self, multiplier=15.0, tensor_slice_len=2, batch_size=2):
+    components = (
+        np.arange(tensor_slice_len),
+        np.array([[1, 2, 3]]) * np.arange(tensor_slice_len)[:, np.newaxis],
+        np.array(multiplier) * np.arange(tensor_slice_len))
+
+    return dataset_ops.Dataset.from_tensor_slices(components).batch(
+        batch_size).apply(batching.unbatch())
+
+  def testCore(self):
+    tensor_slice_len = 8
+    batch_size = 2
+    num_outputs = tensor_slice_len
+    self.run_core_tests(
+        lambda: self.build_dataset(15.0, tensor_slice_len, batch_size),
+        lambda: self.build_dataset(20.0, tensor_slice_len, batch_size),
+        num_outputs)
+
+
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
@@ -586,10 +722,12 @@ class RestructuredDatasetTest(test.TestCase):
   def test_assert_element_shape(self):
 
     def create_unknown_shape_dataset(x):
-      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
-                                           np.zeros((3, 4), dtype=np.int32)),
-                                [x],
-                                [dtypes.float32, dtypes.int32])
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
 
     dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
@@ -626,10 +764,12 @@ class RestructuredDatasetTest(test.TestCase):
   def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
 
     def create_unknown_shape_dataset(x):
-      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
-                                           np.zeros((3, 4), dtype=np.int32)),
-                                [x],
-                                [dtypes.float32, dtypes.int32])
+      return script_ops.py_func(
+          lambda _: (  # pylint: disable=g-long-lambda
+              np.ones(2, dtype=np.float32),
+              np.zeros((3, 4), dtype=np.int32)),
+          [x],
+          [dtypes.float32, dtypes.int32])
 
     dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
@@ -649,5 +789,77 @@ class RestructuredDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class UnbatchDatasetBenchmark(test.Benchmark):
+
+  def benchmarkNativeUnbatch(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.apply(batching.unbatch())
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (native) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_native_batch_size_%d" %
+              batch_size)
+
+  # Include a benchmark of the previous `unbatch()` implementation that uses
+  # a composition of more primitive ops. Eventually we'd hope to generate code
+  # that is as good in both cases.
+  def benchmarkOldUnbatchImplementation(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset.make_initializable_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (unfused) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
+              batch_size)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 28db949da9..2152bcde84 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -80,28 +80,98 @@ def dense_to_sparse_batch(batch_size, row_shape):
   return _apply_fn
 
 
+class UnbatchDataset(dataset_ops.Dataset):
+  """A dataset that splits the elements of its input into multiple elements."""
+
+  def __init__(self, input_dataset):
+    """See `unbatch()` for more details."""
+    super(UnbatchDataset, self).__init__()
+    flat_shapes = nest.flatten(input_dataset.output_shapes)
+    if any(s.ndims == 0 for s in flat_shapes):
+      raise ValueError("Cannot unbatch an input with scalar components.")
+    known_batch_dim = tensor_shape.Dimension(None)
+    for s in flat_shapes:
+      try:
+        known_batch_dim = known_batch_dim.merge_with(s[0])
+      except ValueError:
+        raise ValueError("Cannot unbatch an input whose components have "
+                         "different batch sizes.")
+    self._input_dataset = input_dataset
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.unbatch_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return nest.map_structure(lambda s: s[1:],
+                              self._input_dataset.output_shapes)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
 def unbatch():
-  """A Transformation which splits the elements of a dataset.
+  """Splits elements of a dataset into multiple elements on the batch dimension.
 
   For example, if elements of the dataset are shaped `[B, a0, a1, ...]`,
-  where `B` may vary from element to element, then for each element in
-  the dataset, the unbatched dataset will contain `B` consecutive elements
+  where `B` may vary for each input element, then for each element in the
+  dataset, the unbatched dataset will contain `B` consecutive elements
   of shape `[a0, a1, ...]`.
 
+  ```python
+  # NOTE: The following example uses `{ ... }` to represent the contents
+  # of a dataset.
+  a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+  a.apply(tf.contrib.data.unbatch()) == {
+      'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'}
+  ```
+
   Returns:
     A `Dataset` transformation function, which can be passed to
     @{tf.data.Dataset.apply}.
   """
 
   def _apply_fn(dataset):
-
-    def unbatch_map(arg, *rest):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    if not sparse.any_sparse(dataset.output_classes):
+      return UnbatchDataset(dataset)
+
+    # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
+    # are normalized to the rank-1 dense representation, so that the
+    # sparse-oblivious unbatching logic will slice them
+    # appropriately. This leads to a somewhat inefficient re-encoding step
+    # for all SparseTensor components.
+    # TODO(mrry): Consider optimizing this in future
+    # if it turns out to be a bottleneck.
+    def normalize(arg, *rest):
       if rest:
-        return dataset_ops.Dataset.from_tensor_slices((arg,) + rest)
+        return sparse.serialize_many_sparse_tensors((arg,) + rest)
       else:
-        return dataset_ops.Dataset.from_tensor_slices(arg)
+        return sparse.serialize_many_sparse_tensors(arg)
+
+    normalized_dataset = dataset.map(normalize)
 
-    return dataset.flat_map(map_func=unbatch_map)
+    # NOTE(mrry): Our `map()` has lost information about the sparseness
+    # of any SparseTensor components, so re-apply the structure of the
+    # original dataset.
+    restructured_dataset = _RestructuredDataset(
+        normalized_dataset,
+        dataset.output_types,
+        dataset.output_shapes,
+        dataset.output_classes,
+        allow_unsafe_cast=True)
+    return UnbatchDataset(restructured_dataset)
 
   return _apply_fn
 
@@ -265,7 +335,8 @@ class _RestructuredDataset(dataset_ops.Dataset):
                dataset,
                output_types,
                output_shapes=None,
-               output_classes=None):
+               output_classes=None,
+               allow_unsafe_cast=False):
     """Creates a new dataset with the given output types and shapes.
 
     The given `dataset` must have a structure that is convertible:
@@ -283,6 +354,10 @@ class _RestructuredDataset(dataset_ops.Dataset):
         If omitted, the shapes will be inherited from `dataset`.
       output_classes: (Optional.) A nested structure of class types.
         If omitted, the class types will be inherited from `dataset`.
+      allow_unsafe_cast: (Optional.) If `True`, the caller may switch the
+        reported output types and shapes of the restructured dataset, e.g. to
+        switch a sparse tensor represented as `tf.variant` to its user-visible
+        type and shape.
 
     Raises:
       ValueError: If either `output_types` or `output_shapes` is not compatible
@@ -291,14 +366,15 @@ class _RestructuredDataset(dataset_ops.Dataset):
     super(_RestructuredDataset, self).__init__()
     self._dataset = dataset
 
-    # Validate that the types are compatible.
-    output_types = nest.map_structure(dtypes.as_dtype, output_types)
-    flat_original_types = nest.flatten(dataset.output_types)
-    flat_new_types = nest.flatten(output_types)
-    if flat_original_types != flat_new_types:
-      raise ValueError(
-          "Dataset with output types %r cannot be restructured to have output "
-          "types %r" % (dataset.output_types, output_types))
+    if not allow_unsafe_cast:
+      # Validate that the types are compatible.
+      output_types = nest.map_structure(dtypes.as_dtype, output_types)
+      flat_original_types = nest.flatten(dataset.output_types)
+      flat_new_types = nest.flatten(output_types)
+      if flat_original_types != flat_new_types:
+        raise ValueError(
+            "Dataset with output types %r cannot be restructured to have "
+            "output types %r" % (dataset.output_types, output_types))
 
     self._output_types = output_types
 
@@ -308,18 +384,19 @@ class _RestructuredDataset(dataset_ops.Dataset):
                                                   nest.flatten(
                                                       dataset.output_shapes))
     else:
-      # Validate that the shapes are compatible.
-      nest.assert_same_structure(output_types, output_shapes)
-      flat_original_shapes = nest.flatten(dataset.output_shapes)
-      flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
-
-      for original_shape, new_shape in zip(flat_original_shapes,
-                                           flat_new_shapes):
-        if not original_shape.is_compatible_with(new_shape):
-          raise ValueError(
-              "Dataset with output shapes %r cannot be restructured to have "
-              "incompatible output shapes %r" % (dataset.output_shapes,
-                                                 output_shapes))
+      if not allow_unsafe_cast:
+        # Validate that the shapes are compatible.
+        nest.assert_same_structure(output_types, output_shapes)
+        flat_original_shapes = nest.flatten(dataset.output_shapes)
+        flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
+
+        for original_shape, new_shape in zip(flat_original_shapes,
+                                             flat_new_shapes):
+          if not original_shape.is_compatible_with(new_shape):
+            raise ValueError(
+                "Dataset with output shapes %r cannot be restructured to have "
+                "incompatible output shapes %r" % (dataset.output_shapes,
+                                                   output_shapes))
       self._output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
     if output_classes is None:
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index 465c668fd8..2e472a2805 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -170,7 +170,7 @@ def StreamingFilesDataset(files,
         args=[source_handle],
         Tout=[dtypes.string],
         f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0]
 
   with ops.device('/job:%s' % worker_job):
     output_dataset = dataset_ops.Dataset.range(2).repeat().map(
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000..324fadac0a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  summary: "A dataset that splits the elements of its input into multiple elements."
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000..1e5415749f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 4d10f7efb5..58fbced606 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -44,6 +44,7 @@ class TensorProto;
 class VariantTensorData;
 namespace batch_util {
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
 }  // namespace batch_util
 
 /// @ingroup core
@@ -493,6 +494,10 @@ class Tensor {
   friend Status batch_util::CopyElementToSlice(
       Tensor element, Tensor* parent,
       int64 index);                // For access to RefCountIsOne().
+  friend Status batch_util::MaybeMoveSliceToElement(
+      Tensor* parent, Tensor* element,
+      int64 index);  // For access to RefCountIsOne().
+
   friend class NumpyTensorBuffer;  // For access to the private constructor
                                    // taking the buffer.
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 1a45212ad2..52be1ab8d0 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -78,14 +78,44 @@ Status HandleElementToSlice<Variant>(Tensor element, Tensor* parent,
   return Status::OK();
 }
 
-// TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
-// the data when possible.
-
+// TODO(b/78245576): Consider removing this overload.
 template <typename T>
-static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
-                                   int64 index) {
+void HandleSliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   element->flat<T>() = parent.flat_outer_dims<T>().chip(index, 0);
-  return Status::OK();
+}
+
+template <typename T>
+void HandleSliceToElement(Tensor* parent, Tensor* element, int64 index,
+                          bool can_move) {
+  element->flat<T>() = parent->flat_outer_dims<T>().chip(index, 0);
+}
+
+template <>
+void HandleSliceToElement<string>(Tensor* parent, Tensor* element, int64 index,
+                                  bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<string>();
+  auto element_flat = element->flat<string>();
+  if (can_move) {
+    for (int64 i = 0; i < element->NumElements(); ++i) {
+      element_flat(i) = std::move(parent_as_matrix(index, i));
+    }
+  } else {
+    element_flat = parent_as_matrix.chip(index, 0);
+  }
+}
+
+template <>
+void HandleSliceToElement<Variant>(Tensor* parent, Tensor* element, int64 index,
+                                   bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<Variant>();
+  auto element_flat = element->flat<Variant>();
+  if (can_move) {
+    for (int64 i = 0; i < element->NumElements(); ++i) {
+      element_flat(i) = std::move(parent_as_matrix(index, i));
+    }
+  } else {
+    element_flat = parent_as_matrix.chip(index, 0);
+  }
 }
 
 }  // namespace
@@ -115,9 +145,10 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   TF_RETURN_IF_ERROR(ValidateInput(parent, *element, index));
 
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleSliceToElement<T>(parent, element, index); \
+#define HANDLE_TYPE(T)                               \
+  case DataTypeToEnum<T>::value: {                   \
+    HandleSliceToElement<T>(parent, element, index); \
+    return Status::OK();                             \
   }
 
   switch (parent.dtype()) {
@@ -130,6 +161,30 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   }
 }
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+//
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(*parent, *element, index));
+  bool can_move = parent->RefCountIsOne();
+
+#define HANDLE_TYPE(T)                                         \
+  case DataTypeToEnum<T>::value: {                             \
+    HandleSliceToElement<T>(parent, element, index, can_move); \
+    return Status::OK();                                       \
+  }
+
+  switch (parent->dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "MaybeMoveSliceToElement Unhandled data type: ", element->dtype());
+  }
+}
+
 // The following five functions are copied from padding_fifo_queue.cc.
 // TODO(mrry): Reconcile these functions with the similar methods in the
 // queue implementation.
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index a47bf1935d..69098fbd1d 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -32,6 +32,12 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
 // Copies the index^th slice of parent (in the 0th dimension) into element.
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+//
+// NOTE(mrry): The implementation may be able to optimize the copy to a move.
+// This is particularly important for DT_STRING tensors.
+Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index);
+
 // Zero-initializes the tensor `element` using the scalar stored in `padding`.
 // Both `element` and `padding` must have matching `dtype`.
 Status SetElementZero(Tensor* element, const Tensor& padding);
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 221724e25d..1e96eb6421 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -446,6 +446,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "unbatch_dataset_op",
+    srcs = ["unbatch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -562,6 +575,7 @@ tf_kernel_library(
         ":tensor_dataset_op",
         ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
+        ":unbatch_dataset_op",
         ":zip_dataset_op",
     ],
 )
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
new file mode 100644
index 0000000000..241b615aca
--- /dev/null
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -0,0 +1,204 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class UnbatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit UnbatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(ctx, input);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
+      input_->Ref();
+      for (const PartialTensorShape& shape : input->output_shapes()) {
+        gtl::InlinedVector<int64, 4> partial_dim_sizes;
+        for (int i = 1; i < shape.dims(); ++i) {
+          partial_dim_sizes.push_back(shape.dim_size(i));
+        }
+        shapes_.emplace_back(std::move(partial_dim_sizes));
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Unbatch")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return shapes_;
+    }
+
+    string DebugString() override { return "UnbatchDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            current_index_(0),
+            current_batch_size_(0),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            shapes_(params.dataset->output_shapes().size()) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+        *end_of_sequence = false;
+        while (!*end_of_sequence) {
+          if (current_index_ < current_batch_size_) {
+            out_tensors->clear();
+            out_tensors->reserve(tensors_.size());
+            for (int i = 0; i < tensors_.size(); ++i) {
+              out_tensors->emplace_back(ctx->allocator({}), tensors_[i].dtype(),
+                                        shapes_[i]);
+              TF_RETURN_IF_ERROR(batch_util::MaybeMoveSliceToElement(
+                  &tensors_[i], &out_tensors->back(), current_index_));
+            }
+            ++current_index_;
+            *end_of_sequence = false;
+            return Status::OK();
+          }
+          current_index_ = 0;
+          current_batch_size_ = 0;
+          tensors_.clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &tensors_, end_of_sequence));
+          if (!*end_of_sequence) {
+            for (size_t i = 0; i < tensors_.size(); ++i) {
+              if (tensors_[i].dims() == 0) {
+                return errors::InvalidArgument(
+                    "Input element must have a non-scalar value in each "
+                    "component.");
+              }
+              if (tensors_[i].dim_size(0) != tensors_[0].dim_size(0)) {
+                return errors::InvalidArgument(
+                    "Input element must have the same batch size in each "
+                    "component. Component 0 had size ",
+                    tensors_[0].dim_size(0), " but component ", i,
+                    " had size, ", tensors_[i].dim_size(0), ".");
+              }
+              shapes_[i] = tensors_[i].shape();
+              shapes_[i].RemoveDim(0);
+            }
+            current_batch_size_ = tensors_[0].dim_size(0);
+          }
+        }
+        input_impl_.reset();
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("current_index"), current_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("n"), current_batch_size_));
+        if (current_index_ < current_batch_size_) {
+          for (size_t i = 0; i < tensors_.size(); ++i) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat("tensors[", i, "]")), tensors_[i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_index"), &current_index_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("n"), &current_batch_size_));
+        tensors_.clear();
+        tensors_.resize(dataset()->output_dtypes().size());
+        if (current_index_ < current_batch_size_) {
+          for (size_t i = 0; i < tensors_.size(); ++i) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat("tensors[", i, "]")), &tensors_[i]));
+            shapes_[i] = tensors_[i].shape();
+            shapes_[i].RemoveDim(0);
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 current_index_ GUARDED_BY(mu_);
+      int64 current_batch_size_ GUARDED_BY(mu_);
+      std::vector<Tensor> tensors_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::vector<TensorShape> shapes_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnbatchDataset").Device(DEVICE_CPU),
+                        UnbatchDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 57f871af32..8be569b315 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -83,6 +83,13 @@ REGISTER_OP("GeneratorDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("UnbatchDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
     .Output("handle: variant")
-- 
GitLab


From 2294834612cde9781e37021af7ba8480aadbb112 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 23:18:50 -0700
Subject: [PATCH 1099/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193469437
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 23 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 23 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 42a67bc4c8..9bc11cf0fe 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -70008,6 +70008,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "UnbatchGrad"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 980e560601..9b665190ce 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -32826,6 +32826,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "UnbatchGrad"
   input_arg {
-- 
GitLab


From 38dda0e7776b68e1da70ab0601d2511df67b4e05 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Thu, 19 Apr 2018 15:41:04 +0900
Subject: [PATCH 1100/1262] fix typo

---
 tensorflow/contrib/lite/kernels/add.cc                      | 2 +-
 tensorflow/contrib/lite/kernels/sub.cc                      | 2 +-
 tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 63ea89df56..e0aa070e2d 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -176,7 +176,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   output);
   } else {
     context->ReportError(context,
-                         "Inputs and outputs not all float|unit8 types.");
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index 5acb356181..7c60a4fdbf 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -175,7 +175,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                output);
   } else {
     context->ReportError(context,
-                         "Inputs and outputs not all float|unit8 types.");
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
index 9b00f5b19d..56a3658fa0 100644
--- a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
@@ -61,7 +61,7 @@ build the `tag` of the summary values:
    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 
 The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 Each element must be in the range `[0, 255]` (It represents the value of a
 pixel in the output image).  Non-finite values in the input tensor are
 replaced by this tensor in the output image.  The default value is the color
-- 
GitLab


From 2024f37f78e04ed1d035f53d2c3804bfb12e690f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 18 Apr 2018 23:46:34 -0700
Subject: [PATCH 1101/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 193471104

---
 tensorflow/go/op/wrappers.go | 570 +++++++++++++++++------------------
 1 file changed, 285 insertions(+), 285 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index f270eadc32..35ad1eff0f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -22806,49 +22806,6 @@ func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Out
 	return op.Output(0), op.Output(1)
 }
 
-//     Adds v into specified rows of x.
-//
-//     Computes y = x; y[i, :] += v; return y.
-//
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Restore a Reader to its initial clean state.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
 type ResourceApplyAdamAttr func(optionalAttr)
 
@@ -24195,21 +24152,299 @@ func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output,
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Adds v into specified rows of x.
+//
+//     Computes y = x; y[i, :] += v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceAdd",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a Reader to its initial clean state.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
 //
-// If `x` and `y` are reals, this will return the floating-point division.
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
+		Type: "StackPushV2",
 		Input: []tf.Input{
-			x, y,
+			handle, elem,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -26269,241 +26504,6 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
-
-// RpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
-}
-
-// RpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
-	}
-}
-
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
-	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
-	}
-}
-
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
-//
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
-//
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
-//
-// Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
-//
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Rpc",
-		Input: []tf.Input{
-			address, method, request,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
-}
-
-// Push an element onto the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
-//
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StackPushV2",
-		Input: []tf.Input{
-			handle, elem,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
 type FusedBatchNormGradV2Attr func(optionalAttr)
 
-- 
GitLab


From a4b0b02ef66586ac98d558099a37662a892f14f1 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 00:28:33 -0700
Subject: [PATCH 1102/1262] docs: Add a note on building the C and/or Java API
 binaries from source.

See #15290

PiperOrigin-RevId: 193473886
---
 .../docs_src/install/install_sources.md       | 33 +++++++++++++++----
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 7d7c2aa75a..26287aa3a1 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -241,12 +241,12 @@ One of the questions that `configure` will ask is as follows:
 Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]
 </pre>
 
-This question refers to a later phase in which you'll use bazel to
-[build the pip package](#build-the-pip-package).  We recommend
-accepting the default (`-march=native`), which will
-optimize the generated code for your local machine's CPU type.  However,
-if you are building TensorFlow on one CPU type but will run TensorFlow on
-a different CPU type, then consider specifying a more specific optimization
+This question refers to a later phase in which you'll use bazel to [build the
+pip package](#build-the-pip-package) or the [C/Java libraries](#BuildCorJava).
+We recommend accepting the default (`-march=native`), which will optimize the
+generated code for your local machine's CPU type.  However, if you are building
+TensorFlow on one CPU type but will run TensorFlow on a different CPU type, then
+consider specifying a more specific optimization
 flag as described in [the gcc
 documentation](https://gcc.gnu.org/onlinedocs/gcc-4.5.3/gcc/i386-and-x86_002d64-Options.html).
 
@@ -311,6 +311,10 @@ Note the following:
 
 ## Build the pip package
 
+Note: If you're only interested in building the libraries for the TensorFlow C
+or Java APIs, see [Build the C or Java libraries](#BuildCorJava), you do not
+need to build the pip package in that case.
+
 To build a pip package for TensorFlow with CPU-only support,
 you would typically invoke the following command:
 
@@ -503,3 +507,20 @@ Stack Overflow and specify the `tensorflow` tag.
 <tr><td>tensorflow-1.0.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.0.0</td><td>GPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr>
 </table>
+
+<a name="BuildCorJava"></a>
+## Build the C or Java libraries
+
+The instructions above are tailored to building the TensorFlow Python packages.
+
+If you're interested in building the libraries for the TensorFlow C API, do the
+following:
+
+1.  Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2.  Build the C libraries following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
+
+If you're interested inv building the libraries for the TensorFlow Java API,
+do the following:
+
+1.  Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2.  Build the Java library following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
-- 
GitLab


From d218339e6a05a984ef7b9a49d66db219d862936e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 19 Apr 2018 01:26:07 -0700
Subject: [PATCH 1103/1262] Remove proto import in header files for
 core/kernels/boosted_trees. Move implementations that requires declaration of
 TreeEnsemble to .cc files.

The goal is to make kernels mostly independent of proto headers, which will let us lock down our .so import

PiperOrigin-RevId: 193478404
---
 .../core/kernels/boosted_trees/resources.cc   | 138 ++++++++++++++++
 .../core/kernels/boosted_trees/resources.h    | 154 +++++-------------
 2 files changed, 178 insertions(+), 114 deletions(-)

diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 2ea12c522c..c410748c27 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -21,6 +21,35 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Constructor.
+BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
+    : tree_ensemble_(
+          protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
+              &arena_)) {}
+
+string BoostedTreesEnsembleResource::DebugString() {
+  return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
+                         "]");
+}
+
+bool BoostedTreesEnsembleResource::InitFromSerialized(const string& serialized,
+                                                      const int64 stamp_token) {
+  CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
+  if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
+    set_stamp(stamp_token);
+    return true;
+  }
+  return false;
+}
+
+string BoostedTreesEnsembleResource::SerializeAsString() const {
+  return tree_ensemble_->SerializeAsString();
+}
+
+int32 BoostedTreesEnsembleResource::num_trees() const {
+  return tree_ensemble_->trees_size();
+}
+
 int32 BoostedTreesEnsembleResource::next_node(
     const int32 tree_id, const int32 node_id, const int32 index_in_batch,
     const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const {
@@ -49,6 +78,115 @@ float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
   }
 }
 
+int32 BoostedTreesEnsembleResource::GetNumLayersGrown(
+    const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
+}
+
+void BoostedTreesEnsembleResource::SetNumLayersGrown(
+    const int32 tree_id, int32 new_num_layers) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
+      new_num_layers);
+}
+
+void BoostedTreesEnsembleResource::UpdateLastLayerNodesRange(
+    const int32 node_range_start, int32 node_range_end) const {
+  tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+      node_range_start);
+  tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+      node_range_end);
+}
+
+void BoostedTreesEnsembleResource::GetLastLayerNodesRange(
+    int32* node_range_start, int32* node_range_end) const {
+  *node_range_start =
+      tree_ensemble_->growing_metadata().last_layer_node_start();
+  *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+}
+
+int64 BoostedTreesEnsembleResource::GetNumNodes(const int32 tree_id) {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->trees(tree_id).nodes_size();
+}
+
+int32 BoostedTreesEnsembleResource::GetNumLayersAttempted() {
+  return tree_ensemble_->growing_metadata().num_layers_attempted();
+}
+
+bool BoostedTreesEnsembleResource::is_leaf(const int32 tree_id,
+                                           const int32 node_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  return node.node_case() == boosted_trees::Node::kLeaf;
+}
+
+int32 BoostedTreesEnsembleResource::feature_id(const int32 tree_id,
+                                               const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().feature_id();
+}
+
+int32 BoostedTreesEnsembleResource::bucket_threshold(
+    const int32 tree_id, const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().threshold();
+}
+
+int32 BoostedTreesEnsembleResource::left_id(const int32 tree_id,
+                                            const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().left_id();
+}
+
+int32 BoostedTreesEnsembleResource::right_id(const int32 tree_id,
+                                             const int32 node_id) const {
+  const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  return node.bucketized_split().right_id();
+}
+
+std::vector<float> BoostedTreesEnsembleResource::GetTreeWeights() const {
+  return {tree_ensemble_->tree_weights().begin(),
+          tree_ensemble_->tree_weights().end()};
+}
+
+float BoostedTreesEnsembleResource::GetTreeWeight(const int32 tree_id) const {
+  return tree_ensemble_->tree_weights(tree_id);
+}
+
+float BoostedTreesEnsembleResource::IsTreeFinalized(const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).is_finalized();
+}
+
+float BoostedTreesEnsembleResource::IsTreePostPruned(
+    const int32 tree_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size() >
+         0;
+}
+
+void BoostedTreesEnsembleResource::SetIsFinalized(const int32 tree_id,
+                                                  const bool is_finalized) {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
+      is_finalized);
+}
+
+// Sets the weight of i'th tree.
+void BoostedTreesEnsembleResource::SetTreeWeight(const int32 tree_id,
+                                                 const float weight) {
+  DCHECK_GE(tree_id, 0);
+  DCHECK_LT(tree_id, num_trees());
+  tree_ensemble_->set_tree_weights(tree_id, weight);
+}
+
 void BoostedTreesEnsembleResource::UpdateGrowingMetadata() const {
   tree_ensemble_->mutable_growing_metadata()->set_num_layers_attempted(
       tree_ensemble_->growing_metadata().num_layers_attempted() + 1);
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index 561ca3a18a..df78d3f275 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -17,12 +17,16 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
+// Forward declaration for proto class TreeEnsemble
+namespace boosted_trees {
+class TreeEnsemble;
+}  // namespace boosted_trees
+
 // A StampedResource is a resource that has a stamp token associated with it.
 // Before reading from or applying updates to the resource, the stamp should
 // be checked to verify that the update is not stale.
@@ -42,31 +46,15 @@ class StampedResource : public ResourceBase {
 // Keep a tree ensemble in memory for efficient evaluation and mutation.
 class BoostedTreesEnsembleResource : public StampedResource {
  public:
-  // Constructor.
-  BoostedTreesEnsembleResource()
-      : tree_ensemble_(
-            protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
-                &arena_)) {}
-
-  string DebugString() override {
-    return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
-                           "]");
-  }
-
-  bool InitFromSerialized(const string& serialized, const int64 stamp_token) {
-    CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
-    if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
-      set_stamp(stamp_token);
-      return true;
-    }
-    return false;
-  }
-
-  string SerializeAsString() const {
-    return tree_ensemble_->SerializeAsString();
-  }
-
-  int32 num_trees() const { return tree_ensemble_->trees_size(); }
+  BoostedTreesEnsembleResource();
+
+  string DebugString() override;
+
+  bool InitFromSerialized(const string& serialized, const int64 stamp_token);
+
+  string SerializeAsString() const;
+
+  int32 num_trees() const;
 
   // Find the next node to which the example (specified by index_in_batch)
   // traverses down from the current node indicated by tree_id and node_id.
@@ -82,73 +70,31 @@ class BoostedTreesEnsembleResource : public StampedResource {
 
   float node_value(const int32 tree_id, const int32 node_id) const;
 
-  int32 GetNumLayersGrown(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
-  }
+  int32 GetNumLayersGrown(const int32 tree_id) const;
 
-  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
-        new_num_layers);
-  }
+  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const;
 
   void UpdateLastLayerNodesRange(const int32 node_range_start,
-                                 int32 node_range_end) const {
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
-        node_range_start);
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
-        node_range_end);
-  }
+                                 int32 node_range_end) const;
 
   void GetLastLayerNodesRange(int32* node_range_start,
-                              int32* node_range_end) const {
-    *node_range_start =
-        tree_ensemble_->growing_metadata().last_layer_node_start();
-    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
-  }
+                              int32* node_range_end) const;
 
-  int64 GetNumNodes(const int32 tree_id) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->trees(tree_id).nodes_size();
-  }
+  int64 GetNumNodes(const int32 tree_id);
 
   void UpdateGrowingMetadata() const;
 
-  int32 GetNumLayersAttempted() {
-    return tree_ensemble_->growing_metadata().num_layers_attempted();
-  }
-
-  bool is_leaf(const int32 tree_id, const int32 node_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
-    const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    return node.node_case() == boosted_trees::Node::kLeaf;
-  }
-
-  int32 feature_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().feature_id();
-  }
-
-  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().threshold();
-  }
-
-  int32 left_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().left_id();
-  }
-
-  int32 right_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().right_id();
-  }
+  int32 GetNumLayersAttempted();
+
+  bool is_leaf(const int32 tree_id, const int32 node_id) const;
+
+  int32 feature_id(const int32 tree_id, const int32 node_id) const;
+
+  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const;
+
+  int32 left_id(const int32 tree_id, const int32 node_id) const;
+
+  int32 right_id(const int32 tree_id, const int32 node_id) const;
 
   // Add a tree to the ensemble and returns a new tree_id.
   int32 AddNewTree(const float weight);
@@ -163,38 +109,18 @@ class BoostedTreesEnsembleResource : public StampedResource {
   // Retrieves tree weights and returns as a vector.
   // It involves a copy, so should be called only sparingly (like once per
   // iteration, not per example).
-  std::vector<float> GetTreeWeights() const {
-    return {tree_ensemble_->tree_weights().begin(),
-            tree_ensemble_->tree_weights().end()};
-  }
-
-  float GetTreeWeight(const int32 tree_id) const {
-    return tree_ensemble_->tree_weights(tree_id);
-  }
-
-  float IsTreeFinalized(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id).is_finalized();
-  }
-
-  float IsTreePostPruned(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id)
-               .post_pruned_nodes_meta_size() > 0;
-  }
-
-  void SetIsFinalized(const int32 tree_id, const bool is_finalized) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
-        is_finalized);
-  }
+  std::vector<float> GetTreeWeights() const;
+
+  float GetTreeWeight(const int32 tree_id) const;
+
+  float IsTreeFinalized(const int32 tree_id) const;
+
+  float IsTreePostPruned(const int32 tree_id) const;
+
+  void SetIsFinalized(const int32 tree_id, const bool is_finalized);
 
   // Sets the weight of i'th tree.
-  void SetTreeWeight(const int32 tree_id, const float weight) {
-    DCHECK_GE(tree_id, 0);
-    DCHECK_LT(tree_id, num_trees());
-    tree_ensemble_->set_tree_weights(tree_id, weight);
-  }
+  void SetTreeWeight(const int32 tree_id, const float weight);
 
   // Resets the resource and frees the protos in arena.
   // Caller needs to hold the mutex lock while calling this.
-- 
GitLab


From b2536f05bb156612c96f204041ea31980b711fc8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 01:56:31 -0700
Subject: [PATCH 1104/1262] Update feature_util's GetFeatures to show
 compile-time error for unsupported types instead of a link-time error.

PiperOrigin-RevId: 193480683
---
 tensorflow/core/example/feature_util.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index d977935b8a..2265498b5e 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -182,13 +182,25 @@ struct FeatureTrait<
 // Returns true if sequence_example has a feature_list with the specified key.
 bool HasFeatureList(const string& key, const SequenceExample& sequence_example);
 
+template <typename T>
+struct TypeHasFeatures : std::false_type {};
+
+template <>
+struct TypeHasFeatures<Example> : std::true_type {};
+
+template <>
+struct TypeHasFeatures<Features> : std::true_type {};
+
 // A family of template functions to return mutable Features proto from a
 // container proto. Supported ProtoTypes: Example, Features.
 template <typename ProtoType>
-Features* GetFeatures(ProtoType* proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value, Features*>::type
+GetFeatures(ProtoType* proto);
 
 template <typename ProtoType>
-const Features& GetFeatures(const ProtoType& proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value,
+                        const Features&>::type
+GetFeatures(const ProtoType& proto);
 
 // Base declaration of a family of template functions to return a read only
 // repeated field of feature values.
@@ -300,7 +312,7 @@ bool HasFeature(const string& key, const Features& features);
 template <typename... FeatureType>
 bool HasFeature(const string& key, const Example& example) {
   return HasFeature<FeatureType...>(key, GetFeatures(example));
-};
+}
 
 // DEPRECATED: use HasFeature instead.
 // TODO(gorban): update all clients in a followup CL.
-- 
GitLab


From 5fb3c64421f53aa7ef58ffcee6de47cd4a40fe2d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 02:58:31 -0700
Subject: [PATCH 1105/1262] Set the random seed in on-demand mode.

PiperOrigin-RevId: 193488103
---
 tensorflow/compiler/jit/xla_compile_on_demand_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 682d6ea8cc..6c2782e28e 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -67,6 +67,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   run_options.set_stream(stream);
   run_options.set_allocator(client->backend().memory_allocator());
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
 
   auto run_result = executable->Run(launch_context.arguments(), run_options);
   TF_RETURN_IF_ERROR(run_result.status());
-- 
GitLab


From bf86d3a46b4e2ef4dabcba211c1ce36cb81ac315 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 04:27:38 -0700
Subject: [PATCH 1106/1262] Handle corner case in Python 3: members annotated
 with @classmethod.

PiperOrigin-RevId: 193495506
---
 tensorflow/contrib/autograph/pyct/inspect_utils.py   | 12 +++++++-----
 .../contrib/autograph/pyct/inspect_utils_test.py     |  7 +++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index a0f56a6c1f..eef74599a7 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -75,13 +75,15 @@ def getdefiningclass(m, owner_class):
   """Resolves the class (e.g. one of the superclasses) that defined a method."""
   # Normalize bound functions to their respective unbound versions.
   m = _get_unbound_function(m)
-  last_defining = owner_class
-  for superclass in tf_inspect.getmro(owner_class):
+  for superclass in owner_class.__bases__:
     if hasattr(superclass, m.__name__):
       superclass_m = getattr(superclass, m.__name__)
-      if _get_unbound_function(superclass_m) == m:
-        last_defining = superclass
-  return last_defining
+      if _get_unbound_function(superclass_m) is m:
+        return superclass
+      elif hasattr(m, '__self__') and m.__self__ == owner_class:
+        # Python 3 class methods only work this way it seems :S
+        return superclass
+  return owner_class
 
 
 def getmethodclass(m):
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index cf841dae81..1a212f676a 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -243,6 +243,10 @@ class InspectUtilsTest(test.TestCase):
       def bar(self):
         pass
 
+      @classmethod
+      def class_method(cls):
+        pass
+
     class Subclass(Superclass):
 
       def foo(self):
@@ -257,6 +261,9 @@ class InspectUtilsTest(test.TestCase):
         inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass)
     self.assertTrue(
         inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.class_method, Subclass) is
+        Superclass)
 
   def test_isbuiltin(self):
     self.assertTrue(inspect_utils.isbuiltin(range))
-- 
GitLab


From 06d802ab61987bde76a30098ff7930c27d561375 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 05:11:30 -0700
Subject: [PATCH 1107/1262] Support for converting entire class hierarchies:  *
 limit the methods being converted to those that have not been inherited from
 the superclass  * include the (possibly compiled) superclass in the
 definition of the compiled class  * either mark the superclass for conversion
 or generate an absolute aliased import line, depending on whether it's
 whitelisted

PiperOrigin-RevId: 193499204
---
 .../autograph/converters/call_trees.py        | 10 ++--
 tensorflow/contrib/autograph/impl/api.py      |  2 +-
 .../contrib/autograph/impl/conversion.py      | 58 +++++++++++++++---
 .../contrib/autograph/impl/conversion_test.py | 60 +++++++++++++++++++
 4 files changed, 117 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index e390d1a262..2e5590b46c 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -245,8 +245,6 @@ class CallTreeTransformer(transformer.Base):
     new_call.keywords = node.keywords
     return new_call
 
-  # pylint:disable=invalid-name
-
   def visit_Expr(self, node):
     if isinstance(node.value, gast.Call):
       if anno.hasanno(node.value.func, 'live_val'):
@@ -294,15 +292,17 @@ class CallTreeTransformer(transformer.Base):
         raise NotImplementedError(
             'py_func with return values (unknown function)')
     else:
-      if self.context.recursive:
+      if ast_util.matches(node, 'super(_)'):
+        # super() calls are preserved. The class conversion mechanism will
+        # ensure that they return the correct value.
+        pass
+      elif self.context.recursive:
         node = self._insert_dynamic_conversion(node)
       else:
         # Unresolved functions are allowed in non-recursive mode.
         pass
     return node
 
-  # pylint:enable=invalid-name
-
 
 def transform(node, context, uncompiled_modules, nocompile_decorators):
   """Transform function call to the compiled counterparts.
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index f97a33326e..d874ef15c9 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -241,7 +241,7 @@ def to_graph(e,
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
     module.body.extend(parser.parse_str(import_line).body)
-  for dep in conversion_map.dependency_cache.values():
+  for dep in reversed(conversion_map.dependency_cache.values()):
     module.body.append(dep)
   compiled_node, compiled_src = compiler.ast_to_object(module)
 
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 5653e991f6..e7230a5f45 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import imp
 
 import gast
@@ -39,6 +40,7 @@ from tensorflow.contrib.autograph.converters import side_effect_guards
 from tensorflow.contrib.autograph.converters import single_return
 from tensorflow.contrib.autograph.impl import config
 from tensorflow.contrib.autograph.impl import naming
+from tensorflow.contrib.autograph.pyct import ast_util
 from tensorflow.contrib.autograph.pyct import context
 from tensorflow.contrib.autograph.pyct import inspect_utils
 from tensorflow.contrib.autograph.pyct import parser
@@ -81,7 +83,9 @@ class ConversionMap(object):
     self.recursive = recursive
     self.nocompile_decorators = nocompile_decorators
     self.partial_types = partial_types if partial_types else ()
-    self.dependency_cache = {}
+    # Required to output dependencies in discovery order, which should match
+    # the reverse dependency order.
+    self.dependency_cache = collections.OrderedDict()
     self.additional_imports = set()
     self.name_map = {}
     self.api_module = api_module
@@ -201,6 +205,9 @@ def class_to_graph(c, conversion_map):
 
   class_namespace = {}
   for _, m in members:
+    # Only convert the members that are directly defined by the class.
+    if inspect_utils.getdefiningclass(m, c) is not c:
+      continue
     node, _, namespace = function_to_graph(
         m,
         conversion_map=conversion_map,
@@ -214,12 +221,49 @@ def class_to_graph(c, conversion_map):
     converted_members[m] = node
   namer = conversion_map.new_namer(class_namespace)
   class_name = namer.compiled_class_name(c.__name__, c)
-  node = gast.ClassDef(
-      class_name,
-      bases=[],
-      keywords=[],
-      body=list(converted_members.values()),
-      decorator_list=[])
+
+  # TODO(mdan): This needs to be explained more thoroughly.
+  # Process any base classes: if the sueprclass if of a whitelisted type, an
+  # absolute import line is generated. Otherwise, it is marked for conversion
+  # (as a side effect of the call to namer.compiled_class_name() followed by
+  # conversion_map.update_name_map(namer)).
+  output_nodes = []
+  renames = {}
+  bases = []
+  for base in c.__bases__:
+    if isinstance(object, base):
+      bases.append('object')
+      continue
+    if is_whitelisted_for_graph(base):
+      alias = namer.new_symbol(base.__name__, ())
+      output_nodes.append(
+          gast.ImportFrom(
+              module=base.__module__,
+              names=[gast.alias(name=base.__name__, asname=alias)],
+              level=0))
+    else:
+      # This will trigger a conversion into a class with this name.
+      alias = namer.compiled_class_name(base.__name__, base)
+    bases.append(alias)
+    renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
+  conversion_map.update_name_map(namer)
+
+  # Generate the definition of the converted class.
+  output_nodes.append(
+      gast.ClassDef(
+          class_name,
+          bases=bases,
+          keywords=[],
+          body=list(converted_members.values()),
+          decorator_list=[]))
+  node = gast.Module(output_nodes)
+
+  # Make a final pass to replace references to the class or its base classes.
+  # Most commonly, this occurs when making super().__init__() calls.
+  # TODO(mdan): Making direct references to superclass' superclass will fail.
+  node = qual_names.resolve(node)
+  renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name)
+  node = ast_util.rename_symbols(node, renames)
 
   return node, class_name, class_namespace
 
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index da3220892f..5edd8e74a8 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.impl import api
 from tensorflow.contrib.autograph.impl import conversion
 from tensorflow.python.framework import constant_op
+from tensorflow.python.keras._impl.keras.engine import training
 from tensorflow.python.platform import test
 
 
@@ -78,6 +79,65 @@ class ConversionTest(test.TestCase):
         conversion_map.dependency_cache[f].body[0].body[0].value.func.id)
     self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
 
+  def test_entity_to_graph_class_hierarchy(self):
+
+    class TestBase(object):
+
+      def __init__(self, x='base'):
+        self.x = x
+
+      def foo(self):
+        return self.x
+
+      def bar(self):
+        return self.x
+
+    class TestSubclass(TestBase):
+
+      def __init__(self, y):
+        super(TestSubclass, self).__init__('sub')
+        self.y = y
+
+      def foo(self):
+        return self.y
+
+      def baz(self):
+        return self.y
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(TestSubclass, conversion_map, None, None)
+
+    self.assertTrue(TestBase in conversion_map.dependency_cache)
+    self.assertTrue(TestSubclass in conversion_map.dependency_cache)
+    self.assertEqual('TfTestBase',
+                     conversion_map.dependency_cache[TestBase].body[-1].name)
+    self.assertEqual(
+        'TfTestSubclass',
+        conversion_map.dependency_cache[TestSubclass].body[-1].name)
+
+  def test_entity_to_graph_class_hierarchy_whitelisted(self):
+
+    class TestSubclass(training.Model):
+
+      def __init__(self, y):
+        super(TestSubclass, self).__init__()
+        self.built = False
+
+      def call(self, x):
+        return 3 * x
+
+    conversion_map = self._simple_conversion_map()
+    conversion.entity_to_graph(TestSubclass, conversion_map, None, None)
+
+    self.assertTrue(TestSubclass in conversion_map.dependency_cache)
+    self.assertFalse(training.Model in conversion_map.dependency_cache)
+    self.assertEqual(
+        'Model',
+        conversion_map.dependency_cache[TestSubclass].body[0].names[0].name)
+    self.assertEqual(
+        'TfTestSubclass',
+        conversion_map.dependency_cache[TestSubclass].body[-1].name)
+
   def test_entity_to_graph_lambda(self):
     f = lambda a: a
 
-- 
GitLab


From 40f77655affb162d32b7d4861fa68c35fc3d8f7a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 06:58:34 -0700
Subject: [PATCH 1108/1262] Update the Colorbot demo to use a Keras model in
 addition to the Estimator.

PiperOrigin-RevId: 193508874
---
 ...imator.ipynb => rnn_keras_estimator.ipynb} | 677 +++++-------------
 1 file changed, 167 insertions(+), 510 deletions(-)
 rename tensorflow/contrib/autograph/examples/notebooks/{rnn_colorbot_estimator.ipynb => rnn_keras_estimator.ipynb} (50%)

diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
similarity index 50%
rename from tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
rename to tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
index 7f5e4d4ac1..324b23c24b 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
@@ -62,7 +62,7 @@
         }
       },
       "source": [
-        "# Case study: building an RNN\n"
+        "# Case study: training a custom RNN, using Keras and Estimators\n"
       ]
     },
     {
@@ -118,6 +118,16 @@
         "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
         "  return rgb, chars, length\n",
         "\n",
+        "\n",
+        "def set_static_batch_shape(batch_size):\n",
+        "  def apply(rgb, chars, length):\n",
+        "    rgb.set_shape((batch_size, None))\n",
+        "    chars.set_shape((batch_size, None, 256))\n",
+        "    length.set_shape((batch_size,))\n",
+        "    return rgb, chars, length\n",
+        "  return apply\n",
+        "\n",
+        "\n",
         "def load_dataset(data_dir, url, batch_size, training=True):\n",
         "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
         "  path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n",
@@ -129,7 +139,10 @@
         "  if training:\n",
         "    dataset = dataset.shuffle(buffer_size=3000)\n",
         "  dataset = dataset.padded_batch(\n",
-        "      batch_size, padded_shapes=([None], [None, None], []))\n",
+        "      batch_size, padded_shapes=((None,), (None, 256), ()))\n",
+        "  # To simplify the model code, we statically set as many of the shapes that we\n",
+        "  # know.\n",
+        "  dataset = dataset.map(set_static_batch_shape(batch_size))\n",
         "  return dataset"
       ]
     },
@@ -145,7 +158,8 @@
       "source": [
         "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n",
         "\n",
-        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode."
+        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode.\n",
+        "We use Keras to define the model, and we will train it using Estimators."
       ]
     },
     {
@@ -166,70 +180,72 @@
       },
       "outputs": [],
       "source": [
-        "class RnnColorbot(object):\n",
-        "  \"\"\"Holds the parameters of the colorbot model.\"\"\"\n",
+        "@autograph.convert()\n",
+        "class RnnColorbot(tf.keras.Model):\n",
+        "  \"\"\"RNN Colorbot model.\"\"\"\n",
         "\n",
         "  def __init__(self):\n",
+        "    super(RnnColorbot, self).__init__()\n",
         "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
         "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
         "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
         "\n",
+        "\n",
+        "  def _rnn_layer(self, chars, cell, batch_size, training):\n",
+        "    \"\"\"A single RNN layer.\n",
+        "\n",
+        "    Args:\n",
+        "      chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "      cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "      batch_size: Int, the batch size to use\n",
+        "      training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "    Returns:\n",
+        "      A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "    \"\"\"\n",
+        "    hidden_outputs = []\n",
+        "    autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "    state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "    for ch in chars:\n",
+        "      cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "      hidden_outputs.append(cell_output)\n",
+        "    hidden_outputs = hidden_outputs.stack()\n",
+        "    if training:\n",
+        "      hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "    return hidden_outputs\n",
+        "\n",
+        "  def build(self, _):\n",
+        "    \"\"\"Creates the model variables. See keras.Model.build().\"\"\"\n",
         "    self.lower_cell.build(tf.TensorShape((None, 256)))\n",
         "    self.upper_cell.build(tf.TensorShape((None, 256)))\n",
-        "    self.relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "    self.relu_layer.build(tf.TensorShape((None, 128)))    \n",
+        "    self.built = True\n",
         "\n",
         "\n",
-        "def rnn_layer(chars, cell, batch_size, training):\n",
-        "  \"\"\"A simple RNN layer.\n",
-        "  \n",
-        "  Args:\n",
-        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
-        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
-        "    batch_size: Int, the batch size to use\n",
-        "    training: Boolean, whether the layer is used for training\n",
+        "  def call(self, inputs, training=False):\n",
+        "    \"\"\"The RNN model code. Uses Eager and \n",
         "\n",
-        "  Returns:\n",
-        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
-        "  \"\"\"\n",
-        "  hidden_outputs = []\n",
-        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
-        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
-        "  for ch in chars:\n",
-        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
-        "    hidden_outputs.append(cell_output)\n",
-        "  hidden_outputs = hidden_outputs.stack()\n",
-        "  if training:\n",
-        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
-        "  return hidden_outputs\n",
+        "    The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "    followed by a fully connected layer with ReLU activation.\n",
         "\n",
+        "    Args:\n",
+        "      inputs: A tuple (chars, length)\n",
+        "      training: Boolean, whether the layer is used for training\n",
         "\n",
-        "@autograph.convert(recursive=True)\n",
-        "def model(inputs, colorbot, batch_size, training):\n",
-        "  \"\"\"RNNColorbot model.\n",
-        "  \n",
-        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
-        "  followed by a fully connected layer with ReLU activation.\n",
-        "  \n",
-        "  Args:\n",
-        "    inputs: A tuple (chars, length)\n",
-        "    colorbot: An object of type RnnColorbot\n",
-        "    batch_size: Int, the batch size to use\n",
-        "    training: Boolean, whether the layer is used for training\n",
-        "    \n",
-        "  Returns:\n",
-        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
-        "  \"\"\"\n",
-        "  (chars, length) = inputs\n",
-        "  seq = tf.transpose(chars, [1, 0, 2])\n",
-        "  seq.set_shape((None, batch_size, 256))\n",
+        "    Returns:\n",
+        "      A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "    \"\"\"\n",
+        "    chars, length = inputs\n",
+        "    batch_size = chars.shape[0]\n",
+        "    seq = tf.transpose(chars, (1, 0, 2))\n",
         "\n",
-        "  seq = rnn_layer(seq, colorbot.lower_cell, batch_size, training)\n",
-        "  seq = rnn_layer(seq, colorbot.upper_cell, batch_size, training)\n",
+        "    seq = self._rnn_layer(seq, self.lower_cell, batch_size, training)\n",
+        "    seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n",
         "\n",
-        "  # Grab just the end-of-sequence from each output.\n",
-        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
-        "  sequence_ends = tf.gather_nd(seq, indices)\n",
-        "  return colorbot.relu_layer(sequence_ends)\n",
+        "    # Grab just the end-of-sequence from each output.\n",
+        "    indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "    sequence_ends = tf.gather_nd(seq, indices)\n",
+        "    return self.relu_layer(sequence_ends)\n",
         "\n",
         "@autograph.convert()\n",
         "def loss_fn(labels, predictions):\n",
@@ -246,9 +262,9 @@
         }
       },
       "source": [
-        "We will now create the model function for the estimator.\n",
+        "We will now create the model function for the custom Estimator.\n",
         "\n",
-        "In the model function, we simply call the converted functions that we defined above - that's it!"
+        "In the model function, we simply use the model class we defined above - that's it!"
       ]
     },
     {
@@ -275,14 +291,12 @@
         "  sequence_length = features['sequence_length']\n",
         "  inputs = (chars, sequence_length)\n",
         "\n",
-        "  # Create the model components.\n",
-        "  # Simply calling the AutoGraph-ed functions and objects just works!\n",
+        "  # Create the model. Simply using the AutoGraph-ed class just works!\n",
         "  colorbot = RnnColorbot()\n",
-        "  \n",
-        "  batch_size = params['batch_size']\n",
+        "  colorbot.build(None)\n",
         "\n",
         "  if mode == tf.estimator.ModeKeys.TRAIN:\n",
-        "    predictions = model(inputs, colorbot, batch_size, training=True)\n",
+        "    predictions = colorbot(inputs, training=True)\n",
         "    loss = loss_fn(labels, predictions)\n",
         "\n",
         "    learning_rate = params['learning_rate']\n",
@@ -292,14 +306,13 @@
         "    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
         "\n",
         "  elif mode == tf.estimator.ModeKeys.EVAL:\n",
-        "    predictions = model(inputs, colorbot, batch_size, training=False)\n",
+        "    predictions = colorbot(inputs)\n",
         "    loss = loss_fn(labels, predictions)\n",
         "\n",
         "    return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
-        "  \n",
+        "\n",
         "  elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-        "    # For prediction, we expect single tensors.\n",
-        "    predictions = model(inputs, colorbot, 1, training=False)\n",
+        "    predictions = colorbot(inputs)\n",
         "\n",
         "    predictions = tf.minimum(predictions, 1.0)\n",
         "    return tf.estimator.EstimatorSpec(mode, predictions=predictions)"
@@ -368,7 +381,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": 7,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -379,9 +392,9 @@
         },
         "colab_type": "code",
         "executionInfo": {
-          "elapsed": 10064,
+          "elapsed": 10604,
           "status": "ok",
-          "timestamp": 1523580419240,
+          "timestamp": 1524095272039,
           "user": {
             "displayName": "",
             "photoUrl": "",
@@ -390,7 +403,7 @@
           "user_tz": 240
         },
         "id": "2pg1AfbxBJQq",
-        "outputId": "41894b16-3d3a-4e30-f6e4-5a9c837a2210",
+        "outputId": "9c924b4f-06e1-4538-976c-a3e1ddac5660",
         "slideshow": {
           "slide_type": "-"
         }
@@ -400,7 +413,7 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Eval loss at step 100: 0.0665446\n"
+            "Eval loss at step 100: 0.0674834\n"
           ]
         }
       ],
@@ -444,7 +457,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 0,
+      "execution_count": 8,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -455,9 +468,9 @@
         },
         "colab_type": "code",
         "executionInfo": {
-          "elapsed": 31286,
+          "elapsed": 7990,
           "status": "ok",
-          "timestamp": 1523580450579,
+          "timestamp": 1524095280105,
           "user": {
             "displayName": "",
             "photoUrl": "",
@@ -466,7 +479,7 @@
           "user_tz": 240
         },
         "id": "dxHex2tUN_10",
-        "outputId": "b3dc558d-b800-4e9b-e60e-3441124e80d8",
+        "outputId": "2b889e5a-b9ed-4645-bf03-d98f26c72101",
         "slideshow": {
           "slide_type": "slide"
         }
@@ -478,7 +491,7 @@
               "\u003clink rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'\u003e\u003c/link\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f4112527e90\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3f36aa6cd0\u003e"
             ]
           },
           "metadata": {
@@ -494,7 +507,7 @@
               "\u003cscript src='/nbextensions/google.colab/tabbar_main.min.js'\u003e\u003c/script\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f4112527f10\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3eca67f7d0\u003e"
             ]
           },
           "metadata": {
@@ -510,7 +523,7 @@
               "\u003cdiv id=\"id1\"\u003e\u003c/div\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f4112527f50\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3eca67f8d0\u003e"
             ]
           },
           "metadata": {
@@ -523,11 +536,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f474-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"borderColor\": [\"#a7a7a7\"], \"contentBorder\": [\"0px\"], \"tabNames\": [\"RNN Colorbot\"], \"elementId\": \"id1\"});\n",
-              "//# sourceURL=js_a0db480422"
+              "window[\"e8ddfa22-4362-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"elementId\": \"id1\", \"borderColor\": [\"#a7a7a7\"], \"contentHeight\": [\"initial\"], \"tabNames\": [\"RNN Colorbot\"], \"location\": \"top\", \"initialSelection\": 0});\n",
+              "//# sourceURL=js_71b9087b6d"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd1d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f950\u003e"
             ]
           },
           "metadata": {
@@ -540,11 +553,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f475-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_d2a46ea291"
+              "window[\"e8ddfa23-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_e390445f33"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd0d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e"
             ]
           },
           "metadata": {
@@ -557,11 +570,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_0a8262c6e9"
+              "window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_241dd76d85"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd390\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e"
             ]
           },
           "metadata": {
@@ -575,11 +588,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_e32f85ccd2"
+              "window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_60c64e3d50"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e"
             ]
           },
           "metadata": {
@@ -593,11 +606,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f478-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_eaee748b21"
+              "window[\"e8ddfa26-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_14ea437cbd"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd550\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e"
             ]
           },
           "metadata": {
@@ -611,11 +624,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"2c60f479-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_2befe06587"
+              "window[\"e8ddfa27-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_09294c2226"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4112527f10\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fcd0\u003e"
             ]
           },
           "metadata": {
@@ -629,11 +642,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1a-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_8ec4aeeb25"
+              "window[\"ec965514-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_e5e8266997"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd690\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e"
             ]
           },
           "metadata": {
@@ -647,11 +660,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_9f9f4574f1"
+              "window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_07a097f0ee"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd350\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc90\u003e"
             ]
           },
           "metadata": {
@@ -665,11 +678,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_bcccd8f300"
+              "window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_790d669ca8"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd6d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f8d0\u003e"
             ]
           },
           "metadata": {
@@ -683,11 +696,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_2c056cee72"
+              "window[\"ec965517-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_d30df771f0"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e"
             ]
           },
           "metadata": {
@@ -701,11 +714,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"354d7b1e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_c853c3f58b"
+              "window[\"ec965518-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_8a43a2da4b"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd610\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e"
             ]
           },
           "metadata": {
@@ -718,369 +731,9 @@
         },
         {
           "data": {
-            "application/javascript": [
-              "window[\"354d7b1f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_e5730ab00d"
-            ],
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMBJREFUeJzt3F+I1XX+x/G32zjiFERUpgaFd2JBzOg5joX4h0SiMgmM\n/uhVGIlgFBlERGB3hUEkhkRdtDfRP1ACL6KpLBqcguxCjEAkmGamQcSohFHzsxe7O6zssvsydtff\n+ns8rs758j3f8z7fiyef7/k3o7XWCiDwh4s9APC/QzCAmGAAMcEAYoIBxAQDiAkGF8XTTz9d3W63\n7rvvvhoZGakVK1Zc7JEICMYlbvXq1TU8PHyxxzjPV199VcPDw/XZZ5/V22+/XVVVM2bMuMhTkRAM\n/qt+++23+uGHH+r666+vWbNmXexxuECCcQl76qmnanx8vLZs2VIDAwP1+uuv1zfffFP3339/dTqd\nWr9+fY2MjEzvv2nTpnr55ZfrgQceqIGBgXr44Yfr5MmTVVV1+vTp2r59ey1durQ6nU5t2LChTpw4\nUVVVk5OTtWXLllq6dGmtXbu23nnnnelj7tq1q7Zt21bbt2+vJUuW1HvvvVfPPvtsHTp0qAYGBmrX\nrl1/N/fRo0dr06ZN1el06u67766hoaGqqhodHa1OpzO93zPPPFO33nrr9P3t27fXm2+++e89iZyv\ncUlbtWpVGx4ebq21NjEx0brdbjtw4EBrrbUvvviidbvdduLEidZaaxs3bmxr1qxp33//fZuammob\nN25sO3fubK219tZbb7VHH320TU1NtXPnzrXDhw+3X375pbXW2kMPPdR27NjRTp8+3Y4cOdIGBwen\nn/OVV15pN910U/voo49aa61NTU21999/vz344IPTMx48eLCtWLGitdbamTNn2po1a9qePXvamTNn\n2vDwcOvv72/Hjh2bfj2HDx9urbW2du3advvtt7ejR4+21lpbuXJlO3LkyH/qVNJas8L4f6D95edC\n+/btq5UrV9by5curqmrZsmV1880316effjq977333ls33HBD9fb21h133FFHjhypqqqenp46efJk\nHTt2rGbMmFGLFi2qyy+/vCYmJurrr7+uJ598smbOnFkLFy6sDRs21N69e6eP2d/fX6tXr66qqt7e\n3n8666FDh+rUqVP1yCOPVE9PTw0ODtaqVavqgw8+qKqqJUuW1MjISB0/fryqqtauXVtffvlljY6O\n1q+//loLFy78N501/pGeiz0A/z1jY2O1f//++vjjj6vqzyE5e/ZsLVu2bHqfa665Zvr27Nmz69Sp\nU1VVdc8999TExEQ98cQT9fPPP9e6devq8ccfr8nJybryyitr9uzZ04+bP39+HT58ePr+3Llz4xkn\nJydr3rx5522bP39+TU5OVlVVp9OpoaGhuu6666rb7Va32629e/dWb29vLV68+ALOBr+HYFzi/vbT\nh3nz5tX69etrx44dF3ycnp6e2rp1a23durXGxsZq8+bNtWDBgrrtttvqp59+qlOnTlVfX19VVY2P\nj9ecOXP+4Qz/ypw5c2p8fPy8bWNjY7VgwYKqqup2u/Xiiy/WvHnzqtPp1MDAQD333HPV29tb3W73\ngl8XF8YlySXu2muvrdHR0aqqWrduXQ0NDdXnn39e586dq6mpqRoZGakff/zxXx7n4MGD9d1339W5\nc+eqr6+venp66rLLLqu5c+dWf39/vfTSS3X69On69ttv6913361169b9rnlvueWW6uvrq9dee63O\nnj1bBw8erE8++aTuvPPOqqq68cYba9asWbVv377qdDp1xRVX1NVXX10ffvjheW+I8p8hGJe4zZs3\n1+7du6vb7db+/ftr9+7dtWfPnlq2bFmtWrWq3njjjen3OP7ZSuD48eO1bdu2Wrx4cd111121dOnS\n6Sjs3LmzRkdHa/ny5bVt27Z67LHHzrvMuRAzZ86sV199tQ4cOFCDg4P1/PPP1wsvvDC9wqj68yrj\nqquumr7U+WsoFi1a9Luek9yM1vyBDpCxwgBiggHEBAOICQYQ+z/7PYzjf/QRGVxM12z68u+2WWEA\nMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE\nBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhAT\nDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMIDajtdYu9hDA/wYrDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4j9CY2LTAbbRbWuAAAAAElFTkSuQmCC\n",
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2050\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_a897ef7e24"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2250\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_565fa3d154"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4113124d90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b22-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_222e0dc6af"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4113124c10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"354d7b23-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_831db7458f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4113124310\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab4-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_adb576c6eb"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_9418f2d32f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_3fad25f306"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4112527ed0\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab7-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_45b9340e7b"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990c90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab8-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_bec9896d44"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f990a10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fab9-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_460b91ad4a"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3a10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_7dedd0b037"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3890\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_4b1c977dc7"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3bd0\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fabc-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_d64fedfcf9"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3410\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3803fabd-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_3e8c929c3f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3c50\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_9f9cf2b76f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_b402e6b587"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3d90\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_9b7d66db72"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3b10\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b986f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_11ec213a3f"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3950\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/javascript": [
-              "window[\"3b9b9870-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_9c055e4bc0"
-            ],
-            "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41b21d3850\u003e"
-            ]
-          },
-          "metadata": {
-            "tags": [
-              "id1_content_0",
-              "outputarea_id1"
-            ]
-          },
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMRJREFUeJzt3F+IlfW+x/Gvp3FECyIqU4PCO7EgZnQtnUJ0JJGoTDoY\n/dGrMBJhosggIgK7KwwiMdxRF11F/0AJvIisLBqcguxCjEAkmNQGcRvVwIzm71zsc4Yje7P3x9h7\nz97u1+tqrYdnPeu7nos3v2f9m9FaawUQ+K/pHgD49yEYQEwwgJhgADHBAGKCAcQEg2nx9NNPV7fb\nrfvuu69GRkZq5cqV0z0SAcG4xK1evbqGh4ene4wLfPXVVzU8PFyfffZZvf3221VVNWPGjGmeioRg\n8E/122+/1Q8//FDXX399zZo1a7rH4SIJxiXsqaeeqhMnTtSWLVuqv7+/Xn/99frmm2/q/vvvr06n\nU+vXr6+RkZGp/Tdt2lQvv/xyPfDAA9Xf318PP/xwnTlzpqqqJicna9u2bbVs2bLqdDq1YcOGOn36\ndFVVjY2N1ZYtW2rZsmW1du3aeuedd6aOuXPnzhoaGqpt27bV0qVL67333qtnn322Dh06VP39/bVz\n584/m/vo0aO1adOm6nQ6dffdd9f+/furqmp0dLQ6nc7Ufs8880zdeuutU/e3bdtWb7755t/3JHKh\nxiVtcHCwDQ8Pt9ZaO3nyZOt2u+3AgQOttda++OKL1u122+nTp1trrW3cuLGtWbOmff/9921iYqJt\n3Lix7dixo7XW2ltvvdUeffTRNjEx0c6fP98OHz7cfvnll9Zaaw899FDbvn17m5ycbEeOHGnLly+f\nes5XXnml3XTTTe2jjz5qrbU2MTHR3n///fbggw9OzXjw4MG2cuXK1lprZ8+ebWvWrGm7d+9uZ8+e\nbcPDw62vr68dO3Zs6vUcPny4tdba2rVr2+23396OHj3aWmtt1apV7ciRI/+oU0lrzQrjP0D7358L\n7d27t1atWlUrVqyoqqqBgYG6+eab69NPP53a9957760bbrihent764477qgjR45UVVVPT0+dOXOm\njh07VjNmzKjFixfX5ZdfXidPnqyvv/66nnzyyZo5c2YtWrSoNmzYUHv27Jk6Zl9fX61evbqqqnp7\ne//qrIcOHarx8fF65JFHqqenp5YvX16Dg4P1wQcfVFXV0qVLa2RkpE6dOlVVVWvXrq0vv/yyRkdH\n69dff61Fixb9nc4af0nPdA/AP8/x48dr37599fHHH1fVn0Jy7ty5GhgYmNrnmmuumbo9e/bsGh8f\nr6qqe+65p06ePFlPPPFE/fzzz7Vu3bp6/PHHa2xsrK688sqaPXv21OMWLFhQhw8fnro/b968eMax\nsbGaP3/+BdsWLFhQY2NjVVXV6XRq//79dd1111W3261ut1t79uyp3t7eWrJkyUWcDX4PwbjE/f9P\nH+bPn1/r16+v7du3X/Rxenp6auvWrbV169Y6fvx4bd68uRYuXFi33XZb/fTTTzU+Pl5z5sypqqoT\nJ07U3Llz/+IMf8vcuXPrxIkTF2w7fvx4LVy4sKqqut1uvfjiizV//vzqdDrV399fzz33XPX29la3\n273o18XFcUlyibv22mtrdHS0qqrWrVtX+/fvr88//7zOnz9fExMTNTIyUj/++OPfPM7Bgwfru+++\nq/Pnz9ecOXOqp6enLrvsspo3b1719fXVSy+9VJOTk/Xtt9/Wu+++W+vWrftd895yyy01Z86ceu21\n1+rcuXN18ODB+uSTT+rOO++sqqobb7yxZs2aVXv37q1Op1NXXHFFXX311fXhhx9e8IYo/xiCcYnb\nvHlz7dq1q7rdbu3bt6927dpVu3fvroGBgRocHKw33nhj6j2Ov7YSOHXqVA0NDdWSJUvqrrvuqmXL\nlk1FYceOHTU6OlorVqyooaGheuyxxy64zLkYM2fOrFdffbUOHDhQy5cvr+eff75eeOGFqRVG1Z9W\nGVddddXUpc7/hWLx4sW/6znJzWjNH+gAGSsMICYYQEwwgJhgALF/2e9h/PEP/z3dI8B/tKseee/P\ntllhADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwg\nJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICY\nYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKC\nAcQEA4gJBhATDCA2o7XWpnsI4N+DFQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE/gfh60wGjfc7LQAAAABJRU5ErkJg\ngg==\n",
-            "text/plain": [
-              "\u003cmatplotlib.figure.Figure at 0x7f4113124310\u003e"
+              "\u003cmatplotlib.figure.Figure at 0x7f3ecc00bf10\u003e"
             ]
           },
           "metadata": {
@@ -1095,11 +748,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9871-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_ba6a061307"
+              "window[\"ec965519-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_893ad561f4"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd890\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55c90\u003e"
             ]
           },
           "metadata": {
@@ -1113,11 +766,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
-              "//# sourceURL=js_83e3496927"
+              "window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_2d99e0ac17"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67fe50\u003e"
             ]
           },
           "metadata": {
@@ -1131,11 +784,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
-              "//# sourceURL=js_f437bab20d"
+              "window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_5c19462e32"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a22d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55dd0\u003e"
             ]
           },
           "metadata": {
@@ -1149,11 +802,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9874-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_93aa63450e"
+              "window[\"ec96551c-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_b9c8b7567b"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2b90\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55a50\u003e"
             ]
           },
           "metadata": {
@@ -1167,11 +820,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9875-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
-              "//# sourceURL=js_aca189bea5"
+              "window[\"ec96551d-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_fd05186348"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd4d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55810\u003e"
             ]
           },
           "metadata": {
@@ -1185,10 +838,10 @@
         {
           "data": {
             "text/html": [
-              "\u003cdiv class=id_100313201 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
+              "\u003cdiv class=id_888646481 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.HTML at 0x7f410f990a90\u003e"
+              "\u003cIPython.core.display.HTML at 0x7f3f32414810\u003e"
             ]
           },
           "metadata": {
@@ -1203,11 +856,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
-              "//# sourceURL=js_5df1fe383e"
+              "window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n",
+              "//# sourceURL=js_efef96e882"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e"
             ]
           },
           "metadata": {
@@ -1222,11 +875,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3b9b9877-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
-              "//# sourceURL=js_c62c7174ad"
+              "window[\"ec96551f-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_6eca889864"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2390\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e"
             ]
           },
           "metadata": {
@@ -1241,11 +894,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 input\");\n",
-              "//# sourceURL=js_2e2201ddc4"
+              "window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 input\");\n",
+              "//# sourceURL=js_f02070cc60"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2810\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b553d0\u003e"
             ]
           },
           "metadata": {
@@ -1260,11 +913,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76585-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
-              "//# sourceURL=js_288e5283d6"
+              "window[\"ed8ea973-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
+              "//# sourceURL=js_ed9faba660"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a26d0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a95450\u003e"
             ]
           },
           "metadata": {
@@ -1279,11 +932,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
-              "//# sourceURL=js_2f31d19cde"
+              "window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n",
+              "//# sourceURL=js_f3458d7074"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a95250\u003e"
             ]
           },
           "metadata": {
@@ -1298,11 +951,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76587-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
-              "//# sourceURL=js_2fbbcda050"
+              "window[\"ed8ea975-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_3ffd97bd6f"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f4112527e90\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31a953d0\u003e"
             ]
           },
           "metadata": {
@@ -1317,11 +970,11 @@
         {
           "data": {
             "application/javascript": [
-              "window[\"3ed76588-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
-              "//# sourceURL=js_f94d975cf3"
+              "window[\"ed8ea976-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_7f73e8bcca"
             ],
             "text/plain": [
-              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+              "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e"
             ]
           },
           "metadata": {
@@ -1337,7 +990,7 @@
         "def predict_input_fn(color_name):\n",
         "  \"\"\"An input function for prediction.\"\"\"\n",
         "  _, chars, sequence_length = parse(color_name)\n",
-        "  \n",
+        "\n",
         "  # We create a batch of a single element.\n",
         "  features = {\n",
         "      'chars': tf.expand_dims(chars, 0),\n",
@@ -1385,7 +1038,11 @@
     "colab": {
       "collapsed_sections": [],
       "default_view": {},
-      "name": "RNN Colorbot using Estimators",
+      "last_runtime": {
+        "build_target": "",
+        "kind": "local"
+      },
+      "name": "RNN Colorbot using Keras and Estimators",
       "provenance": [
         {
           "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl",
-- 
GitLab


From b4c37a452d2ed1d1c29ceb70127c4ef6434c44ca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 07:13:03 -0700
Subject: [PATCH 1109/1262] Teach the conditinal simplifier about sharding.

PiperOrigin-RevId: 193510638
---
 tensorflow/compiler/xla/service/conditional_simplifier.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
index f35de08085..e560abc87f 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -69,7 +69,7 @@ static StatusOr<bool> TryRemoveConditional(HloInstruction* conditional) {
         conditional->shape(), {conditional->mutable_operand(2)},
         conditional->false_computation()));
   }
-
+  conditional->SetupDerivedInstruction(call_op);
   TF_RETURN_IF_ERROR(computation->ReplaceInstruction(conditional, call_op));
   TF_RETURN_IF_ERROR(CallInliner::Inline(call_op).status());
 
-- 
GitLab


From 1a2eb108a3e513a4f4609b9d421277bc222e5eb0 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 19 Apr 2018 15:03:05 +0000
Subject: [PATCH 1110/1262] Update docs for tf.unstack with respect to numpy.

In 18692 an issue was raised over whether tf.unstack
is compatible with numpy.unstack (specified in current docs)
or numpy.split.

It looks like there is no numpy.unstack. And for numpy.split,
it is not compatible with tf.unstack.

The tf.split is very close to numpy.split. However, the second
arg `num_or_size_splits` in `tf.split` requires the number of
the splits, while the second arg `indices_or_sections` in
`numpy.split` requires the index of the splits. For that reason
the tf.split is not compatible with numpy.split as well.

According to the above this fix simply removes `The numpy equivalent` part
in the docs of tf.unstack.

This fix fixes 18692.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_ops.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ceeabe090d..23202ae28e 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1057,9 +1057,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
     `value[:, i, :, :]` and each tensor in `output` will have shape `(A, C, D)`.
   Etc.
 
-  This is the opposite of stack.  The numpy equivalent is
-
-      tf.unstack(x, n) = np.unstack(x)
+  This is the opposite of stack.
 
   Args:
     value: A rank `R > 0` `Tensor` to be unstacked.
-- 
GitLab


From 50f6683ca50e6d4e7008d6d1b437b407d6a62e92 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 19 Apr 2018 09:13:21 -0700
Subject: [PATCH 1111/1262] Add shape check for batch related Dataset ops
 (#18683)

* Add shape check for PrefetchDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add BatchDataset shape check

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add shape check for SlideDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add shape check for DenseToSparseBatchDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Sanitize with clang-format -i --style=Google

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 31 ++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 34f2c612ec..c63e485f6c 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -199,7 +199,12 @@ REGISTER_OP("PrefetchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ScanDataset")
     .Input("input_dataset: variant")
@@ -283,7 +288,12 @@ REGISTER_OP("BatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 // TODO(mrry): move SlideDataset to contrib in the future.
 REGISTER_OP("SlideDataset")
@@ -293,7 +303,13 @@ REGISTER_OP("SlideDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // window_size and stride should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("PaddedBatchDataset")
     .Input("input_dataset: variant")
@@ -323,7 +339,14 @@ REGISTER_OP("DenseToSparseBatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // row_shape should be a 1-D vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
-- 
GitLab


From b71b6b8ca9ade8b39d77f0373210fe58dfccf4f4 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 19 Apr 2018 09:13:35 -0700
Subject: [PATCH 1112/1262] Shape validation with random/shuffle related
 Dataset ops (#18682)

* Add shape check for CacheDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add shape check for ShuffleAndRepeatDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add check for ShuffleDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add shape check for RandomDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add RangeDataset shape check

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Sanitize with clang-format -i --style=Google

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 43 ++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index c63e485f6c..dae0c0eae4 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -357,7 +357,14 @@ REGISTER_OP("RangeDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // start, stop, and step should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RandomDataset")
     .Input("seed: int64")
@@ -367,7 +374,13 @@ REGISTER_OP("RandomDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
@@ -378,7 +391,14 @@ REGISTER_OP("ShuffleDataset")
     .Attr("reshuffle_each_iteration: bool = true")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleAndRepeatDataset")
     .Input("input_dataset: variant")
@@ -389,7 +409,15 @@ REGISTER_OP("ShuffleAndRepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, seed2, and count should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("CacheDataset")
     .Input("input_dataset: variant")
@@ -397,7 +425,12 @@ REGISTER_OP("CacheDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // filename should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TextLineDataset")
     .Input("filenames: string")
-- 
GitLab


From 76619c8dea0e480fd48e3b4dcfe0249eb24216b8 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 19 Apr 2018 09:13:53 -0700
Subject: [PATCH 1113/1262] Validation in shape functions of Dataset ops
 (#18680)

* Add shape check for PrependFromQueueAndPaddedBatchDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add comment for shape check

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add shape check for FixedLengthRecordDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add check for filenames as well

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Clang-format -i --style=google for file format

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add shape check for SqlDataset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/dataset_ops.cc | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index dae0c0eae4..869bef8040 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -459,7 +459,14 @@ REGISTER_OP("SqlDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
@@ -470,7 +477,18 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // header_bytes, record_bytes, footer_bytes, buffer_size should be
+      // scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TFRecordDataset")
     .Input("filenames: string")
@@ -609,7 +627,12 @@ REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
     // length of `output_types` is `N`, the `output_shapes` are
     // (as far as possible to tell statically) compatible with `padded_shapes`,
     // and that `padding_values` are all scalars.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("EnqueueInQueueDataset")
     .Input("queue: variant")
-- 
GitLab


From 7e735e5be811bacfa4e16aeae2e8aa53ef209ea6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 09:13:47 -0700
Subject: [PATCH 1114/1262] Pin pip to version 9.0.3

* This is because pip 10 is still unstable in some distros
* reference: https://github.com/pypa/pip/issues/5240

PiperOrigin-RevId: 193525542
---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index fc137aeeed..9644277fab 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -19,11 +19,11 @@ set -e
 # We don't apt-get install so that we can install a newer version of pip.
 # Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9
 if $(cat /etc/*-release | grep -q 14.04); then
-  easy_install -U pip
-  easy_install3 -U pip
+  easy_install -U pip==9.0.3
+  easy_install3 -U pip==9.0.3
 else
-  pip2 install --upgrade pip
-  pip3 install --upgrade pip
+  pip2 install --upgrade pip==9.0.3
+  pip3 install --upgrade pip==9.0.3
 fi
 
 # Install pip packages from whl files to avoid the time-consuming process of
-- 
GitLab


From 51a26bb2f3e66fc79a5870f6eed88f60de995d4a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 09:23:35 -0700
Subject: [PATCH 1115/1262] [TF:XLA] Change HloTestBase::ExecuteNoHloPasses to
 return a literal directly.

PiperOrigin-RevId: 193526900
---
 tensorflow/compiler/xla/tests/hlo_test_base.cc | 8 +++++---
 tensorflow/compiler/xla/tests/hlo_test_base.h  | 2 +-
 tensorflow/compiler/xla/tests/tuple_test.cc    | 3 +--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index c5afe0c3e0..9984aba089 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -113,11 +113,13 @@ StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
   return test_runner_.Execute(std::move(module), arguments);
 }
 
-StatusOr<std::unique_ptr<Literal>> HloTestBase::ExecuteNoHloPasses(
+std::unique_ptr<Literal> HloTestBase::ExecuteNoHloPasses(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<Literal*> arguments) {
-  return test_runner_.Execute(std::move(module), arguments,
-                              /*run_hlo_passes=*/false);
+  return test_runner_
+      .Execute(std::move(module), arguments,
+               /*run_hlo_passes=*/false)
+      .ValueOrDie();
 }
 
 std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 28d7ab09cb..79fcea9403 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -99,7 +99,7 @@ class HloTestBase : public ::testing::Test {
 
   // Same as above, except the module will be executed without running any HLO
   // passes on it.
-  StatusOr<std::unique_ptr<Literal>> ExecuteNoHloPasses(
+  std::unique_ptr<Literal> ExecuteNoHloPasses(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
 
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 098be6d7aa..61d0fa02ab 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -535,8 +535,7 @@ TEST_F(TupleHloTest,
       HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
           .ValueOrDie();
   auto param = Literal::MakeTupleOwned(Literal::CreateR1<float>({1, 2, 3}));
-  TF_ASSERT_OK_AND_ASSIGN(auto result,
-                          ExecuteNoHloPasses(std::move(module), {param.get()}));
+  auto result = ExecuteNoHloPasses(std::move(module), {param.get()});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *result,
       *Literal::MakeTupleOwned(Literal::CreateR2<float>({{1, 2, 3}}))));
-- 
GitLab


From 0b3950d67bcb07c11f87bd3c2da554017bff0674 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 00:35:54 +0800
Subject: [PATCH 1116/1262] Fix code block rendering in several api definitions

---
 tensorflow/core/api_def/base_api/api_def_Pad.pbtxt        | 1 +
 tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
index e45e2375eb..ee4aad7899 100644
--- a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
@@ -24,5 +24,6 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
                       [0, 0, 2, 2, 0, 0]
                       [0, 0, 0, 0, 0, 0]]
 ```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
index b9e75caf02..37ac10dddb 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
@@ -44,6 +44,7 @@ In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
 if T == qint8, out[i] -= (range(T) + 1) / 2.0
 ```
+
 here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 
 *MIN_COMBINED Mode Example*
@@ -87,6 +88,7 @@ choosing to elide the lowest possible value for symmetry (e.g., output range is
 
 We first find the range of values in our tensor. The
 range we use is always centered on 0, so we find m such that
+
 ```c++
   m = max(abs(input_min), abs(input_max))
 ```
@@ -95,6 +97,7 @@ Our input tensor range is then `[-m, m]`.
 
 Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
 If T is signed, this is
+
 ```
   num_bits = sizeof(T) * 8
   [min_fixed, max_fixed] =
@@ -102,16 +105,19 @@ If T is signed, this is
 ```
 
 Otherwise, if T is unsigned, the fixed-point range is
+
 ```
   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 ```
 
 From this we compute our scaling factor, s:
+
 ```c++
   s = (max_fixed - min_fixed) / (2 * m)
 ```
 
 Now we can quantize the elements of our tensor:
+
 ```c++
 result = round(input * s)
 ```
-- 
GitLab


From 1f1d7b88717847f590987ee40efbe970bb591275 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 09:34:24 -0700
Subject: [PATCH 1117/1262] Disable dlopen error of libneuralnetworks for
 non-Android platforms.

PiperOrigin-RevId: 193528346
---
 tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 85aca36874..ace4827d8c 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -34,10 +34,13 @@ limitations under the License.
 inline void* loadLibrary(const char* name) {
   // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn
   // api RT
-  void* handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+  void* handle = nullptr;
+#ifdef __ANDROID__
+  handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
   if (handle == nullptr) {
     NNAPI_LOG("nnapi error: unable to open library %s", name);
   }
+#endif
   return handle;
 }
 
-- 
GitLab


From c173157bdc132460c6f424a9803221e74fc73f59 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 19 Apr 2018 09:37:20 -0700
Subject: [PATCH 1118/1262] [tf.data] Add checkpointing support for
 MapAndBatchDataset.

PiperOrigin-RevId: 193528712
---
 .../kernel_tests/batch_dataset_op_test.py     |  31 ++
 .../kernels/data/map_and_batch_dataset_op.cc  | 277 +++++++++++++++++-
 2 files changed, 302 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index e1ec60d7c9..a4a0ce79b6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -681,6 +681,37 @@ class UnbatchDatasetSerializationTest(
         num_outputs)
 
 
+class MapAndBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testSerializationCore(self):
+    range_size = 11
+    num_repeats = 2
+    batch_size = 5
+    total_outputs = range_size * num_repeats
+    num_outputs_drop_remainder = total_outputs // batch_size
+    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
+    num_parallel_batches = 2
+
+    def build_ds(range_start, drop_remainder=False):
+
+      def _map_fn(x):
+        return math_ops.square(x)
+
+      return dataset_ops.Dataset.range(
+          range_start, range_start + range_size).repeat(num_repeats).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_batches=num_parallel_batches,
+                  drop_remainder=drop_remainder))
+
+    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
+                        num_outputs_keep_remainder)
+    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
+                        num_outputs_drop_remainder)
+
+
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index aaf4dc7341..b8105552a0 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -74,26 +74,29 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(
                             func_, std::move(other_arguments), &captured_func));
 
-    *output = new Dataset(input, batch_size, num_parallel_batches,
-                          drop_remainder, output_types_, output_shapes_,
+    *output = new Dataset(ctx, input, batch_size, num_parallel_batches,
+                          drop_remainder, output_types_, output_shapes_, func_,
                           std::move(captured_func), &ctx->eigen_cpu_device());
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input, int64 batch_size,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
             int64 num_parallel_batches, bool drop_remainder,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
           batch_size_(batch_size),
           num_parallel_batches_(num_parallel_batches),
           drop_remainder_(drop_remainder),
           output_types_(output_types),
           output_shapes_(output_shapes),
+          map_fn_(func),
           captured_func_(std::move(captured_func)),
           device_(device) {
       input_->Ref();
@@ -117,6 +120,48 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "MapAndBatchDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* batch_size_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
+      Node* num_parallel_batches_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_batches_, &num_parallel_batches_node));
+      Node* drop_remainder_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(map_fn_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(2, batch_size_node),
+           std::make_pair(3, num_parallel_batches_node),
+           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -217,9 +262,83 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return status;
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (current_batch_index_ == -1) {
+          // Iterator has not been used. Nothing to save.
+          return Status::OK();
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_batch_index"),
+                                               current_batch_index_));
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("invocation_results_size"), invocation_results_.size()));
+        for (size_t i = 0; i < invocation_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteInvocationResultLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
+                                               batch_results_.size()));
+        for (size_t i = 0; i < batch_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteBatchResultLocked(writer, i));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("current_batch_index"))) {
+          // Iterator was never used so nothing to restore.
+          return Status::OK();
+        }
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_batch_index"), &temp));
+          current_batch_index_ = static_cast<int32>(temp);
+          if (current_batch_index_ != temp) {
+            return errors::Internal("Invalid value for current_batch_index ",
+                                    temp);
+          }
+        }
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        size_t invocation_results_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("invocation_results_size"), &temp));
+          invocation_results_size = static_cast<size_t>(temp);
+          if (invocation_results_size != temp) {
+            return errors::Internal(
+                "Invalid value for invocation_results_size ", temp);
+          }
+        }
+        CHECK_EQ(invocation_results_.size(), invocation_results_size);
+        for (size_t i = 0; i < invocation_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadInvocationResultLocked(reader, i));
+        }
+        size_t batch_results_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("batch_results_size"), &temp));
+          batch_results_size = static_cast<size_t>(temp);
+          if (batch_results_size != temp) {
+            return errors::Internal("Invalid value for batch_results_size ",
+                                    temp);
+          }
+        }
+        CHECK_EQ(batch_results_.size(), batch_results_size);
+        for (size_t i = 0; i < batch_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadBatchResultLocked(reader, i));
+        }
+        return Status::OK();
+      }
+
      private:
       struct BatchResult {
-        mutex mu;
+        mutex mu ACQUIRED_AFTER(mu_);
         bool output_allocated GUARDED_BY(mu);
         std::vector<Tensor> output;
         std::unique_ptr<BlockingCounter> counter;
@@ -393,6 +512,151 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return status;
       }
 
+      Status WriteInvocationResultLocked(IteratorStateWriter* writer,
+                                         size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        const InvocationResult& result = invocation_results_[index];
+        string prefix = strings::StrCat("invocation_results_", index);
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, full_name(strings::StrCat(prefix, "_status")),
+            result.status));
+        if (result.end_of_input) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_return_values_size")),
+            result.return_values.size()));
+        for (size_t i = 0; i < result.return_values.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_return_values_", i)),
+              result.return_values[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadInvocationResultLocked(IteratorStateReader* reader,
+                                        size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        InvocationResult* result = &invocation_results_[index];
+        string prefix = strings::StrCat("invocation_results_", index);
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, full_name(strings::StrCat(prefix, "_status")),
+            &result->status));
+        result->end_of_input = reader->Contains(
+            full_name(strings::StrCat(prefix, "_end_of_input")));
+        size_t return_values_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_return_values_size")),
+              &temp));
+          return_values_size = static_cast<size_t>(temp);
+          if (temp != return_values_size) {
+            return errors::Internal("Invalid value for return_values_size ",
+                                    return_values_size);
+          }
+        }
+        result->return_values.reserve(return_values_size);
+        for (size_t i = 0; i < return_values_size; i++) {
+          result->return_values.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_return_values_", i)),
+              &result->return_values.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteBatchResultLocked(IteratorStateWriter* writer, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Wait for the map_fn dispatches made in `InvokeFunctionLocked` to
+        // finish. This may delay saving a checkpoint by a bit but keeps the
+        // code clean and also saves us from checkpointing the state of the
+        // `BlockingCounter`.
+        batch_results_[index].counter->Wait();
+        const BatchResult& result = batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        {
+          mutex_lock l(batch_results_[index].mu);
+          if (result.output_allocated) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat(prefix, "_output_allocated")), ""));
+          }
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_output_size")),
+            result.output.size()));
+        for (size_t i = 0; i < result.output.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_output_", i)),
+              result.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadBatchResultLocked(IteratorStateReader* reader, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        BatchResult* result = &batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        {
+          mutex_lock l(batch_results_[index].mu);
+          result->output_allocated = reader->Contains(
+              full_name(strings::StrCat(prefix, "_output_allocated")));
+          // Simulate that the batch was fully generated.
+          batch_results_[index].counter.reset(new BlockingCounter(0));
+        }
+        size_t output_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_output_size")), &temp));
+          output_size = static_cast<size_t>(temp);
+          if (temp != output_size) {
+            return errors::Internal("Invalid value for output_size ",
+                                    output_size);
+          }
+        }
+        result->output.reserve(output_size);
+        for (size_t i = 0; i < output_size; i++) {
+          result->output.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_output_", i)),
+              &result->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
       mutex mu_;
       int32 current_batch_index_ GUARDED_BY(mu_) = -1;
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
@@ -407,6 +671,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const bool drop_remainder_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const NameAttrList map_fn_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
   };
-- 
GitLab


From 436f1434060d7f370baae9661baacc6cf27415ec Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 19 Apr 2018 09:54:40 -0700
Subject: [PATCH 1119/1262] Create a skeleton tf.contrib.checkpoint.

My plan for this is to incubate tools for working with object-based checkpoints:
  - Tools for managing dependency graphs, e.g. checkpointable lists/dictionaries
  - Inspecting/visualizing checkpoints
  - Listing variables and gathering initializers from a Checkpointable object
    and its dependencies
  - Verifying all variables are accessible as dependencies, which should make
    converting existing graph building Saver uses easier/safer.

This CL includes none of those things, it just moves the split_dependency tool
here instead of contrib/eager.

PiperOrigin-RevId: 193531292
---
 tensorflow/contrib/__init__.py                |  1 +
 tensorflow/contrib/checkpoint/README.md       |  2 +
 tensorflow/contrib/checkpoint/__init__.py     | 29 +++++++++++
 tensorflow/contrib/checkpoint/python/BUILD    | 29 +++++++++++
 .../python/split_dependency.py}               |  8 ++--
 .../python/split_dependency_test.py}          |  4 +-
 tensorflow/contrib/cmake/python_modules.txt   |  2 +
 tensorflow/contrib/cudnn_rnn/BUILD            |  2 +-
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |  4 +-
 tensorflow/contrib/eager/python/BUILD         | 48 ++-----------------
 tensorflow/contrib/optimizer_v2/BUILD         |  1 -
 tensorflow/tools/pip_package/BUILD            |  1 -
 12 files changed, 75 insertions(+), 56 deletions(-)
 create mode 100644 tensorflow/contrib/checkpoint/README.md
 create mode 100644 tensorflow/contrib/checkpoint/__init__.py
 create mode 100644 tensorflow/contrib/checkpoint/python/BUILD
 rename tensorflow/contrib/{eager/python/checkpointable_utils.py => checkpoint/python/split_dependency.py} (95%)
 rename tensorflow/contrib/{eager/python/checkpointable_utils_test.py => checkpoint/python/split_dependency_test.py} (96%)

diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 36cc5144d0..0d163daa6e 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -24,6 +24,7 @@ import os
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
+from tensorflow.contrib import checkpoint
 from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
diff --git a/tensorflow/contrib/checkpoint/README.md b/tensorflow/contrib/checkpoint/README.md
new file mode 100644
index 0000000000..d35c5bae3b
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/README.md
@@ -0,0 +1,2 @@
+Tools for working with object-based checkpoints produced by
+`tf.train.Checkpoint`.
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
new file mode 100644
index 0000000000..70d7d2d8d7
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools for working with object-based checkpoints.
+
+
+For creating and managing dependencies:
+@@split_dependency
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
new file mode 100644
index 0000000000..d57b01aab2
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -0,0 +1,29 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "split_dependency",
+    srcs = ["split_dependency.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "split_dependency_test",
+    srcs = ["split_dependency_test.py"],
+    deps = [
+        ":split_dependency",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
similarity index 95%
rename from tensorflow/contrib/eager/python/checkpointable_utils.py
rename to tensorflow/contrib/checkpoint/python/split_dependency.py
index 30c4103c5a..3aec8c96e9 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -1,4 +1,4 @@
-"""Utilities for working with Checkpointable objects."""
+"""Utility for creating multiple dependencies with synchronized save/restore."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@ from __future__ import print_function
 import functools
 
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.training import checkpointable as core_checkpointable
+from tensorflow.python.training import checkpointable as checkpointable
 from tensorflow.python.training import saver as saver_lib
 
 
@@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
     return self._restore_callback(tensor)
 
 
-class _SplitDependency(core_checkpointable.CheckpointableBase):
+class _SplitDependency(checkpointable.CheckpointableBase):
   """Looks like a regular variable while synchronizing save/restores."""
 
   def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
@@ -83,7 +83,7 @@ class _SplitDependency(core_checkpointable.CheckpointableBase):
   def _gather_saveables_for_checkpoint(self):
     """Looks to Checkpointable like a regular variable."""
     return {
-        core_checkpointable.VARIABLE_VALUE_KEY:
+        checkpointable.VARIABLE_VALUE_KEY:
         functools.partial(_CallbackSaveable,
                           dtype=self._dtype,
                           save_callback=self._save,
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
similarity index 96%
rename from tensorflow/contrib/eager/python/checkpointable_utils_test.py
rename to tensorflow/contrib/checkpoint/python/split_dependency_test.py
index da04199aaa..cb964c80e9 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -18,7 +18,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.eager.python import checkpointable_utils as contrib_checkpointable_utils
+from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -47,7 +47,7 @@ class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
-    split_dependencies = contrib_checkpointable_utils.split_dependency(
+    split_dependencies = split_dependency.split_dependency(
         component_names=("first_half", "second_half"),
         component_dtypes=(self.combined.dtype,) * 2,
         fill_save_buffer_fn=_split_variable_closure(
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 91839194c7..fbcdf7e753 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -130,6 +130,8 @@ tensorflow/contrib/boosted_trees/ops
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/boosted_trees/python
 tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/checkpoint
+tensorflow/contrib/checkpoint/python
 tensorflow/contrib/cloud
 tensorflow/contrib/cloud/kernels
 tensorflow/contrib/cloud/ops
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index d68015ae15..aeefa3cee6 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -25,7 +25,7 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/eager/python:checkpointable_utils",
+        "//tensorflow/contrib/checkpoint/python:split_dependency",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index b615824460..a1ede4471e 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.eager.python import checkpointable_utils
+from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
@@ -318,7 +318,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         dependencies too (typically the cuDNN `Layer`).
       dtype: The dtype for the canonical parameter Tensors.
     """
-    split_dependencies = checkpointable_utils.split_dependency(
+    split_dependencies = split_dependency.split_dependency(
         component_names=self._param_names,
         component_dtypes=(dtype,) * len(self._param_names),
         fill_save_buffer_fn=self._checkpointable_save,
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index e2744a430d..99abbae03f 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -11,7 +11,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":checkpointable_utils",
         ":datasets",
         ":metrics",
         ":network",
@@ -19,15 +18,14 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients",
-        "//tensorflow/python:numerics",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python:template",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:core",
         "//tensorflow/python/eager:execution_callbacks",
         "//tensorflow/python/eager:function",
     ],
@@ -70,7 +68,6 @@ cuda_py_test(
     srcs = ["datasets_test.py"],
     additional_deps = [
         ":datasets",
-        ":checkpointable_utils",
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/contrib/data/python/ops:threadpool",
         "//tensorflow/contrib/data/python/ops:unique",
@@ -79,6 +76,7 @@ cuda_py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python/data",
         "//tensorflow/python/eager:test",
     ],
@@ -121,8 +119,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -225,43 +223,3 @@ py_test(
         "//tensorflow/python/eager:test",
     ],
 )
-
-py_library(
-    name = "checkpointable_utils",
-    srcs = ["checkpointable_utils.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:training",
-    ],
-)
-
-cuda_py_test(
-    name = "checkpointable_utils_test",
-    srcs = ["checkpointable_utils_test.py"],
-    additional_deps = [
-        ":checkpointable_utils",
-        ":network",
-        "@six_archive//:six",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-        "//tensorflow/python/keras",
-    ],
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",  # b/74395663
-    ],
-)
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 85cfce346c..5225ecc14f 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -115,7 +115,6 @@ cuda_py_test(
     additional_deps = [
         ":training",
         "@six_archive//:six",
-        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2ef105755f..0ac5a5bb6d 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -66,7 +66,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
     "//tensorflow/contrib/data/python/ops:contrib_op_loader",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
-    "//tensorflow/contrib/eager/python:checkpointable_utils",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
     "//tensorflow/contrib/graph_editor:graph_editor_pip",
-- 
GitLab


From 2273b62a769aa477f8d2ef02ca7dee253b8ea7b0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 19 Apr 2018 10:05:08 -0700
Subject: [PATCH 1120/1262] Added support for concatenation and slicing of
 symbolic shapes

PiperOrigin-RevId: 193532769
---
 ...direct_session_with_tracking_alloc_test.cc |   4 +-
 tensorflow/core/framework/shape_inference.cc  |   2 +
 tensorflow/core/framework/shape_inference.h   |  12 +
 .../core/grappler/costs/graph_properties.cc   | 236 ++++++++++++++++--
 4 files changed, 235 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 31fb128f93..b4dd521bbc 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -102,9 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
         if (node->name() == y->name()) {
-          EXPECT_EQ(3, cm->AllocationId(node, 0));
+          EXPECT_EQ(7, cm->AllocationId(node, 0));
         } else {
-          EXPECT_EQ(4, cm->AllocationId(node, 0));
+          EXPECT_EQ(8, cm->AllocationId(node, 0));
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 229b4a45fa..2b995e8b5e 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -157,8 +157,10 @@ InferenceContext::~InferenceContext() {}
 
 Status InferenceContext::Run(
     const std::function<Status(shape_inference::InferenceContext* c)>& fn) {
+  ForgetMerges();
   Status s = fn(this);
   if (!s.ok()) {
+    ForgetMerges();
     return AttachContext(s);
   }
 #ifndef NDEBUG
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index cdb4bd79bb..9431a62abe 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -285,6 +285,8 @@ class InferenceContext {
     return true;
   }
 
+  void SetInput(int idx, ShapeHandle shape) { inputs_[idx] = shape; }
+
   ShapeHandle input(int64 idx) const { return inputs_[idx]; }
   Status input(StringPiece input_name, std::vector<ShapeHandle>* output) const;
   int num_inputs() const { return inputs_.size(); }
@@ -317,6 +319,10 @@ class InferenceContext {
     input_tensors_as_shapes_ = input_tensors_as_shapes;
   }
 
+  const std::vector<ShapeHandle>& input_tensors_as_shapes() const {
+    return input_tensors_as_shapes_;
+  }
+
   ShapeHandle output(int64 idx) const { return outputs_[idx]; }
   void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; }
   Status set_output(StringPiece output_name,
@@ -587,6 +593,12 @@ class InferenceContext {
       int idx,
       const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
 
+  void set_input_handle_shapes_and_types(
+      int idx, const std::vector<ShapeAndType>& shapes_and_types) {
+    input_handle_shapes_and_types_[idx].reset(
+        new std::vector<ShapeAndType>(shapes_and_types));
+  }
+
   // Returns the output handle shapes and types, for the resource tensor output
   // at index <idx>. Returns NULL if the shape and types were never set.
   const std::vector<ShapeAndType>* output_handle_shapes_and_types(int idx) {
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index a9c777e551..c83ddfe90a 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -18,8 +18,9 @@ limitations under the License.
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
-#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -394,15 +395,121 @@ class TopoQueue {
 // unknown shape/dimension of a given node.
 class SymbolicShapeRefiner {
  public:
-  explicit SymbolicShapeRefiner(ShapeRefiner* shape_refiner)
-      : shape_refiner_(shape_refiner) {}
+  explicit SymbolicShapeRefiner(const GraphDef& graph)
+      : function_library_(OpRegistry::Global(), graph.library()) {
+    graph_def_version_ = graph.versions().producer();
+    node_to_context_.reserve(graph.node_size());
+  }
 
   InferenceContext* GetContext(const Node* node) {
-    return shape_refiner_->GetContext(node);
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return it->second.inference_context.get();
   }
   Status UpdateNode(const Node* node, bool relax, bool* refined) {
-    return shape_refiner_->UpdateNode(node, relax, refined);
+    NodeContext* node_context = GetNodeContext(node);
+    if (node_context == nullptr) {
+      TF_RETURN_IF_ERROR(AddNode(node));
+      node_context = CHECK_NOTNULL(GetNodeContext(node));
+      *refined = true;
+    }
+    // Check if the shapes of the nodes in the fan-in of this node have changed,
+    // and if they have, update the node input shapes.
+    InferenceContext* inference_context = node_context->inference_context.get();
+    std::vector<Tensor> const_values(node->num_inputs());
+    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes(node->num_inputs());
+
+    for (const Edge* e : node->in_edges()) {
+      if (e->IsControlEdge()) continue;
+
+      int dst_input = e->dst_input();
+      int src_output = e->src_output();
+
+      Node* input = e->src();
+      NodeContext* c = GetNodeContext(input);
+      if (c == nullptr) {
+        return errors::FailedPrecondition(
+            "Input ", dst_input, " ('", input->name(), "') for '", node->name(),
+            "' was not previously added to ShapeRefiner.");
+      }
+
+      if (input->IsConstant()) {
+        // Convert constant value into tensors.
+        if (const_values[dst_input].FromProto(
+                input->def().attr().at("value").tensor())) {
+          input_tensors[dst_input] = &const_values[dst_input];
+          // Integer tensors of rank one can also be interpreted as a shape
+          // provided all their values are >= -1.
+          if (const_values[dst_input].dims() == 1 &&
+              (const_values[dst_input].dtype() == DT_INT32 ||
+               const_values[dst_input].dtype() == DT_INT64)) {
+            ShapeHandle tensor_shape = inference_context->Vector(
+                const_values[dst_input].NumElements());
+            ShapeHandle shp;
+            if (inference_context
+                    ->MakeShapeFromTensor(input_tensors[dst_input],
+                                          tensor_shape, &shp)
+                    .ok()) {
+              input_tensors_as_shapes[dst_input] = shp;
+            }
+          }
+        }
+      }
+
+      if (c->output_tensors_as_shapes.size() > src_output) {
+        input_tensors_as_shapes[dst_input] =
+            c->output_tensors_as_shapes[src_output];
+      }
+
+      DCHECK_GE(dst_input, 0);
+      if (!*refined && !inference_context->input(dst_input).SameHandle(
+                           c->inference_context->output(src_output))) {
+        *refined = true;
+      }
+      inference_context->SetInput(dst_input,
+                                  c->inference_context->output(src_output));
+
+      if (!*refined &&
+          inference_context->requested_input_tensor_as_partial_shape(
+              dst_input)) {
+        // The input value may have changed. Since we have no way to know if
+        // that's indeed the case, err on the safe side.
+        *refined = true;
+      }
+
+      // Also propagate handle shape and dtype of edges which are carrying
+      // resource handles.
+      if (e->src()->output_type(src_output) == DT_RESOURCE) {
+        auto* outputs =
+            c->inference_context->output_handle_shapes_and_types(src_output);
+        if (!outputs) continue;
+        auto* inputs =
+            inference_context->input_handle_shapes_and_types(dst_input);
+
+        if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
+          *refined = true;
+        }
+        inference_context->set_input_handle_shapes_and_types(dst_input,
+                                                             *outputs);
+      }
+    }
+
+    if (!*refined) {
+      // No input shape has changed, we're done
+      return Status::OK();
+    }
+
+    node_context->inference_context->set_input_tensors(input_tensors);
+    node_context->inference_context->set_input_tensors_as_shapes(
+        input_tensors_as_shapes);
+
+    // Update the shapes of the outputs.
+    return InferShapes(node, node_context);
   }
+
   Status SetUnknownShape(const Node* node, int output_port) {
     shape_inference::ShapeHandle shape =
         GetUnknownOutputShape(node, output_port);
@@ -450,7 +557,7 @@ class SymbolicShapeRefiner {
     if (shape1.SameHandle(shape2)) {
       return shape1;
     }
-    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    InferenceContext* ctx = GetContext(node);
     ShapeHandle merged = shape1;
     if (!ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
       // Return either one since they're expected to represent the same value.
@@ -495,7 +602,7 @@ class SymbolicShapeRefiner {
     if (shape1.SameHandle(shape2)) {
       return shape1;
     }
-    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    InferenceContext* ctx = GetContext(node);
     ShapeHandle relaxed = shape1;
     const int rank = ctx->Rank(shape1);
     if (!ctx->RankKnown(shape2) || ctx->Rank(shape2) != rank) {
@@ -569,7 +676,7 @@ class SymbolicShapeRefiner {
     if (it != unknown_shapes_.end()) {
       return it->second;
     }
-    InferenceContext* c = shape_refiner_->GetContext(node);
+    InferenceContext* c = GetContext(node);
     ShapeHandle shp = c->UnknownShape();
     unknown_shapes_[id] = shp;
     return shp;
@@ -582,16 +689,114 @@ class SymbolicShapeRefiner {
     if (it != unknown_dims_.end()) {
       return it->second;
     }
-    InferenceContext* c = shape_refiner_->GetContext(node);
+    InferenceContext* c = GetContext(node);
     DimensionHandle dim = c->UnknownDim();
     unknown_dims_[id] = dim;
     return dim;
   }
 
-  ShapeRefiner* shape_refiner_;
+  Status AddNode(const Node* node) {
+    // Create the inference context for this node.
+    std::vector<ShapeHandle> input_shapes(node->num_inputs());
+    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
+        input_handle_shapes_and_types(node->num_inputs());
+    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes;
+
+    NodeContext& node_ctx = node_to_context_[node];
+    node_ctx.inference_context.reset(new InferenceContext(
+        graph_def_version_, &node->def(), node->op_def(), input_shapes,
+        input_tensors, input_tensors_as_shapes,
+        std::move(input_handle_shapes_and_types)));
+    const Status s = node_ctx.inference_context->construction_status();
+    if (!s.ok()) {
+      node_ctx.inference_context.reset(nullptr);
+    }
+    return s;
+  }
+
+  struct NodeContext {
+    std::unique_ptr<InferenceContext> inference_context;
+    std::vector<ShapeHandle> output_tensors_as_shapes;
+  };
+
+  Status InferShapes(const Node* node, NodeContext* c) {
+    InferenceContext* ic = c->inference_context.get();
+
+    // Propagate shape tensors
+    if (node->type_string() == "Shape") {
+      c->output_tensors_as_shapes.resize(1);
+      c->output_tensors_as_shapes[0] = c->inference_context->input(0);
+    } else if (node->type_string() == "ShapeN") {
+      c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
+      for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
+        c->output_tensors_as_shapes[i] = c->inference_context->input(i);
+      }
+    } else if (node->type_string() == "ConcatV2") {
+      bool valid = true;
+      ShapeHandle result;
+      for (int i = 0; i < ic->num_inputs() - 1; ++i) {
+        ShapeHandle input = ic->input_tensors_as_shapes()[i];
+        if (!ic->RankKnown(input)) {
+          valid = false;
+          break;
+        } else if (i == 0) {
+          result = input;
+        } else {
+          TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
+        }
+      }
+      if (valid) {
+        c->output_tensors_as_shapes.resize(1);
+        c->output_tensors_as_shapes[0] = result;
+      }
+    } else if (node->type_string() == "Slice") {
+      ShapeHandle input = ic->input_tensors_as_shapes()[0];
+      bool valid = ic->RankKnown(input);
+      const Tensor* slice_offset = ic->input_tensor(1);
+      valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
+      const Tensor* slice_size = ic->input_tensor(2);
+      valid &= slice_size != nullptr && slice_size->NumElements() == 1;
+      if (valid) {
+        int64 start = slice_offset->dtype() == DT_INT32
+                          ? slice_offset->flat<int32>()(0)
+                          : slice_offset->flat<int64>()(0);
+        int64 end = start + (slice_size->dtype() == DT_INT32
+                                 ? slice_size->flat<int32>()(0)
+                                 : slice_size->flat<int64>()(0));
+        ShapeHandle result;
+        TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
+        c->output_tensors_as_shapes.resize(1);
+        c->output_tensors_as_shapes[0] = result;
+      }
+    }
+
+    // Infer the shapes of output tensors.
+    const OpRegistrationData* op_reg_data;
+    Status s = function_library_.default_registry()->LookUp(node->type_string(),
+                                                            &op_reg_data);
+    if (!s.ok() || op_reg_data->shape_inference_fn == nullptr) {
+      // There is nothing more we can infer, annotate outputs with unknown
+      // shapes
+      return c->inference_context->Run(shape_inference::UnknownShape);
+    }
+
+    return c->inference_context->Run(op_reg_data->shape_inference_fn);
+  }
+
+  NodeContext* GetNodeContext(const Node* node) {
+    auto it = node_to_context_.find(node);
+    if (it == node_to_context_.end()) {
+      return nullptr;
+    }
+    return &it->second;
+  }
 
+  int graph_def_version_;
+  std::unordered_map<const Node*, NodeContext> node_to_context_;
   std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
   std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
+  FunctionLibraryDefinition function_library_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -977,9 +1182,6 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
                                              item_.graph.library());
   Graph graph(function_library);
   graph_ = &graph;
-  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
-  shape_refiner.set_require_shape_inference_fns(false);
-  shape_refiner.set_disable_constant_propagation(true);
   ImportGraphDefOptions options;
   // Graph optimization happens at the late stage of graph execution,
   // when colocation constraints are already validated previously and
@@ -987,7 +1189,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   // is no need to validate colocation constraints again.
   options.validate_colocation_constraints = false;
   options.validate_shape = false;
-  Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
+  Status s = ImportGraphDef(options, item_.graph, &graph, nullptr);
   TF_RETURN_IF_ERROR(s);
 
   std::unordered_map<string, std::unordered_set<int>> fed_ports;
@@ -1041,7 +1243,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  SymbolicShapeRefiner refiner(&shape_refiner);
+  SymbolicShapeRefiner refiner(item_.graph);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
   // exclusively merge shapes but we do not propagate shapes through the
@@ -1073,7 +1275,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   SymbolicShapeManager shape_manager;
   bool found_error = false;
   for (const Node* const node : graph.nodes()) {
-    auto node_ctx = shape_refiner.GetContext(node);
+    auto node_ctx = refiner.GetContext(node);
     if (!node_ctx) {
       continue;
     }
@@ -1105,7 +1307,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
 
   for (const Node* const node : graph.nodes()) {
     VLOG(3) << "Filling in graph properties for node: " << node->name();
-    auto ctx = shape_refiner.GetContext(node);
+    auto ctx = refiner.GetContext(node);
     if (!ctx) {
       continue;
     }
-- 
GitLab


From bdcca449fc22cf1d8a1d6a2c01c3b67706d6023b Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 19 Apr 2018 10:14:09 -0700
Subject: [PATCH 1121/1262] Prototype for tf.data writer API.

PiperOrigin-RevId: 193534333
---
 .../contrib/data/python/kernel_tests/BUILD    |  20 +++
 .../python/kernel_tests/writer_ops_test.py    | 117 ++++++++++++++++++
 tensorflow/contrib/data/python/ops/BUILD      |  13 ++
 tensorflow/contrib/data/python/ops/writers.py |  58 +++++++++
 .../base_api/api_def_DatasetToTFRecord.pbtxt  |  24 ++++
 tensorflow/core/framework/dataset.h           |   4 +-
 tensorflow/core/kernels/data/BUILD            |  14 +++
 tensorflow/core/kernels/data/writer_ops.cc    | 113 +++++++++++++++++
 tensorflow/core/ops/dataset_ops.cc            |   6 +
 9 files changed, 367 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
 create mode 100644 tensorflow/contrib/data/python/ops/writers.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
 create mode 100644 tensorflow/core/kernels/data/writer_ops.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c554607960..83daa04efc 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -516,3 +516,23 @@ tf_py_test(
         "//third_party/py/numpy",
     ],
 )
+
+tf_py_test(
+    name = "writer_ops_test",
+    size = "small",
+    srcs = ["writer_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/contrib/data/python/ops:writers",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
new file mode 100644
index 0000000000..c603ecc5ab
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.data.python.ops import writers
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class TFRecordWriterTest(test.TestCase):
+
+  def setUp(self):
+    super(TFRecordWriterTest, self).setUp()
+    self._num_records = 7
+    self.filename = array_ops.placeholder(dtypes.string, shape=[])
+    self.compression_type = array_ops.placeholder_with_default("", shape=[])
+
+    input_dataset = readers.TFRecordDataset([self.filename],
+                                            self.compression_type)
+    self.writer = writers.TFRecordWriter(
+        self._outputFilename(), self.compression_type).write(input_dataset)
+
+  def _record(self, i):
+    return compat.as_bytes("Record %d" % (i))
+
+  def _createFile(self, options=None):
+    filename = self._inputFilename()
+    writer = python_io.TFRecordWriter(filename, options)
+    for i in range(self._num_records):
+      writer.write(self._record(i))
+    writer.close()
+    return filename
+
+  def _inputFilename(self):
+    return os.path.join(self.get_temp_dir(), "tf_record.in.txt")
+
+  def _outputFilename(self):
+    return os.path.join(self.get_temp_dir(), "tf_record.out.txt")
+
+  def testWrite(self):
+    with self.test_session() as sess:
+      sess.run(
+          self.writer, feed_dict={
+              self.filename: self._createFile(),
+          })
+    for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
+      self.assertAllEqual(self._record(i), r)
+
+  def testWriteZLIB(self):
+    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
+    with self.test_session() as sess:
+      sess.run(
+          self.writer,
+          feed_dict={
+              self.filename: self._createFile(options),
+              self.compression_type: "ZLIB",
+          })
+    for i, r in enumerate(
+        tf_record.tf_record_iterator(self._outputFilename(), options=options)):
+      self.assertAllEqual(self._record(i), r)
+
+  def testWriteGZIP(self):
+    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
+    with self.test_session() as sess:
+      sess.run(
+          self.writer,
+          feed_dict={
+              self.filename: self._createFile(options),
+              self.compression_type: "GZIP",
+          })
+    for i, r in enumerate(
+        tf_record.tf_record_iterator(self._outputFilename(), options=options)):
+      self.assertAllEqual(self._record(i), r)
+
+  def testFailDataset(self):
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write("whoops")
+
+  def testFailDType(self):
+    input_dataset = dataset_ops.Dataset.from_tensors(10)
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write(input_dataset)
+
+  def testFailShape(self):
+    input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]])
+    with self.assertRaises(TypeError):
+      writers.TFRecordWriter(self._outputFilename(),
+                             self.compression_type).write(input_dataset)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index e00f2304cc..5b04c5316c 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -280,6 +280,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "writers",
+    srcs = [
+        "writers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "gen_dataset_ops",
     out = "gen_dataset_ops.py",
@@ -342,6 +354,7 @@ py_library(
         ":stats_ops",
         ":threadpool",
         ":unique",
+        ":writers",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py
new file mode 100644
index 0000000000..f53bd3f738
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/writers.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for tf.data writers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class TFRecordWriter(object):
+  """Writes data to a TFRecord file."""
+
+  def __init__(self, filename, compression_type=None):
+    self._filename = ops.convert_to_tensor(
+        filename, dtypes.string, name="filename")
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
+
+  def write(self, dataset):
+    """Returns a @{tf.Operation} to write a dataset to a file.
+
+    Args:
+      dataset: a @{tf.data.Dataset} whose elements are to be written to a file
+
+    Returns:
+      A @{tf.Operation} that, when run, writes contents of `dataset` to a file.
+    """
+    if not isinstance(dataset, dataset_ops.Dataset):
+      raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+    if (dataset.output_types != dtypes.string or
+        dataset.output_shapes != tensor_shape.scalar()):
+      raise TypeError(
+          "`dataset` must produce scalar `DT_STRING` tensors whereas it "
+          "produces shape {0} and types {1}".format(dataset.output_shapes,
+                                                    dataset.output_types))
+    return gen_dataset_ops.dataset_to_tf_record(
+        dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
new file mode 100644
index 0000000000..e1b8a9abdd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "DatasetToTFRecord"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to write.
+END
+  }
+  in_arg {
+    name: "filename"
+    description: <<END
+A scalar string tensor representing the filename to use.
+END
+  }
+  in_arg {
+    name: "compression_type"
+    description: <<END
+A scalar string tensor containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  summary: "Writes the given dataset to the given file using the TFRecord format."
+}
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 9e7ffe6c0b..8d127baac4 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -364,7 +364,7 @@ class IteratorBase {
  protected:
   // This is needed so that sub-classes of IteratorBase can call
   // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
+  // `RepeatDatasetOp::Dataset`.
   Status SaveParent(IteratorStateWriter* writer,
                     const std::unique_ptr<IteratorBase>& parent) {
     return parent->SaveInternal(writer);
@@ -372,7 +372,7 @@ class IteratorBase {
 
   // This is needed so that sub-classes of IteratorBase can call
   // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
+  // `RepeatDatasetOp::Dataset`.
   Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
                        const std::unique_ptr<IteratorBase>& parent) {
     return parent->RestoreInternal(ctx, reader);
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 1e96eb6421..667a6967a8 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -576,6 +576,20 @@ tf_kernel_library(
         ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
         ":unbatch_dataset_op",
+        ":writer_ops",
         ":zip_dataset_op",
     ],
 )
+
+tf_kernel_library(
+    name = "writer_ops",
+    srcs = ["writer_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
new file mode 100644
index 0000000000..46821fd7b3
--- /dev/null
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -0,0 +1,113 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/record_writer.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+namespace {
+
+class ToTFRecordOp : public AsyncOpKernel {
+ public:
+  explicit ToTFRecordOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    thread_pool_->Schedule([this, ctx, done]() {
+      string filename;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
+      string compression_type;
+      OP_REQUIRES_OK_ASYNC(ctx,
+                           ParseScalarArgument<string>(ctx, "compression_type",
+                                                       &compression_type),
+                           done);
+      std::unique_ptr<WritableFile> file;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
+                           done);
+      std::unique_ptr<io::RecordWriter> writer;
+      writer.reset(new io::RecordWriter(
+          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
+                          compression_type)));
+
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      auto iterator = dataset->MakeIterator("ToTFRecordOpIterator");
+
+      IteratorContext::Params params;  // TODO(b/78245447)
+      params.env = ctx->env();
+      params.runner = *(ctx->runner());
+      params.lib = ctx->function_library();
+      DeviceBase* device = ctx->function_library()->device();
+      params.allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
+
+      IteratorContext iter_ctx(std::move(params));
+
+      std::vector<Tensor> components;
+      components.reserve(dataset->output_dtypes().size());
+      bool end_of_sequence;
+
+      do {
+        OP_REQUIRES_OK_ASYNC(
+            ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+            done);
+
+        if (!end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(
+              ctx, writer->WriteRecord(components[0].scalar<string>()()), done);
+        }
+        components.clear();
+      } while (!end_of_sequence);
+      done();
+    });
+  }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
+                        ToTFRecordOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 8be569b315..67c6c58fe2 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -551,4 +551,10 @@ REGISTER_OP("EnqueueInQueueDataset")
     // reading from queue handle (is that even possible?).
     .SetShapeFn(shape_inference::NoOutputs);
 
+REGISTER_OP("DatasetToTFRecord")
+    .Input("input_dataset: variant")
+    .Input("filename: string")
+    .Input("compression_type: string")
+    .SetShapeFn(shape_inference::NoOutputs);
+
 }  // namespace tensorflow
-- 
GitLab


From 5fbd21e3bbd4f89dd2c6eed8a63b66ee2eff40a0 Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Thu, 19 Apr 2018 10:20:43 -0700
Subject: [PATCH 1122/1262] distribution_util moved into its own BUILD target,
 so linear_operator can depend on it.

PiperOrigin-RevId: 193535400
---
 tensorflow/python/ops/distributions/BUILD | 26 ++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index 9d9ede7ad7..e7ad028376 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -8,9 +8,13 @@ licenses(["notice"])  # Apache 2.0
 
 py_library(
     name = "distributions",
-    srcs = glob(["*.py"]),
+    srcs = glob(
+        ["*.py"],
+        exclude = ["util.py"],
+    ),
     srcs_version = "PY2AND3",
     deps = [
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -26,3 +30,23 @@ py_library(
         "@six_archive//:six",
     ],
 )
+
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:tensor_util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
-- 
GitLab


From 72240a9b5e67e315f6c037bb4579df9709335e35 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 01:23:54 +0800
Subject: [PATCH 1123/1262] fix single paragraph format and also arrow like
 format

---
 tensorflow/contrib/optimizer_v2/adam.py          | 16 ++++++++--------
 .../api_def/base_api/api_def_ApplyAdam.pbtxt     |  8 ++++----
 .../base_api/api_def_ResourceApplyAdam.pbtxt     |  8 ++++----
 tensorflow/python/training/adam.py               | 16 ++++++++--------
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index a38c98f471..76a867039a 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,19 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     Initialization:
 
-    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
-    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
-    \\(t <- 0\\) (Initialize timestep)
+    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
+    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
+    $$t \Leftarrow 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t <- t + 1$$
-    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t \Leftarrow t + 1$$
+    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index fc2cb09471..fca8ba2530 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 5c60fa3aa1..8b16d824bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,8 +76,8 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 }
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index dc0f1aba09..9f523a3aca 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,19 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
-    \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
-    \\(t <- 0\\) (Initialize timestep)
+    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
+    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
+    $$t \Leftarrow 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t <- t + 1$$
-    $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t \Leftarrow t + 1$$
+    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From 08a9107a2754d9e56cbc3a0f90ee0763f13e99e0 Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Thu, 19 Apr 2018 10:26:26 -0700
Subject: [PATCH 1124/1262] Fix doc gen error

Mismatch after the fix in #17815
---
 tensorflow/contrib/tensor_forest/ops/stats_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
index be0a11546d..5be581aaec 100644
--- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
@@ -75,7 +75,7 @@ REGISTER_OP("GrowTreeV4")
     .Attr("params: string")
     .Input("tree_handle: resource")
     .Input("stats_handle: resource")
-    .Input("finshed_nodes: int32")
+    .Input("finished_nodes: int32")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(R"doc(
 Grows the tree for finished nodes and allocates waiting nodes.
-- 
GitLab


From ba3bc495bbf1140e9375e1ec03c3ff788b8ebc6e Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Thu, 19 Apr 2018 10:26:54 -0700
Subject: [PATCH 1125/1262] Add metric names to model.metrics_names in compile
 for keras models run in eager execution. This prevents us from dropping
 metrics when we run model.evaluate.

PiperOrigin-RevId: 193536341
---
 .../keras/_impl/keras/engine/training.py      | 29 ++-------
 .../_impl/keras/engine/training_eager.py      | 39 ++++--------
 .../_impl/keras/engine/training_eager_test.py | 12 ++--
 .../keras/_impl/keras/engine/training_test.py | 26 ++++++++
 .../_impl/keras/engine/training_utils.py      | 62 +++++++++++++++++++
 5 files changed, 109 insertions(+), 59 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 7c46743814..012d9ceea4 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -276,6 +276,8 @@ class Model(Network):
           self.metrics_names.append(self.output_names[i] + '_loss')
       self.nested_metrics = training_utils.collect_metrics(metrics,
                                                            self.output_names)
+      with K.name_scope('metrics'):
+        training_utils.populate_metric_names(self)
       self._feed_sample_weight_modes = []
       for i in range(len(self.outputs)):
         self._feed_sample_weight_modes.append(None)
@@ -462,7 +464,6 @@ class Model(Network):
         output_weighted_metrics = nested_weighted_metrics[i]
 
         def handle_metrics(metrics, weights=None):
-          metric_name_prefix = 'weighted_' if weights is not None else ''
 
           for metric in metrics:
             if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
@@ -489,39 +490,19 @@ class Model(Network):
                   metric_fn = metrics_module.categorical_accuracy
                 elif metric in ('crossentropy', 'ce'):
                   metric_fn = metrics_module.categorical_crossentropy
-              if metric in ('accuracy', 'acc'):
-                suffix = 'acc'
-              elif metric in ('crossentropy', 'ce'):
-                suffix = 'ce'
               weighted_metric_fn = training_utils.weighted_masked_objective(
                   metric_fn)
-              metric_name = metric_name_prefix + suffix
             else:
               metric_fn = metrics_module.get(metric)
               weighted_metric_fn = training_utils.weighted_masked_objective(
                   metric_fn)
-              # Get metric name as string
-              if hasattr(metric_fn, 'name'):
-                metric_name = metric_fn.name
-              else:
-                metric_name = metric_fn.__name__
-              metric_name = metric_name_prefix + metric_name
-
+            metric_name = training_utils.get_base_metric_name(
+                metric, weighted=weights is not None)
             with K.name_scope(metric_name):
               metric_result = weighted_metric_fn(
                   y_true, y_pred, weights=weights, mask=masks[i])
 
-            # Append to self.metrics_names, self.metric_tensors,
-            # self.stateful_metric_names
-            if len(self.output_names) > 1:
-              metric_name = '%s_%s' % (self.output_names[i], metric_name)
-            # Dedupe name
-            j = 1
-            base_metric_name = metric_name
-            while metric_name in self.metrics_names:
-              metric_name = '%s_%d' % (base_metric_name, j)
-              j += 1
-            self.metrics_names.append(metric_name)
+            training_utils.add_metric_name(self, metric_name, i)
             self.metrics_tensors.append(metric_result)
 
             # Keep track of state updates created by
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 695669d9ee..ad239d6151 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -100,7 +100,7 @@ def _eager_metrics_fn(model, outputs, targets):
         metric_names.append(metric_name)
         metric_results.append(backend.mean(metric_result))
 
-  return metric_names, metric_results
+  return metric_results
 
 
 def _model_loss(model, inputs, targets, sample_weights=None, training=False):
@@ -151,7 +151,12 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       with backend.name_scope(model.output_names[i] + '_loss'):
         output_loss = weighted_masked_fn(
             targets[i], outs[i], weights, mask=mask)
-      loss_metrics.append(backend.mean(output_loss))
+      # If the number of outputs is 1 then we don't append the loss metric
+      # associated with each model output. When there are multiple outputs
+      # associated with a model, each output's loss is calculated and returned
+      # as part of the loss_metrics.
+      if len(model.outputs) > 1:
+        loss_metrics.append(backend.mean(output_loss))
 
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
@@ -274,7 +279,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
-  _, metrics_results = _eager_metrics_fn(
+  metrics_results = _eager_metrics_fn(
       model, outs, targets)
   if not isinstance(loss, list):
     loss = [loss]
@@ -304,7 +309,7 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  _, metrics_results = _eager_metrics_fn(
+  metrics_results = _eager_metrics_fn(
       model, outs, targets)
   if not isinstance(loss, list):
     loss = [loss]
@@ -498,34 +503,12 @@ def fit_loop(
         for l, o in zip(out_labels, outs):
           batch_logs[l] = o
         # Required for Eager mode
-        metrics_names, metrics_results = _eager_metrics_fn(
-            model, outs, targets_batch)
+        metrics_results = _eager_metrics_fn(model, outs, targets_batch)
         batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
 
-        # TODO(anjalisridhar): Move this to compile to avoid duplicate code.
-        # In graph mode we set the metric names in compile. However in
-        # Eager mode we calculate the metrics for each batch in fit_loop.
-        # We could calculate the metric names and functions in compile.
-        # This would avoid setting the callback parameters separately.
-        # We need to do this for the first iteration alone
-        for m in metrics_names:
-          if m not in callback_metrics:
-            callback_metrics.append(m)
-
-        callbacks.set_params({
-            'batch_size': batch_size,
-            'epochs': epochs,
-            'steps': steps_per_epoch,
-            'samples': num_train_samples,
-            'verbose': verbose,
-            'do_validation': do_validation,
-            'metrics': callback_metrics or [],
-        })
-
         for k, v in zip(model.metrics_names,
                         [backend.mean(loss)] + loss_metrics + metrics_results):
           batch_logs[k] = tensor_util.constant_value(v)
-
         callbacks.on_batch_end(batch_index, batch_logs)
         if callback_model.stop_training:
           break
@@ -611,7 +594,7 @@ def test_loop(model, inputs, targets,
           targets_batch,
           sample_weights=sample_weights_batch,
           training=False)
-      _, metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
+      metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch)
       batch_outs = []
       for _, v in zip(model.metrics_names,
                       [backend.mean(loss)] + loss_metrics + metrics_results):
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index ed0f91ee1e..deaf1d1306 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -212,7 +212,7 @@ class TrainingTest(test.TestCase):
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     loss_weights = [1., 0.5]
-    metrics = ['mae']
+    metrics = ['acc', 'mae']
     model.compile(
         optimizer,
         loss,
@@ -231,20 +231,20 @@ class TrainingTest(test.TestCase):
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=0)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.evaluate(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=1)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.evaluate(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         batch_size=5,
         verbose=2)
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
     out = model.test_on_batch([input_a_np, input_b_np],
                               [output_d_np, output_e_np])
-    self.assertEqual(len(out), 5)
+    self.assertEqual(len(out), 7)
 
     # Test evaluate with dictionary inputs
     model.evaluate(
@@ -625,7 +625,6 @@ class LossWeightingTest(test.TestCase):
       bad_w_np = np.random.random((10, 2, 2))
       model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
 
-
 class CorrectnessTest(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes()
@@ -649,7 +648,6 @@ class CorrectnessTest(test.TestCase):
     self.assertEqual(
         np.around(history.history['loss'][-1], decimals=4), 0.6173)
 
-
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 6699fd5212..d9281436de 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -24,12 +24,15 @@ import unittest
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
 
 try:
   import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
@@ -1684,6 +1687,29 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.train_on_batch([input_a_np, input_b_np],
                            [output_a_np, output_b_np])
 
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_metric_names_are_identical_in_graph_and_eager(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae', 'acc']
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    reference_metric_names = ['loss', 'dense_loss', 'dropout_loss',
+                              'dense_mean_absolute_error',
+                              'dense_acc',
+                              'dropout_mean_absolute_error',
+                              'dropout_acc']
+    self.assertEqual(reference_metric_names, model.metrics_names)
 
 if __name__ == '__main__':
   # Bazel sets these environment variables to very long paths.
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 48afe48e6c..662938f421 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -26,6 +26,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.ops import math_ops
 
 
@@ -552,3 +553,64 @@ def standardize_weights(y,
 def has_symbolic_tensors(ls):
   return (any(tensor_util.is_tensor(v) for v in ls)
           and not context.executing_eagerly())
+
+
+def populate_metric_names(model):
+  for i in range(len(model.outputs)):
+    metrics = model.nested_metrics[i]
+    for metric in metrics:
+      base_metric_name = get_base_metric_name(metric)
+      add_metric_name(model, base_metric_name, i)
+
+
+def get_base_metric_name(metric, weighted=False):
+  """Returns the metric name given the metric function.
+
+  Arguments:
+      metric: Metric function name or reference.
+      weighted: Boolean indicating if the metric for which we are adding
+          names is weighted.
+
+  Returns:
+      a metric name.
+  """
+  metric_name_prefix = 'weighted_' if weighted else ''
+  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+    if metric in ('accuracy', 'acc'):
+      suffix = 'acc'
+    elif metric in ('crossentropy', 'ce'):
+      suffix = 'ce'
+    metric_name = metric_name_prefix + suffix
+  else:
+    metric_fn = metrics_module.get(metric)
+    # Get metric name as string
+    if hasattr(metric_fn, 'name'):
+      metric_name = metric_fn.name
+    else:
+      metric_name = metric_fn.__name__
+    metric_name = metric_name_prefix + metric_name
+
+  return metric_name
+
+
+def add_metric_name(model, metric_name, index):
+  """Makes the metric name unique and adds it to the model's metric name list.
+
+    If there are multiple outputs for which the metrics are calculated, the
+    metric names have to be made unique by appending an integer.
+
+  Arguments:
+    model: Model to which we are adding metric names.
+    metric_name: Metric name that corresponds to the metric specified by the
+        user. For example: 'acc'
+    index: The index of the model output for which the metric name is being
+        added.
+  """
+  if len(model.output_names) > 1:
+    metric_name = '%s_%s' % (model.output_names[index], metric_name)
+  j = 1
+  base_metric_name = metric_name
+  while metric_name in model.metrics_names:
+    metric_name = '%s_%d' % (base_metric_name, j)
+    j += 1
+  model.metrics_names.append(metric_name)
-- 
GitLab


From 6a7779f3384e48012d3e27ae0f48d410f5174d06 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 10:33:42 -0700
Subject: [PATCH 1126/1262] Fix undefined signed integer overflow by performing
 addition more carefully.

PiperOrigin-RevId: 193537461
---
 .../core/lib/random/random_distributions.h    | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index 4cf3a999f6..e963511f5c 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include <string.h>
 #include <algorithm>
+#include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
@@ -40,6 +41,20 @@ PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
 // Helper function to convert two 32-bit integers to a double between [0..1).
 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
 
+// Computes a + b. Requires that the result is representable in the destination
+// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
+// need *not* be representable in that type. (The condition on b excludes the
+// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
+// compute.)
+template <typename Int>
+PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
+                                   typename std::make_unsigned<Int>::type b) {
+  // Implementation note: both b_div_2 and b - b_div_2 are positive and
+  // representatble as Int.
+  auto b_div_2 = b >> 1;
+  return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
+}
+
 // A class that generates uniform distribution random numbers from the
 // underlying random integer generator.
 // Arguments:
@@ -172,7 +187,7 @@ class UniformDistribution<Generator, int32> {
     typename Generator::ResultType sample = (*gen)();
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
-      result[i] = lo_ + static_cast<int32>(sample[i] % range_);
+      result[i] = SignedAdd(lo_, sample[i] % range_);
     }
     return result;
   }
@@ -208,7 +223,7 @@ class UniformDistribution<Generator, int64> {
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
-      result[i] = lo_ + static_cast<int64>(bits % range_);
+      result[i] = SignedAdd(lo_, bits % range_);
     }
     return result;
   }
-- 
GitLab


From 430230b4b966cade863ea5b660862734ede1cc56 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 01:37:03 +0800
Subject: [PATCH 1127/1262] Fix minor pylint issue

---
 tensorflow/contrib/losses/python/losses/loss_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 5af1f21b11..bdad34a665 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -652,7 +652,7 @@ def cosine_distance(predictions,
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `weights` is `None`.
   """
-  axis = deprecation.deprecated_argument_lookup(
+  axis = deprecated_argument_lookup(
       "axis", axis, "dim", dim)
   if axis is None:
     raise ValueError("You must specify 'axis'.")
-- 
GitLab


From f196351cd4e21ed6c17dcf544e0fa6cfa3030b4e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 10:57:55 -0700
Subject: [PATCH 1128/1262] Allow non-isolated worker sessions to borrow
 `WorkerEnv::device_mgr`.

Without this change, a shared resource (e.g. an Iterator) could not be
created in one session `s1`, and used in a later session `s2` after
`s1` was closed, because the iterator might indirectly capture devices
from the previous session, and use them after they are freed when the
`WorkerSession` was deleted.

The current change only affects the singleton "legacy" WorkerSession,
which is never deleted, but this is necessary to switch all sessions
to use separate WorkerSession objects.

PiperOrigin-RevId: 193541426
---
 tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc  |  2 +-
 tensorflow/core/distributed_runtime/BUILD     |  1 +
 .../base_rendezvous_mgr.cc                    |  4 +-
 .../rpc/rpc_rendezvous_mgr.cc                 |  2 +-
 .../core/distributed_runtime/session_mgr.cc   | 40 +++++++++++++------
 .../core/distributed_runtime/session_mgr.h    |  2 +-
 .../distributed_runtime/session_mgr_test.cc   | 23 ++++++-----
 .../distributed_runtime/worker_session.cc     | 38 +++++++++++++++++-
 .../core/distributed_runtime/worker_session.h | 28 +++++++++++--
 9 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 28f68cec8c..94f522c04e 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -155,7 +155,7 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
     }
 
     Device* dst_device;
-    Status s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    Status s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
     if (!s.ok()) {
       sess->worker_cache->ReleaseWorker(src_worker, rwi);
       done(s, Args(), recv_args, Tensor{}, false);
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index b07cb8cdcb..d564727da5 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -133,6 +133,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index bafd9bfc68..5f6931e008 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -253,13 +253,13 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
 
   WorkerSession* sess = session();
   Device* src_device;
-  Status s = sess->device_mgr->LookupDevice(parsed.src_device, &src_device);
+  Status s = sess->device_mgr()->LookupDevice(parsed.src_device, &src_device);
   if (!s.ok()) {
     done(s);
     return;
   }
   Device* dst_device;
-  s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+  s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
   if (!s.ok()) {
     done(s);
     return;
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 067dc5dff5..b8cb538503 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -227,7 +227,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
 
   Device* dst_device;
   if (s.ok()) {
-    s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device);
   }
   if (!s.ok()) {
     if (rwi != nullptr) {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index e51d63cf2b..357e9f8930 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/cluster.pb.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
@@ -33,11 +34,11 @@ SessionMgr::SessionMgr(
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
       default_worker_cache_(std::move(default_worker_cache)),
-      legacy_session_(new WorkerSession(
+      legacy_session_(WorkerSession::CreateWithBorrowedDeviceMgr(
           "", default_worker_name,
           std::unique_ptr<WorkerCacheInterface>(
               new WorkerCacheWrapper(default_worker_cache_.get())),
-          std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
+          worker_env->device_mgr,
           std::unique_ptr<GraphMgr>(
               new GraphMgr(worker_env, worker_env->device_mgr)))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
@@ -71,19 +72,32 @@ Status SessionMgr::CreateSession(const string& session,
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
 
-  std::vector<Device*> renamed_devices;
-  for (Device* d : worker_env_->local_devices) {
-    renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
-        worker_name, d, false, isolate_session_state));
-  }
-  std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
+  std::shared_ptr<WorkerSession> worker_session;
 
-  std::unique_ptr<GraphMgr> graph_mgr(
-      new GraphMgr(worker_env_, device_mgr.get()));
+  if (isolate_session_state) {
+    // Create a private copy of the DeviceMgr for the WorkerSession.
+    std::vector<Device*> renamed_devices;
+    for (Device* d : worker_env_->local_devices) {
+      renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
+          worker_name, d, false, isolate_session_state));
+    }
 
-  std::shared_ptr<WorkerSession> worker_session(new WorkerSession(
-      session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(device_mgr), std::move(graph_mgr)));
+    auto device_mgr = MakeUnique<DeviceMgr>(renamed_devices);
+    auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
+    worker_session.reset(
+        new WorkerSession(session, worker_name,
+                          std::unique_ptr<WorkerCacheInterface>(worker_cache),
+                          std::move(device_mgr), std::move(graph_mgr)));
+  } else {
+    // Borrown the WorkerEnv's DeviceMgr for the WorkerSession, so
+    // that resources using it can use its devices after the
+    // WorkerSession has been deleted.
+    auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr);
+    worker_session = WorkerSession::CreateWithBorrowedDeviceMgr(
+        session, worker_name,
+        std::unique_ptr<WorkerCacheInterface>(worker_cache),
+        worker_env_->device_mgr, std::move(graph_mgr));
+  }
 
   sessions_.insert(std::make_pair(session, std::move(worker_session)));
   return Status::OK();
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 0a10fe240f..04d1d61409 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -65,7 +65,7 @@ class SessionMgr {
   void ClearLogs();
 
  private:
-  const WorkerEnv* const worker_env_;  // Not owned.
+  WorkerEnv* const worker_env_;  // Not owned.
 
   // A note about destruction:
   // We must delete graph_mgr before device_mgr, due to shared
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 858e636e08..0da333833a 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -43,15 +43,17 @@ class FakeDevice : public Device {
 class SessionMgrTest : public ::testing::Test {
  protected:
   SessionMgrTest()
-      : device_(FakeDevice::MakeCPU(
-            "/job:mnist/replica:0/task:0/device:fakecpu:0")),
-        mgr_(&env_, "/job:mnist/replica:0/task:0",
+      : mgr_(&env_, "/job:mnist/replica:0/task:0",
              std::unique_ptr<WorkerCacheInterface>(), factory_) {
-    TF_CHECK_OK(mgr_.WorkerSessionForSession("", &legacy_session_));
-    env_.local_devices = {device_.get()};
+    Device* device =
+        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0")
+            .release();
+    env_.local_devices = {device};
+    device_mgr_.reset(new DeviceMgr(env_.local_devices));
+    env_.device_mgr = device_mgr_.get();
   }
 
-  std::unique_ptr<Device> device_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
   WorkerEnv env_;
   SessionMgr::WorkerCacheFactory factory_ =
       [](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
@@ -59,7 +61,6 @@ class SessionMgrTest : public ::testing::Test {
         return Status::OK();
       };
   SessionMgr mgr_;
-  std::shared_ptr<WorkerSession> legacy_session_;
 };
 
 TEST_F(SessionMgrTest, CreateSessionSimple) {
@@ -84,25 +85,25 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
   std::shared_ptr<WorkerSession> session_1;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_1", &session_1));
-  std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
+  std::vector<Device*> devices_1 = session_1->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_1.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
   std::shared_ptr<WorkerSession> session_2;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_2", &session_2));
-  std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
+  std::vector<Device*> devices_2 = session_2->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_2.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
   std::shared_ptr<WorkerSession> session_3;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_3", &session_3));
-  std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
+  std::vector<Device*> devices_3 = session_3->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_3.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
   std::shared_ptr<WorkerSession> session_4;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_4", &session_4));
-  std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
+  std::vector<Device*> devices_4 = session_4->device_mgr()->ListDevices();
   EXPECT_EQ(1, devices_4.size());
 
   EXPECT_EQ(devices_1[0]->resource_manager(), devices_2[0]->resource_manager());
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index 18886babd5..ca6dc1b1de 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -95,9 +95,43 @@ WorkerSession::WorkerSession(const string& session_name,
     : session_name(session_name),
       worker_name(worker_name),
       worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
-      device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)),
       cluster_flr(
-          new ClusterFunctionLibraryRuntime(this, !session_name.empty())) {}
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      device_mgr_(std::move(device_mgr)),
+      borrowed_device_mgr_(nullptr) {}
+
+/* static */
+std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr(
+    const string& session_name, const string& worker_name,
+    std::unique_ptr<WorkerCacheInterface> worker_cache,
+    DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr) {
+  return std::shared_ptr<WorkerSession>(
+      new WorkerSession(session_name, worker_name, std::move(worker_cache),
+                        borrowed_device_mgr, std::move(graph_mgr)));
+}
+
+WorkerSession::WorkerSession(const string& session_name,
+                             const string& worker_name,
+                             std::unique_ptr<WorkerCacheInterface> worker_cache,
+                             DeviceMgr* borrowed_device_mgr,
+                             std::unique_ptr<GraphMgr> graph_mgr)
+    : session_name(session_name),
+      worker_name(worker_name),
+      worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
+      graph_mgr(std::move(graph_mgr)),
+      cluster_flr(
+          new ClusterFunctionLibraryRuntime(this, !session_name.empty())),
+      device_mgr_(nullptr),
+      borrowed_device_mgr_(borrowed_device_mgr) {}
+
+WorkerSession::~WorkerSession() {
+  if (graph_mgr) {
+    Status s = graph_mgr->DeregisterAll();
+    if (!s.ok()) {
+      LOG(WARNING) << "Error during worker session deletion: " << s;
+    }
+  }
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index 0fd19ac27f..f1faf49364 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -40,10 +40,14 @@ struct WorkerSession {
   // Object from which WorkerInterface instances can be obtained.
   const std::unique_ptr<WorkerCacheInterface> worker_cache;
 
-  // Collection of local devices. These devices are typically RenamedDevices
-  // in all except the SessionMgr.legacy_session_. legacy_session_.device_mgr
-  // == worker_env_.device_mgr, which holds the true devices.
-  const std::unique_ptr<DeviceMgr> device_mgr;
+  // Collection of local devices. These devices are typically
+  // RenamedDevices in all except the SessionMgr.legacy_session_ and
+  // sessions created with `isolate_session_state == false`. In the
+  // those cases, this method returns a pointer to a borrowed
+  // DeviceMgr (typically the `worker_env.device_mgr`).
+  DeviceMgr* device_mgr() {
+    return device_mgr_ ? device_mgr_.get() : borrowed_device_mgr_;
+  }
 
   // graph_mgr keeps track of the registered graphs of this session.
   //
@@ -57,6 +61,22 @@ struct WorkerSession {
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 std::unique_ptr<DeviceMgr> device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr);
+
+  static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr(
+      const string& session_name, const string& worker_name,
+      std::unique_ptr<WorkerCacheInterface> worker_cache,
+      DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr);
+
+  ~WorkerSession();
+
+ private:
+  WorkerSession(const string& session_name, const string& worker_name,
+                std::unique_ptr<WorkerCacheInterface> worker_cache,
+                DeviceMgr* borrowed_device_mgr,
+                std::unique_ptr<GraphMgr> graph_mgr);
+
+  const std::unique_ptr<DeviceMgr> device_mgr_;
+  DeviceMgr* const borrowed_device_mgr_;  // Not owned.
 };
 
 }  // namespace tensorflow
-- 
GitLab


From e77bb988e470d35aca3ea1e27a4f335409f1f4d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 10:59:08 -0700
Subject: [PATCH 1129/1262] Fix open source BUILD bugs for cloud profiler.
 Increment version for releasing cloud_tpu_profiler 1.6 with pod profiling
 support.

PiperOrigin-RevId: 193541692
---
 .../tpu/profiler/capture_tpu_profile.cc       | 12 +++++-----
 .../pip_package/cloud_tpu_profiler/main.py    | 23 +++++++++++++++++--
 .../contrib/tpu/profiler/pip_package/setup.py |  2 +-
 tensorflow/contrib/tpu/profiler/version.h     |  2 +-
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index a535884263..816897499b 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -41,7 +41,7 @@ namespace tensorflow {
 namespace tpu {
 namespace {
 
-using ::tensorflow::grpc::TPUProfileAnalysis;
+using ::tensorflow::TPUProfileAnalysis;
 using ::tensorflow::TPUProfiler;
 
 constexpr uint64 kMaxEvents = 1000000;
@@ -137,9 +137,9 @@ bool NewSession(const string& service_addr,
       PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
   new_session_request.set_repository_root(repository_root);
   new_session_request.set_session_id(session_id);
-  std::copy(
-      hostnames.begin(), hostnames.end(),
-      proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts()));
+  for (const auto& hostname : hostnames) {
+    new_session_request.add_hosts(hostname);
+  }
 
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
@@ -159,8 +159,8 @@ bool NewSession(const string& service_addr,
   TF_QCHECK_OK(FromGrpcStatus(
       stub->NewSession(&context, new_session_request, &new_session_response)));
 
-  std::cout << "Profile session succeed for hosts:"
-            << str_util::Join(hostnames, ",");
+  std::cout << "Profile session succeed for host(s):"
+            << str_util::Join(hostnames, ",") << std::endl;
   return new_session_response.empty_trace();
 }
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 0b78cf8695..508c7a842f 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -37,12 +37,17 @@ flags.DEFINE_string(
     'will attempt to automatically detect the GCE project from metadata.')
 flags.DEFINE_string('tpu_name', None,
                     'Name of the Cloud TPU for Cluster Resolvers. You must '
-                    'specify either this flag or --master.')
+                    'specify either this flag or --service_addr.')
 
 # Tool specific parameters
 flags.DEFINE_string(
     'service_addr', None, 'Address of TPU profiler service e.g. '
     'localhost:8466, you must specify either this flag or --tpu_name.')
+flags.DEFINE_string(
+    'workers_list', None, 'The list of worker TPUs that we are about to profile'
+    ' e.g. 10.0.1.2, 10.0.1.3. You can specify this flag with --tpu_name or '
+    '--service_addr to profile a subset of tpu nodes. You can also use only'
+    '--tpu_name and leave this flag unspecified to profile all the tpus.')
 flags.DEFINE_string('logdir', None,
                     'Path of TensorBoard log directory e.g. /tmp/tb_log, '
                     'gs://tb_bucket')
@@ -56,18 +61,25 @@ flags.DEFINE_boolean('include_dataset_ops', True,
 
 FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
+JOB_NAME = 'worker'
 
+def get_workers_list(cluster_resolver):
+  cluster_spec = cluster_resolver.cluster_spec()
+  task_indices = cluster_spec.task_indices(JOB_NAME)
+  workers_list = [cluster_spec.task_address(JOB_NAME, i).split(':')[0]
+                  for i in task_indices]
+  return ','.join(workers_list)
 
 def run_main():
   tf.app.run(main)
 
-
 def main(unused_argv=None):
   tf.logging.set_verbosity(tf.logging.INFO)
 
   if FLAGS.service_addr is None and FLAGS.tpu_name is None:
     sys.exit('You must specify either --service_addr or --tpu_name.')
 
+  tpu_cluster_resolver = None
   if FLAGS.service_addr is not None:
     if FLAGS.tpu_name is not None:
       tf.logging.warn('Both --service_addr and --tpu_name are set. Ignoring '
@@ -82,6 +94,12 @@ def main(unused_argv=None):
     service_addr = tpu_cluster_resolver.get_master()
   service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466')
 
+  workers_list = ""
+  if FLAGS.workers_list is not None:
+    workers_list = FLAGS.workers_list
+  elif tpu_cluster_resolver is not None:
+    workers_list = get_workers_list(tpu_cluster_resolver)
+
   if not FLAGS.logdir:
     sys.exit('logdir must be provided.')
   executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE)
@@ -89,6 +107,7 @@ def main(unused_argv=None):
   cmd = [executable_path]
   cmd.append('--logdir=' + logdir)
   cmd.append('--service_addr=' + service_addr)
+  cmd.append('--workers_list=' + workers_list)
   cmd.append('--duration_ms=' + str(FLAGS.duration_ms))
   cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts))
   cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower())
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 8d99835b64..ebd478fd02 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.6.0-rc1'
+_VERSION = '1.6.0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index dc6a934891..618479e1a6 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.5.0"
+#define TPU_PROFILER_VERSION "1.6.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
-- 
GitLab


From 62c3b7dece92a3ad1a39e7c4eb0894411e435258 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:08:08 -0700
Subject: [PATCH 1130/1262] Updating tests in constant_folding_test.cc so that
 they Evaluate the optimized and original graph and check if their outputs are
 the same.

PiperOrigin-RevId: 193543478
---
 .../optimizers/constant_folding_test.cc       | 52 +++++++++++++++++--
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 36625b68b7..1acce05909 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -689,8 +689,7 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   GrapplerItem item;
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
-  EXPECT_EQ(1, tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -717,9 +716,6 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
     }
   }
   EXPECT_EQ(1, found);
-  auto tensors = EvaluateNodes(output, item.fetch);
-  EXPECT_EQ(1, tensors.size());
-  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
@@ -995,6 +991,18 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({11, 13}));
+  std::vector<string> fetch_nodes = {"p2"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
@@ -1192,6 +1200,30 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+
+  v_ctrl_t.flat<bool>()(0) = true;
+  std::vector<string> fetch_nodes = {"m", "m2"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+
+  v_ctrl_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, fetch_nodes,
+                                   {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  tensors = EvaluateNodes(output, fetch_nodes,
+                          {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SwitchNodes) {
@@ -1268,6 +1300,16 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
   EXPECT_EQ(2, tensors.size());
   test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
   test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
+
+  v_ctrl_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, item.fetch,
+                                   {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  tensors = EvaluateNodes(output, item.fetch,
+                          {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, MergeNodes) {
-- 
GitLab


From 9b496c9134529f6d85f0e9757099104cf506cbd6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:21:21 -0700
Subject: [PATCH 1131/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193546050
---
 tensorflow/core/ops/compat/ops_history.v1.pbtxt | 15 +++++++++++++++
 tensorflow/core/ops/ops.pbtxt                   | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 9bc11cf0fe..dbd6f859c4 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -15829,6 +15829,21 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 9b665190ce..46afe357f0 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7051,6 +7051,21 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
-- 
GitLab


From 87229e4fc3bc23c7a92bfdf40e5834ac65a00d34 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:47:28 -0700
Subject: [PATCH 1132/1262] Go: Update generated wrapper functions for
 TensorFlow ops. PiperOrigin-RevId: 193550428

---
 tensorflow/go/op/wrappers.go | 72 ++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 35ad1eff0f..3b3dff0573 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -3105,6 +3105,42 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 	return op.Output(0)
 }
 
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+//
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan2",
+		Input: []tf.Input{
+			y, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a dataset that passes a sliding window over `input_dataset`.
 //
 // Arguments:
@@ -25383,42 +25419,6 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-//
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Gather slices from `params` axis `axis` according to `indices`.
 //
 // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-- 
GitLab


From 78db5136edf30667090988c703f98f4f8c4c4269 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 19 Apr 2018 11:52:10 -0700
Subject: [PATCH 1133/1262] Implements linear_model using _LinearModel. Added
 support for cols_to_vars in _LinearModel in order to make this possible.
 Also, made some fixes so that variable names come out the same as before.

PiperOrigin-RevId: 193551353
---
 .../python/feature_column/feature_column.py   | 106 ++++++++--------
 .../feature_column/feature_column_test.py     | 117 ++++++++++++------
 .../training/warm_starting_util_test.py       |  16 +--
 3 files changed, 138 insertions(+), 101 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 0ad8131599..87a52f8441 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -409,58 +409,19 @@ def linear_model(features,
     ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
       nor `_CategoricalColumn`.
   """
-  feature_columns = _clean_feature_columns(feature_columns)
-  for column in feature_columns:
-    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
-      raise ValueError('Items of feature_columns must be either a _DenseColumn '
-                       'or _CategoricalColumn. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-  with variable_scope.variable_scope(
-      None, default_name='linear_model', values=features.values()):
-    weighted_sums = []
-    ordered_columns = []
-    builder = _LazyBuilder(features)
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
-        ordered_columns.append(column)
-        weighted_sum = _create_weighted_sum(
-            column=column,
-            builder=builder,
-            units=units,
-            sparse_combiner=sparse_combiner,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        weighted_sums.append(weighted_sum)
-        if cols_to_vars is not None:
-          # Retrieve the variables created.
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
-    predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
-    bias = variable_scope.get_variable(
-        'bias_weights',
-        shape=[units],
-        initializer=init_ops.zeros_initializer(),
-        trainable=trainable,
-        collections=weight_collections)
-    predictions = nn_ops.bias_add(
-        predictions_no_bias, bias, name='weighted_sum')
-    if cols_to_vars is not None:
-      # Add the bias to cols_to_vars as well, converting the Variable or
-      # PartitionedVariable to a list of Variable's.
-      if (isinstance(bias, variables.Variable) or
-          resource_variable_ops.is_resource_variable(bias)):
-        cols_to_vars['bias'] = [bias]
-      else:  # Must be a PartitionedVariable.
-        cols_to_vars['bias'] = list(bias)
-    return predictions
+  linear_model_layer = _LinearModel(
+      feature_columns=feature_columns,
+      units=units,
+      sparse_combiner=sparse_combiner,
+      weight_collections=weight_collections,
+      trainable=trainable,
+      name='linear_model')
+  retval = linear_model_layer(features)  # pylint: disable=not-callable
+  if cols_to_vars is None:
+    return retval
+  for k, v in linear_model_layer.cols_to_vars().items():
+    cols_to_vars[k] = v
+  return retval
 
 
 def _add_to_collections(var, weight_collections):
@@ -551,8 +512,22 @@ class _BiasLayer(base.Layer):
     return self._bias_variable
 
 
+def _get_expanded_variable_list(variable):
+  if (isinstance(variable, variables.Variable) or
+      resource_variable_ops.is_resource_variable(variable)):
+    return [variable]  # Single variable case.
+  else:  # Must be a PartitionedVariable, so convert into a list.
+    return list(variable)
+
+
+def _strip_leading_slashes(name):
+  return name.rsplit('/', 1)[-1]
+
+
 class _LinearModel(training.Model):
   """Creates a linear model using feature columns.
+
+  See `linear_model` for details.
   """
 
   def __init__(self,
@@ -573,7 +548,10 @@ class _LinearModel(training.Model):
     for column in sorted(self._feature_columns, key=lambda x: x.name):
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
-        column_name = vs.name
+        # Having the fully expressed variable scope name ends up doubly
+        # expressing the outer scope (scope with which this method was called)
+        # in the name of the variable that would get created.
+        column_name = _strip_leading_slashes(vs.name)
       column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                       self._weight_collections, trainable,
                                       column_name, **kwargs)
@@ -585,6 +563,15 @@ class _LinearModel(training.Model):
         weight_collections=self._weight_collections,
         name='bias_layer',
         **kwargs)
+    self._cols_to_vars = {}
+
+  def cols_to_vars(self):
+    """Returns a dict mapping _FeatureColumns to variables.
+
+    See `linear_model` for more information.
+    This is not populated till `call` is called i.e. layer is built.
+    """
+    return self._cols_to_vars
 
   def call(self, features):
     with variable_scope.variable_scope(self.name):
@@ -597,15 +584,24 @@ class _LinearModel(training.Model):
       ordered_columns = []
       builder = _LazyBuilder(features)
       for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
-        ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+        column = layer._feature_column  # pylint: disable=protected-access
+        ordered_columns.append(column)
         weighted_sum = layer(builder)
         weighted_sums.append(weighted_sum)
+        self._cols_to_vars[column] = ops.get_collection(
+            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
 
       _verify_static_batch_size_equality(weighted_sums, ordered_columns)
       predictions_no_bias = math_ops.add_n(
           weighted_sums, name='weighted_sum_no_bias')
       predictions = nn_ops.bias_add(
-          predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+          predictions_no_bias,
+          self._bias_layer(  # pylint: disable=not-callable
+              builder,
+              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
+          name='weighted_sum')
+      bias = self._bias_layer.variables[0]
+      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
     return predictions
 
   def _add_layers(self, layers):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 46404abadc..49e06b8245 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -345,7 +345,7 @@ class NumericColumnTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -584,7 +584,7 @@ class BucketizedColumnTest(test.TestCase):
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [bucketized_price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -610,7 +610,7 @@ class BucketizedColumnTest(test.TestCase):
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [bucketized_price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -849,7 +849,7 @@ class HashedCategoricalColumnTest(test.TestCase):
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
       }, (wire_column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -1171,7 +1171,7 @@ class CrossedColumnTest(test.TestCase):
                   values=['cA', 'cB', 'cC'],
                   dense_shape=(2, 2)),
       }, (crossed,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
         self.assertAllClose((0.,), bias.eval())
@@ -1254,18 +1254,13 @@ def get_linear_model_column_var(column):
                             'linear_model/' + column.name)[0]
 
 
-def get_keras_linear_model_bias():
-  with variable_scope.variable_scope('linear_model', reuse=True):
-    with variable_scope.variable_scope('bias_layer', reuse=True):
-      return variable_scope.get_variable('bias_weights')
-
-
 def get_keras_linear_model_predictions(features,
                                        feature_columns,
                                        units=1,
                                        sparse_combiner='sum',
                                        weight_collections=None,
-                                       trainable=True):
+                                       trainable=True,
+                                       cols_to_vars=None):
   keras_linear_model = _LinearModel(
       feature_columns,
       units,
@@ -1273,7 +1268,12 @@ def get_keras_linear_model_predictions(features,
       weight_collections,
       trainable,
       name='linear_model')
-  return keras_linear_model(features)  # pylint: disable=not-callable
+  retval = keras_linear_model(features)  # pylint: disable=not-callable
+  if cols_to_vars is None:
+    return retval
+  for k, v in keras_linear_model.cols_to_vars().items():
+    cols_to_vars[k] = v
+  return retval
 
 
 @test_util.with_c_api
@@ -1977,7 +1977,7 @@ class _LinearModelTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -1994,7 +1994,7 @@ class _LinearModelTest(test.TestCase):
           dense_shape=[2, 2])
       features = {'wire_cast': wire_tensor}
       predictions = get_keras_linear_model_predictions(features, [wire_cast])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -2014,7 +2014,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [wire_cast, price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -2072,7 +2072,7 @@ class _LinearModelTest(test.TestCase):
       features = {dense_and_sparse_column.name: sp_tensor}
       predictions = get_keras_linear_model_predictions(
           features, [dense_and_sparse_column])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       dense_and_sparse_column_var = get_linear_model_column_var(
           dense_and_sparse_column)
       with _initialized_session() as sess:
@@ -2088,7 +2088,7 @@ class _LinearModelTest(test.TestCase):
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(
           features, [price], units=3)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -2108,7 +2108,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor}
       predictions = get_keras_linear_model_predictions(
           features, [wire_cast], units=3)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -2163,7 +2163,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor}
       predictions = get_keras_linear_model_predictions(
           features, [wire_cast], sparse_combiner='mean')
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
@@ -2176,7 +2176,7 @@ class _LinearModelTest(test.TestCase):
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(
           features, [price], units=3)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose(np.zeros((3,)), bias.eval())
@@ -2206,7 +2206,7 @@ class _LinearModelTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
         self.assertAllClose([0.], bias.eval())
@@ -2222,7 +2222,7 @@ class _LinearModelTest(test.TestCase):
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       predictions = get_keras_linear_model_predictions(features,
                                                        [price1, price2])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
@@ -2235,6 +2235,45 @@ class _LinearModelTest(test.TestCase):
         sess.run(bias.assign([7.]))
         self.assertAllClose([[3217.], [4657.]], predictions.eval())
 
+  def test_fills_cols_to_vars(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      cols_to_vars = {}
+      get_keras_linear_model_predictions(
+          features, [price1, price2], cols_to_vars=cols_to_vars)
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      self.assertAllEqual(cols_to_vars['bias'], [bias])
+      self.assertAllEqual(cols_to_vars[price1], [price1_var])
+      self.assertAllEqual(cols_to_vars[price2], [price2_var])
+
+  def test_fills_cols_to_vars_partitioned_variables(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2', shape=3)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [6., 7.]],
+          'price2': [[3., 4., 5.], [8., 9., 10.]]
+      }
+      cols_to_vars = {}
+      with variable_scope.variable_scope(
+          'linear',
+          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
+        get_keras_linear_model_predictions(
+            features, [price1, price2], cols_to_vars=cols_to_vars)
+      with _initialized_session():
+        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
+        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
+        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
+        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
+        # a [1, 1] Variable.
+        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
+        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
+
   def test_dense_collection(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
@@ -2242,7 +2281,7 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(
           features, [price], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       self.assertIn(bias, my_vars)
       self.assertIn(price_var, my_vars)
@@ -2256,7 +2295,7 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(
           features, [wire_cast], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       self.assertIn(bias, my_vars)
       self.assertIn(wire_cast_var, my_vars)
@@ -2266,7 +2305,7 @@ class _LinearModelTest(test.TestCase):
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price])
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertIn(bias, trainable_vars)
@@ -2280,7 +2319,7 @@ class _LinearModelTest(test.TestCase):
       features = {'wire_cast': wire_tensor}
       get_keras_linear_model_predictions(features, [wire_cast])
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       self.assertIn(bias, trainable_vars)
       self.assertIn(wire_cast_var, trainable_vars)
@@ -2427,7 +2466,7 @@ class _LinearModelTest(test.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
 
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
       body_style_var = get_linear_model_column_var(body_style)
 
@@ -2470,7 +2509,7 @@ class _LinearModelTest(test.TestCase):
     net = get_keras_linear_model_predictions(features,
                                              [price_buckets, body_style])
     with _initialized_session() as sess:
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       price_buckets_var = get_linear_model_column_var(price_buckets)
       body_style_var = get_linear_model_column_var(body_style)
 
@@ -2509,7 +2548,7 @@ class _LinearModelTest(test.TestCase):
 
     net = get_keras_linear_model_predictions(
         features, [price_buckets, body_style, country])
-    bias = get_keras_linear_model_bias()
+    bias = get_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
     with _initialized_session() as sess:
@@ -3688,7 +3727,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
       }, (wire_column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -4080,7 +4119,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
       }, (wire_column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -4326,7 +4365,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
                   values=(0, 2, 1),
                   dense_shape=(2, 2))
       }, (column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -5108,7 +5147,7 @@ class EmbeddingColumnTest(test.TestCase):
           categorical_column.name: sparse_input
       }, (embedding_column,))
       expected_var_names = (
-          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/bias_weights:0',
           'linear_model/aaa_embedding/weights:0',
           'linear_model/aaa_embedding/embedding_weights:0',
       )
@@ -5120,7 +5159,7 @@ class EmbeddingColumnTest(test.TestCase):
           for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
@@ -5757,7 +5796,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
       # Linear weights do not follow the column name. But this is a rare use
       # case, and fixing it would add too much complexity to the code.
       expected_var_names = (
-          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/bias_weights:0',
           'linear_model/aaa_bbb_shared_embedding/weights:0',
           'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
           'linear_model/aaa_bbb_shared_embedding_1/weights:0',
@@ -5770,7 +5809,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
       linear_weights_a = trainable_vars[
@@ -6105,7 +6144,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   values=(.5, 1., .1),
                   dense_shape=(2, 2))
       }, (column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
@@ -6172,7 +6211,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2)),
           'values': ((.5,), (1.,), (.1,))
       }, (column,))
-      bias = get_keras_linear_model_bias()
+      bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
index 6e445d8bd1..7e8cbd6bae 100644
--- a/tensorflow/python/training/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -946,18 +946,20 @@ class WarmStartingUtilTest(test.TestCase):
         # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
         self._assert_cols_to_vars(
-            cols_to_vars, {
+            cols_to_vars,
+            {
                 emb_vocab: [
-                    # embedding_weights part 0.
-                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
-                    # embedding_weights part 1.
-                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
                     # linear weights part 0.
                     np.array([[0.69]]),
                     # linear weights part 1.
-                    np.array([[0.71]])
+                    np.array([[0.71]]),
+                    # embedding_weights part 0.
+                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
+                    # embedding_weights part 1.
+                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                 ]
-            }, sess)
+            },
+            sess)
 
   def testErrorConditions(self):
     x = variable_scope.get_variable(
-- 
GitLab


From 173aadc6b62dd95691257c2d9f158dd9044bb4ef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:55:46 -0700
Subject: [PATCH 1134/1262] Change estimator to only log non-binary eval
 metrics, because logging binary metrics such as images will lead to crash.

PiperOrigin-RevId: 193551927
---
 tensorflow/python/estimator/estimator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index a42b6cfee8..9862fdecdb 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -1256,7 +1256,8 @@ def _dict_to_str(dictionary):
     A `str` representing the `dictionary`.
   """
   return ', '.join('%s = %s' % (k, v)
-                   for k, v in sorted(six.iteritems(dictionary)))
+                   for k, v in sorted(six.iteritems(dictionary))
+                   if not isinstance(v, six.binary_type))
 
 
 def _write_dict_to_summary(output_dir,
-- 
GitLab


From fb02b02689b0e126c93cbcb8462e8417e1d954cc Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 19 Apr 2018 11:57:36 -0700
Subject: [PATCH 1135/1262] Avoid looking up the shape functions multiple times
 Improved the handling of fed nodes

PiperOrigin-RevId: 193552210
---
 .../core/grappler/costs/graph_properties.cc   | 155 +++++++++---------
 .../core/grappler/costs/graph_properties.h    |   7 -
 2 files changed, 78 insertions(+), 84 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index c83ddfe90a..dd2d53dfdf 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -395,8 +395,11 @@ class TopoQueue {
 // unknown shape/dimension of a given node.
 class SymbolicShapeRefiner {
  public:
-  explicit SymbolicShapeRefiner(const GraphDef& graph)
-      : function_library_(OpRegistry::Global(), graph.library()) {
+  explicit SymbolicShapeRefiner(
+      const GraphDef& graph,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports)
+      : function_library_(OpRegistry::Global(), graph.library()),
+        fed_ports_(fed_ports) {
     graph_def_version_ = graph.versions().producer();
     node_to_context_.reserve(graph.node_size());
   }
@@ -704,6 +707,9 @@ class SymbolicShapeRefiner {
     std::vector<ShapeHandle> input_tensors_as_shapes;
 
     NodeContext& node_ctx = node_to_context_[node];
+    TF_RETURN_IF_ERROR(
+        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
+
     node_ctx.inference_context.reset(new InferenceContext(
         graph_def_version_, &node->def(), node->op_def(), input_shapes,
         input_tensors, input_tensors_as_shapes,
@@ -716,6 +722,7 @@ class SymbolicShapeRefiner {
   }
 
   struct NodeContext {
+    const OpRegistrationData* op_data;
     std::unique_ptr<InferenceContext> inference_context;
     std::vector<ShapeHandle> output_tensors_as_shapes;
   };
@@ -723,65 +730,80 @@ class SymbolicShapeRefiner {
   Status InferShapes(const Node* node, NodeContext* c) {
     InferenceContext* ic = c->inference_context.get();
 
-    // Propagate shape tensors
-    if (node->type_string() == "Shape") {
-      c->output_tensors_as_shapes.resize(1);
-      c->output_tensors_as_shapes[0] = c->inference_context->input(0);
-    } else if (node->type_string() == "ShapeN") {
-      c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
-      for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
-        c->output_tensors_as_shapes[i] = c->inference_context->input(i);
-      }
-    } else if (node->type_string() == "ConcatV2") {
-      bool valid = true;
-      ShapeHandle result;
-      for (int i = 0; i < ic->num_inputs() - 1; ++i) {
-        ShapeHandle input = ic->input_tensors_as_shapes()[i];
-        if (!ic->RankKnown(input)) {
-          valid = false;
-          break;
-        } else if (i == 0) {
-          result = input;
-        } else {
-          TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
-        }
-      }
-      if (valid) {
+    auto it = fed_ports_.find(node->name());
+    const bool is_fed = it != fed_ports_.end();
+
+    // Propagate shape tensors unless the node is fed.
+    // TODO(bsteiner) We should still propagate the shapes to the ports that
+    // aren't fed in the case of a ShapeN node.
+    if (!is_fed) {
+      if (node->type_string() == "Shape") {
         c->output_tensors_as_shapes.resize(1);
-        c->output_tensors_as_shapes[0] = result;
-      }
-    } else if (node->type_string() == "Slice") {
-      ShapeHandle input = ic->input_tensors_as_shapes()[0];
-      bool valid = ic->RankKnown(input);
-      const Tensor* slice_offset = ic->input_tensor(1);
-      valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
-      const Tensor* slice_size = ic->input_tensor(2);
-      valid &= slice_size != nullptr && slice_size->NumElements() == 1;
-      if (valid) {
-        int64 start = slice_offset->dtype() == DT_INT32
-                          ? slice_offset->flat<int32>()(0)
-                          : slice_offset->flat<int64>()(0);
-        int64 end = start + (slice_size->dtype() == DT_INT32
-                                 ? slice_size->flat<int32>()(0)
-                                 : slice_size->flat<int64>()(0));
+        c->output_tensors_as_shapes[0] = c->inference_context->input(0);
+      } else if (node->type_string() == "ShapeN") {
+        c->output_tensors_as_shapes.resize(c->inference_context->num_inputs());
+        for (int i = 0; i < c->inference_context->num_inputs(); ++i) {
+          c->output_tensors_as_shapes[i] = c->inference_context->input(i);
+        }
+      } else if (node->type_string() == "ConcatV2") {
+        bool valid = true;
         ShapeHandle result;
-        TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
-        c->output_tensors_as_shapes.resize(1);
-        c->output_tensors_as_shapes[0] = result;
+        for (int i = 0; i < ic->num_inputs() - 1; ++i) {
+          ShapeHandle input = ic->input_tensors_as_shapes()[i];
+          if (!ic->RankKnown(input)) {
+            valid = false;
+            break;
+          } else if (i == 0) {
+            result = input;
+          } else {
+            TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result));
+          }
+        }
+        if (valid) {
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
+      } else if (node->type_string() == "Slice") {
+        ShapeHandle input = ic->input_tensors_as_shapes()[0];
+        bool valid = ic->RankKnown(input);
+        const Tensor* slice_offset = ic->input_tensor(1);
+        valid &= slice_offset != nullptr && slice_offset->NumElements() == 1;
+        const Tensor* slice_size = ic->input_tensor(2);
+        valid &= slice_size != nullptr && slice_size->NumElements() == 1;
+        if (valid) {
+          int64 start = slice_offset->dtype() == DT_INT32
+                            ? slice_offset->flat<int32>()(0)
+                            : slice_offset->flat<int64>()(0);
+          int64 end = start + (slice_size->dtype() == DT_INT32
+                                   ? slice_size->flat<int32>()(0)
+                                   : slice_size->flat<int64>()(0));
+          ShapeHandle result;
+          TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result));
+          c->output_tensors_as_shapes.resize(1);
+          c->output_tensors_as_shapes[0] = result;
+        }
       }
     }
 
     // Infer the shapes of output tensors.
-    const OpRegistrationData* op_reg_data;
-    Status s = function_library_.default_registry()->LookUp(node->type_string(),
-                                                            &op_reg_data);
-    if (!s.ok() || op_reg_data->shape_inference_fn == nullptr) {
+    if (!c->op_data || c->op_data->shape_inference_fn == nullptr) {
       // There is nothing more we can infer, annotate outputs with unknown
       // shapes
       return c->inference_context->Run(shape_inference::UnknownShape);
     }
 
-    return c->inference_context->Run(op_reg_data->shape_inference_fn);
+    TF_RETURN_IF_ERROR(
+        c->inference_context->Run(c->op_data->shape_inference_fn));
+
+    Status status = Status::OK();
+    if (is_fed) {
+      // It is possible to feed node output ports with tensors of any shape: as
+      // a result, the shape of a fed port is completely unknown.
+      for (const int output_port : it->second) {
+        status.Update(SetUnknownShape(node, output_port));
+      }
+    }
+    return status;
   }
 
   NodeContext* GetNodeContext(const Node* node) {
@@ -797,6 +819,7 @@ class SymbolicShapeRefiner {
   std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
   std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
   FunctionLibraryDefinition function_library_;
+  const std::unordered_map<string, std::unordered_set<int>>& fed_ports_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -983,23 +1006,6 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
   return Status::OK();
 }
 
-Status GraphProperties::OverwriteFedPorts(
-    SymbolicShapeRefiner* shape_refiner,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-    const Node* node, bool* new_shapes) const {
-  auto it = fed_ports.find(node->name());
-  Status status;
-  if (it != fed_ports.end()) {
-    // It is possible to feed node output ports with tensors of any shape: as a
-    // result, the shape of a fed port is completely unknown.
-    for (const int output_port : it->second) {
-      status.Update(shape_refiner->SetUnknownShape(node, output_port));
-    }
-    *new_shapes = true;
-  }
-  return status;
-}
-
 // Manually propagate the input shape for Enter nodes and update any Merge node
 // outputs.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
@@ -1032,7 +1038,6 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
 
 Status GraphProperties::UpdateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
     const Node* n, bool* new_shapes) const {
   if (n->IsEnter()) {
     // The Enter shape function always forwards an UnknownShape, so do the right
@@ -1053,9 +1058,7 @@ Status GraphProperties::UpdateShapes(
       }
     }
   }
-  // Nodes can be fed with any shape. The TensorFlow shape inference code can't
-  // handle this properly, so overwrite its behavior here.
-  return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes);
+  return Status::OK();
 }
 
 // Propagates the shapes in the transitive fan-out of <new_shapes>.
@@ -1063,7 +1066,6 @@ Status GraphProperties::PropagateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
     const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
         resources,
-    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
     int num_loops) const {
   // Limit the number of iterations to prevent infinite loops in the presence of
   // incorrect shape functions. The algoritm should converge in at most
@@ -1087,8 +1089,7 @@ Status GraphProperties::PropagateShapes(
            num_loop_iterations++ < max_loop_iterations) {
       const Node* n = new_shapes->pop();
       bool updated = false;
-      TF_RETURN_IF_ERROR(
-          UpdateShapes(shape_refiner, relax, fed_ports, n, &updated));
+      TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, n, &updated));
       if (updated) {
         for (const Edge* e : n->out_edges()) {
           if (!e->IsControlEdge()) {
@@ -1243,7 +1244,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  SymbolicShapeRefiner refiner(item_.graph);
+  SymbolicShapeRefiner refiner(item_.graph, fed_ports);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
   // exclusively merge shapes but we do not propagate shapes through the
@@ -1267,8 +1268,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
       new_shapes.push(node);
     }
     // Propagate shapes normally.
-    TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
-                                       fed_ports, num_loops));
+    TF_RETURN_IF_ERROR(
+        PropagateShapes(&refiner, relax, &new_shapes, resources, num_loops));
   }
 
   // Track shapes globally across the graph.
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 30351f58fd..4c3f3f5f53 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -102,16 +102,10 @@ class GraphProperties {
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                             const Node* node, bool relax, bool* new_shapes);
-  // Process a node that is used to feed the model.
-  Status OverwriteFedPorts(
-      SymbolicShapeRefiner* shape_refiner,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
-      const Node* node, bool* new_shapes) const;
   // Update the shapes for node 'n'. If output shapes for n have changed,
   // enqueue its fanout in 'new_shapes'.
   Status UpdateShapes(
       SymbolicShapeRefiner* shape_refiner, bool relax,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
       const Node* n, bool* new_shapes) const;
   // Propagate the shapes for the nodes enqueued in new_shapes and their
   // transitive fanout until a fixed point is reached.
@@ -119,7 +113,6 @@ class GraphProperties {
       SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
       const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
           resources,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
       int num_loops) const;
 
   // Data members
-- 
GitLab


From 0ea0049fa500078c132ed29b60beb8831de26dbb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:57:48 -0700
Subject: [PATCH 1136/1262] Internal cleanup.

PiperOrigin-RevId: 193552240
---
 .../java/org/tensorflow/lite/DataType.java    | 12 ++-
 .../java/org/tensorflow/lite/Interpreter.java | 19 +++--
 .../lite/NativeInterpreterWrapper.java        | 21 +++---
 .../main/java/org/tensorflow/lite/Tensor.java |  7 +-
 .../java/src/main/native/exception_jni.cc     |  3 +-
 .../native/nativeinterpreterwrapper_jni.cc    | 74 +++++++++++--------
 .../lite/java/src/main/native/tensor_jni.cc   | 35 +++++----
 .../lite/NativeInterpreterWrapperTest.java    |  6 +-
 8 files changed, 102 insertions(+), 75 deletions(-)

diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
index fc16488a64..75334cd96e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -51,7 +51,11 @@ enum DataType {
       }
     }
     throw new IllegalArgumentException(
-        "DataType " + c + " is not recognized in Java (version " + TensorFlowLite.version() + ")");
+        "DataType error: DataType "
+            + c
+            + " is not recognized in Java (version "
+            + TensorFlowLite.version()
+            + ")");
   }
 
   /** Returns byte size of the type. */
@@ -68,7 +72,8 @@ enum DataType {
       case BYTEBUFFER:
         return 1;
     }
-    throw new IllegalArgumentException("DataType " + this + " is not supported yet");
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
   }
 
   /** Gets string names of the data type. */
@@ -85,7 +90,8 @@ enum DataType {
       case BYTEBUFFER:
         return "ByteBuffer";
     }
-    throw new IllegalArgumentException("DataType " + this + " is not supported yet");
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
   }
 
   // Cached to avoid copying it
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index a33959dca4..e915e65aa1 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -137,17 +137,19 @@ public final class Interpreter implements AutoCloseable {
   public void runForMultipleInputsOutputs(
       @NonNull Object[] inputs, @NonNull Map<Integer, Object> outputs) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     Tensor[] tensors = wrapper.run(inputs);
     if (outputs == null || tensors == null || outputs.size() > tensors.length) {
-      throw new IllegalArgumentException("Outputs do not match with model outputs.");
+      throw new IllegalArgumentException("Output error: Outputs do not match with model outputs.");
     }
     final int size = tensors.length;
     for (Integer idx : outputs.keySet()) {
       if (idx == null || idx < 0 || idx >= size) {
         throw new IllegalArgumentException(
-            String.format("Invalid index of output %d (should be in range [0, %d))", idx, size));
+            String.format(
+                "Output error: Invalid index of output %d (should be in range [0, %d))",
+                idx, size));
       }
       tensors[idx].copyTo(outputs.get(idx));
     }
@@ -160,7 +162,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public void resizeInput(int idx, @NonNull int[] dims) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     wrapper.resizeInput(idx, dims);
   }
@@ -173,7 +175,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public int getInputIndex(String opName) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     return wrapper.getInputIndex(opName);
   }
@@ -186,7 +188,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public int getOutputIndex(String opName) {
     if (wrapper == null) {
-      throw new IllegalStateException("The Interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
     }
     return wrapper.getOutputIndex(opName);
   }
@@ -198,7 +200,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public Long getLastNativeInferenceDurationNanoseconds() {
     if (wrapper == null) {
-      throw new IllegalStateException("The interpreter has already been closed.");
+      throw new IllegalStateException("Internal error: The interpreter has already been closed.");
     }
     return wrapper.getLastNativeInferenceDurationNanoseconds();
   }
@@ -208,7 +210,8 @@ public final class Interpreter implements AutoCloseable {
     if (wrapper != null) {
       wrapper.setUseNNAPI(useNNAPI);
     } else {
-      throw new IllegalStateException("NativeInterpreterWrapper has already been closed.");
+      throw new IllegalStateException(
+          "Internal error: NativeInterpreterWrapper has already been closed.");
     }
   }
 
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index fc8187acfe..dfc8ac111a 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -80,7 +80,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   /** Sets inputs, runs model inference and returns outputs. */
   Tensor[] run(Object[] inputs) {
     if (inputs == null || inputs.length == 0) {
-      throw new IllegalArgumentException("Invalid inputs. Inputs should not be null or empty.");
+      throw new IllegalArgumentException("Input error: Inputs should not be null or empty.");
     }
     int[] dataTypes = new int[inputs.length];
     Object[] sizes = new Object[inputs.length];
@@ -92,7 +92,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
         ByteBuffer buffer = (ByteBuffer) inputs[i];
         if (buffer.order() != ByteOrder.nativeOrder()) {
           throw new IllegalArgumentException(
-              "Invalid ByteBuffer. It shoud use ByteOrder.nativeOrder().");
+              "Input error: ByteBuffer shoud use ByteOrder.nativeOrder().");
         }
         numsOfBytes[i] = buffer.limit();
         sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]);
@@ -103,7 +103,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       } else {
         throw new IllegalArgumentException(
             String.format(
-                "%d-th element of the %d inputs is not an array or a ByteBuffer.",
+                "Input error: %d-th element of the %d inputs is not an array or a ByteBuffer.",
                 i, inputs.length));
       }
     }
@@ -119,7 +119,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
             this,
             isMemoryAllocated);
     if (outputsHandles == null || outputsHandles.length == 0) {
-      throw new IllegalStateException("Interpreter has no outputs.");
+      throw new IllegalStateException("Internal error: Interpreter has no outputs.");
     }
     isMemoryAllocated = true;
     Tensor[] outputs = new Tensor[outputsHandles.length];
@@ -169,7 +169,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     } else {
       throw new IllegalArgumentException(
           String.format(
-              "%s is not a valid name for any input. The indexes of the inputs are %s",
+              "Input error: %s is not a valid name for any input. "
+                  + "The indexes of the inputs are %s",
               name, inputsIndexes.toString()));
     }
   }
@@ -190,7 +191,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     } else {
       throw new IllegalArgumentException(
           String.format(
-              "%s is not a valid name for any output. The indexes of the outputs are %s",
+              "Input error: %s is not a valid name for any output. "
+                  + "The indexes of the outputs are %s",
               name, outputsIndexes.toString()));
     }
   }
@@ -229,7 +231,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
         return DataType.BYTEBUFFER;
       }
     }
-    throw new IllegalArgumentException("cannot resolve DataType of " + o.getClass().getName());
+    throw new IllegalArgumentException(
+        "DataType error: cannot resolve DataType of " + o.getClass().getName());
   }
 
   /** Returns the shape of an object as an int array. */
@@ -245,7 +248,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       return 0;
     }
     if (Array.getLength(o) == 0) {
-      throw new IllegalArgumentException("array lengths cannot be 0.");
+      throw new IllegalArgumentException("Array lengths cannot be 0.");
     }
     return 1 + numDimensions(Array.get(o, 0));
   }
@@ -259,7 +262,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       shape[dim] = len;
     } else if (shape[dim] != len) {
       throw new IllegalArgumentException(
-          String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
+          String.format("Mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
     }
     for (int i = 0; i < len; ++i) {
       fillShape(Array.get(o, i), dim + 1, shape);
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 54ace6c63c..09e887aae3 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -34,15 +34,16 @@ final class Tensor {
     if (NativeInterpreterWrapper.dataTypeOf(dst) != dtype) {
       throw new IllegalArgumentException(
           String.format(
-              "Cannot convert an TensorFlowLite tensor with type %s to a Java object of "
-                  + "type %s (which is compatible with the TensorFlowLite type %s)",
+              "Output error: Cannot convert an TensorFlowLite tensor with type %s to a Java "
+                  + "object of type %s (which is compatible with the TensorFlowLite type %s)",
               dtype, dst.getClass().getName(), NativeInterpreterWrapper.dataTypeOf(dst)));
     }
     int[] dstShape = NativeInterpreterWrapper.shapeOf(dst);
     if (!Arrays.equals(dstShape, shapeCopy)) {
       throw new IllegalArgumentException(
           String.format(
-              "Shape of output target %s does not match with the shape of the Tensor %s.",
+              "Output error: Shape of output target %s does not match with the shape of the "
+                  + "Tensor %s.",
               Arrays.toString(dstShape), Arrays.toString(shapeCopy)));
     }
     readMultiDimensionalArray(nativeHandle, dst);
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
index 1578c9e3dd..34d91be04c 100644
--- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
@@ -44,7 +44,8 @@ BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) {
   buffer_ = new char[limit];
   if (!buffer_) {
     throwException(env, kNullPointerException,
-                   "Malloc of BufferErrorReporter to hold %d char failed.",
+                   "Internal error: Malloc of BufferErrorReporter to hold %d "
+                   "char failed.",
                    limit);
     return;
   }
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 844226203b..ccfdfd829b 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -22,7 +22,7 @@ const int kBufferSize = 256;
 tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to Interpreter.");
+                   "Internal error: Invalid handle to Interpreter.");
     return nullptr;
   }
   return reinterpret_cast<tflite::Interpreter*>(handle);
@@ -30,7 +30,8 @@ tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
 
 tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
   if (handle == 0) {
-    throwException(env, kIllegalArgumentException, "Invalid handle to model.");
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Invalid handle to model.");
     return nullptr;
   }
   return reinterpret_cast<tflite::FlatBufferModel*>(handle);
@@ -39,7 +40,7 @@ tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
 BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to ErrorReporter.");
+                   "Internal error: Invalid handle to ErrorReporter.");
     return nullptr;
   }
   return reinterpret_cast<BufferErrorReporter*>(handle);
@@ -51,7 +52,7 @@ std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) {
   jint* ptr = env->GetIntArrayElements(inputs, nullptr);
   if (ptr == nullptr) {
     throwException(env, kIllegalArgumentException,
-                   "Empty dimensions of input array.");
+                   "Array has empty dimensions.");
     return {};
   }
   for (int i = 0; i < size; ++i) {
@@ -113,7 +114,7 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                          jobjectArray sizes) {
   if (input_size != interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Expected num of inputs is %d but got %d",
+                   "Input error: Expected num of inputs is %d but got %d",
                    interpreter->inputs().size(), input_size);
     return kTfLiteError;
   }
@@ -121,8 +122,9 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
       input_size != env->GetArrayLength(nums_of_bytes) ||
       input_size != env->GetArrayLength(values)) {
     throwException(env, kIllegalArgumentException,
-                   "Arrays in arguments should be of the same length, but got "
-                   "%d sizes, %d data_types, %d nums_of_bytes, and %d values",
+                   "Internal error: Arrays in arguments should be of the same "
+                   "length, but got %d sizes, %d data_types, %d nums_of_bytes, "
+                   "and %d values",
                    input_size, env->GetArrayLength(data_types),
                    env->GetArrayLength(nums_of_bytes),
                    env->GetArrayLength(values));
@@ -136,8 +138,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
     int num_dims = static_cast<int>(env->GetArrayLength(dims));
     if (target->dims->size != num_dims) {
       throwException(env, kIllegalArgumentException,
-                     "%d-th input should have %d dimensions, but found %d "
-                     "dimensions",
+                     "Input error: %d-th input should have %d dimensions, but "
+                     "found %d dimensions",
                      i, target->dims->size, num_dims);
       return kTfLiteError;
     }
@@ -150,7 +152,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
                   num_dims);
         printDims(obtained_dims.get(), kBufferSize, ptr, num_dims);
         throwException(env, kIllegalArgumentException,
-                       "%d-th input dimension should be [%s], but found [%s]",
+                       "Input error: %d-th input dimension should be [%s], but "
+                       "found [%s]",
                        i, expected_dims.get(), obtained_dims.get());
         env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT);
         return kTfLiteError;
@@ -236,8 +239,8 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter,
       TfLiteType type = resolveDataType(data_type[i]);
       if (type != target->type) {
         throwException(env, kIllegalArgumentException,
-                       "DataType (%d) of input data does not match with the "
-                       "DataType (%d) of model inputs.",
+                       "Input error: DataType (%d) of input data does not "
+                       "match with the DataType (%d) of model inputs.",
                        type, target->type);
         return kTfLiteError;
       }
@@ -270,7 +273,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env,
   jclass string_class = env->FindClass("java/lang/String");
   if (string_class == nullptr) {
     throwException(env, kUnsupportedOperationException,
-                   "Can not find java/lang/String class to get input names.");
+                   "Internal error: Can not find java/lang/String class to get "
+                   "input names.");
     return nullptr;
   }
   size_t size = interpreter->inputs().size();
@@ -292,7 +296,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
   jclass string_class = env->FindClass("java/lang/String");
   if (string_class == nullptr) {
     throwException(env, kUnsupportedOperationException,
-                   "Can not find java/lang/String class to get output names.");
+                   "Internal error: Can not find java/lang/String class to get "
+                   "output names.");
     return nullptr;
   }
   size_t size = interpreter->outputs().size();
@@ -351,8 +356,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
       path, verifier.get(), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "Contents of %s does not encode a valid TensorFlowLite "
-                   "model: %s",
+                   "Contents of %s does not encode a valid "
+                   "TensorFlowLite model: %s",
                    path, error_reporter->CachedErrorMessage());
     env->ReleaseStringUTFChars(model_file, path);
     return 0;
@@ -380,8 +385,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
       buf, static_cast<size_t>(capacity), error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
-                   "MappedByteBuffer does not encode a valid TensorFlowLite "
-                   "model: %s",
+                   "MappedByteBuffer does not encode a valid "
+                   "TensorFlowLite model: %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -403,7 +408,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
       &interpreter, static_cast<int>(num_threads));
   if (status != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
-                   "Cannot create interpreter: %s",
+                   "Internal error: Cannot create interpreter: %s",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -411,7 +416,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
   status = interpreter->AllocateTensors();
   if (status != kTfLiteOk) {
     throwException(env, kNullPointerException,
-                   "Can not allocate memory for the interpreter",
+                   "Internal error: Cannot allocate memory for the interpreter",
                    error_reporter->CachedErrorMessage());
     return 0;
   }
@@ -440,7 +445,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
     // resizes inputs
     status = resizeInputs(env, interpreter, input_size, sizes);
     if (status != kTfLiteOk) {
-      throwException(env, kNullPointerException, "Can not resize the input: %s",
+      throwException(env, kNullPointerException,
+                     "Internal error: Can not resize the input: %s",
                      error_reporter->CachedErrorMessage());
       return nullptr;
     }
@@ -448,7 +454,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
     status = interpreter->AllocateTensors();
     if (status != kTfLiteOk) {
       throwException(env, kNullPointerException,
-                     "Can not allocate memory for the given inputs: %s",
+                     "Internal error: Can not allocate memory for the given "
+                     "inputs: %s",
                      error_reporter->CachedErrorMessage());
       return nullptr;
     }
@@ -461,7 +468,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
   // runs inference
   if (interpreter->Invoke() != kTfLiteOk) {
     throwException(env, kIllegalArgumentException,
-                   "Failed to run on the given Interpreter: %s",
+                   "Internal error: Failed to run on the given Interpreter: %s",
                    error_reporter->CachedErrorMessage());
     return nullptr;
   }
@@ -479,8 +486,9 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
   // returns outputs
   const std::vector<int>& results = interpreter->outputs();
   if (results.empty()) {
-    throwException(env, kIllegalArgumentException,
-                   "The Interpreter does not have any outputs.");
+    throwException(
+        env, kIllegalArgumentException,
+        "Internal error: The Interpreter does not have any outputs.");
     return nullptr;
   }
   jlongArray outputs = env->NewLongArray(results.size());
@@ -501,7 +509,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
   const int idx = static_cast<int>(input_idx);
   if (input_idx < 0 || input_idx >= interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Out of range: Failed to get %d-th input out of %d inputs",
+                   "Input error: Out of range: Failed to get %d-th input out of"
+                   " %d inputs",
                    input_idx, interpreter->inputs().size());
     return nullptr;
   }
@@ -514,8 +523,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
     }
     if (num_bytes != expected_num_bytes) {
       throwException(env, kIllegalArgumentException,
-                     "Failed to get input dimensions. %d-th input should have"
-                     " %d bytes, but found %d bytes.",
+                     "Input error: Failed to get input dimensions. %d-th input "
+                     "should have %d bytes, but found %d bytes.",
                      idx, expected_num_bytes, num_bytes);
       return nullptr;
     }
@@ -533,8 +542,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType(
   const int idx = static_cast<int>(output_idx);
   if (output_idx < 0 || output_idx >= interpreter->outputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Out of range: Failed to get %d-th output out of %d outputs",
-                   output_idx, interpreter->outputs().size());
+                   "Failed to get %d-th output out of %d outputs", output_idx,
+                   interpreter->outputs().size());
     return -1;
   }
   TfLiteTensor* target = interpreter->tensor(interpreter->outputs()[idx]);
@@ -555,7 +564,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
   const int idx = static_cast<int>(input_idx);
   if (idx < 0 || idx >= interpreter->inputs().size()) {
     throwException(env, kIllegalArgumentException,
-                   "Can not resize %d-th input for a model having %d inputs.",
+                   "Input error: Can not resize %d-th input for a model having "
+                   "%d inputs.",
                    idx, interpreter->inputs().size());
     return JNI_FALSE;
   }
@@ -567,7 +577,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
         interpreter->inputs()[idx], convertJIntArrayToVector(env, dims));
     if (status != kTfLiteOk) {
       throwException(env, kIllegalArgumentException,
-                     "Failed to resize %d-th input: %s", idx,
+                     "Internal error: Failed to resize %d-th input: %s", idx,
                      error_reporter->CachedErrorMessage());
       return JNI_FALSE;
     }
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
index 65126e78a3..17f4be09c6 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
@@ -23,7 +23,7 @@ namespace {
 TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) {
   if (handle == 0) {
     throwException(env, kIllegalArgumentException,
-                   "Invalid handle to TfLiteTensor.");
+                   "Internal error: Invalid handle to TfLiteTensor.");
     return nullptr;
   }
   return reinterpret_cast<TfLiteTensor*>(handle);
@@ -36,7 +36,8 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
   size_t to_copy = num_elements * elementByteSize(type);
   if (to_copy > dst_size) {
     throwException(env, kIllegalStateException,
-                   "cannot write Java array of %d bytes to Tensor of %d bytes",
+                   "Internal error: cannot write Java array of %d bytes to "
+                   "Tensor of %d bytes",
                    to_copy, dst_size);
     return 0;
   }
@@ -71,10 +72,10 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
     }
     default: {
       throwException(env, kUnsupportedOperationException,
-                     "TensorFlowLite currently supports float (32 bits), "
-                     "int (32 bits), byte (8 bits), and long (64 bits), "
-                     "support for other types (DataType %d in this case) will "
-                     "be added in the future",
+                     "DataType error: TensorFlowLite currently supports float "
+                     "(32 bits), int (32 bits), byte (8 bits), and long "
+                     "(64 bits), support for other types (DataType %d in this "
+                     "case) will be added in the future",
                      kTfLiteFloat32, type);
       return 0;
     }
@@ -88,8 +89,9 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
   if (size > src_size) {
     throwException(
         env, kIllegalStateException,
-        "cannot fill a Java array of %d bytes with a Tensor of %d bytes", size,
-        src_size);
+        "Internal error: cannot fill a Java array of %d bytes with a Tensor of "
+        "%d bytes",
+        size, src_size);
     return 0;
   }
   switch (data_type) {
@@ -117,8 +119,8 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
       return size;
     }
     default: {
-      throwException(env, kIllegalStateException, "invalid DataType(%d)",
-                     data_type);
+      throwException(env, kIllegalStateException,
+                     "DataType error: invalid DataType(%d)", data_type);
     }
   }
   return 0;
@@ -152,19 +154,22 @@ size_t elementByteSize(TfLiteType data_type) {
   switch (data_type) {
     case kTfLiteFloat32:
       static_assert(sizeof(jfloat) == 4,
-                    "Java float not compatible with kTfLiteFloat");
+                    "Interal error: Java float not compatible with "
+                    "kTfLiteFloat");
       return 4;
     case kTfLiteInt32:
       static_assert(sizeof(jint) == 4,
-                    "Java int not compatible with kTfLiteInt");
+                    "Interal error: Java int not compatible with kTfLiteInt");
       return 4;
     case kTfLiteUInt8:
       static_assert(sizeof(jbyte) == 1,
-                    "Java byte not compatible with kTfLiteUInt8");
+                    "Interal error: Java byte not compatible with "
+                    "kTfLiteUInt8");
       return 1;
     case kTfLiteInt64:
       static_assert(sizeof(jlong) == 8,
-                    "Java long not compatible with kTfLiteInt64");
+                    "Interal error: Java long not compatible with "
+                    "kTfLiteInt64");
       return 8;
     default:
       return 0;
@@ -212,7 +217,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
   int num_dims = tensor->dims->size;
   if (num_dims == 0) {
     throwException(env, kIllegalArgumentException,
-                   "copyTo() is not meant for scalar Tensors.");
+                   "Internal error: Cannot copy empty/scalar Tensors.");
     return;
   }
   readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes,
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index dbe45e5a05..7c00d3196f 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -321,9 +321,7 @@ public final class NativeInterpreterWrapperTest {
       wrapper.run(inputs);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e)
-          .hasMessageThat()
-          .contains("Invalid inputs. Inputs should not be null or empty.");
+      assertThat(e).hasMessageThat().contains("Inputs should not be null or empty.");
     }
     wrapper.close();
   }
@@ -440,7 +438,7 @@ public final class NativeInterpreterWrapperTest {
       NativeInterpreterWrapper.numDimensions(emptyArray);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e).hasMessageThat().contains("array lengths cannot be 0.");
+      assertThat(e).hasMessageThat().contains("Array lengths cannot be 0.");
     }
   }
 
-- 
GitLab


From 16d25e8c8a9ebb6500d3b3418ca8c2bb80c3e42e Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 19 Apr 2018 11:58:04 -0700
Subject: [PATCH 1137/1262] Add support for Dataset Iterators in Model
 training/eval methods in graph mode.

PiperOrigin-RevId: 193552275
---
 tensorflow/python/keras/BUILD                 |   1 +
 .../keras/_impl/keras/engine/training.py      | 195 ++++++++++++------
 .../_impl/keras/engine/training_arrays.py     |  12 +-
 .../keras/_impl/keras/engine/training_test.py |  84 +++++++-
 .../api/golden/tensorflow.keras.-model.pbtxt  |   4 +-
 .../golden/tensorflow.keras.-sequential.pbtxt |   4 +-
 .../tensorflow.keras.models.-model.pbtxt      |   4 +-
 .../tensorflow.keras.models.-sequential.pbtxt |   4 +-
 8 files changed, 223 insertions(+), 85 deletions(-)

diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index ca7686b1d1..70040b7e74 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -175,6 +175,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
+        "//tensorflow/python/data",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 012d9ceea4..146e8fdac9 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -634,12 +636,20 @@ class Model(Network):
     This is a purely internal method, subject to refactoring at any time.
 
     Args:
-      x: An array or list of arrays, to be used as input data. If the model
-       has known, named inputs, this could also be a dict mapping input names
-       to the corresponding array.
-      y: An array or list of arrays, to be used as target data. If the model
-       has known, named outputs, this could also be a dict mapping output names
-       to the corresponding array.
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        - A `tf.data` dataset iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely). If `x` is a dataset iterator,
+        `y` should not be specified
+        (since targets will be obtained from the iterator).
       sample_weight: An optional sample-weight array passed by the user to
         weight the importance of each sample in `x`.
       class_weight: An optional class-weight array by the user to
@@ -659,6 +669,31 @@ class Model(Network):
       RuntimeError: If the model was never compiled.
     """
     # First, we build/compile the model on the fly if necessary.
+    if isinstance(x, dataset_ops.Dataset):
+      raise ValueError('You passed a `Dataset` instance to your model (%s), '
+                       'which is not supported. Instead, pass an `Iterator`, '
+                       'which you can obtain e.g. via '
+                       '`dataset.make_one_shot_iterator()` (the exact method '
+                       'to use will depend on your specific dataset).' % x)
+    if isinstance(x, iterator_ops.Iterator):
+      if y is not None:
+        raise ValueError('You passed a dataset iterator (%s) as input `x` to '
+                         'your model. In that case, you should not specify '
+                         'a target (`y`) argument, since the dataset iterator '
+                         'generates both input data and target data. '
+                         'Received: %s' % (x, y))
+      if not context.executing_eagerly():
+        x, y = x.get_next()
+        # TODO(fchollet): handle case of `get_next` not returning 2 tensors?
+      else:
+        # TODO(psv): implement this. The way to support it will be to typecheck
+        # for `iterator` before `_standardize_user_data` is called and redirect
+        # to new training/eval functions in `training_eager.py`. The model
+        # may need to get built using the specs of the data from the first batch
+        # drawn from the iterator.
+        raise ValueError('Dataset iterators are not supported '
+                         'with eager execution yet.')
+
     all_inputs = []
     if not self.built:
       # We need to use `x` to set the model inputs.
@@ -1016,22 +1051,26 @@ class Model(Network):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
-        x: Numpy array of training data (if the model has a single input),
-            or list of Numpy arrays (if the model has multiple inputs).
-            If input layers in the model are named, you can also pass a
-            dictionary mapping input names to Numpy arrays.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data
-            (if the model has a single output),
-            or list of Numpy arrays (if the model has multiple outputs).
-            If output layers in the model are named, you can also pass a
-            dictionary mapping output names to Numpy arrays.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         epochs: Integer. Number of epochs to train the model.
             An epoch is an iteration over the entire `x` and `y`
             data provided.
@@ -1053,11 +1092,14 @@ class Model(Network):
             on this data at the end of each epoch.
             The validation data is selected from the last samples
             in the `x` and `y` data provided, before shuffling.
-        validation_data: tuple `(x_val, y_val)` or tuple
-            `(x_val, y_val, val_sample_weights)` on which to evaluate
+        validation_data: Data on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
             `validation_data` will override `validation_split`.
+            `validation_data` could be:
+              - tuple `(x_val, y_val)` of Numpy arrays or tensors
+              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
+              - dataset iterator
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -1134,17 +1176,22 @@ class Model(Network):
         batch_size=batch_size)
     # Prepare validation data.
     if validation_data:
-      if len(validation_data) == 2:
+      if isinstance(validation_data, iterator_ops.Iterator):
+        val_x = validation_data
+        val_y = None
+        val_sample_weight = None
+      elif len(validation_data) == 2:
         val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
         val_sample_weight = None
       elif len(validation_data) == 3:
         val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
       else:
         raise ValueError(
-            'When passing validation_data, '
-            'it must contain 2 (x_val, y_val) '
-            'or 3 (x_val, y_val, val_sample_weights) '
-            'items, however it contains %d items' % len(validation_data))
+            'When passing a `validation_data` argument, '
+            'it must contain either 2 items (x_val, y_val), '
+            'or 3 items (x_val, y_val, val_sample_weights), '
+            'or alternatively it could be a dataset iterator. However we '
+            'received `validation_data=%s`' % validation_data)
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
@@ -1218,22 +1265,26 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: Numpy array of test data (if the model has a single input),
-            or list of Numpy arrays (if the model has multiple inputs).
-            If input layers in the model are named, you can also pass a
-            dictionary mapping input names to Numpy arrays.
-            `x` can be `None` (default) if feeding from
-            TensorFlow data tensors.
-        y: Numpy array of target (label) data
-            (if the model has a single output),
-            or list of Numpy arrays (if the model has multiple outputs).
-            If output layers in the model are named, you can also pass a
-            dictionary mapping output names to Numpy arrays.
-            `y` can be `None` (default) if feeding from
-            TensorFlow data tensors.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         batch_size: Integer or `None`.
-            Number of samples per evaluation step.
+            Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         verbose: 0 or 1. Verbosity mode.
             0 = silent, 1 = progress bar.
         sample_weight: Optional Numpy array of weights for
@@ -1291,9 +1342,13 @@ class Model(Network):
     Computation is done in batches.
 
     Arguments:
-        x: The input data, as a Numpy array
-            (or list of Numpy arrays if the model has multiple outputs).
-        batch_size: Integer. If unspecified, it will default to 32.
+        x: Input samples, as Numpy array(s) or tensor(s).
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors or dataset iterators (since they generate
+            batches).
         verbose: Verbosity mode, 0 or 1.
         steps: Total number of steps (batches of samples)
             before declaring the prediction round finished.
@@ -1324,20 +1379,24 @@ class Model(Network):
       return training_arrays.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
 
-  def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
+  def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None):
     """Runs a single gradient update on a single batch of data.
 
     Arguments:
-        x: Numpy array of training data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
             weights to apply to the model's loss for each sample.
             In the case of temporal data, you can pass a 2D array
@@ -1384,20 +1443,24 @@ class Model(Network):
       return outputs[0]
     return outputs
 
-  def test_on_batch(self, x, y, sample_weight=None):
+  def test_on_batch(self, x, y=None, sample_weight=None):
     """Test the model on a single batch of samples.
 
     Arguments:
-        x: Numpy array of test data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset iterator,
+          `y` should not be specified
+          (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
             weights to apply to the model's loss for each sample.
             In the case of temporal data, you can pass a 2D array
@@ -1437,7 +1500,7 @@ class Model(Network):
     """Returns predictions for a single batch of samples.
 
     Arguments:
-        x: Input samples, as a Numpy array.
+        x: Input samples, as Numpy array(s) or tensor(s).
 
     Returns:
         Numpy array(s) of predictions.
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
index 18116e3a14..4164cae864 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py
@@ -23,6 +23,7 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.framework import errors
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras.engine import training_utils
@@ -30,6 +31,7 @@ from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.platform import tf_logging as logging
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -190,7 +192,15 @@ def fit_loop(model,
         batch_logs['batch'] = step_index
         batch_logs['size'] = 1
         callbacks.on_batch_begin(step_index, batch_logs)
-        outs = f(ins)
+        try:
+          outs = f(ins)
+        except errors.OutOfRangeError:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          steps_per_epoch * epochs)
+          break
 
         if not isinstance(outs, list):
           outs = [outs]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index d9281436de..58011a1412 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -23,6 +23,7 @@ import unittest
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
@@ -31,9 +32,9 @@ from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_m
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
-
 try:
   import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
 except ImportError:
@@ -1711,14 +1712,77 @@ class TestTrainingWithDataTensors(test.TestCase):
                               'dropout_acc']
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
 
+class TestTrainingWithDatasetIterators(test.TestCase):
+
+  def test_training_and_eval_methods_on_iterators_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(iterator, steps=2, verbose=0)
+      model.predict(iterator, steps=2)
+      model.train_on_batch(iterator)
+      model.test_on_batch(iterator)
+      # Test with validation data
+      model.fit(iterator,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=iterator, validation_steps=2)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(iterator,
+                  epochs=1, steps_per_epoch=2, verbose=0,
+                  validation_split=0.5, validation_steps=2)
+
+      # Test invalid usage
+      with self.assertRaisesRegexp(ValueError,
+                                   'Instead, pass an `Iterator`'):
+        model.fit(dataset,
+                  epochs=1, steps_per_epoch=2, verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'you should not specify a target'):
+        model.fit(iterator, iterator,
+                  epochs=1, steps_per_epoch=2, verbose=0)
+
+  def test_iterators_running_out_of_data(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(2)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+
+      with test.mock.patch.object(logging, 'warning') as mock_log:
+        model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'dataset iterator ran out of data')
+
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7713d78b8a..cdf2da712f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -251,7 +251,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +263,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 69b81f75fa..5c2c29e60f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -268,7 +268,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +280,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 3ac285681f..b3f3f16922 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -251,7 +251,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +263,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 51ba0c5043..4ac6811bac 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -268,7 +268,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +280,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
-- 
GitLab


From a186c4c093fce7e3fcc8cd59ca0e968324311f09 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 12:32:52 -0700
Subject: [PATCH 1138/1262] Fix bug in ring_reducer.cc abort handling.

PiperOrigin-RevId: 193557334
---
 .../core/common_runtime/ring_reducer.cc       | 20 ++++++++++---------
 .../core/common_runtime/ring_reducer_test.cc  | 12 +++++------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 79d03a24ce..a1cd762505 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -426,17 +426,20 @@ bool RingReducer::RunAsyncParts() {
     // is done.
     bool dispatched = false;  // true if async action was initiated
     do {
-      if (aborted) break;
+      if (aborted) {
+        // Requeue this RingField to be counted off below.
+        ready_queue.Enqueue(rf);
+        break;
+      }
       switch (rf->action) {
         case RF_INIT:
           if (rf->do_recv) {
             rf->action = RF_RECV;
             auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
-              if (!s.ok()) {
-                aborted = true;
-                StartAbort(s);
-              }
+              const bool bad_status = !s.ok();
+              if (bad_status) aborted = true;
               ready_queue.Enqueue(rf);
+              if (bad_status) StartAbort(s);
             };
             DispatchRecv(rf, requeue);
             dispatched = true;
@@ -481,11 +484,10 @@ bool RingReducer::RunAsyncParts() {
           if (rf->do_send) {
             rf->action = RF_SEND;
             auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
-              if (!s.ok()) {
-                aborted = true;
-                StartAbort(s);
-              }
+              const bool bad_status = !s.ok();
+              if (bad_status) aborted = true;
               ready_queue.Enqueue(rf);
+              if (bad_status) StartAbort(s);
             };
             DispatchSend(rf, send_complete);
             dispatched = true;
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 57c36d6582..e4387a074a 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -572,9 +572,9 @@ DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
 DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
 
-// // Failure tests
-// DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
-// DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
 #endif
 
 #ifdef GOOGLE_CUDA
@@ -597,9 +597,9 @@ DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
 // DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
 DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
 
-// // Failure tests
-// DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
-// DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
 #endif
 
 }  // namespace
-- 
GitLab


From 46aec0d27f5d6fb3a0b81bc5a3384da11273dad6 Mon Sep 17 00:00:00 2001
From: Sung Jin Hwang <sjhwang@google.com>
Date: Thu, 19 Apr 2018 12:44:21 -0700
Subject: [PATCH 1139/1262] Make PmfToQuantizedCdf op to make adjustments if
 the sum of quantized pmf is less than 2**precision.

Prior to the change, the op did nothing when the sum of quantized pmf was less
than 2**precision. While the produced CDF was valid for range coders,
adjustments to CDF could be made to achieve better compression rate.

PiperOrigin-RevId: 193558740
---
 .../contrib/coder/kernels/pmf_to_cdf_op.cc    | 60 ++++++++++++++++---
 .../coder/kernels/pmf_to_cdf_op_test.cc       |  6 +-
 tensorflow/contrib/coder/ops/coder_ops.cc     | 16 +++--
 3 files changed, 64 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
index c787e8eded..bd5272ee6f 100644
--- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include <algorithm>
+#include <functional>
 #include <iterator>
 #include <numeric>
 #include <vector>
@@ -79,8 +80,8 @@ class PmfToCdfOp : public OpKernel {
   }
 
  private:
-  struct Item {
-    Item(int32* p, double mass) : pointer(p), mass(mass) {
+  struct PenaltyItem {
+    PenaltyItem(int32* p, double mass) : pointer(p), mass(mass) {
       penalty = ComputeNextPenalty();
     }
 
@@ -90,7 +91,7 @@ class PmfToCdfOp : public OpKernel {
       penalty = ComputeNextPenalty();
     }
 
-    friend bool operator<(const Item& lhs, const Item& rhs) {
+    friend bool operator<(const PenaltyItem& lhs, const PenaltyItem& rhs) {
       return lhs.penalty < rhs.penalty;
     }
 
@@ -106,6 +107,34 @@ class PmfToCdfOp : public OpKernel {
     double penalty;
   };
 
+  struct GainItem {
+    GainItem(int32* p, double mass) : pointer(p), mass(mass) {
+      gain = ComputeNextGain();
+    }
+
+    void Increase() {
+      CHECK_GT(*pointer, 0);
+      ++*pointer;
+      gain = ComputeNextGain();
+    }
+
+    friend bool operator>(const GainItem& lhs, const GainItem& rhs) {
+      return lhs.gain > rhs.gain;
+    }
+
+    double ComputeNextGain() {
+      // Never increment zero value to non-zero value.
+      if (*pointer < 1) {
+        return -std::numeric_limits<double>::infinity();
+      }
+      return mass * (std::log2(*pointer + 1) - std::log2(*pointer));
+    }
+
+    int32* pointer;
+    double mass;
+    double gain;
+  };
+
   void PerShard(gtl::ArraySlice<float> pmf,
                 gtl::MutableArraySlice<int32> cdf) const {
     CHECK_EQ(pmf.size(), cdf.size());
@@ -121,7 +150,7 @@ class PmfToCdfOp : public OpKernel {
 
     int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0);
     if (sum > normalizer) {
-      std::vector<Item> queue;
+      std::vector<PenaltyItem> queue;
       queue.reserve(cdf.size());
       for (int i = 0; i < cdf.size(); ++i) {
         queue.emplace_back(&cdf[i], pmf[i]);
@@ -132,9 +161,26 @@ class PmfToCdfOp : public OpKernel {
         queue[0].Decrease();
         // Performs a linear search because this find_if is likely to return
         // iterator very close to the begin.
-        auto iter =
-            std::find_if(std::next(queue.begin()), queue.end(),
-                         [&queue](const Item& rhs) { return queue[0] < rhs; });
+        auto iter = std::find_if(
+            std::next(queue.begin()), queue.end(),
+            [&queue](const PenaltyItem& rhs) { return queue[0] < rhs; });
+        std::rotate(queue.begin(), std::next(queue.begin()), iter);
+      }
+    } else if (sum < normalizer) {
+      std::vector<GainItem> queue;
+      queue.reserve(cdf.size());
+      for (int i = 0; i < cdf.size(); ++i) {
+        queue.emplace_back(&cdf[i], pmf[i]);
+      }
+
+      std::sort(queue.begin(), queue.end(), std::greater<GainItem>());
+      while (sum++ < normalizer) {
+        queue[0].Increase();
+        // Performs a linear search because this find_if is likely to return
+        // iterator very close to the begin.
+        auto iter = std::find_if(
+            std::next(queue.begin()), queue.end(),
+            [&queue](const GainItem& rhs) { return queue[0] > rhs; });
         std::rotate(queue.begin(), std::next(queue.begin()), iter);
       }
     }
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
index c70e38faab..3408f6b519 100644
--- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
@@ -82,7 +82,7 @@ class PmfToQuantizedCdfOpTest : public OpsTestBase {
         EXPECT_GT(diff, 0);
       }
 
-      EXPECT_LE(cdf_slice(cdf_slice.size() - 1), normalizer);
+      EXPECT_EQ(cdf_slice(cdf_slice.size() - 1), normalizer);
     }
   }
 };
@@ -98,6 +98,8 @@ TEST_F(PmfToQuantizedCdfOpTest, UnderSum) {
     GenerateData(&rand, {&matrix(i, 0), n});
   }
 
+  pmf.flat<float>() = pmf.flat<float>() * 0.85f;
+
   constexpr int kPrecision = 10;
   SetupOp(kPrecision, &pmf);
   TF_ASSERT_OK(RunOpKernel());
@@ -115,7 +117,7 @@ TEST_F(PmfToQuantizedCdfOpTest, OverSum) {
   matrix.setZero();
   const std::size_t n = matrix.dimension(1) / 2;
 
-  random::PhiloxRandom gen;
+  random::PhiloxRandom gen(random::New64(), random::New64());
   random::SimplePhilox rand(&gen);
   for (int64 i = 0; i < matrix.dimension(0); ++i) {
     GenerateData(&rand, {&matrix(i, 0), n});
diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc
index 9bb171298f..a185e07913 100644
--- a/tensorflow/contrib/coder/ops/coder_ops.cc
+++ b/tensorflow/contrib/coder/ops/coder_ops.cc
@@ -77,7 +77,7 @@ are incorrect. For this reason, the range coder uses integer arithmetics and
 avoids using any floating point operations internally, and `cdf` should contain
 integers representing quantized probability mass rather than floating points. 
 
-data: An int32 tensor.
+data: An int16 tensor.
 cdf: An int32 tensor representing the CDF's of `data`. Each integer is divided
   by `2^precision` to represent a fraction.
 encoded: A range-coded scalar string.
@@ -112,7 +112,7 @@ potential performance issues, the decoder does not return error status.
 encoded: A scalar string tensor from RangeEncode.
 shape: An int32 1-D tensor representing the shape of the data encoded by
   RangeEncode.
-decoded: An int32 tensor with shape equal to `shape`.
+decoded: An int16 tensor with shape equal to `shape`.
 precision: The number of bits for probability quantization. Must be <= 16, and
   must match the precision used by RangeEncode that produced `encoded`.
 )doc");
@@ -138,14 +138,12 @@ platforms. For entropy encoders and decoders to have the same quantized CDF on
 different platforms, the quantized CDF should be produced once and saved, then
 the saved quantized CDF should be used everywhere.
 
-After quantization, if PMF sums to less than or equal to 2^precision, then this
-is equivalent to cumsum over the last dimension. This op makes no effort to make
-the sum close to 2^precision when the sum is already <= 2^precision.
+After quantization, if PMF does not sum to 2^precision, then some values of PMF
+are increased or decreased to adjust the sum to equal to 2^precision.
 
-After quantization, if PMF sums to greater than 2^precision, then some values of
-PMF is decreased to keep the sum no more than 2^precision.
-
-Note that the input PMF is pre-quantization.
+Note that the input PMF is pre-quantization. The input PMF is not normalized
+by this op prior to quantization. Therefore the user is responsible for
+normalizing PMF if necessary.
 )doc");
 // clang-format on
 }  // namespace tensorflow
-- 
GitLab


From b3118b1f741896585d47184018f1d74d70e0e6c7 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:08:37 -0700
Subject: [PATCH 1140/1262] Update adam.py

---
 tensorflow/contrib/optimizer_v2/adam.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 76a867039a..d538ad0fb0 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,19 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
     Initialization:
 
-    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
-    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
-    $$t \Leftarrow 0 (Initialize timestep)$$
+    $$m_0 := 0 (Initialize initial 1st moment vector)$$
+    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
+    $$t := 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t \Leftarrow t + 1$$
-    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t := t + 1$$
+    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From 58f6760373b7a2d71053bd17b8017e57e5d1195d Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:09:24 -0700
Subject: [PATCH 1141/1262] Update api_def_ApplyAdam.pbtxt

---
 tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
index fca8ba2530..b90f5473c8 100644
--- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -82,9 +82,9 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 END
 }
-- 
GitLab


From 3c49ae705fc8dc65c34021bc616218e7bae5d625 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:09:59 -0700
Subject: [PATCH 1142/1262] Update api_def_ResourceApplyAdam.pbtxt

---
 .../core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 8b16d824bf..743247bb60 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -76,8 +76,8 @@ END
   }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: <<END
-$$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
-$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 }
-- 
GitLab


From 391626d76f6311219d4b78b5515934cbd0dd0c6d Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 19 Apr 2018 13:11:04 -0700
Subject: [PATCH 1143/1262] Update adam.py

---
 tensorflow/python/training/adam.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 9f523a3aca..6fa3ff6658 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,19 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Initialization:
 
-    $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$
-    $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$
-    $$t \Leftarrow 0 (Initialize timestep)$$
+    $$m_0 := 0 (Initialize initial 1st moment vector)$$
+    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
+    $$t := 0 (Initialize timestep)$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
-    $$t \Leftarrow t + 1$$
-    $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$t := t + 1$$
+    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 
-    $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-- 
GitLab


From b6686d2808b40ed985db2151bcf31961b53e49f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 13:09:07 -0700
Subject: [PATCH 1144/1262] Collective Ops Part 4

Add Broadcaster.
A few minor adjustments to CollectiveParams and RMA.

This change is part of a series of changes introducing infrastructure
for collective ops and initial implementations of reduction and broadcast.

PiperOrigin-RevId: 193562391
---
 tensorflow/core/BUILD                         |  30 +
 .../base_collective_executor.cc               |  81 +-
 .../common_runtime/base_collective_executor.h |   7 +
 tensorflow/core/common_runtime/broadcaster.cc | 249 ++++++
 tensorflow/core/common_runtime/broadcaster.h  |  66 ++
 .../core/common_runtime/broadcaster_test.cc   | 741 ++++++++++++++++++
 .../collective_param_resolver_local.cc        |  42 +-
 .../collective_param_resolver_local_test.cc   |   8 +-
 .../common_runtime/collective_rma_local.h     |   2 +
 tensorflow/core/framework/collective.cc       |  15 +-
 tensorflow/core/framework/collective.h        |   7 +-
 11 files changed, 1220 insertions(+), 28 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/broadcaster.cc
 create mode 100644 tensorflow/core/common_runtime/broadcaster.h
 create mode 100644 tensorflow/core/common_runtime/broadcaster_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 54e7ab31d7..c15e7de186 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2256,6 +2256,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
     "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/broadcaster.h",
     "common_runtime/buf_rendezvous.h",
     "common_runtime/build_graph_options.h",
     "common_runtime/collective_executor_mgr.h",
@@ -2303,6 +2304,7 @@ tf_cuda_library(
         "common_runtime/allocator_retry.cc",
         "common_runtime/base_collective_executor.cc",
         "common_runtime/bfc_allocator.cc",
+        "common_runtime/broadcaster.cc",
         "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
         "common_runtime/collective_executor_mgr.cc",
@@ -3140,6 +3142,34 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "broadcaster_test",
+    size = "small",
+    srcs = [
+        "common_runtime/broadcaster_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index f6332fabdb..637b43c844 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -14,14 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/broadcaster.h"
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/ring_reducer.h"
 #include "tensorflow/core/lib/core/notification.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 #define VALUE_IN_DEBUG_STRING false
 
@@ -194,37 +193,68 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
                                           const CollectiveParams& col_params,
                                           const string& exec_key,
                                           StatusCallback done) {
-  const Tensor* input = &ctx->input(0);
+  // On any individual collective Op failure we need to abort the
+  // BufRendezvous so that other Ops in the instance don't hang
+  // waiting for transmissions that will never happen.  Do so after a
+  // delay so that the original error status is more likely to
+  // propagate up, and peers are unlikely to re-create the purged
+  // BufRendezvous by late-arriving requests.
+  StatusCallback done_safe = [this, done](const Status& s) {
+    if (!s.ok()) {
+      Ref();  // Ensure this lasts until the closure executes.
+      SchedNonBlockingClosureAfter(1000000, [this, s] {
+        remote_access_->buf_rendezvous()->StartAbort(s);
+        Unref();
+      });
+    }
+    done(s);
+  };
+
   Tensor* output = ctx->mutable_output(0);
   string error;
   switch (col_params.instance.type) {
     case REDUCTION_COLLECTIVE: {
       // TODO(tucker): support other reduction algorithms,
       // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc.
+      const Tensor* input = &ctx->input(0);
       RingReducer* reducer =
           CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_,
                         input, output, &error);
       if (!reducer) {
-        done(errors::Internal(error));
+        done_safe(errors::Internal(error));
         return;
       }
       // Run in an I/O thread, so as not to starve the executor threads.
       // TODO(tucker): Instead of forking every per-device Collective
       // Op off into its own thread, consider queuing them on a
       // fixed-size thread-pool dedicated to running CollectiveOps.
-      SchedClosure([reducer, done]() {
-        reducer->Run([reducer, done](const Status& s) {
-          done(s);
+      SchedClosure([reducer, done_safe]() {
+        reducer->Run([reducer, done_safe](const Status& s) {
+          done_safe(s);
           delete reducer;
         });
       });
     } break;
-    case BROADCAST_COLLECTIVE:
-      done(errors::Internal("Collective Broadcast unimplemented"));
-      break;
+
+    case BROADCAST_COLLECTIVE: {
+      Broadcaster* broadcaster = CreateBroadcaster(
+          ctx, CtxParams(ctx), col_params, exec_key, step_id_, output, &error);
+      if (!broadcaster) {
+        done_safe(errors::Internal(error));
+        return;
+      }
+      // Run in an I/O thread, so as not to starve the executor threads.
+      SchedClosure([broadcaster, done_safe]() {
+        broadcaster->Run([broadcaster, done_safe](const Status& s) {
+          done_safe(s);
+          delete broadcaster;
+        });
+      });
+    } break;
+
     default:
-      done(errors::Internal("Unimplemented CollectiveType ",
-                            col_params.instance.type));
+      done_safe(errors::Internal("Unimplemented CollectiveType ",
+                                 col_params.instance.type));
   }
 }
 
@@ -254,4 +284,31 @@ RingReducer* BaseCollectiveExecutor::CreateReducer(
   }
 }
 
+Broadcaster* BaseCollectiveExecutor::CreateBroadcaster(
+    OpKernelContext* ctx, OpKernelContext::Params* params,
+    const CollectiveParams& col_params, const string& exec_key, int64 step_id,
+    Tensor* output, string* error) {
+  switch (col_params.instance.data_type) {
+    case DT_INT32:
+      if (col_params.group.device_type == DEVICE_GPU) {
+        *error =
+            "Collective Broadcast does not support datatype DT_INT32 on "
+            "DEVICE_GPU";
+        return nullptr;
+      }
+      TF_FALLTHROUGH_INTENDED;
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT64: {
+      return new Broadcaster(this, dev_mgr_, ctx, params, col_params, exec_key,
+                             step_id, output);
+    } break;
+    default:
+      *error =
+          strings::StrCat("Collective Broadcast does not support datatype ",
+                          DataTypeString(col_params.instance.data_type));
+      return nullptr;
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index 58eaf31f71..462d6b7533 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_attributes.pb.h"
 
 namespace tensorflow {
+class Broadcaster;
 class DeviceMgr;
 class RingReducer;
 
@@ -138,6 +139,12 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
                              const string& exec_key, int64 step_id,
                              const Tensor* input, Tensor* output,
                              string* error);
+
+  Broadcaster* CreateBroadcaster(OpKernelContext* ctx,
+                                 OpKernelContext::Params* params,
+                                 const CollectiveParams& col_params,
+                                 const string& exec_key, int64 step_id,
+                                 Tensor* output, string* error);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc
new file mode 100644
index 0000000000..5e8af8653d
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.cc
@@ -0,0 +1,249 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/env.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+
+namespace tensorflow {
+
+namespace {
+// Key to be used for BufRendezvous by Broadcaster.
+string BroadcastBufKey(const string& exec_key, int src_rank, int dst_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("broadcast(", exec_key, "):src(", src_rank, "):dst(",
+                           dst_rank, ")");
+  } else {
+    // TODO(tucker): Try a denser format, e.g. a 64 or 128 bit hash.
+    return strings::StrCat(exec_key, ":", src_rank, ":", dst_rank);
+  }
+}
+}  // namespace
+
+Broadcaster::Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx, OpKernelContext::Params* params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      rank_(col_params.subdiv_rank[0]),
+      is_source_(col_params.is_source),
+      output_(output),
+      done_(nullptr),
+      device_(nullptr) {}
+
+void Broadcaster::Run(StatusCallback done) {
+  // The optimal data transfer choreography is going to very platform dependent.
+  // That will be addressed by later improvements here or by platform-specific
+  // overrides of collective broadcast. The initial version is simply
+  // a binary tree that completely ignores DeviceLocality.
+  done_ = std::move(done);
+
+  // Get the device for which we're executing and look up its locality.
+  status_ = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status_.ok()) {
+    done_(status_);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  RunTree();
+}
+
+// Binary tree parent/child relations are trivial to calculate, i.e.
+// device at rank r is the parent of 2r+1 and 2r+2.  The one exception
+// is if the source is not rank 0.  We treat that case as though the
+// source is appended to the front of the rank ordering as well as
+// continuing to occupy its current position.  Hence we calculate as
+// though each device's rank is actually r+1, then subtract 1 again to
+// get the descendent ranks.  If the source is not rank 0 then its
+// decendents include both {0,1} and the descendents of its current
+// position.  Where a non-0-rank source is a descendent of another
+// device, no send to it is necessary.
+
+/* static*/
+int Broadcaster::TreeRecvFrom(const CollectiveParams& cp) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  if (cp.is_source) return -1;
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int my_rank = cp.subdiv_rank[0];
+  if (source_rank == 0) {
+    return (my_rank - 1) / 2;
+  } else {
+    int predecessor_rank = (my_rank / 2) - 1;
+    return (predecessor_rank < 0) ? source_rank : predecessor_rank;
+  }
+}
+
+/* static */
+void Broadcaster::TreeSendTo(const CollectiveParams& cp,
+                             std::vector<int>* targets) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  targets->clear();
+  int my_rank = cp.subdiv_rank[0];
+  DCHECK_EQ(1, cp.instance.impl_details.subdiv_source_rank.size());
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int successor_rank = 0;
+  if (source_rank == 0) {
+    successor_rank = (2 * my_rank) + 1;
+  } else {
+    successor_rank = (2 * (my_rank + 1));
+  }
+  DCHECK_NE(successor_rank, my_rank);
+  if (cp.is_source && source_rank != 0) {
+    // The source sends to rank 0,1 in addition to its positional
+    // decendents.
+    if (cp.group.group_size > 1) {
+      targets->push_back(0);
+    }
+    if (cp.group.group_size > 2 && source_rank != 1) {
+      targets->push_back(1);
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (successor_rank < cp.group.group_size && successor_rank != source_rank) {
+      targets->push_back(successor_rank);
+    }
+    ++successor_rank;
+  }
+}
+
+// Execute a tree broadcast, i.e. each non-source device receives from
+// one other and sends to up-to two others.
+void Broadcaster::RunTree() {
+  mutex mu;
+  int pending_count = 0;  // GUARDED_BY(mu)
+  condition_variable all_done;
+  std::vector<int> send_to_ranks;
+  TreeSendTo(col_params_, &send_to_ranks);
+
+  if (!is_source_) {
+    // Begin by receiving the value.
+    int recv_from_rank = TreeRecvFrom(col_params_);
+    Notification note;
+    DispatchRecv(recv_from_rank, output_,
+                 [this, recv_from_rank, &mu, &note](const Status& s) {
+                   mutex_lock l(mu);
+                   status_.Update(s);
+                   note.Notify();
+                 });
+    note.WaitForNotification();
+  }
+
+  // Then forward value to all descendent devices.
+  if (status_.ok()) {
+    for (int i = 0; i < send_to_ranks.size(); ++i) {
+      int target_rank = send_to_ranks[i];
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DispatchSend(
+          target_rank, output_,
+          [this, target_rank, &mu, &pending_count, &all_done](const Status& s) {
+            status_.Update(s);
+            {
+              mutex_lock l(mu);
+              --pending_count;
+              if (pending_count == 0) {
+                all_done.notify_all();
+              }
+            }
+          });
+    }
+  }
+
+  if (status_.ok() && is_source_) {
+    // Meanwhile, copy input to output if we weren't lucky enough to
+    // be able to reuse input as output.
+    const Tensor* input = &ctx_->input(0);
+    if (input != output_ &&
+        (DMAHelper::base(input) != DMAHelper::base(output_))) {
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      CollectiveRemoteAccessLocal::MemCpyAsync(
+          op_dev_ctx, op_dev_ctx, device_, device_, ctx_->input_alloc_attr(0),
+          ctx_->output_alloc_attr(0), input, output_,
+          [this, &mu, &pending_count, &all_done](const Status& s) {
+            status_.Update(s);
+            {
+              mutex_lock l(mu);
+              --pending_count;
+              if (0 == pending_count) {
+                all_done.notify_all();
+              }
+            }
+          });
+    }
+  }
+
+  // Then wait for all pending actions to complete.
+  {
+    mutex_lock l(mu);
+    if (pending_count > 0) {
+      all_done.wait(l);
+    }
+  }
+
+  VLOG(2) << "return status " << status_;
+  done_(status_);
+}
+
+void Broadcaster::DispatchSend(int dst_rank, const Tensor* src_tensor,
+                               const StatusCallback& done) {
+  string send_buf_key = BroadcastBufKey(exec_key_, rank_, dst_rank);
+  VLOG(1) << "DispatchSend " << send_buf_key << " from_device "
+          << device_->name();
+  int dst_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][dst_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[dst_idx],
+                        col_params_.instance.task_names[dst_idx], send_buf_key,
+                        device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), src_tensor,
+                        device_locality_, done);
+}
+
+void Broadcaster::DispatchRecv(int src_rank, Tensor* dst_tensor,
+                               const StatusCallback& done) {
+  string recv_buf_key = BroadcastBufKey(exec_key_, src_rank, rank_);
+  int src_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][src_rank];
+  VLOG(1) << "DispatchRecv " << recv_buf_key << " from_device "
+          << col_params_.instance.device_names[src_idx];
+  int dst_idx = col_params_.instance.impl_details.subdiv_permutations[0][rank_];
+  CHECK_EQ(col_params_.instance.device_names[dst_idx], device_->name());
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[src_idx],
+                          col_params_.instance.task_names[src_idx],
+                          col_params_.task.is_local[src_idx], recv_buf_key,
+                          device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/broadcaster.h b/tensorflow/core/common_runtime/broadcaster.h
new file mode 100644
index 0000000000..bdf68f19ab
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+
+#include <vector>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+
+// Tree-algorithm implementation of collective broadcast.
+class Broadcaster {
+ public:
+  Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, Tensor* output);
+
+  void Run(StatusCallback done);
+
+  // Returns the rank of the device from which this device should receive
+  // its value, -1 if no value should be received.
+  static int TreeRecvFrom(const CollectiveParams& cp);
+
+  // Populates targets with the ranks of the devices to which this device
+  // should forward the value.
+  static void TreeSendTo(const CollectiveParams& cp, std::vector<int>* targets);
+
+ private:
+  void DispatchSend(int dst_rank, const Tensor* src_tensor,
+                    const StatusCallback& done);
+  void DispatchRecv(int src_rank, Tensor* dst_tensor,
+                    const StatusCallback& done);
+  void RunTree();
+
+  Status status_;
+  CollectiveExecutor* col_exec_;  // Not owned
+  const DeviceMgr* dev_mgr_;      // Not owned
+  OpKernelContext* ctx_;          // Not owned
+  const CollectiveParams& col_params_;
+  const string exec_key_;
+  const int rank_;
+  const bool is_source_;
+  Tensor* output_;  // Not owned
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;
+  Device* device_;  // The device for which this instance labors
+  DeviceLocality device_locality_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
diff --git a/tensorflow/core/common_runtime/broadcaster_test.cc b/tensorflow/core/common_runtime/broadcaster_test.cc
new file mode 100644
index 0000000000..89d39144b3
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster_test.cc
@@ -0,0 +1,741 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+static int64 kStepId = 123;
+static int32 kNumSubdivs = 1;  // Subdiv not yet meaningful for broadcast
+
+// The test harness won't allow a mixture of fixture and non-fixture
+// tests in one file, so this is a trival fixture for tests that don't
+// need the heavy-weight BroadcasterTest fixture.
+class TrivialTest : public ::testing::Test {
+ protected:
+  TrivialTest() {}
+};
+
+// Tests of static TreeSendTo() and TreeRecvFrom() functions.
+// D = number of devices
+// S = source rank
+// R = tested rank
+// RF = receive-from rank
+// ST = send_to rank vector
+#define DEF_TL_TEST(D, S, R, RF, ST)                               \
+  TEST_F(TrivialTest, TreeLinks_##D##Devs_##S##Source_##R##Rank) { \
+    CollectiveParams cp;                                           \
+    cp.group.group_size = D;                                       \
+    cp.instance.impl_details.subdiv_source_rank = {S};             \
+    cp.subdiv_rank = {R};                                          \
+    cp.is_source = (S == R);                                       \
+    EXPECT_EQ(RF, Broadcaster::TreeRecvFrom(cp));                  \
+    std::vector<int> expected = ST;                                \
+    std::vector<int> send_to;                                      \
+    Broadcaster::TreeSendTo(cp, &send_to);                         \
+    ASSERT_EQ(expected.size(), send_to.size());                    \
+    for (int i = 0; i < expected.size(); ++i) {                    \
+      EXPECT_EQ(expected[i], send_to[i]);                          \
+    }                                                              \
+  }
+
+#define V(...) std::vector<int>({__VA_ARGS__})
+
+//          D  S  R  RF  ST
+// 2 device cases
+DEF_TL_TEST(2, 0, 0, -1, V(1))
+DEF_TL_TEST(2, 1, 0, 1, V())
+DEF_TL_TEST(2, 0, 1, 0, V())
+DEF_TL_TEST(2, 1, 1, -1, V(0))
+// 3 device cases
+DEF_TL_TEST(3, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(3, 0, 1, 0, V())
+DEF_TL_TEST(3, 0, 2, 0, V())
+DEF_TL_TEST(3, 1, 0, 1, V(2))
+DEF_TL_TEST(3, 1, 1, -1, V(0))
+DEF_TL_TEST(3, 1, 2, 0, V())
+DEF_TL_TEST(3, 2, 0, 2, V())
+DEF_TL_TEST(3, 2, 1, 2, V())
+DEF_TL_TEST(3, 2, 2, -1, V(0, 1))
+// 4 device cases
+DEF_TL_TEST(4, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(4, 0, 1, 0, V(3))
+DEF_TL_TEST(4, 0, 2, 0, V())
+DEF_TL_TEST(4, 0, 3, 1, V())
+DEF_TL_TEST(4, 1, 0, 1, V(2, 3))
+DEF_TL_TEST(4, 1, 1, -1, V(0))
+DEF_TL_TEST(4, 1, 2, 0, V())
+DEF_TL_TEST(4, 1, 3, 0, V())
+DEF_TL_TEST(4, 2, 0, 2, V(3))
+DEF_TL_TEST(4, 2, 1, 2, V())
+DEF_TL_TEST(4, 2, 2, -1, V(0, 1))
+DEF_TL_TEST(4, 2, 3, 0, V())
+DEF_TL_TEST(4, 3, 0, 3, V(2))
+DEF_TL_TEST(4, 3, 1, 3, V())
+DEF_TL_TEST(4, 3, 2, 0, V())
+DEF_TL_TEST(4, 3, 3, -1, V(0, 1))
+// 8 device cases
+//          D  S  R  RF  ST
+DEF_TL_TEST(8, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(8, 0, 1, 0, V(3, 4))
+DEF_TL_TEST(8, 0, 2, 0, V(5, 6))
+DEF_TL_TEST(8, 0, 3, 1, V(7))
+DEF_TL_TEST(8, 0, 4, 1, V())
+DEF_TL_TEST(8, 0, 5, 2, V())
+DEF_TL_TEST(8, 0, 6, 2, V())
+DEF_TL_TEST(8, 0, 7, 3, V())
+DEF_TL_TEST(8, 7, 0, 7, V(2, 3))
+DEF_TL_TEST(8, 7, 1, 7, V(4, 5))
+DEF_TL_TEST(8, 7, 2, 0, V(6))
+DEF_TL_TEST(8, 7, 3, 0, V())
+DEF_TL_TEST(8, 7, 4, 1, V())
+DEF_TL_TEST(8, 7, 5, 1, V())
+DEF_TL_TEST(8, 7, 6, 2, V())
+DEF_TL_TEST(8, 7, 7, -1, V(0, 1))
+#undef DEF_TL_TEST
+#undef V
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+// TODO(tucker): factor out of this file and ring_reducer_test.cc
+// into a single common source.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      auto error = errors::Internal("Deliberate failure");
+      LOG(INFO) << "triggering failure " << error;
+      SchedNonBlockingClosureAfter(
+          1000, [this, error] { buf_rendezvous()->StartAbort(error); });
+      done(error);
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+class BroadcasterTest : public ::testing::Test {
+ protected:
+  BroadcasterTest() : device_type_(DEVICE_CPU) {}
+
+  ~BroadcasterTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name = strings::StrCat("/job:worker/replica:0/task:", wi,
+                                            "/device:CPU:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    col_params_.instance.data_type = dtype;
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    col_params_.instance.impl_details.subdiv_permutations.resize(kNumSubdivs);
+    col_params_.subdiv_rank.resize(kNumSubdivs);
+    int subdiv_stride = num_devices / kNumSubdivs;
+    for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    broadcast_dev_id_ = local_ring_order[0];
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/device:CPU:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name = strings::StrCat(task_name, "/device:GPU:0");
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  typedef std::function<void(Tensor*)> InitFunc;
+
+  void Broadcast() {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoBroadcast();
+        ++done;
+      });
+    }
+    while (done < instances_.size()) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                      const DeviceType& device_type,
+                                      DeviceBase* device) {
+    Status status;
+    std::unique_ptr<OpKernel> k = CreateOpKernel(
+        device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+        TF_GRAPH_DEF_VERSION, &status);
+    if (!status.ok()) {
+      LOG(FATAL) << status;
+    }
+    return k;
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastSend(
+      const CollectiveParams& params, Tensor* input,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_send_", bcast_send_counter_++),
+        "CollectiveBcastSend");
+    TF_CHECK_OK(builder.Attr("T", input->dtype())
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", input->shape())
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastRecv(
+      const CollectiveParams& params, const TensorShape& shape,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_recv_", bcast_recv_counter_++),
+        "CollectiveBcastRecv");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", shape)
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void BuildColParams() {}
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int tensor_len, int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, fail_after);
+
+    // Initialize each instance tensor with distinct values.
+    for (int di = 0; di < instances_.size(); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [di, dtype](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
+              float value = pow(10, static_cast<double>(di)) * i;
+              t->flat<T>()(i) = value;
+            }
+          });
+    }
+
+    // Copy the expected value from the broadcast source tensor
+    std::vector<T> expected(tensor_len, 0.0);
+    const CollectiveParams& cp = instances_[0]->col_params_;
+    int broadcast_dev_id =
+        cp.instance.impl_details.subdiv_permutations
+            [0][cp.instance.impl_details.subdiv_source_rank[0]];
+    const Tensor* t = &instances_[broadcast_dev_id]->tensor_;
+    Tensor cpu_copy(dtype, TensorShape({tensor_len}));
+    if (device_type == DEVICE_GPU) {
+      Notification notification;
+      Device* dev = instances_[broadcast_dev_id]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      CHECK(dev_info);
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          t, "" /*tensor_name*/, dev, &cpu_copy,
+          [this, &notification](Status s) {
+            TF_CHECK_OK(s);
+            notification.Notify();
+          });
+      notification.WaitForNotification();
+      t = &cpu_copy;
+    }
+    for (size_t i = 0; i < t->NumElements(); ++i) {
+      expected[i] = t->flat<T>()(i);
+    }
+
+    Broadcast();
+
+    // At this point all of the ops have terminated.
+    for (int di = 0; di < instances_.size(); ++di) {
+      if (!instances_[di]->status_.ok()) {
+        ASSERT_GT(fail_after, 0);
+        ASSERT_EQ(instances_[di]->status_.error_message(),
+                  "Deliberate failure");
+        mutex_lock l(mu_);
+        ++failure_count_;
+        continue;
+      }
+      Tensor* inst = &instances_[di]->tensor_;
+      Tensor actual(dtype, TensorShape({tensor_len}));
+      if (device_type_ == DEVICE_CPU) {
+        CHECK(actual.CopyFrom(*inst, inst->shape()));
+      } else if (device_type_ == DEVICE_GPU) {
+        Notification notification;
+        Device* dev = instances_[di]->device_;
+        auto* dev_info = dev->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyDeviceTensorToCPU(
+            inst, "" /*tensor_name*/, dev, &actual,
+            [this, &notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      }
+      for (int i = 0; i < tensor_len; ++i) {
+        switch (dtype) {
+          case DT_FLOAT:
+            EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_DOUBLE:
+            EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_INT32:
+          case DT_INT64:
+            EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          default:
+            LOG(FATAL) << "unimplemented";
+        }
+      }
+    }
+
+    // Note that the order of operations during broadcast is
+    // non-deterministic and unlike the reduce case some Ops in the
+    // instance may succeed while others fail, even if a transmission
+    // failure occurs early in the operation chain.  So, when an abort
+    // is specified we need to verify that at least one Op fails with
+    // the expected status and any Op that succeeds yeilds the correct
+    // value.
+    if (fail_after > 0) {
+      mutex_lock l(mu_);
+      EXPECT_GT(failure_count_, 0);
+    }
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, BroadcasterTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_));
+      col_params_.name = parent_->col_params_.name;
+      col_params_.instance.data_type = parent_->col_params_.instance.data_type;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.instance.instance_key =
+          parent_->col_params_.instance.instance_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance.device_names =
+          parent_->col_params_.instance.device_names;
+      col_params_.instance.task_names =
+          parent_->col_params_.instance.task_names;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.instance.impl_details.subdiv_permutations =
+          parent_->col_params_.instance.impl_details.subdiv_permutations;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size, col_params_.instance.device_names.size());
+      // Default rank is order in device_names.
+      col_params_.default_rank = rank;
+      // perm_rank is order in subdiv[0]:
+      int perm_rank = -1;
+      for (int i = 0;
+           i < col_params_.instance.impl_details.subdiv_permutations[0].size();
+           ++i) {
+        if (rank ==
+            col_params_.instance.impl_details.subdiv_permutations[0][i]) {
+          perm_rank = i;
+          break;
+        }
+      }
+      CHECK_GE(perm_rank, 0);
+      col_params_.instance.impl_details.subdiv_source_rank.resize(1, 0);
+      col_params_.is_source =
+          (perm_rank ==
+           col_params_.instance.impl_details.subdiv_source_rank[0]);
+      // Set rank in all subdivs by finding that default_rank.
+      for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+        for (int r = 0;
+             r <
+             col_params_.instance.impl_details.subdiv_permutations[sdi].size();
+             ++r) {
+          if (col_params_.default_rank ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            CHECK_EQ(0, sdi);
+            CHECK_EQ(perm_rank, col_params_.subdiv_rank[sdi]);
+            break;
+          }
+        }
+      }
+      CHECK_EQ(group_size, col_params_.task.is_local.size());
+      CHECK_EQ(group_size, col_params_.instance.task_names.size());
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const InitFunc& f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        f(&tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        f(&cpu_tensor);
+        Notification notification;
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [this, &notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoBroadcast() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = parent_->step_id_;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from[] = {0};
+      if (col_params_.is_source) {
+        op_params.forward_from_array = &forward_from[0];
+      }
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          col_params_.is_source
+              ? parent_->GetCollectiveBcastSend(col_params_, &tensor_,
+                                                DEVICE_CPU, device_)
+              : parent_->GetCollectiveBcastRecv(col_params_, tensor_.shape(),
+                                                DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      Tensor* output_tensor_ptr = nullptr;
+      if (col_params_.is_source) {
+        TF_CHECK_OK(ctx.forward_input_or_allocate_output(
+            {0}, 0, tensor_.shape(), &output_tensor_ptr));
+      } else {
+        TF_CHECK_OK(
+            ctx.allocate_output(0, tensor_.shape(), &output_tensor_ptr));
+      }
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a Broadcaster instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      Broadcaster broadcaster(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                              &op_params, col_params_, exec_key, kStepId,
+                              output_tensor_ptr);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      broadcaster.Run([this, &notification](Status s) {
+        status_ = s;
+        notification.Notify();
+      });
+      notification.WaitForNotification();
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    BroadcasterTest* parent_;
+    string dev_name_;
+    DeviceType device_type_ = DEVICE_CPU;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };  // class DeviceInstance
+
+  bool stop_ = false;
+  int64 step_id_ = kStepId;
+  int broadcast_dev_id_ = 0;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_ = nullptr;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<tensorflow::Device*> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int bcast_recv_counter_ GUARDED_BY(mu_) = 0;
+  int bcast_send_counter_ GUARDED_BY(mu_) = 0;
+  int failure_count_ GUARDED_BY(mu_) = 0;
+};
+
+// Tests of full broadcast algorithm, with different device and
+// data types.
+// B = data element type
+// T = device type
+// W = number of workers
+// D = number of devices per worker
+// L = tensor length
+// A = abort after count
+#define DEF_TEST(B, T, W, D, L, A)                                 \
+  TEST_F(BroadcasterTest,                                          \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A) { \
+    DataType dtype = DT_##B;                                       \
+    switch (dtype) {                                               \
+      case DT_FLOAT: {                                             \
+        RunTest<float>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      case DT_DOUBLE: {                                            \
+        RunTest<double>(dtype, DEVICE_##T, W, D, L, A);            \
+      } break;                                                     \
+      case DT_INT32: {                                             \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      case DT_INT64: {                                             \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, L, A);             \
+      } break;                                                     \
+      default:                                                     \
+        LOG(FATAL) << "Unimplemented";                             \
+    }                                                              \
+  }
+
+#ifndef GOOGLE_CUDA
+//       B      T    W  D  L  A
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 4095, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1045991, 0)
+
+DEF_TEST(DOUBLE, CPU, 2, 4, 128, 0)
+DEF_TEST(INT32, CPU, 2, 4, 128, 0)
+DEF_TEST(INT64, CPU, 2, 4, 128, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 1)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 5)
+#endif
+
+#ifdef GOOGLE_CUDA
+// Can only set W=1 for GPU tests.
+//       B      T    W  D  L  A
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 33, 0)
+DEF_TEST(FLOAT, GPU, 1, 3, 64, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1045991, 0)
+
+DEF_TEST(DOUBLE, GPU, 1, 8, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 8, 1001, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, GPU, 1, 8, 128, 6)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 393d3f824d..bdddf927d8 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -250,6 +250,38 @@ GlobalDeviceMap EstablishGlobalRank(
   return gdm;
 }
 
+// Count the devices associated with each task and set
+// cp->same_num_devices_per_task.  Requires cp->instance.task_names
+// be sorted.
+void SetDevPerTask(CollectiveParams* cp) {
+  cp->instance.same_num_devices_per_task = false;
+  if (cp->instance.task_names.empty()) return;
+  int dev_per_task = -1;
+  int count = 0;
+  const string* last_task_name = &cp->instance.task_names[0];
+  for (const string& task_name : cp->instance.task_names) {
+    if (task_name != *last_task_name) {
+      CHECK_GT(count, 0);
+      if (dev_per_task < 0) {
+        dev_per_task = count;
+      } else {
+        CHECK_GT(dev_per_task, 0);
+        if (count != dev_per_task) return;
+      }
+      count = 1;
+      last_task_name = &task_name;
+    } else {
+      ++count;
+    }
+  }
+  CHECK_GT(count, 0);
+  if ((dev_per_task > 0) && (count != dev_per_task)) {
+    return;
+  }
+  cp->instance.same_num_devices_per_task = true;
+  CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
+}
+
 // Sort cp->instance.device_names lexicographically, but do by first
 // computing a reordering permutation so we can keep cp->instance.task_names
 // in corresponding order.
@@ -278,6 +310,7 @@ void SortDevicesAndTasks(CollectiveParams* cp) {
   cp->instance.device_names = std::move(new_devs);
   cp->instance.task_names = std::move(new_tasks);
   VLOG(1) << "Modified device_names on " << cp;
+  SetDevPerTask(cp);
 }
 
 // Establish the requested number of subdivision permutations based on the
@@ -343,17 +376,18 @@ void GenerateSubdivPerms(const string& device, int source_rank,
 
   if (cp->instance.type == BROADCAST_COLLECTIVE) {
     CHECK_GE(source_rank, 0);
-    cp->subdiv_source_rank.resize(
+    cp->instance.impl_details.subdiv_source_rank.resize(
         cp->instance.impl_details.subdiv_offsets.size(), -1);
-    for (int sdi = 0; sdi < cp->subdiv_source_rank.size(); ++sdi) {
+    for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_source_rank.size();
+         ++sdi) {
       for (int j = 0; j < cp->group.group_size; ++j) {
         if (cp->instance.impl_details.subdiv_permutations[sdi][j] ==
             source_rank) {
-          cp->subdiv_source_rank[sdi] = j;
+          cp->instance.impl_details.subdiv_source_rank[sdi] = j;
           break;
         }
       }
-      CHECK_GE(cp->subdiv_source_rank[sdi], 0);
+      CHECK_GE(cp->instance.impl_details.subdiv_source_rank[sdi], 0);
     }
   }
 
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 4e3c7125f2..4e33c4779a 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -91,9 +91,10 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
       EXPECT_TRUE(cps[i].task.is_local[j]);
     }
     EXPECT_EQ(cps[i].subdiv_rank[0], i);
-    EXPECT_EQ(cps[i].subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank.size(), 0);
     EXPECT_FALSE(cps[i].is_source);
     EXPECT_EQ(cps[i].default_rank, i);
+    EXPECT_TRUE(cps[i].instance.same_num_devices_per_task);
   }
 }
 
@@ -138,10 +139,11 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
     }
     ASSERT_GT(cps[i].subdiv_rank.size(), 0);
     EXPECT_EQ(cps[i].subdiv_rank[0], i);
-    ASSERT_GT(cps[i].subdiv_source_rank.size(), 0);
-    EXPECT_EQ(cps[i].subdiv_source_rank[0], 1);
+    ASSERT_GT(cps[i].instance.impl_details.subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank[0], 1);
     EXPECT_EQ(cps[i].is_source, (i == 1));
     EXPECT_EQ(cps[i].default_rank, i);
+    EXPECT_TRUE(cps[i].instance.same_num_devices_per_task);
   }
 }
 
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
index d25dd5f04a..716e23bfa1 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.h
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -67,6 +67,8 @@ class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess {
     dev_resolver_->ClearTask(task);
   }
 
+  BufRendezvous* buf_rendezvous() override { return &buf_rendezvous_; }
+
   // Copy utility that always copies bytes from src to dst even if
   // they are on the same device, unlike CopyTensor::ViaDMA which will
   // just change the dst buffer pointer in that case.
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index a26f2c2f31..d4ac50cbbe 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -38,6 +38,7 @@ CollInstanceParams& CollInstanceParams::operator=(
     device_names.clear();
     device_names.assign(other.device_names.begin(), other.device_names.end());
     task_names.assign(other.task_names.begin(), other.task_names.end());
+    same_num_devices_per_task = other.same_num_devices_per_task;
     impl_details.subdiv_offsets.assign(
         other.impl_details.subdiv_offsets.begin(),
         other.impl_details.subdiv_offsets.end());
@@ -76,6 +77,13 @@ string CollInstanceParams::ToString() const {
     }
     strings::StrAppend(&v, "}");  // one subdiv
   }
+  if (!impl_details.subdiv_source_rank.empty()) {
+    strings::StrAppend(&v, " subdiv_source_rank={");
+    for (const auto& r : impl_details.subdiv_source_rank) {
+      strings::StrAppend(&v, r, ",");
+    }
+    strings::StrAppend(&v, "}");
+  }
   strings::StrAppend(&v, "}");  // all subdivs
   return v;
 }
@@ -98,13 +106,6 @@ string CollectiveParams::ToString() const {
   for (const auto& r : subdiv_rank) {
     strings::StrAppend(&v, r, ",");
   }
-  if (!subdiv_source_rank.empty()) {
-    strings::StrAppend(&v, " subdiv_rank={");
-    for (const auto& r : subdiv_source_rank) {
-      strings::StrAppend(&v, r, ",");
-    }
-    strings::StrAppend(&v, "}");
-  }
   strings::StrAppend(&v, "}}");
   return v;
 }
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 5810c7fa54..40d82ab0e9 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -79,6 +79,8 @@ struct CollInstanceParams {
   std::vector<string> device_names;
   // Task name prefix of corresponding device name.
   std::vector<string> task_names;
+  // True if every task has the same number of devices.
+  bool same_num_devices_per_task;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -102,7 +104,6 @@ struct CollectiveParams {
   bool is_source;    // broadcast only
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
-  std::vector<int> subdiv_source_rank;
   std::unique_ptr<OpKernel> merge_op;  // reduction only
   std::unique_ptr<OpKernel> final_op;  // reduction only
   string ToString() const;
@@ -284,12 +285,14 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
   TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
 };
 
-// Interface of a helper object that provices a CollectiveExecutor with
+// Interface of a helper object that provides a CollectiveExecutor with
 // all of the remote access it needs.
 class CollectiveRemoteAccess : public PeerAccessInterface,
                                public DeviceResolverInterface {
  public:
   virtual ~CollectiveRemoteAccess() {}
+
+  virtual BufRendezvous* buf_rendezvous() = 0;
 };
 
 // A per-step version of CollectiveRemoteAccess that cleans up outstanding
-- 
GitLab


From 55706e693ab20f6200061fb73067cbf27707cccd Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Thu, 19 Apr 2018 13:19:27 -0700
Subject: [PATCH 1145/1262] Support various shapes in TPU DistributionStrategy.

PiperOrigin-RevId: 193563912
---
 .../distribute/python/minimize_loss_test.py   | 11 +---
 .../distribute/python/single_loss_example.py  |  5 +-
 .../contrib/distribute/python/tpu_strategy.py | 61 +++++++++++++------
 .../contrib/distribute/python/values.py       | 33 ++++++++++
 4 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 6c73250ded..43b2e91cbf 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -57,25 +57,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      def tpu_dataset_fn():
-        return dataset_fn().batch(2)
       # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
       # `DistributionStrategy.create_monitor` so that each DistributionStrategy
       # could influence its training loop. That method would return an instance
       # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
       # tpu.shutdown_system().
       iterator = distribution.distribute_dataset(
-          tpu_dataset_fn if is_tpu else dataset_fn).make_one_shot_iterator()
+          dataset_fn).make_one_shot_iterator()
 
       def run_step():
-        # TODO(isaprykin): Make iterator get_next() return a list of sub-
-        # batches for each iteration. Pass iterator.get_next() and not iterator
-        # to call_for_each_tower.
         return distribution.group(
             distribution.call_for_each_tower(
-                model_fn,
-                iterator.get_next() if not is_tpu else iterator,
-                run_concurrently=layer.built))
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index 9e8f919c8a..abd13c6cc6 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -54,7 +54,7 @@ def minimize_loss_example(optimizer_fn,
   """Example of non-distribution-aware legacy code."""
 
   def dataset_fn():
-    return dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+    return dataset_ops.Dataset.from_tensors([[1.]]).repeat().batch(2)
 
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
@@ -63,10 +63,11 @@ def minimize_loss_example(optimizer_fn,
 
   layer = core.Dense(1, use_bias=use_bias)
 
-  def model_fn(x):
+  def model_fn(xs):
     """A very simple model written by the user."""
 
     def loss_fn():
+      x = math_ops.reduce_mean(xs, keepdims=True)
       y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
       return y * y
 
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 804217b5ce..ceb52ceca7 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 from tensorflow.contrib import tpu
 from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -33,35 +34,48 @@ from tensorflow.python.ops import control_flow_ops
 
 # TODO(isaprykin):  Consider whether inheriting is really appropriate.
 class TPUStrategy(one_device_strategy.OneDeviceStrategy):
+  """Experimental TPU distribution strategy implementation."""
 
-  def __init__(self, master=None, iterations=None, model_dir=None):
+  def __init__(self,
+               global_batch_size=2,
+               num_cores_per_host=2,
+               iterations_per_step=2):
+    # TODO(isaprykin): Generalize the defaults.
     super(TPUStrategy, self).__init__('/cpu:0')
+    # TODO(isaprykin): Auto-detect number of cores and hosts.
+    self._num_cores_per_host = num_cores_per_host
+    self._global_batch_size = global_batch_size
+    # TODO(isaprykin): This might have to be per-call.
+    self._iterations_per_step = iterations_per_step
+
+  def distribute_dataset(self, dataset_fn):
+    return values.PerIterationDataset(
+        self._call_dataset_fn(dataset_fn), self._iterations_per_step)
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
 
-    # TODO(isaprykin): Give an API for many iterations per step.
-    iterations = 1
+    # TODO(isaprykin): Support variable arguments similar to PerDevice+regroup.
+    inputs = args[0]
 
-    # TODO(isaprykin): Do not hard code shapes and input format :)
-    # TODO(isaprykin): Detect the number of TPU cores automatically.
-
-    def dequeueing_fn(*args, **kwargs):
-      del args, kwargs
-      x, = tpu.infeed_dequeue_tuple(dtypes=[dtypes.float32], shapes=[[1, 1, 1]])
-      return fn(x)
-
-    iterator = args[0]
+    sharded_shape = [None]  # Python 2 nonlocal.
 
     def infeed_input(i):
       """Get input, split it and then enqueue."""
-      batches = iterator.get_next()
-      batches = array_ops.split(batches, 2)
+      batches = array_ops.gather(inputs, i)
+
+      # TODO(isaprykin):  Handle partial batch.
+      global_shape = [self._global_batch_size] + list(batches.get_shape())[1:]
+      sharded_shape[0] = ([self._global_batch_size / self._num_cores_per_host] +
+                          list(global_shape)[1:])
+
+      batches.set_shape(global_shape)
+      batches = array_ops.split(batches, self._num_cores_per_host)
 
       infeeds = [
           tpu_ops.infeed_enqueue_tuple(
-              inputs=[batches[j]], shapes=[[1, 1, 1]], device_ordinal=j)
-          for j in range(2)
+              inputs=[batches[j]], shapes=[sharded_shape[0]], device_ordinal=j)
+          for j in range(self._num_cores_per_host)
       ]
 
       with ops.control_dependencies(infeeds):
@@ -69,14 +83,23 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
     with ops.device('/task:0/device:CPU:0'):
       enqueue_ops = control_flow_ops.while_loop(
-          lambda i: i < iterations,
+          lambda i: i < self._iterations_per_step,
           infeed_input, [constant_op.constant(0)],
           parallel_iterations=1)
 
+    assert sharded_shape[0]
+
+    def dequeueing_fn(*args, **kwargs):
+      del args, kwargs
+      x, = tpu.infeed_dequeue_tuple(
+          dtypes=[dtypes.float32], shapes=[sharded_shape[0]])
+      return fn(x)
+
     def iterate_on_tpu():
-      return tpu.repeat(iterations, dequeueing_fn, [])
+      return tpu.repeat(self._iterations_per_step, dequeueing_fn, [])
 
     with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
-      tpu_result = tpu.batch_parallel(iterate_on_tpu, [], num_shards=2)
+      tpu_result = tpu.batch_parallel(
+          iterate_on_tpu, [], num_shards=self._num_cores_per_host)
 
     return control_flow_ops.group(tpu_result, enqueue_ops)
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 18fedd2775..62016c3a78 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -570,6 +570,39 @@ class PerDeviceDataset(object):
         dataset_iterator, self._devices, self._prefetch_on_device)
 
 
+class MultiIterator(object):
+  """Iterator that returns results of multiple get_next()s."""
+
+  def __init__(self, dataset_iterator, iterations):
+    self._dataset_iterator = dataset_iterator
+    self._iterations = iterations
+
+  def get_next(self, name=None):
+    return [
+        self._dataset_iterator.get_next(name=name)
+        for _ in range(self._iterations)
+    ]
+
+  @property
+  def initializer(self):
+    return self._dataset_iterator.initializer
+
+
+class PerIterationDataset(object):
+
+  def __init__(self, dataset, iterations):
+    self._dataset = dataset
+    self._iterations = iterations
+
+  def make_one_shot_iterator(self):
+    iterator = self._dataset.make_one_shot_iterator()
+    return MultiIterator(iterator, self._iterations)
+
+  def make_initializable_iterator(self):
+    iterator = self._dataset.make_initializable_iterator()
+    return MultiIterator(iterator, self._iterations)
+
+
 class MapOutput(object):
   """Map can result in multiple outputs per device."""
 
-- 
GitLab


From 7f1e64eb94447665047fac16c67b5351bcf3c8a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 13:21:25 -0700
Subject: [PATCH 1146/1262] Allow output has a different shape from input in
 the image.transform (#17011).

PiperOrigin-RevId: 193564222
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 ++-
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 52 +++++++++++++++++--
 .../python/kernel_tests/image_ops_test.py     | 30 +++++++++++
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++++------
 5 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index c2e32da133..ae4b1ba62a 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,6 +70,7 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
+    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -83,7 +84,11 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
+    // Image is NHWC format.
+    auto output_shape = images_t.shape();
+    output_shape.set_dim(1, output_dim.vec<int>()(0));
+    output_shape.set_dim(2, output_dim.vec<int>()(1));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index ad50133061..2320329b92 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = images.generate(
+    output->device(device) = output->generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d05..4c6d8c0d19 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,9 +19,55 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+namespace {
+
+// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
+// height and width come from the size_tensor.
+Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
+                             int size_input_idx, DimensionHandle channel_dim) {
+  // Verify shape of size input.
+  ShapeHandle size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
+  DimensionHandle unused;
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
+
+  // Get size values from the size tensor.
+  const Tensor* size_tensor = c->input_tensor(size_input_idx);
+  DimensionHandle width;
+  DimensionHandle height;
+  if (size_tensor == nullptr) {
+    width = c->UnknownDim();
+    height = c->UnknownDim();
+  } else {
+    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
+    if (size_tensor->dtype() != DT_INT32) {
+      return errors::InvalidArgument(
+          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
+          "but got ",
+          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
+          " in ", c->DebugString());
+    }
+    auto vec = size_tensor->vec<int32>();
+    height = c->MakeDim(vec(0));
+    width = c->MakeDim(vec(1));
+  }
+  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
+  return Status::OK();
+}
+
+Status ResizeShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
+                               c->Dim(input, 3));
+}
+
+}  // namespace
+
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
@@ -29,13 +75,11 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
+    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      return Status::OK();
-    })
+    .SetShapeFn(ResizeShapeFn)
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index b50177ae56..c0151d320f 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -195,10 +195,40 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
+  def _test_grad_different_shape(self, input_shape, output_shape):
+    with self.test_session():
+      test_image_shape = input_shape
+      test_image = np.random.randn(*test_image_shape)
+      test_image_tensor = constant_op.constant(
+          test_image, shape=test_image_shape)
+      test_transform = image_ops.angles_to_projective_transforms(
+          np.pi / 2, 4, 4)
+
+      if len(output_shape) == 2:
+        resize_shape = output_shape
+      elif len(output_shape) == 3:
+        resize_shape = output_shape[0:2]
+      elif len(output_shape) == 4:
+        resize_shape = output_shape[1:3]
+      output = image_ops.transform(
+          images=test_image_tensor,
+          transforms=test_transform,
+          output_shape=resize_shape)
+      left_err = gradient_checker.compute_gradient_error(
+          test_image_tensor,
+          test_image_shape,
+          output,
+          output_shape,
+          x_init_value=test_image)
+      self.assertLess(left_err, 1e-10)
+
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
+    self._test_grad_different_shape([16, 16], [8, 8])
+    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
+    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8..0cb7bdc75d 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -212,7 +212,11 @@ def translations_to_projective_transforms(translations, name=None):
         axis=1)
 
 
-def transform(images, transforms, interpolation="NEAREST", name=None):
+def transform(images,
+              transforms,
+              output_shape=None,
+              interpolation="NEAREST",
+              name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -228,7 +232,10 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
+    output_shape: Output dimesion after the transform, [height, width].
+       If None, output is the same size as input image.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
+    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -255,6 +262,14 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
+    if output_shape is None:
+      output_shape = images.get_shape()[1:3]
+    elif len(output_shape) != 2:
+      raise TypeError(
+          "output_shape must either be None or a vector of 2 elements.")
+    output_shape = ops.convert_to_tensor(
+        output_shape, name="output_shape", dtype=dtypes.int32)
+
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -265,7 +280,7 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
     output = gen_image_ops.image_projective_transform(
-        images, transforms, interpolation=interpolation.upper())
+        images, transforms, output_shape, interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -375,14 +390,6 @@ def _image_projective_transform_grad(op, grad):
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
-  if len(image_or_images.get_shape()) == 2:
-    images = image_or_images[None, :, :, None]
-  elif len(image_or_images.get_shape()) == 3:
-    images = image_or_images[None, :, :, :]
-  elif len(image_or_images.get_shape()) == 4:
-    images = image_or_images
-  else:
-    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -395,13 +402,11 @@ def _image_projective_transform_grad(op, grad):
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      grad, transforms, interpolation=interpolation)
-  if len(image_or_images.get_shape()) == 2:
-    return [output[0, :, :, 0], None]
-  elif len(image_or_images.get_shape()) == 3:
-    return [output[0, :, :, :], None]
-  else:
-    return [output, None]
+      images=grad,
+      transforms=transforms,
+      output_shape=image_or_images.get_shape()[1:3],
+      interpolation=interpolation)
+  return [output, None, None]
 
 
 def bipartite_match(distance_mat,
-- 
GitLab


From ab47eb8d9bcac55fd19b0e862cf9a2a7de195787 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 13:38:43 -0700
Subject: [PATCH 1147/1262] tools/lib_package: Fix typo in README

PiperOrigin-RevId: 193566850
---
 tensorflow/tools/lib_package/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/tools/lib_package/README.md b/tensorflow/tools/lib_package/README.md
index 7008148260..cb6aef2624 100644
--- a/tensorflow/tools/lib_package/README.md
+++ b/tensorflow/tools/lib_package/README.md
@@ -35,8 +35,8 @@ The following commands:
 bazel test --config opt //tensorflow/tools/lib_package:libtensorflow_test
 bazel build --config opt \
   //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz \
-  //tensorflow/tools/lib_package:libtensorflow.jar \
-  //tensorflow/tools/lib_package:libtensorflow-src.jar
+  //tensorflow/java:libtensorflow.jar \
+  //tensorflow/java:libtensorflow-src.jar
 ```
 
 test and produce the following:
@@ -44,9 +44,9 @@ test and produce the following:
 -   The native library (`libtensorflow_jni.so`) packaged in an archive at:
     `bazel-bin/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz`
 -   The Java archive at:
-    `bazel-bin/tensorflow/tools/lib_package/libtensorflow.jar`
+    `bazel-bin/tensorflow/java/libtensorflow.jar`
 -   The Java archive for Java sources at:
-    `bazel-bin/tensorflow/tools/lib_package/libtensorflow-src.jar`
+    `bazel-bin/tensorflow/java/libtensorflow-src.jar`
 
 ## Release
 
-- 
GitLab


From 1e7289fc0e64a706bb1867cfe5a8c5f5d2f7150f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 14:05:06 -0700
Subject: [PATCH 1148/1262] Make flat_transforms_to_matrices and
 matrices_to_flat_transforms public available.

PiperOrigin-RevId: 193571089
---
 tensorflow/contrib/image/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index e982030bc8..8f406ace1d 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -25,6 +25,8 @@ projective transforms (including rotation) are supported.
 @@angles_to_projective_transforms
 @@compose_transforms
 @@adjust_yiq_hsv
+@@flat_transforms_to_matrices
+@@matrices_to_flat_transforms
 @@random_yiq_hsv
 @@rotate
 @@transform
@@ -58,6 +60,8 @@ from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_
 from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
 from tensorflow.contrib.image.python.ops.image_ops import connected_components
+from tensorflow.contrib.image.python.ops.image_ops import flat_transforms_to_matrices
+from tensorflow.contrib.image.python.ops.image_ops import matrices_to_flat_transforms
 from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
 from tensorflow.contrib.image.python.ops.image_ops import translate
-- 
GitLab


From ab5abfa42bdced7bf1c371e5e1224bdc1fafdcc1 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 14:10:01 -0700
Subject: [PATCH 1149/1262] RecordReader: Simplify interface contract and
 implementation.

Prior to this change, RecordReader had the following contract:
- Records can be read in any order, EXCEPT if compression or buffering was
  enabled.
- If the underlying file is being concurrently written to
  then calls to ReadRecord() may fail (because of an incomplete
  record near the end of a file), but a retry may succeed (once the
  record is written), EXCEPT if compression or buffering is enabled
  (in which case the failure will be terminal).

  This "retry-may-succeed" behavior is relied upon by tensorboard
  (https://github.com/tensorflow/tensorboard/blob/1.7/tensorboard/backend/event_processing/event_file_loader.py#L55)
  where one process (typically the model training process) is writing
  tf.summary events to an event file and another process (tensorboard)
  is concurrently reading it.

With this change, the intent is to remove the EXCEPTions and have the
same behavior irrespective of compression/buffering.

Additionally, fix a memory leak when ZlibInputStream::Reset() is invoked.

PiperOrigin-RevId: 193571934
---
 tensorflow/core/lib/io/record_reader.cc    | 147 ++++----------
 tensorflow/core/lib/io/record_reader.h     |  16 +-
 tensorflow/core/lib/io/recordio_test.cc    | 212 ++++++++++++++-------
 tensorflow/core/lib/io/zlib_inputstream.cc |   9 +-
 tensorflow/core/lib/io/zlib_inputstream.h  |  10 +-
 5 files changed, 206 insertions(+), 188 deletions(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 6de850bb20..c24628be57 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : src_(file), options_(options) {
+    : options_(options),
+      input_stream_(new RandomAccessInputStream(file)),
+      last_read_failed_(false) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
-  } else {
-    input_stream_.reset(new RandomAccessInputStream(file));
+    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
+                                                options.buffer_size, true));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    zlib_input_stream_.reset(new ZlibInputStream(
-        input_stream_.get(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options));
+    input_stream_.reset(new ZlibInputStream(
+        input_stream_.release(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options, true));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
+    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-// May use *storage as backing store.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
-                                     StringPiece* result, string* storage) {
+//
+// offset corresponds to the user-provided value to ReadRecord()
+// and is used only in error messages.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  storage->resize(expected);
-
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    // If we have a zlib compressed buffer, we assume that the
-    // file is being read sequentially, and we use the underlying
-    // implementation to read the data.
-    //
-    // No checks are done to validate that the file is being read
-    // sequentially.  At some point the zlib input buffer may support
-    // seeking, possibly inefficiently.
-    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
-
-    if (storage->size() != expected) {
-      if (storage->empty()) {
-        return errors::OutOfRange("eof");
-      } else {
-        return errors::DataLoss("truncated record at ", offset);
-      }
-    }
+  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
 
-    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-      return errors::DataLoss("corrupted record at ", offset);
-    }
-    *result = StringPiece(storage->data(), n);
-  } else {
-#endif  // IS_SLIM_BUILD
-    if (options_.buffer_size > 0) {
-      // If we have a buffer, we assume that the file is being read
-      // sequentially, and we use the underlying implementation to read the
-      // data.
-      //
-      // No checks are done to validate that the file is being read
-      // sequentially.
-      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
-
-      if (storage->size() != expected) {
-        if (storage->empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-
-      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(storage->data(), n);
+  if (result->size() != expected) {
+    if (result->empty()) {
+      return errors::OutOfRange("eof");
     } else {
-      // This version supports reading from arbitrary offsets
-      // since we are accessing the random access file directly.
-      StringPiece data;
-      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
-      if (data.size() != expected) {
-        if (data.empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(data.data(), n);
+      return errors::DataLoss("truncated record at ", offset);
     }
-#if !defined(IS_SLIM_BUILD)
   }
-#endif  // IS_SLIM_BUILD
 
+  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
+  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
+    return errors::DataLoss("corrupted record at ", offset);
+  }
+  result->resize(n);
   return Status::OK();
 }
 
@@ -167,50 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
+  // Position the input stream.
+  int64 curr_pos = input_stream_->Tell();
+  int64 desired_pos = static_cast<int64>(*offset);
+  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
+      (curr_pos == desired_pos && last_read_failed_)) {
+    last_read_failed_ = false;
+    TF_RETURN_IF_ERROR(input_stream_->Reset());
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
+  } else if (curr_pos < desired_pos) {
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
+  }
+  DCHECK_EQ(desired_pos, input_stream_->Tell());
+
   // Read header data.
-  StringPiece lbuf;
-  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
+  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(lbuf.data());
+  const uint64 length = core::DecodeFixed64(record->data());
 
   // Read data
-  StringPiece data;
-  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
+  s = ReadChecksummed(*offset + kHeaderSize, length, record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
-  if (record->data() != data.data()) {
-    // RandomAccessFile placed the data in some other location.
-    memmove(&(*record)[0], data.data(), data.size());
-  }
-
-  record->resize(data.size());
-
   *offset += kHeaderSize + length + kFooterSize;
+  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
-Status RecordReader::SkipNBytes(uint64 offset) {
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
-  } else {
-#endif
-    if (options_.buffer_size > 0) {
-      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
-    }
-#if !defined(IS_SLIM_BUILD)
-  }
-#endif
-  return Status::OK();
-}  // namespace io
-
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index 26278e0328..f6d587dfa0 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,25 +69,14 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
-  //
-  // Note: if buffering is used (with or without compression), access must be
-  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
-  // Skip the records till "offset". Returns OK on success,
-  // OUT_OF_RANGE for end of file, or something else for an error.
-  Status SkipNBytes(uint64 offset);
-
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
-                         string* storage);
+  Status ReadChecksummed(uint64 offset, size_t n, string* result);
 
-  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-#if !defined(IS_SLIM_BUILD)
-  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
-#endif  // IS_SLIM_BUILD
+  bool last_read_failed_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -121,7 +110,6 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
-    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 63235761d9..da514bd21c 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,10 +26,11 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
+namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-static string BigString(const string& partial_string, size_t n) {
+string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -39,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-static string NumberString(int n) {
+string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class RecordioTest : public ::testing::Test {
+class StringDest : public WritableFile {
+ public:
+  explicit StringDest(string* contents) : contents_(contents) {}
+
+  Status Close() override { return Status::OK(); }
+  Status Flush() override { return Status::OK(); }
+  Status Sync() override { return Status::OK(); }
+  Status Append(const StringPiece& slice) override {
+    contents_->append(slice.data(), slice.size());
+    return Status::OK();
+  }
+
  private:
-  class StringDest : public WritableFile {
-   public:
-    string contents_;
-
-    Status Close() override { return Status::OK(); }
-    Status Flush() override { return Status::OK(); }
-    Status Sync() override { return Status::OK(); }
-    Status Append(const StringPiece& slice) override {
-      contents_.append(slice.data(), slice.size());
-      return Status::OK();
+  string* contents_;
+};
+
+class StringSource : public RandomAccessFile {
+ public:
+  explicit StringSource(string* contents)
+      : contents_(contents), force_error_(false) {}
+
+  Status Read(uint64 offset, size_t n, StringPiece* result,
+              char* scratch) const override {
+    if (force_error_) {
+      force_error_ = false;
+      return errors::DataLoss("read error");
     }
-  };
-
-  class StringSource : public RandomAccessFile {
-   public:
-    StringPiece contents_;
-    mutable bool force_error_;
-    mutable bool returned_partial_;
-    StringSource() : force_error_(false), returned_partial_(false) {}
-
-    Status Read(uint64 offset, size_t n, StringPiece* result,
-                char* scratch) const override {
-      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
-
-      if (force_error_) {
-        force_error_ = false;
-        returned_partial_ = true;
-        return errors::DataLoss("read error");
-      }
-
-      if (offset >= contents_.size()) {
-        return errors::OutOfRange("end of file");
-      }
-
-      if (contents_.size() < offset + n) {
-        n = contents_.size() - offset;
-        returned_partial_ = true;
-      }
-      *result = StringPiece(contents_.data() + offset, n);
-      return Status::OK();
+
+    if (offset >= contents_->size()) {
+      return errors::OutOfRange("end of file");
+    }
+
+    if (contents_->size() < offset + n) {
+      n = contents_->size() - offset;
     }
-  };
+    *result = StringPiece(contents_->data() + offset, n);
+    return Status::OK();
+  }
+
+  void force_error() { force_error_ = true; }
+
+ private:
+  string* contents_;
+  mutable bool force_error_;
+};
 
+class RecordioTest : public ::testing::Test {
+ private:
+  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -104,7 +109,9 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : reading_(false),
+      : dest_(&contents_),
+        source_(&contents_),
+        reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -119,12 +126,11 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return dest_.contents_.size(); }
+  size_t WrittenBytes() const { return contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
-      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -137,26 +143,20 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) {
-    dest_.contents_[offset] += delta;
-  }
+  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
 
-  void SetByte(int offset, char new_byte) {
-    dest_.contents_[offset] = new_byte;
-  }
+  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
 
-  void ShrinkSize(int bytes) {
-    dest_.contents_.resize(dest_.contents_.size() - bytes);
-  }
+  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
+    core::EncodeFixed32(&contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error_ = true; }
+  void ForceError() { source_.force_error(); }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -165,7 +165,6 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
-    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -217,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
+void TestNonSequentialReads(const RecordWriterOptions& writer_options,
+                            const RecordReaderOptions& reader_options) {
+  string contents;
+  StringDest dst(&contents);
+  RecordWriter writer(&dst, writer_options);
+  for (int i = 0; i < 10; ++i) {
+    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
+  }
+  TF_ASSERT_OK(writer.Close());
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  string record;
+  // First read sequentially to fill in the offsets table.
+  uint64 offsets[10] = {0};
+  uint64 offset = 0;
+  for (int i = 0; i < 10; ++i) {
+    offsets[i] = offset;
+    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
+  }
+
+  // Read randomly: First go back to record #3 then forward to #8.
+  offset = offsets[3];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("3.", record);
+  EXPECT_EQ(offsets[4], offset);
+
+  offset = offsets[8];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("8.", record);
+  EXPECT_EQ(offsets[9], offset);
+}
+
+TEST_F(RecordioTest, NonSequentialReads) {
+  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 10;
+  TestNonSequentialReads(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
+  TestNonSequentialReads(
+      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+}
+
 // Tests of all the error paths in log_reader.cc follow:
-static void AssertHasSubstr(StringPiece s, StringPiece expected) {
+void AssertHasSubstr(StringPiece s, StringPiece expected) {
   EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
+void TestReadError(const RecordWriterOptions& writer_options,
+                   const RecordReaderOptions& reader_options) {
+  const string wrote = BigString("well hello there!", 100);
+  string contents;
+  StringDest dst(&contents);
+  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  uint64 offset = 0;
+  string read;
+  file.force_error();
+  Status status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(errors::IsDataLoss(status));
+  ASSERT_EQ(0, offset);
+
+  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
+  // lose the record.
+  status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(status.ok()) << status;
+  EXPECT_GT(offset, 0);
+  EXPECT_EQ(wrote, read);
+}
+
 TEST_F(RecordioTest, ReadError) {
-  Write("foo");
-  ForceError();
-  AssertHasSubstr(Read(), "Data loss");
+  TestReadError(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, ReadErrorWithBuffering) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 20;
+  TestReadError(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, ReadErrorWithCompression) {
+  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -257,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
+}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 984fbc2810..bf8dcf0988 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options)
-    : input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
+    : owns_input_stream_(owns_input_stream),
+      input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -41,10 +42,14 @@ ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
+  if (owns_input_stream_) {
+    delete input_stream_;
+  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
+  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 9c7e14441c..6099e2455d 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,10 +40,13 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents. Does *not* take ownership of "input_stream".
+  // contents.
+  //
+  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
-                  const ZlibCompressionOptions& zlib_options);
+                  const ZlibCompressionOptions& zlib_options,
+                  bool owns_input_stream = false);
 
   ~ZlibInputStream();
 
@@ -65,7 +68,8 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  InputStreamInterface* input_stream_;  // Not owned
+  const bool owns_input_stream_;
+  InputStreamInterface* input_stream_;
   size_t input_buffer_capacity_;        // Size of z_stream_input_
   size_t output_buffer_capacity_;       // Size of z_stream_output_
   char* next_unread_byte_;              // Next unread byte in z_stream_output_
-- 
GitLab


From a4945fc86cabcf3d5f0b9eaac21bb7c1d1146d57 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 19 Apr 2018 14:30:27 -0700
Subject: [PATCH 1150/1262] The HLO element type converter must remove side
 effecting instructions like Rng

The CPU backend does not know how to lower bf16 typed RNG nodes so even unused
instances of these can't remain in the HLO IR.
HloComputation::ReplaceInstruction keeps these Rng nodes around since it doesn't
remove side effecting nodes.
PiperOrigin-RevId: 193575183
---
 .../xla/service/hlo_element_type_converter.cc | 15 ++++-
 .../hlo_element_type_converter_test.cc        | 66 +++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.cc   | 37 ++++++++---
 .../compiler/xla/service/hlo_instruction.h    | 28 +++++---
 tensorflow/compiler/xla/util.h                | 10 +++
 5 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index c782d1b0ad..d236f83aeb 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -178,24 +178,37 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       if (hlo->shape().element_type() == eliminate_type_) {
         Shape shape =
             ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+
         new_hlo = computation->AddInstruction(
             hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
         new_hlo = ToElementType(new_hlo, eliminate_type_);
       } else if (ShapeUtil::IsTuple(hlo->shape())) {
         Shape old_shape = hlo->shape();
         Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                  replace_with_type_);
+
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             new_shape, new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
         // Convert the elements of the result of `new_hlo` to produce a new
         // tuple with shape `old_shape`.
         new_hlo = ConvertTupleElements(new_hlo, old_shape);
       } else {
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             hlo->shape(), new_operands, hlo->GetModule()));
+        TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
       }
 
-      TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+      TF_RETURN_IF_ERROR(hlo->DropAllControlDeps());
+
+      // NB!  We want to replace and remove side effecting instructions like Rng
+      // as well so we can't rely HloComputation::ReplaceInstruction to reliably
+      // remove the replaced instruction.
+      TF_RETURN_IF_ERROR(computation->RemoveInstruction(hlo));
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index cb94d9f19b..5c5a059e0f 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -22,6 +22,12 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Not;
+using ::testing::ResultOf;
+
 class HloElementTypeConverterTest : public HloTestBase {
  public:
   std::unique_ptr<HloModule> CreateModuleFromHloString(
@@ -117,5 +123,65 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
                         op::Convert(op::GetTupleElement(batch_norm, 2))));
 }
 
+TEST_F(HloElementTypeConverterTest, RngIsRemoved) {
+  const string& hlo_string = R"(
+HloModule RngIsRemoved
+
+ENTRY main {
+  constant.3 = bf16[] constant(0)
+  constant.4 = bf16[] constant(1)
+  ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
+}
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+
+  std::function<bool(const HloInstruction*)> is_bf16_rng =
+      [](const HloInstruction* inst) {
+        return inst->shape().element_type() == BF16 &&
+               inst->opcode() == HloOpcode::kRng;
+      };
+
+  EXPECT_THAT(module->entry_computation()->instructions(),
+              Not(Contains(ResultOf(is_bf16_rng, Eq(true)))));
+}
+
+TEST_F(HloElementTypeConverterTest, RngCtrlDep) {
+  const string& hlo_string = R"(
+HloModule RngIsRemoved
+
+ENTRY main {
+  constant.3 = bf16[] constant(0)
+  constant.4 = bf16[] constant(1)
+  rng0 = bf16[1,2000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
+  ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform
+}
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+
+  HloInstruction *rng0, *rng1;
+  for (auto* inst : module->entry_computation()->instructions()) {
+    if (inst->opcode() == HloOpcode::kRng) {
+      const Shape& shape = inst->shape();
+      ASSERT_EQ(shape.dimensions_size(), 3);
+      ASSERT_TRUE(shape.dimensions(1) == 2000 || shape.dimensions(1) == 1000);
+      if (shape.dimensions(1) == 2000) {
+        rng0 = inst;
+      } else {
+        rng1 = inst;
+      }
+    }
+  }
+
+  EXPECT_THAT(rng0->control_successors(), ElementsAre(rng1));
+  EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 6303bcc59f..a638d54d85 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1678,14 +1678,35 @@ Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
 }
 
 Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) {
-  auto succ_it = std::find(control_successors_.begin(),
-                           control_successors_.end(), instruction);
-  TF_RET_CHECK(succ_it != control_successors_.end());
-  control_successors_.erase(succ_it);
-  auto pred_it = std::find(instruction->control_predecessors_.begin(),
-                           instruction->control_predecessors_.end(), this);
-  TF_RET_CHECK(pred_it != instruction->control_predecessors_.end());
-  instruction->control_predecessors_.erase(pred_it);
+  TF_RET_CHECK(instruction->parent() == parent());
+  TF_RETURN_IF_ERROR(EraseElementFromVector(&control_successors_, instruction));
+  TF_RETURN_IF_ERROR(
+      EraseElementFromVector(&instruction->control_predecessors_, this));
+  return Status::OK();
+}
+
+Status HloInstruction::DropAllControlDeps() {
+  for (auto* ctrl_succ : control_successors_) {
+    TF_RETURN_IF_ERROR(
+        EraseElementFromVector(&ctrl_succ->control_predecessors_, this));
+  }
+  for (auto* ctrl_pred : control_predecessors_) {
+    TF_RETURN_IF_ERROR(
+        EraseElementFromVector(&ctrl_pred->control_successors_, this));
+  }
+  control_successors_.clear();
+  control_predecessors_.clear();
+  return Status::OK();
+}
+
+Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) {
+  for (auto* ctrl_pred : inst->control_predecessors()) {
+    TF_RETURN_IF_ERROR(ctrl_pred->AddControlDependencyTo(this));
+  }
+
+  for (auto* ctrl_succ : inst->control_successors()) {
+    TF_RETURN_IF_ERROR(this->AddControlDependencyTo(ctrl_succ));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 5a7394f7a6..a5e9aecb9e 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -557,6 +557,18 @@ class HloInstruction {
   // 'instruction'.
   Status RemoveControlDependencyTo(HloInstruction* instruction);
 
+  // Drops all control predecessors and successors from this HLO instruction.
+  Status DropAllControlDeps();
+
+  // Copies the control predecessors and successors on this HLO instruction to
+  // `inst`.  Does not do a deep copy so this makes sense only if `inst` and
+  // this HLO are in the same module.
+  //
+  // Depending on the use cases we see in practice, in the future we may
+  // consider folding the logic here into Clone, CloneWithNewOperands and
+  // ReplaceAllUsesWith by treating control dependencies like data dependencies.
+  Status CopyAllControlDepsFrom(const HloInstruction* inst);
+
   // Returns the set of control predecessors (successors) of this
   // instruction. Control predecessors (successors) must execute before (after)
   // the current instruction.
@@ -1148,17 +1160,17 @@ class HloInstruction {
   // Clones the HLO instruction. The clone will have the same opcode, shape, and
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
-  // the instruction to form the name of the cloned instruction.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
+  // the instruction to form the name of the cloned instruction.  If the module
+  // pointer is not nullptr, it will be the module where the cloned computations
+  // will be added to (in order to support deep cloning).  Ignores the control
+  // predecessors and successors of this HLO instruction.
   std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone",
                                         HloModule* module = nullptr) const;
 
-  // Clones the HLO instruction as above but with new shape and operands.
-  // If the module pointer is not nullptr, it will be the module where
-  // the cloned computations will be added to (in order to support deep
-  // cloning).
+  // Clones the HLO instruction as above but with new shape and operands.  If
+  // the module pointer is not nullptr, it will be the module where the cloned
+  // computations will be added to (in order to support deep cloning).  Ignores
+  // the control predecessors and successors of this HLO instruction.
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloModule* module = nullptr) const;
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 2da9f9ed6f..be33bd6dd1 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -528,6 +528,16 @@ bool IsInt32(T x) {
   // value is implementation-defined."
   return static_cast<int32>(x) == x;
 }
+
+template <typename T>
+Status EraseElementFromVector(std::vector<T>* container, const T& value) {
+  // c_find returns a const_iterator which does not seem to work on gcc 4.8.4,
+  // and this breaks the ubuntu/xla_gpu build bot.
+  auto it = std::find(container->begin(), container->end(), value);
+  TF_RET_CHECK(it != container->end());
+  container->erase(it);
+  return Status::OK();
+}
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
-- 
GitLab


From 1aa032b94f630845abf6c3dce8d6623ae9e35b0f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 14:35:27 -0700
Subject: [PATCH 1151/1262] Replaced calls to deprecated
 tensorflow::StringPiece methods with their tensorflow::str_util equivalents.

This will allow the deprecated methods to be removed.

PiperOrigin-RevId: 193575992
---
 tensorflow/core/platform/test_main.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/test_main.cc b/tensorflow/core/platform/test_main.cc
index 677114f5f2..e57bbd80af 100644
--- a/tensorflow/core/platform/test_main.cc
+++ b/tensorflow/core/platform/test_main.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #include <iostream>
 
-#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/stacktrace_handler.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -37,7 +37,7 @@ GTEST_API_ int main(int argc, char** argv) {
   tensorflow::testing::InstallStacktraceHandler();
   testing::InitGoogleTest(&argc, argv);
   for (int i = 1; i < argc; i++) {
-    if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
+    if (tensorflow::str_util::StartsWith(argv[i], "--benchmarks=")) {
       const char* pattern = argv[i] + strlen("--benchmarks=");
       tensorflow::testing::Benchmark::Run(pattern);
       return 0;
-- 
GitLab


From 470842748b9ee219fa0fcb8e3de25720960c83e3 Mon Sep 17 00:00:00 2001
From: Olivia Nordquist <nolivia@google.com>
Date: Thu, 19 Apr 2018 14:59:25 -0700
Subject: [PATCH 1152/1262] disabling opensource testing for failing xla test

PiperOrigin-RevId: 193579805
---
 tensorflow/compiler/xla/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 0517a5502e..0b9333b406 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -8,6 +8,7 @@ py_library(
     name = "xla_client",
     srcs = ["xla_client.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     visibility = ["//visibility:public"],
     deps = [
         ":pywrap_xla",
-- 
GitLab


From 2d0a7087a14f015ea49f4b8feb70e0b5ecd41b28 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 15:09:58 -0700
Subject: [PATCH 1153/1262] Only generate floating points that are fractions
 like n / 256, since they are RGB pixels. This fixes RGBToHSVTest.testBatch on
 low-precision dtypes like bfloat16.

PiperOrigin-RevId: 193581652
---
 tensorflow/compiler/tests/image_ops_test.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 5b19e993ec..42e637734c 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -34,20 +34,23 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
 
 
+def GenerateNumpyRandomRGB(shape):
+  # Only generate floating points that are fractions like n / 256, since they
+  # are RGB pixels. Some low-precision floating point types in this test can't
+  # handle arbitrary precision floating points well.
+  return np.random.randint(0, 256, shape) / 256.
+
+
 class RGBToHSVTest(XLATestCase):
 
   def testBatch(self):
-    # TODO(b/78230407): Reenable the test on GPU.
-    if self.device == "XLA_GPU":
-      return
-
     # Build an arbitrary RGB image
     np.random.seed(7)
     batch_size = 5
     shape = (batch_size, 2, 7, 3)
 
     for nptype in self.float_types:
-      inp = np.random.rand(*shape).astype(nptype)
+      inp = GenerateNumpyRandomRGB(shape).astype(nptype)
 
       # Convert to HSV and back, as a batch and individually
       with self.test_session() as sess:
@@ -87,7 +90,7 @@ class RGBToHSVTest(XLATestCase):
   def testRGBToHSVNumpy(self):
     """Tests the RGB to HSV conversion matches a reference implementation."""
     for nptype in self.float_types:
-      rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype)
+      rgb_flat = GenerateNumpyRandomRGB((64, 3)).astype(nptype)
       rgb_np = rgb_flat.reshape(4, 4, 4, 3)
       hsv_np = np.array([
           colorsys.rgb_to_hsv(
-- 
GitLab


From 38c0d7e1c0ee0617cf73ccf6809bd55d70089233 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 15:27:19 -0700
Subject: [PATCH 1154/1262] Convert a local variable and mutex to a struct so
 GUARDED_BY annotation works correctly.

PiperOrigin-RevId: 193584438
---
 tensorflow/core/kernels/sdca_ops.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 55e68b348b..05c835ebc4 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -156,8 +156,10 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
   } else {
     examples.RandomShuffle();
   }
-  mutex mu;
-  Status train_step_status GUARDED_BY(mu);
+  struct {
+    mutex mu;
+    Status value GUARDED_BY(mu);
+  } train_step_status;
   std::atomic<std::int64_t> atomic_index(-1);
   auto train_step = [&](const int64 begin, const int64 end) {
     // The static_cast here is safe since begin and end can be at most
@@ -171,8 +173,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
       const Status conversion_status =
           options.loss_updater->ConvertLabel(&example_label);
       if (!conversion_status.ok()) {
-        mutex_lock l(mu);
-        train_step_status = conversion_status;
+        mutex_lock l(train_step_status.mu);
+        train_step_status.value = conversion_status;
         // Return from this worker thread - the calling thread is
         // responsible for checking context status and returning on error.
         return;
@@ -217,7 +219,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
 
   Shard(worker_threads.num_threads, worker_threads.workers,
         examples.num_examples(), kCostPerUnit, train_step);
-  OP_REQUIRES_OK(context, train_step_status);
+  mutex_lock l(train_step_status.mu);
+  OP_REQUIRES_OK(context, train_step_status.value);
 }
 
 }  // namespace
-- 
GitLab


From 4bcf49c4b22205fc829f89da96e37f366c9fa9e6 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 15:29:21 -0700
Subject: [PATCH 1155/1262] Prevent a bool field from being accessed when
 uninitialized.

PiperOrigin-RevId: 193584746
---
 tensorflow/core/distributed_runtime/message_wrappers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 92c5668e3a..72a0c7edd8 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -353,7 +353,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 
  private:
   string session_handle_;
-  bool create_worker_session_called_;
+  bool create_worker_session_called_ = false;
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
-- 
GitLab


From 4868ddd508a567a497935378956e9da18976f152 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 19 Apr 2018 15:32:37 -0700
Subject: [PATCH 1156/1262] Simplifying cols_to_vars update

PiperOrigin-RevId: 193585237
---
 tensorflow/python/feature_column/feature_column.py      | 6 ++----
 tensorflow/python/feature_column/feature_column_test.py | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 87a52f8441..a7c4eabcb2 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -417,10 +417,8 @@ def linear_model(features,
       trainable=trainable,
       name='linear_model')
   retval = linear_model_layer(features)  # pylint: disable=not-callable
-  if cols_to_vars is None:
-    return retval
-  for k, v in linear_model_layer.cols_to_vars().items():
-    cols_to_vars[k] = v
+  if cols_to_vars is not None:
+    cols_to_vars.update(linear_model_layer.cols_to_vars())
   return retval
 
 
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 49e06b8245..d963dd9b55 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1269,10 +1269,8 @@ def get_keras_linear_model_predictions(features,
       trainable,
       name='linear_model')
   retval = keras_linear_model(features)  # pylint: disable=not-callable
-  if cols_to_vars is None:
-    return retval
-  for k, v in keras_linear_model.cols_to_vars().items():
-    cols_to_vars[k] = v
+  if cols_to_vars is not None:
+    cols_to_vars.update(keras_linear_model.cols_to_vars())
   return retval
 
 
-- 
GitLab


From f500bcb889b3598f386f59eb69a79af6b704bf50 Mon Sep 17 00:00:00 2001
From: joel-shor <joelshor@google.com>
Date: Fri, 20 Apr 2018 01:41:28 +0300
Subject: [PATCH 1157/1262] [tf.data] Allow `sample_from_datasets` to accept a
 tf.Dataset object for `weights`.

Tested:
bazel test :interleave_dataset_op_test
---
 .../interleave_dataset_op_test.py             | 59 +++++++++++--------
 .../contrib/data/python/ops/interleave_ops.py | 25 ++++----
 2 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index ff6d0c31aa..43aa4b1bd0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -928,8 +928,7 @@ class DirectedInterleaveDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def _normalize(self, vec):
-    batched = (len(vec.shape) == 2)
-    return vec / vec.sum(axis=1, keepdims=True) if batched else vec / vec.sum()
+    return vec / vec.sum()
 
   def _chi2(self, expected, actual):
     actual = np.asarray(actual)
@@ -938,35 +937,43 @@ class DirectedInterleaveDatasetTest(test.TestCase):
     chi2 = np.sum(diff * diff / expected, axis=0)
     return chi2
 
+  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
+    # Create a dataset that samples each integer in `[0, num_datasets)`
+    # with probability given by `weights[i]`.
+    dataset = interleave_ops.sample_from_datasets([
+        dataset_ops.Dataset.from_tensors(i).repeat(None)
+        for i in range(num_datasets)
+    ], weights)
+    dataset = dataset.take(num_samples)
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      freqs = np.zeros([num_datasets])
+      for _ in range(num_samples):
+        freqs[sess.run(next_element)] += 1
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+    return freqs
+
   def testSampleFromDatasets(self):
-    random_seed.set_random_seed(1618)
+    random_seed.set_random_seed(1619)
     num_samples = 10000
-    rand_probs = self._normalize(np.random.random_sample((10,)))
-    rand_probs2 = self._normalize(np.random.random_sample((15,)))
+    rand_probs = self._normalize(np.random.random_sample((15,)))
 
-    for probs in [[.5, .5], [.85, .05, .1], rand_probs, rand_probs2]:
+    # Use chi-squared test to assert that the observed distribution matches the
+    # expected distribution. Based on the implementation in
+    # "tensorflow/python/kernel_tests/multinomial_op_test.py".
+    for probs in [[.85, .05, .1], rand_probs]:
       probs = np.asarray(probs)
+      classes = len(probs)
+      freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples)
+      self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
 
-      # Create a dataset that samples each integer in `[0, probs.shape[0])`
-      # with probability given by `probs[i]`.
-      dataset = interleave_ops.sample_from_datasets([
-          dataset_ops.Dataset.from_tensors(i).repeat(None)
-          for i in range(probs.shape[0])
-      ], probs)
-      dataset = dataset.take(num_samples)
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with self.test_session() as sess:
-        freqs = np.zeros_like(probs)
-        for _ in range(num_samples):
-          freqs[sess.run(next_element)] += 1
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
-
-      # Use chi-squared test to assert that the observed distribution
-      # matches the expected distribution. Based on the implementation
-      # in "tensorflow/python/kernel_tests/multinomial_op_test.py".
+      # Also check that `weights` as a dataset samples correctly.
+      probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat()
+      freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
 
   def testErrors(self):
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 106a1ef388..5ae1fa9e9e 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -200,10 +200,10 @@ def sample_from_datasets(datasets, weights=None, seed=None):
 
   Args:
     datasets: A list of @{tf.data.Dataset} objects with compatible structure.
-    weights: (Optional.) A list of `len(datasets)` floating-point values,
-      where `weights[i]` represents the probability with which an element
-      should be sampled from `datasets[i]`. Defaults to a uniform distribution
-      across `datasets`.
+    weights: (Optional.) A list of `len(datasets)` floating-point values or a
+      @{tf.data.Dataset} object, where `weights[i]` represents the probability
+      with which an element should be sampled from `datasets[i]`. Defaults to a
+      uniform distribution across `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
       @{tf.set_random_seed} for behavior.
@@ -219,24 +219,23 @@ def sample_from_datasets(datasets, weights=None, seed=None):
   """
   num_datasets = len(datasets)
   if weights is None:
-    weights = array_ops.ones(
-        [num_datasets], dtype=dtypes.float32, name="weights")
-  else:
+    weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat()
+  elif not isinstance(weights, dataset_ops.Dataset):
     weights = ops.convert_to_tensor(weights, name="weights")
     if weights.dtype not in (dtypes.float32, dtypes.float64):
       raise TypeError("`weights` must be convertible to a tensor of "
                       "`tf.float32` or `tf.float64` elements.")
     if not weights.shape.is_compatible_with([num_datasets]):
       raise ValueError("`weights` must be a vector of length `len(datasets)`.")
+    weights = dataset_ops.Dataset.from_tensors(weights).repeat()
 
   # The `stateless_multinomial()` op expects log-probabilities, as opposed to
   # weights.
-  logits = math_ops.log(weights, name="logits")
-
-  def select_dataset(seed):
+  logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
+  def select_dataset(logits, seed):
     return array_ops.squeeze(
-        stateless.stateless_multinomial([logits], 1, seed=seed), axis=[0, 1])
-
-  selector_input = random_ops.RandomDataset(seed).batch(2).map(select_dataset)
+        stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
+  selector_input = dataset_ops.Dataset.zip(
+      (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset)
 
   return DirectedInterleaveDataset(selector_input, datasets)
-- 
GitLab


From d5c32f4ccc85ad0d13f3a1f83e063211504cf976 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 15:55:53 -0700
Subject: [PATCH 1158/1262] Internal-only change.

PiperOrigin-RevId: 193588868
---
 tensorflow/contrib/data/python/kernel_tests/BUILD | 1 +
 tensorflow/contrib/estimator/BUILD                | 1 +
 tensorflow/contrib/learn/BUILD                    | 5 ++++-
 tensorflow/python/kernel_tests/BUILD              | 3 +++
 tensorflow/python/kernel_tests/linalg/BUILD       | 5 ++++-
 5 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 83daa04efc..05a4f5028a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -216,6 +216,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "noasan",  # times out
         "optonly",
     ],
     deps = [
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 9e88bc7de1..62ddb3d290 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -447,6 +447,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "noasan",  # times out
         "notsan",
     ],
     deps = [
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index d665fc9335..3b053cd4c6 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -281,7 +281,10 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/estimator_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "noasan",  # times out
+    ],
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 9440f2a4f9..8628ca5d40 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1190,6 +1190,9 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
+    tags = [
+        "noasan",  # times out
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 4e3f24890b..7ffa48b653 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -123,7 +123,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
-    tags = ["optonly"],
+    tags = [
+        "noasan",  # times out
+        "optonly",
+    ],
 )
 
 cuda_py_test(
-- 
GitLab


From 9e5fdb83e609701457f6fdc2d153b1f7e83ead6c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 15:56:17 -0700
Subject: [PATCH 1159/1262] Automated g4 rollback of changelist 193564222

PiperOrigin-RevId: 193588935
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 +--
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 52 ++-----------------
 .../python/kernel_tests/image_ops_test.py     | 30 -----------
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++--------
 5 files changed, 23 insertions(+), 107 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index ae4b1ba62a..c2e32da133 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,7 +70,6 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
-    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -84,11 +83,7 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    // Image is NHWC format.
-    auto output_shape = images_t.shape();
-    output_shape.set_dim(1, output_dim.vec<int>()(0));
-    output_shape.set_dim(2, output_dim.vec<int>()(1));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 2320329b92..ad50133061 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = output->generate(
+    output->device(device) = images.generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 4c6d8c0d19..68771b3d05 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,55 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-namespace {
-
-// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
-// height and width come from the size_tensor.
-Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
-                             int size_input_idx, DimensionHandle channel_dim) {
-  // Verify shape of size input.
-  ShapeHandle size;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
-  DimensionHandle unused;
-  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
-
-  // Get size values from the size tensor.
-  const Tensor* size_tensor = c->input_tensor(size_input_idx);
-  DimensionHandle width;
-  DimensionHandle height;
-  if (size_tensor == nullptr) {
-    width = c->UnknownDim();
-    height = c->UnknownDim();
-  } else {
-    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
-    if (size_tensor->dtype() != DT_INT32) {
-      return errors::InvalidArgument(
-          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
-          "but got ",
-          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
-          " in ", c->DebugString());
-    }
-    auto vec = size_tensor->vec<int32>();
-    height = c->MakeDim(vec(0));
-    width = c->MakeDim(vec(1));
-  }
-  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
-  return Status::OK();
-}
-
-Status ResizeShapeFn(InferenceContext* c) {
-  ShapeHandle input;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
-  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
-                               c->Dim(input, 3));
-}
-
-}  // namespace
-
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
@@ -75,11 +29,13 @@ Status ResizeShapeFn(InferenceContext* c) {
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
-    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn(ResizeShapeFn)
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index c0151d320f..b50177ae56 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -195,40 +195,10 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
-  def _test_grad_different_shape(self, input_shape, output_shape):
-    with self.test_session():
-      test_image_shape = input_shape
-      test_image = np.random.randn(*test_image_shape)
-      test_image_tensor = constant_op.constant(
-          test_image, shape=test_image_shape)
-      test_transform = image_ops.angles_to_projective_transforms(
-          np.pi / 2, 4, 4)
-
-      if len(output_shape) == 2:
-        resize_shape = output_shape
-      elif len(output_shape) == 3:
-        resize_shape = output_shape[0:2]
-      elif len(output_shape) == 4:
-        resize_shape = output_shape[1:3]
-      output = image_ops.transform(
-          images=test_image_tensor,
-          transforms=test_transform,
-          output_shape=resize_shape)
-      left_err = gradient_checker.compute_gradient_error(
-          test_image_tensor,
-          test_image_shape,
-          output,
-          output_shape,
-          x_init_value=test_image)
-      self.assertLess(left_err, 1e-10)
-
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
-    self._test_grad_different_shape([16, 16], [8, 8])
-    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
-    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 0cb7bdc75d..c139ae89d8 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -212,11 +212,7 @@ def translations_to_projective_transforms(translations, name=None):
         axis=1)
 
 
-def transform(images,
-              transforms,
-              output_shape=None,
-              interpolation="NEAREST",
-              name=None):
+def transform(images, transforms, interpolation="NEAREST", name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -232,10 +228,7 @@ def transform(images,
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
-    output_shape: Output dimesion after the transform, [height, width].
-       If None, output is the same size as input image.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
-    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -262,14 +255,6 @@ def transform(images,
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
-    if output_shape is None:
-      output_shape = images.get_shape()[1:3]
-    elif len(output_shape) != 2:
-      raise TypeError(
-          "output_shape must either be None or a vector of 2 elements.")
-    output_shape = ops.convert_to_tensor(
-        output_shape, name="output_shape", dtype=dtypes.int32)
-
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -280,7 +265,7 @@ def transform(images,
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
     output = gen_image_ops.image_projective_transform(
-        images, transforms, output_shape, interpolation=interpolation.upper())
+        images, transforms, interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -390,6 +375,14 @@ def _image_projective_transform_grad(op, grad):
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
+  if len(image_or_images.get_shape()) == 2:
+    images = image_or_images[None, :, :, None]
+  elif len(image_or_images.get_shape()) == 3:
+    images = image_or_images[None, :, :, :]
+  elif len(image_or_images.get_shape()) == 4:
+    images = image_or_images
+  else:
+    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -402,11 +395,13 @@ def _image_projective_transform_grad(op, grad):
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      images=grad,
-      transforms=transforms,
-      output_shape=image_or_images.get_shape()[1:3],
-      interpolation=interpolation)
-  return [output, None, None]
+      grad, transforms, interpolation=interpolation)
+  if len(image_or_images.get_shape()) == 2:
+    return [output[0, :, :, 0], None]
+  elif len(image_or_images.get_shape()) == 3:
+    return [output[0, :, :, :], None]
+  else:
+    return [output, None]
 
 
 def bipartite_match(distance_mat,
-- 
GitLab


From c3f5d8c53295d9740c622f5221464c23559747ad Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Thu, 19 Apr 2018 16:02:09 -0700
Subject: [PATCH 1160/1262] Update install_python3.5_pip_packages.sh

---
 .../tools/ci_build/install/install_python3.5_pip_packages.sh   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index aefc49f604..204a82f647 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,9 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade setuptools
+pip3.5 install --upgrade pip
+
 pip3.5 install --upgrade virtualenv
 
 # Install six.
-- 
GitLab


From d4402725d2f6d9a8c5273ab1474117a27dd455c9 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 16:30:02 -0700
Subject: [PATCH 1161/1262] Make xla/service:cpu_plugin depend on the
 StreamExecutor host platform.

PiperOrigin-RevId: 193593761
---
 tensorflow/compiler/xla/service/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9009cbf845..d5d09bd8a3 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -699,6 +699,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:stream_executor_impl",
     ],
 )
 
-- 
GitLab


From 704ac94a8e362feb3710391787342fe36187b9ef Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 19 Apr 2018 16:30:26 -0700
Subject: [PATCH 1162/1262] Cleaned up the handling of merge nodes

PiperOrigin-RevId: 193593810
---
 .../core/grappler/costs/graph_properties.cc   | 89 +++++++------------
 1 file changed, 32 insertions(+), 57 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index dd2d53dfdf..a0125ce342 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -670,6 +670,29 @@ class SymbolicShapeRefiner {
     return true;
   }
 
+  Status AddNode(const Node* node) {
+    // Create the inference context for this node.
+    std::vector<ShapeHandle> input_shapes(node->num_inputs());
+    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
+        input_handle_shapes_and_types(node->num_inputs());
+    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes;
+
+    NodeContext& node_ctx = node_to_context_[node];
+    TF_RETURN_IF_ERROR(
+        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
+
+    node_ctx.inference_context.reset(new InferenceContext(
+        graph_def_version_, &node->def(), node->op_def(), input_shapes,
+        input_tensors, input_tensors_as_shapes,
+        std::move(input_handle_shapes_and_types)));
+    const Status s = node_ctx.inference_context->construction_status();
+    if (!s.ok()) {
+      node_ctx.inference_context.reset(nullptr);
+    }
+    return s;
+  }
+
  private:
   // Return the one ShapeHandle used to denote a fully unknown shape for a node
   // output.
@@ -698,29 +721,6 @@ class SymbolicShapeRefiner {
     return dim;
   }
 
-  Status AddNode(const Node* node) {
-    // Create the inference context for this node.
-    std::vector<ShapeHandle> input_shapes(node->num_inputs());
-    std::vector<std::unique_ptr<std::vector<ShapeAndType>>>
-        input_handle_shapes_and_types(node->num_inputs());
-    std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
-    std::vector<ShapeHandle> input_tensors_as_shapes;
-
-    NodeContext& node_ctx = node_to_context_[node];
-    TF_RETURN_IF_ERROR(
-        function_library_.LookUp(node->type_string(), &node_ctx.op_data));
-
-    node_ctx.inference_context.reset(new InferenceContext(
-        graph_def_version_, &node->def(), node->op_def(), input_shapes,
-        input_tensors, input_tensors_as_shapes,
-        std::move(input_handle_shapes_and_types)));
-    const Status s = node_ctx.inference_context->construction_status();
-    if (!s.ok()) {
-      node_ctx.inference_context.reset(nullptr);
-    }
-    return s;
-  }
-
   struct NodeContext {
     const OpRegistrationData* op_data;
     std::unique_ptr<InferenceContext> inference_context;
@@ -929,37 +929,16 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
                                         bool* new_shapes) const {
   InferenceContext* c = shape_refiner->GetContext(node);
   if (!c) {
-    // The shape refiner can't handle loops. Therefore we first need to remove
-    // all edges
-    std::vector<Edge> edges;
-    std::vector<const Edge*> edge_ptrs;
-    for (const Edge* edge : node->in_edges()) {
-      if (!edge->IsControlEdge()) {
-        edges.push_back(*edge);
-        edge_ptrs.push_back(edge);
-      }
-    }
-    for (const Edge* edge : edge_ptrs) {
-      if (!edge->IsControlEdge()) {
-        graph_->RemoveEdge(edge);
-      }
-    }
     // Now we can run shape inference
-    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, relax, new_shapes));
-    // And add all the edges back
-    for (const Edge& edge : edges) {
-      graph_->AddEdge(edge.src(), edge.src_output(), edge.dst(),
-                      edge.dst_input());
-    }
-
-    c = shape_refiner->GetContext(node);
+    TF_RETURN_IF_ERROR(shape_refiner->AddNode(node));
+    c = CHECK_NOTNULL(shape_refiner->GetContext(node));
     *new_shapes = true;
-    CHECK_NE(c, nullptr);
-  }
 
-  ShapeHandle out1;
-  TF_RETURN_IF_ERROR(c->WithRank(c->output(1), 0, &out1));
-  c->set_output(1, out1);
+    // Infer the shape of the second output once and for all since it never
+    // changes.
+    ShapeHandle out1 = c->Scalar();
+    c->set_output(1, out1);
+  }
 
   ShapeHandle out;
   bool out_initialized = false;
@@ -981,11 +960,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
       continue;
     }
     ShapeHandle input = in->output(e->src_output());
-    if (relax) {
-      c->RelaxInput(e->dst_input(), input);
-    } else {
-      c->MergeInput(e->dst_input(), input);
-    }
+    c->SetInput(e->dst_input(), input);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
@@ -998,7 +973,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
     }
   }
 
-  if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
+  if (*new_shapes || !shape_refiner->EquivalentShapes(out, c->output(0))) {
     c->set_output(0, out);
     *new_shapes = true;
   }
-- 
GitLab


From c93a883fcea141dc0f63fe63afcd9490e39e3eaf Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Thu, 19 Apr 2018 16:35:40 -0700
Subject: [PATCH 1163/1262] Improve error messages for LiteralTestUtil::Near.
 Previously error messages for mismatches were difficult to read with much of
 the space taken by useless stack traces. This CL cleans up the message
 considerably and adds additional information including statistics about the
 values and mismatches.

PiperOrigin-RevId: 193594593
---
 .../compiler/xla/tests/literal_test_util.cc   | 772 +++++++++++-------
 .../compiler/xla/tests/literal_test_util.h    |   9 +-
 .../xla/tests/literal_test_util_test.cc       |   2 +-
 3 files changed, 473 insertions(+), 310 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 81630df34c..c28f79ae38 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -39,6 +39,11 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::strings::Appendf;
+using ::tensorflow::strings::Printf;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
 /* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes(
     const Shape& expected, const Shape& actual) {
   if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) {
@@ -173,14 +178,11 @@ template <typename FloatT, typename UnsignedT>
   auto lhs_double = static_cast<double>(lhs);
   auto rhs_double = static_cast<double>(rhs);
   if (ulhs != urhs) {
-    return ::testing::AssertionFailure() << tensorflow::strings::Printf(
+    return ::testing::AssertionFailure() << Printf(
                "floating values are not bitwise-equal; and equality testing "
                "was requested: %s=%g=%a vs %s=%g=%a",
-               tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
-                   .c_str(),
-               lhs_double, lhs_double,
-               tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs))
-                   .c_str(),
+               StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double,
+               lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(),
                rhs_double, rhs_double);
   }
   return ::testing::AssertionSuccess();
@@ -264,9 +266,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
       << "expected:\n"
       << expected.ToString() << "\n\tvs actual:\n"
       << actual.ToString()
-      << (message.empty()
-              ? ""
-              : tensorflow::strings::StrCat("\nmessage: ", message));
+      << (message.empty() ? "" : StrCat("\nmessage: ", message));
 }
 
 /* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected,
@@ -321,9 +321,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case TUPLE: {
       bool tuple_match = true;
       for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-        SCOPED_TRACE(tensorflow::strings::StrCat(
-            "Tuple index ", i, " in ",
-            ShapeUtil::HumanString(expected.shape())));
+        SCOPED_TRACE(StrCat("Tuple index ", i, " in ",
+                            ShapeUtil::HumanString(expected.shape())));
 
         // Create LiteralViews of the expected and actual elements.
         auto result = Equal(LiteralView::Create(expected, {i}),
@@ -350,227 +349,301 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
 
 namespace {
 
+// Gets the total element count.  For tuples, this is not the count of tuple
+// elements, but the sum of elements of each tuple element.
+int64 RecursiveElementCount(const Shape& shape) {
+  if (ShapeUtil::IsTuple(shape)) {
+    const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+    int64 total = 0;
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+    }
+    return total;
+  } else {
+    return ShapeUtil::ElementsIn(shape);
+  }
+}
+
+// Calling ToString on a literal with over 100 million elements takes around
+// 3 minutes.  The utility of printing a literal with >1000 elements is
+// questionable, especially when writing the Literal proto to disk is orders
+// of magnitude faster.
+string TruncateHugeLiteral(const Literal& literal) {
+  return RecursiveElementCount(literal.shape()) < 1000
+             ? literal.ToString()
+             : "[TRUNCATED, Literal with more than 1000 values]";
+}
+
+// Returns whether the actual and expected values are mismatched with respect to
+// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+template <typename NativeT>
+bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
+  if (relaxed_nans) {
+    return !std::isnan(expected) && std::isnan(actual);
+  } else {
+    return std::isnan(expected) != std::isnan(actual);
+  }
+}
+
+template <>
+bool NanMismatch<complex64>(complex64 expected, complex64 actual,
+                            bool relaxed_nans) {
+  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
+         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
+}
+
+template <>
+bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
+  return NanMismatch<float>(static_cast<float>(expected),
+                            static_cast<float>(actual), relaxed_nans);
+}
+
+// Converts the given floating-point value to a string.
+template <typename NativeT>
+string FpValueToString(NativeT value) {
+  return Printf("%8.4g", static_cast<double>(value));
+}
+
+template <>
+string FpValueToString<complex64>(complex64 value) {
+  return Printf("%8.4g + %8.4fi", value.real(), value.imag());
+}
+
+// Returns the absolute value of the given floating point value. This function
+// is used instead of std::abs directly in order to allow type-dependent
+// implementations for NearComparator.
+template <typename NativeT>
+float FpAbsoluteValue(NativeT value) {
+  return std::abs(value);
+}
+
+template <>
+float FpAbsoluteValue(bfloat16 value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
+template <>
+float FpAbsoluteValue(half value) {
+  return FpAbsoluteValue<float>(static_cast<float>(value));
+}
+
 // Helper class for comparing floating-point literals within an error bound.
+template <typename NativeT>
 class NearComparator {
  public:
-  explicit NearComparator(ErrorSpec error) : error_(error) {}
+  // Compares the two array literals elementwise and returns an assertion
+  // result. The assertion result is successful if all actual and expected
+  // elements are within the given error bound. In case of error, the assertion
+  // result contains a detailed error message in case of failure.
+  static ::testing::AssertionResult Compare(const Literal& expected,
+                                            const Literal& actual,
+                                            ErrorSpec error,
+                                            bool detailed_message) {
+    NearComparator<NativeT> comparator(expected, actual, error,
+                                       detailed_message);
+    return comparator.Run();
+  }
+
+ private:
+  // Data structure encapsulating metadata about a single element mismatch.
+  struct Mismatch {
+    NativeT actual;
+    NativeT expected;
+    float rel_error;
+    float abs_error;
+
+    // The linear index of the failure within the shape. This linear index is
+    // from the 'actual' literal.
+    int64 linear_index;
+
+    bool operator<(const Mismatch& other) const {
+      return rel_error < other.rel_error;
+    }
 
-  // Compares the two literals elementwise. EXPECTs each pair of elements to be
-  // within the error bound. Emits useful log messages and dumps literals to
-  // temporary files on failure. Returns true if  literals match.
-  bool ExpectNear(const Literal& expected, const Literal& actual) {
+    string ToString(const Shape& shape) const {
+      return Printf(
+          "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g",
+          FpValueToString(actual).c_str(), FpValueToString(expected).c_str(),
+          LiteralTestUtil::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          rel_error, abs_error);
+    }
+  };
+
+  explicit NearComparator(const Literal& expected, const Literal& actual,
+                          ErrorSpec error, bool detailed_message)
+      : expected_(expected),
+        actual_(actual),
+        error_(error),
+        detailed_message_(detailed_message),
+        abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}),
+        abs_error_buckets_(kErrorBucketBounds.size(), 0),
+        rel_error_buckets_(kErrorBucketBounds.size(), 0) {}
+
+  // Runs the comparison between expected and actual literals.
+  ::testing::AssertionResult Run() {
     VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected));
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_));
     VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual));
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_));
 
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
     ::testing::AssertionResult equal_shapes =
-        LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
+        LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape());
     if (!equal_shapes) {
-      EXPECT_TRUE(equal_shapes);
-      return false;
+      return equal_shapes;
     }
-
-    // Set up members used during the comparison.
-    num_miscompares_ = 0;
-    abs_diff_sum_ = 0.0;
-    abs_expected_sum_ = 0.0;
-    abs_diff_miscompare_sum_ = 0.0;
-    abs_expected_miscompare_sum_ = 0.0;
-    max_rel_err_ = 0.0;
-    max_abs_err_ = 0.0;
-    first_linear_index_ = -1;
-    last_linear_index_ = -1;
-    max_rel_linear_index_ = -1;
-    max_abs_linear_index_ = -1;
-    miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
-    miscompares_.PopulateWithValue(false);
-    multi_index_.resize(expected.shape().dimensions_size(), 0);
-
-    switch (expected.shape().element_type()) {
-      case BF16:
-        ExpectLiteralsNear<bfloat16>(expected, actual, 0);
-        break;
-      case F16:
-        ExpectLiteralsNear<half>(expected, actual, 0);
-        break;
-      case F32:
-        ExpectLiteralsNear<float>(expected, actual, 0);
-        break;
-      case F64:
-        ExpectLiteralsNear<double>(expected, actual, 0);
-        break;
-      case C64:
-        ExpectLiteralsNear<complex64>(expected, actual, 0);
-        break;
-      default:
-        LOG(FATAL) << "Unsupported primitive type in near comparator: "
-                   << PrimitiveType_Name(expected.shape().element_type())
-                   << ". Must be floating-point type.";
+    if (!ShapeUtil::IsArray(expected_.shape())) {
+      return ::testing::AssertionFailure() << "Expected array shape";
     }
 
-    if (num_miscompares_ > 0) {
-      if (!VLOG_IS_ON(1)) {
-        LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << TruncateHugeLiteral(expected);
-        LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << TruncateHugeLiteral(actual);
-        LOG(INFO) << "Dumping literals to temp files...";
-        WriteLiteralToTempFile(expected, "expected");
-        WriteLiteralToTempFile(actual, "actual");
-        WriteLiteralToTempFile(miscompares_, "miscompares");
-      }
-      EXPECT_TRUE(num_miscompares_ == 0)
-          << "\nmax relative mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), max_rel_linear_index_))
-          << "\nmaximum relative error " << max_rel_err_
-          << "\nmax absolute mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), max_abs_linear_index_))
-          << "\nmaximum absolute error " << max_abs_err_
-          << "\nfirst mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), first_linear_index_))
-          << "\nlast mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(
-                 IndexUtil::LinearIndexToMultidimensionalIndex(
-                     actual.shape(), last_linear_index_))
-          << "\ntotal absolute error " << abs_diff_sum_
-          << "\ntotal absolute error of miscompares "
-          << abs_diff_miscompare_sum_ << "\ntotal relative error "
-          << (abs_diff_sum_ / abs_expected_sum_)
-          << "\ntotal relative error of miscompares "
-          << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_)
-          << "\nfailure count " << num_miscompares_;
+    mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED));
+    mismatches_.PopulateWithValue(false);
+
+    CompareLiterals();
+
+    if (num_mismatches_ == 0) {
+      return ::testing::AssertionSuccess();
+    } else if (!VLOG_IS_ON(1)) {
+      LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape())
+                << " " << TruncateHugeLiteral(expected_);
+      LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual_.shape())
+                << " " << TruncateHugeLiteral(actual_);
+      LOG(INFO) << "Dumping literals to temp files...";
+      WriteLiteralToTempFile(expected_, "expected");
+      WriteLiteralToTempFile(actual_, "actual");
+      WriteLiteralToTempFile(mismatches_, "mismatches");
     }
-    return num_miscompares_ == 0;
+    return ::testing::AssertionFailure() << ErrorMessage();
   }
 
- private:
-  template <typename NativeT>
-  bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-    if (relaxed_nans) {
-      return !std::isnan(expected) && std::isnan(actual);
-    } else {
-      return std::isnan(expected) != std::isnan(actual);
+  // Insert the given absolute value into the absolute value bucket vector. The
+  // bounds of the buckets are given by kAbsValueBucketBounds.
+  void UpdateAbsValueBucket(NativeT value, bool is_mismatch) {
+    // Adjust the bucket containing the absolute values of the 'actual'
+    // elements.
+    const float abs_value = FpAbsoluteValue(value);
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      if (i == abs_value_buckets_.size() - 1 ||
+          (abs_value >= kAbsValueBucketBounds[i] &&
+           abs_value < kAbsValueBucketBounds[i + 1])) {
+        // The first value of the pair is the count of elements in the bucket,
+        // the second is the count of mismatches in the bucket.
+        abs_value_buckets_[i].first++;
+        if (is_mismatch) {
+          abs_value_buckets_[i].second++;
+        }
+        return;
+      }
     }
   }
 
-  template <typename NativeT>
-  void ExpectNear(NativeT expected, NativeT actual,
-                  const ::testing::Message& message) {
-    EXPECT_NEAR(expected, actual, error_.abs)
-        << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-        << message;
-  }
-
-  // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occurred to keep the size of the output
-  // manageable.
-  template <typename NativeT>
-  bool ExpectValuesNear(NativeT expected, NativeT actual) {
-    if (expected == actual) {
-      return true;
+  // Insert the given error into the given error bucket vector.
+  void UpdateErrorBucket(
+      float error, tensorflow::gtl::MutableArraySlice<int64> error_buckets) {
+    CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size());
+    for (int i = 0; i < error_buckets.size(); ++i) {
+      if (error >= kErrorBucketBounds[i]) {
+        error_buckets[i]++;
+      }
     }
-
-    const float abs_diff = std::abs(actual - expected);
-    const float rel_err = abs_diff / std::abs(expected);
-    const bool nan_mismatch =
-        NanMismatch<NativeT>(expected, actual, error_.relaxed_nans);
-    const bool mismatch =
-        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
-    return !mismatch;
   }
 
-  // Assumes that expected vs actual fail ExpectValuesNear.
-  template <typename NativeT>
-  void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual,
-                               const Shape& shape, const int64 linear_index) {
-    const float abs_diff = std::abs(actual - expected);
-    const float rel_err = abs_diff / std::abs(expected);
-    abs_diff_sum_ += abs_diff;
-    abs_expected_sum_ += std::abs(expected);
-    if (rel_err > max_rel_err_ || std::isnan(rel_err)) {
-      max_rel_err_ = rel_err;
-      max_rel_linear_index_ = linear_index;
+  // Compares the two given elements from the expected and actual literals at
+  // the given literal_index and keeps track of various mismatch statistics.
+  void CompareValues(NativeT expected, NativeT actual, int64 linear_index) {
+    const bool is_nan_mismatch =
+        NanMismatch(expected, actual, error_.relaxed_nans);
+    float abs_error;
+    float rel_error;
+    if (actual == expected) {
+      abs_error = 0;
+      rel_error = 0;
+    } else if (is_nan_mismatch) {
+      num_nan_mismatches_++;
+      // A nan mismatch is considered to have infinite error. rel_error is used
+      // for sorting a std::set of the top mismatchs, and a nan value here will
+      // result in undefined behavior because nan's do not satisfy the strict
+      // weak ordering requirement of std containers.
+      abs_error = std::numeric_limits<float>::infinity();
+      rel_error = std::numeric_limits<float>::infinity();
+    } else {
+      abs_error = FpAbsoluteValue(actual - expected);
+      rel_error = abs_error / FpAbsoluteValue(expected);
     }
-    if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) {
-      max_abs_err_ = abs_diff;
-      max_abs_linear_index_ = linear_index;
+    const bool is_abs_mismatch = abs_error > error_.abs;
+    const bool is_rel_mismatch = rel_error > error_.rel;
+    const bool is_mismatch =
+        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+
+    // Update the error of the relative bucket only if the *absolute* error
+    // bound is exceeded and vice versa.
+    if (is_abs_mismatch) {
+      num_abs_mismatches_++;
+      UpdateErrorBucket(rel_error, &rel_error_buckets_);
     }
-    if (VLOG_IS_ON(10)) {
-      VLOG(10) << tensorflow::strings::Printf(
-          "index %s abs_diff %f rel_err %f",
-          LiteralTestUtil::MultiIndexAsString(
-              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
-                                                            linear_index))
-              .c_str(),
-          abs_diff, rel_err);
+    if (is_rel_mismatch) {
+      num_rel_mismatches_++;
+      UpdateErrorBucket(abs_error, &abs_error_buckets_);
     }
-    abs_diff_miscompare_sum_ += abs_diff;
-    abs_expected_miscompare_sum_ += std::abs(expected);
-    const int64 kMaxFailures = 2;
-    if (num_miscompares_ < kMaxFailures) {
-      const auto multi_index =
-          IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index);
-      ::testing::Message msg;
-      msg << "mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff "
-          << abs_diff << " rel err " << rel_err << " failure #"
-          << num_miscompares_;
-      ExpectNear<NativeT>(expected, actual, msg);
-    } else if (num_miscompares_ == kMaxFailures) {
-      LOG(ERROR) << "reached max 'loud' failure count; silently proceeding...";
+
+    UpdateAbsValueBucket(actual, is_mismatch);
+
+    if (!is_mismatch) {
+      return;
     }
-    if (num_miscompares_ == 0) {
-      first_linear_index_ = linear_index;
+
+    num_mismatches_++;
+
+    // Keep track of the kTopRelativeErrorCount relative error mismatches.
+    if (top_rel_mismatches_.size() < kTopRelativeErrorCount ||
+        rel_error > top_rel_mismatches_.begin()->rel_error) {
+      Mismatch mismatch = {actual, expected, rel_error, abs_error,
+                           linear_index};
+      top_rel_mismatches_.insert(mismatch);
+      if (top_rel_mismatches_.size() > kTopRelativeErrorCount) {
+        top_rel_mismatches_.erase(top_rel_mismatches_.begin());
+      }
     }
-    num_miscompares_++;
-    last_linear_index_ = linear_index;
-    miscompares_.data<bool>()[linear_index] = true;
+
+    mismatches_.data<bool>()[linear_index] = true;
   }
 
-  // Recursive function which compares the two given literals elementwise.
-  template <typename NativeT>
-  void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
-                          int64 dimension) {
+  // Compares the two literals elementwise.
+  void CompareLiterals() {
     // Fast path optimization for the case were layouts match.
-    if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) {
+    if (LayoutUtil::Equal(actual_.shape().layout(),
+                          expected_.shape().layout())) {
       tensorflow::gtl::ArraySlice<const NativeT> expected_data =
-          expected.data<NativeT>();
+          expected_.data<NativeT>();
       tensorflow::gtl::ArraySlice<const NativeT> actual_data =
-          actual.data<NativeT>();
+          actual_.data<NativeT>();
       const int64 len = expected_data.size();
       for (int64 i = 0; i < len; ++i) {
-        const bool near = ExpectValuesNear(expected_data[i], actual_data[i]);
-        if (!near) {
-          UpdateAndLogMiscompares<NativeT>(expected_data[i], actual_data[i],
-                                           actual.shape(), i);
-        }
+        CompareValues(expected_data[i], actual_data[i], i);
       }
       return;
     }
+    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    CompareLiteralsSlow(0, &multi_index);
+  }
 
-    if (dimension == expected.shape().dimensions_size()) {
-      bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
-                                   actual.Get<NativeT>(multi_index_));
-      if (!near) {
-        UpdateAndLogMiscompares<NativeT>(
-            expected.Get<NativeT>(multi_index_),
-            actual.Get<NativeT>(multi_index_), actual.shape(),
-            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
-                                                          multi_index_));
-      }
+  // Slow path for CompareLiterals when 'actual' and 'expected' literals have
+  // different layouts. In this case, multidimensional indices are constructed
+  // and indexed for each element.
+  void CompareLiteralsSlow(int64 dimension, std::vector<int64>* multi_index) {
+    if (dimension == multi_index->size()) {
+      CompareValues(expected_.Get<NativeT>(*multi_index),
+                    actual_.Get<NativeT>(*multi_index),
+                    IndexUtil::MultidimensionalIndexToLinearIndex(
+                        actual_.shape(), *multi_index));
     } else {
-      for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
-        multi_index_[dimension] = i;
-        ExpectLiteralsNear<NativeT>(expected, actual, dimension + 1);
+      for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) {
+        (*multi_index)[dimension] = i;
+        CompareLiteralsSlow(dimension + 1, multi_index);
       }
     }
   }
@@ -580,159 +653,247 @@ class NearComparator {
     int64 now_usec = tensorflow::Env::Default()->NowMicros();
     string filename = tensorflow::io::JoinPath(
         tensorflow::testing::TmpDir(),
-        tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(),
-                                    now_usec, name.c_str()));
+        Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec,
+               name.c_str()));
     TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(),
                                              filename, literal.ToProto()));
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
-  // Gets the total element count.  For tuples, this is not the count of tuple
-  // elements, but the sum of elements of each tuple element.
-  int64 RecursiveElementCount(const Shape& shape) {
-    if (ShapeUtil::IsTuple(shape)) {
-      const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
-      int64 total = 0;
-      for (int64 i = 0; i < tuple_elements; ++i) {
-        total +=
-            RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
-      }
-      return total;
-    } else {
-      return ShapeUtil::ElementsIn(shape);
+  // Returns an error message string with a detailed breakdown of the
+  // mismatches. Called after calling Run().
+  string ErrorMessage() {
+    string out;
+    int64 element_count = ShapeUtil::ElementsIn(actual_.shape());
+
+    auto percent_string = [](float a, float b) {
+      float pct = b == 0.0 ? 0.0 : 100.0 * a / b;
+      return Printf("%0.4f%%", pct);
+    };
+
+    Appendf(&out,
+            "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound "
+            "%g, rel bound %g\n",
+            num_mismatches_,
+            percent_string(num_mismatches_, element_count).c_str(),
+            ShapeUtil::HumanString(actual_.shape()).c_str(),
+            ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel);
+    if (num_nan_mismatches_ > 0) {
+      StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n");
+    }
+    Appendf(&out, "Top relative error mismatches:\n");
+    for (auto it = top_rel_mismatches_.rbegin();
+         it != top_rel_mismatches_.rend(); ++it) {
+      StrAppend(&out, "  ", it->ToString(actual_.shape()).c_str(), "\n");
     }
-  }
 
-  // Calling ToString on a literal with over 100 million elements takes around
-  // 3 minutes.  The utility of printing a literal with >1000 elements is
-  // questionable, especially when writing the Literal proto to disk is orders
-  // of magnitude faster.
-  string TruncateHugeLiteral(const Literal& literal) {
-    return RecursiveElementCount(literal.shape()) < 1000
-               ? literal.ToString()
-               : "[TRUNCATED, Literal with more than 1000 values]";
-  }
+    if (!detailed_message_) {
+      return out;
+    }
 
-  ErrorSpec error_;
+    StrAppend(&out, "Absolute magnitude breakdown of actual values:\n");
+    CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size());
+    for (int i = 0; i < abs_value_buckets_.size(); ++i) {
+      const int64 bucket_size = abs_value_buckets_[i].first;
+      const int64 bucket_mismatches = abs_value_buckets_[i].second;
+      string mismatch_str = bucket_mismatches > 0
+                                ? Printf(", mismatches %lld", bucket_mismatches)
+                                : "";
+      Appendf(&out, "  %-6g <= x < %-6g : %7lld (%9s)%s\n",
+              kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1],
+              bucket_size, percent_string(bucket_size, element_count).c_str(),
+              mismatch_str.c_str());
+    }
 
-  // Number of element miscomparisons encountered so far.
-  int64 num_miscompares_;
+    auto print_accum_buckets = [&](const string& header, int64 total,
+                                   tensorflow::gtl::ArraySlice<int64> buckets) {
+      StrAppend(&out, header, ":\n");
+      Appendf(&out, "  <  %-6g : %7lld (%s)\n", kErrorBucketBounds[0],
+              total - buckets[0],
+              percent_string(total - buckets[0], total).c_str());
+      CHECK_EQ(buckets.size(), kErrorBucketBounds.size());
+      for (int i = 0; i < kErrorBucketBounds.size(); ++i) {
+        Appendf(&out, "  >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i],
+                buckets[i], percent_string(buckets[i], total).c_str());
+      }
+    };
+    Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n",
+            error_.abs, num_abs_mismatches_,
+            percent_string(num_abs_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Relative error breakdown of elements exceeding abs error bound",
+        num_abs_mismatches_, rel_error_buckets_);
+    Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n",
+            error_.rel, num_rel_mismatches_,
+            percent_string(num_rel_mismatches_, element_count).c_str());
+    print_accum_buckets(
+        "Absolute error breakdown of elements exceeding rel error bound",
+        num_rel_mismatches_, abs_error_buckets_);
+    return out;
+  }
 
-  // A Literal containing which elements did not match in the expected and
-  // actual literals. miscompares_ contains PREDs and is of the same sizes as
-  // the comparison literals.
-  Literal miscompares_;
-
-  // A multidimensional index used when performing the recursive comparison.
-  std::vector<int64> multi_index_;
-
-  // Aggregated Statistics on input.
-  double abs_diff_sum_;
-  double abs_expected_sum_;
-  double abs_diff_miscompare_sum_;
-  double abs_expected_miscompare_sum_;
-  float max_rel_err_;
-  float max_abs_err_;
-  int64 first_linear_index_;
-  int64 last_linear_index_;
-  int64 max_rel_linear_index_;
-  int64 max_abs_linear_index_;
-};
+  // 'actual' and 'expected' literals being compared.
+  const Literal& expected_;
+  const Literal& actual_;
 
-template <>
-bool NearComparator::NanMismatch<complex64>(complex64 expected,
-                                            complex64 actual,
-                                            bool relaxed_nans) {
-  return NanMismatch(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch(expected.imag(), actual.imag(), relaxed_nans);
-}
+  // The error bounds of the comparison.
+  ErrorSpec error_;
 
-template <>
-void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
-                                           const ::testing::Message& message) {
-  EXPECT_NEAR(expected.real(), actual.real(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-  EXPECT_NEAR(expected.imag(), actual.imag(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-}
+  // Whether to include detailed breakdown of mismatches in the error message.
+  bool detailed_message_;
 
-template <>
-bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
-                                                bfloat16 actual) {
-  return ExpectValuesNear(static_cast<float>(expected),
-                          static_cast<float>(actual));
-}
+  // Number of element element mismatches encountered so far.
+  int64 num_mismatches_ = 0;
 
-template <>
-bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
-  return ExpectValuesNear(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)));
-}
+  // Number of elements with a nan mismatch.
+  int64 num_nan_mismatches_ = 0;
 
-template <>
-void NearComparator::UpdateAndLogMiscompares<bfloat16>(
-    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
-    const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(expected),
-                          static_cast<float>(actual), shape, linear_index);
-}
+  // Number of elements which exceed the absolute/relative error bound.
+  int64 num_abs_mismatches_ = 0;
+  int64 num_rel_mismatches_ = 0;
 
-template <>
-void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
-                                                   const Shape& shape,
-                                                   const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)), shape,
-                          linear_index);
-}
-
-}  // namespace
+  // A Literal containing which elements did not match in the expected and
+  // actual literals. mismatches_ contains PREDs and is of the same sizes as
+  // the comparison literals.
+  Literal mismatches_;
+
+  // The number of mismatches to report in the output, sorted by relative error
+  // magnitude.
+  static constexpr int64 kTopRelativeErrorCount = 5;
+
+  // The set of mismatches with the largest relative error. The size of this set
+  // is bounded by kTopRelativeErrorCount.
+  std::multiset<Mismatch> top_rel_mismatches_;
+
+  // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the
+  // bounds of these buckets. abs_value_buckets_ contains a pair for each
+  // bucket: the element count and failure count.
+  static constexpr std::array<float, 7> kAbsValueBucketBounds = {
+      0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits<float>::infinity()};
+  std::vector<std::pair<int64, int64>> abs_value_buckets_;
+
+  // Buckets for relative and absolute errors. The relative error buckets only
+  // contains those elements which exceed the *absolute* error bound, and vice
+  // versa. This makes it easy to see the effect of adjusting the relative (or
+  // absolute) error bound on the success of the comparison. kErrorBucketBounds
+  // are the lower bounds of the buckets in both vectors. The error buckets are
+  // a cumulative distribution so an error value may appear in more than one
+  // bucket. For example an error value of 0.003 may appear in the buckets
+  // bounded by 0.01, 0.1, and 1.0.
+  static constexpr std::array<float, 5> kErrorBucketBounds = {0.0001, 0.001,
+                                                              0.01, 0.1, 1};
+  std::vector<int64> abs_error_buckets_;
+  std::vector<int64> rel_error_buckets_;
+};
 
-/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
+template <typename NativeT>
+constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
+template <typename NativeT>
+constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;
+
+// Helper function for comparing two literals for nearness. Handles tuple-shapes
+// via recursion. shape_index is the ShapeIndex of expected (or actual)
+// currently being compared.
+::testing::AssertionResult NearHelper(const Literal& expected,
+                                      const Literal& actual,
+                                      const ErrorSpec& error,
+                                      bool detailed_message,
+                                      const ShapeIndex& shape_index) {
   ::testing::AssertionResult err =
-      EqualShapes(expected.shape(), actual.shape());
+      LiteralTestUtil::EqualShapes(expected.shape(), actual.shape());
   if (!err) {
     return err;
   }
 
   if (ShapeUtil::IsTuple(expected.shape())) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-      SCOPED_TRACE(tensorflow::strings::StrCat(
-          "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
       const auto expected_element = LiteralView::Create(expected, {i});
       const auto actual_element = LiteralView::Create(actual, {i});
-
+      ShapeIndex element_index = shape_index;
+      element_index.push_back(i);
       ::testing::AssertionResult res =
-          Near(expected_element, actual_element, error);
-      if (err && !res) {
-        err = res;
+          NearHelper(expected_element, actual_element, error, detailed_message,
+                     element_index);
+      if (!res) {
+        string err_message =
+            Printf("\nArray at shape index %s%s",
+                   element_index.ToString().c_str(), res.message());
+        if (err) {
+          err = ::testing::AssertionFailure() << err_message;
+        } else {
+          err << err_message;
+        }
       }
     }
+    if (!err && shape_index.empty()) {
+      // Emit a top-level error message containing the top-level shape in case
+      // of mismatch.
+      int64 total_elements = RecursiveElementCount(actual.shape());
+      err = ::testing::AssertionFailure()
+            << Printf("\nMismatches in shape %s (%lld elements):\n%s",
+                      ShapeUtil::HumanString(actual.shape()).c_str(),
+                      total_elements, err.message());
+    }
     return err;
   }
 
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
-    NearComparator comparator(error);
-    return comparator.ExpectNear(expected, actual)
-               ? ::testing::AssertionSuccess()
-               : ::testing::AssertionFailure() << "values were not near";
+    switch (expected.shape().element_type()) {
+      case BF16:
+        return NearComparator<bfloat16>::Compare(expected, actual, error,
+                                                 detailed_message);
+        break;
+      case F16:
+        return NearComparator<half>::Compare(expected, actual, error,
+                                             detailed_message);
+        break;
+      case F32:
+        return NearComparator<float>::Compare(expected, actual, error,
+                                              detailed_message);
+        break;
+      case F64:
+        return NearComparator<double>::Compare(expected, actual, error,
+                                               detailed_message);
+        break;
+      case C64:
+        return NearComparator<complex64>::Compare(expected, actual, error,
+                                                  detailed_message);
+        break;
+      default:
+        LOG(FATAL) << "Unsupported primitive type in near comparator: "
+                   << PrimitiveType_Name(expected.shape().element_type())
+                   << ". Must be floating-point type.";
+    }
   }
 
-  return Equal(expected, actual);
+  // Non-floating point literal.
+  return LiteralTestUtil::Equal(expected, actual);
+}
+
+}  // namespace
+
+/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
+    const Literal& expected, const Literal& actual, const ErrorSpec& error,
+    bool detailed_message) {
+  return NearHelper(expected, actual, error, detailed_message,
+                    /*shape_index=*/{});
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
                                               const Literal& actual,
                                               const ErrorSpec& error,
                                               const string& message) {
-  EXPECT_TRUE(Near(expected, actual, error))
-      << (message.empty()
-              ? ""
-              : tensorflow::strings::StrCat("\nmessage: ", message));
+  ::testing::AssertionResult res =
+      Near(expected, actual, error, /*detailed_message=*/false);
+  if (!res) {
+    res << "Expected: " << TruncateHugeLiteral(expected) << "\n";
+    res << "Actual: " << TruncateHugeLiteral(actual) << "\n";
+    if (!message.empty()) {
+      res << StrCat("\nmessage: ", message);
+    }
+  }
+  EXPECT_TRUE(res);
 }
 
 /*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
@@ -754,8 +915,7 @@ void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
 
 /* static */ string LiteralTestUtil::MultiIndexAsString(
     tensorflow::gtl::ArraySlice<int64> multi_index) {
-  return tensorflow::strings::StrCat(
-      "{", tensorflow::str_util::Join(multi_index, ","), "}");
+  return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}");
 }
 
 /* static */ std::unique_ptr<Literal> LiteralTestUtil::Reshape(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 7b757a4bd7..a755568c0f 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -122,16 +122,19 @@ class LiteralTestUtil {
   // bounds are equivalent.
   //
   // Tuples are matched recursively.  When comparing tensors of
-  // non-floating-point type, checks for exact equality, ignoring the ErroSpec.
+  // non-floating-point type, checks for exact equality, ignoring the ErrorSpec.
   //
   // If the shape of the literals is neither a complex/floating-point tensor nor
   // a tuple which contains a complex/floating-point tensor, Near() is
   // equivalent to Equal().  We don't raise an error in this case, because we
   // want to allow callers to call Near() even if they have no preconceptions
   // about the shapes being compared.
+  //
+  // If detailed_message is true, then the error message in the assertion result
+  // will contain a more detailed breakdown of mismatches.
   static ::testing::AssertionResult Near(
-      const Literal& expected, const Literal& actual,
-      const ErrorSpec& error) TF_MUST_USE_RESULT;
+      const Literal& expected, const Literal& actual, const ErrorSpec& error,
+      bool detailed_message = false) TF_MUST_USE_RESULT;
 
   // Expects expected and actual to be Near with the given error.
   static void ExpectNear(const Literal& expected, const Literal& actual,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index 3a421f8458..9d619a77c7 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -89,7 +89,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
       EXPECT_EQ("2", literal->ToString());
     } else if (result.find("actual") != string::npos) {
       EXPECT_EQ("4", literal->ToString());
-    } else if (result.find("miscompares") != string::npos) {
+    } else if (result.find("mismatches") != string::npos) {
       EXPECT_EQ("true", literal->ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
-- 
GitLab


From 35543d5777b87c18b47eb73e83af41240a022e26 Mon Sep 17 00:00:00 2001
From: joel-shor <joelshor@google.com>
Date: Fri, 20 Apr 2018 02:49:58 +0300
Subject: [PATCH 1164/1262] [tf.data] Correct / clarify docstring for `weights`
 as a dataset.

This is a noop.
---
 tensorflow/contrib/data/python/ops/interleave_ops.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 5ae1fa9e9e..812a50ecbf 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -200,10 +200,11 @@ def sample_from_datasets(datasets, weights=None, seed=None):
 
   Args:
     datasets: A list of @{tf.data.Dataset} objects with compatible structure.
-    weights: (Optional.) A list of `len(datasets)` floating-point values or a
-      @{tf.data.Dataset} object, where `weights[i]` represents the probability
-      with which an element should be sampled from `datasets[i]`. Defaults to a
-      uniform distribution across `datasets`.
+    weights: (Optional.) A list of `len(datasets)` floating-point values where
+      `weights[i]` represents the probability with which an element should be
+      sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each
+      element is such a list. Defaults to a uniform distribution across
+      `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
       @{tf.set_random_seed} for behavior.
-- 
GitLab


From e07c9e23a94866966aa7e336a519b55931d570e3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 16:53:14 -0700
Subject: [PATCH 1165/1262] Run EvaluateNodes for ModelPruner test except for
 NoPruning.

PiperOrigin-RevId: 193596812
---
 tensorflow/core/grappler/optimizers/BUILD     |  1 +
 .../grappler/optimizers/model_pruner_test.cc  | 52 +++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 63492e1a7f..a371186fe6 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -365,6 +365,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 2b12eadec9..cf5b990377 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -133,6 +134,13 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
   EXPECT_EQ(1, new_c.input_size());
   EXPECT_EQ(NodeName(b.name()), new_c.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, NoOpPruning) {
@@ -171,6 +179,13 @@ TEST_F(ModelPrunerTest, NoOpPruning) {
       EXPECT_EQ("a", new_node.input(0));
     }
   }
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, PreserveIdentities) {
@@ -201,6 +216,19 @@ TEST_F(ModelPrunerTest, PreserveIdentities) {
 
   TF_EXPECT_OK(status);
   EXPECT_EQ(item.graph.node_size(), output.node_size());
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+  v_ctrl_t.flat<bool>()(0) = true;
+  auto expected_tensors = EvaluateNodes(
+      item.graph, {"merge", "id2"}, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  auto actual_tensors = EvaluateNodes(output, {"merge", "id2"},
+                                      {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, expected_tensors.size());
+  EXPECT_EQ(2, actual_tensors.size());
+  for (int i = 0; i < expected_tensors.size(); i++) {
+    test::ExpectTensorEqual<float>(expected_tensors[i], actual_tensors[i]);
+  }
 }
 
 TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
@@ -241,6 +269,14 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
   EXPECT_EQ("b", new_c.input(0));
   EXPECT_EQ("b", new_d.input(0));
   EXPECT_EQ("b", new_e.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto a_t = GenerateRandomTensor<DT_INT64>(TensorShape({}));
+  auto expected_tensors = EvaluateNodes(item.graph, fetch, {{"a", a_t}});
+  auto actual_tensors = EvaluateNodes(output, fetch, {{"a", a_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<int64>(expected_tensors[0], actual_tensors[0]);
 }
 
 // TODO(rmlarsen): Reenable this test when the issues with
@@ -316,6 +352,12 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   EXPECT_EQ(NodeName(b.name()), new_b.name());
   const NodeDef& new_c = output.node(2);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
+
+  auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+  auto actual_tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
@@ -348,6 +390,16 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
       EXPECT_EQ("c", node.input(0));
     }
   }
+  if (GetNumAvailableGPUs() > 0) {
+    auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+    auto actual_tensors = EvaluateNodes(output, item.fetch);
+    EXPECT_EQ(4, expected_tensors.size());
+    EXPECT_EQ(4, actual_tensors.size());
+    for (int i = 0; i < expected_tensors.size(); i++) {
+      test::ExpectTensorNear<float>(expected_tensors[i], actual_tensors[i],
+                                    1e-6);
+    }
+  }
 }
 
 }  // namespace
-- 
GitLab


From 2d8da1d12a5fbeaa99e1cdd761b735a02020611b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 17:17:05 -0700
Subject: [PATCH 1166/1262] Removed deprecated methods from
 tensorflow::StringPiece.

This will allow tensorflow::StringPiece to be more easily replaced with absl::string_view as absl::string_view does not contain those methods.

PiperOrigin-RevId: 193599651
---
 tensorflow/core/lib/core/stringpiece.cc      |  4 ---
 tensorflow/core/lib/core/stringpiece.h       | 26 --------------------
 tensorflow/core/lib/core/stringpiece_test.cc | 10 --------
 3 files changed, 40 deletions(-)

diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 0b006fa2b4..4c488066e4 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -25,10 +25,6 @@ std::ostream& operator<<(std::ostream& o, StringPiece piece) {
   return o;
 }
 
-bool StringPiece::contains(StringPiece s) const {
-  return std::search(begin(), end(), s.begin(), s.end()) != end();
-}
-
 size_t StringPiece::find(char c, size_t pos) const {
   if (pos >= size_) {
     return npos;
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 835b938cbf..0cf6c24850 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -88,20 +88,6 @@ class StringPiece {
 
   size_t find(char c, size_t pos = 0) const;
   size_t rfind(char c, size_t pos = npos) const;
-  // DEPRECATED: Use tensorflow::str_util::StrContains instead.
-  bool contains(StringPiece s) const;
-
-  // Checks whether StringPiece starts with x and if so advances the beginning
-  // of it to past the match.  It's basically a shortcut for starts_with
-  // followed by remove_prefix.
-  // DEPRECATED: Use tensorflow::str_util::ConsumePrefix instead.
-  bool Consume(StringPiece x) {
-    if (starts_with(x)) {
-      remove_prefix(x.size_);
-      return true;
-    }
-    return false;
-  }
 
   StringPiece substr(size_t pos, size_t n = npos) const;
 
@@ -114,18 +100,6 @@ class StringPiece {
   //   >  0 iff "*this" >  "b"
   int compare(StringPiece b) const;
 
-  // Return true iff "x" is a prefix of "*this"
-  // DEPRECATED: Use tensorflow::str_util::StartsWith instead.
-  bool starts_with(StringPiece x) const {
-    return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
-  }
-  // Return true iff "x" is a suffix of "*this"
-  // DEPRECATED: Use tensorflow::str_util::EndsWith instead.
-  bool ends_with(StringPiece x) const {
-    return ((size_ >= x.size_) &&
-            (memcmp(data_ + (size_ - x.size_), x.data_, x.size_) == 0));
-  }
-
  private:
   const char* data_;
   size_t size_;
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index d0dbeb6072..de35d6eac6 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -55,14 +55,4 @@ TEST(StringPiece, Ctor) {
   }
 }
 
-TEST(StringPiece, Contains) {
-  StringPiece a("abcdefg");
-  StringPiece b("abcd");
-  StringPiece c("efg");
-  StringPiece d("gh");
-  EXPECT_TRUE(a.contains(b));
-  EXPECT_TRUE(a.contains(c));
-  EXPECT_TRUE(!a.contains(d));
-}
-
 }  // namespace tensorflow
-- 
GitLab


From 4e17a3f1496b398afe632b002b0589b7346b2e3f Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 17:18:10 -0700
Subject: [PATCH 1167/1262] [XLA] De-unique_ptr-ify ShapedBuffer and
 ScopedShapedBuffer.

These are already notionally equivalent to T* and unique_ptr<T>, so
having a unique_ptr of a {Scoped,}ShapedBuffer is pretty redundant.

Also clean up the ScopedShapedBuffer API a bit.

PiperOrigin-RevId: 193599773
---
 tensorflow/compiler/jit/xla_launch_util.cc    |  47 ++---
 tensorflow/compiler/jit/xla_launch_util.h     |   2 +-
 tensorflow/compiler/jit/xla_tensor.cc         |   6 +-
 tensorflow/compiler/jit/xla_tensor.h          |   6 +-
 .../compiler/xla/client/local_client.cc       |  23 ++-
 tensorflow/compiler/xla/client/local_client.h |   6 +-
 .../xla/python/local_computation_builder.cc   |  46 ++---
 .../xla/python/local_computation_builder.h    |   6 +-
 .../xla/service/allocation_tracker.cc         |  33 ++--
 .../compiler/xla/service/allocation_tracker.h |  14 +-
 .../xla/service/cpu/cpu_executable.cc         |  14 +-
 .../compiler/xla/service/cpu/cpu_executable.h |   8 +-
 .../service/cpu/parallel_cpu_executable.cc    |   9 +-
 .../xla/service/cpu/parallel_cpu_executable.h |   4 +-
 tensorflow/compiler/xla/service/executable.cc |  16 +-
 tensorflow/compiler/xla/service/executable.h  |   8 +-
 .../xla/service/gpu/gpu_executable.cc         |  10 +-
 .../compiler/xla/service/gpu/gpu_executable.h |   4 +-
 tensorflow/compiler/xla/service/hlo_runner.cc |  45 +++--
 .../xla/service/interpreter/executable.cc     |   9 +-
 .../xla/service/interpreter/executable.h      |   4 +-
 tensorflow/compiler/xla/service/service.cc    |  14 +-
 .../compiler/xla/service/shaped_buffer.cc     |  36 ++--
 .../compiler/xla/service/shaped_buffer.h      |  64 ++++---
 .../compiler/xla/service/transfer_manager.cc  |  21 ++-
 .../compiler/xla/service/transfer_manager.h   |   8 +-
 .../compiler/xla/tests/dynamic_ops_test.cc    |   8 +-
 tensorflow/compiler/xla/tests/fusion_test.cc  |  16 +-
 .../xla/tests/local_client_allocation_test.cc |   7 +-
 .../xla/tests/local_client_execute_test.cc    | 170 ++++++++----------
 .../xla/tests/local_client_test_base.cc       |  12 +-
 .../xla/tests/local_client_test_base.h        |  11 +-
 .../xla/tests/transfer_manager_test.cc        |  46 ++---
 .../xla/tests/xla_hlo_profile_test.cc         |  10 +-
 34 files changed, 373 insertions(+), 370 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 50b0061d69..3520501c1a 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -32,10 +32,13 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+namespace {
 namespace gpu = perftools::gputools;
+using xla::ScopedShapedBuffer;
+using xla::ShapedBuffer;
+}  // anonymous namespace
 
 namespace tensorflow {
-
 std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
                                                         int num_variables) {
   std::map<int, OptionalTensor> snapshot;
@@ -80,17 +83,17 @@ namespace {
 // Return the 'index''th subtree of the given ShapedBuffer as a
 // ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
 // subtree, and sets the input's buffer pointers to nullptr for the subtree.
-std::unique_ptr<xla::ScopedShapedBuffer> ExtractSubShapedBuffer(
-    xla::ShapedBuffer* shaped_buffer, int index,
+ScopedShapedBuffer ExtractSubShapedBuffer(
+    ShapedBuffer* shaped_buffer, int index,
     xla::DeviceMemoryAllocator* allocator) {
   xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape(
       shaped_buffer->on_host_shape(), index);
   xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape(
       shaped_buffer->on_device_shape(), index);
 
-  xla::ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
-                                      shaped_buffer->platform(),
-                                      shaped_buffer->device_ordinal());
+  ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
+                                 shaped_buffer->platform(),
+                                 shaped_buffer->device_ordinal());
 
   auto& shape_tree = shaped_buffer->buffers();
   auto& sub_shape_tree = sub_shaped_buffer.buffers();
@@ -102,8 +105,7 @@ std::unique_ptr<xla::ScopedShapedBuffer> ExtractSubShapedBuffer(
       index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0);
     }
   }
-  return xla::ScopedShapedBuffer::MakeScoped(&sub_shaped_buffer, allocator)
-      .ValueOrDie();
+  return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
 }
 }  // namespace
 
@@ -118,10 +120,10 @@ XlaComputationLaunchContext::XlaComputationLaunchContext(
 void XlaComputationLaunchContext::PopulateInputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
     const std::map<int, OptionalTensor>& variables) {
-  // Build xla::ShapedBuffers that point directly to the Tensor buffers.
+  // Build ShapedBuffers that point directly to the Tensor buffers.
   arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1);
   arg_buffers_.resize(kernel->xla_input_shapes.size());
-  arg_ptrs_ = std::vector<xla::ShapedBuffer*>(arg_buffers_.size());
+  arg_ptrs_ = std::vector<ShapedBuffer*>(arg_buffers_.size());
 
   // Pass remaining parameters.
   const Tensor* t;
@@ -140,8 +142,7 @@ void XlaComputationLaunchContext::PopulateInputs(
     if (xla::ShapeUtil::IsTuple(on_device_shape)) {
       const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
       CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
-      arg_ptrs_[i] =
-          const_cast<xla::ShapedBuffer*>(&xla_tensor->shaped_buffer());
+      arg_ptrs_[i] = const_cast<ShapedBuffer*>(&xla_tensor->shaped_buffer());
     } else {
       CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
           << "On-device shape "
@@ -149,7 +150,7 @@ void XlaComputationLaunchContext::PopulateInputs(
           << " not the same as on-host shape "
           << xla::ShapeUtil::HumanStringWithLayout(shape);
       gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
-      arg_buffers_[i] = xla::MakeUnique<xla::ShapedBuffer>(
+      arg_buffers_[i] = xla::MakeUnique<ShapedBuffer>(
           /*on_host_shape=*/shape, /*on_device_shape=*/shape,
           client_->platform(), client_->default_device_ordinal());
       arg_buffers_[i]->set_buffer(dmem, /*index=*/{});
@@ -160,15 +161,15 @@ void XlaComputationLaunchContext::PopulateInputs(
 
 void XlaComputationLaunchContext::PopulateOutputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
-    std::unique_ptr<xla::ScopedShapedBuffer> output) {
+    ScopedShapedBuffer output) {
   gpu::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
-    VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
+    VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString();
     VLOG(2) << "Result tuple shape (on device): "
-            << output->on_device_shape().DebugString();
+            << output.on_device_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
@@ -226,18 +227,18 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const TensorShape& shape = kernel->outputs[i].shape;
       VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
 
-      gpu::DeviceMemoryBase buffer = output->buffer({output_num});
+      gpu::DeviceMemoryBase buffer = output.buffer({output_num});
       if (allocate_xla_tensors_) {
         Tensor* output_tensor;
         OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
         XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
         CHECK(xla_tensor);
-        xla_tensor->set_shaped_buffer(
-            ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+        xla_tensor->set_shaped_buffer(ScopedShapedBuffer(
+            ExtractSubShapedBuffer(&output, output_num, xla_allocator_)));
       } else {
         Tensor output_tensor = XlaTensorBuffer::MakeTensor(
             ctx->expected_output_dtype(i), shape, buffer, allocator);
-        output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+        output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
         ctx->set_output(i, output_tensor);
       }
       ++output_num;
@@ -257,7 +258,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
 
-    gpu::DeviceMemoryBase buffer = output->buffer({output_num});
+    gpu::DeviceMemoryBase buffer = output.buffer({output_num});
 
     Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
@@ -282,12 +283,12 @@ void XlaComputationLaunchContext::PopulateOutputs(
       XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
       CHECK(xla_tensor);
       xla_tensor->set_shaped_buffer(
-          ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+          ExtractSubShapedBuffer(&output, output_num, xla_allocator_));
       *variable->tensor() = output_tensor;
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+      output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
       *variable->tensor() = output_tensor;
     }
     ++output_num;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 14f70fe358..26dcaa8a51 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -87,7 +87,7 @@ class XlaComputationLaunchContext {
   // Given the XLA output in `output`, populate all outputs of `ctx`.
   void PopulateOutputs(OpKernelContext* ctx,
                        const XlaCompiler::CompilationResult* kernel,
-                       std::unique_ptr<xla::ScopedShapedBuffer> output);
+                       xla::ScopedShapedBuffer output);
 
   // Return the argument list. Only valid after PopulateInputs() has been
   // called.
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 956328e675..84b2835c40 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -65,10 +65,8 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
                             device_ordinal, size, /*retry_on_failure=*/false));
   }
 
-  TF_ASSIGN_OR_RETURN(auto scoped_buffer,
-                      xla::ScopedShapedBuffer::MakeScoped(
-                          &buffer, client->backend().memory_allocator()));
-  set_shaped_buffer(std::move(scoped_buffer));
+  set_shaped_buffer(xla::ScopedShapedBuffer(
+      std::move(buffer), client->backend().memory_allocator()));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 5ff2fb08f0..2334fd272b 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -64,9 +64,9 @@ class XlaTensor {
     return *shaped_buffer_;
   }
   // Mutates the TensorInfo to set the ShapedBuffer.
-  void set_shaped_buffer(
-      std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer) {
-    shaped_buffer_ = std::move(shaped_buffer);
+  void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) {
+    shaped_buffer_ =
+        xla::MakeUnique<xla::ScopedShapedBuffer>(std::move(shaped_buffer));
   }
 
   // Some tensors on the device may have known values on the host. We use these
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index d951c44cb9..d0e945b70f 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -134,7 +134,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
+StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     ExecutableRunOptions run_options) {
   TF_RETURN_IF_ERROR(
@@ -167,27 +167,26 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
     return ExecuteAndDump(&service_options, arguments);
   }
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable_->ExecuteOnStreamWrapper(
           &service_options, run_options.execution_profile(), arguments));
 
-  return MakeUnique<ScopedShapedBuffer>(std::move(*result),
-                                        run_options.allocator());
+  return ScopedShapedBuffer(std::move(result), run_options.allocator());
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::ExecuteAndDump(
+StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
     const ServiceExecutableRunOptions* run_options,
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   executable_->session_module()->set_execution_platform(
       backend_->platform()->Name());
   TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module()));
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
-  TF_RETURN_IF_ERROR(RecordResult(result.get(), executable_->session_module()));
+  TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module()));
   TF_RETURN_IF_ERROR(executable_->DumpSessionModule());
-  return ScopedShapedBuffer::MakeScoped(result.get(), run_options->allocator());
+  return ScopedShapedBuffer(std::move(result), run_options->allocator());
 }
 
 tensorflow::Status LocalExecutable::RecordArguments(
@@ -281,9 +280,9 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
                                         updated_options));
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
-                                   DeviceMemoryAllocator* allocator) {
+StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
+    const Literal& literal, int device_ordinal,
+    DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
@@ -293,7 +292,7 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
   TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-      executor, literal, *scoped_buffer));
+      executor, literal, scoped_buffer));
   return std::move(scoped_buffer);
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 42812b936f..f306c520ed 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -38,7 +38,7 @@ class LocalExecutable {
  public:
   // Run the compiled computation with the given arguments and options and
   // return the result.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> Run(
+  StatusOr<ScopedShapedBuffer> Run(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       ExecutableRunOptions run_options);
 
@@ -73,7 +73,7 @@ class LocalExecutable {
 
   // Records the computation in a SessionModule proto with the arguments used to
   // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteAndDump(
+  StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
@@ -136,7 +136,7 @@ class LocalClient : public Client {
   // ScopedShapedBuffer. If non-null the given memory allocator is used for
   // device memory allocation. If null, the default memory allocator for the
   // device is used.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> LiteralToShapedBuffer(
+  StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const Literal& literal, int device_ordinal,
       DeviceMemoryAllocator* allocator = nullptr);
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 2bacc6a914..24e17abbe0 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -89,17 +89,16 @@ StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocalReplica(
   return client->TransferFromOutfeedLocal(shape, device_ordinal);
 }
 
-LocalShapedBuffer::LocalShapedBuffer(
-    std::unique_ptr<ScopedShapedBuffer> shaped_buffer)
+LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
     : shaped_buffer_(std::move(shaped_buffer)) {}
 
-const std::unique_ptr<ScopedShapedBuffer>& LocalShapedBuffer::shaped_buffer()
-    const {
-  return shaped_buffer_;
+const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
+  return &shaped_buffer_;
 }
 
-static StatusOr<std::unique_ptr<ScopedShapedBuffer>> ToBuffer(
-    LocalClient* client, int device_ordinal, const Literal& arg) {
+static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
+                                             int device_ordinal,
+                                             const Literal& arg) {
   return client->LiteralToShapedBuffer(arg, device_ordinal,
                                        client->backend().memory_allocator());
 }
@@ -109,14 +108,15 @@ LocalShapedBuffer* LocalShapedBuffer::FromLiteral(
     const Literal& argument,
     const tensorflow::gtl::optional<Shape>& shape_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
-  std::unique_ptr<ScopedShapedBuffer> buf;
-  if (shape_with_layout) {
-    std::unique_ptr<Literal> relaid =
-        argument.Relayout(shape_with_layout.value());
-    buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie();
-  } else {
-    buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
-  }
+  ScopedShapedBuffer buf = [&] {
+    if (shape_with_layout) {
+      std::unique_ptr<Literal> relaid =
+          argument.Relayout(shape_with_layout.value());
+      return ToBuffer(client, /*device_ordinal=*/0, *relaid)
+          .ConsumeValueOrDie();
+    }
+    return ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
+  }();
   return new LocalShapedBuffer(std::move(buf));
 }
 
@@ -158,14 +158,14 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
                 << device_ordinal;
 
         // Transfer arguments in
-        std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
+        std::vector<ScopedShapedBuffer> scoped_buffers;
         scoped_buffers.reserve(arguments.size());
         for (int i = 0; i < arguments.size(); ++i) {
           const Literal& argument = arguments[i];
           const tensorflow::gtl::optional<Shape>& shape_with_layout =
               shapes_with_layout[i];
 
-          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed;
+          StatusOr<ScopedShapedBuffer> pushed;
           if (shape_with_layout) {
             std::unique_ptr<Literal> relaid =
                 argument.Relayout(shape_with_layout.value());
@@ -185,7 +185,7 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         std::vector<const ShapedBuffer*> argument_buffers;
         argument_buffers.reserve(scoped_buffers.size());
         for (auto& buffer : scoped_buffers) {
-          argument_buffers.push_back(buffer.get());
+          argument_buffers.push_back(&buffer);
         }
 
         DeviceAssignment device_assignment =
@@ -202,7 +202,7 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         options.set_intra_op_thread_pool(
             client->backend().eigen_intra_op_thread_pool_device());
         options.set_device_assignment(&device_assignment);
-        StatusOr<std::unique_ptr<ScopedShapedBuffer>> result_buffer_status =
+        StatusOr<ScopedShapedBuffer> result_buffer_status =
             executable_->Run(argument_buffers, options);
         if (!result_buffer_status.ok()) {
           results[replica] = result_buffer_status.status();
@@ -210,8 +210,8 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         }
 
         // Transfer result out
-        results[replica] =
-            client->ShapedBufferToLiteral(*result_buffer_status.ValueOrDie());
+        results[replica] = client->ShapedBufferToLiteral(
+            std::move(result_buffer_status).ValueOrDie());
       });
     }
   }
@@ -236,7 +236,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
   std::vector<const ShapedBuffer*> argument_buffers;
   argument_buffers.reserve(argument_handles.size());
   for (auto& handle : argument_handles) {
-    argument_buffers.push_back(handle->shaped_buffer().get());
+    argument_buffers.push_back(handle->shaped_buffer());
   }
 
   // Execute
@@ -245,7 +245,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
   options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool());
   options.set_intra_op_thread_pool(
       client->backend().eigen_intra_op_thread_pool_device());
-  std::unique_ptr<ScopedShapedBuffer> result_buffer =
+  ScopedShapedBuffer result_buffer =
       executable_->Run(argument_buffers, options).ConsumeValueOrDie();
 
   return new LocalShapedBuffer(std::move(result_buffer));
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 31046e60f1..e1048909ab 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -62,12 +62,12 @@ class LocalShapedBuffer {
   static LocalShapedBuffer* FromLiteral(
       const Literal& argument,
       const tensorflow::gtl::optional<Shape>& shape_with_layout);
-  LocalShapedBuffer(std::unique_ptr<ScopedShapedBuffer> shaped_buffer);
-  const std::unique_ptr<ScopedShapedBuffer>& shaped_buffer() const;
+  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
+  const ScopedShapedBuffer* shaped_buffer() const;
   std::unique_ptr<Literal> ToLiteral() const;
 
  private:
-  std::unique_ptr<ScopedShapedBuffer> shaped_buffer_;
+  ScopedShapedBuffer shaped_buffer_;
 };
 
 // Wraps a LocalExecutable produced by compiling a
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 359582a78c..6bf65825cd 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -31,52 +31,51 @@ limitations under the License.
 namespace xla {
 
 StatusOr<GlobalDataHandle> AllocationTracker::Register(
-    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
+    ShapedBuffer shaped_buffer, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+  std::vector<ShapedBuffer> replicated_buffers;
   replicated_buffers.emplace_back(std::move(shaped_buffer));
   return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterReplicatedBuffers(
-    std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-    const string& tag) {
+    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "RegisterReplicatedBuffers";
   return RegisterInternal(std::move(replicated_buffers), tag);
 }
 
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
-    std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-    const string& tag) {
+    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
   VLOG(2) << "RegisterInternal("
           << "tag: \"" << tag << "\" with " << replicated_buffers.size()
           << " shaped_buffers.";
   for (const auto& shaped_buffer : replicated_buffers) {
-    VLOG(2) << "shaped_buffer:" << *shaped_buffer;
-    if (shaped_buffer->platform() != backend_->platform()) {
+    VLOG(2) << "shaped_buffer:" << shaped_buffer;
+    if (shaped_buffer.platform() != backend_->platform()) {
       return InvalidArgument(
           "AllocationTracker for platform %s cannot register buffer from "
           "platform %s",
           backend_->platform()->Name().c_str(),
-          shaped_buffer->platform()->Name().c_str());
+          shaped_buffer.platform()->Name().c_str());
     }
   }
 
   int64 handle = next_handle_++;
   for (auto& shaped_buffer : replicated_buffers) {
     std::vector<ShapeIndex> shape_indices;
-    ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+    ShapeUtil::ForEachSubshape(shaped_buffer.on_device_shape(),
                                [this, &shape_indices](const Shape& /*subshape*/,
                                                       const ShapeIndex& index) {
                                  shape_indices.push_back(index);
                                });
     for (const ShapeIndex& index : shape_indices) {
-      AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index),
-                                       shaped_buffer->device_ordinal());
+      AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index),
+                                       shaped_buffer.device_ordinal());
     }
-    handle_to_shaped_buffers_[handle].emplace_back(std::move(shaped_buffer));
+    handle_to_shaped_buffers_[handle].emplace_back(
+        MakeUnique<ShapedBuffer>(std::move(shaped_buffer)));
   }
 
   GlobalDataHandle result;
@@ -146,13 +145,13 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
   for (int i = 0;
        i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape());
        ++i) {
-    auto element_buffer = MakeUnique<ShapedBuffer>(
+    auto element_buffer = ShapedBuffer(
         ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i),
         ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i),
         shaped_buffer->platform(), shaped_buffer->device_ordinal());
-    element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}),
-                               /*index=*/{});
-    std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+    element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}),
+                              /*index=*/{});
+    std::vector<ShapedBuffer> replicated_buffers;
     replicated_buffers.emplace_back(std::move(element_buffer));
     TF_ASSIGN_OR_RETURN(
         GlobalDataHandle element_handle,
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 60e93358ef..2bfcd53712 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -45,14 +45,13 @@ class AllocationTracker {
   // Registers a shaped buffer of device memory, and returns a corresponding
   // handle that can be used for talking to XLA clients. The given shaped buffer
   // will be treated as the buffer corresponding to the only replica.
-  StatusOr<GlobalDataHandle> Register(
-      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag);
+  StatusOr<GlobalDataHandle> Register(ShapedBuffer shaped_buffer,
+                                      const string& tag);
 
   // Registers a vector of shaped buffers of device memory, one per replica, and
   // returns a corresponding handle that can be used for talking to XLA clients.
   StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
-      std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-      const string& tag);
+      std::vector<ShapedBuffer> replicated_buffers, const string& tag);
 
   // Unregister the allocation for the given data handle.
   Status Unregister(const GlobalDataHandle& data);
@@ -95,8 +94,8 @@ class AllocationTracker {
   // Internal helper which registers a vector of shaped buffers, one per
   // replica.
   StatusOr<GlobalDataHandle> RegisterInternal(
-      std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers,
-      const string& tag) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+      std::vector<ShapedBuffer> replicated_buffers, const string& tag)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Resets the shaped buffers corresponding to the given handle.
   Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
@@ -132,6 +131,9 @@ class AllocationTracker {
 
   // A map from data handle to a vector of shaped buffers that represent the
   // buffers for different replicas.
+  //
+  // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our
+  // public API returns pointers to them.
   tensorflow::gtl::FlatMap<int64, std::vector<std::unique_ptr<ShapedBuffer>>>
       handle_to_shaped_buffers_ GUARDED_BY(mutex_);
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index aee62a4935..97e550abe4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -243,18 +243,18 @@ static Status DeallocateTempBuffers(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
+StatusOr<ShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
     std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
-  auto result_buffer = MakeUnique<ShapedBuffer>(
+  ShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
       stream->parent()->platform(), stream->parent()->device_ordinal());
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer which is returned to the caller.
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
         // The points to set is unambiguous so the set should be a
@@ -281,7 +281,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> CpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -300,7 +300,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   // Free all buffers not in the result.
@@ -310,7 +310,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   if (hlo_profiling_enabled()) {
@@ -330,7 +330,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
 
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   LogLiveAddresses(buffers, buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index c3c2820c26..06b6943cb5 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -55,12 +55,12 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
@@ -102,13 +102,13 @@ class CpuExecutable : public Executable {
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);
 
-  // Create a ShapedBuffer for holding the result of the computation. The
+  // Creates a ShapedBuffer for holding the result of the computation. The
   // addresses (DeviceMemoryBases) are set according to buffer assignment.
   // 'buffers_in_result' should point to a vector of the same size as
   // 'allocated_buffers'. An element in buffers_in_result is set to true if the
   // corresponding buffer is live out of the computation (and thus contained in
   // the returned ShapedBuffer).
-  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
+  StatusOr<ShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 2d0f1d0be5..a2bd4fa195 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -447,7 +447,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -459,7 +459,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  auto result_buffer = MakeUnique<ShapedBuffer>(
+  ShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
       stream->parent()->platform(), stream->parent()->device_ordinal());
 
@@ -472,7 +472,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   // Copy DeviceMemoryBase values which into the respective location in
   // ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
 
@@ -511,8 +511,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>>
-ParallelCpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index d87ba57a1e..5ce84fa996 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -59,12 +59,12 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index caa46686be..b097ef79cc 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -29,18 +29,19 @@ using tensorflow::gtl::ArraySlice;
 
 namespace xla {
 
-StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>>
-Executable::ExecuteOnStreams(
+StatusOr<std::vector<ShapedBuffer>> Executable::ExecuteOnStreams(
     ArraySlice<const ServiceExecutableRunOptions> run_options,
     ArraySlice<ArraySlice<const ShapedBuffer*>> arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());
 
-  std::vector<std::unique_ptr<ShapedBuffer>> return_values(run_options.size());
+  std::vector<ShapedBuffer> return_values;
+  return_values.reserve(run_options.size());
 
   if (run_options.size() == 1) {
-    TF_ASSIGN_OR_RETURN(return_values[0],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteOnStream(&run_options[0], arguments[0],
                                         /*hlo_execution_profile=*/nullptr));
+    return_values.push_back(std::move(rv));
     return std::move(return_values);
   }
 
@@ -48,8 +49,9 @@ Executable::ExecuteOnStreams(
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
     // executions may never complete if not all executions are running.
-    TF_ASSIGN_OR_RETURN(return_values[i],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteAsyncOnStream(&run_options[i], arguments[i]));
+    return_values.push_back(std::move(rv));
   }
   for (const auto& options : run_options) {
     TF_RET_CHECK(options.stream() != nullptr);
@@ -58,7 +60,7 @@ Executable::ExecuteOnStreams(
   return std::move(return_values);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
+StatusOr<ShapedBuffer> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
   se::Stream* stream = run_options->stream();
@@ -78,7 +80,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
                                             &hlo_profile_index_map())
           : nullptr;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
+  StatusOr<ShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
   TF_RETURN_IF_ERROR(return_value.status());
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 6f4cd99767..9c725f21d8 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -62,14 +62,14 @@ class Executable {
   // enabled.
   //
   // Returns a shaped buffer containing the result of the computation.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  virtual StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) = 0;
 
   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  virtual StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;
 
@@ -77,7 +77,7 @@ class Executable {
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>> ExecuteOnStreams(
+  virtual StatusOr<std::vector<ShapedBuffer>> ExecuteOnStreams(
       tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
           run_options,
       tensorflow::gtl::ArraySlice<
@@ -97,7 +97,7 @@ class Executable {
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStreamWrapper(
+  StatusOr<ShapedBuffer> ExecuteOnStreamWrapper(
       const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5676d4de8e..62ce15bc59 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -250,7 +250,7 @@ Status GpuExecutable::ExecuteThunks(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> GpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -297,13 +297,13 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(
-      root->shape(), root->shape(), executor->platform(), device_ordinal);
+  auto shaped_buffer = ShapedBuffer(root->shape(), root->shape(),
+                                    executor->platform(), device_ordinal);
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
   std::set<se::DeviceMemoryBase> buffers_in_result;
-  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus(
       [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
           const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
@@ -335,7 +335,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   return std::move(shaped_buffer);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index dcb3991f41..361bc30b2f 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -74,12 +74,12 @@ class GpuExecutable : public Executable {
 
   // ExecuteOnStream will fail if the compute capability of the stream doesn't
   // match the compute capability passed to this object's constructor.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 171477299e..df5ffd0b7d 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -107,33 +107,35 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
   const ExecutableRunOptions& run_options = service_run_options.run_options();
 
   // Copy arguments to device.
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
-  std::vector<ShapedBuffer*> argument_buffer_ptrs;
+  std::vector<ScopedShapedBuffer> argument_buffers;
   for (Literal* argument : arguments) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+        ScopedShapedBuffer argument_buffer,
         backend().transfer_manager()->AllocateScopedShapedBuffer(
             argument->shape(), run_options.allocator(),
             run_options.device_ordinal()));
     TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-        stream.parent(), *argument, *argument_buffer));
+        stream.parent(), *argument, argument_buffer));
     argument_buffers.push_back(std::move(argument_buffer));
-    argument_buffer_ptrs.push_back(argument_buffers.back().get());
+  }
+
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
+  argument_buffer_ptrs.reserve(argument_buffers.size());
+  for (const auto& buf : argument_buffers) {
+    argument_buffer_ptrs.push_back(&buf);
   }
 
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable->ExecuteOnStreamWrapper(
           &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs));
 
   // Create a ScopedShapedBuffer of the result to manage deallocation. This will
   // deallocate all the device memory when it goes out of scope.
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ScopedShapedBuffer> scoped_result,
-      ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()));
+  ScopedShapedBuffer scoped_result(std::move(result), run_options.allocator());
 
   auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice(
-      stream.parent(), *scoped_result);
+      stream.parent(), scoped_result);
   if (result_literal.ok()) {
     VLOG(4) << "Executed binary and got result: "
             << result_literal.ValueOrDie()->ToString();
@@ -155,7 +157,13 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
       backend().computation_placer()->AssignDevices(options.num_replicas, 1));
   std::vector<std::unique_ptr<se::Stream>> streams;
   std::vector<ServiceExecutableRunOptions> service_run_options;
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+
+  std::vector<ScopedShapedBuffer> argument_buffers;
+  // This reserve() call is necessary for correctness, because
+  // argument_buffer_ptrs contains pointers into the elements of
+  // argument_buffers.
+  argument_buffers.reserve(options.num_replicas * options.arguments.size());
+
   // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are
   // no arguments.
   std::vector<const ShapedBuffer*> argument_buffer_ptrs(
@@ -175,13 +183,13 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     // Copy arguments to device.
     for (const Literal* argument : options.arguments) {
       TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+          ScopedShapedBuffer argument_buffer,
           backend().transfer_manager()->AllocateScopedShapedBuffer(
               argument->shape(), backend().memory_allocator(), device));
       TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-          executor, *argument, *argument_buffer));
+          executor, *argument, argument_buffer));
       argument_buffers.push_back(std::move(argument_buffer));
-      argument_buffer_ptrs[index++] = argument_buffers.back().get();
+      argument_buffer_ptrs[index++] = &argument_buffers.back();
     }
     argument_buffer_slices.emplace_back(
         &argument_buffer_ptrs[index - options.arguments.size()],
@@ -240,19 +248,18 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
   }
 
   LOG(INFO) << "Replicated execution started";
-  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<ShapedBuffer>> results,
+  TF_ASSIGN_OR_RETURN(std::vector<ShapedBuffer> results,
                       executable->ExecuteOnStreams(service_run_options,
                                                    argument_buffer_slices));
   LOG(INFO) << "Replicated execution terminated";
 
   std::vector<std::unique_ptr<Literal>> exec_results;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<ScopedShapedBuffer> result,
-                        ScopedShapedBuffer::MakeScoped(
-                            results[i].get(), backend().memory_allocator()));
+    ScopedShapedBuffer result(std::move(results[i]),
+                              backend().memory_allocator());
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
-                            streams[i]->parent(), *result));
+                            streams[i]->parent(), result));
     exec_results.push_back(std::move(literal));
   }
   return std::move(exec_results);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index acfa79ea75..6553000336 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable(
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -88,12 +88,12 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
       evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
 
   // Transform the result literal back into a ShapedBuffer.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+  TF_ASSIGN_OR_RETURN(ShapedBuffer result,
                       transfer_manager->AllocateShapedBuffer(
                           result_literal->shape(), run_options->allocator(),
                           executor->device_ordinal()));
   TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
-      executor, *result_literal, *result));
+      executor, *result_literal, result));
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -106,8 +106,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
   return std::move(result);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>>
-InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 410110a1ad..c825a9a368 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
   ~InterpreterExecutable() override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 2df59c3556..39f3aefdf8 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -550,7 +550,7 @@ Service::ExecuteParallelAndRegisterResult(
     // Stream executors for the replicas of the current computation.
     TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
     CHECK_EQ(replicas.size(), arguments[i].size());
-    std::vector<std::unique_ptr<ShapedBuffer>> result_buffers;
+    std::vector<ShapedBuffer> result_buffers;
     for (int64 replica = 0; replica < replicas.size(); ++replica) {
       TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                           backend->BorrowStream(replicas[replica]));
@@ -582,7 +582,7 @@ Service::ExecuteParallelAndRegisterResult(
                                               backend->StreamBorrower());
 
       // Asynchronously launch the computation.
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+      TF_ASSIGN_OR_RETURN(ShapedBuffer result,
                           executables[i]->ExecuteAsyncOnStream(
                               &run_options, arguments[i][replica]));
 
@@ -1234,7 +1234,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     streams.push_back(std::move(stream));
   }
 
-  std::vector<std::unique_ptr<ShapedBuffer>> result_buffers;
+  std::vector<ShapedBuffer> result_buffers;
   for (size_t i = 0; i < streams.size(); ++i) {
     const auto& stream = streams[i];
     ExecutableRunOptions options;
@@ -1247,7 +1247,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     ServiceExecutableRunOptions service_options(
         options, execute_backend_->StreamBorrower());
 
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> this_result_buffer,
+    TF_ASSIGN_OR_RETURN(ShapedBuffer this_result_buffer,
                         executable->ExecuteAsyncOnStream(
                             &service_options, replicated_arguments[i]));
 
@@ -1347,16 +1347,16 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
   }
 
   // Allocate memory in each replica and transfer the data to all replicas.
-  std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+  std::vector<ShapedBuffer> replicated_buffers;
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ShapedBuffer> shaped_buffer,
+        ShapedBuffer shaped_buffer,
         execute_backend_->transfer_manager()->AllocateShapedBuffer(
             shape, execute_backend_->memory_allocator(),
             executor->device_ordinal()));
     TF_RETURN_IF_ERROR(
         execute_backend_->transfer_manager()->TransferLiteralToDevice(
-            executor, *literal, *shaped_buffer));
+            executor, *literal, shaped_buffer));
     replicated_buffers.emplace_back(std::move(shaped_buffer));
   }
   TF_ASSIGN_OR_RETURN(*result->mutable_data(),
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 10a2aa2b30..0b5a383f6f 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -66,6 +66,8 @@ ShapedBuffer& ShapedBuffer::operator=(ShapedBuffer&& s) {
   return *this;
 }
 
+ShapedBuffer::~ShapedBuffer() {}
+
 void ShapedBuffer::clear() {
   for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
@@ -102,18 +104,6 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
   return out;
 }
 
-/* static */
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> ScopedShapedBuffer::MakeScoped(
-    ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) {
-  auto scoped_buffer = WrapUnique(new ScopedShapedBuffer(
-      shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(),
-      allocator, shaped_buffer->device_ordinal()));
-  scoped_buffer->buffers_ = shaped_buffer->buffers();
-  shaped_buffer->clear();
-
-  return std::move(scoped_buffer);
-}
-
 ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
                                        const Shape& on_device_shape,
                                        DeviceMemoryAllocator* allocator,
@@ -126,7 +116,25 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
                                        DeviceMemoryAllocator* allocator)
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}
 
+ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
+    : ShapedBuffer(std::move(s)), allocator_(s.allocator_) {
+  // Null out s.allocator_ so it doesn't try to free anything in its destructor.
+  s.allocator_ = nullptr;
+}
+
+ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) {
+  *static_cast<ShapedBuffer*>(this) = std::move(static_cast<ShapedBuffer&>(s));
+  allocator_ = s.allocator_;
+  // Null out s.allocator_ so it doesn't try to free anything in its destructor.
+  s.allocator_ = nullptr;
+  return *this;
+}
+
 ScopedShapedBuffer::~ScopedShapedBuffer() {
+  // allocator_ will be null if we were moved-from.
+  if (allocator_ == nullptr) {
+    return;
+  }
   // Deallocate all non-null buffers. A buffer may appear in more than one spot
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
@@ -142,8 +150,8 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   }
 }
 
-std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
+ShapedBuffer ScopedShapedBuffer::release() {
+  ShapedBuffer shaped_buffer(std::move(*this));
   buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index 62ba8f2734..f1b0527474 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -43,6 +43,14 @@ class ShapedBuffer {
   ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
                const se::Platform* platform, int device_ordinal);
 
+  // Movable, but not copyable.
+  ShapedBuffer(ShapedBuffer&& s);
+  ShapedBuffer& operator=(ShapedBuffer&&);
+  ShapedBuffer(const ShapedBuffer&) = delete;
+  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
+
+  virtual ~ShapedBuffer();
+
   // Returns the shape of the on-host representation of the data held by this
   // ShapedBuffer.
   const Shape& on_host_shape() const { return on_host_shape_; }
@@ -80,13 +88,7 @@ class ShapedBuffer {
 
   string ToString() const;
 
-  ShapedBuffer(ShapedBuffer&& s);
-  ShapedBuffer& operator=(ShapedBuffer&&);
-
  protected:
-  ShapedBuffer(const ShapedBuffer&) = delete;
-  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
-
   // The shape of the data when represented on the host.
   Shape on_host_shape_;
 
@@ -108,41 +110,45 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
 // ShapedBuffer derived class which allocates all internal buffers on
 // construction and deallocates the memory when the object is
 // destructed.
+//
+// TODO(timshen): Remove inheritance between ScopedShapedBuffer and
+// ShapedBuffer.  There should never be a need to consider a ScopedShapedBuffer
+// as a ShapedBuffer, because in that case we should just be able to pass around
+// our ShapeTree<DeviceMemoryBase>.  Inheritance only adds complexity.  See
+// discussion in cl/192849370.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
-  // deallocation of the device memory held in the shaped buffer. All device
-  // memory pointers in the given ShapedBuffer are set to null.
-  static StatusOr<std::unique_ptr<ScopedShapedBuffer>> MakeScoped(
-      ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator);
-
-  // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index.
-  ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-                     DeviceMemoryAllocator* allocator, int device_ordinal);
+  // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
+  explicit ScopedShapedBuffer(const Shape& on_host_shape,
+                              const Shape& on_device_shape,
+                              DeviceMemoryAllocator* allocator,
+                              int device_ordinal);
 
   // Create a ScopedShapedBuffer by taking over the memory from the incoming
   // ShapedBuffer.
-  ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                     DeviceMemoryAllocator* allocator);
+  explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
+                              DeviceMemoryAllocator* allocator);
+
+  // Movable, but not copyable.
+  ScopedShapedBuffer(ScopedShapedBuffer&& s);
+  ScopedShapedBuffer& operator=(ScopedShapedBuffer&&);
+  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
+  ScopedShapedBuffer& operator=(const ScopedShapedBuffer&) = delete;
+
+  // All buffers in the shape are deallocated on destruction.
+  ~ScopedShapedBuffer() override;
 
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
 
-  // Release all device memory owned by this ScopedShapedBuffer and
-  // return the device memory pointers in the form of a
-  // ShapedBuffer. The returned ShapedBuffer takes over the memory
-  // from the ScopedShapedBuffer. The resulting ScopedShapedBuffer can
-  // only be destroyed.
-  std::unique_ptr<ShapedBuffer> release();
-
-  // All buffers in the shape are deallocated on destruction.
-  virtual ~ScopedShapedBuffer();
+  // Releases all device memory owned by this ScopedShapedBuffer and returns the
+  // device memory pointers in the form of a ShapedBuffer. The returned
+  // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The
+  // resulting ScopedShapedBuffer can only be destroyed.
+  ShapedBuffer release();
 
  protected:
-  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
-  void operator=(const ScopedShapedBuffer&) = delete;
-
   DeviceMemoryAllocator* allocator_;
 };
 
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index be8231b73c..98d0111d04 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -175,7 +175,7 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
+StatusOr<ShapedBuffer> TransferManager::AllocateShapedBuffer(
     const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
@@ -187,31 +187,30 @@ StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
   const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
   TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));
 
-  auto shaped_buffer = WrapUnique(new ShapedBuffer(
-      on_host_shape, on_device_shape, allocator->platform(), device_ordinal));
+  ShapedBuffer shaped_buffer(on_host_shape, on_device_shape,
+                             allocator->platform(), device_ordinal);
 
   // Allocate an appropriate sized buffer for each element in the shape
   // including the tuple pointer arrays.
-  for (auto& pair : shaped_buffer->buffers()) {
+  for (auto& pair : shaped_buffer.buffers()) {
     const ShapeIndex& index = pair.first;
     se::DeviceMemoryBase& memory_base = pair.second;
     const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
     TF_ASSIGN_OR_RETURN(memory_base,
-                        allocator->Allocate(shaped_buffer->device_ordinal(),
+                        allocator->Allocate(shaped_buffer.device_ordinal(),
                                             GetByteSizeRequirement(subshape)));
   }
 
   return std::move(shaped_buffer);
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape,
-                                            DeviceMemoryAllocator* allocator,
-                                            int device_ordinal) {
+StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
+    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    int device_ordinal) {
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> unscoped_buffer,
+      ShapedBuffer unscoped_buffer,
       AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
-  return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator);
+  return ScopedShapedBuffer(std::move(unscoped_buffer), allocator);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 410d2af7af..a6451c4bb1 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -107,10 +107,10 @@ class TransferManager {
   // Allocate a ShapedBuffer which can hold data with the given on-host
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
-  StatusOr<std::unique_ptr<ShapedBuffer>> AllocateShapedBuffer(
-      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
-      int device_ordinal);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> AllocateScopedShapedBuffer(
+  StatusOr<ShapedBuffer> AllocateShapedBuffer(const Shape& on_host_shape,
+                                              DeviceMemoryAllocator* allocator,
+                                              int device_ordinal);
+  StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
       const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
       int device_ordinal);
 
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 464b8cbebb..021fbcedb9 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -735,11 +735,11 @@ void BM_DynamicSlice(int num_iters) {
 
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *start_indices_literal, *buffer));
+      executors[device_ordinal], *start_indices_literal, buffer));
 
   std::unique_ptr<LocalExecutable> executable =
       client
-          ->Compile(computation, {&buffer->on_host_shape()},
+          ->Compile(computation, {&buffer.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
@@ -748,14 +748,14 @@ void BM_DynamicSlice(int num_iters) {
   options.set_allocator(&allocator);
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 
   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index ed16963b40..c7f64d8560 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -794,19 +794,19 @@ void BM_ParallelFusion(int num_iters) {
   // Transfer literals to device.
   auto param0_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
-  std::unique_ptr<ShapedBuffer> buffer0 =
+  ShapedBuffer buffer0 =
       client->LiteralToShapedBuffer(*param0_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param1_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
-  std::unique_ptr<ShapedBuffer> buffer1 =
+  ShapedBuffer buffer1 =
       client->LiteralToShapedBuffer(*param1_literal, device_ordinal)
           .ConsumeValueOrDie();
 
   auto param2_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
-  std::unique_ptr<ShapedBuffer> buffer2 =
+  ShapedBuffer buffer2 =
       client->LiteralToShapedBuffer(*param2_literal, device_ordinal)
           .ConsumeValueOrDie();
 
@@ -814,8 +814,8 @@ void BM_ParallelFusion(int num_iters) {
   std::unique_ptr<LocalExecutable> executable =
       client
           ->Compile(computation,
-                    {&buffer0->on_host_shape(), &buffer1->on_host_shape(),
-                     &buffer2->on_host_shape()},
+                    {&buffer0.on_host_shape(), &buffer1.on_host_shape(),
+                     &buffer2.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
@@ -836,8 +836,7 @@ void BM_ParallelFusion(int num_iters) {
   // Run some warm-up executions.
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 
@@ -850,8 +849,7 @@ void BM_ParallelFusion(int num_iters) {
   tensorflow::testing::UseRealTime();
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 3d30ceeaf1..7209f91639 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,7 +54,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
   // deallocation happen on the right allocator.
   ExecutableRunOptions options;
   options.set_allocator(allocator);
-  std::unique_ptr<ScopedShapedBuffer> result =
+  tensorflow::gtl::optional<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {},
                           DefaultExecutableBuildOptions(), options);
 
@@ -66,7 +67,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
 
   // Deallocate result and verify that deallocate was called once.
   int64 deallocation_count_before = allocator_->deallocation_count();
-  result = nullptr;
+  result.reset();
   EXPECT_EQ(deallocation_count_before + 1, allocator_->deallocation_count());
 }
 
@@ -92,7 +93,7 @@ XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) {
         computation, {}, ExecutableBuildOptions().set_device_ordinal(d),
         ExecutableRunOptions().set_device_ordinal(d).set_allocator(allocator));
     LiteralTestUtil::ExpectR1Near<float>(
-        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 
     // At least one allocation should have been performed when executing the
     // computation.
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 373dd3c5df..7e14e77366 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -57,10 +57,9 @@ XLA_TEST_F(LocalClientExecuteTest, Constant) {
   ComputationBuilder builder(local_client_, TestName());
   auto y = builder.ConstantR0<float>(123.0f);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-
-  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(*result),
+  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
@@ -71,10 +70,9 @@ XLA_TEST_F(LocalClientExecuteTest, AddScalars) {
   builder.Add(x, y);
 
   auto x_value = LiteralToShapedBuffer(*Literal::CreateR0<float>(42.0f));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_value.get()});
-
-  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_value});
+  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
@@ -85,10 +83,9 @@ XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) {
   builder.Add(x, y);
 
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR1<float>({}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
-  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
+  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }
 
@@ -100,11 +97,10 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectors) {
 
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
@@ -116,13 +112,12 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
   ExecutionProfile profile;
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {x_array.get()},
-      DefaultExecutableBuildOptions(),
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(
+      builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions(),
       DefaultExecutableRunOptions().set_execution_profile(&profile));
 
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
   EXPECT_GT(profile.compute_and_transfer_time_ns(), 0);
 }
 
@@ -136,27 +131,27 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
 
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result_colmaj =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
 
   // Run with the parameter values in a different order.
-  std::unique_ptr<ScopedShapedBuffer> result_param_swap =
-      ExecuteLocallyOrDie(computation, {y_array.get(), x_array.get()});
+  ScopedShapedBuffer result_param_swap =
+      ExecuteLocallyOrDie(computation, {&y_array, &x_array});
   LiteralTestUtil::ExpectR2Near<float>(
       {{11.0f, 22.0f}, {33.0f, 44.0f}},
-      *ShapedBufferToLiteral(*result_param_swap), error_spec_);
+      *ShapedBufferToLiteral(result_param_swap), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
@@ -172,27 +167,27 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
   // Run with col-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_colmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);
 
   // Run with row-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_rowmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_rowmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_rowmaj),
+                                       *ShapedBufferToLiteral(result_rowmaj),
                                        error_spec_);
 }
 
@@ -208,13 +203,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -237,13 +232,13 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -274,11 +269,11 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
        ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2},
                                       /*minor_to_major=*/{1, 0})});
   options.set_result_layout(shape_with_layout);
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {array.get(), array.get()}, options,
-      DefaultExecutableRunOptions());
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&array, &array},
+                          options, DefaultExecutableRunOptions());
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -318,13 +313,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   auto x_buffer = LiteralToShapedBuffer(*x_literal);
   auto y_buffer = LiteralToShapedBuffer(*y_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{56.0f, 46.0f}, {36.0f, 26.0f}},
       LiteralView::Create(*result_literal, {0}));
@@ -363,10 +358,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
        Literal::CreateR1<float>({222.0, -2.0, 10.0}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
 
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -394,18 +388,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
        Literal::CreateR2<float>({{11.0, 3.0}, {4.0, 5.0}}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result_0 =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(*result_0);
+  ScopedShapedBuffer result_0 = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(result_0);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4.0}},
       LiteralView::Create(*result_0_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1}));
 
-  std::unique_ptr<ScopedShapedBuffer> result_1 =
-      ExecuteLocallyOrDie(computation, {result_0.get()});
-  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(*result_1);
+  ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0});
+  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(result_1);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -451,10 +443,8 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
       Literal::MakeTupleOwned(std::move(arg_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   for (int i = 0; i < kElementCount; ++i) {
     LiteralTestUtil::ExpectR1Near<float>(
@@ -509,9 +499,8 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
   auto arg_literal = Literal::MakeTupleOwned(std::move(outer_tuple_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   for (int i = 0; i < kFanout; ++i) {
     for (int j = 0; j < kFanout; ++j) {
@@ -554,9 +543,8 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
   }
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
 
   ShapeIndex index;
   for (int i = 0; i < kTupleDepth; ++i) {
@@ -576,7 +564,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({1.0f, 2.0f, 3.0f}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -592,7 +580,7 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -609,7 +597,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status = ExecuteLocally(
-      builder.Build().ValueOrDie(), {x_array.get()},
+      builder.Build().ValueOrDie(), {&x_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32,
                                          /*dimensions=*/{1, 2, 3, 4},
@@ -642,9 +630,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
           computation, {},
           DefaultExecutableBuildOptions().set_device_ordinal(d),
           DefaultExecutableRunOptions().set_device_ordinal(d));
-      EXPECT_EQ(d, result->device_ordinal());
+      EXPECT_EQ(d, result.device_ordinal());
       LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                            *ShapedBufferToLiteral(*result));
+                                            *ShapedBufferToLiteral(result));
     }
   }
 }
@@ -687,9 +675,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnStream) {
                             DefaultExecutableRunOptions().set_stream(&stream));
     // As a check to verify that the computation ran of the device associated
     // with the stream. This is a weak check, but stronger verification is hard.
-    EXPECT_EQ(d, result->device_ordinal());
+    EXPECT_EQ(d, result.device_ordinal());
     LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                          *ShapedBufferToLiteral(*result));
+                                          *ShapedBufferToLiteral(result));
   }
 }
 
@@ -765,9 +753,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
   builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
 
-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR1Equal<float>(
       {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -791,12 +779,12 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
 
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      executable->Run({x_array.get()}, DefaultExecutableRunOptions())
+  ScopedShapedBuffer result =
+      executable->Run({&x_array}, DefaultExecutableRunOptions())
           .ConsumeValueOrDie();
 
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }
 
 XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
@@ -809,7 +797,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
             literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };
 
@@ -849,7 +837,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
             literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };
 
@@ -917,12 +905,12 @@ void BM_LocalClientOverhead(int num_iters) {
           .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<float>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *literal, *buffer));
+      executors[device_ordinal], *literal, buffer));
 
   const int kWarmups = 2;
 
   auto executable_status = client->Compile(
-      computation, {&buffer->on_host_shape()}, ExecutableBuildOptions());
+      computation, {&buffer.on_host_shape()}, ExecutableBuildOptions());
   ASSERT_IS_OK(executable_status);
   std::unique_ptr<LocalExecutable> executable =
       executable_status.ConsumeValueOrDie();
@@ -934,13 +922,13 @@ void BM_LocalClientOverhead(int num_iters) {
   run_options.set_allocator(&allocator).set_stream(&stream);
 
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 29fd985acf..c60ba2422f 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -128,7 +128,7 @@ LocalClientTestBase::LocalClientTestBase(se::Platform* platform)
 
 LocalClientTestBase::~LocalClientTestBase() {}
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::LiteralToShapedBuffer(
+ScopedShapedBuffer LocalClientTestBase::LiteralToShapedBuffer(
     const Literal& literal) {
   return local_client_
       ->LiteralToShapedBuffer(literal, local_client_->default_device_ordinal())
@@ -155,7 +155,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
   return run_options;
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
@@ -163,7 +163,7 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
@@ -172,16 +172,14 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions());
 }
 
-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 7555d5e893..4ee56a05ec 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -83,8 +83,7 @@ class LocalClientTestBase : public ::testing::Test {
   // Copy the given literal onto the default device and return a
   // ScopedShapedBuffer. Convenience wrapper around
   // LocalClient::LiteralToShapedBuffer.
-  std::unique_ptr<ScopedShapedBuffer> LiteralToShapedBuffer(
-      const Literal& literal);
+  ScopedShapedBuffer LiteralToShapedBuffer(const Literal& literal);
 
   // Construct and return a literal containing the array represented by
   // shaped_buffer.
@@ -93,19 +92,19 @@ class LocalClientTestBase : public ::testing::Test {
 
   // Execute the given computation on the local client. With and without
   // options.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
       const ExecutableRunOptions& run_options);
 
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+  ScopedShapedBuffer ExecuteLocallyOrDie(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+  ScopedShapedBuffer ExecuteLocallyOrDie(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutableBuildOptions& build_options,
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 268ba338f2..e2067bc1b8 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -45,7 +45,7 @@ class TransferManagerTest : public LocalClientTestBase {
 
   ~TransferManagerTest() override = default;
 
-  std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
+  ScopedShapedBuffer AllocateDeviceBuffer(const Shape& shape) {
     return transfer_manager_
         ->AllocateScopedShapedBuffer(
             shape, GetOrCreateAllocator(local_client_->platform()),
@@ -64,10 +64,10 @@ XLA_TEST_F(TransferManagerTest, TransferR0U32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR0Equal<uint32>(42, *result);
 }
@@ -80,10 +80,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR1Equal<float>({1.25f, 2.5f, -17.0f, -20.125f},
                                         *result);
@@ -98,10 +98,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR1Equal<float>(test_vector, *result);
 }
@@ -114,10 +114,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1U8) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   EXPECT_EQ(result->GetR1U8AsString(), test_string);
 }
@@ -130,10 +130,10 @@ XLA_TEST_F(TransferManagerTest, TransferR2F32) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result);
@@ -150,10 +150,10 @@ XLA_TEST_F(TransferManagerTest,
   // Round trip literal through device. Set the on-device layout to something
   // different than the literal layout.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   EXPECT_FALSE(
       LayoutUtil::Equal(result->shape().layout(), literal->shape().layout()));
@@ -170,10 +170,10 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -184,10 +184,10 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -204,10 +204,10 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -219,10 +219,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
@@ -238,10 +238,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
 
   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));
 
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index efb00d56c5..837a01e873 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -129,18 +129,18 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   auto* transfer_manager = backend->transfer_manager();
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<ScopedShapedBuffer> lhs_arg,
+      ScopedShapedBuffer lhs_arg,
       transfer_manager->AllocateScopedShapedBuffer(
           lhs_arg_shape, allocator, backend->default_device_ordinal()));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));
+      executor, *Literal::CreateFromShape(lhs_arg_shape), lhs_arg));
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<ScopedShapedBuffer> rhs_arg,
+      ScopedShapedBuffer rhs_arg,
       transfer_manager->AllocateScopedShapedBuffer(
           rhs_arg_shape, allocator, backend->default_device_ordinal()));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));
+      executor, *Literal::CreateFromShape(rhs_arg_shape), rhs_arg));
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> local_executable,
@@ -165,7 +165,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
       backend->eigen_intra_op_thread_pool());
   TF_ASSERT_OK_AND_ASSIGN(
       auto execution_result,
-      executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
+      executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg},
                                   &hlo_execution_profile));
   (void)execution_result;
 
-- 
GitLab


From d710d01a015fda65348ac0e5c25be3747624a779 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 19 Apr 2018 17:21:50 -0700
Subject: [PATCH 1168/1262] Minor code refactoring.

PiperOrigin-RevId: 193600173
---
 tensorflow/core/kernels/data/BUILD            |  3 ++-
 tensorflow/core/kernels/data/dataset_utils.cc | 13 +++++++++++++
 tensorflow/core/kernels/data/dataset_utils.h  |  2 ++
 tensorflow/core/kernels/data/iterator_ops.cc  | 13 ++-----------
 tensorflow/core/kernels/data/writer_ops.cc    | 15 ++-------------
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 667a6967a8..c78e0aff83 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -515,6 +515,7 @@ tf_kernel_library(
     srcs = ["iterator_ops.cc"],
     deps = [
         ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -586,7 +587,7 @@ tf_kernel_library(
     srcs = ["writer_ops.cc"],
     deps = [
         ":dataset",
-        "//tensorflow/core:core_cpu_internal",
+        ":dataset_utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index e3a3601ee8..67ddb52d57 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/common_runtime/device.h"
 
 namespace tensorflow {
 
@@ -45,6 +46,18 @@ Status MakeIteratorFromInputElement(
   return Status::OK();
 }
 
+IteratorContext MakeIteratorContext(OpKernelContext* ctx) {
+  IteratorContext::Params params;
+  params.env = ctx->env();
+  params.runner = *(ctx->runner());
+  params.lib = ctx->function_library();
+  DeviceBase* device = ctx->function_library()->device();
+  params.allocator_getter = [device](AllocatorAttributes attrs) {
+    return device->GetAllocator(attrs);
+  };
+  return IteratorContext(params);
+}
+
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 6c4191c2be..e5ca71dd99 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -28,6 +28,8 @@ Status MakeIteratorFromInputElement(
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
     std::unique_ptr<IteratorBase>* out_iterator);
 
+IteratorContext MakeIteratorContext(OpKernelContext* ctx);
+
 }  // namespace dataset
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 4e4997d7b3..f5db97fd59 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -609,17 +610,7 @@ class ToSingleElementOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       auto iterator = dataset->MakeIterator("SingleElementIterator");
 
-      IteratorContext::Params params;
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
-      params.lib = ctx->function_library();
-      DeviceBase* device = ctx->function_library()->device();
-      params.allocator_getter = [device](AllocatorAttributes attrs) {
-        return device->GetAllocator(attrs);
-      };
-
-      IteratorContext iter_ctx(std::move(params));
-
+      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 46821fd7b3..656fee1e85 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/record_writer.h"
@@ -72,21 +72,10 @@ class ToTFRecordOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       auto iterator = dataset->MakeIterator("ToTFRecordOpIterator");
 
-      IteratorContext::Params params;  // TODO(b/78245447)
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
-      params.lib = ctx->function_library();
-      DeviceBase* device = ctx->function_library()->device();
-      params.allocator_getter = [device](AllocatorAttributes attrs) {
-        return device->GetAllocator(attrs);
-      };
-
-      IteratorContext iter_ctx(std::move(params));
-
+      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
-
       do {
         OP_REQUIRES_OK_ASYNC(
             ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
-- 
GitLab


From c2905469335715929c630d2bd70068ccbc8eb2d1 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Fri, 20 Apr 2018 09:28:37 +0900
Subject: [PATCH 1169/1262] fix typo

---
 tensorflow/core/grappler/costs/virtual_scheduler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 5116c8183c..7edd10e3e8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -212,7 +212,7 @@ class FirstReadyManager : public ReadyNodeManager {
 };
 
 // CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
-// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv
+// ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv
 // ops, and then it chooses FirstReady among the ops chosen from each
 // internal NodeManagers. The objective is to maximize producer-consumer
 // locality within device, while processing nodes across devices, including
-- 
GitLab


From 28a95990bf9ff228abec6a52389a4244a17a9101 Mon Sep 17 00:00:00 2001
From: manhyuk <manhyuk@kw.ac.kr>
Date: Fri, 20 Apr 2018 09:28:45 +0900
Subject: [PATCH 1170/1262] fix typo

---
 tensorflow/core/grappler/costs/virtual_scheduler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 7edd10e3e8..67bf1e6980 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -199,7 +199,7 @@ class FirstReadyManager : public ReadyNodeManager {
   // current node.
   std::vector<const NodeDef*> nodes_;
   // Newly added nodes are added to waiting_queue_. That way, GetCurrNode(),
-  // wihch returns the front of the nodes_, always returns the same node,
+  // which returns the front of the nodes_, always returns the same node,
   // even if any of new nodes has time_ready smaller than the current node's.
   std::vector<const NodeDef*> waiting_queue_;
   // Comparator functor for heap; stl heap is max heap, so we use "greater than"
-- 
GitLab


From c18a80967e55350affafbf2ff562056d4bddf234 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 17:26:41 -0700
Subject: [PATCH 1171/1262] Add support for non-Tensor args in recompute_grad

Previously, the function decorated by recompute_grad had to have a signature that contained only positional arguments, and all those arguments had to be Tensors. Most "layers" users define however have non-Tensor arguments (for example, various hyperparameters) and often have keyword arguments as well. This change allows a user to use whatever function signature they wish while being explicit about which arguments are Tensors.

PiperOrigin-RevId: 193600682
---
 .../layers/python/layers/rev_block_lib.py     |  77 +++++++++++--
 .../python/layers/rev_block_lib_test.py       | 102 ++++++++++++++++++
 2 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 02d294c68f..9f904cc302 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -45,6 +45,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
 
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
@@ -429,12 +430,13 @@ def enable_with_args(dec):
 
 
 @enable_with_args
-def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
+                   tensor_arg_names=None):
   """Decorator that recomputes the function on the backwards pass.
 
   Args:
-    fn: a function that takes Tensors (all as positional arguments) and returns
-      a tuple of Tensors.
+    fn: the subgraph-producing function to wrap and recompute when computing
+      gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s.
     use_data_dep: `bool`, if `True` will use a dummy data dependency to force
       the recompute to happen. If `False` will use a control dependency. By
       default will be `True` if in an XLA context and `False` otherwise. XLA
@@ -443,17 +445,25 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
       that all gradients are produced before any are consumed by downstream ops.
       If `use_data_dep` is also `True`, will use a data dependency instead of
       a control dependency.
+    tensor_arg_names: `list<str>`, names of the `Tensor` arguments to `fn`. If
+      `None`, assumes all arguments are `Tensor`s.
 
   Returns:
     A wrapped fn that is identical to fn when called, but its activations will
     be discarded and recomputed on the backwards pass (i.e. on a call to
     tf.gradients).
   """
+  if tensor_arg_names:
+    if not isinstance(tensor_arg_names, (list, tuple)):
+      raise TypeError("tensor_arg_names must be a list")
 
   @functools.wraps(fn)
-  def wrapped(*args):
+  def wrapped(*args, **kwargs):
+    tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs,
+                                                    tensor_arg_names)
     return _recompute_grad(
-        fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
+        tensor_only_fn, tensor_args, use_data_dep=use_data_dep,
+        tupleize_grads=tupleize_grads)
 
   return wrapped
 
@@ -463,11 +473,59 @@ def _is_on_tpu():
   return control_flow_util.GetContainingXLAContext(ctxt) is not None
 
 
-def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
+def _make_tensor_only(fn, args, kwargs, tensor_arg_names):
+  """Return fn such that it only takes Tensor args for tensor_arg_names."""
+  argspec = tf_inspect.getargspec(fn)
+  if argspec.varargs is not None or argspec.keywords is not None:
+    raise ValueError("Function decorated with recompute_grad must not use "
+                     "*args or **kwargs.")
+  fn_arg_names = list(argspec.args)
+
+  # name_to_arg is a dict of argument name to argument value, including both
+  # positional and keyword arguments passed.
+  name_to_arg = {}
+  # Populate positional arguments.
+  for name, arg in zip(fn_arg_names[:len(args)], args):
+    name_to_arg[name] = arg
+  # Populate keyword arguments.
+  name_to_arg.update(kwargs)
+
+  # Separate the Tensor arguments from the non-Tensor arguments.
+  # The default is that all arguments are Tensor arguments.
+  tensor_arg_names = tensor_arg_names or fn_arg_names
+  for name in tensor_arg_names:
+    if name not in name_to_arg:
+      raise ValueError("Must provide Tensor argument %s" % name)
+  tensor_args = [name_to_arg[name] for name in tensor_arg_names]
+  non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items()
+                            if name not in tensor_arg_names])
+
+  # Check that Tensor arguments are in fact Tensors and that non-Tensor
+  # arguments are not.
+  for name, arg in zip(tensor_arg_names, tensor_args):
+    if not isinstance(arg, framework_ops.Tensor):
+      raise TypeError("Fn argument %s must be a Tensor." % name)
+  for name, arg in non_tensor_kwargs.items():
+    if isinstance(arg, framework_ops.Tensor):
+      raise TypeError("Fn argument %s must not be a Tensor." % name)
+
+  # Construct a Tensor-only wrapper function that will pass the non-Tensor
+  # arguments as well when called.
+  def tensor_only_fn(*tensors):
+    all_kwargs = dict(zip(tensor_arg_names, tensors))
+    all_kwargs.update(non_tensor_kwargs)
+    return fn(**all_kwargs)
+
+  return tensor_only_fn, tensor_args
+
+
+def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT,
+                    tupleize_grads=False):
   """See recompute_grad."""
   for arg in args:
     if not isinstance(arg, framework_ops.Tensor):
       raise ValueError("All inputs to function must be Tensors")
+
   use_data_dep_ = use_data_dep
   if use_data_dep_ == _USE_DEFAULT:
     use_data_dep_ = _is_on_tpu()
@@ -501,14 +559,11 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
     grad_vars = grads[len(inputs):]
     return grad_inputs, grad_vars
 
+  # TODO(rsepassi): Replace with tf.custom_gradient
   @_fn_with_custom_grad(grad_fn)
   def fn_with_recompute(*args):
     cached_vs.append(variable_scope.get_variable_scope())
-    # TODO(rsepassi): Rm conditional in TF 1.4
-    if hasattr(contrib_framework_ops, "current_arg_scope"):
-      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
-    else:
-      cached_arg_scope.append({})
+    cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
     return fn(*args)
 
   return fn_with_recompute(*args)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 392a490be1..66ccc696f9 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -318,6 +318,108 @@ class RecomputeTest(test.TestCase):
       self.assertEqual(1, len(grads))
       self.assertTrue(grads[0] is not None)
 
+  def testWithNontensorArgs(self):
+    @rev_block_lib.recompute_grad(tupleize_grads=True,
+                                  tensor_arg_names=["inputs"])
+    def layer_with_recompute(inputs, plus=None):
+      var = variable_scope.get_variable("var", ())
+      self.assertFalse(plus)  # called with False below
+      if plus:
+        return var + inputs
+      else:
+        return var * inputs
+
+    inputs = array_ops.ones((), dtypes.float32)
+    outputs = layer_with_recompute(inputs, plus=False)
+    loss = math_ops.square(outputs)
+    grads = gradients_impl.gradients(loss, variables.trainable_variables())
+    self.assertEqual(1, len(grads))
+    self.assertTrue(grads[0] is not None)
+
+
+class MakeTensorOnlyTest(test.TestCase):
+
+  def testMakeTensorOnly(self):
+    def fn(a, b, c, d=1, e=None, f=7):
+      return (a, b, c, d, e, f)
+
+    t1 = array_ops.ones(())
+    t2 = array_ops.ones(())
+    t3 = array_ops.ones(())
+    args = [1, t1, 3, t2]
+    kwargs = {"e": t3}
+    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
+        fn, args, kwargs, ["b", "d", "e"])
+    self.assertAllEqual(tensor_args, [t1, t2, t3])
+    out = tensor_only_fn(*tensor_args)
+    self.assertAllEqual(out, (1, t1, 3, t2, t3, 7))
+
+  def testMakeTensorOnlyPositionalArgsOnly(self):
+    def fn(a, b, c):
+      return (a, b, c)
+
+    t1 = array_ops.ones(())
+    t2 = array_ops.ones(())
+    args = [t1, 3, t2]
+    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
+        fn, args, {}, ["a", "c"])
+    self.assertAllEqual(tensor_args, [t1, t2])
+    out = tensor_only_fn(*tensor_args)
+    self.assertAllEqual(out, (t1, 3, t2))
+
+  def testMakeTensorOnlyKwargsArgsOnly(self):
+    def fn(a=1, b=2, c=3):
+      return (a, b, c)
+
+    t1 = array_ops.ones(())
+    t2 = array_ops.ones(())
+    args = [t1]
+    kwargs = {"c": t2}
+    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
+        fn, args, kwargs, ["a", "c"])
+    self.assertAllEqual(tensor_args, [t1, t2])
+    out = tensor_only_fn(*tensor_args)
+    self.assertAllEqual(out, (t1, 2, t2))
+
+  def testErrorOnMissingTensorArg(self):
+    def fn(a, b):
+      return (a, b)
+
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "provide Tensor argument"):
+      rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"])
+
+  def testErrorOnSignatureSplats(self):
+    def fn1(a, *args):
+      return (a, args)
+
+    err_msg = r"must not use \*args or \*\*kwargs"
+    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
+      rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"])
+
+    def fn2(a, **kwargs):
+      return (a, kwargs)
+
+    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
+      rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"])
+
+  def testErrorOnNonTensorForTensor(self):
+    def fn(a, b):
+      return (a, b)
+
+    with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"):
+      rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"])
+
+  def testErrorOnTensorForNonTensor(self):
+    def fn(a, b):
+      return (a, b)
+
+    with self.assertRaisesWithPredicateMatch(
+        TypeError, "must not be a Tensor"):
+      t1 = array_ops.ones(())
+      t2 = array_ops.ones(())
+      rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"])
+
 
 class FnWithCustomGradTest(test.TestCase):
 
-- 
GitLab


From 13a7e9820a800cf3877e5a44b9f654f79808a2d4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 17:27:04 -0700
Subject: [PATCH 1172/1262] Update DecodeProtoOp so that it returns explicitly
 specified default values for missing fields.

PiperOrigin-RevId: 193600735
---
 .../kernel_tests/defaut_values.TestCase.pbtxt |  94 +++++++++
 .../promote_unsigned.TestCase.pbtxt           |  10 +-
 .../python/kernel_tests/test_example.proto    |  33 +++
 tensorflow/core/kernels/decode_proto_op.cc    | 188 +++++++++++++++---
 4 files changed, 300 insertions(+), 25 deletions(-)
 create mode 100644 tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt

diff --git a/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt
new file mode 100644
index 0000000000..4e31681907
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt
@@ -0,0 +1,94 @@
+primitive {
+  # No fields specified, so we get all defaults
+}
+shape: 1
+sizes: 0
+field {
+  name: "double_default"
+  dtype: DT_DOUBLE
+  expected { double_value: 1.0 }
+}
+sizes: 0
+field {
+  name: "float_default"
+  dtype: DT_DOUBLE  # Try casting the float field to double.
+  expected { double_value: 2.0 }
+}
+sizes: 0
+field {
+  name: "int64_default"
+  dtype: DT_INT64
+  expected { int64_value: 3 }
+}
+sizes: 0
+field {
+  name: "uint64_default"
+  dtype: DT_INT64
+  expected { int64_value: 4 }
+}
+sizes: 0
+field {
+  name: "int32_default"
+  dtype: DT_INT32
+  expected { int32_value: 5 }
+}
+sizes: 0
+field {
+  name: "fixed64_default"
+  dtype: DT_INT64
+  expected { int64_value: 6 }
+}
+sizes: 0
+field {
+  name: "fixed32_default"
+  dtype: DT_INT32
+  expected { int32_value: 7 }
+}
+sizes: 0
+field {
+  name: "bool_default"
+  dtype: DT_BOOL
+  expected { bool_value: true }
+}
+sizes: 0
+field {
+  name: "string_default"
+  dtype: DT_STRING
+  expected { string_value: "a" }
+}
+sizes: 0
+field {
+  name: "bytes_default"
+  dtype: DT_STRING
+  expected { string_value: "a longer default string" }
+}
+sizes: 0
+field {
+  name: "uint32_default"
+  dtype: DT_INT32
+  expected { int32_value: -1 }
+}
+sizes: 0
+field {
+  name: "sfixed32_default"
+  dtype: DT_INT32
+  expected { int32_value: 10 }
+}
+sizes: 0
+field {
+  name: "sfixed64_default"
+  dtype: DT_INT64
+  expected { int64_value: 11 }
+}
+sizes: 0
+field {
+  name: "sint32_default"
+  dtype: DT_INT32
+  expected { int32_value: 12 }
+}
+sizes: 0
+field {
+  name: "sint64_default"
+  dtype: DT_INT64
+  expected { int64_value: 13 }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
index db7555bf2d..bc07efc8f3 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
+++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt
@@ -4,7 +4,6 @@ primitive {
 }
 shape: 1
 sizes: 1
-sizes: 1
 field {
   name: "fixed32_value"
   dtype: DT_INT64
@@ -12,6 +11,7 @@ field {
     int64_value: 4294967295
   }
 }
+sizes: 1
 field {
   name: "uint32_value"
   dtype: DT_INT64
@@ -19,3 +19,11 @@ field {
     int64_value: 4294967295
   }
 }
+sizes: 0
+field {
+  name: "uint32_default"
+  dtype: DT_INT64
+  expected {
+    int64_value: 4294967295  # Comes from an explicitly-specified default
+  }
+}
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
index dc495034ff..a2c88e372b 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -72,6 +72,23 @@ message RepeatedPrimitiveValue {
   repeated sint32 sint32_value = 17;
   repeated sint64 sint64_value = 18;
   repeated PrimitiveValue message_value = 19;
+
+  // Optional fields with explicitly-specified defaults.
+  optional double double_default = 20 [default = 1.0];
+  optional float float_default = 21 [default = 2.0];
+  optional int64 int64_default = 22 [default = 3];
+  optional uint64 uint64_default = 23 [default = 4];
+  optional int32 int32_default = 24 [default = 5];
+  optional fixed64 fixed64_default = 25 [default = 6];
+  optional fixed32 fixed32_default = 26 [default = 7];
+  optional bool bool_default = 27 [default = true];
+  optional string string_default = 28 [default = "a"];
+  optional bytes bytes_default = 29 [default = "a longer default string"];
+  optional uint32 uint32_default = 30 [default = 4294967295];
+  optional sfixed32 sfixed32_default = 31 [default = 10];
+  optional sfixed64 sfixed64_default = 32 [default = 11];
+  optional sint32 sint32_default = 33 [default = 12];
+  optional sint64 sint64_default = 34 [default = 13];
 }
 
 // A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue
@@ -97,6 +114,22 @@ message PackedPrimitiveValue {
   repeated sint32 sint32_value = 17 [packed = true];
   repeated sint64 sint64_value = 18 [packed = true];
   repeated PrimitiveValue message_value = 19;
+
+  optional double double_default = 20 [default = 1.0];
+  optional float float_default = 21 [default = 2.0];
+  optional int64 int64_default = 22 [default = 3];
+  optional uint64 uint64_default = 23 [default = 4];
+  optional int32 int32_default = 24 [default = 5];
+  optional fixed64 fixed64_default = 25 [default = 6];
+  optional fixed32 fixed32_default = 26 [default = 7];
+  optional bool bool_default = 27 [default = true];
+  optional string string_default = 28 [default = "a"];
+  optional bytes bytes_default = 29 [default = "a longer default string"];
+  optional uint32 uint32_default = 30 [default = 4294967295];
+  optional sfixed32 sfixed32_default = 31 [default = 10];
+  optional sfixed64 sfixed64_default = 32 [default = 11];
+  optional sint32 sint32_default = 33 [default = 12];
+  optional sint64 sint64_default = 34 [default = 13];
 }
 
 message EnumValue {
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
index b4e5b776ed..24f8a4f72f 100644
--- a/tensorflow/core/kernels/decode_proto_op.cc
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -105,11 +105,137 @@ bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) {
   }
 }
 
+// Used to store the default value of a protocol message field, casted to the
+// type of the output tensor.
+//
+// TODO(paskin): Use absl::variant once TensorFlow gets absl dependencies.
+struct DefaultValue {
+  DataType dtype = DataType::DT_INVALID;
+  union Value {
+    bool v_bool;           // DT_BOOL
+    uint8 v_uint8;         // DT_UINT8
+    int8 v_int8;           // DT_INT8
+    int32 v_int32;         // DT_INT32
+    int64 v_int64;         // DT_INT64
+    float v_float;         // DT_FLOAT
+    double v_double;       // DT_DOUBLE
+    const char* v_string;  // DT_STRING
+  };
+  Value value;
+};
+
+// Initializes a DefaultValue object.  This generic template handles numeric
+// types and strings are handled by a template specialization below.
+//
+// Args:
+//   dtype: the type of the output tensor
+//   value: the default value as obtained from the FieldDescriptor
+//   result: the object to initialize
+template <typename T>
+Status InitDefaultValue(DataType dtype, const T value, DefaultValue* result) {
+  result->dtype = dtype;
+  switch (dtype) {
+    case DT_BOOL:
+      result->value.v_bool = static_cast<bool>(value);
+      break;
+    case DT_INT32:
+      result->value.v_int32 = static_cast<int32>(value);
+      break;
+    case DT_INT8:
+      result->value.v_int8 = static_cast<int8>(value);
+      break;
+    case DT_UINT8:
+      result->value.v_uint8 = static_cast<uint8>(value);
+      break;
+    case DT_INT64:
+      result->value.v_int64 = static_cast<int64>(value);
+      break;
+    case DT_FLOAT:
+      result->value.v_float = static_cast<float>(value);
+      break;
+    case DT_DOUBLE:
+      result->value.v_double = static_cast<double>(value);
+      break;
+    default:
+      // We should never get here, given the type checking that occurs earlier.
+      return errors::Internal(
+          "Cannot initialize default value for unsupported type: ",
+          DataTypeString(dtype));
+  }
+  return Status::OK();
+}
+
+template <>
+Status InitDefaultValue(DataType dtype, const char* value,
+                        DefaultValue* result) {
+  // These are sanity checks that should never trigger given the code that
+  // leads here.
+  if (TF_PREDICT_FALSE(dtype != DT_STRING)) {
+    return errors::InvalidArgument(
+        "Cannot cast field to anything but DT_STRING");
+  }
+  if (TF_PREDICT_FALSE(value == nullptr)) {
+    return errors::InvalidArgument("Null default string value.");
+  }
+  result->dtype = DT_STRING;
+  result->value.v_string = value;
+  return Status::OK();
+}
+
+// Initializes a default value from the output data type and the field
+// descriptor.
+Status InitDefaultValueFromFieldDescriptor(DataType dtype,
+                                           const FieldDescriptor* field_desc,
+                                           DefaultValue* result) {
+  switch (field_desc->type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return InitDefaultValue(dtype, field_desc->default_value_double(),
+                              result);
+    case WireFormatLite::TYPE_FLOAT:
+      return InitDefaultValue(dtype, field_desc->default_value_float(), result);
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SINT64:
+    case WireFormatLite::TYPE_SFIXED64:
+      return InitDefaultValue(dtype, field_desc->default_value_int64(), result);
+    case WireFormatLite::TYPE_FIXED64:
+    case WireFormatLite::TYPE_UINT64:
+      return InitDefaultValue(dtype, field_desc->default_value_uint64(),
+                              result);
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_SINT32:
+    case WireFormatLite::TYPE_SFIXED32:
+      return InitDefaultValue(dtype, field_desc->default_value_int32(), result);
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return InitDefaultValue(dtype, field_desc->default_value_uint32(),
+                              result);
+    case WireFormatLite::TYPE_BOOL:
+      return InitDefaultValue(dtype, field_desc->default_value_bool(), result);
+    case WireFormatLite::TYPE_BYTES:
+    case WireFormatLite::TYPE_STRING:
+      // Manipulating default string values as C-style pointers should be OK
+      // for typical code-generated protocol messages.  It is possible in
+      // principle to register a message descriptor on the fly, and these
+      // pointers may not be stable if that descriptor has a weird
+      // implementation.  (But the return type of default_value_string() is
+      // const string&, so it'd have to be very weird.)
+      return InitDefaultValue(dtype, field_desc->default_value_string().c_str(),
+                              result);
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+      return InitDefaultValue(dtype, "", result);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return Status::OK();
+}
+
 // A FieldInfo holds a handful of information from the FieldDescriptor
 // and user attributes.
 struct FieldInfo {
-  FieldInfo(const FieldDescriptor* field_desc, int user_index)
-      : output_index(user_index) {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index,
+            DefaultValue def_value)
+      : output_index(user_index), default_value(def_value) {
     // Without this intermediate data structure, the profile had hotspots
     // calling methods of FieldDescriptor.
     number = field_desc->number();
@@ -144,6 +270,7 @@ struct FieldInfo {
   WireFormatLite::FieldType type;
   int number;
   bool is_repeated;
+  DefaultValue default_value;
 };
 
 // A CountCollector counts sizes of repeated and optional fields in a proto.
@@ -394,8 +521,11 @@ class DenseCollector {
   DenseCollector() = default;
 
   // A DenseCollector applies to one field of a serialized message.
-  DenseCollector(uint8* datap, DataType dtype, int max_repeat_count)
-      : datap_(datap), dtype_(dtype), max_repeat_count_(max_repeat_count) {}
+  // Note that default_value.dtype is the type of the output tensor.
+  DenseCollector(uint8* datap, DefaultValue default_value, int max_repeat_count)
+      : datap_(datap),
+        default_value_(default_value),
+        max_repeat_count_(max_repeat_count) {}
 
   // Reads a value from the input stream and stores it.
   //
@@ -415,8 +545,8 @@ class DenseCollector {
     }
     next_repeat_index_ = index + 1;
 
-    return internal::ReadValue(input, field.type, field.number, dtype_, index,
-                               datap_);
+    return internal::ReadValue(input, field.type, field.number,
+                               default_value_.dtype, index, datap_);
   }
 
   // Reads and stores a length-delimited list of values.
@@ -445,8 +575,8 @@ class DenseCollector {
           field.number, ", Max entries allowed: ", max_repeat_count_);
     } else {
       return internal::ReadPackedFromArray(buf, buf_size, field.type,
-                                           field.number, dtype_, stride,
-                                           &next_repeat_index_, datap_);
+                                           field.number, default_value_.dtype,
+                                           stride, &next_repeat_index_, datap_);
     }
   }
 
@@ -454,23 +584,23 @@ class DenseCollector {
   // Dispatches to the appropriately typed field default based on the
   // runtime type tag.
   Status FillWithDefaults() {
-    switch (dtype_) {
+    switch (default_value_.dtype) {
       case DataType::DT_FLOAT:
-        return FillDefault<float>();
+        return FillDefault<float>(default_value_.value.v_float);
       case DataType::DT_DOUBLE:
-        return FillDefault<double>();
+        return FillDefault<double>(default_value_.value.v_double);
       case DataType::DT_INT32:
-        return FillDefault<int32>();
+        return FillDefault<int32>(default_value_.value.v_int32);
       case DataType::DT_UINT8:
-        return FillDefault<uint8>();
+        return FillDefault<uint8>(default_value_.value.v_uint8);
       case DataType::DT_INT8:
-        return FillDefault<int8>();
+        return FillDefault<int8>(default_value_.value.v_int8);
       case DataType::DT_STRING:
-        return FillDefault<string>();
+        return FillDefault<string>(default_value_.value.v_string);
       case DataType::DT_INT64:
-        return FillDefault<int64>();
+        return FillDefault<int64>(default_value_.value.v_int64);
       case DataType::DT_BOOL:
-        return FillDefault<bool>();
+        return FillDefault<bool>(default_value_.value.v_bool);
       default:
         // There are many tensorflow dtypes not handled here, but they
         // should not come up unless type casting is added to the Op.
@@ -485,9 +615,9 @@ class DenseCollector {
   // default value. This uses next_repeat_index_ which counts the number
   // of parsed values for the field.
   template <class T>
-  Status FillDefault() {
+  Status FillDefault(const T& default_value) {
     for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
-      reinterpret_cast<T*>(datap_)[i] = T();
+      reinterpret_cast<T*>(datap_)[i] = default_value;
     }
     return Status::OK();
   }
@@ -501,7 +631,7 @@ class DenseCollector {
   // for more items than we have allocated space.
   void* const datap_ = nullptr;
 
-  const DataType dtype_ = DataType::DT_INVALID;
+  const DefaultValue default_value_;
   const int max_repeat_count_ = 0;
 };
 
@@ -577,8 +707,14 @@ class DecodeProtoOp : public OpKernel {
 
     // Now store the fields in sorted order.
     for (int i = 0; i < field_names.size(); i++) {
-      fields_.push_back(MakeUnique<FieldInfo>(field_descs[output_indices[i]],
-                                              output_indices[i]));
+      const int output_index = output_indices[i];
+      const DataType dtype = output_types[output_index];
+      const FieldDescriptor* field_descriptor = field_descs[output_index];
+      DefaultValue default_value;
+      OP_REQUIRES_OK(context, InitDefaultValueFromFieldDescriptor(
+                                  dtype, field_descriptor, &default_value));
+      fields_.push_back(
+          MakeUnique<FieldInfo>(field_descriptor, output_index, default_value));
     }
 
     message_prototype_ = message_factory_.GetPrototype(message_desc);
@@ -805,9 +941,13 @@ class DecodeProtoOp : public OpKernel {
 
       std::vector<DenseCollector> collectors;
       collectors.reserve(field_count);
-      for (const TensorInfo& info : tensors) {
+      for (int output_index = 0; output_index < field_count; ++output_index) {
+        const TensorInfo& info = tensors[output_index];
+        const FieldInfo* field_info = fields_[output_index].get();
+        DCHECK(field_info != nullptr);
+        const DefaultValue default_value = field_info->default_value;
         collectors.emplace_back(info.data + message_index * info.stride,
-                                info.dtype, info.last_dim_size);
+                                default_value, info.last_dim_size);
       }
 
       // Fill in output tensors from the wire.
-- 
GitLab


From 976229dcbfde389864069433ebfc4085015df9c1 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Thu, 19 Apr 2018 17:30:49 -0700
Subject: [PATCH 1173/1262] Internal testing changes

PiperOrigin-RevId: 193601134
---
 tensorflow/contrib/lite/kernels/BUILD | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 8cfa7e53d1..80cefe83b2 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -212,6 +212,7 @@ tf_cc_test(
     name = "audio_spectrogram_test",
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -225,6 +226,7 @@ tf_cc_test(
     name = "mfcc_test",
     size = "small",
     srcs = ["mfcc_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -346,6 +348,7 @@ tf_cc_test(
     name = "cast_test",
     size = "small",
     srcs = ["cast_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -398,6 +401,7 @@ tf_cc_test(
     name = "dequantize_test",
     size = "small",
     srcs = ["dequantize_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -504,6 +508,7 @@ tf_cc_test(
     name = "maximum_minimum_test",
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
-- 
GitLab


From 7f87125dceb3c69c5fd1d0712c6c93cc4ceaa854 Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser <lukaszkaiser@google.com>
Date: Thu, 19 Apr 2018 17:39:09 -0700
Subject: [PATCH 1174/1262] internal END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 193571934

PiperOrigin-RevId: 193602050
---
 tensorflow/core/lib/io/record_reader.cc    | 147 ++++++++++----
 tensorflow/core/lib/io/record_reader.h     |  16 +-
 tensorflow/core/lib/io/recordio_test.cc    | 212 +++++++--------------
 tensorflow/core/lib/io/zlib_inputstream.cc |   9 +-
 tensorflow/core/lib/io/zlib_inputstream.h  |  10 +-
 5 files changed, 188 insertions(+), 206 deletions(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index c24628be57..6de850bb20 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,55 +56,110 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : options_(options),
-      input_stream_(new RandomAccessInputStream(file)),
-      last_read_failed_(false) {
+    : src_(file), options_(options) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
-                                                options.buffer_size, true));
+    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
+  } else {
+    input_stream_.reset(new RandomAccessInputStream(file));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    input_stream_.reset(new ZlibInputStream(
-        input_stream_.release(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options, true));
+    zlib_input_stream_.reset(new ZlibInputStream(
+        input_stream_.get(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
+    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-//
-// offset corresponds to the user-provided value to ReadRecord()
-// and is used only in error messages.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
+// May use *storage as backing store.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
+                                     StringPiece* result, string* storage) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
+  storage->resize(expected);
+
+#if !defined(IS_SLIM_BUILD)
+  if (zlib_input_stream_) {
+    // If we have a zlib compressed buffer, we assume that the
+    // file is being read sequentially, and we use the underlying
+    // implementation to read the data.
+    //
+    // No checks are done to validate that the file is being read
+    // sequentially.  At some point the zlib input buffer may support
+    // seeking, possibly inefficiently.
+    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
+
+    if (storage->size() != expected) {
+      if (storage->empty()) {
+        return errors::OutOfRange("eof");
+      } else {
+        return errors::DataLoss("truncated record at ", offset);
+      }
+    }
 
-  if (result->size() != expected) {
-    if (result->empty()) {
-      return errors::OutOfRange("eof");
+    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
+    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
+      return errors::DataLoss("corrupted record at ", offset);
+    }
+    *result = StringPiece(storage->data(), n);
+  } else {
+#endif  // IS_SLIM_BUILD
+    if (options_.buffer_size > 0) {
+      // If we have a buffer, we assume that the file is being read
+      // sequentially, and we use the underlying implementation to read the
+      // data.
+      //
+      // No checks are done to validate that the file is being read
+      // sequentially.
+      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
+
+      if (storage->size() != expected) {
+        if (storage->empty()) {
+          return errors::OutOfRange("eof");
+        } else {
+          return errors::DataLoss("truncated record at ", offset);
+        }
+      }
+
+      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
+      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
+        return errors::DataLoss("corrupted record at ", offset);
+      }
+      *result = StringPiece(storage->data(), n);
     } else {
-      return errors::DataLoss("truncated record at ", offset);
+      // This version supports reading from arbitrary offsets
+      // since we are accessing the random access file directly.
+      StringPiece data;
+      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
+      if (data.size() != expected) {
+        if (data.empty()) {
+          return errors::OutOfRange("eof");
+        } else {
+          return errors::DataLoss("truncated record at ", offset);
+        }
+      }
+      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
+      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
+        return errors::DataLoss("corrupted record at ", offset);
+      }
+      *result = StringPiece(data.data(), n);
     }
+#if !defined(IS_SLIM_BUILD)
   }
+#endif  // IS_SLIM_BUILD
 
-  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
-  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
-    return errors::DataLoss("corrupted record at ", offset);
-  }
-  result->resize(n);
   return Status::OK();
 }
 
@@ -112,42 +167,50 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
-  // Position the input stream.
-  int64 curr_pos = input_stream_->Tell();
-  int64 desired_pos = static_cast<int64>(*offset);
-  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
-      (curr_pos == desired_pos && last_read_failed_)) {
-    last_read_failed_ = false;
-    TF_RETURN_IF_ERROR(input_stream_->Reset());
-    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
-  } else if (curr_pos < desired_pos) {
-    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
-  }
-  DCHECK_EQ(desired_pos, input_stream_->Tell());
-
   // Read header data.
-  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
+  StringPiece lbuf;
+  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
   if (!s.ok()) {
-    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(record->data());
+  const uint64 length = core::DecodeFixed64(lbuf.data());
 
   // Read data
-  s = ReadChecksummed(*offset + kHeaderSize, length, record);
+  StringPiece data;
+  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
   if (!s.ok()) {
-    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
+  if (record->data() != data.data()) {
+    // RandomAccessFile placed the data in some other location.
+    memmove(&(*record)[0], data.data(), data.size());
+  }
+
+  record->resize(data.size());
+
   *offset += kHeaderSize + length + kFooterSize;
-  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
+Status RecordReader::SkipNBytes(uint64 offset) {
+#if !defined(IS_SLIM_BUILD)
+  if (zlib_input_stream_) {
+    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
+  } else {
+#endif
+    if (options_.buffer_size > 0) {
+      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
+    }
+#if !defined(IS_SLIM_BUILD)
+  }
+#endif
+  return Status::OK();
+}  // namespace io
+
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index f6d587dfa0..26278e0328 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,14 +69,25 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
+  //
+  // Note: if buffering is used (with or without compression), access must be
+  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
+  // Skip the records till "offset". Returns OK on success,
+  // OUT_OF_RANGE for end of file, or something else for an error.
+  Status SkipNBytes(uint64 offset);
+
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, string* result);
+  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
+                         string* storage);
 
+  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-  bool last_read_failed_;
+#if !defined(IS_SLIM_BUILD)
+  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
+#endif  // IS_SLIM_BUILD
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -110,6 +121,7 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
+    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index da514bd21c..63235761d9 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,11 +26,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
-namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-string BigString(const string& partial_string, size_t n) {
+static string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -40,66 +39,62 @@ string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-string NumberString(int n) {
+static string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class StringDest : public WritableFile {
- public:
-  explicit StringDest(string* contents) : contents_(contents) {}
-
-  Status Close() override { return Status::OK(); }
-  Status Flush() override { return Status::OK(); }
-  Status Sync() override { return Status::OK(); }
-  Status Append(const StringPiece& slice) override {
-    contents_->append(slice.data(), slice.size());
-    return Status::OK();
-  }
-
+class RecordioTest : public ::testing::Test {
  private:
-  string* contents_;
-};
-
-class StringSource : public RandomAccessFile {
- public:
-  explicit StringSource(string* contents)
-      : contents_(contents), force_error_(false) {}
-
-  Status Read(uint64 offset, size_t n, StringPiece* result,
-              char* scratch) const override {
-    if (force_error_) {
-      force_error_ = false;
-      return errors::DataLoss("read error");
+  class StringDest : public WritableFile {
+   public:
+    string contents_;
+
+    Status Close() override { return Status::OK(); }
+    Status Flush() override { return Status::OK(); }
+    Status Sync() override { return Status::OK(); }
+    Status Append(const StringPiece& slice) override {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
     }
-
-    if (offset >= contents_->size()) {
-      return errors::OutOfRange("end of file");
-    }
-
-    if (contents_->size() < offset + n) {
-      n = contents_->size() - offset;
+  };
+
+  class StringSource : public RandomAccessFile {
+   public:
+    StringPiece contents_;
+    mutable bool force_error_;
+    mutable bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) {}
+
+    Status Read(uint64 offset, size_t n, StringPiece* result,
+                char* scratch) const override {
+      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return errors::DataLoss("read error");
+      }
+
+      if (offset >= contents_.size()) {
+        return errors::OutOfRange("end of file");
+      }
+
+      if (contents_.size() < offset + n) {
+        n = contents_.size() - offset;
+        returned_partial_ = true;
+      }
+      *result = StringPiece(contents_.data() + offset, n);
+      return Status::OK();
     }
-    *result = StringPiece(contents_->data() + offset, n);
-    return Status::OK();
-  }
-
-  void force_error() { force_error_ = true; }
-
- private:
-  string* contents_;
-  mutable bool force_error_;
-};
+  };
 
-class RecordioTest : public ::testing::Test {
- private:
-  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -109,9 +104,7 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : dest_(&contents_),
-        source_(&contents_),
-        reading_(false),
+      : reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -126,11 +119,12 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return contents_.size(); }
+  size_t WrittenBytes() const { return dest_.contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
+      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -143,20 +137,26 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
+  void IncrementByte(int offset, int delta) {
+    dest_.contents_[offset] += delta;
+  }
 
-  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
+  void SetByte(int offset, char new_byte) {
+    dest_.contents_[offset] = new_byte;
+  }
 
-  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
+  void ShrinkSize(int bytes) {
+    dest_.contents_.resize(dest_.contents_.size() - bytes);
+  }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&contents_[header_offset], crc);
+    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error(); }
+  void ForceError() { source_.force_error_ = true; }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -165,6 +165,7 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
+    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -216,100 +217,16 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
-void TestNonSequentialReads(const RecordWriterOptions& writer_options,
-                            const RecordReaderOptions& reader_options) {
-  string contents;
-  StringDest dst(&contents);
-  RecordWriter writer(&dst, writer_options);
-  for (int i = 0; i < 10; ++i) {
-    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
-  }
-  TF_ASSERT_OK(writer.Close());
-
-  StringSource file(&contents);
-  RecordReader reader(&file, reader_options);
-
-  string record;
-  // First read sequentially to fill in the offsets table.
-  uint64 offsets[10] = {0};
-  uint64 offset = 0;
-  for (int i = 0; i < 10; ++i) {
-    offsets[i] = offset;
-    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
-  }
-
-  // Read randomly: First go back to record #3 then forward to #8.
-  offset = offsets[3];
-  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
-  EXPECT_EQ("3.", record);
-  EXPECT_EQ(offsets[4], offset);
-
-  offset = offsets[8];
-  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
-  EXPECT_EQ("8.", record);
-  EXPECT_EQ(offsets[9], offset);
-}
-
-TEST_F(RecordioTest, NonSequentialReads) {
-  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
-}
-
-TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
-  RecordReaderOptions options;
-  options.buffer_size = 1 << 10;
-  TestNonSequentialReads(RecordWriterOptions(), options);
-}
-
-TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
-  TestNonSequentialReads(
-      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
-      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
-}
-
 // Tests of all the error paths in log_reader.cc follow:
-void AssertHasSubstr(StringPiece s, StringPiece expected) {
+static void AssertHasSubstr(StringPiece s, StringPiece expected) {
   EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
-void TestReadError(const RecordWriterOptions& writer_options,
-                   const RecordReaderOptions& reader_options) {
-  const string wrote = BigString("well hello there!", 100);
-  string contents;
-  StringDest dst(&contents);
-  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
-
-  StringSource file(&contents);
-  RecordReader reader(&file, reader_options);
-
-  uint64 offset = 0;
-  string read;
-  file.force_error();
-  Status status = reader.ReadRecord(&offset, &read);
-  ASSERT_TRUE(errors::IsDataLoss(status));
-  ASSERT_EQ(0, offset);
-
-  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
-  // lose the record.
-  status = reader.ReadRecord(&offset, &read);
-  ASSERT_TRUE(status.ok()) << status;
-  EXPECT_GT(offset, 0);
-  EXPECT_EQ(wrote, read);
-}
-
 TEST_F(RecordioTest, ReadError) {
-  TestReadError(RecordWriterOptions(), RecordReaderOptions());
-}
-
-TEST_F(RecordioTest, ReadErrorWithBuffering) {
-  RecordReaderOptions options;
-  options.buffer_size = 1 << 20;
-  TestReadError(RecordWriterOptions(), options);
-}
-
-TEST_F(RecordioTest, ReadErrorWithCompression) {
-  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
-                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+  Write("foo");
+  ForceError();
+  AssertHasSubstr(Read(), "Data loss");
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -340,6 +257,5 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
-}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index bf8dcf0988..984fbc2810 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,9 +25,8 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
-    : owns_input_stream_(owns_input_stream),
-      input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options)
+    : input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -42,14 +41,10 @@ ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
-  if (owns_input_stream_) {
-    delete input_stream_;
-  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
-  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 6099e2455d..9c7e14441c 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,13 +40,10 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents.
-  //
-  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
+  // contents. Does *not* take ownership of "input_stream".
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
-                  const ZlibCompressionOptions& zlib_options,
-                  bool owns_input_stream = false);
+                  const ZlibCompressionOptions& zlib_options);
 
   ~ZlibInputStream();
 
@@ -68,8 +65,7 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  const bool owns_input_stream_;
-  InputStreamInterface* input_stream_;
+  InputStreamInterface* input_stream_;  // Not owned
   size_t input_buffer_capacity_;        // Size of z_stream_input_
   size_t output_buffer_capacity_;       // Size of z_stream_output_
   char* next_unread_byte_;              // Next unread byte in z_stream_output_
-- 
GitLab


From b7cca088e90b4c2a28c1038980aa09240584e382 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 18:12:57 -0700
Subject: [PATCH 1175/1262] Respect any device filters in
 {Create,Delete}WorkerSessions().

This is another step towards enabling us to turn on explicit worker
sessions for all master sessions.

PiperOrigin-RevId: 193605565
---
 tensorflow/core/distributed_runtime/master.cc            | 6 +++++-
 tensorflow/core/distributed_runtime/master_env.h         | 3 ++-
 tensorflow/core/distributed_runtime/master_session.cc    | 9 +++++----
 tensorflow/core/distributed_runtime/master_session.h     | 6 +++++-
 .../core/distributed_runtime/rpc/grpc_server_lib.cc      | 4 +++-
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index f47502e844..288656e7f8 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -417,9 +417,13 @@ void Master::CreateSession(const CreateSessionRequest* req,
     SessionOptions options;
     options.config = req->config();
 
+    std::vector<string> filtered_worker_list;
+    DeviceFinder::GetRemoteWorkers(req->config().device_filters(), env_,
+                                   worker_cache, &filtered_worker_list);
+
     MasterSession* session = env_->master_session_factory(
         options, env_, std::move(remote_devices), std::move(worker_cache_ptr),
-        std::move(device_set));
+        std::move(device_set), std::move(filtered_worker_list));
 
     GraphDef* gdef =
         const_cast<CreateSessionRequest*>(req)->mutable_graph_def();
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index 178c5b40ee..16f4d93c8b 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -83,7 +83,8 @@ struct MasterEnv {
       SessionOptions, MasterEnv*,
       std::unique_ptr<std::vector<std::unique_ptr<Device>>>,
       std::unique_ptr<WorkerCacheInterface>,
-      std::unique_ptr<DeviceSet> device_set)>
+      std::unique_ptr<DeviceSet> device_set,
+      std::vector<string> filtered_worker_list)>
       master_session_factory;
 
   std::function<Status(const WorkerCacheFactoryOptions&,
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 7868200fb4..ebe350d313 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -416,6 +416,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
   if (!s.ok()) {
     for (Part& part : partitions_) {
       worker_cache_->ReleaseWorker(part.name, part.worker);
+      part.worker = nullptr;
     }
     return s;
   }
@@ -1119,6 +1120,7 @@ MasterSession::MasterSession(
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     std::unique_ptr<DeviceSet> device_set,
+    std::vector<string> filtered_worker_list,
     StatsPublisherFactory stats_publisher_factory)
     : session_opts_(opt),
       env_(env),
@@ -1126,6 +1128,7 @@ MasterSession::MasterSession(
       remote_devs_(std::move(remote_devs)),
       worker_cache_(std::move(worker_cache)),
       devices_(std::move(device_set)),
+      filtered_worker_list_(std::move(filtered_worker_list)),
       stats_publisher_factory_(std::move(stats_publisher_factory)),
       graph_version_(0),
       run_graphs_(5),
@@ -1183,9 +1186,8 @@ Status MasterSession::Create(GraphDef* graph_def,
 
 Status MasterSession::CreateWorkerSessions(
     const WorkerCacheFactoryOptions& options) {
-  std::vector<string> worker_names;
+  const std::vector<string> worker_names = filtered_worker_list_;
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  worker_cache->ListWorkers(&worker_names);
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
@@ -1263,8 +1265,7 @@ Status MasterSession::CreateWorkerSessions(
 
 Status MasterSession::DeleteWorkerSessions() {
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  std::vector<string> worker_names;
-  worker_cache->ListWorkers(&worker_names);
+  const std::vector<string>& worker_names = filtered_worker_list_;
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index a05419904f..ec34e20b79 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -52,6 +52,7 @@ class MasterSession : public core::RefCounted {
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
       std::unique_ptr<DeviceSet> device_set,
+      std::vector<string> filtered_worker_list,
       StatsPublisherFactory stats_publisher_factory);
 
   // Initialize the MasterSession for "def".  Must be called before Extend(),
@@ -130,6 +131,10 @@ class MasterSession : public core::RefCounted {
   // The device set used by this session.
   std::unique_ptr<DeviceSet> devices_;
 
+  // The (partial device) names of remote worker tasks that this
+  // session will contact.
+  const std::vector<string> filtered_worker_list_;
+
   StatsPublisherFactory stats_publisher_factory_;
 
   std::atomic_ulong last_access_time_usec_;
@@ -212,7 +217,6 @@ class MasterSession : public core::RefCounted {
   // workers.
   Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def);
 
-  // TODO(b/36574172): Always use Create/DeleteWorkerSession.
   bool should_delete_worker_sessions_ = false;
   Status DeleteWorkerSessions();
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index be19103582..488dcde9f5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -222,10 +222,12 @@ Status GrpcServer::Init(
           SessionOptions options, const MasterEnv* env,
           std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
           std::unique_ptr<WorkerCacheInterface> worker_cache,
-          std::unique_ptr<DeviceSet> device_set) {
+          std::unique_ptr<DeviceSet> device_set,
+          std::vector<string> filtered_worker_list) {
         options.config.MergeFrom(config);
         return new MasterSession(options, env, std::move(remote_devs),
                                  std::move(worker_cache), std::move(device_set),
+                                 std::move(filtered_worker_list),
                                  stats_factory);
       };
   master_env_.worker_cache_factory =
-- 
GitLab


From 4f8768319cfa56c25973cc66d920146ad454bd97 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 18:17:02 -0700
Subject: [PATCH 1176/1262] Optimize Graph function library.

PiperOrigin-RevId: 193605910
---
 tensorflow/core/grappler/optimizers/BUILD     |   4 +
 .../grappler/optimizers/function_optimizer.cc | 126 ++++++-
 .../grappler/optimizers/function_optimizer.h  |   6 +-
 .../optimizers/function_optimizer_test.cc     |  32 +-
 .../grappler/optimizers/meta_optimizer.cc     | 326 +++++++++++-------
 .../core/grappler/optimizers/meta_optimizer.h |  33 +-
 .../optimizers/meta_optimizer_test.cc         | 172 ++++++++-
 tensorflow/core/grappler/utils/functions.cc   |  12 +-
 tensorflow/core/grappler/utils/functions.h    |  40 ++-
 .../core/grappler/utils/functions_test.cc     |   8 +-
 10 files changed, 563 insertions(+), 196 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index a371186fe6..3ab8d8f584 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -518,11 +518,13 @@ cc_library(
         ":loop_optimizer",
         ":memory_optimizer",
         ":model_pruner",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/utils:colocation",
+        "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
@@ -539,9 +541,11 @@ tf_cuda_cc_test(
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index d008a9719f..950933b933 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -75,12 +76,10 @@ string UniqueSpecializedFunctionName(const FunctionDef& func,
 
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(const GrapplerItem& item,
-                                    RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level),
-        function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
-                                                    item.graph.library())) {
-    InitializeInlinedFunctions(item);
+  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
+                                    const GrapplerItem& item)
+      : function_library_(OpRegistry::Global(), item.graph.library()) {
+    InitializeInlinedFunctions(opt_level, item);
   }
 
   const FunctionLibraryDefinition& function_library() const {
@@ -101,8 +100,9 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeInlinedFunctions(const GrapplerItem& item) {
-    bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
+                                  const GrapplerItem& item) {
+    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
 
     for (const FunctionDef& func : item.graph.library().function()) {
       // Can't create IdentityN nodes with no input or output: skip these
@@ -120,7 +120,6 @@ class FunctionOptimizerContext {
     }
   }
 
-  RewriterConfig::Toggle opt_level_;
   FunctionLibraryDefinition function_library_;
   // Functions that can be inlined into optimized graph.
   std::unordered_map<string, const FunctionDef*> inlined_functions_;
@@ -128,9 +127,93 @@ class FunctionOptimizerContext {
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
+// Return trimmed FunctionDefLibrary with functions that are reachable from
+// the optimized graph.
+FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib,
+                                       const GraphDef& optimized_graph) {
+  // Functions that are reachable from the optimized graph.
+  std::unordered_set<string> keep_funcs;
+
+  std::vector<const FunctionDef*> func_queue;
+  func_queue.reserve(flib.num_functions());
+
+  // Add registered and not already processed functions to the queue by name.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && keep_funcs.find(func_name) == keep_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Find all the functions that are reachable from the given node.
+  const auto add_node_to_func_queue = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  const auto& graph_nodes = optimized_graph.node();
+  std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue);
+
+  // Process all reachable functions.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    keep_funcs.insert(func_name);
+
+    // Find all the functions that called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  FunctionDefLibrary lib;
+  for (const string& func_name : keep_funcs) {
+    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
+    *lib.add_function() = *func;
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef* gd = lib.add_gradient();
+      gd->set_function_name(func_name);
+      gd->set_gradient_func(grad_func_name);
+    }
+  }
+
+  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
+          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
+
+  return lib;
+}
+
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
+  VLOG(2) << "Specialize function instantiation: "
+          << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -141,20 +224,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   // TODO(ezhulenev): Push down const inputs and known input shapes.
-  FunctionDef specialized;
-  TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized));
+  FunctionDef specialized_func;
+  TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
 
   // Find a name for specialized function.
   const string specialized_func_name =
       UniqueSpecializedFunctionName(func, func_node, flib);
 
-  specialized.mutable_signature()->set_name(specialized_func_name);
-  auto* specialized_attr = specialized.mutable_attr();
+  specialized_func.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized_func.mutable_attr();
   (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
 
   // Add specialized function to the library.
   TF_RETURN_IF_ERROR(
-      ctx->mutable_function_library().AddFunctionDef(specialized));
+      ctx->mutable_function_library().AddFunctionDef(specialized_func));
 
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
@@ -226,6 +309,8 @@ Status HookInlinedFunctionOutputs(
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                       const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
+  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -359,6 +444,8 @@ class SymbolicGradientEnv {
 
 Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
                               GraphDef* inlined_graph) {
+  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
+
   GraphDef graph_def;
 
   // Create a node to anchor the gradient inputs
@@ -454,13 +541,16 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
+  VLOG(2) << "Optimize function library: id=" << item.id;
+
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
+    VLOG(3) << "Skip Grappler item with empty function library";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  FunctionOptimizerContext ctx(item, opt_level_);
+  FunctionOptimizerContext ctx(opt_level_, item);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
@@ -506,9 +596,11 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     *optimized_graph->add_node() = node;
   }
 
-  // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  *optimized_graph->mutable_library() =
+      options_.enable_trim_function_library
+          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
+          : ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index c555fadf83..e307b4e533 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,8 +26,9 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
-  ~FunctionOptimizer() override {}
+  explicit FunctionOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~FunctionOptimizer() override = default;
 
   string name() const override { return "function_optimizer"; };
 
@@ -44,6 +45,7 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_function_inlining = true;
     bool enable_function_specialization = true;
     bool enable_symbolic_gradient_inlining = true;
+    bool enable_trim_function_library = true;
   };
 
   RewriterConfig::Toggle opt_level_;
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fb006d4868..6147e8a27c 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -31,20 +31,8 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  void DisableAll(FunctionOptimizer* optimizer) {
-    optimizer->options_.enable_function_inlining = false;
+  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
     optimizer->options_.enable_function_specialization = false;
-    optimizer->options_.enable_symbolic_gradient_inlining = false;
-  }
-
-  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_inlining = true;
-  }
-
-  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_specialization = true;
   }
 };
 
@@ -352,7 +340,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionInlining(&optimizer);
+  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
 
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -626,14 +614,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionSpecialization(&optimizer);
 
-  // Mark XTimesTwo as noinline
+  // Mark XTimesTwo as noinline.
   FunctionDef x_times_two = test::function::XTimesTwo();
   (*x_times_two.mutable_attr())["_noinline"].set_b(true);
   std::vector<FunctionDef> function_library = {x_times_two};
 
-  // Build a graph to compute y = XTimesTwo(x)
+  // Build a graph to compute y = XTimesTwo(x).
   GrapplerItem item;
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
@@ -644,12 +631,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  // Make sure that specialized function was added to the library
-  EXPECT_EQ(2, output.library().function_size());
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  EXPECT_EQ(1, output.library().function_size());
   EXPECT_EQ("XTimesTwo_specialized_for_y",
-            output.library().function(1).signature().name());
+            output.library().function(0).signature().name());
 
-  // And 'y' node is calling specialized function
+  // And 'y' node is calling specialized function.
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y" && count++) {
@@ -658,7 +646,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   }
   EXPECT_EQ(1, count);
 
-  // And that graph evaluation yields the same result
+  // And that graph evaluation yields the same result.
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 558b8a77e8..22799311bc 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils/colocation.h"
+#include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -36,6 +38,9 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+constexpr int kDefaultNumberOfIterations = 1;
+
 int64 NumEdges(const GraphDef& graph) {
   int64 num_edges = 0;
   for (const auto& node : graph.node()) {
@@ -50,144 +55,138 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
                          NumEdges(after), " edges (",
                          NumEdges(after) - NumEdges(before), ")");
 }
+
+int NumIterations(const RewriterConfig& cfg) {
+  return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+             ? kDefaultNumberOfIterations
+             : cfg.meta_optimizer_iterations();
+}
+
+// Check if optimizer is allowed to run only once.
+int IsRunOnceOptimizer(const string& name) { return name == "layout"; }
+
 }  // namespace
 
-std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
-    const string& optimizer) {
-  std::unique_ptr<GraphOptimizer> graph_optimizer;
-  if (optimizer == "pruning") {
-    graph_optimizer.reset(new ModelPruner());
-  }
-  if (optimizer == "function") {
-    graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization()));
+std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
+    const string& optimizer) const {
+#define MK_OPT(NAME, VALUE) \
+  if (optimizer == NAME) return std::unique_ptr<GraphOptimizer>(VALUE)
+
+  MK_OPT("pruning", new ModelPruner());
+  MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
+  MK_OPT("constfold", new ConstantFolding(cpu_device_));
+  MK_OPT("layout", new LayoutOptimizer());
+  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
+  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization()));
+  MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization()));
+  MK_OPT("debug_stripper", new DebugStripper());
+
+  return std::unique_ptr<GraphOptimizer>();
+#undef MK_OPT
+}
+
+Status MetaOptimizer::InitializeOptimizers(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  if (!cfg_.disable_model_pruning()) {
+    optimizers->emplace_back(new ModelPruner());
   }
-  if (optimizer == "constfold") {
-    graph_optimizer.reset(new ConstantFolding(cpu_device_));
+  if (cfg_.function_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new FunctionOptimizer(cfg_.function_optimization()));
   }
-  if (optimizer == "layout") {
-    graph_optimizer.reset(new LayoutOptimizer());
+  if (cfg_.debug_stripper() == RewriterConfig::ON) {
+    optimizers->emplace_back(new DebugStripper());
   }
-  if (optimizer == "memory") {
-    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
+  if (cfg_.constant_folding() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
   }
-  if (optimizer == "arithmetic") {
-    graph_optimizer.reset(
+  if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
   }
-  if (optimizer == "autoparallel") {
-    graph_optimizer.reset(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
-  }
-  if (optimizer == "loop") {
-    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
+  if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization()));
   }
-  if (optimizer == "dependency") {
-    graph_optimizer.reset(
+  if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
         new DependencyOptimizer(cfg_.dependency_optimization()));
   }
-  if (optimizer == "debug_stripper") {
-    graph_optimizer.reset(new DebugStripper());
+  if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LayoutOptimizer());
+  }
+  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+    if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
+      optimizers->emplace_back(
+          // Use the default target node name prefix "gradients/"
+          new MemoryOptimizer(cfg_.memory_optimization()));
+    } else {
+      optimizers->emplace_back(
+          new MemoryOptimizer(cfg_.memory_optimization(),
+                              cfg_.memory_optimizer_target_node_name_scope()));
+    }
+  }
+  if (cfg_.auto_parallel().enable()) {
+    optimizers->emplace_back(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
   }
-  return graph_optimizer;
+  return Status::OK();
 }
 
-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
-  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
-  if (cfg_.optimizers().empty()) {
-    if (!cfg_.disable_model_pruning()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
-    }
-    if (cfg_.function_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new FunctionOptimizer(cfg_.function_optimization())));
-    }
-    if (cfg_.debug_stripper() == RewriterConfig::ON) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
-    }
-    if (cfg_.constant_folding() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
-    }
-    if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
+Status MetaOptimizer::InitializeOptimizersByName(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  for (const string& optimizer_name : cfg_.optimizers()) {
+    auto optimizer = MakeNewOptimizer(optimizer_name);
+    if (optimizer) {
+      VLOG(2) << "Registered default graph optimizer: " << optimizer_name;
+      optimizers->push_back(std::move(optimizer));
+      continue;
     }
-    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new LoopOptimizer(cfg_.loop_optimization())));
-    }
-    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new DependencyOptimizer(cfg_.dependency_optimization())));
-    }
-    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
-    }
-    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
-      if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
-        optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-            // Use the default target node name prefix "gradients/"
-            new MemoryOptimizer(cfg_.memory_optimization())));
-      } else {
-        optimizers.push_back(
-            std::unique_ptr<GraphOptimizer>(new MemoryOptimizer(
-                cfg_.memory_optimization(),
-                cfg_.memory_optimizer_target_node_name_scope())));
-      }
-    }
-    if (cfg_.auto_parallel().enable()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new AutoParallel(cfg_.auto_parallel().num_replicas())));
-    }
-  } else {
-    const std::set<string> available_optimizers = {
-        "pruning",    "function",      "constfold",  "layout",
-        "memory",     "autoparallel",  "arithmetic", "loop",
-        "dependency", "debug_stripper"};
-    std::vector<string> custom_optimizer_names;
-    for (const auto& optimizer_name : cfg_.optimizers()) {
-      if (available_optimizers.find(optimizer_name) !=
-          available_optimizers.end()) {
-        optimizers.push_back(NewOptimizer(optimizer_name));
-      } else {
-        custom_optimizer_names.push_back(optimizer_name);
-      }
-    }
-    // Now run the custom optimizers.
-    for (const auto& optimizer_name : custom_optimizer_names) {
-      std::unique_ptr<CustomGraphOptimizer> opt =
-          CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
-      if (opt == nullptr) continue;
-      TF_RETURN_IF_ERROR(opt->Init());
-      optimizers.push_back(std::move(opt));
+
+    auto custom_optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+
+    if (custom_optimizer) {
+      VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
+      TF_RETURN_IF_ERROR(custom_optimizer->Init());
+      optimizers->push_back(std::move(custom_optimizer));
+    } else {
+      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
     }
   }
+  return Status::OK();
+}
+
+Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                                    GraphDef* optimized_graph) {
+  VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id;
+
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  bool register_by_name = !cfg_.optimizers().empty();
+  TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers)
+                                      : InitializeOptimizers(&optimizers));
 
   if (optimizers.empty()) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  // Some optimizers should be run only once.
-  const std::set<string> run_once_optimizers = {"layout"};
-  bool already_optimized = false;
-  const int num_iterations =
-      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
-          ? 1
-          : cfg_.meta_optimizer_iterations();
+  // Invariant: optimized_graph contains the most recently optimized version of
+  // the graph.
   GrapplerItem optimized_item = item;
   optimized_graph->Swap(&optimized_item.graph);
-  for (int iteration = 0; iteration < num_iterations; ++iteration) {
-    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+
+  GraphOptimizationResult optimization_result(item.id);
+
+  for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
+    VLOG(4) << "Starting optimization iteration " << iteration + 1;
+
     for (const auto& optimizer : optimizers) {
-      // Invariant: optimized_graph contains the most recently optimized
-      // version of the graph.
-      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
-        continue;
-      }
+      // Some optimizers can run only once.
+      if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
+
       uint64 start_us = Env::Default()->NowMicros();
       // This swaps the current optimized_graph into optimized item and
       // resets optimized_graph to an empty graph.
@@ -195,45 +194,114 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       *optimized_graph = GraphDef();
       Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
-
       uint64 end_us = Env::Default()->NowMicros();
-      float duration_ms = (end_us - start_us) / 1000.0f;
+
       string result;
       if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
-                << status.ToString();
         optimized_graph->Swap(&optimized_item.graph);
         result = status.ToString();
       } else {
-        already_optimized = true;
+        optimization_result.is_optimized = true;
+        float duration_ms = (end_us - start_us) / 1000.0f;
         result = strings::StrCat(
-            optimizer->name(), ": ",
             PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
             ", time = ", duration_ms, "ms.");
       }
-      result_.emplace_back(optimizer->name(), result);
-      VLOG(1) << result;
+      VLOG(4) << optimizer->name() << ": " << result;
+
+      OptimizerResult optimizer_result{optimizer->name(), result};
+      optimization_result.results.push_back(optimizer_result);
     }
   }
 
-  if (already_optimized) {
+  // Record graph optimization result.
+  optimization_results_.push_back(optimization_result);
+
+  if (optimization_result.is_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
-    // Make sure that the optimizers preserved the graph version and library.
-    DCHECK_GE(optimized_graph->library().function_size(),
-              item.graph.library().function_size());
-    DCHECK_GE(optimized_graph->library().gradient_size(),
-              item.graph.library().gradient_size());
+    // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
+
+  return Status::OK();
+}
+
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  optimization_results_.clear();
+
+  // 1. Optimize main graph
+  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
+
+  // 2. Optimize function library
+  FunctionLibraryDefinition flib(OpRegistry::Global(),
+                                 optimized_graph->library());
+
+  // Optimize each function only once.
+  std::unordered_set<string> optimized_funcs;
+  bool optimize_function_library = true;
+
+  while (optimize_function_library) {
+    optimize_function_library = false;
+
+    for (const FunctionDef& func : optimized_graph->library().function()) {
+      const string& func_name = func.signature().name();
+
+      // Skip already optimized functions.
+      if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue;
+
+      // Skip parametrized functions (function type or body is defined only at
+      // function call time by caller node attributes).
+      if (IsParametrized(func)) continue;
+
+      VLOG(3) << "Optimize function: function=" << func_name;
+
+      // Function optimization might specialize nested function calls, so we
+      // have to reset the flag and do at least one more pass over the library.
+      optimize_function_library = true;
+      optimized_funcs.insert(func_name);
+
+      // Make a GrapplerItem from a FunctionDef.
+      GrapplerFunctionItem func_item;
+      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item));
+
+      // Optimize function body graph.
+      GraphDef optimized_func_graph;
+      TF_RETURN_IF_ERROR(
+          OptimizeGraph(cluster, func_item, &optimized_func_graph));
+
+      // Function body optimization might have created new specialized
+      // functions, add them to the library.
+      TF_RETURN_IF_ERROR(flib.AddLibrary(optimized_func_graph.library()));
+
+      // Convert optimized graph back to FunctionDef.
+      FunctionDef optimized_func;
+      func_item.SwapFunctionBody(std::move(optimized_func_graph));
+      TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func));
+
+      // Replace optimized function with a new FunctionDef.
+      TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name));
+      TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func));
+    }
+
+    // If optimized at least one function, update the graph library.
+    if (optimize_function_library) {
+      *optimized_graph->mutable_library() = flib.ToProto();
+    }
+  }
+
   return Status::OK();
 }
 
 void MetaOptimizer::PrintResult() {
-  for (const auto& result : result_) {
-    LOG(INFO) << "Return status of optimizer " << result.first << ": "
-              << result.second;
+  for (const GraphOptimizationResult& graph_result : optimization_results_) {
+    LOG(INFO) << "Optimization results for grappler item: " << graph_result.id;
+    for (const OptimizerResult& result : graph_result.results) {
+      LOG(INFO) << "Return status of optimizer " << result.optimizer_name
+                << ": " << result.result;
+    }
   }
 }
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 382cfe51d4..7cf9a40c2d 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer {
  public:
   MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
       : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override {}
+  ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
 
@@ -43,10 +43,37 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
+  std::unique_ptr<GraphOptimizer> MakeNewOptimizer(
+      const string& optimizer) const;
+
+  // Initialize active optimizers from RewriterConfig toggles.
+  Status InitializeOptimizers(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+  // Initialize active optimizers from RewriterConfig optimizer names.
+  Status InitializeOptimizersByName(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+
+  // Run optimization pass over a single GrapplerItem. Meta optimizer might run
+  // multiple such passes: 1) for the main graph 2) for the function library
+  Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                       GraphDef* optimized_graph);
+
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
-  std::vector<std::pair<string, string>> result_;
+
+  struct OptimizerResult {
+    string optimizer_name;
+    string result;
+  };
+
+  struct GraphOptimizationResult {
+    explicit GraphOptimizationResult(const string& id) : id(id) {}
+    string id;
+    bool is_optimized = false;
+    std::vector<OptimizerResult> results;
+  };
+
+  std::vector<GraphOptimizationResult> optimization_results_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index d9a386b9be..8793ad9633 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -16,11 +16,14 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -28,6 +31,8 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+constexpr char kDevice[] = "/device:CPU:0";
+
 class TestOptimizer : public CustomGraphOptimizer {
  public:
   static void SetOptimized(const bool flag_value) { optimized_ = flag_value; }
@@ -56,7 +61,9 @@ bool TestOptimizer::optimized_;
 
 REGISTER_GRAPH_OPTIMIZER(TestOptimizer);
 
-TEST(MetaOptimizerTest, RunsCustomOptimizer) {
+class MetaOptimizerTest : public GrapplerTest {};
+
+TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
@@ -72,7 +79,7 @@ TEST(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
-TEST(MetaOptimizerTest, RunOptimizersTwice) {
+TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
@@ -86,6 +93,167 @@ TEST(MetaOptimizerTest, RunOptimizersTwice) {
   TF_EXPECT_OK(status);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
+  using test::function::NDef;
+
+  // Enable ony function optimization.
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.set_function_optimization(RewriterConfig::ON);
+  rewriter_config.add_optimizers("function");
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+
+  // Define function library:
+  //
+  //   MyMul(x, y)    = x * y
+  //  *MySquare(x)    = MyMul(x, x)
+  //  *MyQuadratic(x) = MySquare(MySquare(x))
+  //
+  //  * - marked as noinline
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "my_mul:z:0"}});
+  (*square_func.mutable_attr())["_noinline"].set_b(true);
+
+  FunctionDef quadratic_func = FunctionDefHelper::Create(
+      "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
+       {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "quadratic:z:0"}});
+  (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
+
+  // Tensorflow graph:
+  //
+  //   a = tf.Placeholder(tf.float);
+  //   b = tf.Placeholder(tf.int32);
+  //
+  //   square = MySquare(a);        // a^2
+  //   quadratic = MyQuadratic(b);  // b^4
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
+       // Calls into function library
+       NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("quadratic", "MyQuadratic", {"b"}, {{"T", DT_INT32}}, kDevice),
+       // Forward outputs
+       NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func, quadratic_func});
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized and optimized functions should be added to the graph.
+  EXPECT_EQ(6, optimized_flib.num_functions());
+
+  // MyQuadratic should be specialized once:
+  //   0. 'quadratic' node in the main graph
+  const string optimized_0 = "MyQuadratic_specialized_for_quadratic";
+
+  // MySquare should be specialized and optimized for 3 instantiations:
+  //   1. 'square' node in the main graph
+  //   2. 'square' node in the MyQuadratic specialization
+  //   3. 'quadratic' node in the MyQuadratic specialization
+
+  const string optimized_1 = "MySquare_specialized_for_square";
+  const string optimized_2 = "MySquare_specialized_for_square_1";
+  const string optimized_3 = "MySquare_specialized_for_quadratic";
+
+  const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
+  const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
+  const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
+  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
+
+  ASSERT_NE(optimized_func_0, nullptr);
+  ASSERT_NE(optimized_func_1, nullptr);
+  ASSERT_NE(optimized_func_2, nullptr);
+  ASSERT_NE(optimized_func_3, nullptr);
+
+  // Graph should call optimized function.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square" && count++) {
+      EXPECT_EQ("MySquare_specialized_for_square", node.op());
+    } else if (node.name() == "quadratic" && count++) {
+      EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  // Specialized MySquare should call specialized functions.
+  count = 0;
+  for (const NodeDef& node : optimized_func_0->node_def()) {
+    if (node.name() == "square" && count++) {
+      EXPECT_EQ(optimized_2, node.op());
+    } else if (node.name() == "quadratic" && count++) {
+      EXPECT_EQ(optimized_3, node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  const std::vector<const FunctionDef*> optimized_funcs = {
+      optimized_func_1, optimized_func_1, optimized_func_3};
+
+  // MyMul should be inlined into all optimized versions of MySquare.
+  for (const FunctionDef* optimized_func : optimized_funcs) {
+    count = 0;
+    for (const NodeDef& node : optimized_func->node_def()) {
+      if (node.name() == "my_mul/inlined_inputs" && count++) {
+        EXPECT_EQ("IdentityN", node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("x:0", node.input(0));
+        EXPECT_EQ("x:0", node.input(1));
+      } else if (node.name() == "my_mul/x" && count++) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0));
+      } else if (node.name() == "my_mul/y" && count++) {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0));
+      } else if (node.name() == "my_mul/mul" && count++) {
+        EXPECT_EQ("Mul", node.op());
+        EXPECT_EQ(2, node.input_size());
+        EXPECT_EQ("my_mul/x:output:0", node.input(0));
+        EXPECT_EQ("my_mul/y:output:0", node.input(1));
+      } else if (node.name() == "my_mul" && count++) {
+        EXPECT_EQ("IdentityN", node.op());
+        EXPECT_EQ(1, node.input_size());
+        EXPECT_EQ("my_mul/mul:z:0", node.input(0));
+      }
+      EXPECT_TRUE(node.device().empty());
+    }
+    EXPECT_EQ(5, count);
+  }
+
+  item.fetch = {"out_s", "out_q"};
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<int>(4));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 638fe1999a..790809bc67 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -545,6 +545,12 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   return Status::OK();
 }
 
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
+}
+
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
@@ -560,9 +566,9 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func) {
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index ab369bcad7..5e8b6c6960 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -38,7 +38,8 @@ using AttrValueMap = std::unordered_map<string, AttrValue>;
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized inputs?
   string input_name;                 // name of the function input argument
   DataType data_type;                // input data type
   bool is_ref;                       // if true, inputs are required to be refs
@@ -53,7 +54,8 @@ struct InputArgExpansion {
 // tensors of a function body nodes and a resolved output data type
 struct OutputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized outputs?
   string output_name;                  // name of the function output argument
   DataType data_type;                  // output data type
   bool is_ref;                         // if true, outputs are refs
@@ -186,13 +188,6 @@ bool HasParametrizedBody(const FunctionDef& func);
 // Check if function has parametrized type or body.
 bool IsParametrized(const FunctionDef& func);
 
-// Make a GrapplerFunctionItem from the function definition and attributes.
-// Return error if the given function def cannot be converted.
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
-    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
-
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.  Use function library definition to
 // lookup function body nodes output names and ranges.
@@ -200,11 +195,28 @@ Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity);
 
-// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
-// library definition to lookup function body nodes output names and ranges.
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func);
+// Make a GrapplerFunctionItem from the function definition and function
+// instantiation attributes (caller node attributes). Returns error if the given
+// function def cannot be converted (e.g. not all attributes are defined).
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Make a GrapplerFunction item from the function definition. Function must be
+// fully defined (no type or body parametrization).
+// TODO(ezhulenev): Support parametrized functions without fully defined
+// instantiation attributes? Do we ever want to optimize parametrized function
+// without specializing it to it's instantiation attributes (at least types)?
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item);
+
+// Make a FunctionDef from the GrapplerFunctionItem. Use function library
+// definition to lookup function body nodes output names and ranges.
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 54d235a8a4..6dfd49b943 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
-TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -550,7 +550,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Input and output types are resolved based on instantiation attributes.
   EXPECT_EQ("x", specialized.signature().input_arg(0).name());
@@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
   EXPECT_EQ(2, count);
 }
 
-TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   using test::function::NDef;
 
   FunctionDef mul_func = FunctionDefHelper::Create(
@@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
   // Replace function body with identity function
   item.SwapFunctionBody(std::move(id_func_body));
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Check that graph body was updated.
   int count = 0;
-- 
GitLab


From 39a2787272f948a043a1ca103159307cfb0f7248 Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 09:20:38 +0800
Subject: [PATCH 1177/1262] Fix incorrect math equation renderings broken by
 backtick (#18386)

* Fix incorrect `` typo format

* Remove breaking ``` for math equations

* fix one more typo

* fix more math equation broken ` typos in py
---
 .../bayesflow/python/ops/monte_carlo_impl.py  | 22 ++++++---------
 .../factorization/python/ops/kmeans.py        |  4 +--
 .../python/contrib.bayesflow.monte_carlo.md   | 28 ++++++++-----------
 tensorflow/python/ops/nn_ops.py               |  2 +-
 4 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 48ff083532..032b859d46 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,15 +44,13 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`.
+  r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\).
 
-  With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns
+  With \\(p(z) := exp^{log_p(z)}\\), this `Op` returns
 
-  ```
   \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
   \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
   \\(=       E_p[f(Z)]\\)
-  ```
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -121,14 +119,12 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`,
+  With \\(p(z) := exp^{log_p(z)}\\), and \\(f(z) = exp{log_f(z)}\\),
   this `Op` returns
 
-  ```
   \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
   \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
   \\(=       Log[E_p[f(Z)]]\\)
-  ```
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -196,13 +192,11 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`.
+  """Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\).
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
-  ```none
   \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
-  ```
 
   where:
 
@@ -216,8 +210,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where
-  `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`.
+  grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n } where
+  S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\).
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +290,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `\\(E_p[f(X)]\\)`.  A batch of samples should be indexed by `axis`
+      \\(E_p[f(X)]\\).  A batch of samples should be indexed by `axis`
       dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
@@ -317,7 +311,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `\\(E_p[f(X)]\\)`.
+      of \\(E_p[f(X)]\\).
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index bfe338c9f9..9ffdd3ba5e 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -374,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `\\(||u - v||_2\\)`
+             between vectors `u` and `v` is defined as \\(||u - v||_2\\)
              which is the square root of the sum of the absolute squares of
              the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`.
+             `u` and `v` is defined as \\(1 - (u . v) / (||u||_2 ||v||_2)\\).
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
index f3db5857ae..74fe4a323a 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
@@ -6,43 +6,39 @@ Monte Carlo integration and helpers.
 ## Background
 
 Monte Carlo integration refers to the practice of estimating an expectation with
-a sample mean.  For example, given random variable `Z in \\(R^k\\)` with density `p`,
+a sample mean.  For example, given random variable Z in \\(R^k\\) with density `p`,
 the expectation of function `f` can be approximated like:
 
-```
 $$E_p[f(Z)] = \int f(z) p(z) dz$$
 $$          ~ S_n
           := n^{-1} \sum_{i=1}^n f(z_i),  z_i\ iid\ samples\ from\ p.$$
-```
 
-If `\\(E_p[|f(Z)|] < infinity\\)`, then `\\(S_n\\) --> \\(E_p[f(Z)]\\)` by the strong law of large
-numbers.  If `\\(E_p[f(Z)^2] < infinity\\)`, then `\\(S_n\\)` is asymptotically normal with
-variance `\\(Var[f(Z)] / n\\)`.
+If \\(E_p[|f(Z)|] < infinity\\), then \\(S_n\\) --> \\(E_p[f(Z)]\\) by the strong law of large
+numbers.  If \\(E_p[f(Z)^2] < infinity\\), then \\(S_n\\) is asymptotically normal with
+variance \\(Var[f(Z)] / n\\).
 
 Practitioners of Bayesian statistics often find themselves wanting to estimate
-`\\(E_p[f(Z)]\\)` when the distribution `p` is known only up to a constant.  For
+\\(E_p[f(Z)]\\) when the distribution `p` is known only up to a constant.  For
 example, the joint distribution `p(z, x)` may be known, but the evidence
-`\\(p(x) = \int p(z, x) dz\\)` may be intractable.  In that case, a parameterized
-distribution family `\\(q_\lambda(z)\\)` may be chosen, and the optimal `\\(\lambda\\)` is the
-one minimizing the KL divergence between `\\(q_\lambda(z)\\)` and
-`\\(p(z | x)\\)`.  We only know `p(z, x)`, but that is sufficient to find `\\(\lambda\\)`.
+\\(p(x) = \int p(z, x) dz\\) may be intractable.  In that case, a parameterized
+distribution family \\(q_\lambda(z)\\) may be chosen, and the optimal \\(\lambda\\) is the
+one minimizing the KL divergence between \\(q_\lambda(z)\\) and
+\\(p(z | x)\\).  We only know `p(z, x)`, but that is sufficient to find \\(\lambda\\).
 
 
 ## Log-space evaluation and subtracting the maximum
 
 Care must be taken when the random variable lives in a high dimensional space.
-For example, the naive importance sample estimate `\\(E_q[f(Z) p(Z) / q(Z)]\\)`
-involves the ratio of two terms `\\(p(Z) / q(Z)\\)`, each of which must have tails
-dropping off faster than `\\(O(|z|^{-(k + 1)})\\)` in order to have finite integral.
+For example, the naive importance sample estimate \\(E_q[f(Z) p(Z) / q(Z)]\\)
+involves the ratio of two terms \\(p(Z) / q(Z)\\), each of which must have tails
+dropping off faster than \\(O(|z|^{-(k + 1)})\\) in order to have finite integral.
 This ratio would often be zero or infinity up to numerical precision.
 
 For that reason, we write
 
-```
 $$Log E_q[ f(Z) p(Z) / q(Z) ]$$
 $$   = Log E_q[ \exp\{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C\} ] + C,$$  where
 $$C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].$$
-```
 
 The maximum value of the exponentiated term will be 0.0, and the expectation
 can be evaluated in a stable manner.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a8d0293d13..cd07550d2e 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1155,7 +1155,7 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
   Returns:
     A `Tensor` with the same type as `value`.
-    Output shape with `'VALID`` padding is:
+    Output shape with `'VALID'` padding is:
 
         [batch, height - 2 * (filter_width - 1),
          width - 2 * (filter_height - 1), out_channels].
-- 
GitLab


From a734919fd8fd6d74edf1e7c3abec3ee11fec83fd Mon Sep 17 00:00:00 2001
From: Jiajia Li <jiajia.li@intel.com>
Date: Fri, 20 Apr 2018 09:22:26 +0800
Subject: [PATCH 1178/1262] Fix the error looking for libhdfs.so, Mac OS using
 libhdfs.dylib (#18486)

---
 tensorflow/core/platform/hadoop/hadoop_file_system.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 9a71fbe2b7..a8cb40502c 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -109,6 +109,8 @@ class LibHDFS {
 // in the libhdfs documentation.
 #if defined(PLATFORM_WINDOWS)
     const char* kLibHdfsDso = "hdfs.dll";
+#elif defined(MACOS) || defined(TARGET_OS_MAC)
+    const char* kLibHdfsDso = "libhdfs.dylib";
 #else
     const char* kLibHdfsDso = "libhdfs.so";
 #endif
-- 
GitLab


From 256aad5324d163c028da0dc0318c3e00cf2fc3ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 18:29:00 -0700
Subject: [PATCH 1179/1262] [XLA] Fix a bug in the name_uniquer.

The problem happens because the name_uniquer stripped away the numeric suffix if it <=0. The solution is, if there was a numeric suffix, the result should also have a numeric suffix.

PiperOrigin-RevId: 193606838
---
 tensorflow/compiler/xla/service/name_uniquer.cc      | 11 ++++++-----
 tensorflow/compiler/xla/service/name_uniquer_test.cc | 11 +++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index 7d8c05fffa..f74bcb0b79 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -53,17 +53,18 @@ NameUniquer::NameUniquer(const string& separator) {
 }
 
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
-  string root = prefix.empty() ? "name" : prefix.ToString();
-  root = GetSanitizedName(root);
+  string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString());
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
+  bool has_numeric_suffix = false;
+  int64 numeric_suffix = 0;
   size_t separator_index = root.rfind(separator_);
   if (separator_index != string::npos && (separator_index > 0) &&
       (separator_index < root.size() - 1)) {
     string after_suffix = root.substr(separator_index + 1);
-    int64 numeric_suffix;
     if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+      has_numeric_suffix = true;
       // Remove numeric suffix from root.
       root = root.substr(0, separator_index);
       // Update count to at least the numeric suffix value to avoid future
@@ -71,11 +72,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
       generated_names_[root] = std::max(generated_names_[root], numeric_suffix);
     }
   }
-
   int64* count = &(generated_names_[root]);
   if (*count == 0) {
     *count = 1;
-    return root;
+    return has_numeric_suffix ? tensorflow::strings::StrCat(root, separator_, 0)
+                              : root;
   } else {
     tensorflow::strings::StrAppend(&root, separator_, *count);
     // Increment lookup under old 'root' name.
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 4258cf1687..2ec255558c 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -57,11 +57,18 @@ TEST_F(NameUniquerTest, NumericSuffixes) {
   EXPECT_EQ("foo.55", uniquer.GetUniqueName("foo"));
   EXPECT_EQ("foo.55.1", uniquer.GetUniqueName("foo.55.1"));
   EXPECT_EQ("foo.55.2", uniquer.GetUniqueName("foo.55.1"));
-  EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000"));
+  EXPECT_EQ("bar.0", uniquer.GetUniqueName("bar.-1000"));
   EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000"));
   EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1"));
 }
 
+TEST_F(NameUniquerTest, PrefixHasSuffix) {
+  NameUniquer uniquer(".");
+
+  EXPECT_EQ("foo.11.0", uniquer.GetUniqueName("foo.11.0"));
+  EXPECT_EQ("foo.11", uniquer.GetUniqueName("foo.11"));
+}
+
 TEST_F(NameUniquerTest, Sanitize) {
   NameUniquer uniquer("_");
 
@@ -73,7 +80,7 @@ TEST_F(NameUniquerTest, Sanitize) {
   EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo"));
 
   // Invalid characters will be replaced with '_'.
-  EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000"));
+  EXPECT_EQ("bar_0", uniquer.GetUniqueName("bar<-1000"));
   EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000"));
   EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1"));
 
-- 
GitLab


From 052c3863cf8b901303a1a32e82b6525dc6ea6dbd Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 19 Apr 2018 18:45:47 -0700
Subject: [PATCH 1180/1262] Internal change.

PiperOrigin-RevId: 193608140
---
 tensorflow/compiler/xla/python/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 0b9333b406..ecb87bd889 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -8,7 +8,6 @@ py_library(
     name = "xla_client",
     srcs = ["xla_client.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_oss"],
     visibility = ["//visibility:public"],
     deps = [
         ":pywrap_xla",
@@ -21,6 +20,7 @@ py_test(
     srcs = ["xla_client_test.py"],
     main = "xla_client_test.py",
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     deps = [
         ":xla_client",
         "//tensorflow/python:platform_test",
-- 
GitLab


From 6e2df5e471295cd32f9887d76e6ddbf1b4e2a11a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 19:03:03 -0700
Subject: [PATCH 1181/1262] Automated g4 rollback of changelist 193593761

PiperOrigin-RevId: 193609407
---
 tensorflow/compiler/xla/service/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index d5d09bd8a3..9009cbf845 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -699,7 +699,6 @@ cc_library(
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/stream_executor:stream_executor_impl",
     ],
 )
 
-- 
GitLab


From b001827146ff95c9e0ce5668c85d8cc2daf6b78d Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Thu, 19 Apr 2018 19:11:37 -0700
Subject: [PATCH 1182/1262] Support variable parameter structure in TPU
 distribution strategy.

TPUStrategy is added to a few more tests.

There appears to be an issue with the batch norm test in minimize_loss_test where the moving averages stay at 0.  I'm trying to resolve that separately as the next CL.

PiperOrigin-RevId: 193610264
---
 tensorflow/contrib/distribute/python/BUILD    | 18 +++--
 .../distribute/python/minimize_loss_test.py   | 19 ++++-
 .../distribute/python/single_loss_example.py  |  7 +-
 .../contrib/distribute/python/tpu_strategy.py | 70 +++++++++++--------
 .../contrib/distribute/python/values.py       | 34 +++++++--
 5 files changed, 104 insertions(+), 44 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 837a1f1348..c2834d8226 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -231,15 +231,14 @@ py_library(
     srcs = ["tpu_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/distribute/python:one_device_strategy",
-        "//tensorflow/contrib/eager/python:datasets",
-        "//tensorflow/contrib/optimizer_v2:training",
+        ":one_device_strategy",
+        ":values",
         "//tensorflow/contrib/tpu",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/contrib/tpu:tpu_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -249,9 +248,13 @@ py_library(
     srcs = ["minimize_loss_test.py"],
     deps = [
         ":combinations",
+        ":mirrored_strategy",
         ":single_loss_example",
+        "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
@@ -324,6 +327,7 @@ py_library(
     srcs = ["single_loss_example.py"],
     deps = [
         ":step_fn",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:layers",
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 43b2e91cbf..e134fe34e1 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -96,8 +96,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       combinations.times(
           combinations.distributions_and_v1_optimizers() +
           combinations.distributions_and_v2_optimizers(),
-          combinations.combine(mode=["graph", "eager"])))
-  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
+          combinations.combine(mode=["graph", "eager"], is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=[
+              combinations.adam_optimizer_v1_fn,
+              combinations.gradient_descent_optimizer_v1_fn
+          ],
+          mode=["graph"],
+          is_tpu=[True]))
+
+  def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
     created_variables = []
     trainable_variables = []
 
@@ -128,11 +137,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
+          if is_tpu:
+            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
       run_step()
 
+      if is_tpu:
+        with self.test_session() as sess:
+          sess.run(tpu.shutdown_system())
+
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index abd13c6cc6..0db0b59fca 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.distribute.python import step_fn
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -54,7 +55,11 @@ def minimize_loss_example(optimizer_fn,
   """Example of non-distribution-aware legacy code."""
 
   def dataset_fn():
-    return dataset_ops.Dataset.from_tensors([[1.]]).repeat().batch(2)
+    dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+    # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be
+    # fully defined for TPU.  Remove this when XLA supports dynamic shapes.
+    return dataset.apply(
+        batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True))
 
   # An Optimizer instance is created either outside or inside model_fn.
   outer_optimizer = None
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index ceb52ceca7..a7e4fe80f3 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,15 +21,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+
 from tensorflow.contrib import tpu
 from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
 
 
 # TODO(isaprykin):  Consider whether inheriting is really appropriate.
@@ -37,48 +38,53 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   """Experimental TPU distribution strategy implementation."""
 
   def __init__(self,
-               global_batch_size=2,
                num_cores_per_host=2,
                iterations_per_step=2):
-    # TODO(isaprykin): Generalize the defaults.
+    # TODO(isaprykin): Generalize the defaults.  They are currently tailored for
+    # the unit test.
     super(TPUStrategy, self).__init__('/cpu:0')
     # TODO(isaprykin): Auto-detect number of cores and hosts.
     self._num_cores_per_host = num_cores_per_host
-    self._global_batch_size = global_batch_size
     # TODO(isaprykin): This might have to be per-call.
     self._iterations_per_step = iterations_per_step
 
   def distribute_dataset(self, dataset_fn):
     return values.PerIterationDataset(
-        self._call_dataset_fn(dataset_fn), self._iterations_per_step)
+        self._call_dataset_fn(dataset_fn), self._iterations_per_step,
+        self._num_cores_per_host)
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
 
-    # TODO(isaprykin): Support variable arguments similar to PerDevice+regroup.
-    inputs = args[0]
+    inputs = {'args': args, 'kwargs': kwargs}
+    flat_inputs = nest.flatten(inputs)
+
+    feed_mask = [isinstance(f, values.PerIteration) for f in flat_inputs]
 
-    sharded_shape = [None]  # Python 2 nonlocal.
+    feeds = lambda: itertools.compress(flat_inputs, feed_mask)
+    shapes = [f.get_shape() for f in feeds()]
+    if any([not s.is_fully_defined() for s in shapes]):
+      raise ValueError(
+          'TPU currently requires fully defined shapes. Either use '
+          'set_shape() on the input tensors or use '
+          'dataset.apply(map_and_batch(..., drop_remainder=True)).')
+    types = [f.get_dtype() for f in feeds()]
 
     def infeed_input(i):
       """Get input, split it and then enqueue."""
-      batches = array_ops.gather(inputs, i)
+      iteration_inputs = [f.get(i) for f in feeds()]
 
-      # TODO(isaprykin):  Handle partial batch.
-      global_shape = [self._global_batch_size] + list(batches.get_shape())[1:]
-      sharded_shape[0] = ([self._global_batch_size / self._num_cores_per_host] +
-                          list(global_shape)[1:])
+      infeed_inputs = [[inputs_per_core[core_id]
+                        for inputs_per_core in iteration_inputs]
+                       for core_id in range(self._num_cores_per_host)]
 
-      batches.set_shape(global_shape)
-      batches = array_ops.split(batches, self._num_cores_per_host)
+      infeed_ops = []
+      for core_id, infeed_input in enumerate(infeed_inputs):
+        infeed_ops.append(
+            tpu_ops.infeed_enqueue_tuple(
+                inputs=infeed_input, shapes=shapes, device_ordinal=core_id))
 
-      infeeds = [
-          tpu_ops.infeed_enqueue_tuple(
-              inputs=[batches[j]], shapes=[sharded_shape[0]], device_ordinal=j)
-          for j in range(self._num_cores_per_host)
-      ]
-
-      with ops.control_dependencies(infeeds):
+      with ops.control_dependencies(infeed_ops):
         return i + 1
 
     with ops.device('/task:0/device:CPU:0'):
@@ -87,13 +93,21 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
           infeed_input, [constant_op.constant(0)],
           parallel_iterations=1)
 
-    assert sharded_shape[0]
-
     def dequeueing_fn(*args, **kwargs):
+      """Dequeue input arguments and supply them to `fn`."""
       del args, kwargs
-      x, = tpu.infeed_dequeue_tuple(
-          dtypes=[dtypes.float32], shapes=[sharded_shape[0]])
-      return fn(x)
+      dequeued = tpu.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
+      dequeued = iter(dequeued)
+
+      fn_inputs = []
+      for inp, is_feed in zip(flat_inputs, feed_mask):
+        if is_feed:
+          fn_inputs.append(next(dequeued))
+        else:
+          fn_inputs.append(inp)
+
+      fn_inputs = nest.pack_sequence_as(inputs, fn_inputs)
+      return fn(*fn_inputs['args'], **fn_inputs['kwargs'])
 
     def iterate_on_tpu():
       return tpu.repeat(self._iterations_per_step, dequeueing_fn, [])
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 62016c3a78..8cb5276579 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -570,18 +570,36 @@ class PerDeviceDataset(object):
         dataset_iterator, self._devices, self._prefetch_on_device)
 
 
+class PerIteration(object):
+  """Holds input for multiple iterations at once."""
+
+  def __init__(self, index):
+    self._index = index
+
+  def get(self, iteration):
+    return array_ops.gather(self._index, iteration)
+
+  def get_shape(self):
+    return self._index[-1][-1].get_shape()
+
+  def get_dtype(self):
+    return self._index[-1][-1].dtype
+
+
 class MultiIterator(object):
   """Iterator that returns results of multiple get_next()s."""
 
-  def __init__(self, dataset_iterator, iterations):
+  def __init__(self, dataset_iterator, iterations, batches_per_iteration):
     self._dataset_iterator = dataset_iterator
     self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
 
   def get_next(self, name=None):
-    return [
+    return PerIteration([[
         self._dataset_iterator.get_next(name=name)
-        for _ in range(self._iterations)
+        for _ in range(self._batches_per_iteration)
     ]
+                         for _ in range(self._iterations)])
 
   @property
   def initializer(self):
@@ -589,18 +607,22 @@ class MultiIterator(object):
 
 
 class PerIterationDataset(object):
+  """A dataset that returns MultiIterators."""
 
-  def __init__(self, dataset, iterations):
+  def __init__(self, dataset, iterations, batches_per_iteration):
     self._dataset = dataset
     self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
 
   def make_one_shot_iterator(self):
     iterator = self._dataset.make_one_shot_iterator()
-    return MultiIterator(iterator, self._iterations)
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
 
   def make_initializable_iterator(self):
     iterator = self._dataset.make_initializable_iterator()
-    return MultiIterator(iterator, self._iterations)
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
 
 
 class MapOutput(object):
-- 
GitLab


From 8723770b4cbcac0a528354d8508a5ef83716d1fa Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 19 Apr 2018 19:27:34 -0700
Subject: [PATCH 1183/1262] [XLA] Remove default argument on virtual function
 DeviceMemoryAllocator::Allocate().

Default args on virtual functions are disallowed by the Google style
guide, for good reason.  They have the extremely surprising behavior
that the defaults you get when calling a function on a pointer depend
not on the underlying type of the object, but on whatever is the
semantic type of the pointer!

PiperOrigin-RevId: 193611213
---
 .../xla/service/device_memory_allocator.h     | 30 ++++++++++++++-----
 .../xla/tests/local_client_test_base.cc       |  3 +-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 240acf8973..da45c4d45a 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -38,13 +38,25 @@ class DeviceMemoryAllocator {
   virtual ~DeviceMemoryAllocator() {}
 
   // 'retry_on_failure': If false, and the first attempt to allocate the memory
-  // fails, the allocation should return immediately without retrying.
-  // An example use case is optional scratch spaces where a failure
-  // has only performance impact.
+  // fails, the allocation should return immediately without retrying.  An
+  // example use case is optional scratch spaces where a failure has only
+  // performance impact.
+  //
   // Allocate() should return a null pointer for a size-0 allocation.
   // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<se::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
+  virtual StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal,
+                                                  uint64 size,
+                                                  bool retry_on_failure) = 0;
+
+  // Two-arg version of Allocate(), which sets retry-on-failure to true.
+  //
+  // (We don't simply use a default argument on the virtual Allocate function
+  // because default args on virtual functions are disallowed by the Google
+  // style guide.)
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size) {
+    return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
+  }
+
   virtual tensorflow::Status Deallocate(int device_ordinal,
                                         se::DeviceMemoryBase* mem) = 0;
 
@@ -67,8 +79,12 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
       const se::Platform* platform,
       tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<se::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+
+  // Pull in two-arg overload that sets retry_on_failure to true.
+  using DeviceMemoryAllocator::Allocate;
+
   tensorflow::Status Deallocate(int device_ordinal,
                                 se::DeviceMemoryBase* mem) override;
 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index c60ba2422f..bb5aabb214 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -44,7 +44,8 @@ StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size);
+  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
+                                                 retry_on_failure);
 }
 
 tensorflow::Status TestAllocator::Deallocate(int device_ordinal,
-- 
GitLab


From 2a956c9b8f9950405b481ccc0e05636873ecc9ae Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:40:37 +0000
Subject: [PATCH 1184/1262] Support string tensors for tf.count_nonzero

This fix tries to address the issue raised in 18712 where
`tf.count_nonzero` does not support string tensors.

The implementation of `tf.count_nonzero` relies on `tf.not_equal`
which actually support string tensors. The reason the string
tensor does not work is because `tf.count_nonzero` created
a numpy type `zero` which uses `input_tensor.dtype.as_numpy_dtype()`.
The numpy type `zero` is then passed to `tf.not_equal (which converts
numpy `zero` into a tensor zero). However,
`input_tensor.dtype.as_numpy_dtype()` will converts tf.string to
numpy.object thus the exception.

But that is not necessary as `zero` could be created
with `tf.zeros` directly without back and forth conversion
to numpy.

This fix fixes the issue.

This fix fixes 18712.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 781b1c557f..8c9ad66b0e 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1487,7 +1487,8 @@ def count_nonzero(input_tensor,
 
   with ops.name_scope(name, "count_nonzero", [input_tensor]):
     input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
-    zero = input_tensor.dtype.as_numpy_dtype()
+    # A scalar of 'zero' is enough as `not_equal` will broadcast.
+    zero = array_ops.zeros([], dtype=input_tensor.dtype)
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-- 
GitLab


From 37999ce500f27d587100f0bf45e87957936f5ada Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:48:15 +0000
Subject: [PATCH 1185/1262] Add test case for tf.string support with
 tf.count_nonzero

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/reduction_ops_test.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 589ea54973..0be89e1ff4 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -958,6 +958,12 @@ class CountNonzeroReductionTest(test.TestCase):
           y = math_ops.count_nonzero(x, [0])
           self.assertAllEqual(y.eval(), np.zeros(9938))
 
+  def testStringReduce(self):
+    # Test case for GitHub issue 18712
+    with self.test_session() as sess:
+      v = math_ops.count_nonzero(constant_op.constant(["test"]))
+      self.assertAllClose(sess.run(v), 1)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 7358025743951b42fe0f99fb85b4418769de5357 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:51:54 +0000
Subject: [PATCH 1186/1262] Add test cases with axis and keepdims for
 tf.count_nonzero and string

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/reduction_ops_test.py       | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 0be89e1ff4..943b80b787 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -889,9 +889,9 @@ class AnyReductionTest(test.TestCase):
 
 class CountNonzeroReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keepdims, use_gpu=False,
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False, zero=0,
                feed_dict=None):
-    np_ans = (x != 0).astype(np.int32)
+    np_ans = (x != zero).astype(np.int32)
     if reduction_axes is None:
       np_ans = np.sum(np_ans, keepdims=keepdims)
     else:
@@ -964,6 +964,15 @@ class CountNonzeroReductionTest(test.TestCase):
       v = math_ops.count_nonzero(constant_op.constant(["test"]))
       self.assertAllClose(sess.run(v), 1)
 
+  def testStringReduce1D(self):
+    # Create a 1D array of strings
+    x = np.asarray(["", "", "a", "", "", "b"])
+    self._compare(x, None, keepdims=False, zero=np.str(""))
+    self._compare(x, [], keepdims=False, zero=np.str(""))
+    self._compare(x, [0], keepdims=False, zero=np.str(""))
+    self._compare(x, None, keepdims=True, zero=np.str(""))
+    self._compare(x, [], keepdims=True, zero=np.str(""))
+    self._compare(x, [0], keepdims=True, zero=np.str(""))
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 01ab85f0fdce13f98b705c54901284a165ed7bd8 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:53:57 +0000
Subject: [PATCH 1187/1262] Add n-D test cases for better coverage

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/reduction_ops_test.py    | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 943b80b787..ea78b58d88 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -974,5 +974,21 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compare(x, [], keepdims=True, zero=np.str(""))
     self._compare(x, [0], keepdims=True, zero=np.str(""))
 
+  def testStringReduce2D(self):
+    # Create a 2D array of strings
+    x = np.asarray([["", "", "a", "", "", "b"],
+                    ["", "c", "", "d", "", ""],
+                    ["e", "", "f", "", "", ""]])
+    self._compare(x, None, keepdims=False, zero=np.str(""))
+    self._compare(x, [], keepdims=False, zero=np.str(""))
+    self._compare(x, [0], keepdims=False, zero=np.str(""))
+    self._compare(x, [1], keepdims=False, zero=np.str(""))
+    self._compare(x, [0, 1], keepdims=False, zero=np.str(""))
+    self._compare(x, None, keepdims=True, zero=np.str(""))
+    self._compare(x, [], keepdims=True, zero=np.str(""))
+    self._compare(x, [0], keepdims=True, zero=np.str(""))
+    self._compare(x, [0, 1], keepdims=True, zero=np.str(""))
+
+
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 38dcc57681612c2321169367c8756bb218472dd7 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 19 Apr 2018 19:56:09 -0700
Subject: [PATCH 1188/1262] Revert part of
 tensorflow/core/grappler/optimizers/meta_optimizer.cc from #18479.

---
 .../grappler/optimizers/meta_optimizer.cc     | 22 +------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index bca779c3b3..22799311bc 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -168,26 +168,6 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers)
                                       : InitializeOptimizers(&optimizers));
 
-  // Append custom configurable optimizers.
-  std::vector<tensorflow::RewriterConfig_CustomGraphOptimizer>
-      custom_configurable_optimizers;
-  for (const auto& optimizer : cfg_.custom_optimizers()) {
-    if (available_optimizers.find(optimizer.name()) !=
-        available_optimizers.end()) {
-      optimizers.push_back(NewOptimizer(optimizer.name()));
-    } else {
-      custom_configurable_optimizers.push_back(optimizer);
-    }
-  }
-  // Now initialize and configure the custom optimizers.
-  for (const auto& optimizer : custom_configurable_optimizers) {
-    std::unique_ptr<CustomGraphOptimizer> opt =
-        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer.name());
-    if (opt == nullptr) continue;
-    TF_RETURN_IF_ERROR(opt->Init(&optimizer));
-    optimizers.push_back(std::move(opt));
-  }
-
   if (optimizers.empty()) {
     *optimized_graph = item.graph;
     return Status::OK();
@@ -341,7 +321,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
          cfg.auto_parallel().enable() ||
          cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          cfg.debug_stripper() == RewriterConfig::ON ||
-         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
+         !cfg.optimizers().empty();
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
-- 
GitLab


From 4ef9de422d452683ac661d3a6313aeb2972b836d Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 19 Apr 2018 20:00:21 -0700
Subject: [PATCH 1189/1262] Always include the local worker in the list of
 filtered targets.

It is currently legal to specify a device filter that doesn't include the local worker.
In that case, the MasterSession includes all local devices regardless of the filter.
This change extends this behavior to the list of filtered workers, which will be crucial for backwards compatibility when we enable CreateWorkerSession for all MasterSessions, because we need to call CreateWorkerSession on all potential workers.

PiperOrigin-RevId: 193613313
---
 tensorflow/core/distributed_runtime/master.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 288656e7f8..e60386fd34 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -167,13 +167,16 @@ class DeviceFinder {
     }
     // Enumerates all known workers' target. A target name is a
     // prefix of a device name. E.g., /job:mnist/replica:0/task:10.
+    CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided.";
+    const string& local_device_name = env_->local_devices[0]->name();
     std::vector<string> workers;
     worker_cache->ListWorkers(&workers);
     if (filters_.empty()) {
       std::swap(workers, targets_);
     } else {
       for (const string& name : workers) {
-        if (MatchFilters(name)) {
+        if (MatchFilters(name) ||
+            DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) {
           targets_.push_back(name);
         }
       }
-- 
GitLab


From ddd763de08c5095d9a0dbb8acceb82135c0aa485 Mon Sep 17 00:00:00 2001
From: imsheridan <xiaoyudong0512@gmail.com>
Date: Fri, 20 Apr 2018 11:08:34 +0800
Subject: [PATCH 1190/1262] Fix unwanted typo caused protobuf load failure

---
 tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
index 743247bb60..ad0aeac004 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -80,4 +80,5 @@ $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
 $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
 $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
 $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+END
 }
-- 
GitLab


From 7f3baa210a45cd0b41e21b63c2be6dd54230ea0b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 20 Apr 2018 02:55:31 +0000
Subject: [PATCH 1191/1262] Update doc string for tf.count_nonzero to add
 string type

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 8c9ad66b0e..31ce83905b 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1467,7 +1467,8 @@ def count_nonzero(input_tensor,
   ```
 
   Args:
-    input_tensor: The tensor to reduce. Should be of numeric type, or `bool`.
+    input_tensor: The tensor to reduce. Should be of numeric type, `string`,
+      or `bool`.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-- 
GitLab


From 2273c4e56334caf31de01c6b6f8f4edd48432972 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 19 Apr 2018 21:33:41 -0700
Subject: [PATCH 1192/1262] Skip tests with no_oss tag in XLA builds.

PiperOrigin-RevId: 193619344
---
 tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index a94a627dfb..a410c10b61 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -35,7 +35,7 @@ echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
 
 bazel clean
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
+bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test,-no_oss -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --build_tests_only --test_output=errors --local_test_jobs=8 \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
-- 
GitLab


From 06bb3364795e443206910c98cee132d719cf41e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 20 Apr 2018 13:33:05 +0800
Subject: [PATCH 1193/1262] TST: byte string for python3

---
 .../python/kernel_tests/scatter_nd_ops_test.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index dfe9600dbb..b7477a768a 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -365,31 +365,35 @@ class ScatterNdTest(test.TestCase):
     return array_ops.scatter_nd(indices, updates, shape)
 
   def testString(self):
-    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [1], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["four", "three", "one", "seven"],
                                    dtype=dtypes.string)
-    expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
+    expected = np.array([b"", b"one", b"", b"three", b"four",
+                         b"", b"", b"seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by same value.
-    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["a", "b", "b", "c"],
                                    dtype=dtypes.string)
-    expected = np.array(["", "", "", "bb", "a", "", "", "c"])
+    expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by different value.
-    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["a", "b", "c", "d"],
                                    dtype=dtypes.string)
-    expected = [np.array(["", "", "", "bc", "a", "", "", "d"]),
-                np.array(["", "", "", "cb", "a", "", "", "d"])]
+    expected = [np.array([b"", b"", b"", b"bc", b"a", b"", b"", b"d"]),
+                np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
-- 
GitLab


From 70b8d21edcc84818835c9e2940a5df288c309d45 Mon Sep 17 00:00:00 2001
From: Roy Frostig <frostig@google.com>
Date: Thu, 19 Apr 2018 23:01:07 -0700
Subject: [PATCH 1194/1262] [XLA] Rework the local XLA client's Shape class
 with separate array and tuple shape constructors.

PiperOrigin-RevId: 193624591
---
 .../compiler/xla/python/numpy_bridge.cc       |  20 +--
 tensorflow/compiler/xla/python/xla_client.py  | 137 ++++++++++++------
 .../compiler/xla/python/xla_client_test.py    |  10 +-
 3 files changed, 103 insertions(+), 64 deletions(-)

diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index eec48479c9..dc6f5fe5fc 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -181,16 +181,6 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
                            PyObjectCppRepr(o).c_str());
   };
 
-  auto get_attr = [o, &error](const string& field) -> StatusOr<PyObject*> {
-    PyObject* result =
-        PyObject_GetAttrString(o, const_cast<char*>(field.c_str()));
-    if (result == nullptr) {
-      return error(tensorflow::strings::StrCat(
-          "Failed to get attribute of Shape object:", field));
-    }
-    return result;
-  };
-
   auto call_method = [o, &error](const string& method) -> StatusOr<PyObject*> {
     PyObject* result =
         PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
@@ -202,12 +192,16 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
   };
 
   PyObject* np_type;
-  TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype"));
+  TF_ASSIGN_OR_RETURN(np_type, call_method("numpy_dtype"));
   if (np_type->ob_type != &PyArrayDescr_Type) {
-    return error("Shape attribute np_dtype is not an integer numpy dtype");
+    return error(
+        "Return value of shape method numpy_dtype "
+        "is not an integer numpy dtype");
   }
   if (!NumpyTypeIsValid(NumpyTypenum(np_type))) {
-    return error("Shape attribute np_dtype is not a valid integer numpy dtype");
+    return error(
+        "Return value of shape method numpy_dtype "
+        "is not a valid integer numpy dtype");
   }
   const PrimitiveType element_type =
       NumpyTypeToPrimitiveType(NumpyTypenum(np_type));
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 9c81f6439d..f6809b6b87 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -166,14 +166,14 @@ class LocalBuffer(object):
     self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_py(npval, layout_fn=None):
-    npval = require_numpy_array_layout(npval)
+  def from_pyval(pyval, layout_fn=None):
+    pyval = require_numpy_array_layout(pyval)
     if layout_fn:
-      shape = Shape.from_numpy(npval)
+      shape = Shape.from_pyval(pyval)
       shape = shape.map_leaves(layout_fn)
     else:
       shape = None
-    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape))
+    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape))
 
   def to_py(self):
     return self.c_local_shaped_buffer.ToLiteral()
@@ -191,53 +191,104 @@ class LocalBuffer(object):
 
 
 class Shape(object):
-  """XLA shape.
+  """Represents an XLA shape.
 
-  Represents an XLA shape by a corresponding Python/Numpy type and a
-  list of dimensions, which are themselves Shapes in case this one
-  represents an XLA tuple.
+  A shape is either an array shape, having rank-many integer
+  dimensions and an element type (represented by a Numpy dtype), or it
+  is a tuple shape, having a shape for every tuple component:
+
+    type shape =
+        TupleShape of shape list
+      | ArrayShape of { dimensions: int list; element_type: dtype }
+
+  Callers are expected to instantiate this class only via the static
+  constructors: tuple_shape, array_shape, and from_pyval.
   """
 
-  def __init__(self, np_dtype, dimensions, minor_to_major=None):
+  @staticmethod
+  def tuple_shape(tuple_shapes):
+    """Construct a tuple shape."""
+    if (not isinstance(tuple_shapes, (tuple, list)) or
+        not all(isinstance(t, Shape) for t in tuple_shapes)):
+      raise TypeError('tuple_shapes must be a tuple of Shapes')
+    return Shape(tuple_shapes, tuple)
+
+  @staticmethod
+  def array_shape(element_type, dimensions, minor_to_major=None):
+    """Construct an array shape."""
+    if (not isinstance(dimensions, tuple) or
+        not all(isinstance(i, int) for i in dimensions)):
+      dimensions = tuple(int(i) for i in dimensions)
+    return Shape(dimensions, np.dtype(element_type),
+                 minor_to_major=minor_to_major)
+
+  @staticmethod
+  def from_pyval(pyval):
+    def convert(pyval):
+      if isinstance(pyval, tuple):
+        return Shape.tuple_shape(tuple(convert(elt) for elt in pyval))
+      else:
+        pyval = require_numpy_array_layout(pyval)
+        return Shape.array_shape(pyval.dtype, np.shape(pyval))
+    return convert(pyval)
+
+  def __init__(self, dimensions, dtype, minor_to_major=None):
     assert isinstance(dimensions, tuple)
-    self.np_dtype = np_dtype
     self._dimensions = dimensions
+    self._dtype = dtype
+    self._is_tuple = dtype == tuple
     self._minor_to_major = minor_to_major
     self._check_minor_to_major()
 
   def __eq__(self, other):
     # pylint: disable=protected-access
-    return (self.np_dtype == other.np_dtype and
+    return (self._dtype == other._dtype and
             self._dimensions == other._dimensions and
             self._minor_to_major == other._minor_to_major)
 
   def __repr__(self):
-    return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, '
-            'minor_to_major={!r})').format(self.np_dtype, self._dimensions,
-                                           self._minor_to_major)
-
-  def element_type(self):
-    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
+    return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, '
+            '_is_tuple={!r}), _minor_to_major={!r}').format(
+                self._dtype, self._dimensions, self._is_tuple,
+                self._minor_to_major)
 
   def is_tuple(self):
-    return self.element_type() == xla_data_pb2.TUPLE
+    return self._is_tuple
 
-  def dimensions(self):
-    if self.is_tuple():
-      raise ValueError('Tuple shape has no dimensions')
-    return self._dimensions
-
-  def minor_to_major(self):
-    return self._minor_to_major
+  def is_array(self):
+    return not self._is_tuple
 
   def tuple_shapes(self):
     if not self.is_tuple():
-      raise ValueError('Shape is not a tuple shape')
+      raise ValueError('not a tuple shape')
+    return self._dimensions
+
+  def numpy_dtype(self):
+    """Like element_type(), but returns dtype('O') in case of a tuple shape."""
+    if self.is_tuple():
+      return np.dtype(np.object)
+    else:
+      return self.element_type()
+
+  def xla_element_type(self):
+    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())]
+
+  def element_type(self):
+    if not self.is_array():
+      raise ValueError('not an array shape')
+    return self._dtype
+
+  def dimensions(self):
+    if not self.is_array():
+      raise ValueError('not an array shape')
     return self._dimensions
 
   def rank(self):
     return len(self.dimensions())
 
+  def minor_to_major(self):
+    return self._minor_to_major
+
   def map_leaves(self, f):
     """Map f over each leaf-level array subshape.
 
@@ -250,7 +301,7 @@ class Shape(object):
     """
     if self.is_tuple():
       children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
-      return Shape(np.dtype('O'), children)
+      return Shape.tuple_shape(children)
     else:
       mapped = f(self)
       return self if mapped is None else mapped
@@ -264,30 +315,24 @@ class Shape(object):
       assert sorted(mtm) == range(len(mtm)), self
 
   def update_minor_to_major(self, minor_to_major):
+    if not self.is_array():
+      raise ValueError('not an array shape')
     if not isinstance(minor_to_major, tuple):
       raise TypeError('minor_to_major must be a tuple')
-    updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major)
+    updated = Shape.array_shape(
+        self.element_type(), self.dimensions(), minor_to_major)
     updated._check_minor_to_major()  # pylint: disable=protected-access
     return updated
 
-  @staticmethod
-  def from_numpy(npval):
-
-    def convert(npval):
-      if isinstance(npval, tuple):
-        return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval))
-      else:
-        return Shape(npval.dtype, np.shape(npval))
-
-    return convert(require_numpy_array_layout(npval))
-
 
 def _wrap_shape(shape_info):
   dtype, dims = shape_info
   element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
   if element_type == xla_data_pb2.TUPLE:
-    dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
-  return Shape(dtype, dims)
+    shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
+    return Shape.tuple_shape(shapes)
+  else:
+    return Shape.array_shape(dtype, dims)
 
 
 def _wrap_data_handle(handle):
@@ -420,7 +465,7 @@ class LocalComputation(object):
                                   compile_options=None,
                                   layout_fn=None):
     return self.Compile(
-        argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
+        argument_shapes=[Shape.from_pyval(arg) for arg in arguments],
         compile_options=compile_options,
         layout_fn=layout_fn)
 
@@ -428,7 +473,7 @@ class LocalComputation(object):
     """Execute with Python values as arguments and return value."""
     if not self.is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
+    argument_shapes = [Shape.from_pyval(arg) for arg in arguments]
     if layout_fn:
       argument_shapes = [
           shape.map_leaves(layout_fn) for shape in argument_shapes
@@ -607,7 +652,7 @@ class ComputationBuilder(object):
       A ComputationDataHandle message.
     """
     return self.ParameterWithShape(
-        Shape.from_numpy(value), name=name, parameter_num=parameter_num)
+        Shape.from_pyval(value), name=name, parameter_num=parameter_num)
 
   def Broadcast(self, operand, sizes):
     """Enqueues a broadcast operation onto the computation.
@@ -968,7 +1013,7 @@ class ComputationBuilder(object):
 
     Returns: a ComputationDataHandle to the generated array of F32 values.
     """
-    shape = Shape(self.GetShape(mu).np_dtype, dims)
+    shape = Shape.array_shape(self.GetShape(mu).element_type(), dims)
     return _wrap_data_handle(
         self._client.RngNormal(
             _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
@@ -988,7 +1033,7 @@ class ComputationBuilder(object):
     Returns: a ComputationDataHandle to the generated array of values with the
       same numeric type (F32, S32, or U32) as the arguments a and b.
     """
-    shape = Shape(self.GetShape(a).np_dtype, dims)
+    shape = Shape.array_shape(self.GetShape(a).element_type(), dims)
     return _wrap_data_handle(
         self._client.RngUniform(
             _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index d97264ea64..6fe7b242e4 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -319,7 +319,7 @@ class LocalBufferTest(LocalComputationTest):
 
   def _Execute(self, c, arguments):
     compiled_c = c.Build().CompileWithExampleArguments(arguments)
-    arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments]
+    arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments]
     result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers)
     return result_buffer.to_py()
 
@@ -350,7 +350,7 @@ class LocalBufferTest(LocalComputationTest):
     c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14))
     arg = NumpyArrayF32(1.11)
     compiled_c = c.Build().CompileWithExampleArguments([arg])
-    arg_buffer = xla_client.LocalBuffer.from_py(arg)
+    arg_buffer = xla_client.LocalBuffer.from_pyval(arg)
     arg_buffer.delete()
     with self.assertRaises(ValueError):
       compiled_c.ExecuteWithLocalBuffers([arg_buffer])
@@ -1288,7 +1288,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
   def testInfeedS32Values(self):
     to_infeed = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    c.Infeed(xla_client.Shape.from_numpy(to_infeed[0]))
+    c.Infeed(xla_client.Shape.from_pyval(to_infeed[0]))
     compiled_c = c.Build().CompileWithExampleArguments()
     for item in to_infeed:
       xla_client.transfer_to_infeed(item)
@@ -1300,7 +1300,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
   def testInfeedThenOutfeedS32(self):
     to_round_trip = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-    x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0]))
+    x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0]))
     c.Outfeed(x)
 
     compiled_c = c.Build().CompileWithExampleArguments()
@@ -1310,7 +1310,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
       execution.start()
       xla_client.transfer_to_infeed(want)
       got = xla_client.transfer_from_outfeed(
-          xla_client.Shape.from_numpy(to_round_trip[0]))
+          xla_client.Shape.from_pyval(to_round_trip[0]))
       execution.join()
       self.assertEqual(want, got)
 
-- 
GitLab


From f7e8fbb28a0fa4e979a94d7b458706abf48f7deb Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 19 Apr 2018 23:08:53 -0700
Subject: [PATCH 1195/1262] Automated g4 rollback of changelist 193602050

PiperOrigin-RevId: 193625346
---
 tensorflow/core/lib/io/record_reader.cc    | 147 ++++----------
 tensorflow/core/lib/io/record_reader.h     |  16 +-
 tensorflow/core/lib/io/recordio_test.cc    | 212 ++++++++++++++-------
 tensorflow/core/lib/io/zlib_inputstream.cc |  16 +-
 tensorflow/core/lib/io/zlib_inputstream.h  |  19 +-
 5 files changed, 220 insertions(+), 190 deletions(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 6de850bb20..c24628be57 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
-    : src_(file), options_(options) {
+    : options_(options),
+      input_stream_(new RandomAccessInputStream(file)),
+      last_read_failed_(false) {
   if (options.buffer_size > 0) {
-    input_stream_.reset(new BufferedInputStream(file, options.buffer_size));
-  } else {
-    input_stream_.reset(new RandomAccessInputStream(file));
+    input_stream_.reset(new BufferedInputStream(input_stream_.release(),
+                                                options.buffer_size, true));
   }
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
 // We don't have zlib available on all embedded platforms, so fail.
 #if defined(IS_SLIM_BUILD)
     LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
 #else   // IS_SLIM_BUILD
-    zlib_input_stream_.reset(new ZlibInputStream(
-        input_stream_.get(), options.zlib_options.input_buffer_size,
-        options.zlib_options.output_buffer_size, options.zlib_options));
+    input_stream_.reset(new ZlibInputStream(
+        input_stream_.release(), options.zlib_options.input_buffer_size,
+        options.zlib_options.output_buffer_size, options.zlib_options, true));
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
-    LOG(FATAL) << "Unspecified compression type :" << options.compression_type;
+    LOG(FATAL) << "Unrecognized compression type :" << options.compression_type;
   }
 }
 
 // Read n+4 bytes from file, verify that checksum of first n bytes is
 // stored in the last 4 bytes and store the first n bytes in *result.
-// May use *storage as backing store.
-Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
-                                     StringPiece* result, string* storage) {
+//
+// offset corresponds to the user-provided value to ReadRecord()
+// and is used only in error messages.
+Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) {
   if (n >= SIZE_MAX - sizeof(uint32)) {
     return errors::DataLoss("record size too large");
   }
 
   const size_t expected = n + sizeof(uint32);
-  storage->resize(expected);
-
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    // If we have a zlib compressed buffer, we assume that the
-    // file is being read sequentially, and we use the underlying
-    // implementation to read the data.
-    //
-    // No checks are done to validate that the file is being read
-    // sequentially.  At some point the zlib input buffer may support
-    // seeking, possibly inefficiently.
-    TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage));
-
-    if (storage->size() != expected) {
-      if (storage->empty()) {
-        return errors::OutOfRange("eof");
-      } else {
-        return errors::DataLoss("truncated record at ", offset);
-      }
-    }
+  TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result));
 
-    uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-    if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-      return errors::DataLoss("corrupted record at ", offset);
-    }
-    *result = StringPiece(storage->data(), n);
-  } else {
-#endif  // IS_SLIM_BUILD
-    if (options_.buffer_size > 0) {
-      // If we have a buffer, we assume that the file is being read
-      // sequentially, and we use the underlying implementation to read the
-      // data.
-      //
-      // No checks are done to validate that the file is being read
-      // sequentially.
-      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
-
-      if (storage->size() != expected) {
-        if (storage->empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-
-      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(storage->data(), n);
+  if (result->size() != expected) {
+    if (result->empty()) {
+      return errors::OutOfRange("eof");
     } else {
-      // This version supports reading from arbitrary offsets
-      // since we are accessing the random access file directly.
-      StringPiece data;
-      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
-      if (data.size() != expected) {
-        if (data.empty()) {
-          return errors::OutOfRange("eof");
-        } else {
-          return errors::DataLoss("truncated record at ", offset);
-        }
-      }
-      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
-      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
-        return errors::DataLoss("corrupted record at ", offset);
-      }
-      *result = StringPiece(data.data(), n);
+      return errors::DataLoss("truncated record at ", offset);
     }
-#if !defined(IS_SLIM_BUILD)
   }
-#endif  // IS_SLIM_BUILD
 
+  const uint32 masked_crc = core::DecodeFixed32(result->data() + n);
+  if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) {
+    return errors::DataLoss("corrupted record at ", offset);
+  }
+  result->resize(n);
   return Status::OK();
 }
 
@@ -167,50 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
   static const size_t kFooterSize = sizeof(uint32);
 
+  // Position the input stream.
+  int64 curr_pos = input_stream_->Tell();
+  int64 desired_pos = static_cast<int64>(*offset);
+  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
+      (curr_pos == desired_pos && last_read_failed_)) {
+    last_read_failed_ = false;
+    TF_RETURN_IF_ERROR(input_stream_->Reset());
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
+  } else if (curr_pos < desired_pos) {
+    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
+  }
+  DCHECK_EQ(desired_pos, input_stream_->Tell());
+
   // Read header data.
-  StringPiece lbuf;
-  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
+  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     return s;
   }
-  const uint64 length = core::DecodeFixed64(lbuf.data());
+  const uint64 length = core::DecodeFixed64(record->data());
 
   // Read data
-  StringPiece data;
-  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
+  s = ReadChecksummed(*offset + kHeaderSize, length, record);
   if (!s.ok()) {
+    last_read_failed_ = true;
     if (errors::IsOutOfRange(s)) {
       s = errors::DataLoss("truncated record at ", *offset);
     }
     return s;
   }
 
-  if (record->data() != data.data()) {
-    // RandomAccessFile placed the data in some other location.
-    memmove(&(*record)[0], data.data(), data.size());
-  }
-
-  record->resize(data.size());
-
   *offset += kHeaderSize + length + kFooterSize;
+  DCHECK_EQ(*offset, input_stream_->Tell());
   return Status::OK();
 }
 
-Status RecordReader::SkipNBytes(uint64 offset) {
-#if !defined(IS_SLIM_BUILD)
-  if (zlib_input_stream_) {
-    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
-  } else {
-#endif
-    if (options_.buffer_size > 0) {
-      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
-    }
-#if !defined(IS_SLIM_BUILD)
-  }
-#endif
-  return Status::OK();
-}  // namespace io
-
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index 26278e0328..f6d587dfa0 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,25 +69,14 @@ class RecordReader {
   // Read the record at "*offset" into *record and update *offset to
   // point to the offset of the next record.  Returns OK on success,
   // OUT_OF_RANGE for end of file, or something else for an error.
-  //
-  // Note: if buffering is used (with or without compression), access must be
-  // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
-  // Skip the records till "offset". Returns OK on success,
-  // OUT_OF_RANGE for end of file, or something else for an error.
-  Status SkipNBytes(uint64 offset);
-
  private:
-  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
-                         string* storage);
+  Status ReadChecksummed(uint64 offset, size_t n, string* result);
 
-  RandomAccessFile* src_;
   RecordReaderOptions options_;
   std::unique_ptr<InputStreamInterface> input_stream_;
-#if !defined(IS_SLIM_BUILD)
-  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
-#endif  // IS_SLIM_BUILD
+  bool last_read_failed_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
 };
@@ -121,7 +110,6 @@ class SequentialRecordReader {
       return errors::InvalidArgument(
           "Trying to seek offset: ", offset,
           " which is less than the current offset: ", offset_);
-    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
     offset_ = offset;
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 63235761d9..da514bd21c 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,10 +26,11 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
+namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-static string BigString(const string& partial_string, size_t n) {
+string BigString(const string& partial_string, size_t n) {
   string result;
   while (result.size() < n) {
     result.append(partial_string);
@@ -39,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) {
 }
 
 // Construct a string from a number
-static string NumberString(int n) {
+string NumberString(int n) {
   char buf[50];
   snprintf(buf, sizeof(buf), "%d.", n);
   return string(buf);
 }
 
 // Return a skewed potentially long string
-static string RandomSkewedString(int i, random::SimplePhilox* rnd) {
+string RandomSkewedString(int i, random::SimplePhilox* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
 
-class RecordioTest : public ::testing::Test {
+class StringDest : public WritableFile {
+ public:
+  explicit StringDest(string* contents) : contents_(contents) {}
+
+  Status Close() override { return Status::OK(); }
+  Status Flush() override { return Status::OK(); }
+  Status Sync() override { return Status::OK(); }
+  Status Append(const StringPiece& slice) override {
+    contents_->append(slice.data(), slice.size());
+    return Status::OK();
+  }
+
  private:
-  class StringDest : public WritableFile {
-   public:
-    string contents_;
-
-    Status Close() override { return Status::OK(); }
-    Status Flush() override { return Status::OK(); }
-    Status Sync() override { return Status::OK(); }
-    Status Append(const StringPiece& slice) override {
-      contents_.append(slice.data(), slice.size());
-      return Status::OK();
+  string* contents_;
+};
+
+class StringSource : public RandomAccessFile {
+ public:
+  explicit StringSource(string* contents)
+      : contents_(contents), force_error_(false) {}
+
+  Status Read(uint64 offset, size_t n, StringPiece* result,
+              char* scratch) const override {
+    if (force_error_) {
+      force_error_ = false;
+      return errors::DataLoss("read error");
     }
-  };
-
-  class StringSource : public RandomAccessFile {
-   public:
-    StringPiece contents_;
-    mutable bool force_error_;
-    mutable bool returned_partial_;
-    StringSource() : force_error_(false), returned_partial_(false) {}
-
-    Status Read(uint64 offset, size_t n, StringPiece* result,
-                char* scratch) const override {
-      EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error";
-
-      if (force_error_) {
-        force_error_ = false;
-        returned_partial_ = true;
-        return errors::DataLoss("read error");
-      }
-
-      if (offset >= contents_.size()) {
-        return errors::OutOfRange("end of file");
-      }
-
-      if (contents_.size() < offset + n) {
-        n = contents_.size() - offset;
-        returned_partial_ = true;
-      }
-      *result = StringPiece(contents_.data() + offset, n);
-      return Status::OK();
+
+    if (offset >= contents_->size()) {
+      return errors::OutOfRange("end of file");
+    }
+
+    if (contents_->size() < offset + n) {
+      n = contents_->size() - offset;
     }
-  };
+    *result = StringPiece(contents_->data() + offset, n);
+    return Status::OK();
+  }
+
+  void force_error() { force_error_ = true; }
+
+ private:
+  string* contents_;
+  mutable bool force_error_;
+};
 
+class RecordioTest : public ::testing::Test {
+ private:
+  string contents_;
   StringDest dest_;
   StringSource source_;
   bool reading_;
@@ -104,7 +109,9 @@ class RecordioTest : public ::testing::Test {
 
  public:
   RecordioTest()
-      : reading_(false),
+      : dest_(&contents_),
+        source_(&contents_),
+        reading_(false),
         readpos_(0),
         writer_(new RecordWriter(&dest_)),
         reader_(new RecordReader(&source_)) {}
@@ -119,12 +126,11 @@ class RecordioTest : public ::testing::Test {
     TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg)));
   }
 
-  size_t WrittenBytes() const { return dest_.contents_.size(); }
+  size_t WrittenBytes() const { return contents_.size(); }
 
   string Read() {
     if (!reading_) {
       reading_ = true;
-      source_.contents_ = StringPiece(dest_.contents_);
     }
     string record;
     Status s = reader_->ReadRecord(&readpos_, &record);
@@ -137,26 +143,20 @@ class RecordioTest : public ::testing::Test {
     }
   }
 
-  void IncrementByte(int offset, int delta) {
-    dest_.contents_[offset] += delta;
-  }
+  void IncrementByte(int offset, int delta) { contents_[offset] += delta; }
 
-  void SetByte(int offset, char new_byte) {
-    dest_.contents_[offset] = new_byte;
-  }
+  void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; }
 
-  void ShrinkSize(int bytes) {
-    dest_.contents_.resize(dest_.contents_.size() - bytes);
-  }
+  void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); }
 
   void FixChecksum(int header_offset, int len) {
     // Compute crc of type/len/data
-    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
+    uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len);
     crc = crc32c::Mask(crc);
-    core::EncodeFixed32(&dest_.contents_[header_offset], crc);
+    core::EncodeFixed32(&contents_[header_offset], crc);
   }
 
-  void ForceError() { source_.force_error_ = true; }
+  void ForceError() { source_.force_error(); }
 
   void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; }
 
@@ -165,7 +165,6 @@ class RecordioTest : public ::testing::Test {
     Write("bar");
     Write(BigString("x", 10000));
     reading_ = true;
-    source_.contents_ = StringPiece(dest_.contents_);
     uint64 offset = WrittenBytes() + offset_past_end;
     string record;
     Status s = reader_->ReadRecord(&offset, &record);
@@ -217,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) {
   ASSERT_EQ("EOF", Read());
 }
 
+void TestNonSequentialReads(const RecordWriterOptions& writer_options,
+                            const RecordReaderOptions& reader_options) {
+  string contents;
+  StringDest dst(&contents);
+  RecordWriter writer(&dst, writer_options);
+  for (int i = 0; i < 10; ++i) {
+    TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i;
+  }
+  TF_ASSERT_OK(writer.Close());
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  string record;
+  // First read sequentially to fill in the offsets table.
+  uint64 offsets[10] = {0};
+  uint64 offset = 0;
+  for (int i = 0; i < 10; ++i) {
+    offsets[i] = offset;
+    TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i;
+  }
+
+  // Read randomly: First go back to record #3 then forward to #8.
+  offset = offsets[3];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("3.", record);
+  EXPECT_EQ(offsets[4], offset);
+
+  offset = offsets[8];
+  TF_ASSERT_OK(reader.ReadRecord(&offset, &record));
+  EXPECT_EQ("8.", record);
+  EXPECT_EQ(offsets[9], offset);
+}
+
+TEST_F(RecordioTest, NonSequentialReads) {
+  TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 10;
+  TestNonSequentialReads(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, NonSequentialReadsWithCompression) {
+  TestNonSequentialReads(
+      RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+      RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
+}
+
 // Tests of all the error paths in log_reader.cc follow:
-static void AssertHasSubstr(StringPiece s, StringPiece expected) {
+void AssertHasSubstr(StringPiece s, StringPiece expected) {
   EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
+void TestReadError(const RecordWriterOptions& writer_options,
+                   const RecordReaderOptions& reader_options) {
+  const string wrote = BigString("well hello there!", 100);
+  string contents;
+  StringDest dst(&contents);
+  TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote));
+
+  StringSource file(&contents);
+  RecordReader reader(&file, reader_options);
+
+  uint64 offset = 0;
+  string read;
+  file.force_error();
+  Status status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(errors::IsDataLoss(status));
+  ASSERT_EQ(0, offset);
+
+  // A failed Read() shouldn't update the offset, and thus a retry shouldn't
+  // lose the record.
+  status = reader.ReadRecord(&offset, &read);
+  ASSERT_TRUE(status.ok()) << status;
+  EXPECT_GT(offset, 0);
+  EXPECT_EQ(wrote, read);
+}
+
 TEST_F(RecordioTest, ReadError) {
-  Write("foo");
-  ForceError();
-  AssertHasSubstr(Read(), "Data loss");
+  TestReadError(RecordWriterOptions(), RecordReaderOptions());
+}
+
+TEST_F(RecordioTest, ReadErrorWithBuffering) {
+  RecordReaderOptions options;
+  options.buffer_size = 1 << 20;
+  TestReadError(RecordWriterOptions(), options);
+}
+
+TEST_F(RecordioTest, ReadErrorWithCompression) {
+  TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"),
+                RecordReaderOptions::CreateRecordReaderOptions("ZLIB"));
 }
 
 TEST_F(RecordioTest, CorruptLength) {
@@ -257,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
 
 TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
 
+}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 984fbc2810..47de36bf6c 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream(
     InputStreamInterface* input_stream,
     size_t input_buffer_bytes,   // size of z_stream.next_in buffer
     size_t output_buffer_bytes,  // size of z_stream.next_out buffer
-    const ZlibCompressionOptions& zlib_options)
-    : input_stream_(input_stream),
+    const ZlibCompressionOptions& zlib_options, bool owns_input_stream)
+    : owns_input_stream_(owns_input_stream),
+      input_stream_(input_stream),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_capacity_]),
@@ -37,14 +38,25 @@ ZlibInputStream::ZlibInputStream(
   InitZlibBuffer();
 }
 
+ZlibInputStream::ZlibInputStream(InputStreamInterface* input_stream,
+                                 size_t input_buffer_bytes,
+                                 size_t output_buffer_bytes,
+                                 const ZlibCompressionOptions& zlib_options)
+    : ZlibInputStream(input_stream, input_buffer_bytes, output_buffer_bytes,
+                      zlib_options, false) {}
+
 ZlibInputStream::~ZlibInputStream() {
   if (z_stream_) {
     inflateEnd(z_stream_.get());
   }
+  if (owns_input_stream_) {
+    delete input_stream_;
+  }
 }
 
 Status ZlibInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
+  inflateEnd(z_stream_.get());
   InitZlibBuffer();
   bytes_read_ = 0;
   return Status::OK();
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index 9c7e14441c..37339163ee 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -40,7 +40,15 @@ class ZlibInputStream : public InputStreamInterface {
   // Create a ZlibInputStream for `input_stream` with a buffer of size
   // `input_buffer_bytes` bytes for reading contents from `input_stream` and
   // another buffer with size `output_buffer_bytes` for caching decompressed
-  // contents. Does *not* take ownership of "input_stream".
+  // contents.
+  //
+  // Takes ownership of `input_stream` iff `owns_input_stream` is true.
+  ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
+                  size_t output_buffer_bytes,
+                  const ZlibCompressionOptions& zlib_options,
+                  bool owns_input_stream);
+
+  // Equivalent to the previous constructor with owns_input_stream=false.
   ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes,
                   size_t output_buffer_bytes,
                   const ZlibCompressionOptions& zlib_options);
@@ -65,10 +73,11 @@ class ZlibInputStream : public InputStreamInterface {
  private:
   void InitZlibBuffer();
 
-  InputStreamInterface* input_stream_;  // Not owned
-  size_t input_buffer_capacity_;        // Size of z_stream_input_
-  size_t output_buffer_capacity_;       // Size of z_stream_output_
-  char* next_unread_byte_;              // Next unread byte in z_stream_output_
+  const bool owns_input_stream_;
+  InputStreamInterface* input_stream_;
+  size_t input_buffer_capacity_;   // Size of z_stream_input_
+  size_t output_buffer_capacity_;  // Size of z_stream_output_
+  char* next_unread_byte_;         // Next unread byte in z_stream_output_
 
   // Buffer for storing contents read from compressed stream.
   // TODO(srbs): Consider using circular buffers. That would greatly simplify
-- 
GitLab


From d2fd0bbac6368a6b41e73d18c93b24442f5653f1 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Thu, 19 Apr 2018 23:35:04 -0700
Subject: [PATCH 1196/1262] [TF:XLA] Factor out the handling of while
 instructions to make HloVerifier::Run shorter.

PiperOrigin-RevId: 193626864
---
 .../compiler/xla/service/hlo_verifier.cc      | 83 +++++++++++--------
 .../compiler/xla/service/hlo_verifier.h       |  8 +-
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8c875698eb..80ed6d6832 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -731,6 +731,55 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   return tensorflow::Status::OK();
 }
 
+Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
+  auto* while_cond = instruction->while_condition();
+  auto* while_body = instruction->while_body();
+  if (while_cond->num_parameters() != 1) {
+    return FailedPrecondition(
+        "While condition must have exactly 1 parameter; had %lld : %s",
+        while_cond->num_parameters(), while_cond->ToString().c_str());
+  }
+  if (while_body->num_parameters() != 1) {
+    return FailedPrecondition(
+        "While body must have exactly 1 parameter; had %lld : %s",
+        while_body->num_parameters(), while_body->ToString().c_str());
+  }
+  if (instruction->operand_count() != 1) {
+    return FailedPrecondition(
+        "While loop must have exactly one operand; had %lld : %s",
+        instruction->operand_count(), instruction->ToString().c_str());
+  }
+  auto* init = instruction->operand(0);
+  auto* cond_param = while_cond->parameter_instruction(0);
+  if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) {
+    return FailedPrecondition(
+        "While condition's parameter must have the same shape as the "
+        "loop's 'init'. init: %s, param: %s",
+        init->ToString().c_str(), cond_param->ToString().c_str());
+  }
+  auto* cond_root = while_cond->root_instruction();
+  if (!ShapeUtil::Compatible(cond_root->shape(),
+                             ShapeUtil::MakeShape(PRED, {}))) {
+    return FailedPrecondition("While condition should have shape PRED: %s",
+                              cond_root->ToString().c_str());
+  }
+  auto* body_param = while_body->parameter_instruction(0);
+  if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) {
+    return FailedPrecondition(
+        "While body's parameter must have the same shape as the loop's"
+        " 'init'. init: %s, param: %s",
+        init->ToString().c_str(), body_param->ToString().c_str());
+  }
+  auto* body_root = while_body->root_instruction();
+  if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) {
+    return FailedPrecondition(
+        "While body should have same shape as the loop's 'init'."
+        "init: %s, body: %s",
+        init->ToString().c_str(), body_root->ToString().c_str());
+  }
+  return tensorflow::Status::OK();
+}
+
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
 
@@ -771,39 +820,7 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
             << instruction->dimensions().size()
             << " != " << ShapeUtil::Rank(instruction->operand(0)->shape());
       } else if (instruction->opcode() == HloOpcode::kWhile) {
-        auto* while_cond = instruction->while_condition();
-        auto* while_body = instruction->while_body();
-        TF_RET_CHECK(while_cond->num_parameters() == 1)
-            << "While condition must have exactly 1 parameter; had "
-            << while_cond->num_parameters() << ": " << while_cond->ToString();
-        TF_RET_CHECK(while_body->num_parameters() == 1)
-            << "While body must have exactly 1 parameter; had "
-            << while_body->num_parameters() << ": " << while_body->ToString();
-        TF_RET_CHECK(instruction->operand_count() == 1)
-            << "While loop must have exactly one operand; had "
-            << instruction->operand_count() << ": " << instruction->ToString();
-
-        auto* init = instruction->operand(0);
-        auto* cond_param = while_cond->parameter_instruction(0);
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), cond_param->shape()))
-            << "While condition's parameter must have the same shape as the "
-               "loop's 'init'. init: "
-            << init->ToString() << ", param: " << cond_param->ToString();
-        auto* cond_root = while_cond->root_instruction();
-        TF_RET_CHECK(ShapeUtil::Compatible(cond_root->shape(),
-                                           ShapeUtil::MakeShape(PRED, {})))
-            << "While condition should have shape PRED: "
-            << cond_root->ToString();
-
-        auto* body_param = while_body->parameter_instruction(0);
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_param->shape()))
-            << "While body's parameter must have the same shape as the loop's "
-               "'init'. init: "
-            << init->ToString() << ", param: " << body_param->ToString();
-        auto* body_root = while_body->root_instruction();
-        TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_root->shape()))
-            << "While body should have same shape as the loop's 'init'. init: "
-            << init->ToString() << ", body: " << body_root->ToString();
+        TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction));
       }
 
       auto previous = instructions.find(instruction->name());
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1dd7ec3c51..1ec55a9bdc 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -102,7 +102,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status CheckTernaryShape(const HloInstruction* instruction);
   Status CheckVariadicShape(const HloInstruction* instruction);
 
-  // Checks if the given two instructions shares the same channel id.
+  // Checks if the given two instructions share the same channel id.
   Status CheckSameChannel(const HloInstruction* instr1,
                           const HloInstruction* instr2);
 
@@ -144,9 +144,11 @@ class HloVerifier : public HloPassInterface {
   // CHECKs various invariants of a fusion instruction.
   Status CheckFusionInstruction(HloInstruction* fusion) const;
 
+  Status CheckWhileInstruction(HloInstruction* instruction);
+
   // Creates a ShapeVerifier that checks that shapes match inferred
-  // expectations.  This is a factory function because ShapeVerifier,  Note that
-  // ShapeVerifier, being a DfsHloVisitor, is stateful.  We want a clean object
+  // expectations. This is a factory function because ShapeVerifier,
+  // being a DfsHloVisitor, is stateful. We want a clean object
   // for each run of the verifier.
   ShapeVerifierFactory shape_verifier_factory_;
 };
-- 
GitLab


From 9e0037513040fd09ee01442bd062936b41bee40c Mon Sep 17 00:00:00 2001
From: SukHwan Kim <30820468+jerry4897@users.noreply.github.com>
Date: Fri, 20 Apr 2018 18:24:52 +0900
Subject: [PATCH 1197/1262] Update c_api_test.cc

Typo
---
 tensorflow/c/c_api_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index ca80db23ed..9b86425aa5 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) {
   TestGradientsError(false);
 }
 
-// REGISTER_OP for CApiTestAttributesTest test cases.
+// REGISTER_OP for CApiAttributesTest test cases.
 // Registers two ops, each with a single attribute called 'v'.
 // The attribute in one op will have a type 'type', the other
 // will have list(type).
-- 
GitLab


From 1ad32703d4e728d8fba835aaf24418f19cf85dbe Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 20 Apr 2018 03:29:31 -0700
Subject: [PATCH 1198/1262] [TF:XLA] Implement ClipByValue.

PiperOrigin-RevId: 193646890
---
 tensorflow/compiler/tests/ternary_ops_test.py | 18 ++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 .../tf2xla/kernels/clip_by_value_op.cc        | 61 +++++++++++++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc

diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py
index ba5f829936..75a2cf07c5 100644
--- a/tensorflow/compiler/tests/ternary_ops_test.py
+++ b/tensorflow/compiler/tests/ternary_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -119,6 +120,23 @@ class TernaryOpsTest(XLATestCase):
           np.array([2, 1], dtype=np.int32),
           expected=np.array([[2], [5]], dtype=dtype))
 
+  def testClipByValue(self):
+    # TODO(b/78258593): enable integer types here too.
+    for dtype in self.float_types:
+      test_cases = [
+          (np.array([2, 4, 5], dtype=dtype), dtype(7)),  #
+          (dtype(1), np.array([2, 4, 5], dtype=dtype)),  #
+          (np.array([-2, 7, 7], dtype=dtype), np.array([-2, 9, 8], dtype=dtype))
+      ]
+      x = np.array([-2, 10, 6], dtype=dtype)
+      for lower, upper in test_cases:
+        self._testTernary(
+            gen_math_ops._clip_by_value,
+            x,
+            lower,
+            upper,
+            expected=np.minimum(np.maximum(x, lower), upper))
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 579b669699..00fd08b1a0 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -21,6 +21,7 @@ tf_kernel_library(
         "cast_op.cc",
         "categorical_op.cc",
         "cholesky_op.cc",
+        "clip_by_value_op.cc",
         "concat_op.cc",
         "const_op.cc",
         "conv_ops.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
new file mode 100644
index 0000000000..fdf75be7b1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class ClipByValueOp : public XlaOpKernel {
+ public:
+  explicit ClipByValueOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape shape = ctx->InputShape(0);
+    const TensorShape min_shape = ctx->InputShape(1);
+    const TensorShape max_shape = ctx->InputShape(2);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+    auto input = ctx->Input(0);
+    auto min = ctx->Input(1);
+    auto max = ctx->Input(2);
+
+    auto shape_error = [&]() -> tensorflow::Status {
+      return errors::InvalidArgument(
+          "clip_value_min and clip_value_max must be either of "
+          "the same shape as input, or a scalar. ",
+          "Input shape: ", shape.DebugString(),
+          " clip_value_min shape: ", min_shape.DebugString(),
+          " clip_value_max shape: ", max_shape.DebugString());
+    };
+
+    if (shape != min_shape) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error());
+      min = builder->Broadcast(min, shape.dim_sizes());
+    }
+    if (shape != max_shape) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error());
+      max = builder->Broadcast(max, shape.dim_sizes());
+    }
+    ctx->SetOutput(0, builder->Clamp(min, input, max));
+  }
+};
+
+REGISTER_XLA_OP(Name("ClipByValue"), ClipByValueOp);
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From 0c03255aa5f4b37de97e0685ffa15888fc16e4b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 06:36:56 -0700
Subject: [PATCH 1199/1262] internal change

PiperOrigin-RevId: 193659701
---
 .../lite/toco/graph_transformations/propagate_fixed_sizes.cc   | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index b34aca1f09..ba244cf5ef 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1516,10 +1516,7 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
     return;
   }
 
-  // The current ArgMax implementation only supports 4-dimensional inputs with
-  // the last dimension as the axis to perform ArgMax for.
   const std::vector<int>& input_dims = input_array.shape().dims();
-  CHECK_EQ(input_dims.size(), 4);
   std::vector<int> output_dims;
 
   output_dims.reserve(input_dims.size() - 1);
-- 
GitLab


From c212d5542bb666b613a8567338983288a3ab15f4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 08:08:01 -0700
Subject: [PATCH 1200/1262] Eliminate the guard around Winograd non-fused
 convolutions with cudnn7.

PiperOrigin-RevId: 193669636
---
 .../fused_conv2d_bias_activation_op.cc        |  3 +-
 .../core/kernels/conv_grad_filter_ops.cc      |  3 +-
 .../core/kernels/conv_grad_input_ops.cc       |  3 +-
 tensorflow/core/kernels/conv_grad_ops_3d.cc   |  8 +++--
 tensorflow/core/kernels/conv_ops.cc           |  3 +-
 tensorflow/core/kernels/conv_ops_3d.cc        |  4 ++-
 tensorflow/core/kernels/conv_ops_gpu.h        | 35 +++++++++++++------
 tensorflow/core/kernels/conv_ops_test.cc      | 26 +++++++++-----
 8 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 0e06575d96..1e8f011b5d 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -543,7 +543,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
                                 fused_conv_parameters, &algorithm_config)) {
     std::vector<dnn::AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
+        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+            stream->parent()),
         &algorithms));
     dnn::ProfileResult best_result;
     dnn::ProfileResult best_result_no_scratch;
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 66ee474ca3..f3b91494b9 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -912,7 +912,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
                                 conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 71ea0d5d72..66d15c6e78 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -961,7 +961,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
                                 conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 3650ab53b2..1234997bc5 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -662,7 +662,9 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
                                    conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
@@ -1029,7 +1031,9 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
                                    conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 88843e4da7..f0888c655f 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -710,7 +710,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 21c84b2a0e..0b7c1524e6 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -396,7 +396,9 @@ struct LaunchConvOp<GPUDevice, T> {
                                   conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index f0085be3a5..7f9cfec981 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -137,20 +137,18 @@ class ConvParameters {
     // clang-format on
   }
 
-  // TODO(yangzihao): The purpose of this function is to disable winograd
-  // nonfused conv algorithm for certain input parameters so as to avoid a bug
-  // in cuDNNv5 and cuDNNv6. Remove this once switch to cuDNNv7.
+  // The purpose of this function is to disable winograd nonfused conv algorithm
+  // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
   template <typename T>
-  bool ShouldIncludeWinogradNonfusedAlgo() const {
-    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
-                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
-                       sizeof(T);
-    int64 threshold = 1LL << 31;
-    if (total_size >= threshold) {
-      return false;
-    } else {
+  bool ShouldIncludeWinogradNonfusedAlgo(
+      perftools::gputools::StreamExecutor* stream_exec) const {
+    // Skip this check for cuDNN 7 and newer.
+    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
+        stream_exec->AsDnn()->GetVersion();
+    if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
     }
+    return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
   }
 
  protected:
@@ -166,6 +164,21 @@ class ConvParameters {
   uint64 hash_code_;
 
  private:
+  friend struct ConvParametersPeer;  // For testing purposes.
+
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
+    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
+                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
+                       sizeof(T);
+    int64 threshold = 1LL << 31;
+    if (total_size >= threshold) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
   int64 batch_;
   int64 in_depths_;
   int64 out_depths_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index e2e166c02f..8afe6a2cbd 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -22,20 +22,28 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
 
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
-
 namespace tensorflow {
 
 #if GOOGLE_CUDA
 
+struct ConvParametersPeer {
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() {
+    return params.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
+  }
+
+  ConvParameters params;
+};
+
 TEST(ConvParameters, WinogradNonfusedAlgoSize) {
-  ConvParameters conv_params_small = {
+  ConvParametersPeer conv_params_small = {{
       1,         // batch
       32,        // in_depths
       {{300,     // in_rows
@@ -51,10 +59,11 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
         0}},     // padding_cols
       DT_FLOAT,  // tensor datatype
       0,         // device_id
-  };
-  EXPECT_TRUE(conv_params_small.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_TRUE(
+      conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 
-  ConvParameters conv_params_large = {
+  ConvParametersPeer conv_params_large = {{
       1,         // batch
       128,       // in_depths
       {{300,     // in_rows
@@ -70,8 +79,9 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
         0}},     // padding_cols
       DT_FLOAT,  // tensor datatype
       0,         // device_id
-  };
-  EXPECT_FALSE(conv_params_large.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_FALSE(
+      conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 }
 
 #endif  // GOOGLE_CUDA
-- 
GitLab


From 814ab7e37dcbfa7f4749a1fd9d687d6be0207cb8 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 20 Apr 2018 09:20:36 -0700
Subject: [PATCH 1201/1262] [TF:XLA] Bump open source llvm revision to r330313

PiperOrigin-RevId: 193678317
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d7bd2a2be0..aeaf8d7a24 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32.tar.gz",
       ],
-      sha256 = "017d7db029cc175634d75416c326770139c76590575ed44a3794c11ab160c955",
-      strip_prefix = "llvm-3210e64b499a31193051208f2f8922dadfc4bb6f",
+      sha256 = "92b7c01074f694a77b4d664951d1ec071e30ef19c61e673158e95fbb6e447b54",
+      strip_prefix = "llvm-c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From d0e3e998376f5e7d59678e5d42f3497e52ca7622 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Fri, 20 Apr 2018 09:23:52 -0700
Subject: [PATCH 1202/1262] Fix msan error in MapAndBatchDataset. While
 checkpointing tensors in BatchResult.output save only the initialized slice.
 If the final batch is short, the entire batch tensor may not be initialized.

PiperOrigin-RevId: 193678679
---
 .../kernels/data/map_and_batch_dataset_op.cc  | 44 +++++++++++++++----
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index b8105552a0..605ef3c0b7 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -331,7 +331,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         CHECK_EQ(batch_results_.size(), batch_results_size);
         for (size_t i = 0; i < batch_results_size; ++i) {
-          TF_RETURN_IF_ERROR(ReadBatchResultLocked(reader, i));
+          TF_RETURN_IF_ERROR(ReadBatchResultLocked(ctx, reader, i));
         }
         return Status::OK();
       }
@@ -573,7 +573,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // finish. This may delay saving a checkpoint by a bit but keeps the
         // code clean and also saves us from checkpointing the state of the
         // `BlockingCounter`.
-        batch_results_[index].counter->Wait();
+        int64 num_elements = 0;
+        WaitForBatch(index, &num_elements).IgnoreError();
+
         const BatchResult& result = batch_results_[index];
         string prefix = strings::StrCat("batch_results_", index);
         {
@@ -587,14 +589,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             full_name(strings::StrCat(prefix, "_output_size")),
             result.output.size()));
         for (size_t i = 0; i < result.output.size(); i++) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_output_", i)),
-              result.output[i]));
+          // If the batch is not full, we only store the first
+          // `num_elements` values. The rest of the batch tensor is
+          // *uninitialized* and accessing that will raise msan errors.
+          if (num_elements < dataset()->batch_size_) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result.output[i].Slice(0, num_elements)));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result.output[i]));
+          }
         }
         return Status::OK();
       }
 
-      Status ReadBatchResultLocked(IteratorStateReader* reader, size_t index)
+      Status ReadBatchResultLocked(IteratorContext* ctx,
+                                   IteratorStateReader* reader, size_t index)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         BatchResult* result = &batch_results_[index];
         string prefix = strings::StrCat("batch_results_", index);
@@ -618,10 +630,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         result->output.reserve(output_size);
         for (size_t i = 0; i < output_size; i++) {
-          result->output.emplace_back();
+          Tensor t;
           TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(prefix, "_output_", i)),
-              &result->output.back()));
+              full_name(strings::StrCat(prefix, "_output_", i)), &t));
+          // If the batch was not full, we may have stored only the relevant
+          // slice. Since tensors in `BatchResult.output` are expected to
+          // have the leading dimension of size batch_size, we build a larger
+          // tensor and copy the slice read from the checkpoint into it.
+          if (t.dim_size(0) < dataset()->batch_size_) {
+            TensorShape component_shape(t.shape());
+            component_shape.set_dim(0, dataset()->batch_size_);
+            AllocatorAttributes attr;
+            attr.set_gpu_compatible(true);
+            Tensor new_t(ctx->allocator(attr), t.dtype(), component_shape);
+            TF_RETURN_IF_ERROR(CopyPartialBatch(&new_t, t, t.dim_size(0)));
+            result->output.emplace_back(std::move(new_t));
+          } else {
+            result->output.emplace_back(std::move(t));
+          }
         }
         return Status::OK();
       }
-- 
GitLab


From cd462f39e58674a43d1f8c156f23235722b2281e Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Fri, 20 Apr 2018 09:31:08 -0700
Subject: [PATCH 1203/1262] Don't delete inbound_nodes and outbound_nodes,
 these no longer exist.

PiperOrigin-RevId: 193679512
---
 tensorflow/tools/docs/generate.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index c750539a76..fc93085e3e 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -43,10 +43,6 @@ if __name__ == '__main__':
 
   flags = doc_generator.parse_known_args()
 
-  # Suppress documentation of some symbols that users should never use.
-  del tf.layers.Layer.inbound_nodes
-  del tf.layers.Layer.outbound_nodes
-
   # tf_debug is not imported with tf, it's a separate module altogether
   doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
 
-- 
GitLab


From fb23c0e166179ccf372203982d8fe79de441e360 Mon Sep 17 00:00:00 2001
From: James Keeling <jtkeeling@google.com>
Date: Fri, 20 Apr 2018 09:54:50 -0700
Subject: [PATCH 1204/1262] Correct error in "Adding An Op" docs.

The macro `REGISTER_KERNEL_BUILDER` always declared a functor specialized on floats, instead of the type actually passed into the macro.

PiperOrigin-RevId: 193682519
---
 tensorflow/docs_src/extend/adding_an_op.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 84da2165b5..c3795492ce 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -267,7 +267,7 @@ REGISTER_CPU(int32);
 #ifdef GOOGLE_CUDA
 #define REGISTER_GPU(T)                                          \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
-  extern template ExampleFunctor<GPUDevice, float>;              \
+  extern template ExampleFunctor<GPUDevice, T>;                  \
   REGISTER_KERNEL_BUILDER(                                       \
       Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       ExampleOp<GPUDevice, T>);
-- 
GitLab


From a749a6b95932d6f7438a01a2f5fd661343ad536f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 10:16:03 -0700
Subject: [PATCH 1205/1262] Change the TF record reader to use 16MB buffering
 by default in order to improve performance.

PiperOrigin-RevId: 193685521
---
 tensorflow/python/lib/io/py_record_reader.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/lib/io/py_record_reader.cc b/tensorflow/python/lib/io/py_record_reader.cc
index 5fcb51b3b2..9500fc6a7c 100644
--- a/tensorflow/python/lib/io/py_record_reader.cc
+++ b/tensorflow/python/lib/io/py_record_reader.cc
@@ -43,9 +43,10 @@ PyRecordReader* PyRecordReader::New(const string& filename, uint64 start_offset,
   reader->offset_ = start_offset;
   reader->file_ = file.release();
 
+  static const uint64 kReaderBufferSize = 16 * 1024 * 1024;
   RecordReaderOptions options =
       RecordReaderOptions::CreateRecordReaderOptions(compression_type_string);
-
+  options.buffer_size = kReaderBufferSize;
   reader->reader_ = new RecordReader(reader->file_, options);
   return reader;
 }
-- 
GitLab


From 729192823935156ae29d7f0d5f64c0bcd6034c7a Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Fri, 20 Apr 2018 10:32:24 -0700
Subject: [PATCH 1206/1262] Adding Shape inference functions to outfeed enqueue
 ops.

PiperOrigin-RevId: 193688099
---
 tensorflow/contrib/tpu/ops/outfeed_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index 5900c61a38..b05c76ca64 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -26,6 +26,7 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits a single Tensor value from an XLA computation.
 
@@ -36,6 +37,7 @@ REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits multiple Tensor values from an XLA computation.
 
-- 
GitLab


From da5a6d86b856001c03cccace5ac74fa8f045b6ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 10:34:49 -0700
Subject: [PATCH 1207/1262] Disable constant folding and arithmetic
 optimizations for functions.

PiperOrigin-RevId: 193688466
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 22799311bc..cdc4698c34 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -243,6 +243,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::unordered_set<string> optimized_funcs;
   bool optimize_function_library = true;
 
+  // TODO(ezhulenev): turn it on after fixing ranklab: tune_tf_test.
+  cfg_.set_constant_folding(RewriterConfig::OFF);
+  cfg_.set_arithmetic_optimization(RewriterConfig::OFF);
+
   while (optimize_function_library) {
     optimize_function_library = false;
 
-- 
GitLab


From b3f379e907259aa166c1ef734ccfd03331eb0a94 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 20 Apr 2018 11:10:56 -0700
Subject: [PATCH 1208/1262] [XLA:CPU] Use Eigen for F64 dot operations

PiperOrigin-RevId: 193694613
---
 tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc | 3 ++-
 tensorflow/compiler/xla/service/cpu/ir_emitter.cc     | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 29afd8ea5f..495fecc4aa 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -1070,7 +1070,8 @@ static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   // 1) be matrices with no padding, and
   // 2) have an allowed element type.
   PrimitiveType output_primitive_type = output_shape.element_type();
-  return (output_primitive_type == F32 || output_primitive_type == F16) &&
+  return (output_primitive_type == F64 || output_primitive_type == F32 ||
+          output_primitive_type == F16) &&
          IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
          IsRank2WithNoPadding(output_shape);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 3405277d44..f990ee2785 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -2076,7 +2076,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
 
     TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
         /*instruction=*/*root, /*operands=*/{lhs, rhs},
-        /*supported_types=*/{F16, F32}));
+        /*supported_types=*/{F16, F32, F64}));
 
     llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
     llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
-- 
GitLab


From 49f3469d9533cb12d06ed3907b4ced975e2fcea4 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 20 Apr 2018 11:13:16 -0700
Subject: [PATCH 1209/1262] Use CreateWorkerSession and DeleteWorkerSession for
 all distributed sessions.

This change adds a phase to the session creation protocol: the master now contacts all workers to register a session handle and create a "WorkerSession" on each worker before it first registers or runs a graph on any worker. Subsequent requests to a worker ensure that the worker has the session handle registered before performing the request, and an AbortedError is raised if the worker has not (e.g. because it restarted after a failure).

As a result, more failure cases are covered by the high-level APIs (tf.estimator, Slim, etc.) that recreate the session on receiving an AbortedError. Previously, there was a possible race condition in which a PS task could restart between variable initialization and the first step, leading to a FailedPreconditionError ("Attempting to use uninitialized value") that would not be handled by the high-level APIs.

PiperOrigin-RevId: 193694958
---
 .../core/distributed_runtime/master_session.cc     | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index ebe350d313..1c67b42e76 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -89,6 +89,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
+    } else {
+      for (Part& part : partitions_) {
+        worker_cache_->ReleaseWorker(part.name, part.worker);
+      }
     }
   }
 
@@ -1174,14 +1178,8 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  // TODO(b/36574172): Remove these conditions when ClusterSpec
-  // propagation is supported in all servers.
-  if (options.cluster_def != nullptr ||
-      session_opts_.config.isolate_session_state()) {
-    should_delete_worker_sessions_ = true;
-    return CreateWorkerSessions(options);
-  }
-  return Status::OK();
+  should_delete_worker_sessions_ = true;
+  return CreateWorkerSessions(options);
 }
 
 Status MasterSession::CreateWorkerSessions(
-- 
GitLab


From 570d90b9c7e6a19bc2606fdaf7ad0f85b8590c0e Mon Sep 17 00:00:00 2001
From: akindyakov <akindyakov@gmail.com>
Date: Fri, 20 Apr 2018 11:23:15 -0700
Subject: [PATCH 1210/1262] Speed up safe_strtod and safe_strtof functions by
 using double-conversion library Closes #12102.

PiperOrigin-RevId: 193696537
---
 tensorflow/contrib/cmake/CMakeLists.txt       |  4 +
 .../cmake/external/double_conversion.cmake    | 54 ++++++++++++
 tensorflow/contrib/makefile/Makefile          |  8 +-
 .../contrib/makefile/download_dependencies.sh |  4 +-
 tensorflow/core/BUILD                         |  9 +-
 tensorflow/core/lib/strings/numbers.cc        | 51 +++++++----
 tensorflow/core/lib/strings/numbers.h         |  2 +
 tensorflow/core/lib/strings/numbers_test.cc   | 87 +++++++++++++++++++
 tensorflow/core/lib/strings/str_util.cc       |  8 ++
 tensorflow/core/lib/strings/str_util.h        |  5 ++
 tensorflow/core/lib/strings/str_util_test.cc  | 56 ++----------
 tensorflow/tools/lib_package/BUILD            |  2 +
 tensorflow/tools/pip_package/BUILD            |  1 +
 tensorflow/workspace.bzl                      | 10 +++
 third_party/double_conversion.BUILD           | 38 ++++++++
 15 files changed, 270 insertions(+), 69 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/external/double_conversion.cmake
 create mode 100644 third_party/double_conversion.BUILD

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 23b31ae1dc..bdf3e98635 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -193,6 +193,7 @@ include(protobuf)
 include(re2)
 include(cub)
 include(sqlite)
+include(double_conversion)
 if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
@@ -213,6 +214,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${protobuf_STATIC_LIBRARIES}
     ${re2_STATIC_LIBRARIES}
     ${sqlite_STATIC_LIBRARIES}
+    ${double_conversion_STATIC_LIBRARIES}
 )
 
 if (systemlib_ZLIB)
@@ -240,6 +242,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     fft2d
     re2
     sqlite_copy_headers_to_destination
+    double_conversion
 )
 
 include_directories(
@@ -262,6 +265,7 @@ include_directories(
     ${PROTOBUF_INCLUDE_DIRS}
     ${re2_INCLUDE_DIR}
     ${sqlite_INCLUDE_DIR}
+    ${double_conversion_INCLUDE_DIR}
 )
 
 if(tensorflow_ENABLE_SSL_SUPPORT)
diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake
new file mode 100644
index 0000000000..527ccdc8d8
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/double_conversion.cmake
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion)
+set(double_conversion_URL https://github.com/google/double-conversion.git)
+set(double_conversion_TAG 5664746)
+set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR})
+set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so)
+set(double_conversion_INCLUDES ${double_conversion_BUILD})
+
+if(WIN32)
+  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib)
+else()
+  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a)
+endif()
+
+set(double_conversion_HEADERS
+    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h"
+    "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h"
+)
+
+ExternalProject_Add(double_conversion
+    PREFIX double_conversion
+    GIT_REPOSITORY ${double_conversion_URL}
+    GIT_TAG ${double_conversion_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND ""
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 05e8d9064b..1a1ab54a53 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -89,6 +89,7 @@ HOST_INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -125,7 +126,9 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text
 # The list of dependencies is derived from the Bazel build file by running
 # the gen_file_lists.sh script on a system with a working Bazel setup.
 PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt)
-PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt)
+PROTO_TEXT_PB_CC_LIST := \
+	$(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \
+	$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc)
 PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt)
 
 # Locations of the intermediate files proto_text generates.
@@ -171,6 +174,7 @@ INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -326,6 +330,7 @@ $(MARCH_OPTION) \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -603,6 +608,7 @@ $(wildcard tensorflow/core/platform/*/*.cc) \
 $(wildcard tensorflow/core/platform/*/*/*.cc) \
 $(wildcard tensorflow/core/util/*.cc) \
 $(wildcard tensorflow/core/util/*/*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 8b415e6527..48953e2e38 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -32,7 +32,8 @@ GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.g
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
+FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 
@@ -87,6 +88,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
 download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
+download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
 download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c15e7de186..5b04574a4f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -337,7 +337,9 @@ cc_library(
         "lib/bfloat16/bfloat16.h",
     ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
     copts = tf_copts(),
-    deps = tf_lib_proto_parsing_deps(),
+    deps = tf_lib_proto_parsing_deps() + [
+        "@double_conversion//:double-conversion",
+    ],
 )
 
 # This build rule (along with :lib_internal, :framework, and
@@ -1231,6 +1233,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1270,6 +1273,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1333,6 +1337,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1355,6 +1360,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1751,6 +1757,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
         "@zlib_archive//:zlib",
+        "@double_conversion//:double-conversion",
         "@protobuf_archive//:protobuf",
     ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
 )
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index c296daa95d..e4b909296e 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
+#include "double-conversion/double-conversion.h"
+
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
@@ -110,6 +112,17 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
   return result;
 }
 
+static inline const double_conversion::StringToDoubleConverter&
+StringToFloatConverter() {
+  static const double_conversion::StringToDoubleConverter converter(
+      double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES |
+          double_conversion::StringToDoubleConverter::ALLOW_HEX |
+          double_conversion::StringToDoubleConverter::ALLOW_TRAILING_SPACES |
+          double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY,
+      0., 0., "inf", "nan");
+  return converter;
+}
+
 }  // namespace
 
 namespace strings {
@@ -319,25 +332,31 @@ bool safe_strtou32(StringPiece str, uint32* value) {
 }
 
 bool safe_strtof(const char* str, float* value) {
-  const char* endptr;
-  *value = locale_independent_strtonum<float>(str, &endptr);
-  while (isspace(*endptr)) ++endptr;
-  // Ignore range errors from strtod/strtof.
-  // The values it returns on underflow and
-  // overflow are the right fallback in a
-  // robust setting.
-  return *str != '\0' && *endptr == '\0';
+  int processed_characters_count = -1;
+  auto len = str_util::Strnlen(str, kFastToBufferSize);
+
+  // If there is no zero-termination in str, fail.
+  if (len == kFastToBufferSize) return false;
+  // If string length exceeds int max, fail.
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToFloat(str, static_cast<int>(len),
+                                                  &processed_characters_count);
+  return processed_characters_count > 0;
 }
 
 bool safe_strtod(const char* str, double* value) {
-  const char* endptr;
-  *value = locale_independent_strtonum<double>(str, &endptr);
-  while (isspace(*endptr)) ++endptr;
-  // Ignore range errors from strtod/strtof.
-  // The values it returns on underflow and
-  // overflow are the right fallback in a
-  // robust setting.
-  return *str != '\0' && *endptr == '\0';
+  int processed_characters_count = -1;
+  auto len = str_util::Strnlen(str, kFastToBufferSize);
+
+  // If there is no zero-termination in str, fail.
+  if (len == kFastToBufferSize) return false;
+  // If string length exceeds int max, fail.
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToDouble(str, static_cast<int>(len),
+                                                   &processed_characters_count);
+  return processed_characters_count > 0;
 }
 
 size_t FloatToBuffer(float value, char* buffer) {
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 6b7703be37..e9add42849 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -114,11 +114,13 @@ bool safe_strtou64(StringPiece str, uint64* value);
 // Convert strings to floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
+// Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtof(const char* str, float* value);
 
 // Convert strings to double precision floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
+// Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtod(const char* str, double* value);
 
 inline bool ProtoParseNumeric(StringPiece s, int32* value) {
diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc
index e15161de66..0f22dac262 100644
--- a/tensorflow/core/lib/strings/numbers_test.cc
+++ b/tensorflow/core/lib/strings/numbers_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/numbers.h"
 
+#include <cmath>
 #include <string>
 #include "tensorflow/core/platform/test.h"
 
@@ -277,7 +278,49 @@ TEST(safe_strtof, Float) {
   EXPECT_TRUE(safe_strtof("-0x2A", &result));
   EXPECT_EQ(-42.0f, result);
 
+  EXPECT_TRUE(safe_strtof(" -0x2", &result));
+  EXPECT_EQ(-2.0f, result);
+
+  EXPECT_TRUE(safe_strtof("8 \t", &result));
+  EXPECT_EQ(8.0f, result);
+
+  EXPECT_TRUE(safe_strtof("\t20.0\t ", &result));
+  EXPECT_EQ(20.0f, result);
+
   EXPECT_FALSE(safe_strtof("-infinity is awesome", &result));
+
+  // Make sure we exit cleanly if the string is not terminated
+  char test_str[2 * kFastToBufferSize];
+  for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
+  EXPECT_FALSE(safe_strtof(test_str, &result));
+
+  // Make sure we exit cleanly if the string is too long
+  test_str[kFastToBufferSize + 1] = '\0';
+  EXPECT_FALSE(safe_strtof(test_str, &result));
+
+  EXPECT_TRUE(safe_strtof("-inf", &result));
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("+inf", &result));
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("InF", &result));
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("-INF", &result));
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtof("nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("-nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("-NaN", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtof("+NAN", &result));
+  EXPECT_TRUE(std::isnan(result));
 }
 
 TEST(safe_strtod, Double) {
@@ -287,6 +330,15 @@ TEST(safe_strtod, Double) {
   EXPECT_EQ(0.1234567890123, result);
   EXPECT_FALSE(safe_strtod("0.1234567890123abc", &result));
 
+  // Make sure we exit cleanly if the string is not terminated
+  char test_str[2 * kFastToBufferSize];
+  for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
+  EXPECT_FALSE(safe_strtod(test_str, &result));
+
+  // Make sure we exit cleanly if the string is too long
+  test_str[kFastToBufferSize + 1] = '\0';
+  EXPECT_FALSE(safe_strtod(test_str, &result));
+
   // Overflow to infinity, underflow to 0.
   EXPECT_TRUE(safe_strtod("1e310", &result));
   EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
@@ -296,6 +348,41 @@ TEST(safe_strtod, Double) {
 
   EXPECT_TRUE(safe_strtod("1e-325", &result));
   EXPECT_EQ(0, result);
+
+  EXPECT_TRUE(safe_strtod(" -0x1c", &result));
+  EXPECT_EQ(-28.0, result);
+
+  EXPECT_TRUE(safe_strtod("50 \t", &result));
+  EXPECT_EQ(50.0, result);
+
+  EXPECT_TRUE(safe_strtod("\t82.0\t ", &result));
+  EXPECT_EQ(82.0, result);
+
+  EXPECT_FALSE(safe_strtod("infinity", &result));
+
+  EXPECT_TRUE(safe_strtod("-inf", &result));
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("+inf", &result));
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("InF", &result));
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("-INF", &result));
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
+
+  EXPECT_TRUE(safe_strtod("nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("-nan", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("-NaN", &result));
+  EXPECT_TRUE(std::isnan(result));
+
+  EXPECT_TRUE(safe_strtod("+NAN", &result));
+  EXPECT_TRUE(std::isnan(result));
 }
 
 }  // namespace strings
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index 2c9e98357a..4598b8ccc7 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -454,6 +454,14 @@ bool SplitAndParseAsFloats(StringPiece text, char delim,
                                     result);
 }
 
+size_t Strnlen(const char* str, const size_t string_max_len) {
+  size_t len = 0;
+  while (len < string_max_len && str[len] != '\0') {
+    ++len;
+  }
+  return len;
+}
+
 bool StrContains(StringPiece haystack, StringPiece needle) {
   return std::search(haystack.begin(), haystack.end(), needle.begin(),
                      needle.end()) != haystack.end();
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index 065871c1b4..e97d00b975 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -223,6 +223,11 @@ std::vector<string> Split(StringPiece text, char delims, Predicate p) {
   return Split(text, StringPiece(&delims, 1), p);
 }
 
+// Returns the length of the given null-terminated byte string 'str'.
+// Returns 'string_max_len' if the null character was not found in the first
+// 'string_max_len' bytes of 'str'.
+size_t Strnlen(const char* str, const size_t string_max_len);
+
 }  // namespace str_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 63643c3e8e..3bf3e99825 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -430,56 +430,12 @@ TEST(StringReplace, EmptyStringReplaceAll) {
   EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
 }
 
-TEST(StartsWith, Basic) {
-  const string s1(
-      "123"
-      "\0"
-      "456",
-      7);
-  const StringPiece a("foobar");
-  const StringPiece b(s1);
-  const StringPiece e;
-  EXPECT_TRUE(str_util::StartsWith(a, a));
-  EXPECT_TRUE(str_util::StartsWith(a, "foo"));
-  EXPECT_TRUE(str_util::StartsWith(a, e));
-  EXPECT_TRUE(str_util::StartsWith(b, s1));
-  EXPECT_TRUE(str_util::StartsWith(b, b));
-  EXPECT_TRUE(str_util::StartsWith(b, e));
-  EXPECT_TRUE(str_util::StartsWith(e, ""));
-  EXPECT_FALSE(str_util::StartsWith(a, b));
-  EXPECT_FALSE(str_util::StartsWith(b, a));
-  EXPECT_FALSE(str_util::StartsWith(e, a));
-}
-
-TEST(EndsWith, Basic) {
-  const string s1(
-      "123"
-      "\0"
-      "456",
-      7);
-  const StringPiece a("foobar");
-  const StringPiece b(s1);
-  const StringPiece e;
-  EXPECT_TRUE(str_util::EndsWith(a, a));
-  EXPECT_TRUE(str_util::EndsWith(a, "bar"));
-  EXPECT_TRUE(str_util::EndsWith(a, e));
-  EXPECT_TRUE(str_util::EndsWith(b, s1));
-  EXPECT_TRUE(str_util::EndsWith(b, b));
-  EXPECT_TRUE(str_util::EndsWith(b, e));
-  EXPECT_TRUE(str_util::EndsWith(e, ""));
-  EXPECT_FALSE(str_util::EndsWith(a, b));
-  EXPECT_FALSE(str_util::EndsWith(b, a));
-  EXPECT_FALSE(str_util::EndsWith(e, a));
-}
-
-TEST(StrContains, Basic) {
-  StringPiece a("abcdefg");
-  StringPiece b("abcd");
-  StringPiece c("efg");
-  StringPiece d("gh");
-  EXPECT_TRUE(str_util::StrContains(a, b));
-  EXPECT_TRUE(str_util::StrContains(a, c));
-  EXPECT_TRUE(!str_util::StrContains(a, d));
+TEST(Strnlen, Basic) {
+  EXPECT_EQ(0, str_util::Strnlen("ab", 0));
+  EXPECT_EQ(1, str_util::Strnlen("a", 1));
+  EXPECT_EQ(2, str_util::Strnlen("abcd", 2));
+  EXPECT_EQ(3, str_util::Strnlen("abc", 10));
+  EXPECT_EQ(4, str_util::Strnlen("a \t\n", 10));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 0ede8c6370..569b6678ca 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -118,6 +118,7 @@ genrule(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
@@ -155,6 +156,7 @@ genrule(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 0ac5a5bb6d..7b508f87ab 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -128,6 +128,7 @@ filegroup(
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
+        "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index aeaf8d7a24..bbef4b9e5f 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -693,6 +693,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
   )
 
+  native.new_http_archive(
+      name = "double_conversion",
+      urls = [
+          "https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip",
+      ],
+      sha256 = "2f7fbffac0d98d201ad0586f686034371a6d152ca67508ab611adc2386ad30de",
+      strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8",
+      build_file = clean_dep("//third_party:double_conversion.BUILD")
+  )
+
   tf_http_archive(
       name = "tflite_mobilenet",
       sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
diff --git a/third_party/double_conversion.BUILD b/third_party/double_conversion.BUILD
new file mode 100644
index 0000000000..9f905216c0
--- /dev/null
+++ b/third_party/double_conversion.BUILD
@@ -0,0 +1,38 @@
+# Bazel(http://bazel.io) BUILD file
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "double-conversion",
+    srcs = [
+        "double-conversion/bignum.cc",
+        "double-conversion/bignum-dtoa.cc",
+        "double-conversion/cached-powers.cc",
+        "double-conversion/diy-fp.cc",
+        "double-conversion/double-conversion.cc",
+        "double-conversion/fast-dtoa.cc",
+        "double-conversion/fixed-dtoa.cc",
+        "double-conversion/strtod.cc",
+        "double-conversion/utils.h",
+    ],
+    hdrs = [
+        "double-conversion/bignum.h",
+        "double-conversion/bignum-dtoa.h",
+        "double-conversion/cached-powers.h",
+        "double-conversion/diy-fp.h",
+        "double-conversion/double-conversion.h",
+        "double-conversion/fast-dtoa.h",
+        "double-conversion/fixed-dtoa.h",
+        "double-conversion/ieee.h",
+        "double-conversion/strtod.h",
+    ],
+    includes = [
+        ".",
+    ],
+    linkopts = [
+        "-lm",
+    ],
+    visibility = ["//visibility:public"],
+)
-- 
GitLab


From 5fbb1feecd77a70b32d333b56bd13b1798b9a766 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Fri, 20 Apr 2018 11:23:29 -0700
Subject: [PATCH 1211/1262] Temporarily set cudnn Rnn math precision to fp32.

Problem:
When calling cudnnGetRNNLinLayerMatrixParams(), return error CUDNN_STATUS_BAD_PARAM if:

* RNN descriptor set math precision = CUDNN_DATA_FLOAT
* input descriptor dataType = CUDNN_DATA_HALF
* weight descriptor dataType= CUDNN_DATA_HALF

If updating Rnn descriptor math precision to CUDNN_DATA_HALF, then no error.

cudnn 7.1.4 will fix the problem.

PiperOrigin-RevId: 193696566
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index d673e19007..640f270323 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -2529,12 +2529,20 @@ cudnnDataType_t GetConvComputeType<double>() {
 }
 
 // A helper struct to decide whether to use FP32 as the internal compute type
-// for rnn when the input data type is FP16. By default it is turned on,
-// users can explicitly disable them (choose to use FP16 as the internal compute
-// type) through an env-var "TF_FP16_RNN_USE_FP32_COMPUTE=0".
+// for rnn when the input data type is FP16. At present it is turned off,
+// users can explicitly control them through an env-var
+// TF_FP16_RNN_USE_FP32_COMPUTE.
+// After the TODO below is fixed, users should almost always use fp32 compute
+// type for training. Using fp16 might suffer suboptimal accuracy due to loss
+// in precision.
 struct RnnDoFP32ComputationFP16Input {
   static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
-  static constexpr bool kDefaultFlag = true;
+  // TODO(jamesqin): b/78182362 flip to true when cudnn 7.1.4 fixes the bug.
+  // Before cudnn 7.1.4 RNN are always done in fp32, no matter what math
+  // precision is set.
+  // Set it temporary to false s.t. no error is raised when using fp16 inputs,
+  // fp32 math precision.
+  static constexpr bool kDefaultFlag = false;
 };
 
 // A helper function to return the internal compute type for
-- 
GitLab


From 712bbc5d7babd523951445f361f0e339061cd259 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 20 Apr 2018 11:24:53 -0700
Subject: [PATCH 1212/1262] Allow creating tensors from numpy arrays, and other
 various constants - try #2

Allow type-inference from a different input tensor, similar to args_to_matching_eager.

- Update TFE_Py_TensorShapeSlice to take tuples.
- Update int values to allow int/long in py2
END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 192184809

PiperOrigin-RevId: 193696790
---
 tensorflow/python/eager/pywrap_tensor.cc  | 201 ++++++++--------
 tensorflow/python/eager/pywrap_tensor.h   |  10 +
 tensorflow/python/eager/pywrap_tfe.h      |  12 +-
 tensorflow/python/eager/pywrap_tfe_src.cc | 278 +++++++++++++++++++---
 tensorflow/python/eager/tensor_test.py    |   7 +-
 tensorflow/python/framework/ops.py        |  16 ++
 6 files changed, 389 insertions(+), 135 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 519814b979..b5b4e394e3 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -60,42 +60,6 @@ TFE_TensorHandle* NumpyToTensorHandle(PyObject* obj) {
   }
 }
 
-// Casts data referred to by `handle` from type `src_type_enum` to type
-// `dst_type_enum`.
-TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
-                            TF_DataType src_type_enum,
-                            TF_DataType dst_type_enum, TF_Status* out_status) {
-  if (ctx == nullptr) return nullptr;
-  const char* op_name = "Cast";
-  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
-  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
-#define RETURN_ERROR  \
-  {                   \
-    TFE_DeleteOp(op); \
-    return nullptr;   \
-  }
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetDevice(op, device_name, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpAddInput(op, handle, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
-  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
-  TFE_TensorHandle* output = nullptr;
-  int num_outputs = 1;
-  TFE_Execute(op, &output, &num_outputs, out_status);
-  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
-      output == nullptr) {
-    if (output != nullptr) {
-      TFE_DeleteTensorHandle(output);
-    }
-    RETURN_ERROR
-  }
-  TFE_DeleteOp(op);
-  return output;
-#undef RETURN_ERROR
-}
-
 TFE_TensorHandle* CopyToDevice(TFE_TensorHandle* handle, PyObject* ctx,
                                PyObject* dev) {
   const char* device = "";
@@ -161,6 +125,100 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 }  // namespace
 
+namespace tensorflow {
+// Casts data referred to by `handle` from type `src_type_enum` to type
+// `dst_type_enum`.
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status) {
+  if (ctx == nullptr) return nullptr;
+  const char* op_name = "Cast";
+  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
+#define RETURN_ERROR  \
+  {                   \
+    TFE_DeleteOp(op); \
+    return nullptr;   \
+  }
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetDevice(op, device_name, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpAddInput(op, handle, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
+  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
+  TFE_TensorHandle* output = nullptr;
+  int num_outputs = 1;
+  TFE_Execute(op, &output, &num_outputs, out_status);
+  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
+      output == nullptr) {
+    if (output != nullptr) {
+      TFE_DeleteTensorHandle(output);
+    }
+    RETURN_ERROR
+  }
+  TFE_DeleteOp(op);
+  return output;
+#undef RETURN_ERROR
+}
+
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
+  int desired_dtype = -1;
+  if (dtype != Py_None) {
+    if (!PyIntToDataType(dtype, &desired_dtype)) {
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expecting a DataType value for dtype. Got ",
+                          Py_TYPE(dtype)->tp_name)
+                          .c_str());
+      return nullptr;
+    }
+  }
+  if (PyArray_Check(value)) {
+    int desired_np_dtype = -1;
+    if (desired_dtype >= 0) {
+      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
+               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
+               .ok()) {
+        PyErr_SetString(PyExc_TypeError,
+                        tensorflow::strings::StrCat(
+                            "Invalid dtype argument value ", desired_dtype)
+                            .c_str());
+        return nullptr;
+      }
+    }
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+    int current_np_dtype = PyArray_TYPE(array);
+    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
+    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
+        !PyArray_ISCARRAY(array)) {
+      int new_dtype =
+          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
+      safe_value = tensorflow::make_safe(
+          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
+                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
+      if (PyErr_Occurred()) return nullptr;
+      if (safe_value == nullptr) {
+        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
+        return nullptr;
+      }
+      value = safe_value.get();
+    }
+    return NumpyToTensorHandle(value);
+  } else {
+    tensorflow::Tensor t;
+    // TODO(josh11b): Have PySeqToTensor set python errors instead of
+    // returning Status.
+    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
+    if (!cppstatus.ok()) {
+      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
+      return nullptr;
+    }
+    return TFE_NewTensorHandle(t);
+  }
+}
+}  // namespace tensorflow
+
 extern "C" {
 
 static const int kMaxEagerTensorParentSize = 64;
@@ -230,61 +288,16 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       return -1;
     }
   }
-  tensorflow::Safe_TFE_TensorHandlePtr handle =
-      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(nullptr));
   PyErr_Clear();
-  if (PyArray_Check(value)) {
-    int desired_np_dtype = -1;
-    if (desired_dtype >= 0) {
-      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
-               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
-               .ok()) {
-        PyErr_SetString(PyExc_TypeError,
-                        tensorflow::strings::StrCat(
-                            "Invalid dtype argument value ", desired_dtype)
-                            .c_str());
-        return -1;
-      }
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
-    int current_np_dtype = PyArray_TYPE(array);
-    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
-    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
-        !PyArray_ISCARRAY(array)) {
-      int new_dtype =
-          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
-      safe_value = tensorflow::make_safe(
-          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
-                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
-      if (PyErr_Occurred()) return -1;
-      if (safe_value == nullptr) {
-        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
-        return -1;
-      }
-      value = safe_value.get();
-    }
-    handle = tensorflow::make_safe(NumpyToTensorHandle(value));
-  } else {
-    tensorflow::Tensor t;
-    // TODO(josh11b): Have PySeqToTensor set python errors instead of
-    // returning Status.
-    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
-    if (!cppstatus.ok()) {
-      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
-      return -1;
-    }
-    handle = tensorflow::make_safe(TFE_NewTensorHandle(t));
-  }
-  if (PyErr_Occurred()) return -1;
-  if (handle == nullptr) {
-    PyErr_SetString(PyExc_ValueError, "Error while creating an EagerTensor");
-    return -1;
-  }
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(value, dtype)));
+  if (handle == nullptr) return -1;
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
-    handle = tensorflow::make_safe(
-        EagerCast(GetContext(context), handle.get(), handle_dtype,
-                  static_cast<TF_DataType>(desired_dtype), self->status));
+    handle = tensorflow::make_safe(tensorflow::EagerCast(
+        GetContext(context), handle.get(), handle_dtype,
+        static_cast<TF_DataType>(desired_dtype), self->status));
     if (TF_GetCode(self->status) != TF_OK) {
       PyErr_SetString(PyExc_ValueError,
                       tensorflow::strings::StrCat(
@@ -701,12 +714,12 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
-  if (!PyList_Check(tensor_list)) {
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
+  if (!PyList_Check(tensors) && !PyTuple_Check(tensors)) {
     PyErr_SetString(PyExc_TypeError,
                     tensorflow::strings::StrCat(
-                        "tensor_list argument must be a list. Got \"",
-                        Py_TYPE(tensor_list)->tp_name, "\"")
+                        "tensors argument must be a list or a tuple. Got \"",
+                        Py_TYPE(tensors)->tp_name, "\"")
                         .c_str());
     return nullptr;
   }
@@ -720,14 +733,14 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
     return nullptr;
   }
 
-  Py_ssize_t num_tensors = PyList_Size(tensor_list);
+  Py_ssize_t num_tensors = PySequence_Fast_GET_SIZE(tensors);
   int64_t num_tensors_int = static_cast<int64_t>(num_tensors);
   auto tensor = tensorflow::make_safe(TF_AllocateTensor(
       TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int));
   int32_t* data = reinterpret_cast<int32_t*>(TF_TensorData(tensor.get()));
   auto status = tensorflow::make_safe(TF_NewStatus());
   for (Py_ssize_t i = 0; i < num_tensors; ++i) {
-    PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i);
+    PyObject* tensor_obj = PySequence_Fast_GET_ITEM(tensors, i);
     if (!EagerTensor_CheckExact(tensor_obj)) {
       PyErr_SetString(PyExc_TypeError,
                       tensorflow::strings::StrCat(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index aa1efdd1b8..63ab1ed84d 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -22,4 +22,14 @@ limitations under the License.
 bool EagerTensor_CheckExact(const PyObject* o);
 tensorflow::int64 EagerTensor_id(const PyObject* tensor);
 
+namespace tensorflow {
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
+
+// TODO(nareshmodi): Move EagerCast and ReadVariableOp (which use the C API to
+// execute TFE Ops) to a separate common library.
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status);
+}
+
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 32d731d0f6..691b613e48 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -186,16 +186,16 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs,
 // Returns the set of variables watched by the given tape.
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
 
-// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
-// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
+// Returns an EagerTensor of dimension [len(`tensors`)] containing
+// the `slice_dim`'th dimension of each tensor in `tensors`. In other words,
 // TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in
-// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes
+// `tensors`. For example, if `tensors` contains tensors of with shapes
 // [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with
 // `slice_dim` equal to 1 will return [2, 5, 7].
 // On error, returns nullptr and sets python exception.
-// REQUIRES: `tensor_list` is a python list of EagerTensors
+// REQUIRES: `tensors` is a python list/tuple of EagerTensors
 // REQUIRES: `slice_dim` is non-negative and smaller than the rank of all
-//   tensors in `tensor_list`.
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim);
+//   tensors in `tensors`.
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index d99bd0b0ff..2bfa1f052c 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -38,6 +38,54 @@ using tensorflow::strings::Printf;
 
 namespace {
 
+struct InputInfo {
+  InputInfo(int i, bool is_list) : i(i), is_list(is_list) {}
+
+  int i;
+  bool is_list = false;
+};
+
+using AttrToInputsMap =
+    tensorflow::gtl::FlatMap<string,
+                             tensorflow::gtl::InlinedVector<InputInfo, 4>>;
+
+tensorflow::mutex all_attr_to_input_maps_lock(
+    tensorflow::LINKER_INITIALIZED);
+tensorflow::gtl::FlatMap<string, AttrToInputsMap*>* GetAllAttrToInputsMaps() {
+  static auto* all_attr_to_input_maps =
+      new tensorflow::gtl::FlatMap<string, AttrToInputsMap*>;
+  return all_attr_to_input_maps;
+}
+
+AttrToInputsMap* GetAttrToInputsMap(const tensorflow::OpDef& op_def) {
+  tensorflow::mutex_lock l(all_attr_to_input_maps_lock);
+  auto* all_attr_to_input_maps = GetAllAttrToInputsMaps();
+
+  auto* output =
+      tensorflow::gtl::FindPtrOrNull(*all_attr_to_input_maps, op_def.name());
+  if (output != nullptr) {
+    return output;
+  }
+
+  std::unique_ptr<AttrToInputsMap> m(new AttrToInputsMap);
+
+  // Store a list of InputIndex -> List of corresponding inputs.
+  for (int i = 0; i < op_def.input_arg_size(); i++) {
+    if (!op_def.input_arg(i).type_attr().empty()) {
+      auto it = m->find(op_def.input_arg(i).type_attr());
+      if (it == m->end()) {
+        it = m->insert({op_def.input_arg(i).type_attr(), {}}).first;
+      }
+      it->second.emplace_back(i, !op_def.input_arg(i).number_attr().empty());
+    }
+  }
+
+  auto* retval = m.get();
+  (*all_attr_to_input_maps)[op_def.name()] = m.release();
+
+  return retval;
+}
+
 struct FastPathOpExecInfo {
   TFE_Context* ctx;
   const char* device_name;
@@ -53,6 +101,14 @@ struct FastPathOpExecInfo {
   // The op type name of the main op being executed.
   PyObject* op_name;
   PyObject* callbacks;
+
+  // All the args passed into the FastPathOpExecInfo.
+  PyObject* args;
+
+  // DTypes can come from another input that has the same attr. So build that
+  // map.
+  const AttrToInputsMap* attr_to_inputs_map;
+  tensorflow::gtl::FlatMap<string, tensorflow::DataType> cached_dtypes;
 };
 
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)                       \
@@ -76,12 +132,29 @@ PARSE_VALUE(ParseIntValue, int, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+#if PY_MAJOR_VERSION < 3
+bool ParseInt64Value(const string& key, PyObject* py_value, TF_Status* status,
+                     int64_t* value) {
+  if (PyInt_Check(py_value)) {
+    *value = static_cast<int64_t>(PyInt_AsLong(py_value));
+    return true;
+  } else if (PyLong_Check(py_value)) {
+    *value = static_cast<int64_t>(PyLong_AsLong(py_value));
+    return true;
+  }
+  TF_SetStatus(
+      status, TF_INVALID_ARGUMENT,
+      tensorflow::strings::StrCat("Expecting int or long value for attr ", key,
+                                  ", got ", py_value->ob_type->tp_name)
+          .c_str());
+  return false;
+}
+#endif
+
 Py_ssize_t TensorShapeNumDims(PyObject* value) {
   const auto size = PySequence_Size(value);
   if (size == -1) {
@@ -234,7 +307,7 @@ bool SetOpAttrList(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -296,7 +369,7 @@ void SetOpAttrListDefault(
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     int num_values = attr.default_value().list().s_size();
-    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    std::unique_ptr<const char*[]> values(new const char*[num_values]);
     (*attr_list_sizes)[key] = num_values;
     for (int i = 0; i < num_values; i++) {
       values[i] = attr.default_value().list().s(i).data();
@@ -349,7 +422,7 @@ void SetOpAttrListDefault(
     std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
     // Copy the input dims into the buffer and set dims to point to
     // the start of each list's dims.
-    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
     std::unique_ptr<int[]> num_dims(new int[num_values]);
     int64_t* offset = buffer.get();
     for (int i = 0; i < num_values; ++i) {
@@ -369,7 +442,7 @@ void SetOpAttrListDefault(
   } else if (type == TF_ATTR_FUNC) {
     int num_values = attr.default_value().list().func_size();
     (*attr_list_sizes)[key] = num_values;
-    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
     for (int i = 0; i < num_values; i++) {
       funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
     }
@@ -1399,10 +1472,39 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
+PyObject* GetPythonObjectFromInt(int num) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_FromLong(num);
+#else
+  return PyInt_FromLong(num);
+#endif
+}
+
 bool CheckResourceVariable(PyObject* item) {
   return PyObject_TypeCheck(item, resource_variable_type);
 }
 
+bool IsNumberType(PyObject* item) {
+#if PY_MAJOR_VERSION >= 3
+  return PyFloat_Check(item) || PyLong_Check(item);
+#else
+  return PyFloat_Check(item) || PyInt_Check(item) || PyLong_Check(item);
+#endif
+}
+
+bool CheckOneInput(PyObject* item) {
+  if (EagerTensor_CheckExact(item) || CheckResourceVariable(item) ||
+      PyArray_Check(item) || IsNumberType(item)) {
+    return true;
+  }
+
+  // Sequences are not properly handled. Sequences with purely python numeric
+  // types work, but sequences with mixes of EagerTensors and python numeric
+  // types don't work.
+  // TODO(nareshmodi): fix
+  return false;
+}
+
 bool CheckInputsOk(PyObject* seq, int start_index,
                    const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
@@ -1419,8 +1521,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
       }
       for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
         PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
-        if (!EagerTensor_CheckExact(inner_item) &&
-            !CheckResourceVariable(inner_item)) {
+        if (!CheckOneInput(inner_item)) {
           VLOG(1)
               << "Falling back to slow path for Op \"" << op_def.name()
               << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
@@ -1430,7 +1531,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
           return false;
         }
       }
-    } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
+    } else if (!CheckOneInput(item)) {
       VLOG(1)
           << "Falling back to slow path for Op \"" << op_def.name()
           << "\", Input \"" << op_def.input_arg(i).name()
@@ -1443,6 +1544,52 @@ bool CheckInputsOk(PyObject* seq, int start_index,
   return true;
 }
 
+PyObject* MaybeGetDType(PyObject* item) {
+  if (EagerTensor_CheckExact(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  if (CheckResourceVariable(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "_dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  return nullptr;
+}
+
+PyObject* MaybeGetDTypeForAttr(const string& attr,
+                               FastPathOpExecInfo* op_exec_info) {
+  auto cached_it = op_exec_info->cached_dtypes.find(attr);
+  if (cached_it != op_exec_info->cached_dtypes.end()) {
+    return GetPythonObjectFromInt(cached_it->second);
+  }
+
+  auto it = op_exec_info->attr_to_inputs_map->find(attr);
+  if (it == op_exec_info->attr_to_inputs_map->end()) {
+    // No other inputs - this should never happen.
+    Py_RETURN_NONE;
+  }
+
+  for (const auto& input_info : it->second) {
+    PyObject* item = PyTuple_GET_ITEM(
+        op_exec_info->args, kFastPathExecuteInputStartIndex + input_info.i);
+    if (input_info.is_list) {
+      for (int i = 0; i < PySequence_Fast_GET_SIZE(item); i++) {
+        auto* dtype = MaybeGetDType(PySequence_Fast_GET_ITEM(item, i));
+        if (dtype != nullptr) return dtype;
+      }
+    } else {
+      auto* dtype = MaybeGetDType(item);
+      if (dtype != nullptr) return dtype;
+    }
+  }
+
+  Py_RETURN_NONE;
+}
+
 bool OpDoesntRequireOutput(const string& op_name) {
   static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
       new tensorflow::gtl::FlatSet<string>({
@@ -1668,23 +1815,80 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
 //  i) input is an EagerTensor
 //  ii) input is a ResourceVariable - in this case, the is_variable param is set
 //  to true.
-bool ConvertToTensor(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                     tensorflow::Safe_PyObjectPtr* output_handle,
-                     TF_Status* status) {
-  if (CheckResourceVariable(input)) {
+//
+//  NOTE: dtype_hint_getter must *always* return a PyObject that can be
+//  decref'd. So if no hint is found, Py_RETURN_NONE (which correctly
+//  increfs Py_None).
+bool ConvertToTensor(
+    const FastPathOpExecInfo& op_exec_info, PyObject* input,
+    tensorflow::Safe_PyObjectPtr* output_handle,
+    // This gets a hint for this particular input.
+    const std::function<PyObject*()>& dtype_hint_getter,
+    // This sets the dtype after conversion is complete.
+    const std::function<void(const TF_DataType& dtype)>& dtype_setter,
+    TF_Status* status) {
+  if (EagerTensor_CheckExact(input)) {
+    Py_INCREF(input);
+    output_handle->reset(input);
+    return true;
+  } else if (CheckResourceVariable(input)) {
     return ReadVariableOp(op_exec_info, input, output_handle, status);
   }
 
-  Py_INCREF(input);
-  output_handle->reset(input);
+  // The hint comes from a supposedly similarly typed tensor.
+  tensorflow::Safe_PyObjectPtr dtype_hint(dtype_hint_getter());
+  if (PyErr_Occurred()) {
+    return false;
+  }
+
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(input, dtype_hint.get())));
+  if (handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Unable to convert value to tensor");
+    return false;
+  }
+
+  int desired_dtype = -1;
+  if (dtype_hint.get() != Py_None) {
+    if (!ParseTypeValue("", dtype_hint.get(), status, &desired_dtype)) {
+      status->status = tensorflow::errors::InvalidArgument(
+          "Expecting a DataType value for dtype. Got ",
+          Py_TYPE(dtype_hint.get())->tp_name);
+    }
+  }
+
+  TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
+  if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
+    handle = tensorflow::make_safe(
+        tensorflow::EagerCast(op_exec_info.ctx, handle.get(), handle_dtype,
+                              static_cast<TF_DataType>(desired_dtype), status));
+    if (!status->status.ok()) return false;
+
+    handle_dtype = TFE_TensorHandleDataType(handle.get());
+  }
+
+  if (handle_dtype != TF_INT32) {
+    // Note that this is a shallow copy and will share the underlying buffer
+    // if copying to the same device.
+    handle = tensorflow::make_safe(TFE_TensorHandleCopyToDevice(
+        handle.get(), op_exec_info.ctx, op_exec_info.device_name, status));
+    if (!status->status.ok()) return false;
+  }
+
+  output_handle->reset(EagerTensorFromHandle(handle.release()));
+
+  dtype_setter(handle_dtype);
 
   return true;
 }
 
 // Adds input and type attr to the op, and to the list of flattened
 // inputs/attrs.
-bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                  const tensorflow::OpDef::ArgDef* input_arg,
+bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
+                  const bool add_type_attr,
+                  const tensorflow::OpDef::ArgDef& input_arg,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_attrs,
                   std::vector<tensorflow::Safe_PyObjectPtr>* flattened_inputs,
                   TFE_Op* op, TF_Status* status) {
@@ -1693,18 +1897,30 @@ bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
   // out of scope in this function.
   tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
 
-  if (!ConvertToTensor(op_exec_info, input, &py_eager_tensor, status)) {
+  if (!ConvertToTensor(
+          *op_exec_info, input, &py_eager_tensor,
+          [&]() {
+            if (input_arg.type() != tensorflow::DataType::DT_INVALID) {
+              return GetPythonObjectFromInt(input_arg.type());
+            }
+            return MaybeGetDTypeForAttr(input_arg.type_attr(), op_exec_info);
+          },
+          [&](const TF_DataType dtype) {
+            op_exec_info->cached_dtypes[input_arg.type_attr()] =
+                static_cast<tensorflow::DataType>(dtype);
+          },
+          status)) {
     return false;
   }
 
   TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
 
-  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+  if (add_type_attr && !input_arg.type_attr().empty()) {
     auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    TFE_OpSetAttrType(op, input_arg.type_attr().data(), dtype);
     if (flattened_attrs != nullptr) {
       flattened_attrs->emplace_back(
-          GetPythonObjectFromString(input_arg->type_attr().data()));
+          GetPythonObjectFromString(input_arg.type_attr().data()));
       flattened_attrs->emplace_back(PyLong_FromLong(dtype));
     }
   }
@@ -1844,6 +2060,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
   op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+  op_exec_info.args = args;
 
   if (op_exec_info.ctx == nullptr) {
     // The context hasn't been initialized. It will be in the slow path.
@@ -1892,6 +2109,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
+  op_exec_info.attr_to_inputs_map = GetAttrToInputsMap(*op_def);
+
   TF_Status* status = TF_NewStatus();
   TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
@@ -1986,17 +2205,16 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
-                          &input_arg, flattened_attrs.get(),
+        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          true, input_arg, flattened_attrs.get(),
                           flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
-                            nullptr /* input_arg */,
-                            nullptr /* flattened_attrs */,
+          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+                            false, input_arg, nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
           }
@@ -2018,7 +2236,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             status)) {
+                             []() { Py_RETURN_NONE; },
+                             [](const TF_DataType& dtype) {}, status)) {
           return nullptr;
         }
 
@@ -2048,8 +2267,9 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
-      if (!AddInputToOp(op_exec_info, input, &input_arg, flattened_attrs.get(),
-                        flattened_inputs.get(), op, status)) {
+      if (!AddInputToOp(&op_exec_info, input, true, input_arg,
+                        flattened_attrs.get(), flattened_inputs.get(), op,
+                        status)) {
         return nullptr;
       }
     }
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0bd5a5dbaf..b044b30231 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -278,14 +278,9 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
     with self.assertRaisesRegexp(
         TypeError,
-        r"tensor_list argument must be a list. Got \"EagerTensor\""):
+        r"tensors argument must be a list or a tuple. Got \"EagerTensor\""):
       pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2)
 
-    with self.assertRaisesRegexp(
-        TypeError,
-        r"tensor_list argument must be a list. Got \"tuple\""):
-      pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2)
-
   def testNegativeSliceDim(self):
     t1 = _create_tensor([1, 2], dtype=dtypes.int32)
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 662cda2a7d..8cd6820f6a 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1385,6 +1385,22 @@ def register_tensor_conversion_function(base_type,
     if not callable(conversion_func):
       raise TypeError("conversion_func must be callable.")
 
+    # context._context is checked so that we don't inadvertently create it.
+    # This is because enable_eager_execution will fail when called from the main
+    # function if the context._context is already created, and the
+    # register_tensor_conversion_function calls happen when the module is
+    # imported.
+    if context._context is not None and context.executing_eagerly(
+    ) and isinstance(base_type, six.integer_types + (
+        float,
+        np.ndarray,
+    )):
+      # TODO(nareshmodi): consider setting a context variable which disables the
+      # fastpath instead.
+      raise TypeError(
+          "Cannot register conversions for numpy arrays, python number types "
+          "when executing eagerly.")
+
     try:
       funcs_at_priority = _tensor_conversion_func_registry[priority]
     except KeyError:
-- 
GitLab


From 76ea66f24d4370e6e7848b83fc0b571ba7edfa2d Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 20 Apr 2018 11:34:55 -0700
Subject: [PATCH 1213/1262] Move the guts of TFE_Op into EagerOperation

PiperOrigin-RevId: 193698320
---
 tensorflow/c/eager/BUILD                      |   2 +
 tensorflow/c/eager/c_api.cc                   | 230 +++++++++---------
 tensorflow/c/eager/c_api_internal.h           |  16 +-
 tensorflow/core/common_runtime/eager/BUILD    |  16 ++
 .../common_runtime/eager/eager_operation.cc   |  33 +++
 .../common_runtime/eager/eager_operation.h    |  74 ++++++
 6 files changed, 242 insertions(+), 129 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/eager/eager_operation.cc
 create mode 100644 tensorflow/core/common_runtime/eager/eager_operation.h

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 3e14c10727..d66386acbd 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -51,6 +51,7 @@ tf_cuda_library(
         ],
         "//conditions:default": [],
     }) + [
+        "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -73,6 +74,7 @@ tf_cuda_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/common_runtime/eager:eager_executor",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core/common_runtime/eager:kernel_and_device",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
     ],
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 369342b142..b7a3097208 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -241,21 +241,18 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
 void TFE_DeleteOp(TFE_Op* op) { delete op; }
 
 void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
-  tensorflow::Device* d = nullptr;
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status = op->ctx->context.FindDeviceByName(device_name, &d);
-  }
-  op->device = d;
+  status->status = op->operation.SetDevice(device_name);
 }
 
 const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
-  tensorflow::Device* device =
-      (op->device == nullptr) ? op->ctx->context.HostCPU() : op->device;
+  tensorflow::Device* device = (op->operation.Device() == nullptr)
+                                   ? op->operation.EagerContext()->HostCPU()
+                                   : op->operation.Device();
   return device->name().c_str();
 }
 
 void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
-  op->use_xla = enable;
+  op->operation.SetUseXla(enable);
 #ifndef TENSORFLOW_EAGER_USE_XLA
   LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not "
                   "built with XLA support.";
@@ -263,22 +260,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  h->handle->Ref();
-  op->inputs.push_back(h->handle);
-  op->attrs.NumInputs(op->inputs.size());
+  op->operation.AddInput(h->handle);
 }
 
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
                               unsigned char* is_list, TF_Status* status) {
   TF_AttrType ret;
-  if (op->is_function()) {
+  if (op->operation.is_function()) {
     status->status = tensorflow::errors::Unimplemented(
         "TODO(apassos): Support for attributes for TensorFlow functions is not "
         "ready yet.");
     return TF_ATTR_INT;  // The compiler requires that we return something.
   }
-  status->status =
-      tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list);
+  status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
+                                              attr_name, &ret, is_list);
   return ret;
 }
 
@@ -297,23 +292,24 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx,
 }
 
 void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) {
-  op->attrs.Set(attr_name, value);
+  op->operation.MutableAttrs()->Set(attr_name, value);
 }
 
 void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) {
-  op->attrs.Set(attr_name, static_cast<int64>(value));
+  op->operation.MutableAttrs()->Set(attr_name, static_cast<int64>(value));
 }
 
 void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) {
-  op->attrs.Set(attr_name, value);
+  op->operation.MutableAttrs()->Set(attr_name, value);
 }
 
 void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) {
-  op->attrs.Set(attr_name, (value == 0) ? false : true);
+  op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true);
 }
 
 void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) {
-  op->attrs.Set(attr_name, static_cast<tensorflow::DataType>(value));
+  op->operation.MutableAttrs()->Set(attr_name,
+                                    static_cast<tensorflow::DataType>(value));
 }
 
 void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims,
@@ -335,23 +331,24 @@ void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims,
       proto.add_dim()->set_size(dims[d]);
     }
   }
-  op->attrs.Set(attr_name, proto);
+  op->operation.MutableAttrs()->Set(attr_name, proto);
 }
 
 void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name,
                            const TFE_Op* value) {
   tensorflow::AttrValue attr_value;
   tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(value->name);
-  value->attrs.FillAttrValueMap(func->mutable_attr());
-  op->attrs.Set(attr_name, attr_value);
+  func->set_name(value->operation.Name());
+  value->operation.Attrs().FillAttrValueMap(func->mutable_attr());
+  op->operation.MutableAttrs()->Set(attr_name, attr_value);
 }
 
 #define TFE_OP_SET_ATTR_LIST(fn, type)                                \
   void fn(TFE_Op* op, const char* attr_name, const type* values,      \
           int num_values) {                                           \
-    op->attrs.Set(attr_name, tensorflow::gtl::ArraySlice<const type>( \
-                                 values, num_values));                \
+    op->operation.MutableAttrs()->Set(                                \
+        attr_name,                                                    \
+        tensorflow::gtl::ArraySlice<const type>(values, num_values)); \
   }
 TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*)
 TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float)
@@ -359,14 +356,14 @@ TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float)
 
 void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name,
                           const int64_t* values, int num_values) {
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const int64>(
-                    reinterpret_cast<const int64*>(values), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const int64>(
+                     reinterpret_cast<const int64*>(values), num_values));
 }
 
 void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name,
                            const TF_DataType* values, int num_values) {
-  op->attrs.Set(
+  op->operation.MutableAttrs()->Set(
       attr_name,
       tensorflow::gtl::ArraySlice<const tensorflow::DataType>(
           reinterpret_cast<const tensorflow::DataType*>(values), num_values));
@@ -378,8 +375,8 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name,
   for (int i = 0; i < num_values; ++i) {
     b[i] = values[i];
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const bool>(b.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const bool>(b.get(), num_values));
 }
 
 void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
@@ -409,9 +406,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
       }
     }
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<tensorflow::TensorShapeProto>(
-                    proto.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<tensorflow::TensorShapeProto>(
+                     proto.get(), num_values));
 }
 
 void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
@@ -419,12 +416,12 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
   std::unique_ptr<tensorflow::NameAttrList[]> funcs(
       new tensorflow::NameAttrList[num_values]);
   for (int i = 0; i < num_values; i++) {
-    funcs[i].set_name(value[i]->name);
-    value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr());
+    funcs[i].set_name(value[i]->operation.Name());
+    value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr());
   }
-  op->attrs.Set(attr_name,
-                tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
-                    funcs.get(), num_values));
+  op->operation.MutableAttrs()->Set(
+      attr_name, tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
+                     funcs.get(), num_values));
 }
 }  // extern "C"
 
@@ -460,18 +457,19 @@ int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
 }
 
 tensorflow::Status ValidateInputTypeAndPlacement(
-    tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op,
-    const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) {
+    tensorflow::EagerContext* ctx, tensorflow::Device* op_device,
+    tensorflow::EagerOperation* op, const tensorflow::OpKernel* kernel,
+    tensorflow::RunMetadata* run_metadata) {
   tensorflow::Device* host_device = ctx->HostCPU();
   const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->inputs.size()) {
+  if (memtypes.size() != op->Inputs().size()) {
     return tensorflow::errors::InvalidArgument(
-        "expected ", memtypes.size(), " inputs, got ", op->inputs.size());
+        "expected ", memtypes.size(), " inputs, got ", op->Inputs().size());
   }
-  for (int i = 0; i < op->inputs.size(); ++i) {
+  for (int i = 0; i < op->Inputs().size(); ++i) {
     const tensorflow::Device* expected_device =
         memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device;
-    tensorflow::TensorHandle* handle = op->inputs[i];
+    tensorflow::TensorHandle* handle = op->Inputs()[i];
     tensorflow::Device* handle_device = nullptr;
     TF_RETURN_IF_ERROR(handle->Device(&handle_device));
     const tensorflow::Device* actual_device =
@@ -491,7 +489,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
           return tensorflow::errors::InvalidArgument(
               "Tensors on conflicting devices:"
               " cannot compute ",
-              op->name, " as input #", i, " was expected to be on ",
+              op->Name(), " as input #", i, " was expected to be on ",
               expected_device->name(), " but is actually on ",
               actual_device->name(), " (operation running on ",
               op_device->name(), ")",
@@ -502,7 +500,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
               "between devices"
               " may slow down your model");
         case tensorflow::DEVICE_PLACEMENT_WARN:
-          LOG(WARNING) << "before computing " << op->name << " input #" << i
+          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
                        << " was expected to be on " << expected_device->name()
                        << " but is actually on " << actual_device->name()
                        << " (operation running on " << op_device->name()
@@ -534,16 +532,16 @@ tensorflow::Status ValidateInputTypeAndPlacement(
         if (copied_tensor != nullptr) copied_tensor->Unref();
         return tensorflow::errors::Internal(
             "Failed copying input tensor from ", actual_device->name(), " to ",
-            expected_device->name(), " in order to run ", op->name, ": ",
+            expected_device->name(), " in order to run ", op->Name(), ": ",
             status.error_message());
       }
       handle->Unref();
       handle = copied_tensor;
-      op->inputs[i] = copied_tensor;
+      (*op->MutableInputs())[i] = copied_tensor;
     }
     if (handle->dtype != kernel->input_type(i)) {
       return tensorflow::errors::InvalidArgument(
-          "cannot compute ", op->name, " as input #", i,
+          "cannot compute ", op->Name(), " as input #", i,
           " was expected to be a ",
           tensorflow::DataTypeString(kernel->input_type(i)),
           " tensor but is a ", tensorflow::DataTypeString(handle->dtype),
@@ -554,9 +552,10 @@ tensorflow::Status ValidateInputTypeAndPlacement(
 }
 
 tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
-                                 TFE_Context* ctx, TF_Status* status) {
+                                 tensorflow::EagerContext* ctx,
+                                 TF_Status* status) {
   tensorflow::DeviceSet ds;
-  for (tensorflow::Device* d : *ctx->context.devices()) {
+  for (tensorflow::Device* d : *ctx->devices()) {
     ds.AddDevice(d);
   }
   tensorflow::DeviceTypeVector final_devices;
@@ -570,7 +569,7 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
         "Could not find valid device for node ", ndef.DebugString());
     return nullptr;
   }
-  for (tensorflow::Device* d : *ctx->context.devices()) {
+  for (tensorflow::Device* d : *ctx->devices()) {
     if (d->device_type() == final_devices[0].type_string()) {
       return d;
     }
@@ -599,15 +598,16 @@ const tensorflow::FunctionDef* OpToFunction(
     std::vector<TF_DataType>* arg_input_types,
     tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
     TF_Status* status) {
-  DCHECK(!op->is_function());
+  DCHECK(!op->operation.is_function());
 
   tensorflow::FunctionDef fdef;
 
   // Get the OpDef of the op we are trying to encapsulate.
-  TFE_Context* ctx = op->ctx;
+  TFE_Context* ctx = op->operation.ctx;
   const tensorflow::OpRegistrationData* op_data;
   {
-    status->status = ctx->context.FindFunctionOpData(op->name, &op_data);
+    status->status =
+        ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
     if (!status->status.ok()) {
       return nullptr;
     }
@@ -618,7 +618,8 @@ const tensorflow::FunctionDef* OpToFunction(
 
   // Handle constant inputs.
   const std::unordered_set<string> const_inputs(
-      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name));
+      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
+          op->operation.Name()));
 
   // First add place holders for the input args, so that we can refer to them by
   // position in the next loop. Also tally up the resource inputs.
@@ -644,7 +645,7 @@ const tensorflow::FunctionDef* OpToFunction(
       (*op_input_to_func_input)[i] = const_index;
       func_input_arg = signature->mutable_input_arg(const_index++);
       const_input_types->push_back(
-          static_cast<TF_DataType>(op->inputs[i]->dtype));
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
     } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
       VLOG(1) << "For resource input, mapping op input " << i
               << " to func input " << resource_index;
@@ -656,11 +657,11 @@ const tensorflow::FunctionDef* OpToFunction(
       (*op_input_to_func_input)[i] = arg_index;
       func_input_arg = signature->mutable_input_arg(arg_index++);
       arg_input_types->push_back(
-          static_cast<TF_DataType>(op->inputs[i]->dtype));
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
     }
 
     func_input_arg->set_name(op_input_arg.name());
-    func_input_arg->set_type(op->inputs[i]->dtype);
+    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
   }
   VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
 
@@ -673,7 +674,8 @@ const tensorflow::FunctionDef* OpToFunction(
       op_def.name(), func_id_generator.fetch_add(1)));
 
   // Add the node def and set its input names to match op_def's names.
-  const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
+  const tensorflow::NodeDef& ndef =
+      op->operation.MutableAttrs()->BuildNodeDef();
   DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
   *fdef.add_node_def() = ndef;
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
@@ -713,17 +715,18 @@ const tensorflow::FunctionDef* OpToFunction(
 // Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
 // via XLA.
 std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name;
-  auto launch_op =
-      std::unique_ptr<TFE_Op>(TFE_NewOp(op->ctx, "_XlaLaunch", status));
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  auto launch_op = std::unique_ptr<TFE_Op>(
+      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
   if (TF_GetCode(status) != TF_OK) return nullptr;
-  if (op->device) {
-    TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status);
+  if (op->operation.device) {
+    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
+                    status);
     if (TF_GetCode(status) != TF_OK) return nullptr;
   }
 
   const tensorflow::FunctionDef* fdef;
-  { fdef = op->ctx->context.FindFunctionDef(op->name); }
+  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
   std::vector<TF_DataType> const_input_types;
   std::vector<TF_DataType> arg_input_types;
   tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
@@ -748,20 +751,21 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
   // Copy inputs and their devices.
   // Since input param reordering may have occurred between `op` and `launch_op`
   // via `op_input_to_func_input`, adjust the actual inputs accordingly.
-  launch_op->inputs = op->inputs;
-  for (tensorflow::TensorHandle* h : launch_op->inputs) {
+  *launch_op->operation.MutableInputs() = op->operation.Inputs();
+  for (tensorflow::TensorHandle* h : launch_op->operation.Inputs()) {
     h->Ref();
   }
   if (!op_input_to_func_input.empty()) {
-    DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size());
+    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
     for (int i = 0; i < op_input_to_func_input.size(); ++i) {
       VLOG(1) << "mapping op input " << i << " to func input "
               << op_input_to_func_input[i];
 
-      launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i];
+      (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] =
+          op->operation.Inputs()[i];
     }
   }
-  launch_op->attrs.NumInputs(op->inputs.size());
+  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
 
   TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
                         const_input_types.size());
@@ -796,16 +800,17 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
 
 extern "C" {
 
-void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
+void TFE_Execute(TFE_Op* tfe_op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
-  TFE_Context* ctx = op->ctx;
-  status->status = ctx->context.GetStatus();
+  tensorflow::EagerOperation* op = &tfe_op->operation;
+  tensorflow::EagerContext* ctx = op->EagerContext();
+  status->status = ctx->GetStatus();
   if (!status->status.ok()) {
     return;
   }
 #ifdef TENSORFLOW_EAGER_USE_XLA
   std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->use_xla && op->name != "_XlaLaunch") {
+  if (op->UseXla() && op->Name() != "_XlaLaunch") {
     xla_launch_op = BuildXlaLaunch(op, status);
     if (!status->status.ok()) {
       return;
@@ -816,31 +821,31 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
   // Ensure all resource-touching ops run in the device the resource is,
   // regardless of anything else that has been specified. This is identical to
   // the graph mode behavior.
-  for (int i = 0; i < op->inputs.size(); ++i) {
+  for (int i = 0; i < op->Inputs().size(); ++i) {
     tensorflow::Device* input_op_device = nullptr;
-    status->status = op->inputs[i]->OpDevice(&input_op_device);
+    status->status = op->Inputs()[i]->OpDevice(&input_op_device);
     if (!status->status.ok()) return;
-    VLOG(2) << "for op " << op->name << " input " << i << " "
-            << tensorflow::DataTypeString(op->inputs[i]->dtype) << " "
+    VLOG(2) << "for op " << op->Name() << " input " << i << " "
+            << tensorflow::DataTypeString(op->Inputs()[i]->dtype) << " "
             << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->device == nullptr ? "cpu" : op->device->name());
-    if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE &&
-        (input_op_device != op->device || input_op_device == nullptr)) {
+            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
+    if (op->Inputs()[i]->dtype == tensorflow::DT_RESOURCE &&
+        (input_op_device != op->Device() || input_op_device == nullptr)) {
       tensorflow::Device* d =
-          input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device;
-      VLOG(1) << "Changing device of operation " << op->name << " to "
+          input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
+      VLOG(1) << "Changing device of operation " << op->Name() << " to "
               << d->name() << " because input #" << i
               << " is a resource in this device.";
-      op->device = d;
+      op->SetDevice(d);
     }
   }
-  tensorflow::Device* device = op->device;
+  tensorflow::Device* device = op->Device();
 
-  tensorflow::Fprint128 cache_key =
-      op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name());
-  tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key);
+  tensorflow::Fprint128 cache_key = op->MutableAttrs()->CacheKey(
+      device == nullptr ? "unspecified" : device->name());
+  tensorflow::KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
   if (kernel == nullptr) {
-    const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
+    const tensorflow::NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
     if (device == nullptr) {
       device = SelectDevice(ndef, ctx, status);
       if (!status->status.ok()) {
@@ -848,19 +853,19 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
       }
     }
     CHECK(device != nullptr);
-    if (ctx->context.LogDevicePlacement()) {
+    if (ctx->LogDevicePlacement()) {
       LOG(INFO) << "Executing op " << ndef.op() << " in device "
                 << device->name();
     }
-    kernel = new tensorflow::KernelAndDevice(ctx->context.GetRendezvous());
+    kernel = new tensorflow::KernelAndDevice(ctx->GetRendezvous());
     // Knowledge of the implementation of Init (and in-turn
     // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
     // will be accessed, so grab on to the lock.
     // See WARNING comment in Execute (before kernel->Run) - would be nice to
     // rework to avoid this subtlety.
-    tensorflow::tf_shared_lock l(*ctx->context.FunctionsMu());
-    status->status = tensorflow::KernelAndDevice::Init(
-        ndef, ctx->context.func_lib(device), kernel);
+    tensorflow::tf_shared_lock l(*ctx->FunctionsMu());
+    status->status =
+        tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
     if (!status->status.ok()) {
       delete kernel;
       return;
@@ -868,7 +873,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     // Update output_dtypes inside `kernel`.
     const tensorflow::OpDef* op_def = nullptr;
     const tensorflow::FunctionDef* function_def =
-        ctx->context.FuncLibDef()->Find(ndef.op());
+        ctx->FuncLibDef()->Find(ndef.op());
     if (function_def != nullptr) {
       op_def = &(function_def->signature());
     }
@@ -884,7 +889,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     if (!status->status.ok()) {
       return;
     }
-    ctx->context.AddKernelToCache(cache_key, kernel);
+    ctx->AddKernelToCache(cache_key, kernel);
   }
   const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes();
   const int output_dtypes_size = output_dtypes.size();
@@ -903,43 +908,42 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     device = kernel->device();
   }
   status->status = ValidateInputTypeAndPlacement(
-      &ctx->context, device, op, kernel->kernel(),
-      ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto()
-                                         : nullptr);
+      ctx, device, op, kernel->kernel(),
+      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
   if (!status->status.ok()) return;
   std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->context.ShouldStoreMetadata()) {
+  if (ctx->ShouldStoreMetadata()) {
     maybe_stats.reset(new tensorflow::NodeExecStats);
-    maybe_stats->set_node_name(op->name);
+    maybe_stats->set_node_name(op->Name());
     maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
     maybe_stats->set_op_start_rel_micros(0);
     maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
     // TODO(apassos) track referenced tensors
   }
-  if (ctx->context.Async()) {
+  if (ctx->Async()) {
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
         *num_retvals);
-    tensorflow::uint64 id = op->ctx->context.NextId();
+    tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
       tensorflow::TensorHandle* h =
-          new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context);
+          new tensorflow::TensorHandle(id, output_dtypes[i], ctx);
       retvals[i] = new TFE_TensorHandle(h);
       handle_retvals[i] = h;
     }
     tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
-        id, &op->ctx->context, op->device, op->inputs, kernel,
-        maybe_stats.release(), output_dtypes, handle_retvals);
-    ctx->context.ExecutorAdd(node);
+        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
+        output_dtypes, handle_retvals);
+    ctx->ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
         *num_retvals);
     status->status = tensorflow::EagerExecute(
-        &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(),
+        ctx, op->Device(), op->Inputs(), kernel, maybe_stats.get(),
         handle_retvals.data(), *num_retvals);
     for (int i = 0; i < *num_retvals; ++i) {
       retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
@@ -1142,9 +1146,3 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
   }
 }
 }  // namespace tensorflow
-
-TFE_Op::~TFE_Op() {
-  for (tensorflow::TensorHandle* h : inputs) {
-    h->Unref();
-  }
-}
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 05dc64f521..49e1aab1ce 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -45,7 +46,6 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"
 
-
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
   // true if async execution is enabled.
@@ -85,19 +85,9 @@ struct TFE_Op {
   // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
   // primitive operation.
   TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {}
-
-  ~TFE_Op();
-
-  bool const is_function() const { return attr_types == nullptr; }
+      : operation(&ctx->context, op, t) {}
 
-  TFE_Context* ctx;  // Must outlive the TFE_Op.
-  const tensorflow::string name;
-  tensorflow::AttrBuilder attrs;
-  const tensorflow::AttrTypeMap* attr_types;
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs;
-  tensorflow::Device* device;
-  bool use_xla = false;
+  tensorflow::EagerOperation operation;
 };
 
 namespace tensorflow {
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 941a0e61c7..00ac4a4e47 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -54,6 +54,22 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "eager_operation",
+    srcs = [
+        "eager_operation.cc",
+    ],
+    hdrs = [
+        "eager_operation.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":tensor_handle",
+        "//tensorflow/c/eager:runtime",
+    ],
+)
+
 tf_cuda_library(
     name = "tensor_handle",
     srcs = [
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
new file mode 100644
index 0000000000..381b05ada8
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+
+namespace tensorflow {
+tensorflow::Status EagerOperation::SetDevice(const char* device) {
+  auto status = Status::OK();
+  tensorflow::Device* d = nullptr;
+  if (device != nullptr && strlen(device) > 0) {
+    status.Update(ctx_->FindDeviceByName(device, &d));
+  }
+  device_ = d;
+  return status;
+}
+
+void EagerOperation::AddInput(tensorflow::TensorHandle* h) {
+  h->Ref();
+  inputs_.push_back(h);
+  attrs_.NumInputs(static_cast<int>(inputs_.size()));
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
new file mode 100644
index 0000000000..6b6e53da87
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+
+#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+
+namespace tensorflow {
+class EagerOperation {
+ public:
+  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
+  // instead of a primitive operation.
+  EagerOperation(tensorflow::EagerContext* ctx, const char* op,
+                 const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+
+  ~EagerOperation() {
+    for (tensorflow::TensorHandle* h : inputs_) {
+      h->Unref();
+    }
+  }
+
+  bool is_function() const { return attr_types_ == nullptr; }
+
+  tensorflow::EagerContext* EagerContext() { return ctx_; }
+
+  tensorflow::AttrBuilder* MutableAttrs() { return &attrs_; }
+  const tensorflow::AttrBuilder& Attrs() const { return attrs_; }
+
+  const tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>& Inputs()
+      const {
+    return inputs_;
+  }
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>*
+  MutableInputs() {
+    return &inputs_;
+  }
+  void AddInput(tensorflow::TensorHandle* h);
+
+  const tensorflow::string& Name() const { return name_; }
+  const tensorflow::AttrTypeMap* AttrTypes() const { return attr_types_; }
+
+  tensorflow::Device* Device() const { return device_; }
+  tensorflow::Status SetDevice(const char* device);
+  void SetDevice(tensorflow::Device* device) { device_ = device; }
+
+  void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
+
+ private:
+  tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
+  const tensorflow::string name_;
+  tensorflow::AttrBuilder attrs_;
+  const tensorflow::AttrTypeMap* attr_types_;
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
+  tensorflow::Device* device_;
+  bool use_xla_ = false;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
-- 
GitLab


From 2b0b015ebb1c33a409836bd1c9c98124dfd841ec Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 11:43:48 -0700
Subject: [PATCH 1214/1262] [XLA] Fix a bug in ToProto: don't add gather
 attributes twice.

PiperOrigin-RevId: 193699745
---
 tensorflow/compiler/xla/service/hlo_instruction.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a638d54d85..a714d0e114 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -2451,12 +2451,6 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
-  if (gather_dimension_numbers_ != nullptr) {
-    *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_;
-  }
-  for (int64 bound : gather_window_bounds_) {
-    proto.add_gather_window_bounds(bound);
-  }
   proto.set_channel_name(channel_name_);
   proto.set_cost_estimate_ns(cost_estimate_ns_);
 
-- 
GitLab


From 0074dffd076e0faf4da5913aebfa594ef925d6c7 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 20 Apr 2018 12:01:21 -0700
Subject: [PATCH 1215/1262] Prefix compat import with underscore in
 meta_graph_transform.py so that it doesn't get exported as part of API:
 https://www.tensorflow.org/versions/r1.8/api_docs/python/tf/contrib/meta_graph_transform/meta_graph_transform

PiperOrigin-RevId: 193702570
---
 .../meta_graph_transform/meta_graph_transform.py       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index ff88b4fa84..4090c1ff3e 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import importer as _importer
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.saved_model import constants as _saved_model_constants
 from tensorflow.python.training import saver as _saver_lib
-from tensorflow.python.util import compat
+from tensorflow.python.util import compat as _compat
 from tensorflow.tools import graph_transforms as _graph_transforms
 
 
@@ -161,7 +161,7 @@ def _clean_save_and_restore(graph_def, op, removed_op_names):
   shapes = []
   dtypes = []
   for index, value in enumerate(name_op_value_tensor.string_val):
-    if not _is_removed(compat.as_str(value), removed_op_names):
+    if not _is_removed(_compat.as_str(value), removed_op_names):
       names.append(value)
       shapes.append(shape_op_value_tensor.string_val[index])
       dtypes.append(op.attr['dtypes'].list.type[index])
@@ -651,7 +651,7 @@ def _is_removed_mentioned(s, removed_op_names):
   # /foo/bar. This regex ensures that we handle these two nodes
   # as separate entities.  It matches on nodes having names in the form of
   # '/foo/bar_x' as well as nodes having names in the form of 'foo.'
-  s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', compat.as_str_any(s))
+  s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', _compat.as_str_any(s))
   for removed_op_name in removed_op_names:
     for s_name in s_names:
       if s_name.endswith(removed_op_name):
@@ -737,9 +737,9 @@ def meta_graph_transform(
   for tag in tags:
     meta_graph_def.meta_info_def.tags.append(tag)
 
-  base_op_names = [compat.as_str(node.name)
+  base_op_names = [_compat.as_str(node.name)
                    for node in base_meta_graph_def.graph_def.node]
-  retained_op_names = [compat.as_str(node.name)
+  retained_op_names = [_compat.as_str(node.name)
                        for node in meta_graph_def.graph_def.node]
   removed_op_names = set(base_op_names) - set(retained_op_names)
 
-- 
GitLab


From 1b5839e6acad5d360ea9e5b94226b30047924cb9 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Fri, 20 Apr 2018 12:02:56 -0700
Subject: [PATCH 1216/1262] [TF:XLA] Now that the compiler no longer introduces
 implicit broadcasts, forbid them in the HLO verifier.

PiperOrigin-RevId: 193702874
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_verifier.cc      | 21 ++++++++
 .../compiler/xla/service/hlo_verifier.h       |  4 ++
 .../xla/service/reshape_mover_test.cc         | 51 -------------------
 4 files changed, 26 insertions(+), 51 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9009cbf845..9555d91817 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2032,6 +2032,7 @@ cc_library(
     srcs = ["hlo_verifier.cc"],
     hdrs = ["hlo_verifier.h"],
     deps = [
+        ":hlo",
         ":hlo_pass",
         ":shape_inference",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 80ed6d6832..8a30cbf9cd 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <set>
 
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -780,6 +781,24 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) {
   return tensorflow::Status::OK();
 }
 
+Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
+  const Shape& out_shape = instruction->shape();
+  for (HloInstruction* operand : instruction->operands()) {
+    const Shape& operand_shape = operand->shape();
+    if (!ShapeUtil::IsScalar(operand_shape) &&
+        !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) {
+      return FailedPrecondition(
+          "Implicit broadcast is not allowed in HLO."
+          "Found non-compatible shapes for instruction %s.\n"
+          "output: %s\noperand: %s\n",
+          HloOpcodeString(instruction->opcode()).c_str(),
+          ShapeUtil::HumanString(out_shape).c_str(),
+          ShapeUtil::HumanString(operand_shape).c_str());
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
 
@@ -821,6 +840,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
             << " != " << ShapeUtil::Rank(instruction->operand(0)->shape());
       } else if (instruction->opcode() == HloOpcode::kWhile) {
         TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction));
+      } else if (instruction->IsElementwise()) {
+        TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction));
       }
 
       auto previous = instructions.find(instruction->name());
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1ec55a9bdc..6208887547 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -146,6 +146,10 @@ class HloVerifier : public HloPassInterface {
 
   Status CheckWhileInstruction(HloInstruction* instruction);
 
+  // Checks that the non-scalar operand shapes are compatible to the output
+  // shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+  Status CheckElementwiseInstruction(HloInstruction* instruction);
+
   // Creates a ShapeVerifier that checks that shapes match inferred
   // expectations. This is a factory function because ShapeVerifier,
   // being a DfsHloVisitor, is stateful. We want a clean object
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 094f7319f4..13e2d3258e 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -458,57 +458,6 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
   EXPECT_EQ(select, computation->root_instruction());
 }
 
-// Tree looks like:
-//
-// param0 [1,128,1]
-//  |
-// reshape [128,1]          constant [128,1024]
-//   \                         /
-//     multiply w/implicit broadcast [128,1024]
-//
-// The reshape mover would like to sink the reshape below the multiply.
-//
-// Previously we would attempt to insert a reshape of the constant to [1,128,1]
-// (which is unsound, because it has a different number of elements) as
-// preparation for sinking the reshape.
-//
-// To eliminate the unsoundness, we outlaw reshape sinking when one of the
-// operands is implicitly broadcast in the elementwise consumer.
-//
-// TODO(b/37799338) However, it would be possible in this case to do a more
-// in-depth analysis to get reshape movement to occur:
-//
-// 1. Note that the broadcast dimension (logical dimension 1) in the operands
-//    would map back to logical dimension 2 in the param0 node.
-// 2. Match rank of the constant to the param0 node (by prepending a trivial 1
-//    dimension).
-// 3. Reshape to [128,1024] at the root.
-//
-// But this is not currently done.
-TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
-  HloComputation::Builder builder(TestName());
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0"));
-  auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
-      ShapeUtil::MakeShape(F32, {128, 1}), param0));
-  Array2D<float> a(128, 1024);
-  auto literal = Literal::CreateR2FromArray2D<float>(a);
-  auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(std::move(literal)));
-  auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
-      constant->shape(), HloOpcode::kMultiply, constant, reshape));
-
-  auto computation = module().AddEntryComputation(builder.Build());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Constant(), op::Reshape(param0)));
-
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
-
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Constant(), op::Reshape(param0)));
-  EXPECT_EQ(multiply, computation->root_instruction());
-}
-
 // Tree looks like this:
 //
 // add1
-- 
GitLab


From ceed923d600584ade8d159271422b4a08f728cbb Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yangzihao@google.com>
Date: Fri, 20 Apr 2018 12:05:11 -0700
Subject: [PATCH 1217/1262] Add native dilated support for conv3d and its
 gradients in cudnn v>=6.

PiperOrigin-RevId: 193703316
---
 tensorflow/core/framework/common_shape_fns.cc |  32 ++-
 .../core/framework/common_shape_fns_test.cc   |  55 ++++-
 tensorflow/core/kernels/conv_grad_ops_3d.cc   | 115 +++++++++-
 tensorflow/core/kernels/conv_ops_3d.cc        |  52 ++++-
 tensorflow/core/ops/nn_ops.cc                 |   2 +
 .../python/kernel_tests/conv_ops_3d_test.py   | 196 +++++++++++++++++-
 tensorflow/python/ops/nn_grad.py              |   6 +
 7 files changed, 426 insertions(+), 32 deletions(-)

diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 72eeda7a43..0916c9b7a8 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -487,6 +487,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   string data_format;
   Status s = c->GetAttr("data_format", &data_format);
 
+  std::vector<int32> dilations;
+  TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations));
+
+  if (dilations.size() != 5) {
+    return errors::InvalidArgument(
+        "Conv3D requires the dilation attribute to contain 5 values, but got: ",
+        dilations.size());
+  }
+
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
   if (strides.size() != 5) {
@@ -496,6 +505,7 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   }
 
   int32 stride_planes, stride_rows, stride_cols;
+  int32 dilation_planes, dilation_rows, dilation_cols;
   if (s.ok() && data_format == "NCDHW") {
     // Convert input_shape to NDHWC.
     auto dim = [&](char dimension) {
@@ -506,10 +516,16 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
     stride_planes = strides[2];
     stride_rows = strides[3];
     stride_cols = strides[4];
+    dilation_planes = dilations[2];
+    dilation_cols = dilations[3];
+    dilation_rows = dilations[4];
   } else {
     stride_planes = strides[1];
     stride_rows = strides[2];
     stride_cols = strides[3];
+    dilation_planes = dilations[1];
+    dilation_cols = dilations[2];
+    dilation_rows = dilations[3];
   }
 
   DimensionHandle batch_size_dim = c->Dim(input_shape, 0);
@@ -530,13 +546,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
   DimensionHandle output_planes, output_rows, output_cols;
 
-  TF_RETURN_IF_ERROR(
-      GetWindowedOutputSizeFromDims(c, in_planes_dim, filter_planes_dim,
-                                    stride_planes, padding, &output_planes));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_rows_dim, filter_rows_dim, stride_rows, padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_cols_dim, filter_cols_dim, stride_cols, padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes,
+      padding, &output_planes));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding,
+      &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding,
+      &output_cols));
 
   ShapeHandle output_shape;
   if (data_format == "NCDHW") {
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 13d429b895..919e0967c0 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -644,15 +644,19 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) {
                     .Finalize(&op.node_def));
   };
 
-  // 1x1x1 filter
-  set_op({{1, 1, 1, 1, 1}}, "VALID");
-  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
-
   // Invalid rank for input
   INFER_ERROR("must be rank 5", op, "[4,4];[2,1,1,1]");
   // Invalid rank for filter
   INFER_ERROR("must be rank 5", op, "[1,4,4,1];[2,1,1]");
 
+  // Invalid value for strides
+  set_op({{1, 1, 1, 0, 1}}, "VALID");
+  INFER_ERROR("must be > 0", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // 1x1x1 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
   // unknown dims in the critical fields give partial inference.
   INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
   INFER_OK(op, "[1,?,2,2,1];[1,1,1,1,1]", "[d0_0,?,2,2,d1_4]");
@@ -712,6 +716,49 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) {
   INFER_OK(op, "[1,4,9,4,1];[2,2,2,1,?]", "[d0_0,2,3,1,d1_4]");
 }
 
+TEST(CommonShapeFnsTest, Conv3DDilatedShapeTest) {
+  ShapeInferenceTestOp op("Conv3D");
+  auto set_op = [&op](const std::vector<int32>& dilations,
+                      const std::vector<int32>& strides,
+                      const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv3D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("dilations", dilations)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid rank for dilation
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_ERROR("contain 5 values", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // Invalid value for dilation
+  set_op({{1, 2, 0, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_ERROR("must be >= 1", op, "[1,2,2,2,1];[1,1,1,1,1]");
+
+  // 2x1x1 dilation 1x1x1 filter
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 2x1x1 dilation 2x2x2 filter
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,2,2,1];[2,2,2,1,1]", "[d0_0,1,1,1,d1_4]");
+
+  // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x2x2 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 2, 2, 2, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 2, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,3,3,d1_4]");
+
+  // 2x1x1 dilation 4x4x4 input, 2x2x2 filter, 1x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "SAME");
+  INFER_OK(op, "[1,4,4,4,1];[2,2,2,1,1]", "[d0_0,d0_1,d0_2,d0_3,d1_4]");
+}
+
 TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) {
   ShapeInferenceTestOp op("DepthwiseConv2dNative");
   std::vector<int32> strides = {{1, 1, 1, 1}};
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 1234997bc5..092e859a5b 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -79,13 +79,18 @@ typedef Eigen::GpuDevice GPUDevice;
       context, out_depth == GetTensorDim(out_backprop, data_format_, 'C'),     \
       errors::InvalidArgument(                                                 \
           label, ": filter and out_backprop must have the same out_depth"));   \
+  const std::array<int64, 3> dilations = {                                     \
+      {GetTensorDim(dilation_, data_format_, '0'),                             \
+       GetTensorDim(dilation_, data_format_, '1'),                             \
+       GetTensorDim(dilation_, data_format_, '2')}};                           \
   const std::array<int64, 3> strides = {                                       \
       {GetTensorDim(stride_, data_format_, '0'),                               \
        GetTensorDim(stride_, data_format_, '1'),                               \
        GetTensorDim(stride_, data_format_, '2')}};                             \
   std::array<int64, 3> out, padding;                                           \
-  OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides,    \
-                                          padding_, &out, &padding));          \
+  OP_REQUIRES_OK(                                                              \
+      context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,  \
+                                 padding_, &out, &padding));                   \
   OP_REQUIRES(context, output_planes == out[0],                                \
               errors::InvalidArgument(                                         \
                   label,                                                       \
@@ -151,6 +156,26 @@ class Conv3DBackpropInputOp : public OpKernel {
               "Conv3DBackpropInputOpV2 only supports NDHWC on the CPU."));
     }
 
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+
+    // TODO(yangzihao): Add CPU version of dilated conv 3D.
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, '0') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '1') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '2') == 1),
+                errors::InvalidArgument(
+                    "Current CPU implementation does not yet support "
+                    "dilation rates larger than 1."));
+
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -223,6 +248,7 @@ class Conv3DBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -261,6 +287,26 @@ class Conv3DBackpropFilterOp : public OpKernel {
               "Conv3DBackpropFilterOpV2 only supports NDHWC on the CPU."));
     }
 
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+
+    // TODO(yangzihao): Add CPU version of dilated conv 3D.
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, '0') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '1') == 1 &&
+                 GetTensorDim(dilation_, data_format_, '2') == 1),
+                errors::InvalidArgument(
+                    "Current CPU implementation does not yet support "
+                    "dilation rates larger than 1."));
+
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -370,6 +416,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -438,6 +485,22 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
     }
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -448,6 +511,12 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
          GetTensorDim(stride_, data_format_, 'N') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -471,6 +540,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
     if (filter_size[0] == 1 && filter_size[1] == 1 && filter_size[2] == 1 &&
+        dilation_[0] == 1 && dilation_[1] == 1 && dilation_[2] == 1 &&
         stride_[0] == 1 && stride_[1] == 1 && stride_[2] == 1 &&
         data_format_ == FORMAT_NHWC) {
       const uint64 m = batch * input_size[0] * input_size[1] * input_size[2];
@@ -580,7 +650,10 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
     perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, padding_cols / 2)
@@ -645,9 +718,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
-        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
-        // conv is supported.
-        /*dilation=*/{{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -755,6 +826,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -784,6 +856,22 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
     }
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'C') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'N') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
     OP_REQUIRES(context, stride_.size() == 5,
                 errors::InvalidArgument("Sliding window strides field must "
@@ -794,6 +882,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
          GetTensorDim(stride_, data_format_, 'N') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -820,6 +914,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
     if (filter_size[1] == 1 && filter_size[2] == 1 && filter_size[0] == 1 &&
+        dilations[2] == 1 && dilations[1] == 1 && dilations[0] == 1 &&
         strides[2] == 1 && strides[1] == 1 && strides[0] == 1 &&
         data_format_ == FORMAT_NHWC) {
       const uint64 m = in_depth;
@@ -943,7 +1038,10 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
     perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, padding_cols / 2)
@@ -1016,7 +1114,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
-        {{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1102,6 +1200,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 0b7c1524e6..48dd3c9eb0 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -49,12 +49,18 @@ template <typename T>
 struct LaunchConvOp<CPUDevice, T> {
   static void launch(OpKernelContext* context, bool cudnn_use_autotune,
                      const Tensor& input, const Tensor& filter,
+                     const std::array<int64, 3>& dilations,
                      const std::array<int64, 3>& strides, const Padding padding,
                      TensorFormat data_format, Tensor* output) {
     OP_REQUIRES(context, data_format == FORMAT_NHWC,
                 errors::InvalidArgument("CPU implementation of Conv3D "
                                         "currently only supports the NHWC "
                                         "tensor format."));
+    OP_REQUIRES(context,
+                dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1,
+                errors::InvalidArgument("CPU implementation of Conv3D "
+                                        "currently only supports dilated rates "
+                                        "of 1."));
     functor::CuboidConvolution<CPUDevice, T>()(
         context->eigen_device<CPUDevice>(), output->tensor<T, 5>(),
         input.tensor<T, 5>(), filter.tensor<T, 5>(), strides[2], strides[1],
@@ -80,6 +86,28 @@ class Conv3DOp : public BinaryOp<T> {
          GetTensorDim(stride_, data_format_, 'C') == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(stride_, data_format_, '0') > 0 &&
+         GetTensorDim(stride_, data_format_, '1') > 0 &&
+         GetTensorDim(stride_, data_format_, '2') > 0),
+        errors::InvalidArgument("Spatial strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
+    OP_REQUIRES(context, dilation_.size() == 5,
+                errors::InvalidArgument("Dilation rates field must "
+                                        "specify 5 dimensions"));
+    OP_REQUIRES(context,
+                (GetTensorDim(dilation_, data_format_, 'N') == 1 &&
+                 GetTensorDim(dilation_, data_format_, 'C') == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilation rates in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
+         GetTensorDim(dilation_, data_format_, '1') > 0 &&
+         GetTensorDim(dilation_, data_format_, '2') > 0),
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
@@ -115,13 +143,18 @@ class Conv3DOp : public BinaryOp<T> {
          GetTensorDim(input, data_format_, '2')}};
     std::array<int64, 3> filter_size = {
         {filter.dim_size(0), filter.dim_size(1), filter.dim_size(2)}};
+    std::array<int64, 3> dilations = {
+        {GetTensorDim(dilation_, data_format_, '0'),
+         GetTensorDim(dilation_, data_format_, '1'),
+         GetTensorDim(dilation_, data_format_, '2')}};
     std::array<int64, 3> strides = {{GetTensorDim(stride_, data_format_, '0'),
                                      GetTensorDim(stride_, data_format_, '1'),
                                      GetTensorDim(stride_, data_format_, '2')}};
     std::array<int64, 3> out, padding;
 
-    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides,
-                                            padding_, &out, &padding));
+    OP_REQUIRES_OK(
+        context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,
+                                   padding_, &out, &padding));
     TensorShape out_shape = ShapeFromFormat(
         data_format_, in_batch, {{out[0], out[1], out[2]}}, out_depth);
     Tensor* output;
@@ -131,10 +164,12 @@ class Conv3DOp : public BinaryOp<T> {
     if (out_shape.num_elements() == 0) return;
 
     LaunchConvOp<Device, T>::launch(context, cudnn_use_autotune_, input, filter,
-                                    strides, padding_, data_format_, output);
+                                    dilations, strides, padding_, data_format_,
+                                    output);
   }
 
  private:
+  std::vector<int32> dilation_;
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
@@ -165,6 +200,7 @@ template <typename T>
 struct LaunchConvOp<GPUDevice, T> {
   static void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
                      const Tensor& input_param, const Tensor& filter,
+                     const std::array<int64, 3>& dilations,
                      const std::array<int64, 3>& strides, const Padding padding,
                      TensorFormat data_format, Tensor* output) {
     auto* stream = ctx->op_device_context()->stream();
@@ -199,6 +235,7 @@ struct LaunchConvOp<GPUDevice, T> {
 
     // NOTE: This only works in NHWC.
     if (filter_planes == 1 && filter_rows == 1 && filter_cols == 1 &&
+        dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 &&
         strides[0] == 1 && strides[1] == 1 && strides[2] == 1 &&
         data_format == FORMAT_NHWC) {
       // 1x1 filter, so call cublas directly.
@@ -330,7 +367,10 @@ struct LaunchConvOp<GPUDevice, T> {
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
     perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
-    conv_desc.set_filter_stride(DimIndex::X, strides[2])
+    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
+        .set_dilation_rate(DimIndex::Y, dilations[1])
+        .set_dilation_rate(DimIndex::Z, dilations[0])
+        .set_filter_stride(DimIndex::X, strides[2])
         .set_filter_stride(DimIndex::Y, strides[1])
         .set_filter_stride(DimIndex::Z, strides[0])
         .set_zero_padding(DimIndex::X, pad_cols / 2)
@@ -377,9 +417,7 @@ struct LaunchConvOp<GPUDevice, T> {
         {{in_planes, in_rows, in_cols}},
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
-        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
-        // conv is supported.
-        /*dilation=*/{{1, 1, 1}},
+        {{dilations[0], dilations[1], dilations[2]}},
         {{strides[0], strides[1], strides[2]}},
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 12d6dc5eaf..6dc3d9df31 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -524,6 +524,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropInputV2")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -537,6 +538,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropFilterV2")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &out));
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index f4616fd661..0b531125f3 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -61,18 +62,18 @@ class Conv3DTest(test.TestCase):
 
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, stride,
                             padding, data_format, dtype, use_gpu):
-    total_size_1 = 1
-    total_size_2 = 1
+    total_size_tensor = 1
+    total_size_filter = 1
     for s in tensor_in_sizes:
-      total_size_1 *= s
+      total_size_tensor *= s
     for s in filter_in_sizes:
-      total_size_2 *= s
+      total_size_filter *= s
 
     # Initializes the input tensor with array containing numbers from 0 to 1.
     # We keep the input tensor values fairly small to avoid overflowing float16
     # during the conv3d.
-    x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)]
+    x1 = [f * 1.0 / total_size_tensor for f in range(1, total_size_tensor + 1)]
+    x2 = [f * 1.0 / total_size_filter for f in range(1, total_size_filter + 1)]
     with self.test_session(use_gpu=use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
@@ -118,6 +119,79 @@ class Conv3DTest(test.TestCase):
 
           self.assertAllClose(expected, value.flatten(), atol=tol, rtol=tol)
 
+  def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
+                                   stride, dilation, padding, data_format,
+                                   use_gpu):
+    total_size_tensor = 1
+    total_size_filter = 1
+    for s in tensor_in_sizes:
+      total_size_tensor *= s
+    for s in filter_in_sizes:
+      total_size_filter *= s
+
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_tensor + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_filter + 1)]
+    with self.test_session(use_gpu=use_gpu):
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes)
+      if isinstance(stride, collections.Iterable):
+        strides = list(stride)
+      else:
+        strides = [stride, stride, stride]
+      if data_format == "NCDHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        full_strides = [1, 1] + strides
+        full_dilation = [1, 1] + dilation
+      else:
+        full_strides = [1] + strides + [1]
+        full_dilation = [1] + dilation + [1]
+      expected = nn_ops.convolution(
+          t1,
+          t2,
+          padding=padding,
+          strides=strides,
+          dilation_rate=dilation,
+          data_format=data_format)
+      computed = nn_ops.conv3d(
+          t1,
+          t2,
+          strides=full_strides,
+          dilations=full_dilation,
+          padding=padding,
+          data_format=data_format)
+      if data_format == "NCDHW":
+        expected = test_util.NCHWToNHWC(expected)
+        computed = test_util.NCHWToNHWC(computed)
+    return expected, computed
+
+  def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, stride,
+                               padding, dilations):
+    expected_results = []
+    computed_results = []
+    default_dilations = (
+        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
+    for data_format, use_gpu in GetTestConfigs():
+      # If any dilation rate is larger than 1, only do test on the GPU
+      # because we currently do not have a CPU implementation for arbitrary
+      # dilation rates.
+      if default_dilations or use_gpu:
+        expected, computed = self._ComputeReferenceDilatedConv(
+            tensor_in_sizes, filter_in_sizes, stride, dilations, padding,
+            data_format, use_gpu)
+        expected_results.append(expected)
+        computed_results.append(computed)
+        tolerance = 1e-2 if use_gpu else 1e-5
+        with self.test_session() as sess:
+          expected_values = sess.run(expected_results)
+          computed_values = sess.run(computed_results)
+          for e_value, c_value in zip(expected_values, computed_values):
+            print("expected = ", e_value)
+            print("actual = ", c_value)
+            self.assertAllClose(
+                e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-6)
+
   def testConv3D1x1x1Filter(self):
     expected_output = [
         0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5, 0.59259259,
@@ -145,6 +219,15 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  def testConv3D1x1x1Filter2x1x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 3, 6, 1, 1],
+          filter_in_sizes=[1, 1, 1, 1, 1],
+          stride=1,
+          padding="VALID",
+          dilations=[2, 1, 1])
+
   # Expected values computed using scipy's correlate function.
   def testConv3D2x2x2Filter(self):
     expected_output = [
@@ -161,6 +244,15 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  def testConv3D2x2x2Filter1x2x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 4, 6, 3, 1],
+          filter_in_sizes=[2, 2, 2, 1, 1],
+          stride=1,
+          padding="VALID",
+          dilations=[1, 2, 1])
+
   def testConv3DStrides(self):
     expected_output = [
         0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095, 0.13988095,
@@ -546,6 +638,98 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  # Testing for backprops
+  def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
+                            strides, dilations, padding, data_format, use_gpu,
+                            err, mode):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (
+        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
+
+    # If any dilation rate is larger than 1, only do test on the GPU
+    # because we currently do not have a CPU implementation for arbitrary
+    # dilation rates.
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCDHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCDHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        actual = nn_ops.conv3d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        expected = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCDHW":
+          actual = test_util.NCHWToNHWC(actual)
+          expected = test_util.NCHWToNHWC(expected)
+        actual_grad = gradients_impl.gradients(actual, t1
+                                               if mode == "input" else t2)[0]
+        expected_grad = gradients_impl.gradients(expected, t1
+                                                 if mode == "input" else t2)[0]
+        # "values" consists of two tensors for two backprops
+        actual_value = sess.run(actual_grad)
+        expected_value = sess.run(expected_grad)
+        self.assertShapeEqual(actual_value, actual_grad)
+        self.assertShapeEqual(expected_value, expected_grad)
+      print("expected = ", expected_value)
+      print("actual = ", actual_value)
+      self.assertArrayNear(expected_value.flatten(), actual_value.flatten(),
+                           err)
+
+  def testConv3D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackprop(
+            input_sizes=[1, 3, 6, 1, 1],
+            filter_sizes=[2, 2, 1, 1, 1],
+            output_sizes=[1, 1, 5, 1, 1],
+            strides=[1, 1, 1],
+            dilations=[2, 1, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5,
+            mode="filter")
+
+  def testConv3D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackprop(
+            input_sizes=[1, 3, 6, 1, 1],
+            filter_sizes=[2, 2, 1, 1, 1],
+            output_sizes=[1, 1, 5, 1, 1],
+            strides=[1, 1, 1],
+            dilations=[2, 1, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5,
+            mode="input")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 4af5bd26dd..3a41391340 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -94,6 +94,7 @@ def _Conv3DGrad(op, grad):
           array_ops.shape(op.inputs[0]),
           op.inputs[1],
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
@@ -101,6 +102,7 @@ def _Conv3DGrad(op, grad):
           op.inputs[0],
           array_ops.shape(op.inputs[1]),
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -116,12 +118,14 @@ def _Conv3DBackpropInputGrad(op, grad):
           grad,
           array_ops.shape(op.inputs[1]),
           op.inputs[2],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format),
       nn_ops.conv3d(
           grad,
           op.inputs[1],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
@@ -136,12 +140,14 @@ def _Conv3DBackpropFilterGrad(op, grad):
           array_ops.shape(op.inputs[0]),
           grad,
           op.inputs[2],
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format), None,
       nn_ops.conv3d(
           op.inputs[0],
           grad,
+          dilations=op.get_attr("dilations"),
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           data_format=data_format)
-- 
GitLab


From a175841eb549f069ac205fb32bf55314a387fe6d Mon Sep 17 00:00:00 2001
From: jinghuangintel <jing1.huang@intel.com>
Date: Fri, 20 Apr 2018 12:20:00 -0700
Subject: [PATCH 1218/1262] [INTEL MKLDNN]: Upgrade mkldnn version to v13
 (#18508)

* upgrade mkldnn version to v13

* upgrade mkldnn version to v13 for all platforms
---
 tensorflow/workspace.bzl | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c58ef87338..f0a81f7754 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_linux",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz",
       ],
-      sha256 = "feacc3d82565c1231470359b42c696236fae873704e0b013436afba5fd4fd30f",
-      strip_prefix = "mklml_lnx_2018.0.1.20171227",
+      sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146",
+      strip_prefix = "mklml_lnx_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip"
       ],
-      sha256 = "24bae8d7b22b431a654acadea43f2243c46ae6b1e5a73a4a936825f31d284ee4",
-      strip_prefix = "mklml_win_2018.0.1.20171227",
+      sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded",
+      strip_prefix = "mklml_win_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
   mkl_repository(
       name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
-          "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz"
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz",
+          "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz"
       ],
-      sha256 = "0e954ec6fd3dc5e37f64c4043f6b5613dd687558da3df1028b3b7c29ff5cf77f",
-      strip_prefix = "mklml_mac_2018.0.1.20171227",
+      sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943",
+      strip_prefix = "mklml_mac_2018.0.2.20180127",
       build_file = clean_dep("//third_party/mkl:mkl.BUILD")
   )
 
@@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.12.tar.gz",
-          "https://github.com/intel/mkl-dnn/archive/v0.12.tar.gz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
+          "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz",
       ],
-      sha256 = "86fa2a8c12a56e3b725945acedeaa82492746be02545aba6d710f097e013e19e",
-      strip_prefix = "mkl-dnn-0.12",
+      sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f",
+      strip_prefix = "mkl-dnn-0.13",
       build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
   )
 
-- 
GitLab


From b23e91d247368f2046dae035b5c7bdda56512077 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 12:37:39 -0700
Subject: [PATCH 1219/1262] Changed tf_to_tflite build rule.

PiperOrigin-RevId: 193707628
---
 tensorflow/contrib/lite/build_def.bzl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index b8f6b7fd59..8521677682 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -124,19 +124,19 @@ def tf_to_tflite(name, src, options, out):
     out: name of the output flatbuffer file.
   """
 
-  toco = "//tensorflow/contrib/lite/toco:toco"
+  toco_cmdline = " ".join([
+      "//tensorflow/contrib/lite/toco:toco",
+      "--input_format=TENSORFLOW_GRAPHDEF",
+      "--output_format=TFLITE",
+      ("--input_file=$(location %s)" % src),
+      ("--output_file=$(location %s)" % out),
+  ] + options )
   native.genrule(
       name = name,
-      srcs=[src, options],
+      srcs=[src],
       outs=[out],
-      cmd = ("$(location %s) " +
-             "   --input_file=$(location %s) " +
-             "   --output_file=$(location %s) " +
-             "   --input_format=TENSORFLOW_GRAPHDEF" +
-             "   --output_format=TFLITE" +
-             "   `cat $(location %s)`")
-            % (toco, src, out, options),
-      tools= [toco],
+      cmd = toco_cmdline,
+      tools= ["//tensorflow/contrib/lite/toco:toco"],
   )
 
 def tflite_to_json(name, src, out):
-- 
GitLab


From 517d1912f4ec71180944320350a3694332a1dedc Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 20 Apr 2018 12:40:57 -0700
Subject: [PATCH 1220/1262] Add a utility to visualize object-based checkpoints

Useful for generating a warm fuzzy feeling that everything you think should be saved was saved, and for explaining what object-based checkpointing is. (Also useful on the former front will be a planned "assert that all of this Graph's trainable variables are accessible from object X" function.)

Somewhat hacky since it generates strings rather than using the pydot bindings (and so works without a pydot dependency).

PiperOrigin-RevId: 193708003
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/checkpoint/__init__.py     |   3 +
 tensorflow/contrib/checkpoint/python/BUILD    |  32 +++++
 .../contrib/checkpoint/python/visualize.py    | 111 ++++++++++++++++++
 .../checkpoint/python/visualize_test.py       |  97 +++++++++++++++
 5 files changed, 244 insertions(+)
 create mode 100644 tensorflow/contrib/checkpoint/python/visualize.py
 create mode 100644 tensorflow/contrib/checkpoint/python/visualize_test.py

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 7e47516550..d28392a62c 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -25,6 +25,7 @@ py_library(
         "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/boosted_trees:init_py",
+        "//tensorflow/contrib/checkpoint/python:checkpoint",
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 70d7d2d8d7..1192cc44a1 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -16,6 +16,7 @@
 
 
 For creating and managing dependencies:
+@@dot_graph_from_checkpoint
 @@split_dependency
 """
 
@@ -24,6 +25,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
+from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index d57b01aab2..a5681ffa61 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -4,6 +4,15 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+py_library(
+    name = "checkpoint",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":split_dependency",
+        ":visualize",
+    ],
+)
+
 py_library(
     name = "split_dependency",
     srcs = ["split_dependency.py"],
@@ -27,3 +36,26 @@ py_test(
         "//tensorflow/python/eager:test",
     ],
 )
+
+py_library(
+    name = "visualize",
+    srcs = ["visualize.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
+py_test(
+    name = "visualize_test",
+    srcs = ["visualize_test.py"],
+    deps = [
+        ":visualize",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+    ],
+)
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
new file mode 100644
index 0000000000..86fbdb41d2
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -0,0 +1,111 @@
+"""Utilities for visualizing dependency graphs."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.training import checkpointable
+
+
+def dot_graph_from_checkpoint(save_path):
+  r"""Visualizes an object-based checkpoint (from `tf.train.Checkpoint`).
+
+  Useful for inspecting checkpoints and debugging loading issues.
+
+  Example usage from Python (requires pydot):
+  ```python
+  import tensorflow as tf
+  import pydot
+
+  dot_string = tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt')
+  parsed, = pydot.graph_from_dot_data(dot_string)
+  parsed.write_svg('/tmp/tensorflow/visualized_checkpoint.svg')
+  ```
+
+  Example command line usage:
+  ```sh
+  python -c "import tensorflow as tf;\
+    print(tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt'))"\
+    | dot -Tsvg > /tmp/tensorflow/checkpoint_viz.svg
+  ```
+
+  Args:
+    save_path: The checkpoint prefix, as returned by `tf.train.Checkpoint.save`
+      or `tf.train.latest_checkpoint`.
+  Returns:
+    A graph in DOT format as a string.
+  """
+  reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+  try:
+    object_graph_string = reader.get_tensor(
+        checkpointable.OBJECT_GRAPH_PROTO_KEY)
+  except errors_impl.NotFoundError:
+    raise ValueError(
+        ('The specified checkpoint "%s" does not appear to be object-based (it '
+         'is missing the key "%s"). Likely it was created with a name-based '
+         'saver and does not contain an object dependency graph.') % (
+             save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY))
+  shape_map = reader.get_variable_to_shape_map()
+  dtype_map = reader.get_variable_to_dtype_map()
+  object_graph = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  object_graph.ParseFromString(object_graph_string)
+  graph = 'digraph {\n'
+  def _escape(name):
+    return name.replace('"', '\\"')
+  slot_ids = set()
+  for node in object_graph.nodes:
+    for slot_reference in node.slot_variables:
+      slot_ids.add(slot_reference.slot_variable_node_id)
+  for node_id, node in enumerate(object_graph.nodes):
+    if (len(node.attributes) == 1
+        and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY):
+      if node_id in slot_ids:
+        color = 'orange'
+        tooltip_prefix = 'Slot variable'
+      else:
+        color = 'blue'
+        tooltip_prefix = 'Variable'
+      attribute = node.attributes[0]
+      graph += ('N_%d [shape=point label="" color=%s width=.25'
+                ' tooltip="%s %s shape=%s %s"]\n') % (
+                    node_id,
+                    color,
+                    tooltip_prefix,
+                    _escape(attribute.full_name),
+                    shape_map[attribute.checkpoint_key],
+                    dtype_map[attribute.checkpoint_key].name)
+    elif node.slot_variables:
+      graph += ('N_%d [shape=point label="" width=.25 color=red,'
+                'tooltip="Optimizer"]\n') % node_id
+    else:
+      graph += 'N_%d [shape=point label="" width=.25]\n' % node_id
+    for reference in node.children:
+      graph += 'N_%d -> N_%d [label="%s"]\n' % (
+          node_id, reference.node_id, _escape(reference.local_name))
+    for slot_reference in node.slot_variables:
+      graph += 'N_%d -> N_%d [label="%s" style=dotted]\n' % (
+          node_id,
+          slot_reference.slot_variable_node_id,
+          _escape(slot_reference.slot_name))
+      graph += 'N_%d -> N_%d [style=dotted]\n' % (
+          slot_reference.original_variable_node_id,
+          slot_reference.slot_variable_node_id)
+  graph += '}\n'
+  return graph
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
new file mode 100644
index 0000000000..1d9ab78923
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.contrib.checkpoint.python import visualize
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable_utils
+
+try:
+  import pydot  # pylint: disable=g-import-not-at-top
+except ImportError:
+  pydot = None
+
+
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class DotGraphTests(test.TestCase):
+
+  def testMakeDotGraph(self):
+    with context.eager_mode():
+      input_value = constant_op.constant([[3.]])
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      optimizer_step = resource_variable_ops.ResourceVariable(12)
+      save_checkpoint = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+      optimizer.minimize(functools.partial(model, input_value))
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+      save_path = save_checkpoint.save(checkpoint_prefix)
+      prefix = save_checkpoint.save(save_path)
+
+    dot_graph_string = visualize.dot_graph_from_checkpoint(prefix)
+
+    # The remainder of this test is more-or-less optional since it's so
+    # dependent on pydot/platform/Python versions.
+    if pydot is None:
+      self.skipTest('pydot is required for the remainder of this test.')
+    try:
+      parsed, = pydot.graph_from_dot_data(dot_graph_string)
+    except NameError as e:
+      if "name 'dot_parser' is not defined" in str(e):
+        self.skipTest("pydot isn't working")
+      else:
+        raise
+    # Check that the graph isn't completely trivial
+    self.assertEqual(
+        '"model"',
+        parsed.obj_dict['edges'][('N_0', 'N_1')][0]['attributes']['label'])
+    image_path = os.path.join(self.get_temp_dir(), 'saved.svg')
+    try:
+      parsed.write_svg(image_path)
+    except Exception as e:  # pylint: disable=broad-except
+      # For some reason PyDot's "dot not available" error is an Exception, not
+      # something more specific.
+      if '"dot" not found in path' in str(e):
+        self.skipTest("pydot won't save SVGs (dot not available)")
+      else:
+        raise
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 0b6ca72332735fe460da23fbcca5c8c24d838f28 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 13:18:02 -0700
Subject: [PATCH 1221/1262] Update ops-related pbtxt files.

PiperOrigin-RevId: 193712839
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 124 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  26 ++++
 2 files changed, 150 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index dbd6f859c4..247f9edf5b 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -13445,6 +13445,68 @@ op {
     version: 10
   }
 }
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
 op {
   name: "Conv3DBackpropFilterV2"
   input_arg {
@@ -13718,6 +13780,68 @@ op {
     version: 10
   }
 }
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
 op {
   name: "Conv3DBackpropInputV2"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 46afe357f0..d1773daebe 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5651,6 +5651,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
   deprecation {
     version: 10
     explanation: "Use Conv3DBackpropFilterV2"
@@ -5774,6 +5787,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
   deprecation {
     version: 10
     explanation: "Use Conv3DBackpropInputV2"
-- 
GitLab


From 02075fa2456d951ff3b7bdb8fee76a1b9c6d8716 Mon Sep 17 00:00:00 2001
From: Guozhong Zhuang <Guozhong.Zhuang@intel.com>
Date: Fri, 20 Apr 2018 13:43:06 -0700
Subject: [PATCH 1222/1262] MKLDNN: conv2d forward DNN primitive reuse
 enhancement (#17943)

* Enable conv2d fwd primitive reuse

* coding style change based on suggestions from TF team

* minor code style fix

* refactor conv2d primitive reuse class and enhance key creation utility

* refactor by introducing ConvFwdDimensions structure

* change 'Execute' method to be a template one per PR review suggestion

* Per PR review suggestion, update DnnOp class to declared related  method as abstract ones

* refactor AddAsKey method - template for scalar value and remove Execute()which is not used yet

* rename padding_l/_r/pl/pr to padding_left or padding_right as recommended

* parameter and variable renaming - to make them more explicit
---
 tensorflow/core/kernels/mkl_conv_ops.cc | 414 +++++++++++++++++-------
 tensorflow/core/util/mkl_util.h         |  87 ++++-
 2 files changed, 389 insertions(+), 112 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index f0818eb96d..f2b14f1278 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <map>
 #include <string>
 #include <vector>
+#include <memory>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -42,14 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
-
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
 using mkldnn::stream;
-
-using mkldnn::convolution_direct;
 using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
+
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -57,11 +57,232 @@ using mkldnn::convolution_forward;
 
 namespace tensorflow {
 
+#ifndef INTEL_MKL_ML
+
+struct ConvFwdDimensions {
+  memory::dims src_dims;
+  memory::dims filter_dims;
+  memory::dims bias_dims;
+  memory::dims dst_dims;
+  memory::dims strides;
+  memory::dims dilations;
+  memory::dims padding_left;
+  memory::dims padding_right;
+
+  ConvFwdDimensions(memory::dims src_dims,
+    memory::dims filter_dims, memory::dims bias_dims,
+    memory::dims dst_dims, memory::dims strides,
+    memory::dims dilations, memory::dims padding_left,
+    memory::dims padding_right) :
+      src_dims(src_dims), filter_dims(filter_dims),
+      bias_dims(bias_dims), dst_dims(dst_dims),
+      strides(strides), dilations(dilations),
+      padding_left(padding_left), padding_right(padding_right) {
+  }
+};
+
+template <typename T>
+class Conv2DFwd : public DnnOp {
+ public:
+  explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    fwd_stream_.reset(new stream(stream::kind::eager));
+    // create conv primitive
+    if (conv_fwd_ == nullptr) {
+      Setup(convFwdDims);
+    }
+  }
+
+  ~Conv2DFwd() {}
+
+  // Convolution forward execute with bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   bias_data:   input data buffer of bias
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    bias_mem_->set_data_handle(static_cast<void*>(bias_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    bias_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // Convolution forward execute without bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // expected memory format for this primitive instance
+  memory::format src_fmt_;
+  memory::format filter_fmt_;
+
+  // convolution primitive
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::primitive> conv_fwd_;
+
+ private:
+  void Setup(const ConvFwdDimensions& convFwdDims) {
+    // create memory descriptors for convolution data w/ no specified format
+    src_md_.reset(new memory::desc({convFwdDims.src_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    filter_md_.reset(new memory::desc({convFwdDims.filter_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    dst_md_.reset(new memory::desc({convFwdDims.dst_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    if (!convFwdDims.bias_dims.empty())
+        bias_md_.reset(new memory::desc({convFwdDims.bias_dims},
+            MklDnnType<T>(), memory::format::any));
+
+    // create a convolution
+    if (!convFwdDims.bias_dims.empty()) {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    } else {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    }
+
+    fwd_pd_.reset(new convolution_forward::primitive_desc(
+        *fwd_desc_, cpu_engine_));
+
+    // store the expected memory format
+    src_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->src_primitive_desc().desc().data.format);
+
+    filter_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->weights_primitive_desc().desc().data.format);
+
+    // create memory primitive based on dummy data
+    src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData));
+    filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(),
+                      DummyData));
+    dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData));
+
+    // create convolution primitive and add it to net
+    if (!convFwdDims.bias_dims.empty()) {
+        bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(),
+                        memory::format::x}, cpu_engine_}, DummyData));
+        conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+                        *filter_mem_, *bias_mem_, *dst_mem_));
+    } else {
+        conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+                        *filter_mem_, *dst_mem_));
+    }
+
+    fwd_primitives_.push_back(*conv_fwd_);
+    return;
+  }
+
+  // MKLDNN memory
+  std::shared_ptr<mkldnn::memory> src_mem_;
+  std::shared_ptr<mkldnn::memory> filter_mem_;
+  std::shared_ptr<mkldnn::memory> bias_mem_;
+  std::shared_ptr<mkldnn::memory> dst_mem_;
+
+  std::shared_ptr<mkldnn::stream> fwd_stream_;
+  std::vector<mkldnn::primitive> fwd_primitives_;
+
+  // desc & prmitive desc
+  std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
+
+  // memory desc
+  std::shared_ptr<mkldnn::memory::desc> src_md_;
+  std::shared_ptr<mkldnn::memory::desc> filter_md_;
+  std::shared_ptr<mkldnn::memory::desc> bias_md_;
+  std::shared_ptr<mkldnn::memory::desc> dst_md_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+template <typename T>
+class Conv2DFwdFactory : public DnnOpFactory<T> {
+ public:
+  static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) {
+     Conv2DFwd<T>* conv2d_fwd = nullptr;
+
+     // try to find a suitable one in pool
+     conv2d_fwd = dynamic_cast<Conv2DFwd<T>*> (
+       Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims));
+
+     if (conv2d_fwd == nullptr) {
+       conv2d_fwd = new Conv2DFwd<T>(convFwdDims);
+       Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd(
+           convFwdDims, conv2d_fwd);
+     }
+     return conv2d_fwd;
+  }
+
+ private:
+  Conv2DFwdFactory() {}
+  ~Conv2DFwdFactory() {}
+
+  static const int kDilationH = 0, kDilationW = 1;
+
+  static Conv2DFwdFactory& GetInstance() {
+    static Conv2DFwdFactory instance_;
+    return instance_;
+  }
+
+  static std::string CreateKey(const ConvFwdDimensions& convFwdDims) {
+    std::string prefix = "conv2d_fwd_";
+    FactoryKeyCreator key_creator;
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(convFwdDims.src_dims);
+    key_creator.AddAsKey(convFwdDims.filter_dims);
+    key_creator.AddAsKey(convFwdDims.bias_dims);
+    key_creator.AddAsKey(convFwdDims.dst_dims);
+    key_creator.AddAsKey(convFwdDims.strides);
+    key_creator.AddAsKey(convFwdDims.dilations);
+    key_creator.AddAsKey(convFwdDims.padding_left);
+    key_creator.AddAsKey(convFwdDims.padding_right);
+    return key_creator.GetKey();
+  }
+
+  DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    std::string key = CreateKey(convFwdDims);
+    return this->GetOp(key);
+  }
+
+  void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) {
+    std::string key = CreateKey(convFwdDims);
+    this->SetOp(key, op);
+  }
+};
+
+#endif
+
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// MKL-DNN is now default. MKL-ML must be specified explicitly.
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
 #ifdef INTEL_MKL_ML
-
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
  public:
@@ -528,8 +749,6 @@ class MklConv2DOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     try {
-      auto cpu_engine = engine(engine::cpu, 0);
-
       // Input tensors
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
@@ -538,16 +757,16 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-                  errors::InvalidArgument("Filter should not be in "
-                                          "Mkl Layout"));
+            errors::InvalidArgument("Filter should not be in "
+            "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);  // output
 
-      memory::dims src_dims, filter_dims, padding_l, padding_r,
+      memory::dims src_dims, filter_dims, padding_left, padding_right,
                    dilations, strides;
-      memory::dims output_dims_tf_order, output_dims_mkl_order;
+      memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
@@ -555,31 +774,29 @@ class MklConv2DOp : public OpKernel {
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
       conv_utl.GetConvFwdSizesInMklOrder(
-          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
-          &dilations, &output_dims_tf_order, &output_dims_mkl_order,
-          &padding_l, &padding_r);
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
+          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
+          &padding_left, &padding_right);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
-      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
+      TensorShape dst_tf_shape = MklDnnDimsToTFShape(dst_dims_tf_order);
 
       // Corner cases: output with 0 elements and 0 batch size.
-      Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
-        // TODO(jbobba): Verify correctness here
-        //               Need semantics for Null MKL tensor
-        MklDnnShape output_mkl_shape;
-        output_mkl_shape.SetMklTensor(false);
-
-        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                  src_tf_shape, output_mkl_shape);
+      Tensor* dst_tensor = nullptr;
+      if (dst_tf_shape.num_elements() == 0 ||
+          dst_dims_tf_order[0] == 0) {
+        MklDnnShape dst_mkl_shape;
+        dst_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
+                    &dst_tensor, src_tf_shape, dst_mkl_shape);
 
         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor, filter_tf_shape,
-                                  filter_mkl_shape);
+                                  &output_filter_tensor,
+                                  filter_tf_shape, filter_mkl_shape);
         return;
       }
 
@@ -587,6 +804,7 @@ class MklConv2DOp : public OpKernel {
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
       auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
@@ -595,6 +813,7 @@ class MklConv2DOp : public OpKernel {
                         ? src_mkl_shape.GetMklLayout()
                         : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
+
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
@@ -603,98 +822,70 @@ class MklConv2DOp : public OpKernel {
                                           memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);
 
-      // Set output shape (output_dims) required in MKL-DNN order.
-      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
-      // depending on data format). But later we propagate Mkl layout of the
-      // output to the next op directly.
-      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      src.SetOpMemDesc(src_dims, memory::format::any);
-      filter.SetOpMemDesc(filter_dims, memory::format::any);
-      output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
-
       // MKLDNN dilation starts from 0.
       dilations[kDilationH] -= 1;
       dilations[kDilationW] -= 1;
 
+      // get a conv2d fwd from primitive pool
+      Conv2DFwd<T> *conv2d_fwd = nullptr;
+      if (biasEnabled) {
+        memory::dims bias_dims = {};
+        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+      } else {
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+      }
+
+      // allocate output tensors output_tensor and filter_out_tensor
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
+      conv_fwd_pd = conv2d_fwd->fwd_pd_;
+      AllocateOutputTensor(context, *conv_fwd_pd,
+                       dst_dims_mkl_order, tf_fmt, &dst_tensor);
+      Tensor* filter_out_tensor = nullptr;
+      AllocateFilterOutputTensor(context, *conv_fwd_pd,
+                                 TFShapeToMklDnnDims(filter_tf_shape),
+                                 &filter_out_tensor);
+
+      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());
+
+      // check whether src/filter need reorder
+      std::vector<primitive> net;
+      if (src_md.data.format != conv2d_fwd->src_fmt_)
+          src.CheckReorderToOpMem(
+              conv_fwd_pd.get()->src_primitive_desc(), &net);
+
+      if (filter_md.data.format != conv2d_fwd->filter_fmt_)
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc(),
+              filter.GetTensorBuffer(filter_out_tensor), &net);
+      stream(stream::kind::eager).submit(net).wait();
+
+      T* src_data = static_cast<T*>(
+                src.GetOpMem().get_data_handle());
+      T* filter_data = static_cast<T*>(
+                filter.GetOpMem().get_data_handle());
+
+      // execute convolution
       if (biasEnabled) {
-          // Create convolution primitive with Bias.
-          MklDnnData<T> bias(&cpu_engine);
-          memory::dims bias_size;
-          conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_size);
-          const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
-          bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
-          bias.SetOpMemDesc(bias_size, memory::format::any);
-
-          // Create convolution primitive with Bias.
-          // Use MKLDNN dilated convolution in case of dilated rate (>0).
-          auto conv_desc = (dilations[kDilationH] > 0 ||
-              dilations[kDilationW] > 0) ?
-              convolution_forward::desc(prop_kind::forward,
-                      convolution_direct, src.GetOpMemDesc(),
-                      filter.GetOpMemDesc(), bias.GetOpMemDesc(),
-                      output.GetOpMemDesc(), strides, dilations,
-                      padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_)):
-              convolution_forward::desc(prop_kind::forward,
-                      convolution_direct, src.GetOpMemDesc(),
-                      filter.GetOpMemDesc(), bias.GetOpMemDesc(),
-                      output.GetOpMemDesc(), strides,
-                      padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_));
-
-          auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                  cpu_engine);
-          AllocateOutputTensor(context, conv_prim_desc,
-                               output_dims_mkl_order, tf_fmt, &output_tensor);
-          // Set data handle for output.
-          output.SetUsrMemDataHandle(output_tensor);
-
-          Tensor* filter_out_tensor = nullptr;
-          AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-
-          PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
-                               filter_out_tensor);
+        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
+        T* bias_data = static_cast<T*>(const_cast<T*>(
+            bias_tensor.flat<T>().data()));
+
+        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
-          // Create convolution primitive without Bias.
-          // Use MKLDNN dilated convolution in case of dilated rate (>0).
-          auto conv_desc = (dilations[kDilationH] > 0 ||
-            dilations[kDilationW] > 0) ?
-            convolution_forward::desc(prop_kind::forward,
-              convolution_direct, src.GetOpMemDesc(),
-              filter.GetOpMemDesc(), output.GetOpMemDesc(),
-              strides, dilations, padding_l, padding_r,
-              TFPaddingToMklDnnPadding(padding_)):
-          convolution_forward::desc(prop_kind::forward,
-              convolution_direct, src.GetOpMemDesc(),
-              filter.GetOpMemDesc(), output.GetOpMemDesc(),
-              strides, padding_l, padding_r,
-              TFPaddingToMklDnnPadding(padding_));
-
-          auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                  cpu_engine);
-          AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
-                               tf_fmt, &output_tensor);
-          // Set data handle for output.
-          output.SetUsrMemDataHandle(output_tensor);
-
-          Tensor* filter_out_tensor = nullptr;
-          AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-          PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                              nullptr, &output, filter_out_tensor);
+        conv2d_fwd->Execute(src_data, filter_data, dst_data);
       }
-    } catch (mkldnn::error& e) {
+    } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + std::string(e.message) + ", in file " +
-                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
+                       ", message: " + std::string(e.message) +
+                       ", in file " + std::string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+        errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -706,6 +897,7 @@ class MklConv2DOp : public OpKernel {
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
+  engine cpu_engine = engine(engine::cpu, 0);
 
   // Allocate output tensor.
   void AllocateOutputTensor(
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index bc6d2d77a4..50a8e30574 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -19,6 +19,8 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include <unordered_map>
+#include <utility>
 
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -1759,7 +1761,90 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_ML
+/// Base class for operations with reuse of DNN primitives
+///
+class DnnOp {
+ public:
+  virtual ~DnnOp() {}
+
+  // Dummy data. Its size, hard-coded as 256 here, does
+  // not matter since MKL should never operate on this buffer.
+  unsigned char DummyData[256];
+};
+
+const mkldnn::memory::dims NONE_DIMS = {};
+// This constant is used to declare dummy buffer (size), for MKL primitives
+template <typename T>
+class DnnOpFactory {
+ public:
+  DnnOpFactory() {}
+  ~DnnOpFactory() {}
+
+  DnnOp* GetOp(const std::string& key) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+    if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) {
+      return nullptr;
+    } else {
+      return stream_iter->second;
+    }
+  }
+
+  void SetOp(const std::string& key, DnnOp* op) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+
+    CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end());
+
+    DnnOpFactory<T>::GetHashMap()[key] = op;
+  }
+
+ private:
+  static inline std::unordered_map<std::string, DnnOp*> &GetHashMap() {
+    static thread_local std::unordered_map<std::string, DnnOp*> map_;
+    return map_;
+  }
+};
+
+// utility class for creating keys of MKL primitive pool.
+class FactoryKeyCreator {
+ public:
+  FactoryKeyCreator() {
+    key_.reserve(kMaxKeyLength);
+  }
+
+  ~FactoryKeyCreator() {}
+
+  void AddAsKey(const string &str) {
+    auto buffer = reinterpret_cast<const char *>(str.c_str());
+    Append(buffer, str.length());
+  }
+
+  void AddAsKey(const mkldnn::memory::dims &dims) {
+    for (unsigned int i = 0; i < dims.size(); i++) {
+      AddAsKey<int>(dims[i]);
+    }
+  }
+
+  template <typename T>
+  void AddAsKey(const T data) {
+    auto buffer = reinterpret_cast<const char *>(&data);
+    Append(buffer, sizeof(T));
+  }
+
+  std::string GetKey() {
+    return key_;
+  }
+
+ private:
+  string key_;
+  const char delimiter = 'x';
+  const int kMaxKeyLength = 256;
+  void Append(const char* data, int len) {
+    key_.append(data, len);
+    key_.append(1, delimiter);
+  }
+};
+
+#endif  // INTEL_MKL_DNN
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-- 
GitLab


From 99167d3a6393ac47c2e01b6f620a03adeb9ac3e4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 13:48:37 -0700
Subject: [PATCH 1223/1262] Merged commit includes the following changes:
 193717076  by yifeif:

    Automated g4 rollback of changelist 193713153.

--
193716750  by fchollet:

    Refactor `tf.keras.layers.Embedding` layer to use `embedding_lookup` instead of `gather`. This makes the layer TPU-compatible.

--
193716664  by A. Unique TensorFlower:

    Go: Update generated wrapper functions for TensorFlow ops.

--
193713153  by power:

    Experimental Keras TPU compatibility layer.

--

PiperOrigin-RevId: 193717076
---
 tensorflow/go/op/wrappers.go                  | 32 +++++++++++++++++--
 tensorflow/python/keras/BUILD                 |  1 +
 .../keras/_impl/keras/layers/embeddings.py    |  4 +--
 .../_impl/keras/layers/embeddings_test.py     | 13 ++++++++
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 3b3dff0573..ec7d9dcc4f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5917,6 +5917,17 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the filter.
 //
 // DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
@@ -5930,11 +5941,14 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
@@ -12306,6 +12320,17 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
+
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the input.
 //
 // DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
@@ -12319,11 +12344,14 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 70040b7e74..1c58553156 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -208,6 +208,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 591bab7cd8..07b8726b85 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,7 +24,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -155,7 +155,7 @@ class Embedding(Layer):
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
       inputs = math_ops.cast(inputs, 'int32')
-    out = array_ops.gather(self.embeddings, inputs)
+    out = embedding_ops.embedding_lookup(self.embeddings, inputs)
     return out
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 9f6793eac8..6ebf5dc94a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
@@ -65,6 +67,17 @@ class EmbeddingTest(test.TestCase):
         input_dtype='int32',
         expected_output_dtype='float32')
 
+  def test_embedding_correctness(self):
+    with self.test_session():
+      layer = keras.layers.Embedding(output_dim=2, input_dim=2)
+      layer.build((None, 2))
+      matrix = np.array([[1, 1], [2, 2]])
+      layer.set_weights([matrix])
+
+      inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
+      outputs = keras.backend.eval(layer(inputs))
+      self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 5a4356be6822dfe0b0f973852b9b65d69e4c169c Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Fri, 20 Apr 2018 13:54:00 -0700
Subject: [PATCH 1224/1262] Fix for: Suggest braces around initialization of
 subobject.

PiperOrigin-RevId: 193717872
---
 tensorflow/python/lib/core/bfloat16.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
index 7f07deebef..77fa2c1f66 100644
--- a/tensorflow/python/lib/core/bfloat16.cc
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -616,8 +616,8 @@ bool Initialize() {
   };
 
   // Comparisons
-  const std::array<int, 3> compare_types = {npy_bfloat16_, npy_bfloat16_,
-                                            NPY_BOOL};
+  const std::array<int, 3> compare_types = {
+      {npy_bfloat16_, npy_bfloat16_, NPY_BOOL}};
 
   if (!register_ufunc("equal", CompareUFunc<Bfloat16EqFunctor>,
                       compare_types)) {
-- 
GitLab


From 1cd64d57143814fc0652c09165735be62d96124f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 13:56:55 -0700
Subject: [PATCH 1225/1262] Track dependencies between outside_compilation
 clusters so that control edges can be correctly added to sequence compiled
 computations.

PiperOrigin-RevId: 193718295
---
 .../jit/encapsulate_subgraphs_pass.cc         | 378 ++++++++++-
 .../jit/encapsulate_subgraphs_pass_test.cc    | 590 +++++++++++++++++-
 tensorflow/compiler/tf2xla/xla_compiler.cc    |  25 +
 tensorflow/compiler/tf2xla/xla_compiler.h     |  20 +
 4 files changed, 1005 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 9465385b58..7507e193b5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 #include "tensorflow/compiler/jit/shape_inference_helpers.h"
@@ -160,6 +161,11 @@ class Encapsulator {
             std::move(outside_compilation_attribute)),
         graph_in_(graph_in) {}
 
+  // Find dependencies between subgraphs and outside_compilation clusters that
+  // only manifest via edges between outside_compilation clusters in the outer
+  // (non-compiled) graph.
+  Status FindClusterDependencies();
+
   // Find subgraphs marked with 'group_attribute', and build a new
   // subgraph, one for each value of 'group_attribute'.
   Status SplitIntoSubgraphs();
@@ -230,6 +236,19 @@ class Encapsulator {
   // the shapes of any ancestor RAH outputs. If it can be determined that the
   // shape of the SFH inputs will not be inferrable even once the shapes of the
   // RAH outputs are known, an error is returned by the rewriter.
+  //
+  // Once edges between compiled and outside_compilation clusters have been
+  // replaced by send/recv ops, some dependencies may no longer be apparent.
+  // A clustering pass finds all the dependencies between HC nodes that are only
+  // present as a result of edges between nodes in outside_compilaton clusters.
+  // Suppose there is a path from outside_compilation cluster C in subgraph S
+  // to outside_compilation cluster D in subgraph T. If S != T then a control
+  // edge is added from the call node for S to the call node for T, which
+  // ensures that C will execute before D because S executes before T. If S==T
+  // then a control dependency is added between the HC nodes for C and D in S,
+  // and the HC node for C is added to an 'ancestors' attr in the HC node for D
+  // so that during compilation of the HC node for D, an XLA control dependency
+  // can be added to ensure C's SendToHost executes before D's RecvFromHost.
   class Subgraph {
    public:
     // Creates a graph to build the subgraph in, if it doesn't already exist,
@@ -324,6 +343,18 @@ class Encapsulator {
     void RecordOutsideCompilationOutputOrControl(
         const string& outside_compilation_id, const Edge* edge);
 
+    // Records the fact that there is a path from a node in outside_compilation
+    // cluster ancestor to node in cluster successor that does not go through
+    // the subgraph.
+    void RecordOutsideCompilationDependency(const string& successor,
+                                            const string& ancestor);
+
+    // Returns the mapping from outside_compilation cluster C to the set of
+    // outside_compilation clusters that have a path to C entirely outside
+    // compiled subgraphs.
+    const std::unordered_map<string, std::unordered_set<string>>
+    OutsideCompilationAncestorMap() const;
+
     // Adds the HostCompute nodes for each outside_compilation subgraph.
     Status AddHostComputes(
         const string& subgraph_name,
@@ -406,6 +437,13 @@ class Encapsulator {
     Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph,
                                         Graph* graph_out);
 
+    // Get the set of outside_compilation clusters and the dependency edges
+    // between them.
+    void GetActiveClusterDependencyGraph(
+        std::unordered_set<string>* clusters,
+        std::unordered_set<string>* has_successor,
+        std::unordered_map<string, std::unordered_set<string>>* ancestors_map);
+
     // Builds a _RecvAtHost node producing all the inputs of an
     // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host.
     Status AddRecvAtHostNode(const string& group_attribute,
@@ -468,6 +506,14 @@ class Encapsulator {
     // The outside_compilation clusters in this subgraph.
     std::unordered_map<string, OutsideCompilationSubgraph>
         outside_compilation_subgraphs_;
+    // For each outside_compilation cluster C, the outside_compilation clusters
+    // that have a path to C outside the compiled graph.
+    std::unordered_map<string, std::unordered_set<string>>
+        outside_compilation_ancestors_;
+    // For each outside_compilation cluster C, the outside_compilation clusters
+    // that have a path from C outside the compiled graph.
+    std::unordered_map<string, std::unordered_set<string>>
+        outside_compilation_successors_;
 
     // NoOp node in the output graph that is sequenced after the call node and
     // used to prevent host-side outside_compilation sends and recvs from being
@@ -556,6 +602,10 @@ class Encapsulator {
       std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
           edges_added);
 
+  // Adds control dependencies between subgraph call nodes that have
+  // dependencies via outside_compilation edges.
+  Status AddCallNodeDependencies(Graph* graph_out);
+
   // Adds all edges to the output graph.
   Status AddEdgesToOutputGraph(
       const std::unordered_map<const Node*, Node*>& node_images,
@@ -620,10 +670,65 @@ class Encapsulator {
   const Graph* graph_in_;
 
   std::unordered_map<string, Subgraph> subgraphs_;
+  // For each subgraph S the subgraphs S' such that there is a path in some
+  // outside_compilation cluster C in S to some outside_compilation cluster C'
+  // in S', that goes only through the uncompiled graph.
+  std::unordered_map<string, std::unordered_set<string>> subgraph_ancestors_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator);
 };
 
+namespace {
+
+// Return in 'sorted' a topological sort of clusters according to the
+// dependencies encoded in ancestors. clusters is the list of all clusters
+// including clusters that are not present in the ancestors map. has_successors
+// is the set of clusters that are ancestors of some other cluster.
+void TopologicalClusterSort(
+    const std::unordered_set<string>& clusters,
+    const std::unordered_set<string>& has_successors,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    std::vector<string>* sorted) {
+  // The nodes are placed in 'sorted' in topological order.
+  sorted->clear();
+  // We don't use the standard DFS because we are not operating on Node*
+  // objects.
+  struct Work {
+    string cluster;
+    bool leave;
+  };
+  std::set<string> visited;
+  std::vector<Work> stack;
+  // Seed the processing list with clusters that have no successors.
+  for (const auto& cluster : clusters) {
+    if (has_successors.find(cluster) == has_successors.end()) {
+      stack.push_back({cluster, false});
+    }
+  }
+  while (!stack.empty()) {
+    const Work item = stack.back();
+    stack.pop_back();
+    if (item.leave) {
+      sorted->push_back(item.cluster);
+      continue;
+    }
+
+    if (visited.find(item.cluster) != visited.end()) continue;
+    visited.insert(item.cluster);
+
+    stack.push_back({item.cluster, true});
+    const auto& iter = ancestors.find(item.cluster);
+    if (iter != ancestors.end()) {
+      for (const auto& ancestor : iter->second) {
+        stack.push_back({ancestor, false});
+      }
+    }
+  }
+  CHECK(sorted->size() == clusters.size());
+}
+
+}  // namespace
+
 Node* Encapsulator::Subgraph::GetCallNodeForInputs() const {
   return call_node_inputs_;
 }
@@ -786,12 +891,71 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
   }
 }
 
+void Encapsulator::Subgraph::RecordOutsideCompilationDependency(
+    const string& successor, const string& ancestor) {
+  outside_compilation_ancestors_[successor].insert(ancestor);
+  outside_compilation_successors_[ancestor].insert(successor);
+}
+
+const std::unordered_map<string, std::unordered_set<string>>
+Encapsulator::Subgraph::OutsideCompilationAncestorMap() const {
+  return outside_compilation_ancestors_;
+}
+
+void Encapsulator::Subgraph::GetActiveClusterDependencyGraph(
+    std::unordered_set<string>* clusters,
+    std::unordered_set<string>* has_successor,
+    std::unordered_map<string, std::unordered_set<string>>* ancestors_map) {
+  // During initial clustering the ancestor and successor datastructures may
+  // have been built including oc_cluster names that never turned into subgraphs
+  // because they had no edges into or out of the compiled cluster. Remove them
+  // before proceeding to simplify the logic. Get the set of clusters that was
+  // actually added, then remove references to the others.
+  for (const auto& oc_subgraph : outside_compilation_subgraphs_) {
+    clusters->insert(oc_subgraph.first);
+  }
+  for (const auto& cluster : outside_compilation_successors_) {
+    if (clusters->find(cluster.first) != clusters->end()) {
+      for (const auto& successor : cluster.second) {
+        if (clusters->find(successor) != clusters->end()) {
+          has_successor->insert(cluster.first);
+          break;
+        }
+      }
+    }
+  }
+  for (const auto& cluster : outside_compilation_ancestors_) {
+    if (clusters->find(cluster.first) != clusters->end()) {
+      std::unordered_set<string>& ancestors = (*ancestors_map)[cluster.first];
+      for (const auto& ancestor : cluster.second) {
+        if (clusters->find(ancestor) != clusters->end()) {
+          ancestors.insert(ancestor);
+        }
+      }
+    }
+  }
+}
+
 Status Encapsulator::Subgraph::AddHostComputes(
     const string& subgraph_name,
     const std::unordered_map<const Node*, Node*>& node_images) {
-  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
-    const string& oc_subgraph_name = oc_subgraph_iter.first;
-    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
+  // Get the set of outside_compilation clusters and the dependency edges
+  // between them.
+  std::unordered_set<string> clusters;
+  std::unordered_set<string> has_successor;
+  std::unordered_map<string, std::unordered_set<string>> ancestors_map;
+  GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map);
+  // Topologically sort the outside_compilation clusters according to their
+  // dependency relation.
+  std::vector<string> sorted_clusters;
+  TopologicalClusterSort(clusters, has_successor, ancestors_map,
+                         &sorted_clusters);
+
+  // The host compute nodes added for each outside_compilation_cluster;
+  std::unordered_map<string, Node*> host_compute_node;
+  for (const string& oc_subgraph_name : sorted_clusters) {
+    OutsideCompilationSubgraph& oc_subgraph =
+        outside_compilation_subgraphs_[oc_subgraph_name];
     if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
         !oc_subgraph.outputs_by_src.empty() ||
         !oc_subgraph.control_outputs.empty()) {
@@ -811,13 +975,22 @@ Status Encapsulator::Subgraph::AddHostComputes(
         inputs[input_index].Reset(src_image->name(), src_slot, dtype);
         input_dtypes[input_index] = dtype;
       }
-
       for (const auto& output : oc_subgraph.outputs_by_src) {
         DataType dtype = output.first.dtype;
         int output_index = output.second;
         output_dtypes[output_index] = dtype;
       }
 
+      std::vector<string> host_compute_ancestors;
+      const auto iter = ancestors_map.find(oc_subgraph_name);
+      if (iter != ancestors_map.end()) {
+        for (const string& ancestor_cluster : iter->second) {
+          host_compute_ancestors.push_back(
+              outside_compilation_subgraphs_[ancestor_cluster]
+                  .host_compute_name);
+        }
+      }
+
       NodeDef host_compute_def;
       NodeDefBuilder builder(strings::StrCat("outside_compilation_",
                                              oc_subgraph_name, "_host_compute"),
@@ -825,6 +998,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       builder.Input(inputs);
       builder.Attr("Tinputs", input_dtypes);
       builder.Attr("Toutputs", output_dtypes);
+      builder.Attr("ancestors", host_compute_ancestors);
       builder.Attr("key",
                    strings::StrCat("host_compute_channel_", subgraph_name, "_",
                                    oc_subgraph_name));
@@ -834,6 +1008,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
 
       Node* host_compute = graph_->AddNode(host_compute_def, &s);
       if (!s.ok()) return s;
+      host_compute_node[host_compute->name()] = host_compute;
       oc_subgraph.host_compute_name = host_compute->name();
 
       // Connect the _HostCompute node to its producers in the subgraph.
@@ -852,6 +1027,12 @@ Status Encapsulator::Subgraph::AddHostComputes(
         graph_->AddControlEdge(src_image, host_compute);
       }
 
+      // Connect the _HostCompute node to its ancestor host compute nodes.
+      for (const auto& ancestor_name : host_compute_ancestors) {
+        Node* ancestor = host_compute_node[ancestor_name];
+        graph_->AddControlEdge(ancestor, host_compute);
+      }
+
       // Connect the consumers in the subgraph to the _HostCompute node.
       for (const auto& output : oc_subgraph.outputs_by_dst) {
         const Node* dst_node = output.first.node;
@@ -1654,6 +1835,17 @@ Status Encapsulator::CopyEdgeToOutputGraph(
   return Status::OK();
 }
 
+Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
+  for (const auto& ancestors : subgraph_ancestors_) {
+    const string& subgraph = ancestors.first;
+    for (const string& ancestor : ancestors.second) {
+      graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(),
+                                subgraphs_[subgraph].GetCallNodeForInputs());
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::AddEdgesToOutputGraph(
     const std::unordered_map<const Node*, Node*>& node_images,
     bool parallel_checking, Graph* graph_out) {
@@ -1703,6 +1895,7 @@ Status Encapsulator::AddEdgesToOutputGraph(
     Subgraph& subgraph = subgraph_entry.second;
     subgraph.ConnectSequencerToCallNode(graph_out);
   }
+  TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out));
 
   return Status::OK();
 }
@@ -1960,6 +2153,182 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
   return Status::OK();
 }
 
+namespace {
+
+// Helper struct for building cluster dependencies and also debugging cycles in
+// the dependencies. While computing dependencies we construct a mapping from
+// Node* to PathDetails.
+struct PathDetails {
+  struct SubgraphAndCluster {
+    string subgraph;
+    string outside_compilation_cluster;
+    bool operator==(const SubgraphAndCluster& other) const {
+      return subgraph == other.subgraph &&
+             outside_compilation_cluster == other.outside_compilation_cluster;
+    }
+  };
+
+  struct SubgraphAndClusterHash {
+    inline std::size_t operator()(const SubgraphAndCluster& v) const {
+      return hash<string>()(
+          strings::StrCat(v.subgraph, v.outside_compilation_cluster));
+    }
+  };
+
+  typedef std::unordered_set<SubgraphAndCluster, SubgraphAndClusterHash>
+      SubgraphAndClusterSet;
+
+  // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as
+  // ancestors for any successor of this node. If the node is in the outer
+  // graph, it returns the transitive union of the ancestors of the node's
+  // inputs. If the node is in an outside_compilation cluster, it returns just
+  // that cluster. If the node is compiled, it returns the empty set.
+  SubgraphAndClusterSet AncestorsForSuccessor() {
+    if (subgraph.empty()) {
+      return ancestor_clusters;
+    } else if (outside_compilation_cluster.empty()) {
+      return SubgraphAndClusterSet();
+    } else {
+      SubgraphAndCluster entry;
+      entry.subgraph = subgraph;
+      entry.outside_compilation_cluster = outside_compilation_cluster;
+      return SubgraphAndClusterSet({entry});
+    }
+  }
+
+  // The transitive union of the ancestor's of this node's inputs. This is only
+  // saved for debugging in order to print out enough information to debug a
+  // discovered cycle.
+  SubgraphAndClusterSet ancestor_clusters;
+  // The subgraph attr on this node.
+  string subgraph;
+  // The outside_compilation attr on this node.
+  string outside_compilation_cluster;
+};
+
+// Adds an edge from ancestor to successor to the cycle detector, and returns an
+// error if that edge causes the formation of a cycle. In the error case, logs
+// the contents of the node_ancestors_map to facilitate debugging.
+Status CheckClusterDependencyForCycles(
+    const string& ancestor, const string& successor,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
+    GraphCycles* cycle_detector, std::map<string, int>* cycle_detector_map) {
+  if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
+  }
+  if (cycle_detector_map->find(successor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[successor] = cycle_detector->NewNode();
+  }
+
+  if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor],
+                                  (*cycle_detector_map)[successor])) {
+    LOG(ERROR) << "Cycle in outside_compilation clusters";
+    for (const auto& cluster : ancestors) {
+      LOG(ERROR) << "Cluster " << cluster.first << " depends on:";
+      for (const auto& ancestor : cluster.second) {
+        LOG(ERROR) << "  " << ancestor;
+      }
+    }
+    for (const auto& node_ancestors : node_ancestors_map) {
+      LOG(ERROR) << "Node " << node_ancestors.first->name() << " ("
+                 << node_ancestors.second.subgraph << ";"
+                 << node_ancestors.second.outside_compilation_cluster
+                 << ") has ancestor clusters:";
+      for (const auto& ancestor : node_ancestors.second.ancestor_clusters) {
+        LOG(ERROR) << "  " << ancestor.subgraph << ";"
+                   << ancestor.outside_compilation_cluster;
+      }
+    }
+    return errors::InvalidArgument(
+        "Can't compile outside_compilation clusters because there is a "
+        "dependency cycle: see error log for details.");
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status Encapsulator::FindClusterDependencies() {
+  // Map from nodes to ancestor details. A node is entered into the map if it is
+  // in a compilation subgraph, and outside_compilation cluster, or appears on a
+  // path in the outer graph leading from an outside_compilation subgraph.
+  std::unordered_map<Node*, PathDetails> node_ancestors_map;
+  // We check that clusters are acyclic using this cycle detector.
+  GraphCycles cycle_detector;
+  // Map from cluster name to cycle detector node id.
+  std::map<string, int> cycle_detector_map;
+  // Process the nodes in topologically-sorted order.
+  std::vector<Node*> nodes;
+  GetReversePostOrder(*graph_in_, &nodes);
+  for (Node* node : nodes) {
+    string subgraph_name;
+    string oc_cluster;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster));
+    // First create an entry in the ancestors map if the node is in a compiled
+    // subgraph or outside_compilation cluster, or if any incoming edge is from
+    // a node with an ancestor map entry; and find the union of all the
+    // ancestors.
+    if (!subgraph_name.empty()) {
+      node_ancestors_map[node].subgraph = subgraph_name;
+      node_ancestors_map[node].outside_compilation_cluster = oc_cluster;
+    }
+    for (Node* src : node->in_nodes()) {
+      const auto iter = node_ancestors_map.find(src);
+      if (iter != node_ancestors_map.end()) {
+        const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor();
+        for (const auto& ancestor : ancestors_to_follow) {
+          if (ancestor.subgraph != subgraph_name ||
+              ancestor.outside_compilation_cluster != oc_cluster) {
+            node_ancestors_map[node].ancestor_clusters.insert(ancestor);
+          }
+        }
+      }
+    }
+    if (!subgraph_name.empty()) {
+      // The node is in a compiled subgraph or an outside_compilation cluster.
+      if (oc_cluster.empty()) {
+        // The node is not in an outside_compilation cluster. Record the
+        // subgraph's ancestor dependencies.
+        for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) {
+          if (cluster.subgraph != subgraph_name) {
+            subgraph_ancestors_[subgraph_name].insert(cluster.subgraph);
+            TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                cluster.subgraph, subgraph_name, subgraph_ancestors_,
+                node_ancestors_map, &cycle_detector, &cycle_detector_map));
+          }
+        }
+      } else {
+        Subgraph& subgraph = subgraphs_[subgraph_name];
+        // The node is in an outside_compilation cluster. Record the cluster
+        // and/or subgraph ancestor dependencies.
+        for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) {
+          if (cluster.subgraph == subgraph_name) {
+            // The ancestor is in the same subgraph.
+            if (cluster.outside_compilation_cluster != oc_cluster) {
+              // But not in the same oc_cluster, so record the dependency.
+              subgraph.RecordOutsideCompilationDependency(
+                  oc_cluster, cluster.outside_compilation_cluster);
+              TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                  cluster.outside_compilation_cluster, oc_cluster,
+                  subgraph.OutsideCompilationAncestorMap(), node_ancestors_map,
+                  &cycle_detector, &cycle_detector_map));
+            }
+          } else {
+            // The ancestor is in a different subgraph, so record the
+            // dependency.
+            subgraph_ancestors_[subgraph_name].insert(cluster.subgraph);
+            TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles(
+                cluster.subgraph, subgraph_name, subgraph_ancestors_,
+                node_ancestors_map, &cycle_detector, &cycle_detector_map));
+          }
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::MakePrunedGraphCopyAndInline(
     const Graph& graph, const std::vector<Node*>& sink_nodes,
     std::unique_ptr<Graph>* pruned_graph,
@@ -2166,6 +2535,7 @@ Status EncapsulateSubgraphsInFunctions(
   Encapsulator encapsulator(std::move(group_attribute),
                             std::move(outside_compilation_attribute),
                             &graph_in);
+  TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies());
   TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs());
 
   TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs(
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 8599a7038a..3502d1bb45 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -74,7 +74,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
     if (!compare(elt_a.first, elt_a.second, iter->second)) {
       if (diff) {
         *diff = strings::StrCat(map_name, " expected: element with key '",
-                                key_to_string(elt_a.first), " has value '",
+                                key_to_string(elt_a.first), "' has value '",
                                 value_to_string(elt_a.second), "' got: '",
                                 value_to_string(iter->second), "'");
       }
@@ -121,8 +121,22 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
     }
     return false;
   }
+  std::unordered_set<string> control_input_a;
+  std::unordered_set<string> control_input_b;
   for (int i = 0; i < a.input_size(); ++i) {
-    if (a.input(i) != b.input(i)) {
+    if (str_util::StartsWith(a.input(i), "^")) {
+      if (!str_util::StartsWith(b.input(i), "^")) {
+        if (diff) {
+          *diff = strings::StrCat(
+              diff_preamble, " mismatch for node ", a.name(), " input ", i,
+              ", expected control input ", a.input(i), " got ", b.input(i),
+              " expected:\n", a.DebugString(), "\ngot:\n", b.DebugString());
+        }
+        return false;
+      }
+      control_input_a.insert(a.input(i));
+      control_input_b.insert(b.input(i));
+    } else if (a.input(i) != b.input(i)) {
       if (diff) {
         *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
                                 " input ", i, ", expected ", a.input(i),
@@ -132,11 +146,29 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
       return false;
     }
   }
+  if (control_input_a != control_input_b) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              " control inputs differ expected:\n",
+                              a.DebugString(), "\ngot:\n", b.DebugString());
+    }
+    return false;
+  }
   return EqualProtoMap<string, AttrValue>(
       a.attr(), b.attr(), [](const string& s) { return s; },
       [](const AttrValue& v) { return v.DebugString(); },
       [](const string& key, const AttrValue& av, const AttrValue& bv) {
-        return av.DebugString() == bv.DebugString();
+        if (key == "ancestors") {
+          // The ancestors are added from a set so the order is unpredictable;
+          // just compare set equality not list equality.
+          std::unordered_set<string> a_set(av.list().s().begin(),
+                                           av.list().s().end());
+          std::unordered_set<string> b_set(bv.list().s().begin(),
+                                           bv.list().s().end());
+          return a_set == b_set;
+        } else {
+          return av.DebugString() == bv.DebugString();
+        }
       },
       strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
       diff);
@@ -261,6 +293,7 @@ REGISTER_OP("XlaHostCompute")
     .Output("outputs: Toutputs")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Toutputs: list(type) >= 0")
+    .Attr("ancestors: list(string) >= 0")
     .Attr("key: string")
     .Attr("shape_inference_graph: string = ''")
     .Attr("shapes: list(shape) >= 0")
@@ -899,6 +932,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
            {"C:o:0", "c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
@@ -1044,17 +1078,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            {"D:o:0", "F:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors",
+             gtl::ArraySlice<string>({"outside_compilation_O1_host_compute"})},
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O2"},
             {"shapes", gtl::ArraySlice<DataType>({})},
             {"_outside_compilation_subgraph", "O2"}},
-           {"F"}},
+           {"F", "outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
@@ -1193,6 +1230,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            {"C:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
@@ -1215,6 +1253,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
            {"G:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F2_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
@@ -1279,6 +1318,179 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
+// Test with two functions to transform, each with one outside_compilation
+// cluster, with the dependency between them purely from an outside_compilation
+// edge.
+TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = InputShaped(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Binary(c, d,
+                     b1.opts()
+                         .WithName("E")
+                         .WithControlInputs({b, d})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Node* g =
+        Binary(a, b, b1.opts().WithName("G").WithAttr("_encapsulate", "F2"));
+    Node* h = Unary(g, b1.opts()
+                           .WithName("H")
+                           .WithAttr("_encapsulate", "F2")
+                           .WithAttr("_outside", "O1")
+                           .WithControlInput(e));
+    Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2"));
+    Binary(f, i, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
+  }
+
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1",
+                            {DT_FLOAT}, shape.opts());
+    Node* h = Unary(recv, shape.opts()
+                              .WithName("H")
+                              .WithAttr("_encapsulate", "F2")
+                              .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"C:o:0", "D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}},
+           {"D"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
+      {
+          {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}},
+          {{"I"},
+           "UnaryTest",
+           {"outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"G:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F2_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F2_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
+      },
+      {{"i_0_retval", "I:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = InputShaped(b2.opts().WithName("B"));
+
+    Node* key_constant1 =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
+
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 =
+        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+
+    Node* key_constant2 =
+        KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* h = Unary(recv2, b2.opts()
+                               .WithName("H")
+                               .WithAttr("_encapsulate", "F2")
+                               .WithAttr("_outside", "O1")
+                               .WithControlInput(e));
+    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                               b2.opts());
+
+    Node* s2 = Sequencer(
+        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
+        "F2");
+    NodeBuilder node_builder2("F2", "F2", lib_def.get());
+    node_builder2.Input(a).Input(b);
+    Node* call2 = b2.opts()
+                      .WithControlInputs({s2, call1})
+                      .FinalizeBuilder(&node_builder2);
+    Binary(call1, call2, b2.opts().WithName("J"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 // Test with one outside_compilation cluster that has no inputs from the
 // compiled subgraph.
 TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
@@ -1323,6 +1535,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
@@ -1406,6 +1619,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
            {},
            {{"Tinputs", gtl::ArraySlice<DataType>({})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
@@ -1487,6 +1701,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
@@ -1567,6 +1782,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
            {"D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
@@ -1607,6 +1823,371 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph, where the ancestor has no HostCompute Op.
+TEST(EncapsulateSubgraphsTest,
+     OutsideCompilationClusterDependencyNoSrcCluster) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(a, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Node* g = Unary(f, b1.opts()
+                           .WithName("G")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O2")
+                           .WithControlInput(e));
+    Node* h = Unary(g, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    Binary(e, h, b1.opts().WithName("I"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT}, shape2.opts());
+    Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts()
+                                                .WithName("G")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O2"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"}, "UnaryTest", {"D:o:0"}},
+          {{"H"},
+           "UnaryTest",
+           {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"outside_compilation_O2_host_compute"},
+           "XlaHostCompute",
+           {"F:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O2"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O2"}}},
+      },
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                            {DT_FLOAT}, b2.opts());
+    Node* g = Unary(recv, b2.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts());
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("I"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph, where the successor has no HostCompute Op.
+TEST(EncapsulateSubgraphsTest,
+     OutsideCompilationClusterDependencyNoDstCluster) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    /*Node* g =*/Unary(a, b1.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    Binary(e, h, b1.opts().WithName("I"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, shape1.opts());
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "UnaryTest",
+           {"outside_compilation_O1_host_compute:outputs:0"}},
+          {{"H"}, "UnaryTest", {"F:o:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph",
+             "_outside_compilation_shape_inference_F1_O1"},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
+      },
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv, b2.opts()
+                              .WithName("E")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    /*Node* g =*/Unary(a, b2.opts()
+                              .WithName("G")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O2")
+                              .WithControlInput(e));
+    Node* s1 = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("I"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two outside_compilation clusters that interact outside the compiled
+// subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Node* g = Unary(d, b1.opts()
+                           .WithName("G")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O2")
+                           .WithControlInput(e));
+    Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1"));
+    /*Node* i =*/Binary(d, e,
+                        b1.opts()
+                            .WithName("I")
+                            .WithAttr("_encapsulate", "F1")
+                            .WithAttr("_outside", "O3")
+                            .WithControlInput(g));
+    Binary(e, h, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant =
+        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, shape1.opts());
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      {{{"C"}, "UnaryTest", {"a_0_arg"}},
+       {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+       {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}},
+       {{"H"}, "UnaryTest", {"F:o:0"}},
+       {{"outside_compilation_O1_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"ancestors", gtl::ArraySlice<string>({})},
+         {"key", "host_compute_channel_F1_O1"},
+         {"shape_inference_graph",
+          "_outside_compilation_shape_inference_F1_O1"},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O1"}}},
+       {{"outside_compilation_O2_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({})},
+         {"ancestors",
+          gtl::ArraySlice<string>({"outside_compilation_O1_host_compute"})},
+         {"key", "host_compute_channel_F1_O2"},
+         {"shape_inference_graph", ""},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O2"}},
+        {"outside_compilation_O1_host_compute"}},
+       {{"outside_compilation_O3_host_compute"},
+        "XlaHostCompute",
+        {"D:o:0"},
+        {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+         {"Toutputs", gtl::ArraySlice<DataType>({})},
+         {"ancestors",
+          gtl::ArraySlice<string>({"outside_compilation_O1_host_compute",
+                                   "outside_compilation_O2_host_compute"})},
+         {"key", "host_compute_channel_F1_O3"},
+         {"shape_inference_graph", ""},
+         {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+         {"_outside_compilation_subgraph", "O3"}},
+        {"outside_compilation_O1_host_compute",
+         "outside_compilation_O2_host_compute"}}},
+      {{"h_0_retval", "H:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT}, b2.opts());
+    Node* g = Unary(recv2, b2.opts()
+                               .WithName("G")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O2")
+                               .WithControlInput(e));
+    Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3",
+                             {DT_FLOAT}, b2.opts());
+    /*Node* i =*/Binary(recv3, e,
+                        b2.opts()
+                            .WithName("I")
+                            .WithAttr("_encapsulate", "F1")
+                            .WithAttr("_outside", "O3")
+                            .WithControlInput(g));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, send, recv2, recv3}),
+                         "F1");
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b).ControlInput(s1);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("J"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 // Test with one outside_compilation cluster that has no outputs from the
 // compiled subgraph.
 TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
@@ -1731,6 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {"c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 86263d847a..c0e9967684 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -813,4 +813,29 @@ Status XlaCompiler::SetHostToDeviceMetadata(
   return Status::OK();
 }
 
+Status XlaCompiler::GetHostComputeControlDependency(
+    const string& host_compute_name, xla::ComputationDataHandle* handle) {
+  const auto iter = host_compute_control_output_.find(host_compute_name);
+  if (iter == host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "No registered control handle for host compute Op '", host_compute_name,
+        "'");
+  } else {
+    *handle = iter->second;
+  }
+  return Status::OK();
+}
+
+Status XlaCompiler::SetHostComputeControlDependency(
+    const string& host_compute_name, const xla::ComputationDataHandle& handle) {
+  if (host_compute_control_output_.find(host_compute_name) !=
+      host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "Duplicate control handles registered for for host compute Op ",
+        host_compute_name);
+  }
+  host_compute_control_output_[host_compute_name] = handle;
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index a6747bbe72..8f564f35ec 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -325,6 +325,23 @@ class XlaCompiler {
                                  gtl::ArraySlice<DataType> types,
                                  gtl::ArraySlice<TensorShape> shapes);
 
+  // In order to avoid deadlocks from dependencies in host computations, it can
+  // be necessary to enforce a partial order on the execution of HostCompute
+  // Ops. In particular it may be necessary to constrain the SendToHost for one
+  // HostCompute to run before blocking on the RecvAtHost for another
+  // HostCompute. The compiler maintains a mapping from 'host_compute_name' to
+  // handle, where the handle is an 'output' of the HostCompute Op corresponding
+  // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced
+  // later can add the handle as an 'input' to enforce the constraints.
+  // 'host_compute_name' can be any string the client wishes to use to identify
+  // a given HostCompute Op as long as the names are unique within the
+  // compilation.
+  Status GetHostComputeControlDependency(const string& host_compute_name,
+                                         xla::ComputationDataHandle* handle);
+  Status SetHostComputeControlDependency(
+      const string& host_compute_name,
+      const xla::ComputationDataHandle& handle);
+
   const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
   FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; }
@@ -391,6 +408,9 @@ class XlaCompiler {
   std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_sends_;
   std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_recvs_;
 
+  std::unordered_map<string, xla::ComputationDataHandle>
+      host_compute_control_output_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
 };
 
-- 
GitLab


From d82d04f15992e224743f29aa75134ed04aa064a7 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 20 Apr 2018 13:58:51 -0700
Subject: [PATCH 1226/1262] Automated g4 rollback of changelist 193694958

PiperOrigin-RevId: 193718607
---
 .../core/distributed_runtime/master_session.cc     | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 1c67b42e76..ebe350d313 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -89,10 +89,6 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
-    } else {
-      for (Part& part : partitions_) {
-        worker_cache_->ReleaseWorker(part.name, part.worker);
-      }
     }
   }
 
@@ -1178,8 +1174,14 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  should_delete_worker_sessions_ = true;
-  return CreateWorkerSessions(options);
+  // TODO(b/36574172): Remove these conditions when ClusterSpec
+  // propagation is supported in all servers.
+  if (options.cluster_def != nullptr ||
+      session_opts_.config.isolate_session_state()) {
+    should_delete_worker_sessions_ = true;
+    return CreateWorkerSessions(options);
+  }
+  return Status::OK();
 }
 
 Status MasterSession::CreateWorkerSessions(
-- 
GitLab


From 9fc5bacba49eb31c7d536963879ccc62ecfbaf76 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 14:25:57 -0700
Subject: [PATCH 1227/1262] Pin rbe-debian8-tf container tp a newer base image

- Also improve how numpy is installed (not compiling from source) for containers based on other distros than Ubuntu14.04

PiperOrigin-RevId: 193722848
---
 tensorflow/tools/ci_build/Dockerfile.rbe.cpu             | 2 +-
 .../tools/ci_build/install/install_pip_packages.sh       | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
index 6f0798b1af..3bc52b9ed6 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -1,4 +1,4 @@
-FROM launcher.gcr.io/google/rbe-debian8:r322167
+FROM launcher.gcr.io/google/rbe-debian8:r327695
 LABEL maintainer="Yu Yi <yiyu@google.com>"
 
 # Copy install scripts
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 9644277fab..5aaf544afd 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -65,8 +65,13 @@ rm -rf /usr/lib/python3/dist-packages/six*
 # numpy needs to be installed from source to fix segfaults. See:
 # https://github.com/tensorflow/tensorflow/issues/6968
 # This workaround isn't needed for Ubuntu 16.04 or later.
-pip2 install --no-binary=:all: --upgrade numpy==1.12.0
-pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+if $(cat /etc/*-release | grep -q 14.04); then
+  pip2 install --no-binary=:all: --upgrade numpy==1.12.0
+  pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+else
+  pip2 install --upgrade numpy==1.12.0
+  pip3 install --upgrade numpy==1.12.0
+fi
 
 pip2 install scipy==0.18.1
 pip3 install scipy==0.18.1
-- 
GitLab


From 9f312f32091534bfc115212d2ec7c838180df663 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 14:30:48 -0700
Subject: [PATCH 1228/1262] Updating Generate Random Tensor to generate tensors
 whose values are small and do not cause overflow for arithmetic operations.

PiperOrigin-RevId: 193723661
---
 tensorflow/core/grappler/optimizers/BUILD      | 1 -
 tensorflow/core/grappler/utils/BUILD           | 1 +
 tensorflow/core/grappler/utils/grappler_test.h | 4 +++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3ab8d8f584..42c3580d40 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -112,7 +112,6 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
-    tags = ["noasan"],
     deps = [
         ":constant_folding",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index b473f32c45..44ef4a965b 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -128,6 +128,7 @@ cc_library(
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index e1394b9c35..c2ba5ee7e8 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -62,7 +63,8 @@ class GrapplerTest : public ::testing::Test {
   Tensor GenerateRandomTensor(const TensorShape& shape) const {
     typedef typename EnumToDataType<DTYPE>::Type T;
     Tensor tensor(DTYPE, shape);
-    tensor.flat<T>() = tensor.flat<T>().random();
+    for (auto i = 0; i < tensor.NumElements(); i++)
+      tensor.flat<T>()(i) = i + random::New64() % 10;
     return tensor;
   }
 
-- 
GitLab


From bc78f9b060cece8e29a89f7dbcdedcadbc61891d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 14:32:07 -0700
Subject: [PATCH 1229/1262] internal END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 193600682

PiperOrigin-RevId: 193723856
---
 .../layers/python/layers/rev_block_lib.py     |  77 ++-----------
 .../python/layers/rev_block_lib_test.py       | 102 ------------------
 2 files changed, 11 insertions(+), 168 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 9f904cc302..02d294c68f 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -45,7 +45,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
-from tensorflow.python.util import tf_inspect
 
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
@@ -430,13 +429,12 @@ def enable_with_args(dec):
 
 
 @enable_with_args
-def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
-                   tensor_arg_names=None):
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """Decorator that recomputes the function on the backwards pass.
 
   Args:
-    fn: the subgraph-producing function to wrap and recompute when computing
-      gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s.
+    fn: a function that takes Tensors (all as positional arguments) and returns
+      a tuple of Tensors.
     use_data_dep: `bool`, if `True` will use a dummy data dependency to force
       the recompute to happen. If `False` will use a control dependency. By
       default will be `True` if in an XLA context and `False` otherwise. XLA
@@ -445,25 +443,17 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
       that all gradients are produced before any are consumed by downstream ops.
       If `use_data_dep` is also `True`, will use a data dependency instead of
       a control dependency.
-    tensor_arg_names: `list<str>`, names of the `Tensor` arguments to `fn`. If
-      `None`, assumes all arguments are `Tensor`s.
 
   Returns:
     A wrapped fn that is identical to fn when called, but its activations will
     be discarded and recomputed on the backwards pass (i.e. on a call to
     tf.gradients).
   """
-  if tensor_arg_names:
-    if not isinstance(tensor_arg_names, (list, tuple)):
-      raise TypeError("tensor_arg_names must be a list")
 
   @functools.wraps(fn)
-  def wrapped(*args, **kwargs):
-    tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs,
-                                                    tensor_arg_names)
+  def wrapped(*args):
     return _recompute_grad(
-        tensor_only_fn, tensor_args, use_data_dep=use_data_dep,
-        tupleize_grads=tupleize_grads)
+        fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
 
   return wrapped
 
@@ -473,59 +463,11 @@ def _is_on_tpu():
   return control_flow_util.GetContainingXLAContext(ctxt) is not None
 
 
-def _make_tensor_only(fn, args, kwargs, tensor_arg_names):
-  """Return fn such that it only takes Tensor args for tensor_arg_names."""
-  argspec = tf_inspect.getargspec(fn)
-  if argspec.varargs is not None or argspec.keywords is not None:
-    raise ValueError("Function decorated with recompute_grad must not use "
-                     "*args or **kwargs.")
-  fn_arg_names = list(argspec.args)
-
-  # name_to_arg is a dict of argument name to argument value, including both
-  # positional and keyword arguments passed.
-  name_to_arg = {}
-  # Populate positional arguments.
-  for name, arg in zip(fn_arg_names[:len(args)], args):
-    name_to_arg[name] = arg
-  # Populate keyword arguments.
-  name_to_arg.update(kwargs)
-
-  # Separate the Tensor arguments from the non-Tensor arguments.
-  # The default is that all arguments are Tensor arguments.
-  tensor_arg_names = tensor_arg_names or fn_arg_names
-  for name in tensor_arg_names:
-    if name not in name_to_arg:
-      raise ValueError("Must provide Tensor argument %s" % name)
-  tensor_args = [name_to_arg[name] for name in tensor_arg_names]
-  non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items()
-                            if name not in tensor_arg_names])
-
-  # Check that Tensor arguments are in fact Tensors and that non-Tensor
-  # arguments are not.
-  for name, arg in zip(tensor_arg_names, tensor_args):
-    if not isinstance(arg, framework_ops.Tensor):
-      raise TypeError("Fn argument %s must be a Tensor." % name)
-  for name, arg in non_tensor_kwargs.items():
-    if isinstance(arg, framework_ops.Tensor):
-      raise TypeError("Fn argument %s must not be a Tensor." % name)
-
-  # Construct a Tensor-only wrapper function that will pass the non-Tensor
-  # arguments as well when called.
-  def tensor_only_fn(*tensors):
-    all_kwargs = dict(zip(tensor_arg_names, tensors))
-    all_kwargs.update(non_tensor_kwargs)
-    return fn(**all_kwargs)
-
-  return tensor_only_fn, tensor_args
-
-
-def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT,
-                    tupleize_grads=False):
+def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
   """See recompute_grad."""
   for arg in args:
     if not isinstance(arg, framework_ops.Tensor):
       raise ValueError("All inputs to function must be Tensors")
-
   use_data_dep_ = use_data_dep
   if use_data_dep_ == _USE_DEFAULT:
     use_data_dep_ = _is_on_tpu()
@@ -559,11 +501,14 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT,
     grad_vars = grads[len(inputs):]
     return grad_inputs, grad_vars
 
-  # TODO(rsepassi): Replace with tf.custom_gradient
   @_fn_with_custom_grad(grad_fn)
   def fn_with_recompute(*args):
     cached_vs.append(variable_scope.get_variable_scope())
-    cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
+    # TODO(rsepassi): Rm conditional in TF 1.4
+    if hasattr(contrib_framework_ops, "current_arg_scope"):
+      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
+    else:
+      cached_arg_scope.append({})
     return fn(*args)
 
   return fn_with_recompute(*args)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 66ccc696f9..392a490be1 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -318,108 +318,6 @@ class RecomputeTest(test.TestCase):
       self.assertEqual(1, len(grads))
       self.assertTrue(grads[0] is not None)
 
-  def testWithNontensorArgs(self):
-    @rev_block_lib.recompute_grad(tupleize_grads=True,
-                                  tensor_arg_names=["inputs"])
-    def layer_with_recompute(inputs, plus=None):
-      var = variable_scope.get_variable("var", ())
-      self.assertFalse(plus)  # called with False below
-      if plus:
-        return var + inputs
-      else:
-        return var * inputs
-
-    inputs = array_ops.ones((), dtypes.float32)
-    outputs = layer_with_recompute(inputs, plus=False)
-    loss = math_ops.square(outputs)
-    grads = gradients_impl.gradients(loss, variables.trainable_variables())
-    self.assertEqual(1, len(grads))
-    self.assertTrue(grads[0] is not None)
-
-
-class MakeTensorOnlyTest(test.TestCase):
-
-  def testMakeTensorOnly(self):
-    def fn(a, b, c, d=1, e=None, f=7):
-      return (a, b, c, d, e, f)
-
-    t1 = array_ops.ones(())
-    t2 = array_ops.ones(())
-    t3 = array_ops.ones(())
-    args = [1, t1, 3, t2]
-    kwargs = {"e": t3}
-    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
-        fn, args, kwargs, ["b", "d", "e"])
-    self.assertAllEqual(tensor_args, [t1, t2, t3])
-    out = tensor_only_fn(*tensor_args)
-    self.assertAllEqual(out, (1, t1, 3, t2, t3, 7))
-
-  def testMakeTensorOnlyPositionalArgsOnly(self):
-    def fn(a, b, c):
-      return (a, b, c)
-
-    t1 = array_ops.ones(())
-    t2 = array_ops.ones(())
-    args = [t1, 3, t2]
-    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
-        fn, args, {}, ["a", "c"])
-    self.assertAllEqual(tensor_args, [t1, t2])
-    out = tensor_only_fn(*tensor_args)
-    self.assertAllEqual(out, (t1, 3, t2))
-
-  def testMakeTensorOnlyKwargsArgsOnly(self):
-    def fn(a=1, b=2, c=3):
-      return (a, b, c)
-
-    t1 = array_ops.ones(())
-    t2 = array_ops.ones(())
-    args = [t1]
-    kwargs = {"c": t2}
-    tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only(
-        fn, args, kwargs, ["a", "c"])
-    self.assertAllEqual(tensor_args, [t1, t2])
-    out = tensor_only_fn(*tensor_args)
-    self.assertAllEqual(out, (t1, 2, t2))
-
-  def testErrorOnMissingTensorArg(self):
-    def fn(a, b):
-      return (a, b)
-
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, "provide Tensor argument"):
-      rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"])
-
-  def testErrorOnSignatureSplats(self):
-    def fn1(a, *args):
-      return (a, args)
-
-    err_msg = r"must not use \*args or \*\*kwargs"
-    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
-      rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"])
-
-    def fn2(a, **kwargs):
-      return (a, kwargs)
-
-    with self.assertRaisesWithPredicateMatch(ValueError, err_msg):
-      rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"])
-
-  def testErrorOnNonTensorForTensor(self):
-    def fn(a, b):
-      return (a, b)
-
-    with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"):
-      rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"])
-
-  def testErrorOnTensorForNonTensor(self):
-    def fn(a, b):
-      return (a, b)
-
-    with self.assertRaisesWithPredicateMatch(
-        TypeError, "must not be a Tensor"):
-      t1 = array_ops.ones(())
-      t2 = array_ops.ones(())
-      rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"])
-
 
 class FnWithCustomGradTest(test.TestCase):
 
-- 
GitLab


From b133f8c70622e52f19631fd93d4b87ee21c52ac6 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 20 Apr 2018 14:58:56 -0700
Subject: [PATCH 1230/1262] Move the guts of TFE_Execute into EagerExecute

PiperOrigin-RevId: 193728072
---
 tensorflow/c/eager/BUILD                      |   1 -
 tensorflow/c/eager/c_api.cc                   | 531 +-----------------
 tensorflow/core/common_runtime/eager/BUILD    |  21 +-
 .../core/common_runtime/eager/execute.cc      | 489 ++++++++++++++++
 .../core/common_runtime/eager/execute.h       |   7 +
 5 files changed, 508 insertions(+), 541 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index d66386acbd..fae922ea3b 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -31,7 +31,6 @@ tf_cuda_library(
             "//tensorflow/core/common_runtime/eager:context",
             "//tensorflow/core/common_runtime/eager:eager_executor",
             "//tensorflow/core/common_runtime/eager:execute",
-            "//tensorflow/core/common_runtime/eager:execute_node",
             "//tensorflow/core/common_runtime/eager:kernel_and_device",
             "//tensorflow/core/common_runtime/eager:tensor_handle",
             "//tensorflow/core/common_runtime/eager:copy_to_device_node",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index b7a3097208..975bde7c7f 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
 #include "tensorflow/core/common_runtime/eager/execute.h"
-#include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -219,9 +218,6 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
   }
   return retval;
 }
-}  // extern "C"
-
-extern "C" {
 
 TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
                   TF_Status* status) {
@@ -423,531 +419,18 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
       attr_name, tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
                      funcs.get(), num_values));
 }
-}  // extern "C"
-
-namespace {
-
-// Initializes the step stats if needed.
-void MaybeInitializeStepStats(tensorflow::StepStats* step_stats,
-                              tensorflow::EagerContext* ctx) {
-  // Lazily initialize the RunMetadata with information about all devices if
-  // this is the first call.
-  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
-    int device_idx = step_stats->dev_stats_size();
-    auto* dev_stats = step_stats->add_dev_stats();
-    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
-  }
-}
-
-int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
-                         tensorflow::EagerContext* ctx,
-                         tensorflow::Device* device) {
-  // Find the current device's index.
-  if (device == nullptr) {
-    device = ctx->HostCPU();
-  }
-  for (int i = 0; i < ctx->devices()->size(); ++i) {
-    if (ctx->devices()->at(i) == device ||
-        ctx->devices()->at(i)->name() == device->name()) {
-      return i;
-    }
-  }
-  // TODO(apassos) do not fall back to host CPU if device is unknown.
-  return 0;
-}
-
-tensorflow::Status ValidateInputTypeAndPlacement(
-    tensorflow::EagerContext* ctx, tensorflow::Device* op_device,
-    tensorflow::EagerOperation* op, const tensorflow::OpKernel* kernel,
-    tensorflow::RunMetadata* run_metadata) {
-  tensorflow::Device* host_device = ctx->HostCPU();
-  const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->Inputs().size()) {
-    return tensorflow::errors::InvalidArgument(
-        "expected ", memtypes.size(), " inputs, got ", op->Inputs().size());
-  }
-  for (int i = 0; i < op->Inputs().size(); ++i) {
-    const tensorflow::Device* expected_device =
-        memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device;
-    tensorflow::TensorHandle* handle = op->Inputs()[i];
-    tensorflow::Device* handle_device = nullptr;
-    TF_RETURN_IF_ERROR(handle->Device(&handle_device));
-    const tensorflow::Device* actual_device =
-        handle_device == nullptr ? host_device : handle_device;
-    if (expected_device != actual_device) {
-      switch (ctx->GetDevicePlacementPolicy()) {
-        case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32:
-          // TODO(xpan): See if we could bubble python related error up
-          // to python level.
-          if (handle->dtype == tensorflow::DT_INT32) {
-            // Note: enabling silent copies of int32 tensors to match behavior
-            // of graph mode.
-            break;
-          }
-          TF_FALLTHROUGH_INTENDED;
-        case tensorflow::DEVICE_PLACEMENT_EXPLICIT:
-          return tensorflow::errors::InvalidArgument(
-              "Tensors on conflicting devices:"
-              " cannot compute ",
-              op->Name(), " as input #", i, " was expected to be on ",
-              expected_device->name(), " but is actually on ",
-              actual_device->name(), " (operation running on ",
-              op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu() "
-              "methods,"
-              " or transparently copied by using tf.enable_eager_execution("
-              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
-              "between devices"
-              " may slow down your model");
-        case tensorflow::DEVICE_PLACEMENT_WARN:
-          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
-                       << " was expected to be on " << expected_device->name()
-                       << " but is actually on " << actual_device->name()
-                       << " (operation running on " << op_device->name()
-                       << "). This triggers a copy which can be a performance "
-                          "bottleneck.";
-          break;
-        case tensorflow::DEVICE_PLACEMENT_SILENT:  // Do nothing.
-          break;
-      }
-      // We are only here if the policy is warn or silent copies, so we should
-      // trigger a copy.
-      auto pre_time = tensorflow::Env::Default()->NowMicros();
-      tensorflow::TensorHandle* copied_tensor = nullptr;
-      tensorflow::Status status = tensorflow::EagerCopyToDevice(
-          handle, ctx, expected_device->name().c_str(), &copied_tensor);
-      if (run_metadata != nullptr) {
-        auto* step_stats = run_metadata->mutable_step_stats();
-        MaybeInitializeStepStats(step_stats, ctx);
-        // Record the sending on the source device for now.
-        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
-        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-        auto* node_stats = dev_stats->add_node_stats();
-        node_stats->set_node_name("_Send");
-        node_stats->set_all_start_micros(pre_time);
-        node_stats->set_op_end_rel_micros(
-            tensorflow::Env::Default()->NowMicros() - pre_time);
-      }
-      if (!status.ok()) {
-        if (copied_tensor != nullptr) copied_tensor->Unref();
-        return tensorflow::errors::Internal(
-            "Failed copying input tensor from ", actual_device->name(), " to ",
-            expected_device->name(), " in order to run ", op->Name(), ": ",
-            status.error_message());
-      }
-      handle->Unref();
-      handle = copied_tensor;
-      (*op->MutableInputs())[i] = copied_tensor;
-    }
-    if (handle->dtype != kernel->input_type(i)) {
-      return tensorflow::errors::InvalidArgument(
-          "cannot compute ", op->Name(), " as input #", i,
-          " was expected to be a ",
-          tensorflow::DataTypeString(kernel->input_type(i)),
-          " tensor but is a ", tensorflow::DataTypeString(handle->dtype),
-          " tensor");
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
-                                 tensorflow::EagerContext* ctx,
-                                 TF_Status* status) {
-  tensorflow::DeviceSet ds;
-  for (tensorflow::Device* d : *ctx->devices()) {
-    ds.AddDevice(d);
-  }
-  tensorflow::DeviceTypeVector final_devices;
-  status->status = tensorflow::SupportedDeviceTypesForNode(
-      ds.PrioritizedDeviceTypeList(), ndef, &final_devices);
-  if (!status->status.ok()) {
-    return nullptr;
-  }
-  if (final_devices.empty()) {
-    status->status = tensorflow::errors::Internal(
-        "Could not find valid device for node ", ndef.DebugString());
-    return nullptr;
-  }
-  for (tensorflow::Device* d : *ctx->devices()) {
-    if (d->device_type() == final_devices[0].type_string()) {
-      return d;
-    }
-  }
-  status->status = tensorflow::errors::Unknown(
-      "Could not find a device for node ", ndef.DebugString());
-  return nullptr;
-}
-
-#ifdef TENSORFLOW_EAGER_USE_XLA
-// Synthesizes and returns a wrapper function over `op`, which must be a
-// primitive op (e.g. matmul).
-//
-// The wrapper function conforms to the function signature expected by
-// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
-// resources>. For example, if the op has input params <Const1, Arg2, Const3,
-// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
-// Resource4> as the input params to the synthesized function.
-//
-// It populates `const_input_types`, `arg_input_types` and
-// `op_input_to_func_input` based on the reordering results, that the caller can
-// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
-// `status` accordingly.
-const tensorflow::FunctionDef* OpToFunction(
-    TFE_Op* op, std::vector<TF_DataType>* const_input_types,
-    std::vector<TF_DataType>* arg_input_types,
-    tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
-    TF_Status* status) {
-  DCHECK(!op->operation.is_function());
-
-  tensorflow::FunctionDef fdef;
-
-  // Get the OpDef of the op we are trying to encapsulate.
-  TFE_Context* ctx = op->operation.ctx;
-  const tensorflow::OpRegistrationData* op_data;
-  {
-    status->status =
-        ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
-    if (!status->status.ok()) {
-      return nullptr;
-    }
-  }
-  const tensorflow::OpDef& op_def = op_data->op_def;
-
-  tensorflow::OpDef* signature = fdef.mutable_signature();
-
-  // Handle constant inputs.
-  const std::unordered_set<string> const_inputs(
-      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
-          op->operation.Name()));
-
-  // First add place holders for the input args, so that we can refer to them by
-  // position in the next loop. Also tally up the resource inputs.
-  int num_resource_inputs = 0;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) {
-      ++num_resource_inputs;
-    }
-    signature->add_input_arg();
-  }
-
-  // Now we map the input params from `op_def` to `signature`, where the param
-  // ordering for `signature` is: <constants, args, resources>.
-  int const_index = 0;
-  int arg_index = const_inputs.size();
-  int resource_index = op_def.input_arg_size() - num_resource_inputs;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
-    tensorflow::OpDef::ArgDef* func_input_arg = nullptr;
-    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
-      VLOG(1) << "For const input, mapping op input " << i << " to func input "
-              << const_index;
-      (*op_input_to_func_input)[i] = const_index;
-      func_input_arg = signature->mutable_input_arg(const_index++);
-      const_input_types->push_back(
-          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
-    } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
-      VLOG(1) << "For resource input, mapping op input " << i
-              << " to func input " << resource_index;
-      (*op_input_to_func_input)[i] = resource_index;
-      func_input_arg = signature->mutable_input_arg(resource_index++);
-    } else {
-      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
-              << arg_index;
-      (*op_input_to_func_input)[i] = arg_index;
-      func_input_arg = signature->mutable_input_arg(arg_index++);
-      arg_input_types->push_back(
-          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
-    }
-
-    func_input_arg->set_name(op_input_arg.name());
-    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
-  }
-  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
-
-  // Resources args are at the end of the function input params, and we should
-  // have iterated over all of them.
-  DCHECK_EQ(signature->input_arg_size(), resource_index);
-
-  // Make the synthesized function's name unique.
-  signature->set_name(tensorflow::strings::StrCat(
-      op_def.name(), func_id_generator.fetch_add(1)));
-
-  // Add the node def and set its input names to match op_def's names.
-  const tensorflow::NodeDef& ndef =
-      op->operation.MutableAttrs()->BuildNodeDef();
-  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
-  *fdef.add_node_def() = ndef;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
-  }
-  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
-
-  // Fix the output names and set output types.
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
-    tensorflow::OpDef::ArgDef* arg = signature->add_output_arg();
-    const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
-    const string& out_tensor_name = tensorflow::strings::StrCat(
-        ndef.name(), ":", op_def_arg.name(), ":", 0);
-    arg->set_name(op_def_arg.name());
-    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
-    const string& type_attr = op_def_arg.type_attr();
-    if (!type_attr.empty()) {
-      auto i = ndef.attr().find(type_attr);
-      if (i == ndef.attr().end()) {
-        status->status = tensorflow::errors::InvalidArgument(
-            tensorflow::strings::StrCat("Could not find attr ", type_attr,
-                                        " in NodeDef ", ndef.DebugString()));
-        return nullptr;
-      }
-      arg->set_type(i->second.type());
-    }
-  }
-  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
-
-  status->status = ctx->context.AddFunctionDef(fdef);
-  if (!status->status.ok()) return nullptr;
-  const auto ret = ctx->context.FindFunctionDef(signature->name());
-  DCHECK(ret != nullptr);
-  return ret;
-}
-
-// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
-// via XLA.
-std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
-  auto launch_op = std::unique_ptr<TFE_Op>(
-      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
-  if (TF_GetCode(status) != TF_OK) return nullptr;
-  if (op->operation.device) {
-    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
-                    status);
-    if (TF_GetCode(status) != TF_OK) return nullptr;
-  }
-
-  const tensorflow::FunctionDef* fdef;
-  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
-  std::vector<TF_DataType> const_input_types;
-  std::vector<TF_DataType> arg_input_types;
-  tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
-  if (fdef == nullptr) {
-    // See if this is a primitive op, and if so create a function for it, so
-    // that _XlaLaunchOp can access it.
-    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
-                        &op_input_to_func_input, status);
-    if (!status->status.ok()) return nullptr;
-  } else {
-    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
-    // functions, so we need to find another way to handle constant inputs.
-    for (int i = const_input_types.size();
-         i < fdef->signature().input_arg_size(); ++i) {
-      VLOG(1) << "Adding Targs from input arg " << i;
-      const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i);
-      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
-    }
-  }
-  DCHECK(fdef != nullptr);
-
-  // Copy inputs and their devices.
-  // Since input param reordering may have occurred between `op` and `launch_op`
-  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
-  *launch_op->operation.MutableInputs() = op->operation.Inputs();
-  for (tensorflow::TensorHandle* h : launch_op->operation.Inputs()) {
-    h->Ref();
-  }
-  if (!op_input_to_func_input.empty()) {
-    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
-    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
-      VLOG(1) << "mapping op input " << i << " to func input "
-              << op_input_to_func_input[i];
-
-      (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] =
-          op->operation.Inputs()[i];
-    }
-  }
-  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
-
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
-                        const_input_types.size());
-
-  // Set Targs and Nresources attrs.
-  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
-                        arg_input_types.size());
-  const int num_resource_inputs = fdef->signature().input_arg_size() -
-                                  const_input_types.size() -
-                                  arg_input_types.size();
-  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
-
-  // Set Tresults attr.
-  std::vector<TF_DataType> tresults;
-  for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) {
-    tresults.push_back(static_cast<TF_DataType>(arg.type()));
-  }
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
-                        tresults.size());
-
-  // Set function attr.
-  tensorflow::AttrValue attr_value;
-  tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(fdef->signature().name());
-  launch_op->attrs.Set("function", attr_value);
-
-  return launch_op;
-}
-#endif  // TENSORFLOW_EAGER_USE_XLA
 
-}  // namespace
-
-extern "C" {
-
-void TFE_Execute(TFE_Op* tfe_op, TFE_TensorHandle** retvals, int* num_retvals,
+void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
-  tensorflow::EagerOperation* op = &tfe_op->operation;
-  tensorflow::EagerContext* ctx = op->EagerContext();
-  status->status = ctx->GetStatus();
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+      *num_retvals);
+  status->status =
+      tensorflow::EagerExecute(&op->operation, &handle_retvals, num_retvals);
   if (!status->status.ok()) {
     return;
   }
-#ifdef TENSORFLOW_EAGER_USE_XLA
-  std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->UseXla() && op->Name() != "_XlaLaunch") {
-    xla_launch_op = BuildXlaLaunch(op, status);
-    if (!status->status.ok()) {
-      return;
-    }
-    op = xla_launch_op.get();
-  }
-#endif  // TENSORFLOW_EAGER_USE_XLA
-  // Ensure all resource-touching ops run in the device the resource is,
-  // regardless of anything else that has been specified. This is identical to
-  // the graph mode behavior.
-  for (int i = 0; i < op->Inputs().size(); ++i) {
-    tensorflow::Device* input_op_device = nullptr;
-    status->status = op->Inputs()[i]->OpDevice(&input_op_device);
-    if (!status->status.ok()) return;
-    VLOG(2) << "for op " << op->Name() << " input " << i << " "
-            << tensorflow::DataTypeString(op->Inputs()[i]->dtype) << " "
-            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
-    if (op->Inputs()[i]->dtype == tensorflow::DT_RESOURCE &&
-        (input_op_device != op->Device() || input_op_device == nullptr)) {
-      tensorflow::Device* d =
-          input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
-      VLOG(1) << "Changing device of operation " << op->Name() << " to "
-              << d->name() << " because input #" << i
-              << " is a resource in this device.";
-      op->SetDevice(d);
-    }
-  }
-  tensorflow::Device* device = op->Device();
-
-  tensorflow::Fprint128 cache_key = op->MutableAttrs()->CacheKey(
-      device == nullptr ? "unspecified" : device->name());
-  tensorflow::KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
-  if (kernel == nullptr) {
-    const tensorflow::NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
-    if (device == nullptr) {
-      device = SelectDevice(ndef, ctx, status);
-      if (!status->status.ok()) {
-        return;
-      }
-    }
-    CHECK(device != nullptr);
-    if (ctx->LogDevicePlacement()) {
-      LOG(INFO) << "Executing op " << ndef.op() << " in device "
-                << device->name();
-    }
-    kernel = new tensorflow::KernelAndDevice(ctx->GetRendezvous());
-    // Knowledge of the implementation of Init (and in-turn
-    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
-    // will be accessed, so grab on to the lock.
-    // See WARNING comment in Execute (before kernel->Run) - would be nice to
-    // rework to avoid this subtlety.
-    tensorflow::tf_shared_lock l(*ctx->FunctionsMu());
-    status->status =
-        tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
-    if (!status->status.ok()) {
-      delete kernel;
-      return;
-    }
-    // Update output_dtypes inside `kernel`.
-    const tensorflow::OpDef* op_def = nullptr;
-    const tensorflow::FunctionDef* function_def =
-        ctx->FuncLibDef()->Find(ndef.op());
-    if (function_def != nullptr) {
-      op_def = &(function_def->signature());
-    }
-    if (op_def == nullptr) {
-      status->status = OpDefForOp(ndef.op().c_str(), &op_def);
-      if (!status->status.ok()) {
-        return;
-      }
-    }
-    tensorflow::DataTypeVector input_dtypes;
-    status->status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
-                                       kernel->mutable_output_dtypes());
-    if (!status->status.ok()) {
-      return;
-    }
-    ctx->AddKernelToCache(cache_key, kernel);
-  }
-  const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes();
-  const int output_dtypes_size = output_dtypes.size();
-  if (output_dtypes_size > *num_retvals) {
-    TF_SetStatus(status, TF_INVALID_ARGUMENT,
-                 tensorflow::strings::StrCat("Expecting ", output_dtypes.size(),
-                                             " outputs, but *num_retvals is ",
-                                             *num_retvals)
-                     .c_str());
-    return;
-  }
-  *num_retvals = output_dtypes_size;
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
-  status->status = ValidateInputTypeAndPlacement(
-      ctx, device, op, kernel->kernel(),
-      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
-  if (!status->status.ok()) return;
-  std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->ShouldStoreMetadata()) {
-    maybe_stats.reset(new tensorflow::NodeExecStats);
-    maybe_stats->set_node_name(op->Name());
-    maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
-    maybe_stats->set_op_start_rel_micros(0);
-    maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
-    // TODO(apassos) track referenced tensors
-  }
-  if (ctx->Async()) {
-    // Note that for async mode, execution order will make sure that all
-    // input handles are ready before executing them.
-    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
-    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
-        *num_retvals);
-    tensorflow::uint64 id = ctx->NextId();
-    for (int i = 0; i < *num_retvals; ++i) {
-      tensorflow::TensorHandle* h =
-          new tensorflow::TensorHandle(id, output_dtypes[i], ctx);
-      retvals[i] = new TFE_TensorHandle(h);
-      handle_retvals[i] = h;
-    }
-    tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
-        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
-        output_dtypes, handle_retvals);
-    ctx->ExecutorAdd(node);
-  } else {
-    // Execute checks if retvals[i] is nullptr or not to figure if it needs to
-    // allocate it.
-    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
-        *num_retvals);
-    status->status = tensorflow::EagerExecute(
-        ctx, op->Device(), op->Inputs(), kernel, maybe_stats.get(),
-        handle_retvals.data(), *num_retvals);
-    for (int i = 0; i < *num_retvals; ++i) {
-      retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
-    }
+  for (int i = 0; i < *num_retvals; ++i) {
+    retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
   }
 }
 
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 00ac4a4e47..13d6b021b5 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -154,26 +154,15 @@ tf_cc_test(
 cc_library(
     name = "execute",
     srcs = ["execute.cc"],
-    hdrs = ["execute.h"],
-    deps = [
-        ":context",
-        ":copy_to_device_node",
-        ":kernel_and_device",
-        ":tensor_handle",
-        "//tensorflow/core:core_cpu_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
+    hdrs = [
+        "execute.h",
+        "execute_node.h",
     ],
-)
-
-cc_library(
-    name = "execute_node",
-    hdrs = ["execute_node.h"],
     deps = [
         ":context",
+        ":copy_to_device_node",
         ":eager_executor",
-        ":execute",
+        ":eager_operation",
         ":kernel_and_device",
         ":tensor_handle",
         "//tensorflow/core:core_cpu_lib",
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 98e8471102..a514f81e14 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -18,8 +18,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
@@ -32,6 +34,493 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+// Initializes the step stats if needed.
+void MaybeInitializeStepStats(StepStats* step_stats, EagerContext* ctx) {
+  // Lazily initialize the RunMetadata with information about all devices if
+  // this is the first call.
+  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+    int device_idx = step_stats->dev_stats_size();
+    auto* dev_stats = step_stats->add_dev_stats();
+    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
+  }
+}
+
+int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
+                         Device* device) {
+  // Find the current device's index.
+  if (device == nullptr) {
+    device = ctx->HostCPU();
+  }
+  for (int i = 0; i < ctx->devices()->size(); ++i) {
+    if (ctx->devices()->at(i) == device ||
+        ctx->devices()->at(i)->name() == device->name()) {
+      return i;
+    }
+  }
+  // TODO(apassos) do not fall back to host CPU if device is unknown.
+  return 0;
+}
+
+Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
+                                     EagerOperation* op, const OpKernel* kernel,
+                                     RunMetadata* run_metadata) {
+  Device* host_device = ctx->HostCPU();
+  const MemoryTypeVector& memtypes = kernel->input_memory_types();
+  if (memtypes.size() != op->Inputs().size()) {
+    return errors::InvalidArgument("expected ", memtypes.size(),
+                                   " inputs, got ", op->Inputs().size());
+  }
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    const Device* expected_device =
+        memtypes[i] == HOST_MEMORY ? host_device : op_device;
+    TensorHandle* handle = op->Inputs()[i];
+    Device* handle_device = nullptr;
+    TF_RETURN_IF_ERROR(handle->Device(&handle_device));
+    const Device* actual_device =
+        handle_device == nullptr ? host_device : handle_device;
+    if (expected_device != actual_device) {
+      switch (ctx->GetDevicePlacementPolicy()) {
+        case DEVICE_PLACEMENT_SILENT_FOR_INT32:
+          // TODO(xpan): See if we could bubble python related error up
+          // to python level.
+          if (handle->dtype == DT_INT32) {
+            // Note: enabling silent copies of int32 tensors to match behavior
+            // of graph mode.
+            break;
+          }
+          TF_FALLTHROUGH_INTENDED;
+        case DEVICE_PLACEMENT_EXPLICIT:
+          return errors::InvalidArgument(
+              "Tensors on conflicting devices:"
+              " cannot compute ",
+              op->Name(), " as input #", i, " was expected to be on ",
+              expected_device->name(), " but is actually on ",
+              actual_device->name(), " (operation running on ",
+              op_device->name(), ")",
+              " Tensors can be copied explicitly using .gpu() or .cpu() "
+              "methods,"
+              " or transparently copied by using tf.enable_eager_execution("
+              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+              "between devices"
+              " may slow down your model");
+        case DEVICE_PLACEMENT_WARN:
+          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
+                       << " was expected to be on " << expected_device->name()
+                       << " but is actually on " << actual_device->name()
+                       << " (operation running on " << op_device->name()
+                       << "). This triggers a copy which can be a performance "
+                          "bottleneck.";
+          break;
+        case DEVICE_PLACEMENT_SILENT:  // Do nothing.
+          break;
+      }
+      // We are only here if the policy is warn or silent copies, so we should
+      // trigger a copy.
+      auto pre_time = Env::Default()->NowMicros();
+      TensorHandle* copied_tensor = nullptr;
+      Status status = EagerCopyToDevice(
+          handle, ctx, expected_device->name().c_str(), &copied_tensor);
+      if (run_metadata != nullptr) {
+        auto* step_stats = run_metadata->mutable_step_stats();
+        MaybeInitializeStepStats(step_stats, ctx);
+        // Record the sending on the source device for now.
+        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        auto* node_stats = dev_stats->add_node_stats();
+        node_stats->set_node_name("_Send");
+        node_stats->set_all_start_micros(pre_time);
+        node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                          pre_time);
+      }
+      if (!status.ok()) {
+        if (copied_tensor != nullptr) copied_tensor->Unref();
+        return errors::Internal("Failed copying input tensor from ",
+                                actual_device->name(), " to ",
+                                expected_device->name(), " in order to run ",
+                                op->Name(), ": ", status.error_message());
+      }
+      handle->Unref();
+      handle = copied_tensor;
+      (*op->MutableInputs())[i] = copied_tensor;
+    }
+    if (handle->dtype != kernel->input_type(i)) {
+      return errors::InvalidArgument(
+          "cannot compute ", op->Name(), " as input #", i,
+          " was expected to be a ", DataTypeString(kernel->input_type(i)),
+          " tensor but is a ", DataTypeString(handle->dtype), " tensor");
+    }
+  }
+  return Status::OK();
+}
+
+Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
+  DeviceSet ds;
+  for (Device* d : *ctx->devices()) {
+    ds.AddDevice(d);
+  }
+  DeviceTypeVector final_devices;
+  auto status = SupportedDeviceTypesForNode(ds.PrioritizedDeviceTypeList(),
+                                            ndef, &final_devices);
+  if (!status.ok()) return status;
+  if (final_devices.empty()) {
+    return errors::Internal("Could not find valid device for node ",
+                            ndef.DebugString());
+  }
+  for (Device* d : *ctx->devices()) {
+    if (d->device_type() == final_devices[0].type_string()) {
+      *device = d;
+      return Status::OK();
+    }
+  }
+  return errors::Unknown("Could not find a device for node ",
+                         ndef.DebugString());
+}
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+// Synthesizes and returns a wrapper function over `op`, which must be a
+// primitive op (e.g. matmul).
+//
+// The wrapper function conforms to the function signature expected by
+// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// resources>. For example, if the op has input params <Const1, Arg2, Const3,
+// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
+// Resource4> as the input params to the synthesized function.
+//
+// It populates `const_input_types`, `arg_input_types` and
+// `op_input_to_func_input` based on the reordering results, that the caller can
+// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// `status` accordingly.
+const FunctionDef* OpToFunction(TFE_Op* op,
+                                std::vector<TF_DataType>* const_input_types,
+                                std::vector<TF_DataType>* arg_input_types,
+                                gtl::FlatMap<int, int>* op_input_to_func_input,
+                                TF_Status* status) {
+  DCHECK(!op->operation.is_function());
+
+  FunctionDef fdef;
+
+  // Get the OpDef of the op we are trying to encapsulate.
+  TFE_Context* ctx = op->operation.ctx;
+  const OpRegistrationData* op_data;
+  {
+    status = ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
+    if (!status.ok()) {
+      return nullptr;
+    }
+  }
+  const OpDef& op_def = op_data->op_def;
+
+  OpDef* signature = fdef.mutable_signature();
+
+  // Handle constant inputs.
+  const std::unordered_set<string> const_inputs(
+      *XlaOpRegistry::CompileTimeConstantInputs(op->operation.Name()));
+
+  // First add place holders for the input args, so that we can refer to them by
+  // position in the next loop. Also tally up the resource inputs.
+  int num_resource_inputs = 0;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).type() == DT_RESOURCE) {
+      ++num_resource_inputs;
+    }
+    signature->add_input_arg();
+  }
+
+  // Now we map the input params from `op_def` to `signature`, where the param
+  // ordering for `signature` is: <constants, args, resources>.
+  int const_index = 0;
+  int arg_index = const_inputs.size();
+  int resource_index = op_def.input_arg_size() - num_resource_inputs;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    const OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
+    OpDef::ArgDef* func_input_arg = nullptr;
+    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
+      VLOG(1) << "For const input, mapping op input " << i << " to func input "
+              << const_index;
+      (*op_input_to_func_input)[i] = const_index;
+      func_input_arg = signature->mutable_input_arg(const_index++);
+      const_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    } else if (op_input_arg.type() == DT_RESOURCE) {
+      VLOG(1) << "For resource input, mapping op input " << i
+              << " to func input " << resource_index;
+      (*op_input_to_func_input)[i] = resource_index;
+      func_input_arg = signature->mutable_input_arg(resource_index++);
+    } else {
+      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
+              << arg_index;
+      (*op_input_to_func_input)[i] = arg_index;
+      func_input_arg = signature->mutable_input_arg(arg_index++);
+      arg_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    }
+
+    func_input_arg->set_name(op_input_arg.name());
+    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
+  }
+  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
+
+  // Resources args are at the end of the function input params, and we should
+  // have iterated over all of them.
+  DCHECK_EQ(signature->input_arg_size(), resource_index);
+
+  // Make the synthesized function's name unique.
+  signature->set_name(
+      strings::StrCat(op_def.name(), func_id_generator.fetch_add(1)));
+
+  // Add the node def and set its input names to match op_def's names.
+  const NodeDef& ndef = op->operation.MutableAttrs()->BuildNodeDef();
+  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
+  *fdef.add_node_def() = ndef;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
+  }
+  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
+
+  // Fix the output names and set output types.
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    OpDef::ArgDef* arg = signature->add_output_arg();
+    const OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
+    const string& out_tensor_name =
+        strings::StrCat(ndef.name(), ":", op_def_arg.name(), ":", 0);
+    arg->set_name(op_def_arg.name());
+    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
+    const string& type_attr = op_def_arg.type_attr();
+    if (!type_attr.empty()) {
+      auto i = ndef.attr().find(type_attr);
+      if (i == ndef.attr().end()) {
+        status = errors::InvalidArgument(
+            strings::StrCat("Could not find attr ", type_attr, " in NodeDef ",
+                            ndef.DebugString()));
+        return nullptr;
+      }
+      arg->set_type(i->second.type());
+    }
+  }
+  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
+
+  status = ctx->context.AddFunctionDef(fdef);
+  if (!status.ok()) return nullptr;
+  const auto ret = ctx->context.FindFunctionDef(signature->name());
+  DCHECK(ret != nullptr);
+  return ret;
+}
+
+// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// via XLA.
+std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  auto launch_op = std::unique_ptr<TFE_Op>(
+      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  if (op->operation.device) {
+    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
+                    status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+
+  const FunctionDef* fdef;
+  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
+  std::vector<TF_DataType> const_input_types;
+  std::vector<TF_DataType> arg_input_types;
+  gtl::FlatMap<int, int> op_input_to_func_input;
+  if (fdef == nullptr) {
+    // See if this is a primitive op, and if so create a function for it, so
+    // that _XlaLaunchOp can access it.
+    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
+                        &op_input_to_func_input, status);
+    if (!status.ok()) return nullptr;
+  } else {
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
+    // functions, so we need to find another way to handle constant inputs.
+    for (int i = const_input_types.size();
+         i < fdef->signature().input_arg_size(); ++i) {
+      VLOG(1) << "Adding Targs from input arg " << i;
+      const OpDef::ArgDef& arg = fdef->signature().input_arg(i);
+      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
+    }
+  }
+  DCHECK(fdef != nullptr);
+
+  // Copy inputs and their devices.
+  // Since input param reordering may have occurred between `op` and `launch_op`
+  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  *launch_op->operation.MutableInputs() = op->operation.Inputs();
+  for (TensorHandle* h : launch_op->operation.Inputs()) {
+    h->Ref();
+  }
+  if (!op_input_to_func_input.empty()) {
+    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
+    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
+      VLOG(1) << "mapping op input " << i << " to func input "
+              << op_input_to_func_input[i];
+
+      (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] =
+          op->operation.Inputs()[i];
+    }
+  }
+  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
+
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
+                        const_input_types.size());
+
+  // Set Targs and Nresources attrs.
+  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
+                        arg_input_types.size());
+  const int num_resource_inputs = fdef->signature().input_arg_size() -
+                                  const_input_types.size() -
+                                  arg_input_types.size();
+  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
+
+  // Set Tresults attr.
+  std::vector<TF_DataType> tresults;
+  for (const OpDef::ArgDef& arg : fdef->signature().output_arg()) {
+    tresults.push_back(static_cast<TF_DataType>(arg.type()));
+  }
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
+                        tresults.size());
+
+  // Set function attr.
+  AttrValue attr_value;
+  NameAttrList* func = attr_value.mutable_func();
+  func->set_name(fdef->signature().name());
+  launch_op->attrs.Set("function", attr_value);
+
+  return launch_op;
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  EagerContext* ctx = op->EagerContext();
+  auto status = ctx->GetStatus();
+  if (!status.ok()) return status;
+#ifdef TENSORFLOW_EAGER_USE_XLA
+  std::unique_ptr<TFE_Op> xla_launch_op;
+  if (op->UseXla() && op->Name() != "_XlaLaunch") {
+    xla_launch_op = BuildXlaLaunch(op, status);
+    if (!status.ok()) return status;
+    op = xla_launch_op.get();
+  }
+#endif  // TENSORFLOW_EAGER_USE_XLA
+  // Ensure all resource-touching ops run in the device the resource is,
+  // regardless of anything else that has been specified. This is identical to
+  // the graph mode behavior.
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    Device* input_op_device = nullptr;
+    status = op->Inputs()[i]->OpDevice(&input_op_device);
+    if (!status.ok()) return status;
+    VLOG(2) << "for op " << op->Name() << " input " << i << " "
+            << DataTypeString(op->Inputs()[i]->dtype) << " "
+            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
+    if (op->Inputs()[i]->dtype == DT_RESOURCE &&
+        (input_op_device != op->Device() || input_op_device == nullptr)) {
+      Device* d = input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
+      VLOG(1) << "Changing device of operation " << op->Name() << " to "
+              << d->name() << " because input #" << i
+              << " is a resource in this device.";
+      op->SetDevice(d);
+    }
+  }
+  Device* device = op->Device();
+
+  Fprint128 cache_key = op->MutableAttrs()->CacheKey(
+      device == nullptr ? "unspecified" : device->name());
+  KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
+  if (kernel == nullptr) {
+    const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
+    if (device == nullptr) {
+      status = SelectDevice(ndef, ctx, &device);
+      if (!status.ok()) return status;
+    }
+    CHECK(device != nullptr);
+    if (ctx->LogDevicePlacement()) {
+      LOG(INFO) << "Executing op " << ndef.op() << " in device "
+                << device->name();
+    }
+    kernel = new KernelAndDevice(ctx->GetRendezvous());
+    // Knowledge of the implementation of Init (and in-turn
+    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
+    // will be accessed, so grab on to the lock.
+    // See WARNING comment in Execute (before kernel->Run) - would be nice to
+    // rework to avoid this subtlety.
+    tf_shared_lock l(*ctx->FunctionsMu());
+    status = KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
+    if (!status.ok()) {
+      delete kernel;
+      return status;
+    }
+    // Update output_dtypes inside `kernel`.
+    const OpDef* op_def = nullptr;
+    const FunctionDef* function_def = ctx->FuncLibDef()->Find(ndef.op());
+    if (function_def != nullptr) {
+      op_def = &(function_def->signature());
+    }
+    if (op_def == nullptr) {
+      status = OpDefForOp(ndef.op().c_str(), &op_def);
+      if (!status.ok()) return status;
+    }
+    DataTypeVector input_dtypes;
+    status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
+                               kernel->mutable_output_dtypes());
+    if (!status.ok()) return status;
+    ctx->AddKernelToCache(cache_key, kernel);
+  }
+  const DataTypeVector& output_dtypes = kernel->output_dtypes();
+  const int output_dtypes_size = static_cast<int>(output_dtypes.size());
+  if (output_dtypes_size > *num_retvals) {
+    return errors::InvalidArgument("Expecting ", output_dtypes.size(),
+                                   " outputs, but *num_retvals is ",
+                                   *num_retvals);
+  }
+  *num_retvals = output_dtypes_size;
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+  status = ValidateInputTypeAndPlacement(
+      ctx, device, op, kernel->kernel(),
+      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+  if (!status.ok()) return status;
+  std::unique_ptr<NodeExecStats> maybe_stats;
+  if (ctx->ShouldStoreMetadata()) {
+    maybe_stats.reset(new NodeExecStats);
+    maybe_stats->set_node_name(op->Name());
+    maybe_stats->set_all_start_micros(Env::Default()->NowMicros());
+    maybe_stats->set_op_start_rel_micros(0);
+    maybe_stats->set_scheduled_micros(Env::Default()->NowMicros());
+    // TODO(apassos) track referenced tensors
+  }
+  retvals->resize(*num_retvals);
+  if (ctx->Async()) {
+    // Note that for async mode, execution order will make sure that all
+    // input handles are ready before executing them.
+    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
+    tensorflow::uint64 id = ctx->NextId();
+    for (int i = 0; i < *num_retvals; ++i) {
+      (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx);
+    }
+    EagerNode* node =
+        new ExecuteNode(id, ctx, op->Device(), op->Inputs(), kernel,
+                        maybe_stats.release(), output_dtypes, *retvals);
+    ctx->ExecutorAdd(node);
+  } else {
+    // Execute checks if retvals[i] is nullptr or not to figure if it needs to
+    // allocate it.
+    status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
+                          maybe_stats.get(), retvals->data(), *num_retvals);
+  }
+
+  return status;
+}
+
 Status EagerExecute(EagerContext* ctx, Device* device,
                     const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                     KernelAndDevice* kernel, NodeExecStats* maybe_stats,
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 0f6ad031e1..7c8d7e164d 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
@@ -25,6 +26,12 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Utility function that executes a fully constructed EagerOperation.
+Status EagerExecute(
+    EagerOperation* op,
+    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
+    int* num_retvals);
+
 // Low-level utility to execute the kernel specified by kernel on device device,
 // with the inputs op_inputs, in the context ctx.
 Status EagerExecute(EagerContext* ctx, Device* device,
-- 
GitLab


From 60a0e2f5261cf72da4e4d8e65b56b695d611b984 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 15:19:59 -0700
Subject: [PATCH 1231/1262] Do not force default layout when there is no need
 to. Allow the inner computations to negotiate a root and parameter layouts
 different from default.

PiperOrigin-RevId: 193731341
---
 tensorflow/compiler/xla/service/BUILD         |   3 +
 .../xla/service/computation_layout.cc         |   7 +-
 .../compiler/xla/service/computation_layout.h |   5 +-
 .../compiler/xla/service/hlo_instruction.h    |   8 +
 .../compiler/xla/service/layout_assignment.cc | 328 +++++++++++++-----
 .../compiler/xla/service/layout_assignment.h  |  65 +++-
 tensorflow/compiler/xla/service/service.cc    |   5 +-
 .../compiler/xla/service/tuple_simplifier.cc  |  25 +-
 8 files changed, 325 insertions(+), 121 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9555d91817..bc577c173d 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1953,10 +1953,12 @@ cc_library(
     deps = [
         ":computation_layout",
         ":hlo",
+        ":hlo_dce",
         ":hlo_graph_dumper",
         ":hlo_pass",
         ":logical_buffer",
         ":tuple_points_to_analysis",
+        ":tuple_simplifier",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -2433,6 +2435,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index d2d4f14fce..cb61f3da39 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -23,12 +23,15 @@ limitations under the License.
 
 namespace xla {
 
-ComputationLayout::ComputationLayout(const ProgramShape& program_shape)
+ComputationLayout::ComputationLayout(const ProgramShape& program_shape,
+                                     bool ignore_layouts)
     : result_layout_(program_shape.result()) {
   for (auto& shape : program_shape.parameters()) {
     parameter_layouts_.emplace_back(shape);
   }
-  SetToDefaultLayout();
+  if (ignore_layouts) {
+    SetToDefaultLayout();
+  }
 }
 
 void ComputationLayout::SetToDefaultLayout() {
diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 80e102411c..53c3a3f7b7 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -34,8 +34,9 @@ class ComputationLayout {
  public:
   // Constructs a ComputationLayout from a ProgramShape. The layouts of the
   // parameters and results are set to the default layout. Layouts in the
-  // ProgramShape are ignored.
-  explicit ComputationLayout(const ProgramShape& program_shape);
+  // ProgramShape are ignored if ignore_layouts is true.
+  explicit ComputationLayout(const ProgramShape& program_shape,
+                             bool ignore_layouts = true);
 
   // Returns the layout of a particular parameter.
   const ShapeLayout& parameter_layout(int64 param_no) const {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a5e9aecb9e..f3da3fc256 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -956,6 +956,14 @@ class HloInstruction {
   void clear_sharding() { sharding_ = nullptr; }
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
+  // Checks whether the instruction has compatible sharding with the other
+  // instruction.
+  bool has_compatible_sharding(const HloInstruction* other) const {
+    if (!has_sharding()) {
+      return !other->has_sharding();
+    }
+    return other->has_sharding() ? sharding() == other->sharding() : false;
+  }
 
   // When creating a new instruction which either replaces, or shifts up (kCopy
   // insertion case), another instruction, we need to make sure the certain
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 2494569db5..7067b6f86a 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -31,10 +31,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -400,9 +402,9 @@ string LayoutConstraints::ToString() const {
 }
 
 Status LayoutAssignment::AddMandatoryConstraints(
-    const ComputationLayout& computation_layout,
-    const ChannelLayoutConstraints* channel_constraints,
-    HloComputation* computation, LayoutConstraints* constraints) {
+    const ComputationLayout* computation_layout,
+    ChannelLayoutConstraints* channel_constraints, HloComputation* computation,
+    LayoutConstraints* constraints) {
   VLOG(3) << "Adding mandatory layout constraints to computation "
           << computation->name();
 
@@ -424,11 +426,16 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           instruction->outfeed_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
-      // Parameter layouts must match the respective layout in
-      // ComputationLayout.
-      shape_with_layout =
-          &computation_layout.parameter_layout(instruction->parameter_number())
-               .shape();
+      if (computation_layout != nullptr) {
+        const ShapeLayout& parameter_layout =
+            computation_layout->parameter_layout(
+                instruction->parameter_number());
+        if (parameter_layout.LayoutIsSet()) {
+          // Parameter layouts must match the respective layout in
+          // ComputationLayout, if there is one.
+          shape_with_layout = &parameter_layout.shape();
+        }
+      }
     }
     if (shape_with_layout != nullptr) {
       TF_RETURN_IF_ERROR(
@@ -493,9 +500,8 @@ Status LayoutAssignment::AddMandatoryConstraints(
       HloComputation* body = instruction->while_body();
       HloComputation* condition = instruction->while_condition();
       const HloInstruction* init = instruction->operand(0);
-      const ComputationLayout& body_layout =
-          FindOrDie(computation_layouts_, body);
-      const ComputationLayout& condition_layout =
+      ComputationLayout& body_layout = FindOrDie(computation_layouts_, body);
+      ComputationLayout& condition_layout =
           FindOrDie(computation_layouts_, condition);
 
       // Check a few invariants irrespective of layout.
@@ -508,26 +514,19 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    condition_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(body_layout.result_shape(), init->shape()));
 
-      // Return error if earlier layout assignment of the embedded computations
-      // has produced conflicting layouts.
-      if (!ShapeUtil::Equal(body_layout.result_shape(),
-                            body_layout.parameter_shape(0))) {
-        return InternalError(
-            "Parameter and result of body computation %s of while instruction "
-            "%s have different layouts: %s vs %s",
-            body->name().c_str(), instruction->name().c_str(),
-            ShapeUtil::HumanString(body_layout.result_shape()).c_str(),
-            ShapeUtil::HumanString(body_layout.parameter_shape(0)).c_str());
+      if (body_layout.result_layout() != body_layout.parameter_layout(0)) {
+        VLOG(2) << "Reset %while body parameter layout: body=" << body->name()
+                << " while=" << instruction->name()
+                << " shape=" << body_layout.result_layout().ToString();
+        *body_layout.mutable_parameter_layout(0) = body_layout.result_layout();
       }
-      if (!ShapeUtil::Equal(body->root_instruction()->shape(),
-                            condition->parameter_instruction(0)->shape())) {
-        return InternalError(
-            "Parameter of condition computation %s of while instruction "
-            "%s does not match body computation %s result: %s vs %s",
-            condition->name().c_str(), instruction->name().c_str(),
-            body->name().c_str(),
-            ShapeUtil::HumanString(condition_layout.parameter_shape(0)).c_str(),
-            ShapeUtil::HumanString(body_layout.result_shape()).c_str());
+      if (condition_layout.parameter_layout(0) !=
+          body_layout.parameter_layout(0)) {
+        VLOG(2) << "Reset %while condition parameter layout: cond="
+                << condition->name() << " while=" << instruction->name()
+                << " shape=" << body_layout.parameter_layout(0).ToString();
+        *condition_layout.mutable_parameter_layout(0) =
+            body_layout.parameter_layout(0);
       }
 
       // Constrain the output and the operand of the while instruction to match
@@ -557,7 +556,20 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                    true_computation_layout.parameter_shape(0)));
       DCHECK(ShapeUtil::Compatible(
           false_operand->shape(), false_computation_layout.parameter_shape(0)));
-
+      if (true_computation_layout.result_layout() !=
+          false_computation_layout.result_layout()) {
+        // We assign layouts in DFS fashion, so the true and false computations
+        // might have negotiated a different layout. But for the conditional
+        // instruction POV the layout must match, so we run again on the false
+        // computation, this time with proper computation layout.
+        VLOG(2) << "Reset %conditional false computation result layout: "
+                   "false_computation="
+                << false_computation->name()
+                << " conditional=" << instruction->name() << " shape="
+                << true_computation_layout.result_layout().ToString();
+        *false_computation_layout.mutable_result_layout() =
+            true_computation_layout.result_layout();
+      }
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           true_computation_layout.result_shape(), instruction));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
@@ -593,10 +605,14 @@ Status LayoutAssignment::AddMandatoryConstraints(
       }
     }
   }
-
-  // Finally set the result layout to match ComputationLayout.
-  return constraints->SetResultLayout(
-      computation_layout.result_layout().shape());
+  // Finally set the result layout to match ComputationLayout, if there is one.
+  if (computation_layout != nullptr) {
+    const ShapeLayout& result_layout = computation_layout->result_layout();
+    if (result_layout.LayoutIsSet()) {
+      TF_RETURN_IF_ERROR(constraints->SetResultLayout(result_layout.shape()));
+    }
+  }
+  return Status::OK();
 }
 
 namespace {
@@ -760,6 +776,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     HloInstruction* copy =
         instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
             instruction->shape(), HloOpcode::kCopy, instruction));
+    RegisterAddedCopy(copy);
     SetupCopiedInstruction(*instruction, copy, {});
     LayoutUtil::ClearLayout(copy->mutable_shape());
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
@@ -783,13 +800,19 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
   TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
 
   if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
+    VLOG(5) << "Operand " << operand->ToString() << " layout matches in "
+            << instruction->ToString();
     // Operand layout already matches our constraint. Nothing to do.
     return Status::OK();
   }
+  VLOG(4) << "Operand " << operand->ToString() << " layout does not match "
+          << operand_layout.ToString() << " in " << instruction->ToString();
 
   TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
                       CreateCopyWithNewLayout(operand_layout.shape(), operand));
 
+  VLOG(4) << "New copy of " << operand->ToString() << " is "
+          << operand_copy->ToString();
   return instruction->ReplaceOperandWith(operand_no, operand_copy);
 }
 
@@ -896,15 +919,16 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
       }
     }
   }
-
-  // Finally verify the result layout matches the layout of the entry
+  // Finally verify the result layout, if set, matches the layout of the entry
   // computation root.
-  TF_RET_CHECK(ShapeUtil::Equal(
-      module->entry_computation()->root_instruction()->shape(),
+  const ShapeLayout& result_layout =
       FindOrDie(computation_layouts_, module->entry_computation())
-          .result_layout()
-          .shape()));
-
+          .result_layout();
+  if (result_layout.LayoutIsSet()) {
+    TF_RET_CHECK(ShapeUtil::Equal(
+        module->entry_computation()->root_instruction()->shape(),
+        result_layout.shape()));
+  }
   return Status::OK();
 }
 
@@ -913,18 +937,13 @@ LayoutAssignment::LayoutAssignment(
     ChannelLayoutConstraints* channel_constraints)
     : entry_computation_layout_(entry_computation_layout),
       channel_layout_constraints_(channel_constraints) {
-  VLOG(1) << "entry computation layout given to layout assignment: "
+  VLOG(1) << "Entry computation layout given to layout assignment: "
           << entry_computation_layout_->ToString();
   // Layouts of all parameter instructions must be set.
   for (const ShapeLayout& parameter_layout :
        entry_computation_layout_->parameter_layouts()) {
     CHECK(parameter_layout.LayoutIsSet());
   }
-  // If the result layout is not set, then choose the default.
-  // TODO(b/29118294): Choose a better layout in this case.
-  if (!entry_computation_layout_->result_layout().LayoutIsSet()) {
-    entry_computation_layout_->mutable_result_layout()->SetToDefaultLayout();
-  }
 }
 
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
@@ -1484,16 +1503,60 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
   return Status::OK();
 }
 
+Status LayoutAssignment::CalculateComputationLayout(
+    HloComputation* computation) {
+  ComputationLayout computation_layout(computation->ComputeProgramShape(),
+                                       /*ignore_layouts=*/false);
+  InsertOrDie(&computation_layouts_, computation, computation_layout);
+  VLOG(2) << "  Calculated ComputationLayout = "
+          << computation_layout.ToString();
+  return Status::OK();
+}
+
+Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
+  // Clear existing layouts of the instructions.  All layouts must be assigned
+  // by the LayoutAssignment pass, except for those on infeeds, parameters,
+  // and the computation result. The latter two are specified in
+  // computation_layout, so we only need to keep the existing layouts for
+  // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+  // layout assignment pass that may accidently use the existing layout.
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kBitcast) {
+      // bitcasts are inherently layout sensitive and so a bitcast instruction
+      // present in the IR before layout assignment is a bug.
+      return InternalError(
+          "Unexpected bitcast operation seen during layout assignment: %s.",
+          instruction->ToString().c_str());
+    }
+    if (instruction->opcode() != HloOpcode::kInfeed) {
+      LayoutUtil::ClearLayout(instruction->mutable_shape());
+    }
+  }
+  return Status::OK();
+}
+
 Status LayoutAssignment::RunOnComputation(
-    const ComputationLayout& computation_layout,
+    ComputationLayout* computation_layout,
     const TuplePointsToAnalysis& points_to_analysis,
     HloComputation* computation,
     ChannelLayoutConstraints* channel_constraints) {
-  DCHECK(computation_layout.LayoutIsSet());
-  InsertOrDie(&computation_layouts_, computation, computation_layout);
   VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name()
           << ")";
-  VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
+  TF_RETURN_IF_ERROR(ClearComputationLayouts(computation));
+  if (computation_layout != nullptr) {
+    auto it = computation_layouts_.find(computation);
+    if (it == computation_layouts_.end()) {
+      VLOG(2) << "  New ComputationLayout = " << computation_layout->ToString();
+      computation_layouts_.emplace(computation, *computation_layout);
+    } else {
+      TF_RET_CHECK(computation_layout == &it->second ||
+                   computation_layout == entry_computation_layout_);
+      VLOG(2) << "  Existing ComputationLayout = "
+              << computation_layout->ToString();
+    }
+  } else {
+    VLOG(2) << "  No ComputationLayout specified (will be calculated)";
+  }
 
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
@@ -1536,12 +1599,19 @@ Status LayoutAssignment::RunOnComputation(
     CHECK_LT(constraints.unconstrained_buffer_ids().size(),
              unconstrained_count);
   }
-
   // All logical buffers should have constraints at this point. All that
   // remains is assign the constraints to the buffers and infer layouts for
   // aliased buffers.
   TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation));
 
+  // If the computation layout wasn't specified, now it is the time to compute
+  // it according to the parameters and root instruction layouts.
+  // This allows the first pass through this API to record the best flowing
+  // layout to parameters and root instruction.
+  if (computation_layout == nullptr) {
+    TF_RETURN_IF_ERROR(CalculateComputationLayout(computation));
+  }
+
   // Record the layouts assigned for any communication ops in
   // channel_constraints so that they are constrained for future modules.
   for (HloInstruction* instruction : computation->instructions()) {
@@ -1556,6 +1626,34 @@ Status LayoutAssignment::RunOnComputation(
   return Status::OK();
 }
 
+Status LayoutAssignment::PropagateComputationLayouts(
+    HloComputation* computation, ComputationLayout* computation_layout) {
+  ComputationLayout computed_computation_layout(
+      computation->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
+  for (int64 i = 0; i < computed_computation_layout.parameter_count(); ++i) {
+    ShapeLayout* param_layout = computation_layout->mutable_parameter_layout(i);
+    if (!param_layout->LayoutIsSet()) {
+      VLOG(4) << "Assigning layout to parameter " << i << " of computation "
+              << computation->name() << ": "
+              << computed_computation_layout.parameter_layout(i).ToString();
+      *param_layout = computed_computation_layout.parameter_layout(i);
+    } else {
+      TF_RET_CHECK(computed_computation_layout.parameter_layout(i) ==
+                   *param_layout);
+    }
+  }
+  ShapeLayout* result_layout = computation_layout->mutable_result_layout();
+  if (!result_layout->LayoutIsSet()) {
+    VLOG(4) << "Assigning result layout of computation " << computation->name()
+            << ": " << computed_computation_layout.result_layout().ToString();
+    *result_layout = computed_computation_layout.result_layout();
+  } else {
+    TF_RET_CHECK(computed_computation_layout.result_layout() == *result_layout);
+  }
+  return Status::OK();
+}
+
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   VLOG(2) << "Running layout assignment on module " << module->name();
   XLA_VLOG_LINES(3, module->ToString());
@@ -1564,52 +1662,45 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "before layout assignment",
                                 module->config().debug_options());
   }
-
-  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
-                      TuplePointsToAnalysis::Run(module));
-
-  // Assign layouts to computations in an order such that a callee computation
-  // is handled before its caller computation. This ensures that the layout of
-  // all callers of a computation will agree.
-  std::list<HloComputation*> computation_post_order =
-      module->MakeComputationPostOrder();
-  for (auto* computation : module->MakeComputationPostOrder()) {
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
-    // Clear existing layouts of the instructions.  All layouts must be assigned
-    // by the LayoutAssignment pass, except for those on infeeds, parameters,
-    // and the computation result. The latter two are specified in
-    // computation_layout, so we only need to keep the existing layouts for
-    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-    // layout assignment pass that may accidently use the existing layout.
-    for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kBitcast) {
-        // bitcasts are inherently layout sensitive and so a bitcast instruction
-        // present in the IR before layout assignment is a bug.
-        return InternalError(
-            "Unexpected bitcast operation seen during layout assignment: %s.",
-            instruction->ToString().c_str());
+  TF_RETURN_IF_ERROR(Init());
+
+  // We do two passes. The first one we pass a nullptr ComputationLayout to
+  // the RunOnComputation() calls (for non entry computations), and we register
+  // the ComputationLayout which are naturally flowing in DFS fashion to the
+  // parameters and root instruction.
+  // Walking in DFS mode though, means that we can end up with incorrect layouts
+  // when seen from an outer instruction, which has across-computation
+  // constraints to impose.
+  // For example, the kWhile instruction needs to enforce the same layouts for
+  // the parameters and root of the bosy, as well as the condition parameters.
+  // Similarly, the kConditional instruction needs to enforce the same layouts
+  // for the root of the true and false computations.
+  // So in the first pass, while allowing the layouts to flow to parameters and
+  // root, we also fix up the eventually inconsistent ComputationLayout, which
+  // will be then made mandatory by the second pass.
+  for (int64 i = 0; i < 2; ++i) {
+    TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module));
+    TF_ASSIGN_OR_RETURN(auto points_to_analysis,
+                        TuplePointsToAnalysis::Run(module));
+    for (auto* computation : module->MakeComputationPostOrder()) {
+      if (computation->IsFusionComputation()) {
+        continue;
       }
-      if (instruction->opcode() != HloOpcode::kInfeed) {
-        LayoutUtil::ClearLayout(instruction->mutable_shape());
+      if (computation == module->entry_computation()) {
+        TF_RETURN_IF_ERROR(RunOnComputation(
+            entry_computation_layout_, *points_to_analysis,
+            module->entry_computation(), channel_layout_constraints_));
+      } else {
+        ComputationLayout* computation_layout =
+            (i == 0) ? nullptr : &FindOrDie(computation_layouts_, computation);
+        TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
+                                            *points_to_analysis, computation,
+                                            channel_layout_constraints_));
       }
     }
-    if (computation == module->entry_computation()) {
-      TF_RETURN_IF_ERROR(RunOnComputation(
-          *entry_computation_layout_, *points_to_analysis,
-          module->entry_computation(), channel_layout_constraints_));
-    } else {
-      ComputationLayout computation_layout(computation->ComputeProgramShape());
-      // Setting all embedded computations to the default layout is potentially
-      // suboptimal.
-      computation_layout.SetToDefaultLayout();
-      TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
-                                          *points_to_analysis, computation,
-                                          channel_layout_constraints_));
-    }
   }
-
+  TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(),
+                                                 entry_computation_layout_));
   TF_RETURN_IF_ERROR(CheckLayouts(module));
 
   VLOG(3) << "After layout assignment:";
@@ -1619,9 +1710,54 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                 "after layout assignment",
                                 module->config().debug_options());
   }
-
   // All layouts are reset then reassigned by this pass.
   return true;
 }
 
+Status LayoutAssignment::Init() {
+  computation_layouts_.clear();
+  return Status::OK();
+}
+
+Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
+  // Clear all the copies which have been added, and all the related
+  // instructions (like GTE and tuples).
+  int64 removed_copies = 0;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      if (instruction->opcode() == HloOpcode::kCopy &&
+          added_copies_.count(instruction) > 0) {
+        VLOG(5) << "Removing added copy: " << instruction->ToString();
+        TF_RETURN_IF_ERROR(
+            instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
+        TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction));
+        ++removed_copies;
+      }
+    }
+  }
+  added_copies_.clear();
+  if (removed_copies > 0) {
+    TupleSimplifier tuple_simplifier;
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+  return Status::OK();
+}
+
+Status LayoutAssignment::AddCopyForOperand(HloInstruction* instruction,
+                                           int64 operand_number) {
+  HloInstruction* operand = instruction->mutable_operand(operand_number);
+  if (operand->opcode() != HloOpcode::kCopy || operand->user_count() > 1) {
+    HloInstruction* copy =
+        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
+            operand->shape(), HloOpcode::kCopy, operand));
+    SetupCopiedInstruction(*operand, copy, {});
+    LayoutUtil::ClearLayout(copy->mutable_shape());
+    TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(operand_number, copy));
+  }
+  return Status::OK();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index ae4986d6ad..8b4e07995a 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -362,12 +363,15 @@ class LayoutAssignment : public HloPassInterface {
       int64 operand_no);
 
  private:
+  // Initializes the layout assignment object for a new Run() call.
+  Status Init();
+
   // Adds constraints which must be satisfied for correctness on all
   // backends. Called once prior to propagating constraints.
-  Status AddMandatoryConstraints(
-      const ComputationLayout& computation_layout,
-      const ChannelLayoutConstraints* channel_constraints,
-      HloComputation* computation, LayoutConstraints* constraints);
+  Status AddMandatoryConstraints(const ComputationLayout* computation_layout,
+                                 ChannelLayoutConstraints* channel_constraints,
+                                 HloComputation* computation,
+                                 LayoutConstraints* constraints);
 
   // This method can be overridden to add backend-specific constraints to the
   // layout of the instructions of a computation. This method is called after
@@ -378,10 +382,12 @@ class LayoutAssignment : public HloPassInterface {
   }
 
   // Construct contraints and assign layouts to all instructions in the
-  // computation satisfying the given ComputationLayout. Layouts constraints are
-  // added, then propagated until all LogicalBuffers in the computation are
-  // constrained.
-  Status RunOnComputation(const ComputationLayout& computation_layout,
+  // computation satisfying the given ComputationLayout, if not nullptr.
+  // Otherwise the ComputationLayout will be calculated by propagating the
+  // computation instruction contraints.
+  // Layouts constraints are added, then propagated until all LogicalBuffers in
+  // the computation are constrained.
+  Status RunOnComputation(ComputationLayout* computation_layout,
                           const TuplePointsToAnalysis& points_to_analysis,
                           HloComputation* computation,
                           ChannelLayoutConstraints* channel_constraints);
@@ -402,6 +408,25 @@ class LayoutAssignment : public HloPassInterface {
   // necessary conditions.
   Status CheckLayouts(HloModule* module);
 
+  // Computes the ComputationLayout of the given computation based of the
+  // layouts assigned to parameters and root instruction, and inserts it to the
+  // computation_layouts_ map.
+  Status CalculateComputationLayout(HloComputation* computation);
+
+  // Clears all the layouts which can be cleared within a computation.
+  Status ClearComputationLayouts(HloComputation* computation);
+
+  // Clears the side effects of a previous pass, like added copy instructions.
+  Status ClearPreviousPassSideEffects(HloModule* module);
+
+  // Propagates the layouts computed by the layout assignment pass on the given
+  // computation, to the computation layout passed in to this API.
+  // This API propagates missing layout, and also checks that the caller
+  // specified have been respected, by comparing those with the parameters and
+  // root computation instruction.
+  Status PropagateComputationLayouts(HloComputation* computation,
+                                     ComputationLayout* computation_layout);
+
   ComputationLayout* entry_computation_layout_;
 
  protected:
@@ -418,21 +443,37 @@ class LayoutAssignment : public HloPassInterface {
   // Creates and returns a copy of the given instruction with a different
   // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
   // instruction producing the copy is returned.
-  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+  StatusOr<HloInstruction*> CreateCopyWithNewLayout(
       const Shape& shape_with_layout, HloInstruction* instruction);
 
   // Creates a copy of the given operand if the operand's layout does not match
   // the given layout. This copy replaces the use in the given instruction.
   // Tuple operands will be deep-copied.
-  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                           HloInstruction* instruction,
-                                           int64 operand_no);
+  Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                    HloInstruction* instruction,
+                                    int64 operand_no);
+
+  // Registers a copy instruction added by the layout assignment pass.
+  void RegisterAddedCopy(HloInstruction* copy) {
+    CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
+    added_copies_.insert(copy);
+  }
+
+  // Adds a copy for the operand of an instruction, unless such operand is
+  // already a copy, and has a single user (which is forcibly the instruction
+  // itself).
+  Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number);
 
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
   // instructions can be set to match the computation.
   std::map<HloComputation*, ComputationLayout> computation_layouts_;
+
+  // Every copy added to the module by the layout assignment pass is registered
+  // here.
+  tensorflow::gtl::FlatSet<HloInstruction*> added_copies_;
+
   ChannelLayoutConstraints* channel_layout_constraints_;
 };
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 39f3aefdf8..a73118c68a 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -308,7 +308,10 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
         computation_layout->mutable_result_layout()->CopyLayoutFromShape(
             shape_with_output_layout));
   } else {
-    computation_layout->mutable_result_layout()->Clear();
+    // TODO(b/78356948): We are forcing the default layout here. We should fix
+    // clients which expect a default layout, to be explicit about it, by
+    // passing the proper ExecutionOptions with shape_with_output_layout set.
+    computation_layout->mutable_result_layout()->SetToDefaultLayout();
   }
 
   config->set_replica_count(options_.number_of_replicas());
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index 113c2e2bd9..d668855084 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -69,6 +69,7 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //       Tuple
       //
       HloInstruction* top_tuple = nullptr;
+      HloInstruction* first_gte = nullptr;
       bool can_simplify = true;
       for (int64 operand_number = 0;
            operand_number < instruction->operand_count(); ++operand_number) {
@@ -78,11 +79,17 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
           can_simplify = false;
           break;
         }
-
+        if (first_gte == nullptr) {
+          first_gte = operand;
+        } else if (!first_gte->has_compatible_sharding(operand)) {
+          can_simplify = false;
+          break;
+        }
         if (top_tuple == nullptr) {
           top_tuple = operand->mutable_operand(0);
           if (!ShapeUtil::Compatible(top_tuple->shape(),
-                                     instruction->shape())) {
+                                     instruction->shape()) ||
+              !instruction->has_compatible_sharding(top_tuple)) {
             can_simplify = false;
             break;
           }
@@ -108,15 +115,17 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //          |
       //         GTE
       if (instruction->operand(0)->opcode() == HloOpcode::kTuple) {
-        changed = true;
         HloInstruction* element_source =
             instruction->mutable_operand(0)->mutable_operand(
                 instruction->tuple_index());
-        TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
-        for (HloInstruction* user : element_source->users()) {
-          if (user->opcode() == HloOpcode::kTuple ||
-              user->opcode() == HloOpcode::kGetTupleElement) {
-            worklist.push(user);
+        if (instruction->has_compatible_sharding(element_source)) {
+          changed = true;
+          TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
+          for (HloInstruction* user : element_source->users()) {
+            if (user->opcode() == HloOpcode::kTuple ||
+                user->opcode() == HloOpcode::kGetTupleElement) {
+              worklist.push(user);
+            }
           }
         }
       }
-- 
GitLab


From 6af31f6260161bab02db83d7e9e1d7ba7fd14b2c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 15:20:37 -0700
Subject: [PATCH 1232/1262] [XLA] Redesign: add comparator and printer for the
 XlaOp.

This is to prepare the migration of tf2xla. There were some codes used ComputationDataHandle::handle() for comparison/printing. Now implement XlaOp's comparator and printer.

PiperOrigin-RevId: 193731437
---
 .../compiler/xla/client/xla_client/xla_builder.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 5977ee4f4b..4955f1515d 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -57,11 +57,27 @@ class XlaOp {
 
   StatusOr<Shape> GetShape() const;
 
+  const XlaBuilder* builder() const { return builder_; }
+
+  bool operator==(const XlaOp& rhs) const {
+    return handle_ == rhs.handle_ && builder_ == rhs.builder_;
+  }
+
+  bool operator!=(const XlaOp& rhs) const {
+    return handle_ != rhs.handle_ || builder_ != rhs.builder_;
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
+    out << op.handle();
+    return out;
+  }
+
  private:
   XlaOp(int64 handle, XlaBuilder* builder)
       : handle_(handle), builder_(builder) {}
 
   int64 handle() const { return handle_; }
+
   friend class XlaBuilder;
 
   int64 handle_;
-- 
GitLab


From cadbb0b70b9441388a04533433245ac85f2887a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 15:32:32 -0700
Subject: [PATCH 1233/1262] [XLA] Redesign: implement DumpToDirectory for the
 HloSession.

This is to prepare the migration of tf2xla.

PiperOrigin-RevId: 193733029
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 tensorflow/compiler/xla/service/executable.cc | 20 +++++++++++++++++++
 tensorflow/compiler/xla/service/executable.h  |  5 +++++
 3 files changed, 26 insertions(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index bc577c173d..afb344e5ae 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -755,6 +755,7 @@ cc_library(
         ":hlo",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
+        ":hlo_proto",
         ":pool",
         ":session_proto",
         ":shaped_buffer",
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index b097ef79cc..8218b5f7c8 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -163,4 +163,24 @@ Status Executable::DumpSessionModule() {
                                        result);
 }
 
+/* static */ Status Executable::DumpToDirectory(const string& directory_path,
+                                                string filename,
+                                                const HloSession& hlo_session) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+  if (!env->IsDirectory(directory_path).ok()) {
+    // NB! CreateDir does not work reliably with multiple XLA threads -- two
+    // threads can race to observe the absence of the dump directory and
+    // simultaneously try to create it, causing the "losing" thread to get a
+    // "directory already exists" error.
+    TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path));
+  }
+  filename = SanitizeFileName(std::move(filename));
+  string file_path = tensorflow::io::JoinPath(directory_path, filename);
+  string result;
+  TF_RET_CHECK(
+      tensorflow::SerializeToStringDeterministic(hlo_session, &result));
+  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
+                                       result);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 9c725f21d8..bdbe119120 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -155,6 +156,10 @@ class Executable {
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const SessionModule& session_module);
 
+  // Dump hlo_session to directory_path/filename.
+  static Status DumpToDirectory(const string& directory_path, string filename,
+                                const HloSession& hlo_session);
+
  protected:
   mutable tensorflow::mutex mutex_;
 
-- 
GitLab


From b2f786867dca85b6b848f09f2c1d40dd123fc0fc Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 20 Apr 2018 15:38:06 -0700
Subject: [PATCH 1234/1262] Always use the local worker name in
 CreateWorkerSession when not doing ClusterSpec propagation.

Previously, the master would send a job name and task index in an
otherwise-empty ServerDef, and the worker would unquestioningly use
those to build its worker name. However, this would lead to errors if
the worker had a local name like "/job:worker/replica:1/task:0",
because the ServerDef doesn't support non-zero replica IDs, and so the
local worker would end up an inconsistent view of what its worker name
should be. In particular `WorkerSession::worker_name` would disagree
with the device names added during graph partitioning by the master,
which would lead to runtime failures ("InvalidArgumentError: Invalid
rendezvous key").

PiperOrigin-RevId: 193733855
---
 tensorflow/core/distributed_runtime/BUILD     |  1 +
 .../distributed_runtime/master_session.cc     | 28 +++++++++---------
 .../core/distributed_runtime/session_mgr.cc   |  6 ++--
 .../distributed_runtime/session_mgr_test.cc   | 29 +++++++++++++++++++
 4 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index d564727da5..343dd5d456 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ tf_cc_test(
     deps = [
         ":session_mgr",
         ":worker_env",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index ebe350d313..e3022f38a2 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1219,17 +1219,6 @@ Status MasterSession::CreateWorkerSessions(
     workers[i].name = &worker_names[i];
     workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
-    if (options.cluster_def) {
-      *workers[i].request.mutable_server_def()->mutable_cluster() =
-          *options.cluster_def;
-      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
-      // Session state is always isolated when ClusterSpec propagation
-      // is in use.
-      workers[i].request.set_isolate_session_state(true);
-    } else {
-      workers[i].request.set_isolate_session_state(
-          session_opts_.config.isolate_session_state());
-    }
 
     DeviceNameUtils::ParsedName name;
     if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
@@ -1243,8 +1232,21 @@ Status MasterSession::CreateWorkerSessions(
       return status;
     }
 
-    workers[i].request.mutable_server_def()->set_job_name(name.job);
-    workers[i].request.mutable_server_def()->set_task_index(name.task);
+    if (options.cluster_def) {
+      *workers[i].request.mutable_server_def()->mutable_cluster() =
+          *options.cluster_def;
+      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+      workers[i].request.mutable_server_def()->set_job_name(name.job);
+      workers[i].request.mutable_server_def()->set_task_index(name.task);
+      // Session state is always isolated when ClusterSpec propagation
+      // is in use.
+      workers[i].request.set_isolate_session_state(true);
+    } else {
+      // NOTE(mrry): Do not set any component of the ServerDef,
+      // because the worker will use its local configuration.
+      workers[i].request.set_isolate_session_state(
+          session_opts_.config.isolate_session_state());
+    }
   }
 
   for (size_t i = 0; i < worker_names.size(); ++i) {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 357e9f8930..7ef4206c78 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -43,6 +43,7 @@ SessionMgr::SessionMgr(
               new GraphMgr(worker_env, worker_env->device_mgr)))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
+/* static */
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
   return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
                          server_def.task_index());
@@ -56,13 +57,14 @@ Status SessionMgr::CreateSession(const string& session,
     return errors::InvalidArgument("Session must be non-empty.");
   }
 
-  const string worker_name = WorkerNameFromServerDef(server_def);
-
   WorkerCacheInterface* worker_cache = nullptr;
+  string worker_name;
   if (server_def.cluster().job().empty()) {
     worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
+    worker_name = legacy_session_->worker_name;
   } else {
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+    worker_name = WorkerNameFromServerDef(server_def);
   }
 
   if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 0da333833a..99192119a6 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
 
 namespace tensorflow {
 
@@ -77,6 +78,34 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, CreateSessionClusterDefWorkerName) {
+  ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+  auto job = server_def.mutable_cluster()->add_job();
+  job->set_name("worker");
+  job->mutable_tasks()->insert({3, "localhost:3333"});
+
+  string session_handle = "test_session_handle";
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
+  EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
+  EXPECT_EQ("/job:worker/replica:0/task:3", session->worker_name);
+  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
+}
+
+TEST_F(SessionMgrTest, CreateSessionDefaultWorkerName) {
+  ServerDef server_def;
+  string session_handle = "test_session_handle";
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
+  std::shared_ptr<WorkerSession> session;
+  TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
+  EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
+  EXPECT_EQ("/job:mnist/replica:0/task:0", session->worker_name);
+  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
+}
+
 TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   ServerDef server_def;
   server_def.set_job_name("worker");
-- 
GitLab


From c015a45646029f8c116028505f2da9e023b5c2b7 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Fri, 20 Apr 2018 15:51:16 -0700
Subject: [PATCH 1235/1262] Support legacy clusters

PiperOrigin-RevId: 193735742
---
 .../cluster_resolver/python/training/tpu_cluster_resolver.py | 2 +-
 .../python/training/tpu_cluster_resolver_test.py             | 3 +--
 tensorflow/contrib/tpu/python/tpu/tpu_config.py              | 5 +++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 5a2771229d..1403483d28 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -245,7 +245,7 @@ class TPUClusterResolver(ClusterResolver):
     else:
       if not self._tpu.startswith(compat.as_bytes('grpc://')):
         # Case 3.
-        return server_lib.ClusterSpec({})
+        return None
       # Case 2.
       cluster_spec = {self._job_name: [self._tpu[len(
           compat.as_bytes('grpc://')):]]}
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index dff7a03b68..5b3f9be5a1 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -356,8 +356,7 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
     self.assertEqual(
         compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
-    self.assertEqual(
-        server_lib.ClusterSpec({}), tpu_cluster_resolver.cluster_spec())
+    self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
 
   def testGkeEnvironment(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index cc1a7fd801..6d7331e3c7 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -210,8 +210,9 @@ class RunConfig(run_config_lib.RunConfig):
         raise ValueError(
             'You cannot provide a ClusterResolver and '
             'session_config.cluster_def.')
-      self._session_config.cluster_def.CopyFrom(
-          self._cluster_spec.as_cluster_def())
+      if self._cluster_spec:
+        self._session_config.cluster_def.CopyFrom(
+            self._cluster_spec.as_cluster_def())
 
   @property
   def evaluation_master(self):
-- 
GitLab


From a0071844d0af47f22ab512363b56383acf762dff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 16:05:47 -0700
Subject: [PATCH 1236/1262] Remove protected data members from
 GraphOptimizerStage.

PiperOrigin-RevId: 193737654
---
 .../optimizers/arithmetic_optimizer.cc        | 54 +++++++++----------
 .../optimizers/graph_optimizer_stage.h        |  5 +-
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 232132e1e8..ed199c1ac8 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -294,8 +294,8 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
       for (int i = src->input_size() - 1; i >= 0; --i) {
         if (IsControlInput(src->input(i))) {
           *target_node->add_input() = src->input(i);
-          ctx_.node_map->AddOutput(NodeName(src->input(i)),
-                                   target_node->name());
+          ctx().node_map->AddOutput(NodeName(src->input(i)),
+                                    target_node->name());
         } else {
           break;
         }
@@ -442,7 +442,7 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
   // TODO(ezhulenev): move to GraphOptimizerStage?
   bool DrivesControlDependency(const NodeDef& node) const {
     int position;
-    for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
+    for (const NodeDef* output : ctx().node_map->GetOutputs(node.name())) {
       for (int i = 0; i < output->input_size(); ++i) {
         auto input = output->input(i);
         string name = ParseNodeName(input, &position);
@@ -476,8 +476,8 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
   }
 
   bool IsInPreserveSet(const NodeDef& node) const {
-    return ctx_.nodes_to_preserve->find(node.name()) !=
-           ctx_.nodes_to_preserve->end();
+    return ctx().nodes_to_preserve->find(node.name()) !=
+           ctx().nodes_to_preserve->end();
   }
 
   bool IsAlreadyOptimized(const NodeDef& node) const {
@@ -546,7 +546,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     // with a single output data consumer (presumably if we reach this node from
     // previously absorbed or a root node, it means that this node is not used
     // as an input to any other op, outside of the group)
-    if (NumNonControlDataOutputs(node, *ctx_.node_map) != 1) {
+    if (NumNonControlDataOutputs(node, *ctx().node_map) != 1) {
       return false;
     }
     // All input shapes must be broadcastable to the node shape
@@ -685,7 +685,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     (*node->mutable_attr())["N"].set_i(inputs.size());
 
     for (const auto& inputAndShape : inputs) {
-      ctx_.node_map->AddOutput(inputAndShape.input, node_name);
+      ctx().node_map->AddOutput(inputAndShape.input, node_name);
       node->add_input(inputAndShape.input);
     }
 
@@ -707,8 +707,8 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     node->set_device(root_node.device());
     (*node->mutable_attr())["T"].set_type(dtype);
 
-    ctx_.node_map->AddOutput(left.input, node_name);
-    ctx_.node_map->AddOutput(right.input, node_name);
+    ctx().node_map->AddOutput(left.input, node_name);
+    ctx().node_map->AddOutput(right.input, node_name);
 
     node->add_input(left.input);
     node->add_input(right.input);
@@ -784,20 +784,20 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
           new_outer_node->set_input(1, new_add_node->name());
         }
 
-        ctx_.node_map->AddOutput(common_factor, new_outer_node->name());
-        ctx_.node_map->AddOutput(new_add_node->name(), new_outer_node->name());
+        ctx().node_map->AddOutput(common_factor, new_outer_node->name());
+        ctx().node_map->AddOutput(new_add_node->name(), new_outer_node->name());
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
           const string& unique_factor_i = unique_factors[i];
           new_add_node->set_input(i, unique_factor_i);
-          ctx_.node_map->AddOutput(unique_factor_i, new_add_node->name());
+          ctx().node_map->AddOutput(unique_factor_i, new_add_node->name());
         }
 
         // Add control deps on add node
         for (const string& ctrl_dep : ctrl_deps) {
           *new_add_node->add_input() = ctrl_dep;
-          ctx_.node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
+          ctx().node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
         }
 
         // optimize new inner aggregation node
@@ -931,8 +931,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     // if graph rewrite happens in multiple passes without graph pruning between
     // them, it's possible that rewritten node already exists in a graph
     return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() ||
-           ctx_.node_map->NodeExists(OuterNodeName(node, false)) ||
-           ctx_.node_map->NodeExists(OuterNodeName(node, true));
+           ctx().node_map->NodeExists(OuterNodeName(node, false)) ||
+           ctx().node_map->NodeExists(OuterNodeName(node, true));
   }
 
   // keep names of the nodes that were optimized by this stage
@@ -996,7 +996,7 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
     }
     // Optimized nodes updated in place, and that would break the graph, if the
     // node has multiple output consumers
-    if (NumNonControlOutputs(node, *ctx_.node_map) != 1) {
+    if (NumNonControlOutputs(node, *ctx().node_map) != 1) {
       return false;
     }
     // All input shapes must be broadcastable to the node shape
@@ -1120,13 +1120,13 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
       node->set_input(0, input_0);
       node->set_input(1, input_1);
       // Invalidate node properties (shape)
-      ctx_.graph_properties->ClearOutputProperties(node->name());
-      ctx_.graph_properties->ClearInputProperties(node->name());
+      ctx().graph_properties->ClearOutputProperties(node->name());
+      ctx().graph_properties->ClearInputProperties(node->name());
       // Update the node map
-      ctx_.node_map->RemoveOutput(NodeName(old_input_0), node->name());
-      ctx_.node_map->RemoveOutput(NodeName(old_input_1), node->name());
-      ctx_.node_map->AddOutput(NodeName(input_0), node->name());
-      ctx_.node_map->AddOutput(NodeName(input_1), node->name());
+      ctx().node_map->RemoveOutput(NodeName(old_input_0), node->name());
+      ctx().node_map->RemoveOutput(NodeName(old_input_1), node->name());
+      ctx().node_map->AddOutput(NodeName(input_0), node->name());
+      ctx().node_map->AddOutput(NodeName(input_1), node->name());
       // Add updated node to optimization queue
       AddToOptimizationQueue(node);
     }
@@ -1257,8 +1257,8 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
       // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
       bitcast->set_input(0, operand->input(0));
       SetSourceDataType(GetSourceDataType(*operand), bitcast);
-      ctx_.node_map->UpdateInput(bitcast->name(), bitcast->input(0),
-                                 operand->input(0));
+      ctx().node_map->UpdateInput(bitcast->name(), bitcast->input(0),
+                                  operand->input(0));
       AddToOptimizationQueue(bitcast);
       *simplified_node_name = bitcast->name();
     }
@@ -1313,14 +1313,14 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
         node->mutable_input()->SwapElements(0, 1);
         node->set_input(1, x->input(0));
         node->add_input(AsControlDependency(x->name()));
-        ctx_.node_map->AddOutput(NodeName(x->input(0)), node_name);
+        ctx().node_map->AddOutput(NodeName(x->input(0)), node_name);
         updated = true;
       } else if (IsNeg(*y)) {
         // a + (-b) = a - b
         node->set_op("Sub");
         node->set_input(1, y->input(0));
         node->add_input(AsControlDependency(y->name()));
-        ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name);
+        ctx().node_map->AddOutput(NodeName(y->input(0)), node_name);
         updated = true;
       }
     } else if (IsSub(*node)) {
@@ -1329,7 +1329,7 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
         node->set_op("Add");
         node->set_input(1, y->input(0));
         node->add_input(AsControlDependency(y->name()));
-        ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name);
+        ctx().node_map->AddOutput(NodeName(y->input(0)), node_name);
         updated = true;
       }
     }
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index ed398525f3..089cad36e9 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -182,7 +182,10 @@ class GraphOptimizerStage {
     return ::tensorflow::grappler::AddEmptyNode(ctx_, name);
   }
 
- protected:  // Data members
+ protected:
+  const GraphOptimizerContext& ctx() const { return ctx_; }
+
+ private:  // Data members
   const string optimizer_name_;
   const string stage_name_;
   const GraphOptimizerContext ctx_;
-- 
GitLab


From 3fa8795c511931b55a9703956bdf564fde817c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Branchaud-Charron?=
 <frederic.branchaud-charron@usherbrooke.ca>
Date: Fri, 20 Apr 2018 19:10:41 -0400
Subject: [PATCH 1237/1262] Fix casting in Keras estimator (#18104)

---
 .../python/keras/_impl/keras/estimator.py     | 22 +++++++++++++----
 .../keras/_impl/keras/estimator_test.py       | 24 +++++++++++++++----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index b922a6c683..c3c3fceb45 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -29,12 +29,14 @@ from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import models
 from tensorflow.python.keras._impl.keras import optimizers
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
 from tensorflow.python.ops import variables as variables_module
@@ -55,6 +57,17 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
+def _convert_tensor(x):
+  """Create or cast tensor if needed."""
+  if not tensor_util.is_tensor(x):
+    # x is a numpy array
+    x = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(x)
+  if check_ops.is_numeric_tensor(x):
+    # is_numeric_tensor returns False if provided with a numpy array
+    x = _cast_tensor_to_floatx(x)
+  return x
+
+
 def _any_variable_initalized():
   """Check if any variable has been initialized in the Keras model.
 
@@ -86,7 +99,7 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True):
   if isinstance(estimator_io, (list, tuple)):
     # Case currently not supported by most built-in input_fn,
     # but it's good to have for sanity
-    return [_cast_tensor_to_floatx(x) for x in estimator_io]
+    return [_convert_tensor(x) for x in estimator_io]
   elif isinstance(estimator_io, dict):
     if is_input:
       if keras_model._is_graph_network:
@@ -108,12 +121,12 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True):
             'It needs to match one '
             'of the following: %s' % ('input' if is_input else 'output', key,
                                       ', '.join(keras_io_names)))
-      tensors = [_cast_tensor_to_floatx(estimator_io[io_name])
+      tensors = [_convert_tensor(estimator_io[io_name])
                  for io_name in keras_io_names]
     return tensors
   else:
     # Plain array.
-    return _cast_tensor_to_floatx(estimator_io)
+    return _convert_tensor(estimator_io)
 
 
 def _in_place_subclassed_model_reset(model):
@@ -274,8 +287,7 @@ def _clone_and_build_model(mode,
                                         is_input=False)
   else:
     target_tensors = [
-        _cast_tensor_to_floatx(
-            sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels))
+        _convert_tensor(labels)
     ]
 
   if keras_model._is_graph_network:
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 653cdc01e2..80fa87d041 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
 from tensorflow.python.keras._impl.keras.optimizers import SGD
@@ -142,16 +143,20 @@ def randomize_io_type(array, name):
 
 
 def multi_inputs_multi_outputs_model():
-  # test multi-input layer
   a = keras.layers.Input(shape=(16,), name='input_a')
   b = keras.layers.Input(shape=(16,), name='input_b')
+  m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m')
   dense = keras.layers.Dense(8, name='dense_1')
+
   a_2 = dense(a)
+  # Apply a mask
+  s_2 = keras.layers.Lambda(lambda k:
+                            K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2])
   b_2 = dense(b)
-  merged = keras.layers.concatenate([a_2, b_2], name='merge')
+  merged = keras.layers.concatenate([s_2, b_2], name='merge')
   c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
   d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
-  model = keras.models.Model(inputs=[a, b], outputs=[c, d])
+  model = keras.models.Model(inputs=[a, b, m], outputs=[c, d])
   model.compile(
       loss='categorical_crossentropy',
       optimizer='rmsprop',
@@ -352,18 +357,27 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         test_samples=50,
         input_shape=(16,),
         num_classes=2)
+    np.random.seed(_RANDOM_SEED)
+    (input_m_train, _), (input_m_test, _) = testing_utils.get_test_data(
+        train_samples=_TRAIN_SIZE,
+        test_samples=50,
+        input_shape=(8,),
+        num_classes=2)
+
     c_train = keras.utils.to_categorical(c_train)
     c_test = keras.utils.to_categorical(c_test)
     d_train = keras.utils.to_categorical(d_train)
     d_test = keras.utils.to_categorical(d_test)
 
     def train_input_fn():
-      input_dict = {'input_a': a_train, 'input_b': b_train}
+      input_dict = {'input_a': a_train, 'input_b': b_train,
+                    'input_m': input_m_train > 0}
       output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
-      input_dict = {'input_a': a_test, 'input_b': b_test}
+      input_dict = {'input_a': a_test, 'input_b': b_test,
+                    'input_m': input_m_test > 0}
       output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
-- 
GitLab


From cd095e0c455b3df98841ca70ba24fd41935552e7 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Fri, 20 Apr 2018 16:18:29 -0700
Subject: [PATCH 1238/1262] tf.contrib.data.scan: Support eager execution.

PiperOrigin-RevId: 193739234
---
 .../contrib/data/python/kernel_tests/BUILD    |  1 +
 .../kernel_tests/scan_dataset_op_test.py      | 23 ++++++++++++-------
 .../contrib/data/python/ops/scan_ops.py       |  1 +
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 05a4f5028a..9d1e8b20c2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -343,6 +343,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
index e0494736b7..1a97a84b2c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
@@ -24,9 +24,11 @@ import numpy as np
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import scan_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -57,19 +59,24 @@ class ScanDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFibonacci(self):
     iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
         scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))
     ).make_one_shot_iterator()
-    next_element = iterator.get_next()
 
-    with self.test_session() as sess:
-      self.assertEqual(1, sess.run(next_element))
-      self.assertEqual(1, sess.run(next_element))
-      self.assertEqual(2, sess.run(next_element))
-      self.assertEqual(3, sess.run(next_element))
-      self.assertEqual(5, sess.run(next_element))
-      self.assertEqual(8, sess.run(next_element))
+    if context.executing_eagerly():
+      next_element = iterator.get_next
+    else:
+      get_next = iterator.get_next()
+      next_element = lambda: get_next
+
+    self.assertEqual(1, self.evaluate(next_element()))
+    self.assertEqual(1, self.evaluate(next_element()))
+    self.assertEqual(2, self.evaluate(next_element()))
+    self.assertEqual(3, self.evaluate(next_element()))
+    self.assertEqual(5, self.evaluate(next_element()))
+    self.assertEqual(8, self.evaluate(next_element()))
 
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 1c88366273..711a538697 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -144,6 +144,7 @@ class _ScanDataset(dataset_ops.Dataset):
                                                    weakened_state_shapes)
 
     self._scan_func = tf_scan_func
+    self._scan_func.add_to_graph(ops.get_default_graph())
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-- 
GitLab


From 8d3a41f459b776856ff668bb076d4bc449927e09 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Fri, 20 Apr 2018 16:30:02 -0700
Subject: [PATCH 1239/1262] [XLA] Remove constant cast in literal util.

It's not portable to modify an underlying char array of a c++ string object: (https://stackoverflow.com/questions/5729203/modifying-underlying-char-array-of-a-c-string-object)

RELNOTES: n/a
PiperOrigin-RevId: 193740595
---
 tensorflow/compiler/xla/literal_util.cc | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index c315b4ff30..bb6dd4f909 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -44,8 +44,16 @@ namespace {
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
-// Converts between little and big endian, assuming elements in the array are 16
-// bits long.
+// Converts between little and big endian.
+//
+// Precondition: size % 2 == 0 (elements in the array are 16 bits long)
+void ConvertEndianShort(string* bytes) {
+  CHECK_EQ(bytes->size() / 2, 0);
+  for (int64 i = 0; i < bytes->size(); i += 2) {
+    std::swap((*bytes)[i], (*bytes)[i + 1]);
+  }
+}
+
 void ConvertEndianShort(char* bytes, int64 size) {
   CHECK_EQ(size / 2, 0);
   for (int64 i = 0; i < size; i += 2) {
@@ -1930,16 +1938,14 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const {
       *proto->mutable_f16s() = string(
           reinterpret_cast<const char*>(data<half>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto->mutable_f16s()->data()),
-                           proto->f16s().size());
+        ConvertEndianShort(proto->mutable_f16s());
       }
       break;
     case BF16:
       *proto->mutable_bf16s() = string(
           reinterpret_cast<const char*>(data<bfloat16>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto->mutable_bf16s()->data()),
-                           proto->bf16s().size());
+        ConvertEndianShort(proto->mutable_bf16s());
       }
       break;
     case F32:
-- 
GitLab


From 82679654af098df1de27bcdcf6fc6942ccf4f236 Mon Sep 17 00:00:00 2001
From: ADiegoCAlonso <A.Diego.C.Alonso@gmail.com>
Date: Sat, 21 Apr 2018 11:43:51 +0200
Subject: [PATCH 1240/1262] Add __init__py

---
 tensorflow/examples/tutorials/estimators/__init__.py | 0
 tensorflow/examples/tutorials/input_fn/__init__.py   | 0
 tensorflow/examples/tutorials/layers/__init__.py     | 0
 tensorflow/examples/tutorials/monitors/__init__.py   | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tensorflow/examples/tutorials/estimators/__init__.py
 create mode 100644 tensorflow/examples/tutorials/input_fn/__init__.py
 create mode 100644 tensorflow/examples/tutorials/layers/__init__.py
 create mode 100644 tensorflow/examples/tutorials/monitors/__init__.py

diff --git a/tensorflow/examples/tutorials/estimators/__init__.py b/tensorflow/examples/tutorials/estimators/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow/examples/tutorials/input_fn/__init__.py b/tensorflow/examples/tutorials/input_fn/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow/examples/tutorials/layers/__init__.py b/tensorflow/examples/tutorials/layers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow/examples/tutorials/monitors/__init__.py b/tensorflow/examples/tutorials/monitors/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
-- 
GitLab


From aed22c552905d74de04c98b34aabedd12926790a Mon Sep 17 00:00:00 2001
From: ADiegoCAlonso <A.Diego.C.Alonso@gmail.com>
Date: Sat, 21 Apr 2018 11:56:10 +0200
Subject: [PATCH 1241/1262] Specify float32 as float type instead of float64

---
 tensorflow/examples/tutorials/monitors/iris_monitors.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
index 850d105f7b..a2b7fe6023 100644
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py
@@ -32,9 +32,9 @@ IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
 def main(unused_argv):
   # Load datasets.
   training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
   test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float)
+      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
 
   validation_metrics = {
       "accuracy":
@@ -83,7 +83,7 @@ def main(unused_argv):
 
   # Classify two new flower samples.
   new_samples = np.array(
-      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
+      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
   y = list(classifier.predict(new_samples))
   print("Predictions: {}".format(str(y)))
 
-- 
GitLab


From cea18851e2d81ee97ebf8e9f6aeddd55a34e3227 Mon Sep 17 00:00:00 2001
From: foo0x29a <thiago.nobayashi@gmail.com>
Date: Sat, 21 Apr 2018 13:30:52 -0300
Subject: [PATCH 1242/1262] fix typo

---
 .../core/grappler/optimizers/custom_graph_optimizer_registry.h  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
index 796da91373..3148a5f809 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
@@ -33,7 +33,7 @@ class CustomGraphOptimizerRegistry {
   static std::vector<string> GetRegisteredOptimizers();
 
   typedef std::function<CustomGraphOptimizer*()> Creator;
-  // Regsiter graph optimizer which can be called during program initialization.
+  // Register graph optimizer which can be called during program initialization.
   // This class is not thread-safe.
   static void RegisterOptimizerOrDie(const Creator& optimizer_creator,
                                      const string& name);
-- 
GitLab


From fe4146d884c8805fceaa6d73d0bcc7fbf21df7cd Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 21 Apr 2018 18:42:03 +0000
Subject: [PATCH 1243/1262] Update .gitignore for cmake generated files

After running cmake on Linux with:
```
tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh
```

the following file is left:
```
ubuntu@ubuntu:~/tensorflow$ git status
On branch master
Your branch is up-to-date with 'origin/master'.
Untracked files:
  (use "git add <file>..." to include in what will be committed)

        api_init_files_list.txt

nothing added to commit but untracked files present (use "git add" to track)
ubuntu@ubuntu:~/tensorflow$
```

This fix updates the .gitignore file so that cmake generated files
is not added with git inadvertently.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index be75938ec4..828bbe9bd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ Podfile.lock
 /tensorflow/contrib/lite/examples/ios/simple/data/*.txt
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
 xcuserdata/**
+/api_init_files_list.txt
 
 # Android
 .gradle
-- 
GitLab


From 8f558d67450f3ec6aa0d96af9fad84042d6b79df Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Sat, 21 Apr 2018 15:25:37 -0700
Subject: [PATCH 1244/1262] Changed calls to the depreacted
 StringPiece::contains with str_util::StrContains

---
 tensorflow/core/graph/mkl_layout_pass.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 5368774f2d..72a13d4da7 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -547,14 +547,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(),kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(),kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
-- 
GitLab


From 5518db48074c3bd125089bccc3edec03c192bf56 Mon Sep 17 00:00:00 2001
From: Bryan Heden <b.heden@gmail.com>
Date: Sat, 21 Apr 2018 19:45:42 -0500
Subject: [PATCH 1245/1262] update $ source spacing

When viewing install_linux, the spacing was off for 'Next Steps' section.
---
 tensorflow/docs_src/install/install_linux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 1a349f5412..02af21bcf2 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -231,7 +231,7 @@ Note that you must activate the Virtualenv environment each time you
 use TensorFlow. If the Virtualenv environment is not currently active,
 invoke one of the following commands:
 
-<pre> $ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
+<pre>$ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
 $ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh</pre>
 
 When the Virtualenv environment is active, you may run
-- 
GitLab


From bfffd2041106dac5b7bb3efcbb311a20505ac61f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 14:43:21 +0000
Subject: [PATCH 1246/1262] Update docs to add note and examples for
 tf.count_nonzero with string

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 31ce83905b..30ac001c25 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1466,9 +1466,18 @@ def count_nonzero(input_tensor,
   tf.count_nonzero(x, [0, 1])  # 3
   ```
 
+  **NOTE** Strings are compared against zero-length empty string `""`. Any
+  string with a size greater than zero is already considered as nonzero.
+
+  For example:
+  ```python
+  x = tf.constant(["", "a", "  ", "b", ""])
+  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  ```
+
   Args:
-    input_tensor: The tensor to reduce. Should be of numeric type, `string`,
-      or `bool`.
+    input_tensor: The tensor to reduce. Should be of numeric type, `bool`,
+      or `string`.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-- 
GitLab


From 21bd19a8b8b0be8ac4d39b6bc32366ba908f5105 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:49:13 +0000
Subject: [PATCH 1247/1262] Change from squeeze_dims to axis when calling
 tf.squeeze

The `squeeze_dims` in `tf.squeeze` has been deprecated in favor
of `axis` while many places still use `squeeze_dims`. That
generates lots of warnings.

This fix switches from `squeeze_dims` to `axis` to remove those warnings.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_grad.py | 2 +-
 tensorflow/python/ops/array_ops.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 57d2657838..3678bd4c1f 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -196,7 +196,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
             array_ops.where(
                 math_ops.logical_and(grad.indices >= start,
                                      grad.indices < end)),
-            squeeze_dims=[1])
+            axis=[1])
         new_indices = array_ops.gather(grad.indices, indices_to_select) - start
         new_values = array_ops.gather(grad.values, indices_to_select)
         out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 23202ae28e..bbffff0483 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1230,7 +1230,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
 
   def _apply_mask_1d(reshaped_tensor, mask, axis=None):
     """Mask tensor along dimension 0 with a 1-D mask."""
-    indices = squeeze(where(mask), squeeze_dims=[1])
+    indices = squeeze(where(mask), axis=[1])
     return gather(reshaped_tensor, indices, axis=axis)
 
   with ops.name_scope(name, values=[tensor, mask]):
-- 
GitLab


From 100b6000d4d04a344a1516578f724e46cdede5e1 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:52:31 +0000
Subject: [PATCH 1248/1262] Fix warning in image related ops.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/image_ops_impl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 601010bce9..bd5b2ae83b 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -652,7 +652,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
     padded.set_shape(padded_shape)
 
     if not is_batch:
-      padded = array_ops.squeeze(padded, squeeze_dims=[0])
+      padded = array_ops.squeeze(padded, axis=[0])
 
     return padded
 
@@ -732,7 +732,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
     cropped.set_shape(cropped_shape)
 
     if not is_batch:
-      cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
+      cropped = array_ops.squeeze(cropped, axis=[0])
 
     return cropped
 
@@ -849,7 +849,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     resized = control_flow_ops.with_dependencies(assert_ops, resized)
 
     if not is_batch:
-      resized = array_ops.squeeze(resized, squeeze_dims=[0])
+      resized = array_ops.squeeze(resized, axis=[0])
 
     return resized
 
@@ -942,7 +942,7 @@ def resize_images(images,
            for x in [new_width_const, width, new_height_const, height]) and (
                width == new_width_const and height == new_height_const):
       if not is_batch:
-        images = array_ops.squeeze(images, squeeze_dims=[0])
+        images = array_ops.squeeze(images, axis=[0])
       return images
 
     if method == ResizeMethod.BILINEAR:
@@ -965,7 +965,7 @@ def resize_images(images,
     images.set_shape([None, new_height_const, new_width_const, None])
 
     if not is_batch:
-      images = array_ops.squeeze(images, squeeze_dims=[0])
+      images = array_ops.squeeze(images, axis=[0])
     return images
 
 
-- 
GitLab


From 8cdc752227af998da946decc9365d63bcaa7f184 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:53:10 +0000
Subject: [PATCH 1249/1262] Fix warning in tf.nn ops where squeeze_dims was
 used with tf.squeeze

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/nn_impl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index d0d5ed07ce..576627e78e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -765,9 +765,9 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     weighted_variance = math_ops.multiply(weighted_distsq, divisor)
 
     if not keep_dims:
-      weighted_mean = array_ops.squeeze(weighted_mean, squeeze_dims=axes)
+      weighted_mean = array_ops.squeeze(weighted_mean, axis=axes)
       weighted_variance = array_ops.squeeze(
-          weighted_variance, squeeze_dims=axes)
+          weighted_variance, axis=axes)
 
     if needs_cast:
       weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
-- 
GitLab


From 12fd64f72f59ff5ba114903d4b851f855aaf2458 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:53:58 +0000
Subject: [PATCH 1250/1262] Fix warnings in reduce_join_op_test.py

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/reduce_join_op_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index 7f3049b9f8..fb9e5cc2a3 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -160,7 +160,7 @@ class ReduceJoinTest(UnicodeTestCase):
             separator=separator)
       if not reduction_indices:
         truth = constant_op.constant(truth)
-      truth_squeezed = array_ops.squeeze(truth, squeeze_dims=reduction_indices)
+      truth_squeezed = array_ops.squeeze(truth, axis=reduction_indices)
       output_array = output.eval()
       output_keep_dims_array = output_keep_dims.eval()
       truth_array = truth.eval()
-- 
GitLab


From 9aa142284166c51dfc202b551b4592f9c9ed54e7 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:54:26 +0000
Subject: [PATCH 1251/1262] Fix tf.contrib.timeseries warnings related to
 squeeze_dims

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../timeseries/python/timeseries/state_management_test.py   | 2 +-
 .../python/timeseries/state_space_models/kalman_filter.py   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
index d5dce30fda..5f7e3da2db 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
@@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel):
     batch_end_values = array_ops.squeeze(
         array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0],
                         [-1, 1, -1]),
-        squeeze_dims=[1, 2])
+        axis=[1, 2])
     # A pretty odd but easy to think about loss: L1 loss on the batch end
     # values.
     loss = math_ops.reduce_sum(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
index 1fcd3e391b..a614386121 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -170,7 +170,7 @@ class KalmanFilter(object):
         math_ops.matmul(
             transition_matrices,
             prior_state[..., None]),
-        squeeze_dims=[-1])
+        axis=[-1])
     return advanced_state
 
   def predict_state_var(
@@ -254,7 +254,7 @@ class KalmanFilter(object):
             kalman_gain_transposed,
             array_ops.expand_dims(residual, -1),
             adjoint_a=True),
-        squeeze_dims=[-1])
+        axis=[-1])
     gain_obs = math_ops.matmul(
         kalman_gain_transposed, observation_model, adjoint_a=True)
     identity_extradim = linalg_ops.eye(
@@ -332,7 +332,7 @@ class KalmanFilter(object):
             array_ops.expand_dims(state_mean, 1),
             observation_model,
             adjoint_b=True),
-        squeeze_dims=[1])
+        axis=[1])
     observed_var = math_ops.matmul(
         math_ops.matmul(observation_model, state_var),
         observation_model,
-- 
GitLab


From 8257b9096062a87555d72f7c15e16b1d8e748d70 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:55:06 +0000
Subject: [PATCH 1252/1262] Fix warnings in tf.contrib.tensor_forest

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/tensor_forest/client/eval_metrics.py       | 4 ++--
 .../tensor_forest/hybrid/python/layers/fully_connected.py     | 2 +-
 tensorflow/contrib/tensor_forest/python/tensor_forest.py      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index 90033015eb..e893e1d1c8 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -37,7 +37,7 @@ def _top_k_generator(k):
   def _top_k(probabilities, targets):
     targets = math_ops.to_int32(targets)
     if targets.get_shape().ndims > 1:
-      targets = array_ops.squeeze(targets, squeeze_dims=[1])
+      targets = array_ops.squeeze(targets, axis=[1])
     return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k))
   return _top_k
 
@@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None):
 
 
 def _squeeze_and_onehot(targets, depth):
-  targets = array_ops.squeeze(targets, squeeze_dims=[1])
+  targets = array_ops.squeeze(targets, axis=[1])
   return array_ops.one_hot(math_ops.to_int32(targets), depth)
 
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
index ff3ab21eaa..745a5b1caf 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py
@@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer):
 
       # There is always one activation per instance by definition, so squeeze
       # away the extra dimension.
-      return array_ops.squeeze(nn_activations, squeeze_dims=[1])
+      return array_ops.squeeze(nn_activations, axis=[1])
 
 
 class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer):
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index b9bcbb170b..7a35a70bbe 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -445,7 +445,7 @@ class RandomForestGraphs(object):
           mask = math_ops.less(
               r, array_ops.ones_like(r) * self.params.bagging_fraction)
           gather_indices = array_ops.squeeze(
-              array_ops.where(mask), squeeze_dims=[1])
+              array_ops.where(mask), axis=[1])
           # TODO(thomaswc): Calculate out-of-bag data and labels, and store
           # them for use in calculating statistics later.
           tree_data = array_ops.gather(processed_dense_features, gather_indices)
-- 
GitLab


From 685fec394235b409b58d7ef1c4a26655f9fedcfd Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:55:35 +0000
Subject: [PATCH 1253/1262] Fix squeeze_dims warnings in tf.contrib.learn

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/learn/python/learn/estimators/head.py | 4 ++--
 tensorflow/contrib/learn/python/learn/ops/losses_ops.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 2b4b6eff39..e28e6854a5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -777,7 +777,7 @@ class _RegressionHead(_SingleHead):
     key = prediction_key.PredictionKey.SCORES
     with ops.name_scope(None, "predictions", (logits,)):
       if self.logits_dimension == 1:
-        logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key)
+        logits = array_ops.squeeze(logits, axis=(1,), name=key)
       return {key: self._link_fn(logits)}
 
   def _metrics(self, eval_loss, predictions, labels, weights):
@@ -974,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None):
     is_squeezed_labels = False
     # TODO(ptucker): This will break for dynamic shapes.
     if len(labels.get_shape()) == 2:
-      labels = array_ops.squeeze(labels, squeeze_dims=(1,))
+      labels = array_ops.squeeze(labels, axis=(1,))
       is_squeezed_labels = True
 
     loss = nn.sparse_softmax_cross_entropy_with_logits(
diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
index 92976d1539..9f2cadb017 100644
--- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
@@ -40,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None):
                       [tensor_in, labels]):
     predictions = nn.xw_plus_b(tensor_in, weights, biases)
     if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2:
-      predictions = array_ops_.squeeze(predictions, squeeze_dims=[1])
+      predictions = array_ops_.squeeze(predictions, axis=[1])
     return predictions, losses.mean_squared_error(labels, predictions)
 
 
-- 
GitLab


From 5c19fc7810f13712127b8527b040f8f656474fe5 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:56:09 +0000
Subject: [PATCH 1254/1262] Fix tf.contrib.layers warnings where squeeze_dims
 were used with tf.squeeze

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/layers/python/layers/target_column.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 3e639a180e..69bb6be814 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn):
 
   def logits_to_predictions(self, logits, proba=False):
     if self.num_label_columns == 1:
-      return array_ops.squeeze(logits, squeeze_dims=[1])
+      return array_ops.squeeze(logits, axis=[1])
     return logits
 
   def get_eval_ops(self, features, logits, labels, metrics=None):
@@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target):
                      "Instead got %s." % target.dtype)
   # sparse_softmax_cross_entropy_with_logits requires [batch_size] target.
   if len(target.get_shape()) == 2:
-    target = array_ops.squeeze(target, squeeze_dims=[1])
+    target = array_ops.squeeze(target, axis=[1])
   loss_vec = nn.sparse_softmax_cross_entropy_with_logits(
       labels=target, logits=logits)
   return loss_vec
-- 
GitLab


From 50a8df144d24ce60866bff96645f04e84a31f8b4 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:57:06 +0000
Subject: [PATCH 1255/1262] Fix warnings in tf.contrib.factorization

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/factorization/python/ops/gmm_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index ccdd679d6a..e076631bc1 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -397,7 +397,7 @@ class GmmAlgorithm(object):
     # Compute the effective number of data points assigned to component k.
     with ops.control_dependencies(self._w):
       points_in_k = array_ops.squeeze(
-          math_ops.add_n(self._points_in_k), squeeze_dims=[0])
+          math_ops.add_n(self._points_in_k), axis=[0])
       # Update alpha.
       if 'w' in self._params:
         final_points_in_k = points_in_k / num_batches
-- 
GitLab


From 82eacbd4ac29db754b86a0be0cdfcc65b467c6af Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 22 Apr 2018 17:57:31 +0000
Subject: [PATCH 1256/1262] Fix warnings in tf.contrib.distributions with
 squeeze_dims

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/ops/bijectors/cholesky_outer_product.py              | 2 +-
 tensorflow/contrib/distributions/python/ops/shape.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index caae2adcfa..ecdb8967f4 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -170,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector):
     sum_weighted_log_diag = array_ops.squeeze(
         math_ops.matmul(math_ops.log(diag),
                         exponents[..., array_ops.newaxis]),
-        squeeze_dims=-1)
+        axis=-1)
     fldj = p_float * np.log(2.) + sum_weighted_log_diag
 
     return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index bac0b79d59..6a7f28713a 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -439,7 +439,7 @@ class _DistributionShape(object):
           if self._batch_ndims_is_0 and expand_batch_dim:
             squeeze_dims += [1]
           if squeeze_dims:
-            x = array_ops.squeeze(x, squeeze_dims=squeeze_dims)
+            x = array_ops.squeeze(x, axis=squeeze_dims)
             # x.shape: [prod(S)]+B+E
         _, batch_shape, event_shape = self.get_shape(x)
       else:
-- 
GitLab


From c1544d1c34dac9aa01ed2de84bc850f8d1bfe919 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Sun, 22 Apr 2018 19:08:21 -0700
Subject: [PATCH 1257/1262] Update tuple for cuda version with auto as it was
 removed in #18434.

---
 tensorflow/core/kernels/conv_ops_gpu.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 7f9cfec981..bbd5a53660 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -143,8 +143,7 @@ class ConvParameters {
   bool ShouldIncludeWinogradNonfusedAlgo(
       perftools::gputools::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
-    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
-        stream_exec->AsDnn()->GetVersion();
+    auto version = stream_exec->AsDnn()->GetVersion();
     if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
     }
-- 
GitLab


From e5cfbd0eceb4dca98b388b13acff499a5420f863 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Sun, 22 Apr 2018 20:00:54 -0700
Subject: [PATCH 1258/1262] Fix more for cuda version check.

---
 tensorflow/core/kernels/conv_ops_gpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index bbd5a53660..e8da5298e6 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -144,7 +144,7 @@ class ConvParameters {
       perftools::gputools::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
     auto version = stream_exec->AsDnn()->GetVersion();
-    if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+    if (version.ok() && version.ValueOrDie().major_version() >= 7) {
       return true;
     }
     return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
-- 
GitLab


From 829ec055afdfca3424030794c469d19290df13fe Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Mon, 23 Apr 2018 11:44:22 -0700
Subject: [PATCH 1259/1262] Update resources.h

---
 .../core/kernels/boosted_trees/resources.h    | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index ef42604897..df78d3f275 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -82,26 +82,6 @@ class BoostedTreesEnsembleResource : public StampedResource {
 
   int64 GetNumNodes(const int32 tree_id);
 
-  void UpdateLastLayerNodesRange(const int32 node_range_start,
-                                 int32 node_range_end) const {
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
-        node_range_start);
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
-        node_range_end);
-  }
-
-  void GetLastLayerNodesRange(int32* node_range_start,
-                              int32* node_range_end) const {
-    *node_range_start =
-        tree_ensemble_->growing_metadata().last_layer_node_start();
-    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
-  }
-
-  int64 GetNumNodes(const int32 tree_id) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->trees(tree_id).nodes_size();
-  }
-
   void UpdateGrowingMetadata() const;
 
   int32 GetNumLayersAttempted();
-- 
GitLab


From 06d5ca2ae097c08c886759dd27f90b19e4c6f49d Mon Sep 17 00:00:00 2001
From: Andy Kernahan <andrew.kernahan@gmail.com>
Date: Mon, 23 Apr 2018 20:32:35 +0100
Subject: [PATCH 1260/1262] Fix tfcompile module label. (#16582)

---
 tensorflow/docs_src/performance/xla/tfcompile.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md
index f57ca3948d..8521d7eacb 100644
--- a/tensorflow/docs_src/performance/xla/tfcompile.md
+++ b/tensorflow/docs_src/performance/xla/tfcompile.md
@@ -86,7 +86,7 @@ code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into
 executable code.
 
 ```build
-load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 # Use the tf_library macro to compile your graph into executable code.
 tf_library(
@@ -258,8 +258,8 @@ file.
 
 ```build
 # Example of linking your binary
-# Also see //third_party/tensorflow/compiler/aot/tests/BUILD
-load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+# Also see //tensorflow/compiler/aot/tests/BUILD
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 # The same tf_library call from step 2 above.
 tf_library(
-- 
GitLab


From d3b60b2210521a71961f675cb69bbe148b21b8da Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Mon, 23 Apr 2018 14:24:11 -0700
Subject: [PATCH 1261/1262] Reapply #18446.

---
 tensorflow/python/framework/test_util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f954b9d6c7..5a8bc43727 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1014,6 +1014,8 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
+      config.graph_options.rewrite_options.arithmetic_optimization = (
+          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
-- 
GitLab


From 09398096284995d8a93c124bdbd70d6e1a44fbc3 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 24 Apr 2018 10:59:10 -0700
Subject: [PATCH 1262/1262] Update README.md

---
 tensorflow/tools/docker/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index f46c56e11a..525f2995ce 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -16,12 +16,12 @@ quick links here:
 
 We currently maintain two Docker container images:
 
-* `gcr.io/tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
+* `tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only!
 
-* `gcr.io/tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
+* `tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies
   and support for NVidia CUDA
 
-Note: We also publish the same containers into
+Note: We store all our containers on 
 [Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
 
 
@@ -29,12 +29,12 @@ Note: We also publish the same containers into
 
 Run non-GPU container using
 
-    $ docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow
+    $ docker run -it -p 8888:8888 tensorflow/tensorflow
 
 For GPU support install NVidia drivers (ideally latest) and
 [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using
 
-    $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
+    $ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu
 
 
 Note: If you would have a problem running nvidia-docker you may try the old method
@@ -44,7 +44,7 @@ it there and try using nvidia-docker as described above.
     $ # The old, not recommended way to run docker with gpu support:
     $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu
+    $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES tensorflow/tensorflow:latest-gpu
 
 
 ## More containers
-- 
GitLab